{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996261682242991, "eval_steps": 250, "global_step": 1337, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05009345794392523, "grad_norm": 4.856017112731934, "learning_rate": 4.925373134328357e-07, "loss": 0.6864, "step": 67 }, { "epoch": 0.10018691588785046, "grad_norm": 5.972659587860107, "learning_rate": 9.925373134328357e-07, "loss": 0.7251, "step": 134 }, { "epoch": 0.1502803738317757, "grad_norm": 3.9563512802124023, "learning_rate": 9.451371571072319e-07, "loss": 0.6276, "step": 201 }, { "epoch": 0.18691588785046728, "eval_loss": 0.530458390712738, "eval_runtime": 26.5207, "eval_samples_per_second": 18.853, "eval_steps_per_second": 9.427, "step": 250 }, { "epoch": 0.20037383177570092, "grad_norm": 5.4779744148254395, "learning_rate": 8.894430590191188e-07, "loss": 0.5709, "step": 268 }, { "epoch": 0.2504672897196262, "grad_norm": 3.915782928466797, "learning_rate": 8.337489609310058e-07, "loss": 0.4635, "step": 335 }, { "epoch": 0.3005607476635514, "grad_norm": 3.473511219024658, "learning_rate": 7.780548628428927e-07, "loss": 0.3408, "step": 402 }, { "epoch": 0.3506542056074766, "grad_norm": 3.5871734619140625, "learning_rate": 7.223607647547797e-07, "loss": 0.2889, "step": 469 }, { "epoch": 0.37383177570093457, "eval_loss": 0.24327994883060455, "eval_runtime": 26.5274, "eval_samples_per_second": 18.848, "eval_steps_per_second": 9.424, "step": 500 }, { "epoch": 0.40074766355140184, "grad_norm": 2.3577873706817627, "learning_rate": 6.666666666666666e-07, "loss": 0.2366, "step": 536 }, { "epoch": 0.4508411214953271, "grad_norm": 2.283566951751709, "learning_rate": 6.109725685785536e-07, "loss": 0.201, "step": 603 }, { "epoch": 0.5009345794392523, "grad_norm": 3.1220109462738037, "learning_rate": 5.552784704904405e-07, "loss": 0.1925, "step": 670 }, { "epoch": 0.5510280373831775, "grad_norm": 1.7564232349395752, "learning_rate": 4.995843724023275e-07, "loss": 0.1926, "step": 737 }, { "epoch": 0.5607476635514018, "eval_loss": 0.1771223098039627, "eval_runtime": 26.532, "eval_samples_per_second": 18.845, "eval_steps_per_second": 9.423, "step": 750 }, { "epoch": 0.6011214953271028, "grad_norm": 2.3800840377807617, "learning_rate": 4.438902743142144e-07, "loss": 0.1808, "step": 804 }, { "epoch": 0.6512149532710281, "grad_norm": 2.1542205810546875, "learning_rate": 3.881961762261014e-07, "loss": 0.1792, "step": 871 }, { "epoch": 0.7013084112149532, "grad_norm": 1.8653497695922852, "learning_rate": 3.3250207813798835e-07, "loss": 0.1655, "step": 938 }, { "epoch": 0.7476635514018691, "eval_loss": 0.15651728212833405, "eval_runtime": 26.5298, "eval_samples_per_second": 18.847, "eval_steps_per_second": 9.423, "step": 1000 }, { "epoch": 0.7514018691588785, "grad_norm": 1.4686954021453857, "learning_rate": 2.7680798004987534e-07, "loss": 0.1297, "step": 1005 }, { "epoch": 0.8014953271028037, "grad_norm": 1.8491053581237793, "learning_rate": 2.2111388196176226e-07, "loss": 0.1413, "step": 1072 }, { "epoch": 0.851588785046729, "grad_norm": 3.7557272911071777, "learning_rate": 1.6541978387364923e-07, "loss": 0.1543, "step": 1139 }, { "epoch": 0.9016822429906542, "grad_norm": 1.0174674987792969, "learning_rate": 1.0972568578553615e-07, "loss": 0.1353, "step": 1206 }, { "epoch": 0.9345794392523364, "eval_loss": 0.15059247612953186, "eval_runtime": 26.5259, "eval_samples_per_second": 18.85, "eval_steps_per_second": 9.425, "step": 1250 }, { "epoch": 0.9517757009345794, "grad_norm": 2.875340700149536, "learning_rate": 5.403158769742311e-08, "loss": 0.1429, "step": 1273 }, { "epoch": 0.9996261682242991, "step": 1337, "total_flos": 8.688878626214707e+16, "train_loss": 0.29530572784241227, "train_runtime": 1905.129, "train_samples_per_second": 5.616, "train_steps_per_second": 0.702 } ], "logging_steps": 67, "max_steps": 1337, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.688878626214707e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }