{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.991869918699187,
  "eval_steps": 500,
  "global_step": 276,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.10840108401084012,
      "grad_norm": 1.2238898092954031,
      "learning_rate": 5e-06,
      "loss": 1.1139,
      "step": 10
    },
    {
      "epoch": 0.21680216802168023,
      "grad_norm": 0.7661987078288646,
      "learning_rate": 5e-06,
      "loss": 1.0234,
      "step": 20
    },
    {
      "epoch": 0.3252032520325203,
      "grad_norm": 0.6931478916835131,
      "learning_rate": 5e-06,
      "loss": 1.0067,
      "step": 30
    },
    {
      "epoch": 0.43360433604336046,
      "grad_norm": 0.6288766129904495,
      "learning_rate": 5e-06,
      "loss": 1.0004,
      "step": 40
    },
    {
      "epoch": 0.5420054200542005,
      "grad_norm": 0.6511705997239466,
      "learning_rate": 5e-06,
      "loss": 0.9872,
      "step": 50
    },
    {
      "epoch": 0.6504065040650406,
      "grad_norm": 0.6237606284279241,
      "learning_rate": 5e-06,
      "loss": 0.983,
      "step": 60
    },
    {
      "epoch": 0.7588075880758808,
      "grad_norm": 0.6061811934000912,
      "learning_rate": 5e-06,
      "loss": 0.977,
      "step": 70
    },
    {
      "epoch": 0.8672086720867209,
      "grad_norm": 0.5691567098461193,
      "learning_rate": 5e-06,
      "loss": 0.9609,
      "step": 80
    },
    {
      "epoch": 0.975609756097561,
      "grad_norm": 0.579566994994573,
      "learning_rate": 5e-06,
      "loss": 0.967,
      "step": 90
    },
    {
      "epoch": 0.997289972899729,
      "eval_loss": 0.9647226333618164,
      "eval_runtime": 97.7674,
      "eval_samples_per_second": 25.417,
      "eval_steps_per_second": 0.399,
      "step": 92
    },
    {
      "epoch": 1.084010840108401,
      "grad_norm": 0.7893915919770154,
      "learning_rate": 5e-06,
      "loss": 1.0085,
      "step": 100
    },
    {
      "epoch": 1.1924119241192412,
      "grad_norm": 0.6881452344372211,
      "learning_rate": 5e-06,
      "loss": 0.9266,
      "step": 110
    },
    {
      "epoch": 1.3008130081300813,
      "grad_norm": 0.7094638595829068,
      "learning_rate": 5e-06,
      "loss": 0.9176,
      "step": 120
    },
    {
      "epoch": 1.4092140921409215,
      "grad_norm": 0.757737014933452,
      "learning_rate": 5e-06,
      "loss": 0.9165,
      "step": 130
    },
    {
      "epoch": 1.5176151761517616,
      "grad_norm": 0.7420578994185969,
      "learning_rate": 5e-06,
      "loss": 0.9178,
      "step": 140
    },
    {
      "epoch": 1.6260162601626016,
      "grad_norm": 0.6872431636486115,
      "learning_rate": 5e-06,
      "loss": 0.9095,
      "step": 150
    },
    {
      "epoch": 1.7344173441734418,
      "grad_norm": 0.6839268163247622,
      "learning_rate": 5e-06,
      "loss": 0.922,
      "step": 160
    },
    {
      "epoch": 1.8428184281842819,
      "grad_norm": 0.6646443132529639,
      "learning_rate": 5e-06,
      "loss": 0.9161,
      "step": 170
    },
    {
      "epoch": 1.951219512195122,
      "grad_norm": 0.7695800503499497,
      "learning_rate": 5e-06,
      "loss": 0.917,
      "step": 180
    },
    {
      "epoch": 1.994579945799458,
      "eval_loss": 0.9558340907096863,
      "eval_runtime": 98.5336,
      "eval_samples_per_second": 25.22,
      "eval_steps_per_second": 0.396,
      "step": 184
    },
    {
      "epoch": 2.059620596205962,
      "grad_norm": 0.8673539202787472,
      "learning_rate": 5e-06,
      "loss": 0.9576,
      "step": 190
    },
    {
      "epoch": 2.168021680216802,
      "grad_norm": 0.6595504985017855,
      "learning_rate": 5e-06,
      "loss": 0.8711,
      "step": 200
    },
    {
      "epoch": 2.2764227642276422,
      "grad_norm": 0.7339078950029295,
      "learning_rate": 5e-06,
      "loss": 0.8692,
      "step": 210
    },
    {
      "epoch": 2.3848238482384825,
      "grad_norm": 0.7590090712265437,
      "learning_rate": 5e-06,
      "loss": 0.8727,
      "step": 220
    },
    {
      "epoch": 2.4932249322493227,
      "grad_norm": 0.6567177012645716,
      "learning_rate": 5e-06,
      "loss": 0.8714,
      "step": 230
    },
    {
      "epoch": 2.6016260162601625,
      "grad_norm": 0.7927473560858851,
      "learning_rate": 5e-06,
      "loss": 0.8675,
      "step": 240
    },
    {
      "epoch": 2.710027100271003,
      "grad_norm": 0.6928632841645139,
      "learning_rate": 5e-06,
      "loss": 0.8694,
      "step": 250
    },
    {
      "epoch": 2.818428184281843,
      "grad_norm": 0.7142607867077723,
      "learning_rate": 5e-06,
      "loss": 0.8728,
      "step": 260
    },
    {
      "epoch": 2.926829268292683,
      "grad_norm": 0.7218062765913728,
      "learning_rate": 5e-06,
      "loss": 0.8735,
      "step": 270
    },
    {
      "epoch": 2.991869918699187,
      "eval_loss": 0.9591670632362366,
      "eval_runtime": 96.4736,
      "eval_samples_per_second": 25.758,
      "eval_steps_per_second": 0.404,
      "step": 276
    },
    {
      "epoch": 2.991869918699187,
      "step": 276,
      "total_flos": 462100900085760.0,
      "train_loss": 0.935515574786974,
      "train_runtime": 16221.6629,
      "train_samples_per_second": 8.73,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 276,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 462100900085760.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}