| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.0, |
| "eval_steps": 500, |
| "global_step": 1320, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.22727272727272727, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 4.943181818181818e-05, |
| "loss": 0.5826, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.45454545454545453, |
| "grad_norm": 0.150390625, |
| "learning_rate": 4.886363636363637e-05, |
| "loss": 0.3128, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6818181818181818, |
| "grad_norm": 0.1298828125, |
| "learning_rate": 4.829545454545455e-05, |
| "loss": 0.289, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.1435546875, |
| "learning_rate": 4.772727272727273e-05, |
| "loss": 0.2702, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.24845334887504578, |
| "eval_runtime": 52.787, |
| "eval_samples_per_second": 14.89, |
| "eval_steps_per_second": 0.474, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.1363636363636362, |
| "grad_norm": 0.1396484375, |
| "learning_rate": 4.715909090909091e-05, |
| "loss": 0.2663, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.3636363636363638, |
| "grad_norm": 0.166015625, |
| "learning_rate": 4.659090909090909e-05, |
| "loss": 0.2591, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.5909090909090908, |
| "grad_norm": 0.162109375, |
| "learning_rate": 4.602272727272727e-05, |
| "loss": 0.2498, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 0.119140625, |
| "learning_rate": 4.545454545454546e-05, |
| "loss": 0.2557, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.2350398302078247, |
| "eval_runtime": 53.0739, |
| "eval_samples_per_second": 14.81, |
| "eval_steps_per_second": 0.471, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.0454545454545454, |
| "grad_norm": 0.1259765625, |
| "learning_rate": 4.488636363636364e-05, |
| "loss": 0.2466, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.2727272727272725, |
| "grad_norm": 0.1416015625, |
| "learning_rate": 4.431818181818182e-05, |
| "loss": 0.2361, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.1337890625, |
| "learning_rate": 4.375e-05, |
| "loss": 0.253, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 0.1513671875, |
| "learning_rate": 4.318181818181819e-05, |
| "loss": 0.2409, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.9545454545454546, |
| "grad_norm": 0.1640625, |
| "learning_rate": 4.261363636363637e-05, |
| "loss": 0.2436, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.22992941737174988, |
| "eval_runtime": 53.1353, |
| "eval_samples_per_second": 14.792, |
| "eval_steps_per_second": 0.47, |
| "step": 660 |
| }, |
| { |
| "epoch": 3.1818181818181817, |
| "grad_norm": 0.16015625, |
| "learning_rate": 4.204545454545455e-05, |
| "loss": 0.2363, |
| "step": 700 |
| }, |
| { |
| "epoch": 3.409090909090909, |
| "grad_norm": 0.1650390625, |
| "learning_rate": 4.1477272727272734e-05, |
| "loss": 0.2256, |
| "step": 750 |
| }, |
| { |
| "epoch": 3.6363636363636362, |
| "grad_norm": 0.158203125, |
| "learning_rate": 4.0909090909090915e-05, |
| "loss": 0.2389, |
| "step": 800 |
| }, |
| { |
| "epoch": 3.8636363636363638, |
| "grad_norm": 0.169921875, |
| "learning_rate": 4.034090909090909e-05, |
| "loss": 0.2291, |
| "step": 850 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.22730520367622375, |
| "eval_runtime": 53.1918, |
| "eval_samples_per_second": 14.777, |
| "eval_steps_per_second": 0.47, |
| "step": 880 |
| }, |
| { |
| "epoch": 4.090909090909091, |
| "grad_norm": 0.1708984375, |
| "learning_rate": 3.9772727272727275e-05, |
| "loss": 0.2318, |
| "step": 900 |
| }, |
| { |
| "epoch": 4.318181818181818, |
| "grad_norm": 0.181640625, |
| "learning_rate": 3.9204545454545456e-05, |
| "loss": 0.2322, |
| "step": 950 |
| }, |
| { |
| "epoch": 4.545454545454545, |
| "grad_norm": 0.1591796875, |
| "learning_rate": 3.8636363636363636e-05, |
| "loss": 0.2194, |
| "step": 1000 |
| }, |
| { |
| "epoch": 4.7727272727272725, |
| "grad_norm": 0.166015625, |
| "learning_rate": 3.8068181818181816e-05, |
| "loss": 0.2277, |
| "step": 1050 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.2329, |
| "step": 1100 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.2264343947172165, |
| "eval_runtime": 54.0094, |
| "eval_samples_per_second": 14.553, |
| "eval_steps_per_second": 0.463, |
| "step": 1100 |
| }, |
| { |
| "epoch": 5.2272727272727275, |
| "grad_norm": 0.1806640625, |
| "learning_rate": 3.6931818181818184e-05, |
| "loss": 0.2234, |
| "step": 1150 |
| }, |
| { |
| "epoch": 5.454545454545454, |
| "grad_norm": 0.19140625, |
| "learning_rate": 3.6363636363636364e-05, |
| "loss": 0.2154, |
| "step": 1200 |
| }, |
| { |
| "epoch": 5.681818181818182, |
| "grad_norm": 0.173828125, |
| "learning_rate": 3.579545454545455e-05, |
| "loss": 0.2349, |
| "step": 1250 |
| }, |
| { |
| "epoch": 5.909090909090909, |
| "grad_norm": 0.162109375, |
| "learning_rate": 3.522727272727273e-05, |
| "loss": 0.2172, |
| "step": 1300 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.22602269053459167, |
| "eval_runtime": 54.0842, |
| "eval_samples_per_second": 14.533, |
| "eval_steps_per_second": 0.462, |
| "step": 1320 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 4400, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 20, |
| "save_steps": 500, |
| "total_flos": 9.285391673846661e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|