| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 1085, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.2304147465437788, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 4.9423963133640554e-05, | |
| "loss": 0.45, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4608294930875576, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 4.8847926267281106e-05, | |
| "loss": 0.2531, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6912442396313364, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 4.827188940092166e-05, | |
| "loss": 0.2317, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9216589861751152, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 4.7695852534562216e-05, | |
| "loss": 0.2204, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.21880939602851868, | |
| "eval_runtime": 52.8765, | |
| "eval_samples_per_second": 14.827, | |
| "eval_steps_per_second": 0.473, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.1520737327188941, | |
| "grad_norm": 0.1201171875, | |
| "learning_rate": 4.711981566820277e-05, | |
| "loss": 0.2114, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3824884792626728, | |
| "grad_norm": 0.1083984375, | |
| "learning_rate": 4.654377880184332e-05, | |
| "loss": 0.2109, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 4.596774193548387e-05, | |
| "loss": 0.2045, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8433179723502304, | |
| "grad_norm": 0.158203125, | |
| "learning_rate": 4.539170506912442e-05, | |
| "loss": 0.21, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.20793424546718597, | |
| "eval_runtime": 52.6064, | |
| "eval_samples_per_second": 14.903, | |
| "eval_steps_per_second": 0.475, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 2.0737327188940093, | |
| "grad_norm": 0.12109375, | |
| "learning_rate": 4.4815668202764974e-05, | |
| "loss": 0.2032, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.3041474654377883, | |
| "grad_norm": 0.1201171875, | |
| "learning_rate": 4.423963133640553e-05, | |
| "loss": 0.2001, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.5345622119815667, | |
| "grad_norm": 0.125, | |
| "learning_rate": 4.366359447004609e-05, | |
| "loss": 0.1953, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.7649769585253456, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 4.308755760368664e-05, | |
| "loss": 0.1956, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.9953917050691246, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 4.2511520737327194e-05, | |
| "loss": 0.1978, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.20394934713840485, | |
| "eval_runtime": 52.684, | |
| "eval_samples_per_second": 14.881, | |
| "eval_steps_per_second": 0.475, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 4.1935483870967746e-05, | |
| "loss": 0.1875, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.456221198156682, | |
| "grad_norm": 0.119140625, | |
| "learning_rate": 4.13594470046083e-05, | |
| "loss": 0.1883, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.686635944700461, | |
| "grad_norm": 0.1376953125, | |
| "learning_rate": 4.078341013824885e-05, | |
| "loss": 0.1925, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 3.9170506912442398, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 4.02073732718894e-05, | |
| "loss": 0.1903, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.20200659334659576, | |
| "eval_runtime": 53.2269, | |
| "eval_samples_per_second": 14.729, | |
| "eval_steps_per_second": 0.47, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 4.147465437788019, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 3.963133640552996e-05, | |
| "loss": 0.1838, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.377880184331797, | |
| "grad_norm": 0.1455078125, | |
| "learning_rate": 3.905529953917051e-05, | |
| "loss": 0.182, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 4.6082949308755765, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 3.847926267281106e-05, | |
| "loss": 0.1827, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 3.7903225806451614e-05, | |
| "loss": 0.1808, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.2014361470937729, | |
| "eval_runtime": 54.1943, | |
| "eval_samples_per_second": 14.466, | |
| "eval_steps_per_second": 0.461, | |
| "step": 1085 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 4340, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 20, | |
| "save_steps": 500, | |
| "total_flos": 7.632310487786455e+17, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |