{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1085, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2304147465437788, "grad_norm": 0.1171875, "learning_rate": 4.9423963133640554e-05, "loss": 0.45, "step": 50 }, { "epoch": 0.4608294930875576, "grad_norm": 0.12060546875, "learning_rate": 4.8847926267281106e-05, "loss": 0.2531, "step": 100 }, { "epoch": 0.6912442396313364, "grad_norm": 0.1259765625, "learning_rate": 4.827188940092166e-05, "loss": 0.2317, "step": 150 }, { "epoch": 0.9216589861751152, "grad_norm": 0.11865234375, "learning_rate": 4.7695852534562216e-05, "loss": 0.2204, "step": 200 }, { "epoch": 1.0, "eval_loss": 0.21880939602851868, "eval_runtime": 52.8765, "eval_samples_per_second": 14.827, "eval_steps_per_second": 0.473, "step": 217 }, { "epoch": 1.1520737327188941, "grad_norm": 0.1201171875, "learning_rate": 4.711981566820277e-05, "loss": 0.2114, "step": 250 }, { "epoch": 1.3824884792626728, "grad_norm": 0.1083984375, "learning_rate": 4.654377880184332e-05, "loss": 0.2109, "step": 300 }, { "epoch": 1.6129032258064515, "grad_norm": 0.11865234375, "learning_rate": 4.596774193548387e-05, "loss": 0.2045, "step": 350 }, { "epoch": 1.8433179723502304, "grad_norm": 0.158203125, "learning_rate": 4.539170506912442e-05, "loss": 0.21, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.20793424546718597, "eval_runtime": 52.6064, "eval_samples_per_second": 14.903, "eval_steps_per_second": 0.475, "step": 434 }, { "epoch": 2.0737327188940093, "grad_norm": 0.12109375, "learning_rate": 4.4815668202764974e-05, "loss": 0.2032, "step": 450 }, { "epoch": 2.3041474654377883, "grad_norm": 0.1201171875, "learning_rate": 4.423963133640553e-05, "loss": 0.2001, "step": 500 }, { "epoch": 2.5345622119815667, "grad_norm": 0.125, "learning_rate": 4.366359447004609e-05, "loss": 0.1953, "step": 550 }, { "epoch": 2.7649769585253456, "grad_norm": 0.130859375, "learning_rate": 4.308755760368664e-05, "loss": 0.1956, "step": 600 }, { "epoch": 2.9953917050691246, "grad_norm": 0.1220703125, "learning_rate": 4.2511520737327194e-05, "loss": 0.1978, "step": 650 }, { "epoch": 3.0, "eval_loss": 0.20394934713840485, "eval_runtime": 52.684, "eval_samples_per_second": 14.881, "eval_steps_per_second": 0.475, "step": 651 }, { "epoch": 3.225806451612903, "grad_norm": 0.1376953125, "learning_rate": 4.1935483870967746e-05, "loss": 0.1875, "step": 700 }, { "epoch": 3.456221198156682, "grad_norm": 0.119140625, "learning_rate": 4.13594470046083e-05, "loss": 0.1883, "step": 750 }, { "epoch": 3.686635944700461, "grad_norm": 0.1376953125, "learning_rate": 4.078341013824885e-05, "loss": 0.1925, "step": 800 }, { "epoch": 3.9170506912442398, "grad_norm": 0.1357421875, "learning_rate": 4.02073732718894e-05, "loss": 0.1903, "step": 850 }, { "epoch": 4.0, "eval_loss": 0.20200659334659576, "eval_runtime": 53.2269, "eval_samples_per_second": 14.729, "eval_steps_per_second": 0.47, "step": 868 }, { "epoch": 4.147465437788019, "grad_norm": 0.140625, "learning_rate": 3.963133640552996e-05, "loss": 0.1838, "step": 900 }, { "epoch": 4.377880184331797, "grad_norm": 0.1455078125, "learning_rate": 3.905529953917051e-05, "loss": 0.182, "step": 950 }, { "epoch": 4.6082949308755765, "grad_norm": 0.1396484375, "learning_rate": 3.847926267281106e-05, "loss": 0.1827, "step": 1000 }, { "epoch": 4.838709677419355, "grad_norm": 0.1416015625, "learning_rate": 3.7903225806451614e-05, "loss": 0.1808, "step": 1050 }, { "epoch": 5.0, "eval_loss": 0.2014361470937729, "eval_runtime": 54.1943, "eval_samples_per_second": 14.466, "eval_steps_per_second": 0.461, "step": 1085 } ], "logging_steps": 50, "max_steps": 4340, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 7.632310487786455e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }