{ "best_metric": 0.017106017097830772, "best_model_checkpoint": "saves/chess/tactic/checkpoint-1000", "epoch": 5.0, "eval_steps": 1000, "global_step": 3075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16260162601626016, "grad_norm": 4.799216229524151, "learning_rate": 1.6233766233766235e-06, "loss": 0.8994, "step": 100 }, { "epoch": 0.3252032520325203, "grad_norm": 0.7153367237536395, "learning_rate": 3.246753246753247e-06, "loss": 0.0317, "step": 200 }, { "epoch": 0.4878048780487805, "grad_norm": 0.7387556653345047, "learning_rate": 4.870129870129871e-06, "loss": 0.0245, "step": 300 }, { "epoch": 0.6504065040650406, "grad_norm": 1.9740058439148331, "learning_rate": 4.986373880811079e-06, "loss": 0.0222, "step": 400 }, { "epoch": 0.8130081300813008, "grad_norm": 0.6344698055959135, "learning_rate": 4.940833840455932e-06, "loss": 0.0208, "step": 500 }, { "epoch": 0.975609756097561, "grad_norm": 0.6970636016265686, "learning_rate": 4.863863172170709e-06, "loss": 0.02, "step": 600 }, { "epoch": 1.1382113821138211, "grad_norm": 0.6801856070533878, "learning_rate": 4.756453027584134e-06, "loss": 0.0171, "step": 700 }, { "epoch": 1.3008130081300813, "grad_norm": 0.275933332771696, "learning_rate": 4.619986527593033e-06, "loss": 0.0173, "step": 800 }, { "epoch": 1.4634146341463414, "grad_norm": 0.5872805765471233, "learning_rate": 4.4562209519085615e-06, "loss": 0.0174, "step": 900 }, { "epoch": 1.6260162601626016, "grad_norm": 0.3927107277822733, "learning_rate": 4.26726511055776e-06, "loss": 0.0169, "step": 1000 }, { "epoch": 1.6260162601626016, "eval_loss": 0.017106017097830772, "eval_runtime": 190.9398, "eval_samples_per_second": 183.105, "eval_steps_per_second": 0.718, "step": 1000 }, { "epoch": 1.7886178861788617, "grad_norm": 0.33192992931188736, "learning_rate": 4.055552188727706e-06, "loss": 0.0159, "step": 1100 }, { "epoch": 1.951219512195122, "grad_norm": 0.4102076449010124, "learning_rate": 3.823808414629323e-06, "loss": 0.016, "step": 1200 }, { "epoch": 2.113821138211382, "grad_norm": 0.4366312248115214, "learning_rate": 3.575017953844908e-06, "loss": 0.0126, "step": 1300 }, { "epoch": 2.2764227642276422, "grad_norm": 0.38777672675426217, "learning_rate": 3.3123844822150126e-06, "loss": 0.0119, "step": 1400 }, { "epoch": 2.4390243902439024, "grad_norm": 0.573942741447281, "learning_rate": 3.0392899320907716e-06, "loss": 0.0118, "step": 1500 }, { "epoch": 2.6016260162601625, "grad_norm": 0.4798738867766201, "learning_rate": 2.759250943176377e-06, "loss": 0.0117, "step": 1600 }, { "epoch": 2.7642276422764227, "grad_norm": 0.2747941215742872, "learning_rate": 2.4758735787443878e-06, "loss": 0.0116, "step": 1700 }, { "epoch": 2.926829268292683, "grad_norm": 0.39700972538910706, "learning_rate": 2.192806890343352e-06, "loss": 0.0111, "step": 1800 }, { "epoch": 3.089430894308943, "grad_norm": 0.45186864274833893, "learning_rate": 1.9136959289452223e-06, "loss": 0.0077, "step": 1900 }, { "epoch": 3.252032520325203, "grad_norm": 0.6795786745533399, "learning_rate": 1.6421348076082123e-06, "loss": 0.0049, "step": 2000 }, { "epoch": 3.252032520325203, "eval_loss": 0.023808766156435013, "eval_runtime": 191.1442, "eval_samples_per_second": 182.909, "eval_steps_per_second": 0.717, "step": 2000 }, { "epoch": 3.4146341463414633, "grad_norm": 0.6090255517872498, "learning_rate": 1.3816204200673827e-06, "loss": 0.0052, "step": 2100 }, { "epoch": 3.5772357723577235, "grad_norm": 0.7553892788821762, "learning_rate": 1.1355074112188802e-06, "loss": 0.0048, "step": 2200 }, { "epoch": 3.7398373983739837, "grad_norm": 0.33557205962825315, "learning_rate": 9.069649793430869e-07, "loss": 0.0046, "step": 2300 }, { "epoch": 3.902439024390244, "grad_norm": 0.39121616514997803, "learning_rate": 6.989360663246406e-07, "loss": 0.0043, "step": 2400 }, { "epoch": 4.065040650406504, "grad_norm": 0.37417208848590455, "learning_rate": 5.1409946137705e-07, "loss": 0.003, "step": 2500 }, { "epoch": 4.227642276422764, "grad_norm": 0.3450534162422346, "learning_rate": 3.548353062623949e-07, "loss": 0.0007, "step": 2600 }, { "epoch": 4.390243902439025, "grad_norm": 0.055395596176947635, "learning_rate": 2.231944461955507e-07, "loss": 0.0007, "step": 2700 }, { "epoch": 4.5528455284552845, "grad_norm": 0.7534781076862362, "learning_rate": 1.2087202110147994e-07, "loss": 0.0007, "step": 2800 }, { "epoch": 4.715447154471545, "grad_norm": 0.6312940968696128, "learning_rate": 4.9185637291078724e-08, "loss": 0.0006, "step": 2900 }, { "epoch": 4.878048780487805, "grad_norm": 0.6118892403786919, "learning_rate": 9.058400639009313e-09, "loss": 0.0005, "step": 3000 }, { "epoch": 4.878048780487805, "eval_loss": 0.03251836076378822, "eval_runtime": 191.2785, "eval_samples_per_second": 182.781, "eval_steps_per_second": 0.716, "step": 3000 }, { "epoch": 5.0, "step": 3075, "total_flos": 502860412354560.0, "train_loss": 0.03991650681670119, "train_runtime": 33327.5964, "train_samples_per_second": 47.207, "train_steps_per_second": 0.092 } ], "logging_steps": 100, "max_steps": 3075, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 502860412354560.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }