{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.014790468364832, "eval_steps": 200, "global_step": 3660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3286770747740345, "grad_norm": 0.4525283873081207, "learning_rate": 1.9879568983287468e-05, "loss": 2.0164, "step": 200 }, { "epoch": 0.3286770747740345, "eval_loss": 0.808789849281311, "eval_runtime": 648.6859, "eval_samples_per_second": 1.893, "eval_steps_per_second": 0.237, "step": 200 }, { "epoch": 0.657354149548069, "grad_norm": 0.48705440759658813, "learning_rate": 1.9466986959828063e-05, "loss": 1.1703, "step": 400 }, { "epoch": 0.657354149548069, "eval_loss": 0.7549917697906494, "eval_runtime": 645.5309, "eval_samples_per_second": 1.902, "eval_steps_per_second": 0.239, "step": 400 }, { "epoch": 0.9860312243221035, "grad_norm": 0.6883838176727295, "learning_rate": 1.877302694199442e-05, "loss": 1.0269, "step": 600 }, { "epoch": 0.9860312243221035, "eval_loss": 0.7438387870788574, "eval_runtime": 649.872, "eval_samples_per_second": 1.89, "eval_steps_per_second": 0.237, "step": 600 }, { "epoch": 1.314708299096138, "grad_norm": 0.5763583183288574, "learning_rate": 1.78183148246803e-05, "loss": 0.9723, "step": 800 }, { "epoch": 1.314708299096138, "eval_loss": 0.7379218935966492, "eval_runtime": 649.5248, "eval_samples_per_second": 1.891, "eval_steps_per_second": 0.237, "step": 800 }, { "epoch": 1.6433853738701725, "grad_norm": 0.5101306438446045, "learning_rate": 1.6631226582407954e-05, "loss": 0.9505, "step": 1000 }, { "epoch": 1.6433853738701725, "eval_loss": 0.7350366711616516, "eval_runtime": 650.1904, "eval_samples_per_second": 1.889, "eval_steps_per_second": 0.237, "step": 1000 }, { "epoch": 1.972062448644207, "grad_norm": 0.5611714124679565, "learning_rate": 1.524704487799008e-05, "loss": 0.9255, "step": 1200 }, { "epoch": 1.972062448644207, "eval_loss": 0.7326610684394836, "eval_runtime": 652.5211, "eval_samples_per_second": 1.882, "eval_steps_per_second": 0.236, "step": 1200 }, { "epoch": 2.3007395234182417, "grad_norm": 0.5886391401290894, "learning_rate": 1.3706910390450679e-05, "loss": 0.9188, "step": 1400 }, { "epoch": 2.3007395234182417, "eval_loss": 0.7313700914382935, "eval_runtime": 647.8807, "eval_samples_per_second": 1.895, "eval_steps_per_second": 0.238, "step": 1400 }, { "epoch": 2.629416598192276, "grad_norm": 0.5193888545036316, "learning_rate": 1.2056599030859367e-05, "loss": 0.9001, "step": 1600 }, { "epoch": 2.629416598192276, "eval_loss": 0.7305780053138733, "eval_runtime": 648.1334, "eval_samples_per_second": 1.895, "eval_steps_per_second": 0.238, "step": 1600 }, { "epoch": 2.9580936729663105, "grad_norm": 0.580741286277771, "learning_rate": 1.0345161389697083e-05, "loss": 0.8896, "step": 1800 }, { "epoch": 2.9580936729663105, "eval_loss": 0.7289234399795532, "eval_runtime": 648.6387, "eval_samples_per_second": 1.893, "eval_steps_per_second": 0.237, "step": 1800 }, { "epoch": 3.286770747740345, "grad_norm": 0.5864242911338806, "learning_rate": 8.62346485412832e-06, "loss": 0.8851, "step": 2000 }, { "epoch": 3.286770747740345, "eval_loss": 0.7283803224563599, "eval_runtime": 648.5025, "eval_samples_per_second": 1.894, "eval_steps_per_second": 0.237, "step": 2000 }, { "epoch": 3.61544782251438, "grad_norm": 0.8759085536003113, "learning_rate": 6.942681726402474e-06, "loss": 0.893, "step": 2200 }, { "epoch": 3.61544782251438, "eval_loss": 0.7277897596359253, "eval_runtime": 649.5645, "eval_samples_per_second": 1.89, "eval_steps_per_second": 0.237, "step": 2200 }, { "epoch": 3.944124897288414, "grad_norm": 0.6328697204589844, "learning_rate": 5.352768279562315e-06, "loss": 0.8713, "step": 2400 }, { "epoch": 3.944124897288414, "eval_loss": 0.7274895906448364, "eval_runtime": 648.0857, "eval_samples_per_second": 1.895, "eval_steps_per_second": 0.238, "step": 2400 }, { "epoch": 4.272801972062449, "grad_norm": 0.5513933300971985, "learning_rate": 3.900979955999271e-06, "loss": 0.8729, "step": 2600 }, { "epoch": 4.272801972062449, "eval_loss": 0.7273133993148804, "eval_runtime": 648.0021, "eval_samples_per_second": 1.895, "eval_steps_per_second": 0.238, "step": 2600 }, { "epoch": 4.6014790468364835, "grad_norm": 0.4977937638759613, "learning_rate": 2.6304668401566334e-06, "loss": 0.8796, "step": 2800 }, { "epoch": 4.6014790468364835, "eval_loss": 0.727094829082489, "eval_runtime": 650.7111, "eval_samples_per_second": 1.887, "eval_steps_per_second": 0.237, "step": 2800 }, { "epoch": 4.930156121610517, "grad_norm": 0.5514353513717651, "learning_rate": 1.5789911507718824e-06, "loss": 0.8674, "step": 3000 }, { "epoch": 4.930156121610517, "eval_loss": 0.7267969250679016, "eval_runtime": 647.9762, "eval_samples_per_second": 1.895, "eval_steps_per_second": 0.238, "step": 3000 }, { "epoch": 5.258833196384552, "grad_norm": 0.573556661605835, "learning_rate": 7.778048713818975e-07, "loss": 0.8716, "step": 3200 }, { "epoch": 5.258833196384552, "eval_loss": 0.7268173694610596, "eval_runtime": 646.5688, "eval_samples_per_second": 1.899, "eval_steps_per_second": 0.238, "step": 3200 }, { "epoch": 5.587510271158587, "grad_norm": 0.6027952432632446, "learning_rate": 2.507208781817638e-07, "loss": 0.8666, "step": 3400 }, { "epoch": 5.587510271158587, "eval_loss": 0.726804256439209, "eval_runtime": 646.8487, "eval_samples_per_second": 1.898, "eval_steps_per_second": 0.238, "step": 3400 }, { "epoch": 5.916187345932621, "grad_norm": 0.7252454161643982, "learning_rate": 1.340517319543877e-08, "loss": 0.8669, "step": 3600 }, { "epoch": 5.916187345932621, "eval_loss": 0.7267909049987793, "eval_runtime": 645.845, "eval_samples_per_second": 1.901, "eval_steps_per_second": 0.238, "step": 3600 } ], "logging_steps": 200, "max_steps": 3660, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.222777133780173e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }