{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999158337716991, "eval_steps": 2000, "global_step": 18994, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00010529640939243972, "grad_norm": 6.751945495605469, "learning_rate": 0.0, "loss": 2.8442, "step": 1 }, { "epoch": 0.21059281878487943, "grad_norm": 4.618383884429932, "learning_rate": 6.315290648694187e-06, "loss": 2.5015, "step": 2000 }, { "epoch": 0.21059281878487943, "eval_cosine_accuracy": 0.9469176530838013, "eval_loss": 0.7108581066131592, "eval_runtime": 24.9028, "eval_samples_per_second": 379.756, "eval_steps_per_second": 1.486, "step": 2000 }, { "epoch": 0.42118563756975885, "grad_norm": 4.100494861602783, "learning_rate": 1.2630581297388374e-05, "loss": 1.8564, "step": 4000 }, { "epoch": 0.42118563756975885, "eval_cosine_accuracy": 0.9481865167617798, "eval_loss": 0.6932559013366699, "eval_runtime": 25.2131, "eval_samples_per_second": 375.083, "eval_steps_per_second": 1.467, "step": 4000 }, { "epoch": 0.6317784563546383, "grad_norm": 4.006742477416992, "learning_rate": 1.8949031171019377e-05, "loss": 1.504, "step": 6000 }, { "epoch": 0.6317784563546383, "eval_cosine_accuracy": 0.9519932270050049, "eval_loss": 0.6632474064826965, "eval_runtime": 25.5415, "eval_samples_per_second": 370.261, "eval_steps_per_second": 1.449, "step": 6000 }, { "epoch": 0.8423712751395177, "grad_norm": 3.7652177810668945, "learning_rate": 2.5264321819713565e-05, "loss": 1.2874, "step": 8000 }, { "epoch": 0.8423712751395177, "eval_cosine_accuracy": 0.9577032923698425, "eval_loss": 0.6434259414672852, "eval_runtime": 25.6225, "eval_samples_per_second": 369.089, "eval_steps_per_second": 1.444, "step": 8000 }, { "epoch": 1.0529195160441873, "grad_norm": 3.5891263484954834, "learning_rate": 2.9795333581180082e-05, "loss": 1.1986, "step": 10000 }, { "epoch": 1.0529195160441873, "eval_cosine_accuracy": 0.9589722156524658, "eval_loss": 0.6360689401626587, "eval_runtime": 25.8862, "eval_samples_per_second": 365.33, "eval_steps_per_second": 1.429, "step": 10000 }, { "epoch": 1.263335086796423, "grad_norm": 3.1852927207946777, "learning_rate": 2.516095252036212e-05, "loss": 1.0185, "step": 12000 }, { "epoch": 1.263335086796423, "eval_cosine_accuracy": 0.961192786693573, "eval_loss": 0.6109231114387512, "eval_runtime": 25.7475, "eval_samples_per_second": 367.298, "eval_steps_per_second": 1.437, "step": 12000 }, { "epoch": 1.4737506575486585, "grad_norm": 3.5387814044952393, "learning_rate": 1.624389485225718e-05, "loss": 0.945, "step": 14000 }, { "epoch": 1.4737506575486585, "eval_cosine_accuracy": 0.9624616503715515, "eval_loss": 0.5934072732925415, "eval_runtime": 25.7367, "eval_samples_per_second": 367.451, "eval_steps_per_second": 1.438, "step": 14000 }, { "epoch": 1.6841662283008942, "grad_norm": 3.443584680557251, "learning_rate": 6.802555688395712e-06, "loss": 0.9076, "step": 16000 }, { "epoch": 1.6841662283008942, "eval_cosine_accuracy": 0.9616157412528992, "eval_loss": 0.5919129252433777, "eval_runtime": 25.6697, "eval_samples_per_second": 368.411, "eval_steps_per_second": 1.441, "step": 16000 }, { "epoch": 1.89458179905313, "grad_norm": 3.5262327194213867, "learning_rate": 8.163062833430229e-07, "loss": 0.8964, "step": 18000 }, { "epoch": 1.89458179905313, "eval_cosine_accuracy": 0.9633076190948486, "eval_loss": 0.5839827060699463, "eval_runtime": 25.6367, "eval_samples_per_second": 368.885, "eval_steps_per_second": 1.443, "step": 18000 } ], "logging_steps": 2000, "max_steps": 18994, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }