{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998719590268886, "eval_steps": 1000, "global_step": 9366, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00032030749519538755, "grad_norm": 6.107327938079834, "learning_rate": 0.0, "loss": 2.2696, "step": 1 }, { "epoch": 0.3203074951953876, "grad_norm": 3.857839345932007, "learning_rate": 2.977933325424131e-05, "loss": 1.6542, "step": 1000 }, { "epoch": 0.3203074951953876, "eval_cosine_accuracy": 0.9641829133033752, "eval_loss": 0.5915806889533997, "eval_runtime": 28.6636, "eval_samples_per_second": 327.279, "eval_steps_per_second": 1.291, "step": 1000 }, { "epoch": 0.6406149903907752, "grad_norm": 3.1807026863098145, "learning_rate": 2.6220192193617276e-05, "loss": 1.0828, "step": 2000 }, { "epoch": 0.6406149903907752, "eval_cosine_accuracy": 0.9688732624053955, "eval_loss": 0.5482337474822998, "eval_runtime": 27.6361, "eval_samples_per_second": 339.447, "eval_steps_per_second": 1.339, "step": 2000 }, { "epoch": 0.9609224855861627, "grad_norm": 4.243162155151367, "learning_rate": 2.2661051132993238e-05, "loss": 0.8294, "step": 3000 }, { "epoch": 0.9609224855861627, "eval_cosine_accuracy": 0.971858024597168, "eval_loss": 0.5288547277450562, "eval_runtime": 27.5987, "eval_samples_per_second": 339.908, "eval_steps_per_second": 1.341, "step": 3000 }, { "epoch": 1.2810499359795133, "grad_norm": 5.0122575759887695, "learning_rate": 1.910902835449045e-05, "loss": 0.762, "step": 4000 }, { "epoch": 1.2810499359795133, "eval_cosine_accuracy": 0.9751625657081604, "eval_loss": 0.5255293250083923, "eval_runtime": 27.7311, "eval_samples_per_second": 338.284, "eval_steps_per_second": 1.334, "step": 4000 }, { "epoch": 1.6011523687580027, "grad_norm": 3.2703332901000977, "learning_rate": 1.5549887293866415e-05, "loss": 0.7273, "step": 5000 }, { "epoch": 1.6011523687580027, "eval_cosine_accuracy": 0.9755889773368835, "eval_loss": 0.5073443055152893, "eval_runtime": 27.5769, "eval_samples_per_second": 340.176, "eval_steps_per_second": 1.342, "step": 5000 }, { "epoch": 1.9212548015364916, "grad_norm": 3.8906168937683105, "learning_rate": 1.1990746233242378e-05, "loss": 0.6962, "step": 6000 }, { "epoch": 1.9212548015364916, "eval_cosine_accuracy": 0.9772945046424866, "eval_loss": 0.49614542722702026, "eval_runtime": 27.8537, "eval_samples_per_second": 336.795, "eval_steps_per_second": 1.328, "step": 6000 }, { "epoch": 2.2413572343149806, "grad_norm": 3.059576988220215, "learning_rate": 8.431605172618341e-06, "loss": 0.6648, "step": 7000 }, { "epoch": 2.2413572343149806, "eval_cosine_accuracy": 0.9775077104568481, "eval_loss": 0.5007394552230835, "eval_runtime": 28.1686, "eval_samples_per_second": 333.03, "eval_steps_per_second": 1.314, "step": 7000 }, { "epoch": 2.56145966709347, "grad_norm": 3.0882303714752197, "learning_rate": 4.872464111994306e-06, "loss": 0.6362, "step": 8000 }, { "epoch": 2.56145966709347, "eval_cosine_accuracy": 0.9779341220855713, "eval_loss": 0.5001775622367859, "eval_runtime": 27.7547, "eval_samples_per_second": 337.996, "eval_steps_per_second": 1.333, "step": 8000 }, { "epoch": 2.881562099871959, "grad_norm": 2.918884038925171, "learning_rate": 1.3133230513702693e-06, "loss": 0.6228, "step": 9000 }, { "epoch": 2.881562099871959, "eval_cosine_accuracy": 0.9784671068191528, "eval_loss": 0.4983352720737457, "eval_runtime": 28.0171, "eval_samples_per_second": 334.831, "eval_steps_per_second": 1.321, "step": 9000 } ], "logging_steps": 1000, "max_steps": 9366, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }