{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 1000, "global_step": 14509, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004828585224529213, "grad_norm": 6.790188789367676, "learning_rate": 0.0, "loss": 4.1585, "step": 1 }, { "epoch": 0.48285852245292127, "grad_norm": 6.372687339782715, "learning_rate": 6.028968014484008e-06, "loss": 3.2055, "step": 1000 }, { "epoch": 0.48285852245292127, "eval_cosine_accuracy": 0.9400568008422852, "eval_loss": 0.5675864219665527, "eval_runtime": 35.3676, "eval_samples_per_second": 268.862, "eval_steps_per_second": 1.074, "step": 1000 }, { "epoch": 0.9657170449058425, "grad_norm": 8.225760459899902, "learning_rate": 1.2063971031985518e-05, "loss": 2.0069, "step": 2000 }, { "epoch": 0.9657170449058425, "eval_cosine_accuracy": 0.9479440450668335, "eval_loss": 0.5088897347450256, "eval_runtime": 35.33, "eval_samples_per_second": 269.148, "eval_steps_per_second": 1.076, "step": 2000 }, { "epoch": 1.448142788229619, "grad_norm": 6.469565391540527, "learning_rate": 1.8098974049487025e-05, "loss": 1.8871, "step": 3000 }, { "epoch": 1.448142788229619, "eval_cosine_accuracy": 0.9563571214675903, "eval_loss": 0.4804830253124237, "eval_runtime": 34.5635, "eval_samples_per_second": 275.117, "eval_steps_per_second": 1.099, "step": 3000 }, { "epoch": 1.930535455861071, "grad_norm": 6.702775955200195, "learning_rate": 1.896634978119813e-05, "loss": 1.6352, "step": 4000 }, { "epoch": 1.930535455861071, "eval_cosine_accuracy": 0.9602481722831726, "eval_loss": 0.47556957602500916, "eval_runtime": 34.5281, "eval_samples_per_second": 275.399, "eval_steps_per_second": 1.101, "step": 4000 }, { "epoch": 2.412928123492523, "grad_norm": 5.3217620849609375, "learning_rate": 1.745888033801117e-05, "loss": 1.4252, "step": 5000 }, { "epoch": 2.412928123492523, "eval_cosine_accuracy": 0.9619308114051819, "eval_loss": 0.4694240689277649, "eval_runtime": 34.5322, "eval_samples_per_second": 275.366, "eval_steps_per_second": 1.1, "step": 5000 }, { "epoch": 2.895320791123975, "grad_norm": 4.8427605628967285, "learning_rate": 1.5949901916402598e-05, "loss": 1.2913, "step": 6000 }, { "epoch": 2.895320791123975, "eval_cosine_accuracy": 0.9642444252967834, "eval_loss": 0.45982933044433594, "eval_runtime": 35.5416, "eval_samples_per_second": 267.545, "eval_steps_per_second": 1.069, "step": 6000 }, { "epoch": 3.377713458755427, "grad_norm": 5.968652248382568, "learning_rate": 1.4442432473215635e-05, "loss": 1.1875, "step": 7000 }, { "epoch": 3.377713458755427, "eval_cosine_accuracy": 0.9643495678901672, "eval_loss": 0.4546539783477783, "eval_runtime": 34.7743, "eval_samples_per_second": 273.449, "eval_steps_per_second": 1.093, "step": 7000 }, { "epoch": 3.8601061263868788, "grad_norm": 6.551515102386475, "learning_rate": 1.2933454051607064e-05, "loss": 1.1294, "step": 8000 }, { "epoch": 3.8601061263868788, "eval_cosine_accuracy": 0.9650856852531433, "eval_loss": 0.45289501547813416, "eval_runtime": 34.4015, "eval_samples_per_second": 276.412, "eval_steps_per_second": 1.105, "step": 8000 }, { "epoch": 4.342498794018331, "grad_norm": 7.067383766174316, "learning_rate": 1.14259846084201e-05, "loss": 1.054, "step": 9000 }, { "epoch": 4.342498794018331, "eval_cosine_accuracy": 0.9661373496055603, "eval_loss": 0.4549243152141571, "eval_runtime": 34.3397, "eval_samples_per_second": 276.909, "eval_steps_per_second": 1.107, "step": 9000 }, { "epoch": 4.824891461649782, "grad_norm": 6.8474202156066895, "learning_rate": 9.917006186811529e-06, "loss": 1.0193, "step": 10000 }, { "epoch": 4.824891461649782, "eval_cosine_accuracy": 0.9671889543533325, "eval_loss": 0.4545239210128784, "eval_runtime": 34.4393, "eval_samples_per_second": 276.109, "eval_steps_per_second": 1.103, "step": 10000 }, { "epoch": 5.307284129281235, "grad_norm": 6.33509635925293, "learning_rate": 8.411045722046174e-06, "loss": 0.9886, "step": 11000 }, { "epoch": 5.307284129281235, "eval_cosine_accuracy": 0.9665579795837402, "eval_loss": 0.4523693025112152, "eval_runtime": 35.1677, "eval_samples_per_second": 270.39, "eval_steps_per_second": 1.081, "step": 11000 }, { "epoch": 5.789676796912687, "grad_norm": 4.673709392547607, "learning_rate": 6.903576278859213e-06, "loss": 0.9365, "step": 12000 }, { "epoch": 5.789676796912687, "eval_cosine_accuracy": 0.9676096439361572, "eval_loss": 0.4460844397544861, "eval_runtime": 33.9756, "eval_samples_per_second": 279.877, "eval_steps_per_second": 1.118, "step": 12000 }, { "epoch": 6.272069464544139, "grad_norm": 5.602967739105225, "learning_rate": 5.394597857250642e-06, "loss": 0.929, "step": 13000 }, { "epoch": 6.272069464544139, "eval_cosine_accuracy": 0.9686612486839294, "eval_loss": 0.4498312175273895, "eval_runtime": 34.6216, "eval_samples_per_second": 274.655, "eval_steps_per_second": 1.098, "step": 13000 }, { "epoch": 6.754462132175591, "grad_norm": 5.7968339920043945, "learning_rate": 3.8856194356420705e-06, "loss": 0.91, "step": 14000 }, { "epoch": 6.754462132175591, "eval_cosine_accuracy": 0.9678199887275696, "eval_loss": 0.4490343928337097, "eval_runtime": 34.6271, "eval_samples_per_second": 274.612, "eval_steps_per_second": 1.097, "step": 14000 } ], "logging_steps": 1000, "max_steps": 16568, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }