| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 7.99324650265316, | |
| "eval_steps": 1000, | |
| "global_step": 16568, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0004828585224529213, | |
| "grad_norm": 6.790188789367676, | |
| "learning_rate": 0.0, | |
| "loss": 4.1585, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.48285852245292127, | |
| "grad_norm": 6.372687339782715, | |
| "learning_rate": 6.028968014484008e-06, | |
| "loss": 3.2055, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.48285852245292127, | |
| "eval_cosine_accuracy": 0.9400568008422852, | |
| "eval_loss": 0.5675864219665527, | |
| "eval_runtime": 35.3676, | |
| "eval_samples_per_second": 268.862, | |
| "eval_steps_per_second": 1.074, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.9657170449058425, | |
| "grad_norm": 8.225760459899902, | |
| "learning_rate": 1.2063971031985518e-05, | |
| "loss": 2.0069, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.9657170449058425, | |
| "eval_cosine_accuracy": 0.9479440450668335, | |
| "eval_loss": 0.5088897347450256, | |
| "eval_runtime": 35.33, | |
| "eval_samples_per_second": 269.148, | |
| "eval_steps_per_second": 1.076, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.448142788229619, | |
| "grad_norm": 6.469565391540527, | |
| "learning_rate": 1.8098974049487025e-05, | |
| "loss": 1.8871, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.448142788229619, | |
| "eval_cosine_accuracy": 0.9563571214675903, | |
| "eval_loss": 0.4804830253124237, | |
| "eval_runtime": 34.5635, | |
| "eval_samples_per_second": 275.117, | |
| "eval_steps_per_second": 1.099, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.930535455861071, | |
| "grad_norm": 6.702775955200195, | |
| "learning_rate": 1.896634978119813e-05, | |
| "loss": 1.6352, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.930535455861071, | |
| "eval_cosine_accuracy": 0.9602481722831726, | |
| "eval_loss": 0.47556957602500916, | |
| "eval_runtime": 34.5281, | |
| "eval_samples_per_second": 275.399, | |
| "eval_steps_per_second": 1.101, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.412928123492523, | |
| "grad_norm": 5.3217620849609375, | |
| "learning_rate": 1.745888033801117e-05, | |
| "loss": 1.4252, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.412928123492523, | |
| "eval_cosine_accuracy": 0.9619308114051819, | |
| "eval_loss": 0.4694240689277649, | |
| "eval_runtime": 34.5322, | |
| "eval_samples_per_second": 275.366, | |
| "eval_steps_per_second": 1.1, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 2.895320791123975, | |
| "grad_norm": 4.8427605628967285, | |
| "learning_rate": 1.5949901916402598e-05, | |
| "loss": 1.2913, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.895320791123975, | |
| "eval_cosine_accuracy": 0.9642444252967834, | |
| "eval_loss": 0.45982933044433594, | |
| "eval_runtime": 35.5416, | |
| "eval_samples_per_second": 267.545, | |
| "eval_steps_per_second": 1.069, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.377713458755427, | |
| "grad_norm": 5.968652248382568, | |
| "learning_rate": 1.4442432473215635e-05, | |
| "loss": 1.1875, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.377713458755427, | |
| "eval_cosine_accuracy": 0.9643495678901672, | |
| "eval_loss": 0.4546539783477783, | |
| "eval_runtime": 34.7743, | |
| "eval_samples_per_second": 273.449, | |
| "eval_steps_per_second": 1.093, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 3.8601061263868788, | |
| "grad_norm": 6.551515102386475, | |
| "learning_rate": 1.2933454051607064e-05, | |
| "loss": 1.1294, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 3.8601061263868788, | |
| "eval_cosine_accuracy": 0.9650856852531433, | |
| "eval_loss": 0.45289501547813416, | |
| "eval_runtime": 34.4015, | |
| "eval_samples_per_second": 276.412, | |
| "eval_steps_per_second": 1.105, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 4.342498794018331, | |
| "grad_norm": 7.067383766174316, | |
| "learning_rate": 1.14259846084201e-05, | |
| "loss": 1.054, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.342498794018331, | |
| "eval_cosine_accuracy": 0.9661373496055603, | |
| "eval_loss": 0.4549243152141571, | |
| "eval_runtime": 34.3397, | |
| "eval_samples_per_second": 276.909, | |
| "eval_steps_per_second": 1.107, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 4.824891461649782, | |
| "grad_norm": 6.8474202156066895, | |
| "learning_rate": 9.917006186811529e-06, | |
| "loss": 1.0193, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 4.824891461649782, | |
| "eval_cosine_accuracy": 0.9671889543533325, | |
| "eval_loss": 0.4545239210128784, | |
| "eval_runtime": 34.4393, | |
| "eval_samples_per_second": 276.109, | |
| "eval_steps_per_second": 1.103, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 5.307284129281235, | |
| "grad_norm": 6.33509635925293, | |
| "learning_rate": 8.411045722046174e-06, | |
| "loss": 0.9886, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 5.307284129281235, | |
| "eval_cosine_accuracy": 0.9665579795837402, | |
| "eval_loss": 0.4523693025112152, | |
| "eval_runtime": 35.1677, | |
| "eval_samples_per_second": 270.39, | |
| "eval_steps_per_second": 1.081, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 5.789676796912687, | |
| "grad_norm": 4.673709392547607, | |
| "learning_rate": 6.903576278859213e-06, | |
| "loss": 0.9365, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 5.789676796912687, | |
| "eval_cosine_accuracy": 0.9676096439361572, | |
| "eval_loss": 0.4460844397544861, | |
| "eval_runtime": 33.9756, | |
| "eval_samples_per_second": 279.877, | |
| "eval_steps_per_second": 1.118, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 6.272069464544139, | |
| "grad_norm": 5.602967739105225, | |
| "learning_rate": 5.394597857250642e-06, | |
| "loss": 0.929, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 6.272069464544139, | |
| "eval_cosine_accuracy": 0.9686612486839294, | |
| "eval_loss": 0.4498312175273895, | |
| "eval_runtime": 34.6216, | |
| "eval_samples_per_second": 274.655, | |
| "eval_steps_per_second": 1.098, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 6.754462132175591, | |
| "grad_norm": 5.7968339920043945, | |
| "learning_rate": 3.8856194356420705e-06, | |
| "loss": 0.91, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 6.754462132175591, | |
| "eval_cosine_accuracy": 0.9678199887275696, | |
| "eval_loss": 0.4490343928337097, | |
| "eval_runtime": 34.6271, | |
| "eval_samples_per_second": 274.612, | |
| "eval_steps_per_second": 1.097, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 7.236854799807043, | |
| "grad_norm": 6.673821926116943, | |
| "learning_rate": 2.3766410140334995e-06, | |
| "loss": 0.8904, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 7.236854799807043, | |
| "eval_cosine_accuracy": 0.9683457612991333, | |
| "eval_loss": 0.448638916015625, | |
| "eval_runtime": 34.8102, | |
| "eval_samples_per_second": 273.167, | |
| "eval_steps_per_second": 1.092, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 7.719247467438495, | |
| "grad_norm": 4.653373718261719, | |
| "learning_rate": 8.676625924249284e-07, | |
| "loss": 0.8718, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 7.719247467438495, | |
| "eval_cosine_accuracy": 0.9684509634971619, | |
| "eval_loss": 0.4478535056114197, | |
| "eval_runtime": 35.902, | |
| "eval_samples_per_second": 264.86, | |
| "eval_steps_per_second": 1.058, | |
| "step": 16000 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 16568, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 8, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |