| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.837993870423606, |
| "eval_steps": 10000, |
| "global_step": 168364, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 2.3758048038773133e-05, |
| "grad_norm": 4.939470291137695, |
| "learning_rate": 0.0, |
| "loss": 3.9948, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.23758048038773136, |
| "grad_norm": 2.9926953315734863, |
| "learning_rate": 3.5623663578047044e-06, |
| "loss": 3.2694, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.23758048038773136, |
| "eval_cosine_accuracy": 0.9539926052093506, |
| "eval_loss": 0.764169454574585, |
| "eval_runtime": 26.3677, |
| "eval_samples_per_second": 358.583, |
| "eval_steps_per_second": 0.721, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.4751609607754627, |
| "grad_norm": 3.001824140548706, |
| "learning_rate": 7.124732715609409e-06, |
| "loss": 2.6376, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.4751609607754627, |
| "eval_cosine_accuracy": 0.9616076350212097, |
| "eval_loss": 0.6928738355636597, |
| "eval_runtime": 26.4682, |
| "eval_samples_per_second": 357.221, |
| "eval_steps_per_second": 0.718, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.712741441163194, |
| "grad_norm": 2.8817107677459717, |
| "learning_rate": 1.0687099073414113e-05, |
| "loss": 2.4101, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.712741441163194, |
| "eval_cosine_accuracy": 0.9626652598381042, |
| "eval_loss": 0.6757322549819946, |
| "eval_runtime": 26.9757, |
| "eval_samples_per_second": 350.5, |
| "eval_steps_per_second": 0.704, |
| "step": 30000 |
| }, |
| { |
| "epoch": 0.9503219215509254, |
| "grad_norm": 2.5843374729156494, |
| "learning_rate": 1.4249465431218818e-05, |
| "loss": 2.274, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.9503219215509254, |
| "eval_cosine_accuracy": 0.9663670063018799, |
| "eval_loss": 0.6607074737548828, |
| "eval_runtime": 27.5896, |
| "eval_samples_per_second": 342.701, |
| "eval_steps_per_second": 0.689, |
| "step": 40000 |
| }, |
| { |
| "epoch": 1.1879024019386568, |
| "grad_norm": 2.4177308082580566, |
| "learning_rate": 1.7811831789023522e-05, |
| "loss": 2.2229, |
| "step": 50000 |
| }, |
| { |
| "epoch": 1.1879024019386568, |
| "eval_cosine_accuracy": 0.9650978446006775, |
| "eval_loss": 0.6596832871437073, |
| "eval_runtime": 28.4294, |
| "eval_samples_per_second": 332.578, |
| "eval_steps_per_second": 0.668, |
| "step": 50000 |
| }, |
| { |
| "epoch": 1.425482882326388, |
| "grad_norm": 2.5173802375793457, |
| "learning_rate": 2.1374198146828226e-05, |
| "loss": 2.1147, |
| "step": 60000 |
| }, |
| { |
| "epoch": 1.425482882326388, |
| "eval_cosine_accuracy": 0.967107355594635, |
| "eval_loss": 0.6737655997276306, |
| "eval_runtime": 28.0648, |
| "eval_samples_per_second": 336.899, |
| "eval_steps_per_second": 0.677, |
| "step": 60000 |
| }, |
| { |
| "epoch": 1.6630633627141194, |
| "grad_norm": 2.5599939823150635, |
| "learning_rate": 2.4935851746258018e-05, |
| "loss": 2.0603, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.6630633627141194, |
| "eval_cosine_accuracy": 0.9672130942344666, |
| "eval_loss": 0.6480635404586792, |
| "eval_runtime": 27.0703, |
| "eval_samples_per_second": 349.276, |
| "eval_steps_per_second": 0.702, |
| "step": 70000 |
| }, |
| { |
| "epoch": 1.9006438431018506, |
| "grad_norm": 2.274336099624634, |
| "learning_rate": 2.8498218104062724e-05, |
| "loss": 2.0208, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.9006438431018506, |
| "eval_cosine_accuracy": 0.9679534435272217, |
| "eval_loss": 0.6634311079978943, |
| "eval_runtime": 26.5714, |
| "eval_samples_per_second": 355.834, |
| "eval_steps_per_second": 0.715, |
| "step": 80000 |
| }, |
| { |
| "epoch": 2.138224323489582, |
| "grad_norm": 38.125125885009766, |
| "learning_rate": 2.9652644925951974e-05, |
| "loss": 1.4874, |
| "step": 90000 |
| }, |
| { |
| "epoch": 2.138224323489582, |
| "eval_cosine_accuracy": 0.9315705895423889, |
| "eval_loss": 0.8910566568374634, |
| "eval_runtime": 27.0766, |
| "eval_samples_per_second": 349.195, |
| "eval_steps_per_second": 0.702, |
| "step": 90000 |
| }, |
| { |
| "epoch": 2.21436886745385, |
| "grad_norm": 1.930785059928894, |
| "learning_rate": 2.7474065271765394e-05, |
| "loss": 1.7984, |
| "step": 100000 |
| }, |
| { |
| "epoch": 2.21436886745385, |
| "eval_cosine_accuracy": 0.9664727449417114, |
| "eval_loss": 0.6721837520599365, |
| "eval_runtime": 29.2032, |
| "eval_samples_per_second": 323.766, |
| "eval_steps_per_second": 0.651, |
| "step": 100000 |
| }, |
| { |
| "epoch": 2.4519493478415813, |
| "grad_norm": 1.9230319261550903, |
| "learning_rate": 2.3579735325035144e-05, |
| "loss": 1.9529, |
| "step": 110000 |
| }, |
| { |
| "epoch": 2.4519493478415813, |
| "eval_cosine_accuracy": 0.9691168665885925, |
| "eval_loss": 0.6769542694091797, |
| "eval_runtime": 26.9479, |
| "eval_samples_per_second": 350.863, |
| "eval_steps_per_second": 0.705, |
| "step": 110000 |
| }, |
| { |
| "epoch": 2.6895298282293125, |
| "grad_norm": 1.9773123264312744, |
| "learning_rate": 1.8505599334305437e-05, |
| "loss": 1.9337, |
| "step": 120000 |
| }, |
| { |
| "epoch": 2.6895298282293125, |
| "eval_cosine_accuracy": 0.9710206389427185, |
| "eval_loss": 0.6512405276298523, |
| "eval_runtime": 26.87, |
| "eval_samples_per_second": 351.879, |
| "eval_steps_per_second": 0.707, |
| "step": 120000 |
| }, |
| { |
| "epoch": 3.0885462450405075, |
| "grad_norm": 12.169724464416504, |
| "learning_rate": 1.2950387678813519e-05, |
| "loss": 1.6912, |
| "step": 130000 |
| }, |
| { |
| "epoch": 3.0885462450405075, |
| "eval_cosine_accuracy": 0.9476467370986938, |
| "eval_loss": 0.8111016750335693, |
| "eval_runtime": 26.3122, |
| "eval_samples_per_second": 359.339, |
| "eval_steps_per_second": 0.722, |
| "step": 130000 |
| }, |
| { |
| "epoch": 3.164120595851845, |
| "grad_norm": 1.8629993200302124, |
| "learning_rate": 7.675028535386121e-06, |
| "loss": 1.4797, |
| "step": 140000 |
| }, |
| { |
| "epoch": 3.164120595851845, |
| "eval_cosine_accuracy": 0.9709148406982422, |
| "eval_loss": 0.6482858061790466, |
| "eval_runtime": 29.0766, |
| "eval_samples_per_second": 325.176, |
| "eval_steps_per_second": 0.653, |
| "step": 140000 |
| }, |
| { |
| "epoch": 3.401701076239576, |
| "grad_norm": 1.8768196105957031, |
| "learning_rate": 3.407595832899329e-06, |
| "loss": 1.9017, |
| "step": 150000 |
| }, |
| { |
| "epoch": 3.401701076239576, |
| "eval_cosine_accuracy": 0.9719725251197815, |
| "eval_loss": 0.6554318070411682, |
| "eval_runtime": 27.6894, |
| "eval_samples_per_second": 341.466, |
| "eval_steps_per_second": 0.686, |
| "step": 150000 |
| }, |
| { |
| "epoch": 3.6392815566273073, |
| "grad_norm": 2.0169677734375, |
| "learning_rate": 7.356406115445186e-07, |
| "loss": 1.8975, |
| "step": 160000 |
| }, |
| { |
| "epoch": 3.6392815566273073, |
| "eval_cosine_accuracy": 0.9717609882354736, |
| "eval_loss": 0.6504931449890137, |
| "eval_runtime": 27.7147, |
| "eval_samples_per_second": 341.155, |
| "eval_steps_per_second": 0.686, |
| "step": 160000 |
| } |
| ], |
| "logging_steps": 10000, |
| "max_steps": 168364, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 30000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 512, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|