{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 1000,
  "global_step": 22440,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0002228163992869875,
      "grad_norm": 5.737454891204834,
      "learning_rate": 0.0,
      "loss": 1.9528,
      "step": 1
    },
    {
      "epoch": 0.22281639928698752,
      "grad_norm": 3.56156587600708,
      "learning_rate": 8.903743315508023e-06,
      "loss": 1.6859,
      "step": 1000
    },
    {
      "epoch": 0.22281639928698752,
      "eval_cosine_accuracy": 0.9496266841888428,
      "eval_loss": 0.5503689050674438,
      "eval_runtime": 34.9667,
      "eval_samples_per_second": 271.944,
      "eval_steps_per_second": 1.087,
      "step": 1000
    },
    {
      "epoch": 0.44563279857397503,
      "grad_norm": 4.381109237670898,
      "learning_rate": 1.7816399286987524e-05,
      "loss": 1.1429,
      "step": 2000
    },
    {
      "epoch": 0.44563279857397503,
      "eval_cosine_accuracy": 0.9598275423049927,
      "eval_loss": 0.4875295162200928,
      "eval_runtime": 34.4488,
      "eval_samples_per_second": 276.033,
      "eval_steps_per_second": 1.103,
      "step": 2000
    },
    {
      "epoch": 0.6684491978609626,
      "grad_norm": 4.41341495513916,
      "learning_rate": 1.9252327193503666e-05,
      "loss": 0.7836,
      "step": 3000
    },
    {
      "epoch": 0.6684491978609626,
      "eval_cosine_accuracy": 0.9656115174293518,
      "eval_loss": 0.45152774453163147,
      "eval_runtime": 37.3805,
      "eval_samples_per_second": 254.384,
      "eval_steps_per_second": 1.017,
      "step": 3000
    },
    {
      "epoch": 0.8912655971479501,
      "grad_norm": 4.65971040725708,
      "learning_rate": 1.8262032085561498e-05,
      "loss": 0.601,
      "step": 4000
    },
    {
      "epoch": 0.8912655971479501,
      "eval_cosine_accuracy": 0.9693974256515503,
      "eval_loss": 0.4410804212093353,
      "eval_runtime": 36.7023,
      "eval_samples_per_second": 259.085,
      "eval_steps_per_second": 1.035,
      "step": 4000
    },
    {
      "epoch": 1.1140819964349375,
      "grad_norm": 3.0785202980041504,
      "learning_rate": 1.7272727272727274e-05,
      "loss": 0.7514,
      "step": 5000
    },
    {
      "epoch": 1.1140819964349375,
      "eval_cosine_accuracy": 0.9719213247299194,
      "eval_loss": 0.4301469624042511,
      "eval_runtime": 37.2185,
      "eval_samples_per_second": 255.491,
      "eval_steps_per_second": 1.021,
      "step": 5000
    },
    {
      "epoch": 1.3368983957219251,
      "grad_norm": 3.4481372833251953,
      "learning_rate": 1.6282432164785106e-05,
      "loss": 0.8103,
      "step": 6000
    },
    {
      "epoch": 1.3368983957219251,
      "eval_cosine_accuracy": 0.9718161821365356,
      "eval_loss": 0.4272350072860718,
      "eval_runtime": 36.6881,
      "eval_samples_per_second": 259.185,
      "eval_steps_per_second": 1.036,
      "step": 6000
    },
    {
      "epoch": 1.5597147950089125,
      "grad_norm": 3.873903512954712,
      "learning_rate": 1.529312735195088e-05,
      "loss": 0.5951,
      "step": 7000
    },
    {
      "epoch": 1.5597147950089125,
      "eval_cosine_accuracy": 0.9736039638519287,
      "eval_loss": 0.3999342918395996,
      "eval_runtime": 34.8565,
      "eval_samples_per_second": 272.804,
      "eval_steps_per_second": 1.09,
      "step": 7000
    },
    {
      "epoch": 1.7825311942959001,
      "grad_norm": 2.972379446029663,
      "learning_rate": 1.4302832244008715e-05,
      "loss": 0.4595,
      "step": 8000
    },
    {
      "epoch": 1.7825311942959001,
      "eval_cosine_accuracy": 0.9741297960281372,
      "eval_loss": 0.39611557126045227,
      "eval_runtime": 35.4253,
      "eval_samples_per_second": 268.424,
      "eval_steps_per_second": 1.073,
      "step": 8000
    },
    {
      "epoch": 2.0053475935828877,
      "grad_norm": 3.5307512283325195,
      "learning_rate": 1.3313527431174491e-05,
      "loss": 0.4257,
      "step": 9000
    },
    {
      "epoch": 2.0053475935828877,
      "eval_cosine_accuracy": 0.9749710559844971,
      "eval_loss": 0.379986435174942,
      "eval_runtime": 34.6941,
      "eval_samples_per_second": 274.082,
      "eval_steps_per_second": 1.095,
      "step": 9000
    },
    {
      "epoch": 2.228163992869875,
      "grad_norm": 2.854247570037842,
      "learning_rate": 1.2323232323232323e-05,
      "loss": 0.7208,
      "step": 10000
    },
    {
      "epoch": 2.228163992869875,
      "eval_cosine_accuracy": 0.9744452834129333,
      "eval_loss": 0.4024476706981659,
      "eval_runtime": 34.6805,
      "eval_samples_per_second": 274.189,
      "eval_steps_per_second": 1.096,
      "step": 10000
    },
    {
      "epoch": 2.450980392156863,
      "grad_norm": 3.3613624572753906,
      "learning_rate": 1.1332937215290158e-05,
      "loss": 0.5693,
      "step": 11000
    },
    {
      "epoch": 2.450980392156863,
      "eval_cosine_accuracy": 0.9767588376998901,
      "eval_loss": 0.38983264565467834,
      "eval_runtime": 37.0648,
      "eval_samples_per_second": 256.55,
      "eval_steps_per_second": 1.025,
      "step": 11000
    },
    {
      "epoch": 2.6737967914438503,
      "grad_norm": 4.482306480407715,
      "learning_rate": 1.034264210734799e-05,
      "loss": 0.4329,
      "step": 12000
    },
    {
      "epoch": 2.6737967914438503,
      "eval_cosine_accuracy": 0.9763382077217102,
      "eval_loss": 0.3736518919467926,
      "eval_runtime": 36.2748,
      "eval_samples_per_second": 262.138,
      "eval_steps_per_second": 1.048,
      "step": 12000
    },
    {
      "epoch": 2.8966131907308377,
      "grad_norm": 3.686260938644409,
      "learning_rate": 9.353337294513766e-06,
      "loss": 0.3665,
      "step": 13000
    },
    {
      "epoch": 2.8966131907308377,
      "eval_cosine_accuracy": 0.9762330651283264,
      "eval_loss": 0.3715299069881439,
      "eval_runtime": 35.4634,
      "eval_samples_per_second": 268.136,
      "eval_steps_per_second": 1.072,
      "step": 13000
    },
    {
      "epoch": 3.1194295900178255,
      "grad_norm": 2.699113607406616,
      "learning_rate": 8.3630421865716e-06,
      "loss": 0.5164,
      "step": 14000
    },
    {
      "epoch": 3.1194295900178255,
      "eval_cosine_accuracy": 0.9758123755455017,
      "eval_loss": 0.38520747423171997,
      "eval_runtime": 35.6843,
      "eval_samples_per_second": 266.476,
      "eval_steps_per_second": 1.065,
      "step": 14000
    },
    {
      "epoch": 3.342245989304813,
      "grad_norm": 3.126573324203491,
      "learning_rate": 7.372747078629432e-06,
      "loss": 0.5911,
      "step": 15000
    },
    {
      "epoch": 3.342245989304813,
      "eval_cosine_accuracy": 0.9763382077217102,
      "eval_loss": 0.38103941082954407,
      "eval_runtime": 38.6917,
      "eval_samples_per_second": 245.763,
      "eval_steps_per_second": 0.982,
      "step": 15000
    },
    {
      "epoch": 3.5650623885918002,
      "grad_norm": 4.231788158416748,
      "learning_rate": 6.382451970687266e-06,
      "loss": 0.44,
      "step": 16000
    },
    {
      "epoch": 3.5650623885918002,
      "eval_cosine_accuracy": 0.9768640398979187,
      "eval_loss": 0.37009483575820923,
      "eval_runtime": 38.6634,
      "eval_samples_per_second": 245.943,
      "eval_steps_per_second": 0.983,
      "step": 16000
    },
    {
      "epoch": 3.787878787878788,
      "grad_norm": 3.735926389694214,
      "learning_rate": 5.39314715785304e-06,
      "loss": 0.3515,
      "step": 17000
    },
    {
      "epoch": 3.787878787878788,
      "eval_cosine_accuracy": 0.9772846698760986,
      "eval_loss": 0.36204129457473755,
      "eval_runtime": 36.6175,
      "eval_samples_per_second": 259.684,
      "eval_steps_per_second": 1.038,
      "step": 17000
    },
    {
      "epoch": 4.010695187165775,
      "grad_norm": 2.816643714904785,
      "learning_rate": 4.403842345018816e-06,
      "loss": 0.344,
      "step": 18000
    },
    {
      "epoch": 4.010695187165775,
      "eval_cosine_accuracy": 0.9770743250846863,
      "eval_loss": 0.35978883504867554,
      "eval_runtime": 36.1523,
      "eval_samples_per_second": 263.026,
      "eval_steps_per_second": 1.051,
      "step": 18000
    },
    {
      "epoch": 4.233511586452763,
      "grad_norm": 2.7152769565582275,
      "learning_rate": 3.4145375321845912e-06,
      "loss": 0.5974,
      "step": 19000
    },
    {
      "epoch": 4.233511586452763,
      "eval_cosine_accuracy": 0.9769691824913025,
      "eval_loss": 0.37677305936813354,
      "eval_runtime": 38.2851,
      "eval_samples_per_second": 248.374,
      "eval_steps_per_second": 0.993,
      "step": 19000
    },
    {
      "epoch": 4.45632798573975,
      "grad_norm": 3.6697447299957275,
      "learning_rate": 2.4242424242424244e-06,
      "loss": 0.4834,
      "step": 20000
    },
    {
      "epoch": 4.45632798573975,
      "eval_cosine_accuracy": 0.9773898124694824,
      "eval_loss": 0.3683561086654663,
      "eval_runtime": 36.823,
      "eval_samples_per_second": 258.236,
      "eval_steps_per_second": 1.032,
      "step": 20000
    },
    {
      "epoch": 4.6791443850267385,
      "grad_norm": 3.8561153411865234,
      "learning_rate": 1.4339473163002576e-06,
      "loss": 0.3707,
      "step": 21000
    },
    {
      "epoch": 4.6791443850267385,
      "eval_cosine_accuracy": 0.9770743250846863,
      "eval_loss": 0.3652487099170685,
      "eval_runtime": 36.7077,
      "eval_samples_per_second": 259.047,
      "eval_steps_per_second": 1.035,
      "step": 21000
    },
    {
      "epoch": 4.901960784313726,
      "grad_norm": 3.9723620414733887,
      "learning_rate": 4.4365220835809074e-07,
      "loss": 0.3266,
      "step": 22000
    },
    {
      "epoch": 4.901960784313726,
      "eval_cosine_accuracy": 0.9773898124694824,
      "eval_loss": 0.361878365278244,
      "eval_runtime": 35.5072,
      "eval_samples_per_second": 267.805,
      "eval_steps_per_second": 1.07,
      "step": 22000
    }
  ],
  "logging_steps": 1000,
  "max_steps": 22440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}