{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 2000, "global_step": 30786, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.496459429610862e-05, "grad_norm": 8.543986320495605, "learning_rate": 0.0, "loss": 2.8055, "step": 1 }, { "epoch": 0.12992918859221725, "grad_norm": 4.228509902954102, "learning_rate": 9.740449584199584e-07, "loss": 2.5053, "step": 2000 }, { "epoch": 0.12992918859221725, "eval_cosine_accuracy": 0.9490216970443726, "eval_loss": 0.9158492088317871, "eval_runtime": 24.8196, "eval_samples_per_second": 380.948, "eval_steps_per_second": 1.491, "step": 2000 }, { "epoch": 0.2598583771844345, "grad_norm": 4.29846715927124, "learning_rate": 1.948577182952183e-06, "loss": 2.0544, "step": 4000 }, { "epoch": 0.2598583771844345, "eval_cosine_accuracy": 0.9499735832214355, "eval_loss": 0.9200904369354248, "eval_runtime": 24.5453, "eval_samples_per_second": 385.206, "eval_steps_per_second": 1.507, "step": 4000 }, { "epoch": 0.38978756577665175, "grad_norm": 4.714716911315918, "learning_rate": 2.9226221413721413e-06, "loss": 1.7583, "step": 6000 }, { "epoch": 0.38978756577665175, "eval_cosine_accuracy": 0.9456372261047363, "eval_loss": 0.8998962640762329, "eval_runtime": 24.5547, "eval_samples_per_second": 385.059, "eval_steps_per_second": 1.507, "step": 6000 }, { "epoch": 0.519716754368869, "grad_norm": 4.514920234680176, "learning_rate": 3.8966670997921e-06, "loss": 1.5402, "step": 8000 }, { "epoch": 0.519716754368869, "eval_cosine_accuracy": 0.9466948509216309, "eval_loss": 0.8402906060218811, "eval_runtime": 24.5672, "eval_samples_per_second": 384.862, "eval_steps_per_second": 1.506, "step": 8000 }, { "epoch": 0.6496459429610862, "grad_norm": 4.10648250579834, "learning_rate": 4.871199324324324e-06, "loss": 1.3738, "step": 10000 }, { "epoch": 0.6496459429610862, "eval_cosine_accuracy": 0.947012186050415, "eval_loss": 0.8332843780517578, "eval_runtime": 24.6002, "eval_samples_per_second": 384.346, "eval_steps_per_second": 1.504, "step": 10000 }, { "epoch": 0.7795751315533035, "grad_norm": 4.327205657958984, "learning_rate": 5.844757016632017e-06, "loss": 1.241, "step": 12000 }, { "epoch": 0.7795751315533035, "eval_cosine_accuracy": 0.9478582739830017, "eval_loss": 0.8381510972976685, "eval_runtime": 24.3575, "eval_samples_per_second": 388.176, "eval_steps_per_second": 1.519, "step": 12000 }, { "epoch": 0.9095043201455207, "grad_norm": 4.739709854125977, "learning_rate": 6.818801975051975e-06, "loss": 1.1337, "step": 14000 }, { "epoch": 0.9095043201455207, "eval_cosine_accuracy": 0.9480698108673096, "eval_loss": 0.8515253663063049, "eval_runtime": 24.5042, "eval_samples_per_second": 385.853, "eval_steps_per_second": 1.51, "step": 14000 }, { "epoch": 1.039433508737738, "grad_norm": 3.9595558643341064, "learning_rate": 7.7933341995842e-06, "loss": 1.3481, "step": 16000 }, { "epoch": 1.039433508737738, "eval_cosine_accuracy": 0.9481755495071411, "eval_loss": 0.861483097076416, "eval_runtime": 24.9212, "eval_samples_per_second": 379.395, "eval_steps_per_second": 1.485, "step": 16000 }, { "epoch": 1.1693626973299551, "grad_norm": 3.8362483978271484, "learning_rate": 8.767379158004158e-06, "loss": 0.9775, "step": 18000 }, { "epoch": 1.1693626973299551, "eval_cosine_accuracy": 0.9490216970443726, "eval_loss": 0.866976261138916, "eval_runtime": 24.7567, "eval_samples_per_second": 381.917, "eval_steps_per_second": 1.495, "step": 18000 }, { "epoch": 1.2992918859221725, "grad_norm": 3.660410165786743, "learning_rate": 9.740936850311851e-06, "loss": 0.9106, "step": 20000 }, { "epoch": 1.2992918859221725, "eval_cosine_accuracy": 0.9511369466781616, "eval_loss": 0.8142719268798828, "eval_runtime": 24.6537, "eval_samples_per_second": 383.513, "eval_steps_per_second": 1.501, "step": 20000 }, { "epoch": 1.4292210745143896, "grad_norm": 3.5640735626220703, "learning_rate": 1.0715469074844075e-05, "loss": 0.8581, "step": 22000 }, { "epoch": 1.4292210745143896, "eval_cosine_accuracy": 0.9521946310997009, "eval_loss": 0.7982929944992065, "eval_runtime": 24.9083, "eval_samples_per_second": 379.592, "eval_steps_per_second": 1.485, "step": 22000 }, { "epoch": 1.559150263106607, "grad_norm": 3.663590908050537, "learning_rate": 1.1689514033264035e-05, "loss": 0.8119, "step": 24000 }, { "epoch": 1.559150263106607, "eval_cosine_accuracy": 0.9538868069648743, "eval_loss": 0.7904353141784668, "eval_runtime": 24.419, "eval_samples_per_second": 387.198, "eval_steps_per_second": 1.515, "step": 24000 }, { "epoch": 1.6890794516988241, "grad_norm": 3.101531505584717, "learning_rate": 1.2664046257796258e-05, "loss": 0.775, "step": 26000 }, { "epoch": 1.6890794516988241, "eval_cosine_accuracy": 0.9561078548431396, "eval_loss": 0.7433677315711975, "eval_runtime": 24.4716, "eval_samples_per_second": 386.366, "eval_steps_per_second": 1.512, "step": 26000 }, { "epoch": 1.8190086402910413, "grad_norm": 3.2498459815979004, "learning_rate": 1.3638091216216216e-05, "loss": 0.7376, "step": 28000 }, { "epoch": 1.8190086402910413, "eval_cosine_accuracy": 0.9565309286117554, "eval_loss": 0.779238760471344, "eval_runtime": 24.4481, "eval_samples_per_second": 386.737, "eval_steps_per_second": 1.513, "step": 28000 }, { "epoch": 1.9489378288832586, "grad_norm": 2.890700101852417, "learning_rate": 1.4612136174636175e-05, "loss": 0.7072, "step": 30000 }, { "epoch": 1.9489378288832586, "eval_cosine_accuracy": 0.9578000903129578, "eval_loss": 0.7882058024406433, "eval_runtime": 24.4341, "eval_samples_per_second": 386.959, "eval_steps_per_second": 1.514, "step": 30000 } ], "logging_steps": 2000, "max_steps": 153930, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }