{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.802228412256268,
  "eval_steps": 30,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.22284122562674094,
      "grad_norm": 0.2538502514362335,
      "learning_rate": 2.785515320334262e-07,
      "loss": 0.0283,
      "step": 10
    },
    {
      "epoch": 0.4456824512534819,
      "grad_norm": 0.13635846972465515,
      "learning_rate": 5.571030640668524e-07,
      "loss": 0.0344,
      "step": 20
    },
    {
      "epoch": 0.6685236768802229,
      "grad_norm": 0.25132906436920166,
      "learning_rate": 8.356545961002786e-07,
      "loss": 0.0305,
      "step": 30
    },
    {
      "epoch": 0.6685236768802229,
      "eval_loss": 0.030975496396422386,
      "eval_runtime": 6.5316,
      "eval_samples_per_second": 229.652,
      "eval_steps_per_second": 14.392,
      "eval_sts_dev_pearson_cosine": 0.7953296758719961,
      "eval_sts_dev_pearson_dot": 0.6855921619048916,
      "eval_sts_dev_pearson_euclidean": 0.7647603423822984,
      "eval_sts_dev_pearson_manhattan": 0.7662305710281121,
      "eval_sts_dev_pearson_max": 0.7953296758719961,
      "eval_sts_dev_spearman_cosine": 0.7938998183894888,
      "eval_sts_dev_spearman_dot": 0.6701160606364611,
      "eval_sts_dev_spearman_euclidean": 0.764275064463694,
      "eval_sts_dev_spearman_manhattan": 0.7663956716038323,
      "eval_sts_dev_spearman_max": 0.7938998183894888,
      "step": 30
    },
    {
      "epoch": 0.8913649025069638,
      "grad_norm": 0.2590219974517822,
      "learning_rate": 1.1142061281337048e-06,
      "loss": 0.0489,
      "step": 40
    },
    {
      "epoch": 1.1337047353760445,
      "grad_norm": 0.2477671355009079,
      "learning_rate": 1.392757660167131e-06,
      "loss": 0.0382,
      "step": 50
    },
    {
      "epoch": 1.3565459610027855,
      "grad_norm": 0.2230578511953354,
      "learning_rate": 1.6713091922005572e-06,
      "loss": 0.0271,
      "step": 60
    },
    {
      "epoch": 1.3565459610027855,
      "eval_loss": 0.02927413582801819,
      "eval_runtime": 6.1022,
      "eval_samples_per_second": 245.812,
      "eval_steps_per_second": 15.404,
      "eval_sts_dev_pearson_cosine": 0.8001627825550413,
      "eval_sts_dev_pearson_dot": 0.7013280153939746,
      "eval_sts_dev_pearson_euclidean": 0.7629781135707555,
      "eval_sts_dev_pearson_manhattan": 0.7647370302448242,
      "eval_sts_dev_pearson_max": 0.8001627825550413,
      "eval_sts_dev_spearman_cosine": 0.7994084764965521,
      "eval_sts_dev_spearman_dot": 0.6877298483304968,
      "eval_sts_dev_spearman_euclidean": 0.7623008729981257,
      "eval_sts_dev_spearman_manhattan": 0.7650295208380897,
      "eval_sts_dev_spearman_max": 0.7994084764965521,
      "step": 60
    },
    {
      "epoch": 1.5793871866295266,
      "grad_norm": 0.23978064954280853,
      "learning_rate": 1.9498607242339835e-06,
      "loss": 0.0344,
      "step": 70
    },
    {
      "epoch": 1.8022284122562673,
      "grad_norm": 0.2269248366355896,
      "learning_rate": 2.2284122562674097e-06,
      "loss": 0.0382,
      "step": 80
    },
    {
      "epoch": 2.0445682451253484,
      "grad_norm": 0.1311478465795517,
      "learning_rate": 2.506963788300836e-06,
      "loss": 0.0419,
      "step": 90
    },
    {
      "epoch": 2.0445682451253484,
      "eval_loss": 0.0279527697712183,
      "eval_runtime": 6.1525,
      "eval_samples_per_second": 243.802,
      "eval_steps_per_second": 15.278,
      "eval_sts_dev_pearson_cosine": 0.8052740525083868,
      "eval_sts_dev_pearson_dot": 0.7129779531910554,
      "eval_sts_dev_pearson_euclidean": 0.7630256540163647,
      "eval_sts_dev_pearson_manhattan": 0.7649555842254796,
      "eval_sts_dev_pearson_max": 0.8052740525083868,
      "eval_sts_dev_spearman_cosine": 0.805932936440032,
      "eval_sts_dev_spearman_dot": 0.7013448783489886,
      "eval_sts_dev_spearman_euclidean": 0.762706783236441,
      "eval_sts_dev_spearman_manhattan": 0.7655443912587759,
      "eval_sts_dev_spearman_max": 0.805932936440032,
      "step": 90
    },
    {
      "epoch": 2.267409470752089,
      "grad_norm": 0.15666936337947845,
      "learning_rate": 2.785515320334262e-06,
      "loss": 0.0244,
      "step": 100
    },
    {
      "epoch": 2.4902506963788302,
      "grad_norm": 0.14549851417541504,
      "learning_rate": 3.064066852367688e-06,
      "loss": 0.0307,
      "step": 110
    },
    {
      "epoch": 2.713091922005571,
      "grad_norm": 0.20197178423404694,
      "learning_rate": 3.3426183844011143e-06,
      "loss": 0.0291,
      "step": 120
    },
    {
      "epoch": 2.713091922005571,
      "eval_loss": 0.02694467455148697,
      "eval_runtime": 6.528,
      "eval_samples_per_second": 229.78,
      "eval_steps_per_second": 14.4,
      "eval_sts_dev_pearson_cosine": 0.8095317257793349,
      "eval_sts_dev_pearson_dot": 0.7228217786137938,
      "eval_sts_dev_pearson_euclidean": 0.7635943588878411,
      "eval_sts_dev_pearson_manhattan": 0.7656672001584354,
      "eval_sts_dev_pearson_max": 0.8095317257793349,
      "eval_sts_dev_spearman_cosine": 0.8107539995821735,
      "eval_sts_dev_spearman_dot": 0.7126247484390617,
      "eval_sts_dev_spearman_euclidean": 0.7634838306489425,
      "eval_sts_dev_spearman_manhattan": 0.7664168478564297,
      "eval_sts_dev_spearman_max": 0.8107539995821735,
      "step": 120
    },
    {
      "epoch": 2.935933147632312,
      "grad_norm": 0.2107369303703308,
      "learning_rate": 3.6211699164345405e-06,
      "loss": 0.038,
      "step": 130
    },
    {
      "epoch": 3.1782729805013927,
      "grad_norm": 0.15846215188503265,
      "learning_rate": 3.899721448467967e-06,
      "loss": 0.0269,
      "step": 140
    },
    {
      "epoch": 3.401114206128134,
      "grad_norm": 0.17715278267860413,
      "learning_rate": 4.178272980501394e-06,
      "loss": 0.0268,
      "step": 150
    },
    {
      "epoch": 3.401114206128134,
      "eval_loss": 0.026173867285251617,
      "eval_runtime": 6.306,
      "eval_samples_per_second": 237.869,
      "eval_steps_per_second": 14.906,
      "eval_sts_dev_pearson_cosine": 0.8136326182189031,
      "eval_sts_dev_pearson_dot": 0.7289342393989602,
      "eval_sts_dev_pearson_euclidean": 0.7658102043154281,
      "eval_sts_dev_pearson_manhattan": 0.7680399446033591,
      "eval_sts_dev_pearson_max": 0.8136326182189031,
      "eval_sts_dev_spearman_cosine": 0.8154563967795785,
      "eval_sts_dev_spearman_dot": 0.7204276033712009,
      "eval_sts_dev_spearman_euclidean": 0.7661516256266799,
      "eval_sts_dev_spearman_manhattan": 0.7692973830139536,
      "eval_sts_dev_spearman_max": 0.8154563967795785,
      "step": 150
    },
    {
      "epoch": 3.6239554317548746,
      "grad_norm": 0.1337411254644394,
      "learning_rate": 4.456824512534819e-06,
      "loss": 0.0246,
      "step": 160
    },
    {
      "epoch": 3.8467966573816157,
      "grad_norm": 0.20471176505088806,
      "learning_rate": 4.735376044568246e-06,
      "loss": 0.0313,
      "step": 170
    },
    {
      "epoch": 4.089136490250697,
      "grad_norm": 0.12327426671981812,
      "learning_rate": 5.013927576601672e-06,
      "loss": 0.0303,
      "step": 180
    },
    {
      "epoch": 4.089136490250697,
      "eval_loss": 0.02586401253938675,
      "eval_runtime": 6.8399,
      "eval_samples_per_second": 219.3,
      "eval_steps_per_second": 13.743,
      "eval_sts_dev_pearson_cosine": 0.8163121986548724,
      "eval_sts_dev_pearson_dot": 0.7330841259509188,
      "eval_sts_dev_pearson_euclidean": 0.7674859088604027,
      "eval_sts_dev_pearson_manhattan": 0.7697974598144367,
      "eval_sts_dev_pearson_max": 0.8163121986548724,
      "eval_sts_dev_spearman_cosine": 0.8184908732804921,
      "eval_sts_dev_spearman_dot": 0.7250521959658871,
      "eval_sts_dev_spearman_euclidean": 0.7684563123887144,
      "eval_sts_dev_spearman_manhattan": 0.7715573641686395,
      "eval_sts_dev_spearman_max": 0.8184908732804921,
      "step": 180
    },
    {
      "epoch": 4.311977715877437,
      "grad_norm": 0.11181030422449112,
      "learning_rate": 5.292479108635098e-06,
      "loss": 0.0198,
      "step": 190
    },
    {
      "epoch": 4.534818941504178,
      "grad_norm": 0.11830934137105942,
      "learning_rate": 5.571030640668524e-06,
      "loss": 0.0257,
      "step": 200
    },
    {
      "epoch": 4.757660167130919,
      "grad_norm": 0.1775977462530136,
      "learning_rate": 5.849582172701951e-06,
      "loss": 0.0242,
      "step": 210
    },
    {
      "epoch": 4.757660167130919,
      "eval_loss": 0.02551957406103611,
      "eval_runtime": 6.4245,
      "eval_samples_per_second": 233.481,
      "eval_steps_per_second": 14.631,
      "eval_sts_dev_pearson_cosine": 0.8184173000480589,
      "eval_sts_dev_pearson_dot": 0.7369533513611706,
      "eval_sts_dev_pearson_euclidean": 0.7687482582532739,
      "eval_sts_dev_pearson_manhattan": 0.7712300663924829,
      "eval_sts_dev_pearson_max": 0.8184173000480589,
      "eval_sts_dev_spearman_cosine": 0.8201930470486518,
      "eval_sts_dev_spearman_dot": 0.7292325959243812,
      "eval_sts_dev_spearman_euclidean": 0.7696170592602297,
      "eval_sts_dev_spearman_manhattan": 0.7729809111066369,
      "eval_sts_dev_spearman_max": 0.8201930470486518,
      "step": 210
    },
    {
      "epoch": 4.9805013927576605,
      "grad_norm": 0.23354189097881317,
      "learning_rate": 6.128133704735376e-06,
      "loss": 0.0293,
      "step": 220
    },
    {
      "epoch": 5.222841225626741,
      "grad_norm": 0.12718431651592255,
      "learning_rate": 6.406685236768803e-06,
      "loss": 0.0193,
      "step": 230
    },
    {
      "epoch": 5.445682451253482,
      "grad_norm": 0.1111082211136818,
      "learning_rate": 6.685236768802229e-06,
      "loss": 0.0222,
      "step": 240
    },
    {
      "epoch": 5.445682451253482,
      "eval_loss": 0.02539980411529541,
      "eval_runtime": 6.3582,
      "eval_samples_per_second": 235.915,
      "eval_steps_per_second": 14.784,
      "eval_sts_dev_pearson_cosine": 0.8203051470878093,
      "eval_sts_dev_pearson_dot": 0.7391973842870876,
      "eval_sts_dev_pearson_euclidean": 0.7710328054708023,
      "eval_sts_dev_pearson_manhattan": 0.7734981812206646,
      "eval_sts_dev_pearson_max": 0.8203051470878093,
      "eval_sts_dev_spearman_cosine": 0.8222047787628998,
      "eval_sts_dev_spearman_dot": 0.7306726496212352,
      "eval_sts_dev_spearman_euclidean": 0.7721080064054946,
      "eval_sts_dev_spearman_manhattan": 0.7758967012553709,
      "eval_sts_dev_spearman_max": 0.8222047787628998,
      "step": 240
    },
    {
      "epoch": 5.6685236768802225,
      "grad_norm": 0.167997807264328,
      "learning_rate": 6.963788300835655e-06,
      "loss": 0.0184,
      "step": 250
    },
    {
      "epoch": 5.891364902506964,
      "grad_norm": 0.18360492587089539,
      "learning_rate": 7.242339832869081e-06,
      "loss": 0.0243,
      "step": 260
    },
    {
      "epoch": 6.133704735376044,
      "grad_norm": 0.11399545520544052,
      "learning_rate": 7.5208913649025075e-06,
      "loss": 0.0204,
      "step": 270
    },
    {
      "epoch": 6.133704735376044,
      "eval_loss": 0.025426626205444336,
      "eval_runtime": 6.3377,
      "eval_samples_per_second": 236.678,
      "eval_steps_per_second": 14.832,
      "eval_sts_dev_pearson_cosine": 0.8215923043460271,
      "eval_sts_dev_pearson_dot": 0.7427941063103285,
      "eval_sts_dev_pearson_euclidean": 0.7725242056053008,
      "eval_sts_dev_pearson_manhattan": 0.7749558209132376,
      "eval_sts_dev_pearson_max": 0.8215923043460271,
      "eval_sts_dev_spearman_cosine": 0.8234628421089484,
      "eval_sts_dev_spearman_dot": 0.7343279809432616,
      "eval_sts_dev_spearman_euclidean": 0.7742054612821838,
      "eval_sts_dev_spearman_manhattan": 0.777339758218875,
      "eval_sts_dev_spearman_max": 0.8234628421089484,
      "step": 270
    },
    {
      "epoch": 6.3565459610027855,
      "grad_norm": 0.14734485745429993,
      "learning_rate": 7.799442896935934e-06,
      "loss": 0.0147,
      "step": 280
    },
    {
      "epoch": 6.579387186629527,
      "grad_norm": 0.14232878386974335,
      "learning_rate": 8.07799442896936e-06,
      "loss": 0.0196,
      "step": 290
    },
    {
      "epoch": 6.802228412256268,
      "grad_norm": 0.12475496530532837,
      "learning_rate": 8.356545961002787e-06,
      "loss": 0.0176,
      "step": 300
    },
    {
      "epoch": 6.802228412256268,
      "eval_loss": 0.025328340008854866,
      "eval_runtime": 6.1771,
      "eval_samples_per_second": 242.832,
      "eval_steps_per_second": 15.217,
      "eval_sts_dev_pearson_cosine": 0.8219368394963247,
      "eval_sts_dev_pearson_dot": 0.7469111462936613,
      "eval_sts_dev_pearson_euclidean": 0.7729334760561297,
      "eval_sts_dev_pearson_manhattan": 0.7754957053869553,
      "eval_sts_dev_pearson_max": 0.8219368394963247,
      "eval_sts_dev_spearman_cosine": 0.8227360781964935,
      "eval_sts_dev_spearman_dot": 0.7392541828806165,
      "eval_sts_dev_spearman_euclidean": 0.7748490630523356,
      "eval_sts_dev_spearman_manhattan": 0.7782586536188661,
      "eval_sts_dev_spearman_max": 0.8227360781964935,
      "step": 300
    }
  ],
  "logging_steps": 10,
  "max_steps": 440,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 300,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}