diff --git "a/checkpoint-309/trainer_state.json" "b/checkpoint-309/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-309/trainer_state.json" @@ -0,0 +1,3294 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6046966731898239, + "eval_steps": 5, + "global_step": 309, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0019569471624266144, + "grad_norm": 3.932948112487793, + "learning_rate": 7.8125e-08, + "loss": 0.107, + "step": 1 + }, + { + "epoch": 0.003913894324853229, + "grad_norm": 4.482716083526611, + "learning_rate": 1.5625e-07, + "loss": 0.1529, + "step": 2 + }, + { + "epoch": 0.005870841487279843, + "grad_norm": 4.672689437866211, + "learning_rate": 2.3437500000000003e-07, + "loss": 0.1874, + "step": 3 + }, + { + "epoch": 0.007827788649706457, + "grad_norm": 4.226949214935303, + "learning_rate": 3.125e-07, + "loss": 0.1682, + "step": 4 + }, + { + "epoch": 0.009784735812133072, + "grad_norm": 4.327479362487793, + "learning_rate": 3.90625e-07, + "loss": 0.1438, + "step": 5 + }, + { + "epoch": 0.009784735812133072, + "eval_loss": 0.1470455378293991, + "eval_runtime": 107.3614, + "eval_samples_per_second": 28.427, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8861388036460539, + "eval_sts-test_pearson_dot": 0.8769528313548112, + "eval_sts-test_pearson_euclidean": 0.9079831987750276, + "eval_sts-test_pearson_manhattan": 0.9086786527495163, + "eval_sts-test_pearson_max": 0.9086786527495163, + "eval_sts-test_spearman_cosine": 0.9077902566323186, + "eval_sts-test_spearman_dot": 0.8794770733264693, + "eval_sts-test_spearman_euclidean": 0.903967335376697, + "eval_sts-test_spearman_manhattan": 0.9043498244078092, + "eval_sts-test_spearman_max": 0.9077902566323186, + "step": 5 + }, + { + "epoch": 0.011741682974559686, + "grad_norm": 5.27250337600708, + "learning_rate": 4.6875000000000006e-07, + "loss": 0.2961, + "step": 6 + }, + { + "epoch": 0.0136986301369863, + "grad_norm": 5.903276443481445, + "learning_rate": 5.468750000000001e-07, + "loss": 0.3019, + "step": 7 + }, + { + "epoch": 0.015655577299412915, + "grad_norm": 4.000335693359375, + "learning_rate": 6.25e-07, + "loss": 0.1184, + "step": 8 + }, + { + "epoch": 0.01761252446183953, + "grad_norm": 5.876769065856934, + "learning_rate": 7.03125e-07, + "loss": 0.3176, + "step": 9 + }, + { + "epoch": 0.019569471624266144, + "grad_norm": 4.8437933921813965, + "learning_rate": 7.8125e-07, + "loss": 0.2234, + "step": 10 + }, + { + "epoch": 0.019569471624266144, + "eval_loss": 0.1467687040567398, + "eval_runtime": 107.2549, + "eval_samples_per_second": 28.456, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8861409457129842, + "eval_sts-test_pearson_dot": 0.876972814890145, + "eval_sts-test_pearson_euclidean": 0.9080268416052204, + "eval_sts-test_pearson_manhattan": 0.9087444298597203, + "eval_sts-test_pearson_max": 0.9087444298597203, + "eval_sts-test_spearman_cosine": 0.9078342918735278, + "eval_sts-test_spearman_dot": 0.8794190309404447, + "eval_sts-test_spearman_euclidean": 0.9039501508923226, + "eval_sts-test_spearman_manhattan": 0.9044244247605487, + "eval_sts-test_spearman_max": 0.9078342918735278, + "step": 10 + }, + { + "epoch": 0.021526418786692758, + "grad_norm": 4.726498603820801, + "learning_rate": 8.59375e-07, + "loss": 0.1881, + "step": 11 + }, + { + "epoch": 0.023483365949119372, + "grad_norm": 4.818070411682129, + "learning_rate": 9.375000000000001e-07, + "loss": 0.1593, + "step": 12 + }, + { + "epoch": 0.025440313111545987, + "grad_norm": 4.98201322555542, + "learning_rate": 1.0156250000000001e-06, + "loss": 0.1833, + "step": 13 + }, + { + "epoch": 0.0273972602739726, + "grad_norm": 4.269514560699463, + "learning_rate": 1.0937500000000001e-06, + "loss": 0.1352, + "step": 14 + }, + { + "epoch": 0.029354207436399216, + "grad_norm": 6.1525492668151855, + "learning_rate": 1.1718750000000001e-06, + "loss": 0.3143, + "step": 15 + }, + { + "epoch": 0.029354207436399216, + "eval_loss": 0.1462097316980362, + "eval_runtime": 107.0721, + "eval_samples_per_second": 28.504, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8860829119688085, + "eval_sts-test_pearson_dot": 0.8768990080043222, + "eval_sts-test_pearson_euclidean": 0.9080646402781543, + "eval_sts-test_pearson_manhattan": 0.9088063929836994, + "eval_sts-test_pearson_max": 0.9088063929836994, + "eval_sts-test_spearman_cosine": 0.907713597721555, + "eval_sts-test_spearman_dot": 0.8795110842851269, + "eval_sts-test_spearman_euclidean": 0.9040110126078148, + "eval_sts-test_spearman_manhattan": 0.9045081991218733, + "eval_sts-test_spearman_max": 0.907713597721555, + "step": 15 + }, + { + "epoch": 0.03131115459882583, + "grad_norm": 4.751354694366455, + "learning_rate": 1.25e-06, + "loss": 0.1583, + "step": 16 + }, + { + "epoch": 0.033268101761252444, + "grad_norm": 5.435980319976807, + "learning_rate": 1.328125e-06, + "loss": 0.2015, + "step": 17 + }, + { + "epoch": 0.03522504892367906, + "grad_norm": 4.1765851974487305, + "learning_rate": 1.40625e-06, + "loss": 0.1476, + "step": 18 + }, + { + "epoch": 0.03718199608610567, + "grad_norm": 4.689794540405273, + "learning_rate": 1.484375e-06, + "loss": 0.1676, + "step": 19 + }, + { + "epoch": 0.03913894324853229, + "grad_norm": 4.203744888305664, + "learning_rate": 1.5625e-06, + "loss": 0.1525, + "step": 20 + }, + { + "epoch": 0.03913894324853229, + "eval_loss": 0.14544810354709625, + "eval_runtime": 107.1845, + "eval_samples_per_second": 28.474, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8861436293943533, + "eval_sts-test_pearson_dot": 0.8769239163708102, + "eval_sts-test_pearson_euclidean": 0.9082269545633608, + "eval_sts-test_pearson_manhattan": 0.9089828403051001, + "eval_sts-test_pearson_max": 0.9089828403051001, + "eval_sts-test_spearman_cosine": 0.907929343552723, + "eval_sts-test_spearman_dot": 0.8796122221358714, + "eval_sts-test_spearman_euclidean": 0.9043074002120102, + "eval_sts-test_spearman_manhattan": 0.9047217521412333, + "eval_sts-test_spearman_max": 0.907929343552723, + "step": 20 + }, + { + "epoch": 0.0410958904109589, + "grad_norm": 5.152130603790283, + "learning_rate": 1.640625e-06, + "loss": 0.1717, + "step": 21 + }, + { + "epoch": 0.043052837573385516, + "grad_norm": 5.343059062957764, + "learning_rate": 1.71875e-06, + "loss": 0.198, + "step": 22 + }, + { + "epoch": 0.04500978473581213, + "grad_norm": 5.224748134613037, + "learning_rate": 1.796875e-06, + "loss": 0.3062, + "step": 23 + }, + { + "epoch": 0.046966731898238745, + "grad_norm": 4.6179423332214355, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.1241, + "step": 24 + }, + { + "epoch": 0.04892367906066536, + "grad_norm": 4.200148105621338, + "learning_rate": 1.953125e-06, + "loss": 0.1087, + "step": 25 + }, + { + "epoch": 0.04892367906066536, + "eval_loss": 0.14457188546657562, + "eval_runtime": 107.3809, + "eval_samples_per_second": 28.422, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8862905994058754, + "eval_sts-test_pearson_dot": 0.877015249192232, + "eval_sts-test_pearson_euclidean": 0.9085054742522269, + "eval_sts-test_pearson_manhattan": 0.9092575877809899, + "eval_sts-test_pearson_max": 0.9092575877809899, + "eval_sts-test_spearman_cosine": 0.9082294902628751, + "eval_sts-test_spearman_dot": 0.8798810429630494, + "eval_sts-test_spearman_euclidean": 0.9047149499495015, + "eval_sts-test_spearman_manhattan": 0.9051023616193669, + "eval_sts-test_spearman_max": 0.9082294902628751, + "step": 25 + }, + { + "epoch": 0.050880626223091974, + "grad_norm": 4.890737533569336, + "learning_rate": 2.0312500000000002e-06, + "loss": 0.1767, + "step": 26 + }, + { + "epoch": 0.05283757338551859, + "grad_norm": 4.683767795562744, + "learning_rate": 2.109375e-06, + "loss": 0.1951, + "step": 27 + }, + { + "epoch": 0.0547945205479452, + "grad_norm": 4.656280040740967, + "learning_rate": 2.1875000000000002e-06, + "loss": 0.1621, + "step": 28 + }, + { + "epoch": 0.05675146771037182, + "grad_norm": 4.446409702301025, + "learning_rate": 2.265625e-06, + "loss": 0.221, + "step": 29 + }, + { + "epoch": 0.05870841487279843, + "grad_norm": 5.765133857727051, + "learning_rate": 2.3437500000000002e-06, + "loss": 0.2241, + "step": 30 + }, + { + "epoch": 0.05870841487279843, + "eval_loss": 0.14350731670856476, + "eval_runtime": 107.3747, + "eval_samples_per_second": 28.424, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8863784941826807, + "eval_sts-test_pearson_dot": 0.8768948467465629, + "eval_sts-test_pearson_euclidean": 0.9088066170487232, + "eval_sts-test_pearson_manhattan": 0.9095658568102677, + "eval_sts-test_pearson_max": 0.9095658568102677, + "eval_sts-test_spearman_cosine": 0.9082580415676429, + "eval_sts-test_spearman_dot": 0.8801849487791585, + "eval_sts-test_spearman_euclidean": 0.9051721735871375, + "eval_sts-test_spearman_manhattan": 0.9054862826908437, + "eval_sts-test_spearman_max": 0.9082580415676429, + "step": 30 + }, + { + "epoch": 0.060665362035225046, + "grad_norm": 5.359245777130127, + "learning_rate": 2.421875e-06, + "loss": 0.2093, + "step": 31 + }, + { + "epoch": 0.06262230919765166, + "grad_norm": 4.439486503601074, + "learning_rate": 2.5e-06, + "loss": 0.1615, + "step": 32 + }, + { + "epoch": 0.06457925636007827, + "grad_norm": 3.689824342727661, + "learning_rate": 2.5781250000000004e-06, + "loss": 0.1615, + "step": 33 + }, + { + "epoch": 0.06653620352250489, + "grad_norm": 4.842885494232178, + "learning_rate": 2.65625e-06, + "loss": 0.1772, + "step": 34 + }, + { + "epoch": 0.0684931506849315, + "grad_norm": 5.209301948547363, + "learning_rate": 2.7343750000000004e-06, + "loss": 0.2324, + "step": 35 + }, + { + "epoch": 0.0684931506849315, + "eval_loss": 0.14226235449314117, + "eval_runtime": 107.3108, + "eval_samples_per_second": 28.441, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8863574366132135, + "eval_sts-test_pearson_dot": 0.8765683077424664, + "eval_sts-test_pearson_euclidean": 0.9091012263251723, + "eval_sts-test_pearson_manhattan": 0.9098631032540263, + "eval_sts-test_pearson_max": 0.9098631032540263, + "eval_sts-test_spearman_cosine": 0.9083728733043733, + "eval_sts-test_spearman_dot": 0.8800282746130272, + "eval_sts-test_spearman_euclidean": 0.9052579170039636, + "eval_sts-test_spearman_manhattan": 0.9059997586640487, + "eval_sts-test_spearman_max": 0.9083728733043733, + "step": 35 + }, + { + "epoch": 0.07045009784735812, + "grad_norm": 4.740983009338379, + "learning_rate": 2.8125e-06, + "loss": 0.2611, + "step": 36 + }, + { + "epoch": 0.07240704500978473, + "grad_norm": 5.090059757232666, + "learning_rate": 2.8906250000000004e-06, + "loss": 0.214, + "step": 37 + }, + { + "epoch": 0.07436399217221135, + "grad_norm": 5.123153209686279, + "learning_rate": 2.96875e-06, + "loss": 0.1985, + "step": 38 + }, + { + "epoch": 0.07632093933463796, + "grad_norm": 5.401946067810059, + "learning_rate": 3.0468750000000004e-06, + "loss": 0.1855, + "step": 39 + }, + { + "epoch": 0.07827788649706457, + "grad_norm": 4.838700294494629, + "learning_rate": 3.125e-06, + "loss": 0.1234, + "step": 40 + }, + { + "epoch": 0.07827788649706457, + "eval_loss": 0.14100149273872375, + "eval_runtime": 107.3059, + "eval_samples_per_second": 28.442, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8864265749012155, + "eval_sts-test_pearson_dot": 0.8764612424174422, + "eval_sts-test_pearson_euclidean": 0.9094092487009695, + "eval_sts-test_pearson_manhattan": 0.9101707626021143, + "eval_sts-test_pearson_max": 0.9101707626021143, + "eval_sts-test_spearman_cosine": 0.908505695048183, + "eval_sts-test_spearman_dot": 0.8802103674956289, + "eval_sts-test_spearman_euclidean": 0.9054564783507572, + "eval_sts-test_spearman_manhattan": 0.9063046490079084, + "eval_sts-test_spearman_max": 0.908505695048183, + "step": 40 + }, + { + "epoch": 0.08023483365949119, + "grad_norm": 3.8856801986694336, + "learning_rate": 3.2031250000000004e-06, + "loss": 0.1492, + "step": 41 + }, + { + "epoch": 0.0821917808219178, + "grad_norm": 5.678151607513428, + "learning_rate": 3.28125e-06, + "loss": 0.2022, + "step": 42 + }, + { + "epoch": 0.08414872798434442, + "grad_norm": 5.104148864746094, + "learning_rate": 3.3593750000000003e-06, + "loss": 0.2146, + "step": 43 + }, + { + "epoch": 0.08610567514677103, + "grad_norm": 4.76043701171875, + "learning_rate": 3.4375e-06, + "loss": 0.1688, + "step": 44 + }, + { + "epoch": 0.08806262230919765, + "grad_norm": 5.128803730010986, + "learning_rate": 3.5156250000000003e-06, + "loss": 0.175, + "step": 45 + }, + { + "epoch": 0.08806262230919765, + "eval_loss": 0.13962982594966888, + "eval_runtime": 107.4144, + "eval_samples_per_second": 28.413, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.886410908658177, + "eval_sts-test_pearson_dot": 0.8762836795862763, + "eval_sts-test_pearson_euclidean": 0.9096890242379734, + "eval_sts-test_pearson_manhattan": 0.9104590803642174, + "eval_sts-test_pearson_max": 0.9104590803642174, + "eval_sts-test_spearman_cosine": 0.9086694846648755, + "eval_sts-test_spearman_dot": 0.8801346931126159, + "eval_sts-test_spearman_euclidean": 0.9057376952773407, + "eval_sts-test_spearman_manhattan": 0.9064708999439774, + "eval_sts-test_spearman_max": 0.9086694846648755, + "step": 45 + }, + { + "epoch": 0.09001956947162426, + "grad_norm": 4.968522548675537, + "learning_rate": 3.59375e-06, + "loss": 0.2123, + "step": 46 + }, + { + "epoch": 0.09197651663405088, + "grad_norm": 4.343472957611084, + "learning_rate": 3.6718750000000003e-06, + "loss": 0.1118, + "step": 47 + }, + { + "epoch": 0.09393346379647749, + "grad_norm": 6.252938270568848, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.3009, + "step": 48 + }, + { + "epoch": 0.0958904109589041, + "grad_norm": 3.411029815673828, + "learning_rate": 3.828125000000001e-06, + "loss": 0.1071, + "step": 49 + }, + { + "epoch": 0.09784735812133072, + "grad_norm": 5.379226207733154, + "learning_rate": 3.90625e-06, + "loss": 0.2608, + "step": 50 + }, + { + "epoch": 0.09784735812133072, + "eval_loss": 0.13823722302913666, + "eval_runtime": 107.3656, + "eval_samples_per_second": 28.426, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8863074884351817, + "eval_sts-test_pearson_dot": 0.8763122134205692, + "eval_sts-test_pearson_euclidean": 0.9097700018848961, + "eval_sts-test_pearson_manhattan": 0.9105724410858811, + "eval_sts-test_pearson_max": 0.9105724410858811, + "eval_sts-test_spearman_cosine": 0.9085105281844131, + "eval_sts-test_spearman_dot": 0.8801239975611433, + "eval_sts-test_spearman_euclidean": 0.9059798443527296, + "eval_sts-test_spearman_manhattan": 0.9065691737139927, + "eval_sts-test_spearman_max": 0.9085105281844131, + "step": 50 + }, + { + "epoch": 0.09980430528375733, + "grad_norm": 4.599095821380615, + "learning_rate": 3.984375e-06, + "loss": 0.1368, + "step": 51 + }, + { + "epoch": 0.10176125244618395, + "grad_norm": 5.634761333465576, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.2307, + "step": 52 + }, + { + "epoch": 0.10371819960861056, + "grad_norm": 4.678525924682617, + "learning_rate": 4.140625000000001e-06, + "loss": 0.1366, + "step": 53 + }, + { + "epoch": 0.10567514677103718, + "grad_norm": 4.931070327758789, + "learning_rate": 4.21875e-06, + "loss": 0.1857, + "step": 54 + }, + { + "epoch": 0.10763209393346379, + "grad_norm": 4.903087139129639, + "learning_rate": 4.296875e-06, + "loss": 0.2155, + "step": 55 + }, + { + "epoch": 0.10763209393346379, + "eval_loss": 0.1367325782775879, + "eval_runtime": 107.3012, + "eval_samples_per_second": 28.443, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.88603017002284, + "eval_sts-test_pearson_dot": 0.8761626193697236, + "eval_sts-test_pearson_euclidean": 0.9096799681812165, + "eval_sts-test_pearson_manhattan": 0.9104977957475867, + "eval_sts-test_pearson_max": 0.9104977957475867, + "eval_sts-test_spearman_cosine": 0.9084685067499666, + "eval_sts-test_spearman_dot": 0.8802836700617878, + "eval_sts-test_spearman_euclidean": 0.9058409364373706, + "eval_sts-test_spearman_manhattan": 0.9064240006220393, + "eval_sts-test_spearman_max": 0.9084685067499666, + "step": 55 + }, + { + "epoch": 0.1095890410958904, + "grad_norm": 5.408311367034912, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.2022, + "step": 56 + }, + { + "epoch": 0.11154598825831702, + "grad_norm": 4.5926713943481445, + "learning_rate": 4.453125000000001e-06, + "loss": 0.2076, + "step": 57 + }, + { + "epoch": 0.11350293542074363, + "grad_norm": 6.475535869598389, + "learning_rate": 4.53125e-06, + "loss": 0.4133, + "step": 58 + }, + { + "epoch": 0.11545988258317025, + "grad_norm": 4.997581481933594, + "learning_rate": 4.609375e-06, + "loss": 0.1823, + "step": 59 + }, + { + "epoch": 0.11741682974559686, + "grad_norm": 3.899284601211548, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.1136, + "step": 60 + }, + { + "epoch": 0.11741682974559686, + "eval_loss": 0.13528631627559662, + "eval_runtime": 107.3435, + "eval_samples_per_second": 28.432, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8860224650016398, + "eval_sts-test_pearson_dot": 0.8762739756970772, + "eval_sts-test_pearson_euclidean": 0.9099016820022997, + "eval_sts-test_pearson_manhattan": 0.9107281338135995, + "eval_sts-test_pearson_max": 0.9107281338135995, + "eval_sts-test_spearman_cosine": 0.9087510214631306, + "eval_sts-test_spearman_dot": 0.8808623486228402, + "eval_sts-test_spearman_euclidean": 0.9060555634870038, + "eval_sts-test_spearman_manhattan": 0.9067256241238172, + "eval_sts-test_spearman_max": 0.9087510214631306, + "step": 60 + }, + { + "epoch": 0.11937377690802348, + "grad_norm": 4.476404190063477, + "learning_rate": 4.765625000000001e-06, + "loss": 0.1687, + "step": 61 + }, + { + "epoch": 0.12133072407045009, + "grad_norm": 4.893277168273926, + "learning_rate": 4.84375e-06, + "loss": 0.1591, + "step": 62 + }, + { + "epoch": 0.1232876712328767, + "grad_norm": 4.510354042053223, + "learning_rate": 4.921875e-06, + "loss": 0.1653, + "step": 63 + }, + { + "epoch": 0.12524461839530332, + "grad_norm": 4.400285243988037, + "learning_rate": 5e-06, + "loss": 0.1799, + "step": 64 + }, + { + "epoch": 0.12720156555772993, + "grad_norm": 4.631839752197266, + "learning_rate": 5.078125000000001e-06, + "loss": 0.1578, + "step": 65 + }, + { + "epoch": 0.12720156555772993, + "eval_loss": 0.1336735188961029, + "eval_runtime": 107.4984, + "eval_samples_per_second": 28.391, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.886014179849858, + "eval_sts-test_pearson_dot": 0.8762492282837839, + "eval_sts-test_pearson_euclidean": 0.9101155794045166, + "eval_sts-test_pearson_manhattan": 0.9109538919103571, + "eval_sts-test_pearson_max": 0.9109538919103571, + "eval_sts-test_spearman_cosine": 0.9089514176116413, + "eval_sts-test_spearman_dot": 0.8810853441583534, + "eval_sts-test_spearman_euclidean": 0.9061670836303911, + "eval_sts-test_spearman_manhattan": 0.9072153371772234, + "eval_sts-test_spearman_max": 0.9089514176116413, + "step": 65 + }, + { + "epoch": 0.12915851272015655, + "grad_norm": 4.043459415435791, + "learning_rate": 5.156250000000001e-06, + "loss": 0.1844, + "step": 66 + }, + { + "epoch": 0.13111545988258316, + "grad_norm": 4.447835922241211, + "learning_rate": 5.234375e-06, + "loss": 0.1489, + "step": 67 + }, + { + "epoch": 0.13307240704500978, + "grad_norm": 5.372109889984131, + "learning_rate": 5.3125e-06, + "loss": 0.1845, + "step": 68 + }, + { + "epoch": 0.1350293542074364, + "grad_norm": 3.5112483501434326, + "learning_rate": 5.390625000000001e-06, + "loss": 0.1364, + "step": 69 + }, + { + "epoch": 0.136986301369863, + "grad_norm": 4.305239200592041, + "learning_rate": 5.468750000000001e-06, + "loss": 0.1584, + "step": 70 + }, + { + "epoch": 0.136986301369863, + "eval_loss": 0.1320798397064209, + "eval_runtime": 107.505, + "eval_samples_per_second": 28.389, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.88578311613969, + "eval_sts-test_pearson_dot": 0.875928774505713, + "eval_sts-test_pearson_euclidean": 0.91024619729973, + "eval_sts-test_pearson_manhattan": 0.9110959495329505, + "eval_sts-test_pearson_max": 0.9110959495329505, + "eval_sts-test_spearman_cosine": 0.9086066538938818, + "eval_sts-test_spearman_dot": 0.8801235500485294, + "eval_sts-test_spearman_euclidean": 0.9060052183179386, + "eval_sts-test_spearman_manhattan": 0.907439182986703, + "eval_sts-test_spearman_max": 0.9086066538938818, + "step": 70 + }, + { + "epoch": 0.13894324853228962, + "grad_norm": 5.093306064605713, + "learning_rate": 5.546875e-06, + "loss": 0.2279, + "step": 71 + }, + { + "epoch": 0.14090019569471623, + "grad_norm": 4.953585147857666, + "learning_rate": 5.625e-06, + "loss": 0.2028, + "step": 72 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 4.1561102867126465, + "learning_rate": 5.7031250000000006e-06, + "loss": 0.2291, + "step": 73 + }, + { + "epoch": 0.14481409001956946, + "grad_norm": 5.00941801071167, + "learning_rate": 5.781250000000001e-06, + "loss": 0.2419, + "step": 74 + }, + { + "epoch": 0.14677103718199608, + "grad_norm": 3.6476099491119385, + "learning_rate": 5.859375e-06, + "loss": 0.1329, + "step": 75 + }, + { + "epoch": 0.14677103718199608, + "eval_loss": 0.13061992824077606, + "eval_runtime": 107.3395, + "eval_samples_per_second": 28.433, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8854112983780439, + "eval_sts-test_pearson_dot": 0.8752625071185561, + "eval_sts-test_pearson_euclidean": 0.9103378320010516, + "eval_sts-test_pearson_manhattan": 0.9112261622276095, + "eval_sts-test_pearson_max": 0.9112261622276095, + "eval_sts-test_spearman_cosine": 0.9082604133844965, + "eval_sts-test_spearman_dot": 0.8794192099454903, + "eval_sts-test_spearman_euclidean": 0.9060063370994732, + "eval_sts-test_spearman_manhattan": 0.90766132824825, + "eval_sts-test_spearman_max": 0.9082604133844965, + "step": 75 + }, + { + "epoch": 0.1487279843444227, + "grad_norm": 4.10636568069458, + "learning_rate": 5.9375e-06, + "loss": 0.204, + "step": 76 + }, + { + "epoch": 0.1506849315068493, + "grad_norm": 4.767779350280762, + "learning_rate": 6.0156250000000005e-06, + "loss": 0.2239, + "step": 77 + }, + { + "epoch": 0.15264187866927592, + "grad_norm": 5.366302490234375, + "learning_rate": 6.093750000000001e-06, + "loss": 0.2181, + "step": 78 + }, + { + "epoch": 0.15459882583170254, + "grad_norm": 4.087960720062256, + "learning_rate": 6.171875e-06, + "loss": 0.1285, + "step": 79 + }, + { + "epoch": 0.15655577299412915, + "grad_norm": 3.7557668685913086, + "learning_rate": 6.25e-06, + "loss": 0.1067, + "step": 80 + }, + { + "epoch": 0.15655577299412915, + "eval_loss": 0.12924787402153015, + "eval_runtime": 107.2528, + "eval_samples_per_second": 28.456, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8850894038300653, + "eval_sts-test_pearson_dot": 0.874941916465686, + "eval_sts-test_pearson_euclidean": 0.9101863990952803, + "eval_sts-test_pearson_manhattan": 0.9110826056950171, + "eval_sts-test_pearson_max": 0.9110826056950171, + "eval_sts-test_spearman_cosine": 0.9078700928826409, + "eval_sts-test_spearman_dot": 0.8792947566875607, + "eval_sts-test_spearman_euclidean": 0.9059290069197888, + "eval_sts-test_spearman_manhattan": 0.9075206750336968, + "eval_sts-test_spearman_max": 0.9078700928826409, + "step": 80 + }, + { + "epoch": 0.15851272015655576, + "grad_norm": 3.5708839893341064, + "learning_rate": 6.3281250000000005e-06, + "loss": 0.1189, + "step": 81 + }, + { + "epoch": 0.16046966731898238, + "grad_norm": 4.602839469909668, + "learning_rate": 6.406250000000001e-06, + "loss": 0.236, + "step": 82 + }, + { + "epoch": 0.162426614481409, + "grad_norm": 4.304513931274414, + "learning_rate": 6.484375000000001e-06, + "loss": 0.1584, + "step": 83 + }, + { + "epoch": 0.1643835616438356, + "grad_norm": 4.165163516998291, + "learning_rate": 6.5625e-06, + "loss": 0.1925, + "step": 84 + }, + { + "epoch": 0.16634050880626222, + "grad_norm": 3.9157192707061768, + "learning_rate": 6.6406250000000005e-06, + "loss": 0.129, + "step": 85 + }, + { + "epoch": 0.16634050880626222, + "eval_loss": 0.1278335303068161, + "eval_runtime": 107.1978, + "eval_samples_per_second": 28.471, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8845993101894516, + "eval_sts-test_pearson_dot": 0.8740701762146532, + "eval_sts-test_pearson_euclidean": 0.9100055922999684, + "eval_sts-test_pearson_manhattan": 0.9108899080028133, + "eval_sts-test_pearson_max": 0.9108899080028133, + "eval_sts-test_spearman_cosine": 0.9078923342595523, + "eval_sts-test_spearman_dot": 0.8788126513485913, + "eval_sts-test_spearman_euclidean": 0.9057257466905491, + "eval_sts-test_spearman_manhattan": 0.9070083178420268, + "eval_sts-test_spearman_max": 0.9078923342595523, + "step": 85 + }, + { + "epoch": 0.16829745596868884, + "grad_norm": 4.233823776245117, + "learning_rate": 6.718750000000001e-06, + "loss": 0.1376, + "step": 86 + }, + { + "epoch": 0.17025440313111545, + "grad_norm": 4.670790195465088, + "learning_rate": 6.796875000000001e-06, + "loss": 0.1691, + "step": 87 + }, + { + "epoch": 0.17221135029354206, + "grad_norm": 3.742030382156372, + "learning_rate": 6.875e-06, + "loss": 0.1045, + "step": 88 + }, + { + "epoch": 0.17416829745596868, + "grad_norm": 4.242702960968018, + "learning_rate": 6.9531250000000004e-06, + "loss": 0.165, + "step": 89 + }, + { + "epoch": 0.1761252446183953, + "grad_norm": 5.499476909637451, + "learning_rate": 7.031250000000001e-06, + "loss": 0.2926, + "step": 90 + }, + { + "epoch": 0.1761252446183953, + "eval_loss": 0.12669824063777924, + "eval_runtime": 107.2778, + "eval_samples_per_second": 28.45, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8844194771150324, + "eval_sts-test_pearson_dot": 0.873458365713796, + "eval_sts-test_pearson_euclidean": 0.9099396625521212, + "eval_sts-test_pearson_manhattan": 0.910745898918033, + "eval_sts-test_pearson_max": 0.910745898918033, + "eval_sts-test_spearman_cosine": 0.907622707909669, + "eval_sts-test_spearman_dot": 0.8783740442356941, + "eval_sts-test_spearman_euclidean": 0.9058808545625318, + "eval_sts-test_spearman_manhattan": 0.906889458491771, + "eval_sts-test_spearman_max": 0.907622707909669, + "step": 90 + }, + { + "epoch": 0.1780821917808219, + "grad_norm": 2.992021083831787, + "learning_rate": 7.109375000000001e-06, + "loss": 0.1048, + "step": 91 + }, + { + "epoch": 0.18003913894324852, + "grad_norm": 4.298286437988281, + "learning_rate": 7.1875e-06, + "loss": 0.1596, + "step": 92 + }, + { + "epoch": 0.18199608610567514, + "grad_norm": 5.210509300231934, + "learning_rate": 7.265625e-06, + "loss": 0.2474, + "step": 93 + }, + { + "epoch": 0.18395303326810175, + "grad_norm": 4.527407169342041, + "learning_rate": 7.343750000000001e-06, + "loss": 0.1652, + "step": 94 + }, + { + "epoch": 0.18590998043052837, + "grad_norm": 5.302050590515137, + "learning_rate": 7.421875000000001e-06, + "loss": 0.2483, + "step": 95 + }, + { + "epoch": 0.18590998043052837, + "eval_loss": 0.1252526491880417, + "eval_runtime": 107.5519, + "eval_samples_per_second": 28.377, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.884272350180128, + "eval_sts-test_pearson_dot": 0.8727334938335432, + "eval_sts-test_pearson_euclidean": 0.9099441972021025, + "eval_sts-test_pearson_manhattan": 0.9106991509833859, + "eval_sts-test_pearson_max": 0.9106991509833859, + "eval_sts-test_spearman_cosine": 0.9075948278738224, + "eval_sts-test_spearman_dot": 0.87780624023116, + "eval_sts-test_spearman_euclidean": 0.9060086194138042, + "eval_sts-test_spearman_manhattan": 0.9069788267607697, + "eval_sts-test_spearman_max": 0.9075948278738224, + "step": 95 + }, + { + "epoch": 0.18786692759295498, + "grad_norm": 3.690441608428955, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1623, + "step": 96 + }, + { + "epoch": 0.1898238747553816, + "grad_norm": 4.585984706878662, + "learning_rate": 7.578125e-06, + "loss": 0.1955, + "step": 97 + }, + { + "epoch": 0.1917808219178082, + "grad_norm": 4.493942737579346, + "learning_rate": 7.656250000000001e-06, + "loss": 0.2023, + "step": 98 + }, + { + "epoch": 0.19373776908023482, + "grad_norm": 4.569936275482178, + "learning_rate": 7.734375e-06, + "loss": 0.1886, + "step": 99 + }, + { + "epoch": 0.19569471624266144, + "grad_norm": 3.7703664302825928, + "learning_rate": 7.8125e-06, + "loss": 0.1284, + "step": 100 + }, + { + "epoch": 0.19569471624266144, + "eval_loss": 0.12290485948324203, + "eval_runtime": 107.6958, + "eval_samples_per_second": 28.339, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8836376979322419, + "eval_sts-test_pearson_dot": 0.8710695777275684, + "eval_sts-test_pearson_euclidean": 0.9098265834859519, + "eval_sts-test_pearson_manhattan": 0.9106248996071287, + "eval_sts-test_pearson_max": 0.9106248996071287, + "eval_sts-test_spearman_cosine": 0.9078868298544011, + "eval_sts-test_spearman_dot": 0.8773200625274038, + "eval_sts-test_spearman_euclidean": 0.9063156130669492, + "eval_sts-test_spearman_manhattan": 0.9071474495136926, + "eval_sts-test_spearman_max": 0.9078868298544011, + "step": 100 + }, + { + "epoch": 0.19765166340508805, + "grad_norm": 4.356619358062744, + "learning_rate": 7.890625e-06, + "loss": 0.2005, + "step": 101 + }, + { + "epoch": 0.19960861056751467, + "grad_norm": 4.293449878692627, + "learning_rate": 7.96875e-06, + "loss": 0.2301, + "step": 102 + }, + { + "epoch": 0.20156555772994128, + "grad_norm": 4.654509544372559, + "learning_rate": 8.046875e-06, + "loss": 0.2249, + "step": 103 + }, + { + "epoch": 0.2035225048923679, + "grad_norm": 4.510340213775635, + "learning_rate": 8.125000000000001e-06, + "loss": 0.214, + "step": 104 + }, + { + "epoch": 0.2054794520547945, + "grad_norm": 3.880908489227295, + "learning_rate": 8.203125000000001e-06, + "loss": 0.1429, + "step": 105 + }, + { + "epoch": 0.2054794520547945, + "eval_loss": 0.12076468020677567, + "eval_runtime": 107.7074, + "eval_samples_per_second": 28.336, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8828542959864998, + "eval_sts-test_pearson_dot": 0.8689355363147886, + "eval_sts-test_pearson_euclidean": 0.9096459762354197, + "eval_sts-test_pearson_manhattan": 0.9104979967855148, + "eval_sts-test_pearson_max": 0.9104979967855148, + "eval_sts-test_spearman_cosine": 0.9076751563880199, + "eval_sts-test_spearman_dot": 0.8750991469270715, + "eval_sts-test_spearman_euclidean": 0.906379383614432, + "eval_sts-test_spearman_manhattan": 0.9071111562407043, + "eval_sts-test_spearman_max": 0.9076751563880199, + "step": 105 + }, + { + "epoch": 0.20743639921722112, + "grad_norm": 3.8524463176727295, + "learning_rate": 8.281250000000001e-06, + "loss": 0.17, + "step": 106 + }, + { + "epoch": 0.20939334637964774, + "grad_norm": 4.660905838012695, + "learning_rate": 8.359375e-06, + "loss": 0.1955, + "step": 107 + }, + { + "epoch": 0.21135029354207435, + "grad_norm": 4.391407012939453, + "learning_rate": 8.4375e-06, + "loss": 0.1964, + "step": 108 + }, + { + "epoch": 0.21330724070450097, + "grad_norm": 3.908740758895874, + "learning_rate": 8.515625e-06, + "loss": 0.1246, + "step": 109 + }, + { + "epoch": 0.21526418786692758, + "grad_norm": 3.295600414276123, + "learning_rate": 8.59375e-06, + "loss": 0.1295, + "step": 110 + }, + { + "epoch": 0.21526418786692758, + "eval_loss": 0.11901199817657471, + "eval_runtime": 107.5373, + "eval_samples_per_second": 28.381, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8820675142963768, + "eval_sts-test_pearson_dot": 0.8664913359514981, + "eval_sts-test_pearson_euclidean": 0.9093761405951237, + "eval_sts-test_pearson_manhattan": 0.910248319457324, + "eval_sts-test_pearson_max": 0.910248319457324, + "eval_sts-test_spearman_cosine": 0.9071699146469111, + "eval_sts-test_spearman_dot": 0.8726812810253556, + "eval_sts-test_spearman_euclidean": 0.9064896954737618, + "eval_sts-test_spearman_manhattan": 0.9068174537121922, + "eval_sts-test_spearman_max": 0.9071699146469111, + "step": 110 + }, + { + "epoch": 0.2172211350293542, + "grad_norm": 5.0308518409729, + "learning_rate": 8.671875e-06, + "loss": 0.2203, + "step": 111 + }, + { + "epoch": 0.2191780821917808, + "grad_norm": 4.501624584197998, + "learning_rate": 8.750000000000001e-06, + "loss": 0.2195, + "step": 112 + }, + { + "epoch": 0.22113502935420742, + "grad_norm": 4.200097560882568, + "learning_rate": 8.828125000000001e-06, + "loss": 0.1823, + "step": 113 + }, + { + "epoch": 0.22309197651663404, + "grad_norm": 3.6750545501708984, + "learning_rate": 8.906250000000001e-06, + "loss": 0.174, + "step": 114 + }, + { + "epoch": 0.22504892367906065, + "grad_norm": 4.105295181274414, + "learning_rate": 8.984375000000002e-06, + "loss": 0.207, + "step": 115 + }, + { + "epoch": 0.22504892367906065, + "eval_loss": 0.11745984107255936, + "eval_runtime": 107.5979, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.882042560326929, + "eval_sts-test_pearson_dot": 0.8653067979173212, + "eval_sts-test_pearson_euclidean": 0.9095832495385563, + "eval_sts-test_pearson_manhattan": 0.9103602950988618, + "eval_sts-test_pearson_max": 0.9103602950988618, + "eval_sts-test_spearman_cosine": 0.9068824772949942, + "eval_sts-test_spearman_dot": 0.8714208617482668, + "eval_sts-test_spearman_euclidean": 0.906395180809703, + "eval_sts-test_spearman_manhattan": 0.9068741088091138, + "eval_sts-test_spearman_max": 0.9068824772949942, + "step": 115 + }, + { + "epoch": 0.22700587084148727, + "grad_norm": 4.654273509979248, + "learning_rate": 9.0625e-06, + "loss": 0.2156, + "step": 116 + }, + { + "epoch": 0.22896281800391388, + "grad_norm": 4.661588191986084, + "learning_rate": 9.140625e-06, + "loss": 0.2202, + "step": 117 + }, + { + "epoch": 0.2309197651663405, + "grad_norm": 5.366416931152344, + "learning_rate": 9.21875e-06, + "loss": 0.2718, + "step": 118 + }, + { + "epoch": 0.2328767123287671, + "grad_norm": 3.672802448272705, + "learning_rate": 9.296875e-06, + "loss": 0.1387, + "step": 119 + }, + { + "epoch": 0.23483365949119372, + "grad_norm": 3.7878501415252686, + "learning_rate": 9.375000000000001e-06, + "loss": 0.1506, + "step": 120 + }, + { + "epoch": 0.23483365949119372, + "eval_loss": 0.11679373681545258, + "eval_runtime": 107.6687, + "eval_samples_per_second": 28.346, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.882107468031623, + "eval_sts-test_pearson_dot": 0.8647556765462645, + "eval_sts-test_pearson_euclidean": 0.9099443435071429, + "eval_sts-test_pearson_manhattan": 0.9105934104125866, + "eval_sts-test_pearson_max": 0.9105934104125866, + "eval_sts-test_spearman_cosine": 0.9068624287298908, + "eval_sts-test_spearman_dot": 0.8710628964083971, + "eval_sts-test_spearman_euclidean": 0.906624531024334, + "eval_sts-test_spearman_manhattan": 0.9069254385059298, + "eval_sts-test_spearman_max": 0.9069254385059298, + "step": 120 + }, + { + "epoch": 0.23679060665362034, + "grad_norm": 3.4761197566986084, + "learning_rate": 9.453125000000001e-06, + "loss": 0.1185, + "step": 121 + }, + { + "epoch": 0.23874755381604695, + "grad_norm": 3.9917871952056885, + "learning_rate": 9.531250000000001e-06, + "loss": 0.1681, + "step": 122 + }, + { + "epoch": 0.24070450097847357, + "grad_norm": 4.491674423217773, + "learning_rate": 9.609375000000001e-06, + "loss": 0.2321, + "step": 123 + }, + { + "epoch": 0.24266144814090018, + "grad_norm": 3.903496503829956, + "learning_rate": 9.6875e-06, + "loss": 0.1457, + "step": 124 + }, + { + "epoch": 0.2446183953033268, + "grad_norm": 5.046339988708496, + "learning_rate": 9.765625e-06, + "loss": 0.2027, + "step": 125 + }, + { + "epoch": 0.2446183953033268, + "eval_loss": 0.11647585779428482, + "eval_runtime": 107.5396, + "eval_samples_per_second": 28.38, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8824938293263067, + "eval_sts-test_pearson_dot": 0.8653100788410637, + "eval_sts-test_pearson_euclidean": 0.9104636052712812, + "eval_sts-test_pearson_manhattan": 0.9109341151161342, + "eval_sts-test_pearson_max": 0.9109341151161342, + "eval_sts-test_spearman_cosine": 0.9070702535877924, + "eval_sts-test_spearman_dot": 0.8716920543922986, + "eval_sts-test_spearman_euclidean": 0.9070027239343528, + "eval_sts-test_spearman_manhattan": 0.9073061822378479, + "eval_sts-test_spearman_max": 0.9073061822378479, + "step": 125 + }, + { + "epoch": 0.2465753424657534, + "grad_norm": 4.304446697235107, + "learning_rate": 9.84375e-06, + "loss": 0.1821, + "step": 126 + }, + { + "epoch": 0.24853228962818003, + "grad_norm": 3.208357810974121, + "learning_rate": 9.921875e-06, + "loss": 0.1258, + "step": 127 + }, + { + "epoch": 0.25048923679060664, + "grad_norm": 4.275379657745361, + "learning_rate": 1e-05, + "loss": 0.184, + "step": 128 + }, + { + "epoch": 0.25244618395303325, + "grad_norm": 4.408608436584473, + "learning_rate": 1.0078125000000001e-05, + "loss": 0.2015, + "step": 129 + }, + { + "epoch": 0.25440313111545987, + "grad_norm": 3.565253973007202, + "learning_rate": 1.0156250000000001e-05, + "loss": 0.1323, + "step": 130 + }, + { + "epoch": 0.25440313111545987, + "eval_loss": 0.1154385656118393, + "eval_runtime": 107.5442, + "eval_samples_per_second": 28.379, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8820850631122565, + "eval_sts-test_pearson_dot": 0.8648589750662984, + "eval_sts-test_pearson_euclidean": 0.9105884442785888, + "eval_sts-test_pearson_manhattan": 0.9109040210291837, + "eval_sts-test_pearson_max": 0.9109040210291837, + "eval_sts-test_spearman_cosine": 0.9074317095260507, + "eval_sts-test_spearman_dot": 0.8710452196601474, + "eval_sts-test_spearman_euclidean": 0.9070635408985837, + "eval_sts-test_spearman_manhattan": 0.9074422260724778, + "eval_sts-test_spearman_max": 0.9074422260724778, + "step": 130 + }, + { + "epoch": 0.2563600782778865, + "grad_norm": 4.261953353881836, + "learning_rate": 1.0234375000000001e-05, + "loss": 0.1939, + "step": 131 + }, + { + "epoch": 0.2583170254403131, + "grad_norm": 3.806480646133423, + "learning_rate": 1.0312500000000002e-05, + "loss": 0.1428, + "step": 132 + }, + { + "epoch": 0.2602739726027397, + "grad_norm": 2.824733257293701, + "learning_rate": 1.0390625e-05, + "loss": 0.1063, + "step": 133 + }, + { + "epoch": 0.2622309197651663, + "grad_norm": 4.076455116271973, + "learning_rate": 1.046875e-05, + "loss": 0.1602, + "step": 134 + }, + { + "epoch": 0.26418786692759294, + "grad_norm": 3.7571659088134766, + "learning_rate": 1.0546875e-05, + "loss": 0.1814, + "step": 135 + }, + { + "epoch": 0.26418786692759294, + "eval_loss": 0.11387230455875397, + "eval_runtime": 107.5968, + "eval_samples_per_second": 28.365, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8812889193869892, + "eval_sts-test_pearson_dot": 0.8634898982579755, + "eval_sts-test_pearson_euclidean": 0.9104977472627025, + "eval_sts-test_pearson_manhattan": 0.9107178140804983, + "eval_sts-test_pearson_max": 0.9107178140804983, + "eval_sts-test_spearman_cosine": 0.9066986391131981, + "eval_sts-test_spearman_dot": 0.870129116588204, + "eval_sts-test_spearman_euclidean": 0.9070359293703052, + "eval_sts-test_spearman_manhattan": 0.9073414909830857, + "eval_sts-test_spearman_max": 0.9073414909830857, + "step": 135 + }, + { + "epoch": 0.26614481409001955, + "grad_norm": 3.864948034286499, + "learning_rate": 1.0625e-05, + "loss": 0.1518, + "step": 136 + }, + { + "epoch": 0.26810176125244617, + "grad_norm": 3.5900001525878906, + "learning_rate": 1.0703125000000001e-05, + "loss": 0.1379, + "step": 137 + }, + { + "epoch": 0.2700587084148728, + "grad_norm": 4.291954517364502, + "learning_rate": 1.0781250000000001e-05, + "loss": 0.1708, + "step": 138 + }, + { + "epoch": 0.2720156555772994, + "grad_norm": 3.8340342044830322, + "learning_rate": 1.0859375000000001e-05, + "loss": 0.2046, + "step": 139 + }, + { + "epoch": 0.273972602739726, + "grad_norm": 3.749396562576294, + "learning_rate": 1.0937500000000002e-05, + "loss": 0.1259, + "step": 140 + }, + { + "epoch": 0.273972602739726, + "eval_loss": 0.1124362125992775, + "eval_runtime": 107.5142, + "eval_samples_per_second": 28.387, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8805714116282963, + "eval_sts-test_pearson_dot": 0.8618911680351633, + "eval_sts-test_pearson_euclidean": 0.9102979980912764, + "eval_sts-test_pearson_manhattan": 0.9105232760600299, + "eval_sts-test_pearson_max": 0.9105232760600299, + "eval_sts-test_spearman_cosine": 0.9063180743863257, + "eval_sts-test_spearman_dot": 0.8687826406354595, + "eval_sts-test_spearman_euclidean": 0.9070556199253175, + "eval_sts-test_spearman_manhattan": 0.9073570196707885, + "eval_sts-test_spearman_max": 0.9073570196707885, + "step": 140 + }, + { + "epoch": 0.2759295499021526, + "grad_norm": 2.8815276622772217, + "learning_rate": 1.1015625e-05, + "loss": 0.1181, + "step": 141 + }, + { + "epoch": 0.27788649706457924, + "grad_norm": 3.766554355621338, + "learning_rate": 1.109375e-05, + "loss": 0.2144, + "step": 142 + }, + { + "epoch": 0.27984344422700586, + "grad_norm": 4.289268493652344, + "learning_rate": 1.1171875e-05, + "loss": 0.1822, + "step": 143 + }, + { + "epoch": 0.28180039138943247, + "grad_norm": 3.9036617279052734, + "learning_rate": 1.125e-05, + "loss": 0.1667, + "step": 144 + }, + { + "epoch": 0.2837573385518591, + "grad_norm": 3.321366786956787, + "learning_rate": 1.1328125000000001e-05, + "loss": 0.0779, + "step": 145 + }, + { + "epoch": 0.2837573385518591, + "eval_loss": 0.1118142157793045, + "eval_runtime": 107.3173, + "eval_samples_per_second": 28.439, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8796044904115364, + "eval_sts-test_pearson_dot": 0.8607678603166254, + "eval_sts-test_pearson_euclidean": 0.9097479995877322, + "eval_sts-test_pearson_manhattan": 0.9098650580518599, + "eval_sts-test_pearson_max": 0.9098650580518599, + "eval_sts-test_spearman_cosine": 0.9059690592987342, + "eval_sts-test_spearman_dot": 0.8685229490656053, + "eval_sts-test_spearman_euclidean": 0.90680836920613, + "eval_sts-test_spearman_manhattan": 0.9069437865231001, + "eval_sts-test_spearman_max": 0.9069437865231001, + "step": 145 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 3.460301160812378, + "learning_rate": 1.1406250000000001e-05, + "loss": 0.147, + "step": 146 + }, + { + "epoch": 0.2876712328767123, + "grad_norm": 3.8999266624450684, + "learning_rate": 1.1484375000000001e-05, + "loss": 0.1913, + "step": 147 + }, + { + "epoch": 0.2896281800391389, + "grad_norm": 3.539788007736206, + "learning_rate": 1.1562500000000002e-05, + "loss": 0.1357, + "step": 148 + }, + { + "epoch": 0.29158512720156554, + "grad_norm": 3.499439001083374, + "learning_rate": 1.1640625000000002e-05, + "loss": 0.1128, + "step": 149 + }, + { + "epoch": 0.29354207436399216, + "grad_norm": 3.2960240840911865, + "learning_rate": 1.171875e-05, + "loss": 0.0996, + "step": 150 + }, + { + "epoch": 0.29354207436399216, + "eval_loss": 0.11132737249135971, + "eval_runtime": 107.5867, + "eval_samples_per_second": 28.368, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8787852416493207, + "eval_sts-test_pearson_dot": 0.8593025559452621, + "eval_sts-test_pearson_euclidean": 0.9091617970047303, + "eval_sts-test_pearson_manhattan": 0.9091664157178929, + "eval_sts-test_pearson_max": 0.9091664157178929, + "eval_sts-test_spearman_cosine": 0.9054375485671886, + "eval_sts-test_spearman_dot": 0.867029912731804, + "eval_sts-test_spearman_euclidean": 0.9062253050214613, + "eval_sts-test_spearman_manhattan": 0.9062610165280517, + "eval_sts-test_spearman_max": 0.9062610165280517, + "step": 150 + }, + { + "epoch": 0.29549902152641877, + "grad_norm": 4.271719932556152, + "learning_rate": 1.1796875e-05, + "loss": 0.1956, + "step": 151 + }, + { + "epoch": 0.2974559686888454, + "grad_norm": 3.168663501739502, + "learning_rate": 1.1875e-05, + "loss": 0.0942, + "step": 152 + }, + { + "epoch": 0.299412915851272, + "grad_norm": 3.816993236541748, + "learning_rate": 1.1953125000000001e-05, + "loss": 0.1406, + "step": 153 + }, + { + "epoch": 0.3013698630136986, + "grad_norm": 5.383023738861084, + "learning_rate": 1.2031250000000001e-05, + "loss": 0.2868, + "step": 154 + }, + { + "epoch": 0.30332681017612523, + "grad_norm": 3.123462677001953, + "learning_rate": 1.2109375000000001e-05, + "loss": 0.1102, + "step": 155 + }, + { + "epoch": 0.30332681017612523, + "eval_loss": 0.11142811924219131, + "eval_runtime": 107.3019, + "eval_samples_per_second": 28.443, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8780761726881443, + "eval_sts-test_pearson_dot": 0.8581767032057357, + "eval_sts-test_pearson_euclidean": 0.9081534036571242, + "eval_sts-test_pearson_manhattan": 0.9081724370385316, + "eval_sts-test_pearson_max": 0.9081724370385316, + "eval_sts-test_spearman_cosine": 0.9048428490545583, + "eval_sts-test_spearman_dot": 0.8670075818523697, + "eval_sts-test_spearman_euclidean": 0.9052714766361651, + "eval_sts-test_spearman_manhattan": 0.9054467225757737, + "eval_sts-test_spearman_max": 0.9054467225757737, + "step": 155 + }, + { + "epoch": 0.30528375733855184, + "grad_norm": 4.1034979820251465, + "learning_rate": 1.2187500000000001e-05, + "loss": 0.1659, + "step": 156 + }, + { + "epoch": 0.30724070450097846, + "grad_norm": 3.60249400138855, + "learning_rate": 1.2265625000000002e-05, + "loss": 0.1645, + "step": 157 + }, + { + "epoch": 0.30919765166340507, + "grad_norm": 3.771853446960449, + "learning_rate": 1.234375e-05, + "loss": 0.151, + "step": 158 + }, + { + "epoch": 0.3111545988258317, + "grad_norm": 4.291686058044434, + "learning_rate": 1.2421875e-05, + "loss": 0.158, + "step": 159 + }, + { + "epoch": 0.3131115459882583, + "grad_norm": 5.1689453125, + "learning_rate": 1.25e-05, + "loss": 0.2323, + "step": 160 + }, + { + "epoch": 0.3131115459882583, + "eval_loss": 0.11126424372196198, + "eval_runtime": 107.301, + "eval_samples_per_second": 28.443, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8777597983330929, + "eval_sts-test_pearson_dot": 0.8577739588604719, + "eval_sts-test_pearson_euclidean": 0.9075483317216817, + "eval_sts-test_pearson_manhattan": 0.9075908461381532, + "eval_sts-test_pearson_max": 0.9075908461381532, + "eval_sts-test_spearman_cosine": 0.9047649818597372, + "eval_sts-test_spearman_dot": 0.867389712873391, + "eval_sts-test_spearman_euclidean": 0.9048189966322366, + "eval_sts-test_spearman_manhattan": 0.9049692713679889, + "eval_sts-test_spearman_max": 0.9049692713679889, + "step": 160 + }, + { + "epoch": 0.3150684931506849, + "grad_norm": 3.304703712463379, + "learning_rate": 1.2578125e-05, + "loss": 0.1157, + "step": 161 + }, + { + "epoch": 0.31702544031311153, + "grad_norm": 4.064731121063232, + "learning_rate": 1.2656250000000001e-05, + "loss": 0.1507, + "step": 162 + }, + { + "epoch": 0.31898238747553814, + "grad_norm": 4.615545749664307, + "learning_rate": 1.2734375000000001e-05, + "loss": 0.1879, + "step": 163 + }, + { + "epoch": 0.32093933463796476, + "grad_norm": 3.767533540725708, + "learning_rate": 1.2812500000000001e-05, + "loss": 0.143, + "step": 164 + }, + { + "epoch": 0.32289628180039137, + "grad_norm": 4.727967262268066, + "learning_rate": 1.2890625000000002e-05, + "loss": 0.2227, + "step": 165 + }, + { + "epoch": 0.32289628180039137, + "eval_loss": 0.11155427247285843, + "eval_runtime": 107.2898, + "eval_samples_per_second": 28.446, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8775899700998113, + "eval_sts-test_pearson_dot": 0.8571711542435376, + "eval_sts-test_pearson_euclidean": 0.907399950708088, + "eval_sts-test_pearson_manhattan": 0.9073879045697356, + "eval_sts-test_pearson_max": 0.907399950708088, + "eval_sts-test_spearman_cosine": 0.9049959431197784, + "eval_sts-test_spearman_dot": 0.8667648957618442, + "eval_sts-test_spearman_euclidean": 0.9048916279294749, + "eval_sts-test_spearman_manhattan": 0.9050786882020909, + "eval_sts-test_spearman_max": 0.9050786882020909, + "step": 165 + }, + { + "epoch": 0.324853228962818, + "grad_norm": 4.0150017738342285, + "learning_rate": 1.2968750000000002e-05, + "loss": 0.1624, + "step": 166 + }, + { + "epoch": 0.3268101761252446, + "grad_norm": 3.021153450012207, + "learning_rate": 1.3046875e-05, + "loss": 0.1345, + "step": 167 + }, + { + "epoch": 0.3287671232876712, + "grad_norm": 3.869710922241211, + "learning_rate": 1.3125e-05, + "loss": 0.1765, + "step": 168 + }, + { + "epoch": 0.33072407045009783, + "grad_norm": 3.538076162338257, + "learning_rate": 1.3203125e-05, + "loss": 0.1368, + "step": 169 + }, + { + "epoch": 0.33268101761252444, + "grad_norm": 3.378551483154297, + "learning_rate": 1.3281250000000001e-05, + "loss": 0.0962, + "step": 170 + }, + { + "epoch": 0.33268101761252444, + "eval_loss": 0.11131894588470459, + "eval_runtime": 107.3532, + "eval_samples_per_second": 28.43, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8782576778514848, + "eval_sts-test_pearson_dot": 0.8576530243239538, + "eval_sts-test_pearson_euclidean": 0.9077401564122008, + "eval_sts-test_pearson_manhattan": 0.907609849534313, + "eval_sts-test_pearson_max": 0.9077401564122008, + "eval_sts-test_spearman_cosine": 0.9055560946586144, + "eval_sts-test_spearman_dot": 0.8666707838591381, + "eval_sts-test_spearman_euclidean": 0.9054064016892602, + "eval_sts-test_spearman_manhattan": 0.9054834186101147, + "eval_sts-test_spearman_max": 0.9055560946586144, + "step": 170 + }, + { + "epoch": 0.33463796477495106, + "grad_norm": 4.588249683380127, + "learning_rate": 1.3359375000000001e-05, + "loss": 0.1783, + "step": 171 + }, + { + "epoch": 0.33659491193737767, + "grad_norm": 4.370199680328369, + "learning_rate": 1.3437500000000001e-05, + "loss": 0.2019, + "step": 172 + }, + { + "epoch": 0.3385518590998043, + "grad_norm": 4.000157356262207, + "learning_rate": 1.3515625000000002e-05, + "loss": 0.1761, + "step": 173 + }, + { + "epoch": 0.3405088062622309, + "grad_norm": 4.3335862159729, + "learning_rate": 1.3593750000000002e-05, + "loss": 0.1855, + "step": 174 + }, + { + "epoch": 0.3424657534246575, + "grad_norm": 4.247244358062744, + "learning_rate": 1.3671875e-05, + "loss": 0.1922, + "step": 175 + }, + { + "epoch": 0.3424657534246575, + "eval_loss": 0.1105586364865303, + "eval_runtime": 107.3507, + "eval_samples_per_second": 28.43, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8775475016000731, + "eval_sts-test_pearson_dot": 0.8543732981082479, + "eval_sts-test_pearson_euclidean": 0.9076643456809551, + "eval_sts-test_pearson_manhattan": 0.9075054089199206, + "eval_sts-test_pearson_max": 0.9076643456809551, + "eval_sts-test_spearman_cosine": 0.905357578063082, + "eval_sts-test_spearman_dot": 0.8628476388472094, + "eval_sts-test_spearman_euclidean": 0.9054710672619708, + "eval_sts-test_spearman_manhattan": 0.9055309444497123, + "eval_sts-test_spearman_max": 0.9055309444497123, + "step": 175 + }, + { + "epoch": 0.34442270058708413, + "grad_norm": 3.881108522415161, + "learning_rate": 1.375e-05, + "loss": 0.1538, + "step": 176 + }, + { + "epoch": 0.34637964774951074, + "grad_norm": 3.4271416664123535, + "learning_rate": 1.3828125e-05, + "loss": 0.1049, + "step": 177 + }, + { + "epoch": 0.34833659491193736, + "grad_norm": 3.7847940921783447, + "learning_rate": 1.3906250000000001e-05, + "loss": 0.1619, + "step": 178 + }, + { + "epoch": 0.350293542074364, + "grad_norm": 2.3725311756134033, + "learning_rate": 1.3984375000000001e-05, + "loss": 0.0731, + "step": 179 + }, + { + "epoch": 0.3522504892367906, + "grad_norm": 3.6820032596588135, + "learning_rate": 1.4062500000000001e-05, + "loss": 0.1205, + "step": 180 + }, + { + "epoch": 0.3522504892367906, + "eval_loss": 0.10974939167499542, + "eval_runtime": 107.353, + "eval_samples_per_second": 28.43, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8782123578217031, + "eval_sts-test_pearson_dot": 0.852106566478191, + "eval_sts-test_pearson_euclidean": 0.9088860377565003, + "eval_sts-test_pearson_manhattan": 0.9087269620613702, + "eval_sts-test_pearson_max": 0.9088860377565003, + "eval_sts-test_spearman_cosine": 0.9058966517578029, + "eval_sts-test_spearman_dot": 0.8595467858069799, + "eval_sts-test_spearman_euclidean": 0.9064047128283795, + "eval_sts-test_spearman_manhattan": 0.9067846510375924, + "eval_sts-test_spearman_max": 0.9067846510375924, + "step": 180 + }, + { + "epoch": 0.3542074363992172, + "grad_norm": 3.7714688777923584, + "learning_rate": 1.4140625000000002e-05, + "loss": 0.169, + "step": 181 + }, + { + "epoch": 0.3561643835616438, + "grad_norm": 3.7113559246063232, + "learning_rate": 1.4218750000000002e-05, + "loss": 0.1688, + "step": 182 + }, + { + "epoch": 0.35812133072407043, + "grad_norm": 3.1639597415924072, + "learning_rate": 1.4296875000000002e-05, + "loss": 0.1274, + "step": 183 + }, + { + "epoch": 0.36007827788649704, + "grad_norm": 4.144288539886475, + "learning_rate": 1.4375e-05, + "loss": 0.1477, + "step": 184 + }, + { + "epoch": 0.36203522504892366, + "grad_norm": 3.4342098236083984, + "learning_rate": 1.4453125e-05, + "loss": 0.1418, + "step": 185 + }, + { + "epoch": 0.36203522504892366, + "eval_loss": 0.10942607372999191, + "eval_runtime": 107.2679, + "eval_samples_per_second": 28.452, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8778855142398189, + "eval_sts-test_pearson_dot": 0.8501658695420333, + "eval_sts-test_pearson_euclidean": 0.9088432870055996, + "eval_sts-test_pearson_manhattan": 0.9086435133118579, + "eval_sts-test_pearson_max": 0.9088432870055996, + "eval_sts-test_spearman_cosine": 0.9055185931015683, + "eval_sts-test_spearman_dot": 0.8575025481866207, + "eval_sts-test_spearman_euclidean": 0.9063994321795352, + "eval_sts-test_spearman_manhattan": 0.9064969899293684, + "eval_sts-test_spearman_max": 0.9064969899293684, + "step": 185 + }, + { + "epoch": 0.3639921722113503, + "grad_norm": 4.744626045227051, + "learning_rate": 1.453125e-05, + "loss": 0.2477, + "step": 186 + }, + { + "epoch": 0.3659491193737769, + "grad_norm": 4.062248229980469, + "learning_rate": 1.4609375000000001e-05, + "loss": 0.1713, + "step": 187 + }, + { + "epoch": 0.3679060665362035, + "grad_norm": 3.989694833755493, + "learning_rate": 1.4687500000000001e-05, + "loss": 0.1703, + "step": 188 + }, + { + "epoch": 0.3698630136986301, + "grad_norm": 3.3543660640716553, + "learning_rate": 1.4765625000000001e-05, + "loss": 0.1176, + "step": 189 + }, + { + "epoch": 0.37181996086105673, + "grad_norm": 4.307045936584473, + "learning_rate": 1.4843750000000002e-05, + "loss": 0.1811, + "step": 190 + }, + { + "epoch": 0.37181996086105673, + "eval_loss": 0.10837770998477936, + "eval_runtime": 107.3429, + "eval_samples_per_second": 28.432, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8774103555789884, + "eval_sts-test_pearson_dot": 0.84942827650618, + "eval_sts-test_pearson_euclidean": 0.9086430009253119, + "eval_sts-test_pearson_manhattan": 0.9084642534632353, + "eval_sts-test_pearson_max": 0.9086430009253119, + "eval_sts-test_spearman_cosine": 0.9048482639571866, + "eval_sts-test_spearman_dot": 0.8562155914115267, + "eval_sts-test_spearman_euclidean": 0.9060070531196555, + "eval_sts-test_spearman_manhattan": 0.9061608184537963, + "eval_sts-test_spearman_max": 0.9061608184537963, + "step": 190 + }, + { + "epoch": 0.37377690802348335, + "grad_norm": 4.140930652618408, + "learning_rate": 1.4921875000000002e-05, + "loss": 0.162, + "step": 191 + }, + { + "epoch": 0.37573385518590996, + "grad_norm": 2.7555642127990723, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.1141, + "step": 192 + }, + { + "epoch": 0.3776908023483366, + "grad_norm": 4.070343017578125, + "learning_rate": 1.5078125e-05, + "loss": 0.154, + "step": 193 + }, + { + "epoch": 0.3796477495107632, + "grad_norm": 4.453440189361572, + "learning_rate": 1.515625e-05, + "loss": 0.2461, + "step": 194 + }, + { + "epoch": 0.3816046966731898, + "grad_norm": 3.7656772136688232, + "learning_rate": 1.5234375000000001e-05, + "loss": 0.1573, + "step": 195 + }, + { + "epoch": 0.3816046966731898, + "eval_loss": 0.10762027651071548, + "eval_runtime": 107.299, + "eval_samples_per_second": 28.444, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8779461080888007, + "eval_sts-test_pearson_dot": 0.8521074278329072, + "eval_sts-test_pearson_euclidean": 0.9087045359990432, + "eval_sts-test_pearson_manhattan": 0.9086340705654771, + "eval_sts-test_pearson_max": 0.9087045359990432, + "eval_sts-test_spearman_cosine": 0.9045706718827756, + "eval_sts-test_spearman_dot": 0.8584340456924826, + "eval_sts-test_spearman_euclidean": 0.9055143864829975, + "eval_sts-test_spearman_manhattan": 0.9058283613329196, + "eval_sts-test_spearman_max": 0.9058283613329196, + "step": 195 + }, + { + "epoch": 0.3835616438356164, + "grad_norm": 3.063400983810425, + "learning_rate": 1.5312500000000003e-05, + "loss": 0.1197, + "step": 196 + }, + { + "epoch": 0.38551859099804303, + "grad_norm": 3.893153429031372, + "learning_rate": 1.5390625e-05, + "loss": 0.1395, + "step": 197 + }, + { + "epoch": 0.38747553816046965, + "grad_norm": 2.95540714263916, + "learning_rate": 1.546875e-05, + "loss": 0.0847, + "step": 198 + }, + { + "epoch": 0.38943248532289626, + "grad_norm": 3.4665300846099854, + "learning_rate": 1.5546875e-05, + "loss": 0.1848, + "step": 199 + }, + { + "epoch": 0.3913894324853229, + "grad_norm": 3.6926543712615967, + "learning_rate": 1.5625e-05, + "loss": 0.1377, + "step": 200 + }, + { + "epoch": 0.3913894324853229, + "eval_loss": 0.10723523795604706, + "eval_runtime": 107.245, + "eval_samples_per_second": 28.458, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.877994665901344, + "eval_sts-test_pearson_dot": 0.854134605280733, + "eval_sts-test_pearson_euclidean": 0.9085191117850383, + "eval_sts-test_pearson_manhattan": 0.9086424100414001, + "eval_sts-test_pearson_max": 0.9086424100414001, + "eval_sts-test_spearman_cosine": 0.904685279863199, + "eval_sts-test_spearman_dot": 0.8598855528557127, + "eval_sts-test_spearman_euclidean": 0.9052407772708506, + "eval_sts-test_spearman_manhattan": 0.9058868959828196, + "eval_sts-test_spearman_max": 0.9058868959828196, + "step": 200 + }, + { + "epoch": 0.3933463796477495, + "grad_norm": 3.303112268447876, + "learning_rate": 1.5703125e-05, + "loss": 0.1109, + "step": 201 + }, + { + "epoch": 0.3953033268101761, + "grad_norm": 3.4490058422088623, + "learning_rate": 1.578125e-05, + "loss": 0.1051, + "step": 202 + }, + { + "epoch": 0.3972602739726027, + "grad_norm": 2.6598286628723145, + "learning_rate": 1.5859375e-05, + "loss": 0.0975, + "step": 203 + }, + { + "epoch": 0.39921722113502933, + "grad_norm": 3.373512029647827, + "learning_rate": 1.59375e-05, + "loss": 0.127, + "step": 204 + }, + { + "epoch": 0.40117416829745595, + "grad_norm": 3.1471354961395264, + "learning_rate": 1.6015625e-05, + "loss": 0.1297, + "step": 205 + }, + { + "epoch": 0.40117416829745595, + "eval_loss": 0.10685314983129501, + "eval_runtime": 107.3321, + "eval_samples_per_second": 28.435, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8785914848590666, + "eval_sts-test_pearson_dot": 0.8570818659891223, + "eval_sts-test_pearson_euclidean": 0.9086611488562145, + "eval_sts-test_pearson_manhattan": 0.9087606701935215, + "eval_sts-test_pearson_max": 0.9087606701935215, + "eval_sts-test_spearman_cosine": 0.9048987433800361, + "eval_sts-test_spearman_dot": 0.8616398023022556, + "eval_sts-test_spearman_euclidean": 0.9052247563192726, + "eval_sts-test_spearman_manhattan": 0.9056138237858093, + "eval_sts-test_spearman_max": 0.9056138237858093, + "step": 205 + }, + { + "epoch": 0.40313111545988256, + "grad_norm": 2.6924684047698975, + "learning_rate": 1.609375e-05, + "loss": 0.0783, + "step": 206 + }, + { + "epoch": 0.4050880626223092, + "grad_norm": 2.1100542545318604, + "learning_rate": 1.6171875000000002e-05, + "loss": 0.053, + "step": 207 + }, + { + "epoch": 0.4070450097847358, + "grad_norm": 3.7984156608581543, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.1916, + "step": 208 + }, + { + "epoch": 0.4090019569471624, + "grad_norm": 4.329834461212158, + "learning_rate": 1.6328125000000002e-05, + "loss": 0.178, + "step": 209 + }, + { + "epoch": 0.410958904109589, + "grad_norm": 4.427723407745361, + "learning_rate": 1.6406250000000002e-05, + "loss": 0.2343, + "step": 210 + }, + { + "epoch": 0.410958904109589, + "eval_loss": 0.10670512914657593, + "eval_runtime": 107.2313, + "eval_samples_per_second": 28.462, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8788965355860006, + "eval_sts-test_pearson_dot": 0.8580075676260999, + "eval_sts-test_pearson_euclidean": 0.908776492246521, + "eval_sts-test_pearson_manhattan": 0.9089340980301853, + "eval_sts-test_pearson_max": 0.9089340980301853, + "eval_sts-test_spearman_cosine": 0.90530862018312, + "eval_sts-test_spearman_dot": 0.8630207814775328, + "eval_sts-test_spearman_euclidean": 0.905449362900196, + "eval_sts-test_spearman_manhattan": 0.9056519071092534, + "eval_sts-test_spearman_max": 0.9056519071092534, + "step": 210 + }, + { + "epoch": 0.41291585127201563, + "grad_norm": 3.890899419784546, + "learning_rate": 1.6484375000000003e-05, + "loss": 0.1816, + "step": 211 + }, + { + "epoch": 0.41487279843444225, + "grad_norm": 4.071934700012207, + "learning_rate": 1.6562500000000003e-05, + "loss": 0.2522, + "step": 212 + }, + { + "epoch": 0.41682974559686886, + "grad_norm": 3.8046796321868896, + "learning_rate": 1.6640625000000003e-05, + "loss": 0.1787, + "step": 213 + }, + { + "epoch": 0.4187866927592955, + "grad_norm": 3.357276201248169, + "learning_rate": 1.671875e-05, + "loss": 0.1913, + "step": 214 + }, + { + "epoch": 0.4207436399217221, + "grad_norm": 3.8679873943328857, + "learning_rate": 1.6796875e-05, + "loss": 0.175, + "step": 215 + }, + { + "epoch": 0.4207436399217221, + "eval_loss": 0.10552908480167389, + "eval_runtime": 107.6412, + "eval_samples_per_second": 28.353, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8791676817178924, + "eval_sts-test_pearson_dot": 0.8573342496118925, + "eval_sts-test_pearson_euclidean": 0.909475190469058, + "eval_sts-test_pearson_manhattan": 0.9097533727394405, + "eval_sts-test_pearson_max": 0.9097533727394405, + "eval_sts-test_spearman_cosine": 0.9056468502167161, + "eval_sts-test_spearman_dot": 0.8624976392318674, + "eval_sts-test_spearman_euclidean": 0.9066117769148375, + "eval_sts-test_spearman_manhattan": 0.9069566301351195, + "eval_sts-test_spearman_max": 0.9069566301351195, + "step": 215 + }, + { + "epoch": 0.4227005870841487, + "grad_norm": 3.436488389968872, + "learning_rate": 1.6875e-05, + "loss": 0.1533, + "step": 216 + }, + { + "epoch": 0.4246575342465753, + "grad_norm": 3.891040563583374, + "learning_rate": 1.6953125e-05, + "loss": 0.1819, + "step": 217 + }, + { + "epoch": 0.42661448140900193, + "grad_norm": 4.554884910583496, + "learning_rate": 1.703125e-05, + "loss": 0.2541, + "step": 218 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 3.4431850910186768, + "learning_rate": 1.7109375e-05, + "loss": 0.1103, + "step": 219 + }, + { + "epoch": 0.43052837573385516, + "grad_norm": 3.5396361351013184, + "learning_rate": 1.71875e-05, + "loss": 0.1693, + "step": 220 + }, + { + "epoch": 0.43052837573385516, + "eval_loss": 0.10396925359964371, + "eval_runtime": 107.33, + "eval_samples_per_second": 28.436, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8794186180626897, + "eval_sts-test_pearson_dot": 0.8555325369075935, + "eval_sts-test_pearson_euclidean": 0.9099071011406157, + "eval_sts-test_pearson_manhattan": 0.9104095617945829, + "eval_sts-test_pearson_max": 0.9104095617945829, + "eval_sts-test_spearman_cosine": 0.9061536582519738, + "eval_sts-test_spearman_dot": 0.8609769018672648, + "eval_sts-test_spearman_euclidean": 0.9068523149448162, + "eval_sts-test_spearman_manhattan": 0.9075606826613808, + "eval_sts-test_spearman_max": 0.9075606826613808, + "step": 220 + }, + { + "epoch": 0.4324853228962818, + "grad_norm": 3.4416589736938477, + "learning_rate": 1.7265625e-05, + "loss": 0.1233, + "step": 221 + }, + { + "epoch": 0.4344422700587084, + "grad_norm": 2.9554316997528076, + "learning_rate": 1.734375e-05, + "loss": 0.0922, + "step": 222 + }, + { + "epoch": 0.436399217221135, + "grad_norm": 3.1570141315460205, + "learning_rate": 1.7421875e-05, + "loss": 0.1243, + "step": 223 + }, + { + "epoch": 0.4383561643835616, + "grad_norm": 3.8479344844818115, + "learning_rate": 1.7500000000000002e-05, + "loss": 0.1613, + "step": 224 + }, + { + "epoch": 0.44031311154598823, + "grad_norm": 3.004990339279175, + "learning_rate": 1.7578125000000002e-05, + "loss": 0.1188, + "step": 225 + }, + { + "epoch": 0.44031311154598823, + "eval_loss": 0.1029738187789917, + "eval_runtime": 107.2661, + "eval_samples_per_second": 28.453, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8796765477862789, + "eval_sts-test_pearson_dot": 0.8576485629522204, + "eval_sts-test_pearson_euclidean": 0.9098263075403831, + "eval_sts-test_pearson_manhattan": 0.9104321398639006, + "eval_sts-test_pearson_max": 0.9104321398639006, + "eval_sts-test_spearman_cosine": 0.9064603386462892, + "eval_sts-test_spearman_dot": 0.8635142088856343, + "eval_sts-test_spearman_euclidean": 0.9066103896257344, + "eval_sts-test_spearman_manhattan": 0.9076328216947436, + "eval_sts-test_spearman_max": 0.9076328216947436, + "step": 225 + }, + { + "epoch": 0.44227005870841485, + "grad_norm": 3.595667839050293, + "learning_rate": 1.7656250000000002e-05, + "loss": 0.196, + "step": 226 + }, + { + "epoch": 0.44422700587084146, + "grad_norm": 3.9599428176879883, + "learning_rate": 1.7734375000000002e-05, + "loss": 0.2254, + "step": 227 + }, + { + "epoch": 0.4461839530332681, + "grad_norm": 3.2490875720977783, + "learning_rate": 1.7812500000000003e-05, + "loss": 0.1162, + "step": 228 + }, + { + "epoch": 0.4481409001956947, + "grad_norm": 4.811342239379883, + "learning_rate": 1.7890625000000003e-05, + "loss": 0.2579, + "step": 229 + }, + { + "epoch": 0.4500978473581213, + "grad_norm": 2.993255138397217, + "learning_rate": 1.7968750000000003e-05, + "loss": 0.1203, + "step": 230 + }, + { + "epoch": 0.4500978473581213, + "eval_loss": 0.102933868765831, + "eval_runtime": 107.2515, + "eval_samples_per_second": 28.456, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8799758353085696, + "eval_sts-test_pearson_dot": 0.8592997081846103, + "eval_sts-test_pearson_euclidean": 0.9101945793558552, + "eval_sts-test_pearson_manhattan": 0.9106837055219174, + "eval_sts-test_pearson_max": 0.9106837055219174, + "eval_sts-test_spearman_cosine": 0.9071432428951217, + "eval_sts-test_spearman_dot": 0.865314059867535, + "eval_sts-test_spearman_euclidean": 0.9072587906520344, + "eval_sts-test_spearman_manhattan": 0.9077949555147645, + "eval_sts-test_spearman_max": 0.9077949555147645, + "step": 230 + }, + { + "epoch": 0.4520547945205479, + "grad_norm": 3.654191017150879, + "learning_rate": 1.8046875e-05, + "loss": 0.1654, + "step": 231 + }, + { + "epoch": 0.45401174168297453, + "grad_norm": 3.429565668106079, + "learning_rate": 1.8125e-05, + "loss": 0.1808, + "step": 232 + }, + { + "epoch": 0.45596868884540115, + "grad_norm": 3.5679566860198975, + "learning_rate": 1.8203125e-05, + "loss": 0.1397, + "step": 233 + }, + { + "epoch": 0.45792563600782776, + "grad_norm": 3.9862124919891357, + "learning_rate": 1.828125e-05, + "loss": 0.2177, + "step": 234 + }, + { + "epoch": 0.4598825831702544, + "grad_norm": 3.536984443664551, + "learning_rate": 1.8359375e-05, + "loss": 0.162, + "step": 235 + }, + { + "epoch": 0.4598825831702544, + "eval_loss": 0.10404225438833237, + "eval_runtime": 107.254, + "eval_samples_per_second": 28.456, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8802088610554777, + "eval_sts-test_pearson_dot": 0.8618209119350905, + "eval_sts-test_pearson_euclidean": 0.9103461475031536, + "eval_sts-test_pearson_manhattan": 0.9106782364335553, + "eval_sts-test_pearson_max": 0.9106782364335553, + "eval_sts-test_spearman_cosine": 0.9077748174471387, + "eval_sts-test_spearman_dot": 0.8686349167216066, + "eval_sts-test_spearman_euclidean": 0.907571109705285, + "eval_sts-test_spearman_manhattan": 0.9080472631264893, + "eval_sts-test_spearman_max": 0.9080472631264893, + "step": 235 + }, + { + "epoch": 0.461839530332681, + "grad_norm": 3.2987570762634277, + "learning_rate": 1.84375e-05, + "loss": 0.177, + "step": 236 + }, + { + "epoch": 0.4637964774951076, + "grad_norm": 1.792919397354126, + "learning_rate": 1.8515625e-05, + "loss": 0.0556, + "step": 237 + }, + { + "epoch": 0.4657534246575342, + "grad_norm": 3.8270483016967773, + "learning_rate": 1.859375e-05, + "loss": 0.2285, + "step": 238 + }, + { + "epoch": 0.46771037181996084, + "grad_norm": 3.2458577156066895, + "learning_rate": 1.8671875e-05, + "loss": 0.1657, + "step": 239 + }, + { + "epoch": 0.46966731898238745, + "grad_norm": 4.352839469909668, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.2555, + "step": 240 + }, + { + "epoch": 0.46966731898238745, + "eval_loss": 0.10528620332479477, + "eval_runtime": 107.3201, + "eval_samples_per_second": 28.438, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8794205585889476, + "eval_sts-test_pearson_dot": 0.8616236846471828, + "eval_sts-test_pearson_euclidean": 0.9100171674371834, + "eval_sts-test_pearson_manhattan": 0.9102120642982687, + "eval_sts-test_pearson_max": 0.9102120642982687, + "eval_sts-test_spearman_cosine": 0.9076779309662261, + "eval_sts-test_spearman_dot": 0.8702396969551023, + "eval_sts-test_spearman_euclidean": 0.9078436896384199, + "eval_sts-test_spearman_manhattan": 0.9080407741935878, + "eval_sts-test_spearman_max": 0.9080407741935878, + "step": 240 + }, + { + "epoch": 0.47162426614481406, + "grad_norm": 3.644327163696289, + "learning_rate": 1.8828125000000002e-05, + "loss": 0.1606, + "step": 241 + }, + { + "epoch": 0.4735812133072407, + "grad_norm": 3.0316474437713623, + "learning_rate": 1.8906250000000002e-05, + "loss": 0.1257, + "step": 242 + }, + { + "epoch": 0.4755381604696673, + "grad_norm": 3.8527326583862305, + "learning_rate": 1.8984375000000002e-05, + "loss": 0.1898, + "step": 243 + }, + { + "epoch": 0.4774951076320939, + "grad_norm": 3.91603422164917, + "learning_rate": 1.9062500000000003e-05, + "loss": 0.1621, + "step": 244 + }, + { + "epoch": 0.4794520547945205, + "grad_norm": 3.6845171451568604, + "learning_rate": 1.9140625000000003e-05, + "loss": 0.1606, + "step": 245 + }, + { + "epoch": 0.4794520547945205, + "eval_loss": 0.10541080683469772, + "eval_runtime": 107.3443, + "eval_samples_per_second": 28.432, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8782579310286232, + "eval_sts-test_pearson_dot": 0.8596847230641689, + "eval_sts-test_pearson_euclidean": 0.909741577402618, + "eval_sts-test_pearson_manhattan": 0.9098438643121189, + "eval_sts-test_pearson_max": 0.9098438643121189, + "eval_sts-test_spearman_cosine": 0.9078928712746891, + "eval_sts-test_spearman_dot": 0.8682800392187727, + "eval_sts-test_spearman_euclidean": 0.9083291960732551, + "eval_sts-test_spearman_manhattan": 0.908423397478484, + "eval_sts-test_spearman_max": 0.908423397478484, + "step": 245 + }, + { + "epoch": 0.48140900195694714, + "grad_norm": 3.31758451461792, + "learning_rate": 1.9218750000000003e-05, + "loss": 0.0983, + "step": 246 + }, + { + "epoch": 0.48336594911937375, + "grad_norm": 3.8613622188568115, + "learning_rate": 1.9296875000000003e-05, + "loss": 0.2028, + "step": 247 + }, + { + "epoch": 0.48532289628180036, + "grad_norm": 2.792924165725708, + "learning_rate": 1.9375e-05, + "loss": 0.0997, + "step": 248 + }, + { + "epoch": 0.487279843444227, + "grad_norm": 3.4162261486053467, + "learning_rate": 1.9453125e-05, + "loss": 0.1582, + "step": 249 + }, + { + "epoch": 0.4892367906066536, + "grad_norm": 4.499621391296387, + "learning_rate": 1.953125e-05, + "loss": 0.2394, + "step": 250 + }, + { + "epoch": 0.4892367906066536, + "eval_loss": 0.10517927259206772, + "eval_runtime": 107.2761, + "eval_samples_per_second": 28.45, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8775291091187776, + "eval_sts-test_pearson_dot": 0.8591957018286404, + "eval_sts-test_pearson_euclidean": 0.9092406666480166, + "eval_sts-test_pearson_manhattan": 0.909395200356788, + "eval_sts-test_pearson_max": 0.909395200356788, + "eval_sts-test_spearman_cosine": 0.9073655224104529, + "eval_sts-test_spearman_dot": 0.866218124850164, + "eval_sts-test_spearman_euclidean": 0.9077081380676655, + "eval_sts-test_spearman_manhattan": 0.907968321901395, + "eval_sts-test_spearman_max": 0.907968321901395, + "step": 250 + }, + { + "epoch": 0.4911937377690802, + "grad_norm": 4.491675853729248, + "learning_rate": 1.9609375e-05, + "loss": 0.2186, + "step": 251 + }, + { + "epoch": 0.4931506849315068, + "grad_norm": 2.9051578044891357, + "learning_rate": 1.96875e-05, + "loss": 0.0993, + "step": 252 + }, + { + "epoch": 0.49510763209393344, + "grad_norm": 3.53365421295166, + "learning_rate": 1.9765625e-05, + "loss": 0.1805, + "step": 253 + }, + { + "epoch": 0.49706457925636005, + "grad_norm": 3.2181098461151123, + "learning_rate": 1.984375e-05, + "loss": 0.1178, + "step": 254 + }, + { + "epoch": 0.49902152641878667, + "grad_norm": 4.045453071594238, + "learning_rate": 1.9921875e-05, + "loss": 0.2198, + "step": 255 + }, + { + "epoch": 0.49902152641878667, + "eval_loss": 0.10428859293460846, + "eval_runtime": 107.2698, + "eval_samples_per_second": 28.452, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8777129617944619, + "eval_sts-test_pearson_dot": 0.8588391652180615, + "eval_sts-test_pearson_euclidean": 0.9093230964292308, + "eval_sts-test_pearson_manhattan": 0.9095932968076137, + "eval_sts-test_pearson_max": 0.9095932968076137, + "eval_sts-test_spearman_cosine": 0.9069800350448274, + "eval_sts-test_spearman_dot": 0.8639776976998651, + "eval_sts-test_spearman_euclidean": 0.9072912800678044, + "eval_sts-test_spearman_manhattan": 0.9080281095866138, + "eval_sts-test_spearman_max": 0.9080281095866138, + "step": 255 + }, + { + "epoch": 0.5009784735812133, + "grad_norm": 2.8251521587371826, + "learning_rate": 2e-05, + "loss": 0.1064, + "step": 256 + }, + { + "epoch": 0.50293542074364, + "grad_norm": 3.3597464561462402, + "learning_rate": 1.999924308128909e-05, + "loss": 0.1436, + "step": 257 + }, + { + "epoch": 0.5048923679060665, + "grad_norm": 2.580488920211792, + "learning_rate": 1.9996972439741537e-05, + "loss": 0.0859, + "step": 258 + }, + { + "epoch": 0.5068493150684932, + "grad_norm": 3.937856674194336, + "learning_rate": 1.9993188419095562e-05, + "loss": 0.2157, + "step": 259 + }, + { + "epoch": 0.5088062622309197, + "grad_norm": 3.344531774520874, + "learning_rate": 1.9987891592190367e-05, + "loss": 0.1455, + "step": 260 + }, + { + "epoch": 0.5088062622309197, + "eval_loss": 0.10292962938547134, + "eval_runtime": 107.2285, + "eval_samples_per_second": 28.463, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8767515459977318, + "eval_sts-test_pearson_dot": 0.8564862360521637, + "eval_sts-test_pearson_euclidean": 0.9083760527634203, + "eval_sts-test_pearson_manhattan": 0.9086626400377007, + "eval_sts-test_pearson_max": 0.9086626400377007, + "eval_sts-test_spearman_cosine": 0.9057508521481897, + "eval_sts-test_spearman_dot": 0.8601081456298736, + "eval_sts-test_spearman_euclidean": 0.9063700753520626, + "eval_sts-test_spearman_manhattan": 0.9068438122051519, + "eval_sts-test_spearman_max": 0.9068438122051519, + "step": 260 + }, + { + "epoch": 0.5107632093933464, + "grad_norm": 3.7637484073638916, + "learning_rate": 1.9981082760879432e-05, + "loss": 0.1974, + "step": 261 + }, + { + "epoch": 0.512720156555773, + "grad_norm": 3.182102918624878, + "learning_rate": 1.997276295590912e-05, + "loss": 0.1667, + "step": 262 + }, + { + "epoch": 0.5146771037181996, + "grad_norm": 3.7908170223236084, + "learning_rate": 1.9962933436762644e-05, + "loss": 0.1512, + "step": 263 + }, + { + "epoch": 0.5166340508806262, + "grad_norm": 3.4492650032043457, + "learning_rate": 1.9951595691469397e-05, + "loss": 0.1684, + "step": 264 + }, + { + "epoch": 0.5185909980430529, + "grad_norm": 3.816772222518921, + "learning_rate": 1.9938751436379684e-05, + "loss": 0.2132, + "step": 265 + }, + { + "epoch": 0.5185909980430529, + "eval_loss": 0.10117975622415543, + "eval_runtime": 107.3212, + "eval_samples_per_second": 28.438, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8770393502752714, + "eval_sts-test_pearson_dot": 0.8567524208989885, + "eval_sts-test_pearson_euclidean": 0.9080912956763092, + "eval_sts-test_pearson_manhattan": 0.908247948105785, + "eval_sts-test_pearson_max": 0.908247948105785, + "eval_sts-test_spearman_cosine": 0.9053279079767796, + "eval_sts-test_spearman_dot": 0.8598375795035011, + "eval_sts-test_spearman_euclidean": 0.9057662913333698, + "eval_sts-test_spearman_manhattan": 0.9061448870047409, + "eval_sts-test_spearman_max": 0.9061448870047409, + "step": 265 + }, + { + "epoch": 0.5205479452054794, + "grad_norm": 3.5570499897003174, + "learning_rate": 1.992440261590491e-05, + "loss": 0.1645, + "step": 266 + }, + { + "epoch": 0.5225048923679061, + "grad_norm": 4.160579681396484, + "learning_rate": 1.9908551402223218e-05, + "loss": 0.203, + "step": 267 + }, + { + "epoch": 0.5244618395303327, + "grad_norm": 3.5718774795532227, + "learning_rate": 1.9891200194950644e-05, + "loss": 0.1539, + "step": 268 + }, + { + "epoch": 0.5264187866927593, + "grad_norm": 3.604438066482544, + "learning_rate": 1.9872351620777883e-05, + "loss": 0.1445, + "step": 269 + }, + { + "epoch": 0.5283757338551859, + "grad_norm": 3.4854915142059326, + "learning_rate": 1.9852008533072627e-05, + "loss": 0.1377, + "step": 270 + }, + { + "epoch": 0.5283757338551859, + "eval_loss": 0.09936786442995071, + "eval_runtime": 107.3119, + "eval_samples_per_second": 28.44, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8772155384897071, + "eval_sts-test_pearson_dot": 0.8577040756637748, + "eval_sts-test_pearson_euclidean": 0.9081962404777727, + "eval_sts-test_pearson_manhattan": 0.9082660411148933, + "eval_sts-test_pearson_max": 0.9082660411148933, + "eval_sts-test_spearman_cosine": 0.9056296657323417, + "eval_sts-test_spearman_dot": 0.8627456954737598, + "eval_sts-test_spearman_euclidean": 0.9061553587999066, + "eval_sts-test_spearman_manhattan": 0.9063870360801298, + "eval_sts-test_spearman_max": 0.9063870360801298, + "step": 270 + }, + { + "epoch": 0.5303326810176126, + "grad_norm": 3.662992238998413, + "learning_rate": 1.9830174011447617e-05, + "loss": 0.1719, + "step": 271 + }, + { + "epoch": 0.5322896281800391, + "grad_norm": 3.5594613552093506, + "learning_rate": 1.980685136129445e-05, + "loss": 0.1896, + "step": 272 + }, + { + "epoch": 0.5342465753424658, + "grad_norm": 3.257335662841797, + "learning_rate": 1.978204411328318e-05, + "loss": 0.1452, + "step": 273 + }, + { + "epoch": 0.5362035225048923, + "grad_norm": 3.292863368988037, + "learning_rate": 1.9755756022827847e-05, + "loss": 0.1275, + "step": 274 + }, + { + "epoch": 0.538160469667319, + "grad_norm": 4.065443515777588, + "learning_rate": 1.972799106951796e-05, + "loss": 0.1883, + "step": 275 + }, + { + "epoch": 0.538160469667319, + "eval_loss": 0.09800439327955246, + "eval_runtime": 107.2596, + "eval_samples_per_second": 28.454, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8773598628753827, + "eval_sts-test_pearson_dot": 0.8578251655808844, + "eval_sts-test_pearson_euclidean": 0.9082603623937704, + "eval_sts-test_pearson_manhattan": 0.9081101963076783, + "eval_sts-test_pearson_max": 0.9082603623937704, + "eval_sts-test_spearman_cosine": 0.9056689328319392, + "eval_sts-test_spearman_dot": 0.8647132741833555, + "eval_sts-test_spearman_euclidean": 0.9063065285608867, + "eval_sts-test_spearman_manhattan": 0.9067770433231558, + "eval_sts-test_spearman_max": 0.9067770433231558, + "step": 275 + }, + { + "epoch": 0.5401174168297456, + "grad_norm": 3.7186553478240967, + "learning_rate": 1.9698753456516047e-05, + "loss": 0.1462, + "step": 276 + }, + { + "epoch": 0.5420743639921722, + "grad_norm": 3.5399951934814453, + "learning_rate": 1.9668047609921382e-05, + "loss": 0.1595, + "step": 277 + }, + { + "epoch": 0.5440313111545988, + "grad_norm": 3.6143035888671875, + "learning_rate": 1.963587817809993e-05, + "loss": 0.1693, + "step": 278 + }, + { + "epoch": 0.5459882583170255, + "grad_norm": 4.133859634399414, + "learning_rate": 1.9602250030980657e-05, + "loss": 0.1929, + "step": 279 + }, + { + "epoch": 0.547945205479452, + "grad_norm": 3.6929726600646973, + "learning_rate": 1.9567168259318324e-05, + "loss": 0.154, + "step": 280 + }, + { + "epoch": 0.547945205479452, + "eval_loss": 0.0969705730676651, + "eval_runtime": 107.333, + "eval_samples_per_second": 28.435, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8772326487842304, + "eval_sts-test_pearson_dot": 0.8584362564160372, + "eval_sts-test_pearson_euclidean": 0.9077579223693962, + "eval_sts-test_pearson_manhattan": 0.9072827835669532, + "eval_sts-test_pearson_max": 0.9077579223693962, + "eval_sts-test_spearman_cosine": 0.9052923754752349, + "eval_sts-test_spearman_dot": 0.866326959917868, + "eval_sts-test_spearman_euclidean": 0.9057464665245734, + "eval_sts-test_spearman_manhattan": 0.9059635996448444, + "eval_sts-test_spearman_max": 0.9059635996448444, + "step": 280 + }, + { + "epoch": 0.5499021526418787, + "grad_norm": 3.515667200088501, + "learning_rate": 1.953063817392281e-05, + "loss": 0.1468, + "step": 281 + }, + { + "epoch": 0.5518590998043053, + "grad_norm": 2.3627371788024902, + "learning_rate": 1.949266530485513e-05, + "loss": 0.0898, + "step": 282 + }, + { + "epoch": 0.5538160469667319, + "grad_norm": 3.26710844039917, + "learning_rate": 1.945325540059032e-05, + "loss": 0.1425, + "step": 283 + }, + { + "epoch": 0.5557729941291585, + "grad_norm": 3.6672258377075195, + "learning_rate": 1.941241442714716e-05, + "loss": 0.1362, + "step": 284 + }, + { + "epoch": 0.5577299412915852, + "grad_norm": 3.306119203567505, + "learning_rate": 1.9370148567185043e-05, + "loss": 0.1025, + "step": 285 + }, + { + "epoch": 0.5577299412915852, + "eval_loss": 0.09782103449106216, + "eval_runtime": 107.2147, + "eval_samples_per_second": 28.466, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8780373092719852, + "eval_sts-test_pearson_dot": 0.861128415219923, + "eval_sts-test_pearson_euclidean": 0.9076094585437832, + "eval_sts-test_pearson_manhattan": 0.9068707688162918, + "eval_sts-test_pearson_max": 0.9076094585437832, + "eval_sts-test_spearman_cosine": 0.9052606468309083, + "eval_sts-test_spearman_dot": 0.868469739815811, + "eval_sts-test_spearman_euclidean": 0.9051151604801249, + "eval_sts-test_spearman_manhattan": 0.9048908224067698, + "eval_sts-test_spearman_max": 0.9052606468309083, + "step": 285 + }, + { + "epoch": 0.5596868884540117, + "grad_norm": 3.2606685161590576, + "learning_rate": 1.9326464219068023e-05, + "loss": 0.1578, + "step": 286 + }, + { + "epoch": 0.5616438356164384, + "grad_norm": 3.5152740478515625, + "learning_rate": 1.9281367995896187e-05, + "loss": 0.1235, + "step": 287 + }, + { + "epoch": 0.5636007827788649, + "grad_norm": 2.8671882152557373, + "learning_rate": 1.9234866724504554e-05, + "loss": 0.1109, + "step": 288 + }, + { + "epoch": 0.5655577299412916, + "grad_norm": 2.315185785293579, + "learning_rate": 1.9186967444429613e-05, + "loss": 0.0746, + "step": 289 + }, + { + "epoch": 0.5675146771037182, + "grad_norm": 3.4961392879486084, + "learning_rate": 1.913767740684362e-05, + "loss": 0.1471, + "step": 290 + }, + { + "epoch": 0.5675146771037182, + "eval_loss": 0.09924904257059097, + "eval_runtime": 107.3422, + "eval_samples_per_second": 28.432, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8783681424807572, + "eval_sts-test_pearson_dot": 0.861120631953773, + "eval_sts-test_pearson_euclidean": 0.9077238606316402, + "eval_sts-test_pearson_manhattan": 0.9069786963498391, + "eval_sts-test_pearson_max": 0.9077238606316402, + "eval_sts-test_spearman_cosine": 0.9052591700392825, + "eval_sts-test_spearman_dot": 0.8684268233561366, + "eval_sts-test_spearman_euclidean": 0.9046835793152661, + "eval_sts-test_spearman_manhattan": 0.9045985071673613, + "eval_sts-test_spearman_max": 0.9052591700392825, + "step": 290 + }, + { + "epoch": 0.5694716242661448, + "grad_norm": 4.221432209014893, + "learning_rate": 1.9087004073456926e-05, + "loss": 0.2631, + "step": 291 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 3.4570438861846924, + "learning_rate": 1.9034955115388364e-05, + "loss": 0.11, + "step": 292 + }, + { + "epoch": 0.5733855185909981, + "grad_norm": 3.6059136390686035, + "learning_rate": 1.898153841200398e-05, + "loss": 0.1834, + "step": 293 + }, + { + "epoch": 0.5753424657534246, + "grad_norm": 3.3278088569641113, + "learning_rate": 1.892676204972423e-05, + "loss": 0.1277, + "step": 294 + }, + { + "epoch": 0.5772994129158513, + "grad_norm": 4.314577579498291, + "learning_rate": 1.8870634320799822e-05, + "loss": 0.2104, + "step": 295 + }, + { + "epoch": 0.5772994129158513, + "eval_loss": 0.09903673827648163, + "eval_runtime": 107.6434, + "eval_samples_per_second": 28.353, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8781909348259072, + "eval_sts-test_pearson_dot": 0.8596231931866185, + "eval_sts-test_pearson_euclidean": 0.9076411156234586, + "eval_sts-test_pearson_manhattan": 0.9069147632233857, + "eval_sts-test_pearson_max": 0.9076411156234586, + "eval_sts-test_spearman_cosine": 0.9042011607174669, + "eval_sts-test_spearman_dot": 0.8660264551976247, + "eval_sts-test_spearman_euclidean": 0.9044265280698341, + "eval_sts-test_spearman_manhattan": 0.9041656729671835, + "eval_sts-test_spearman_max": 0.9044265280698341, + "step": 295 + }, + { + "epoch": 0.5792563600782779, + "grad_norm": 3.195991039276123, + "learning_rate": 1.8813163722056397e-05, + "loss": 0.1294, + "step": 296 + }, + { + "epoch": 0.5812133072407045, + "grad_norm": 3.6352145671844482, + "learning_rate": 1.875435895360826e-05, + "loss": 0.1672, + "step": 297 + }, + { + "epoch": 0.5831702544031311, + "grad_norm": 3.7248518466949463, + "learning_rate": 1.8694228917541313e-05, + "loss": 0.2171, + "step": 298 + }, + { + "epoch": 0.5851272015655578, + "grad_norm": 3.459801435470581, + "learning_rate": 1.8632782716565438e-05, + "loss": 0.1451, + "step": 299 + }, + { + "epoch": 0.5870841487279843, + "grad_norm": 2.6911542415618896, + "learning_rate": 1.857002965263648e-05, + "loss": 0.0871, + "step": 300 + }, + { + "epoch": 0.5870841487279843, + "eval_loss": 0.09800251573324203, + "eval_runtime": 107.2338, + "eval_samples_per_second": 28.461, + "eval_steps_per_second": 0.224, + "eval_sts-test_pearson_cosine": 0.8772925626670083, + "eval_sts-test_pearson_dot": 0.8566016359384749, + "eval_sts-test_pearson_euclidean": 0.9070931796775764, + "eval_sts-test_pearson_manhattan": 0.9064105714529896, + "eval_sts-test_pearson_max": 0.9070931796775764, + "eval_sts-test_spearman_cosine": 0.9032592361677008, + "eval_sts-test_spearman_dot": 0.8623085204012272, + "eval_sts-test_spearman_euclidean": 0.9038942565668446, + "eval_sts-test_spearman_manhattan": 0.9033954590073763, + "eval_sts-test_spearman_max": 0.9038942565668446, + "step": 300 + }, + { + "epoch": 0.589041095890411, + "grad_norm": 2.913508653640747, + "learning_rate": 1.850597922554809e-05, + "loss": 0.0897, + "step": 301 + }, + { + "epoch": 0.5909980430528375, + "grad_norm": 3.2928783893585205, + "learning_rate": 1.844064113149361e-05, + "loss": 0.1296, + "step": 302 + }, + { + "epoch": 0.5929549902152642, + "grad_norm": 3.2551913261413574, + "learning_rate": 1.8374025261598224e-05, + "loss": 0.1206, + "step": 303 + }, + { + "epoch": 0.5949119373776908, + "grad_norm": 3.246716022491455, + "learning_rate": 1.8306141700421606e-05, + "loss": 0.1665, + "step": 304 + }, + { + "epoch": 0.5968688845401174, + "grad_norm": 3.980085611343384, + "learning_rate": 1.8237000724431283e-05, + "loss": 0.1511, + "step": 305 + }, + { + "epoch": 0.5968688845401174, + "eval_loss": 0.09785618633031845, + "eval_runtime": 107.3879, + "eval_samples_per_second": 28.42, + "eval_steps_per_second": 0.223, + "eval_sts-test_pearson_cosine": 0.8767817266460346, + "eval_sts-test_pearson_dot": 0.8544828510438696, + "eval_sts-test_pearson_euclidean": 0.9070553577944469, + "eval_sts-test_pearson_manhattan": 0.9065146784679962, + "eval_sts-test_pearson_max": 0.9070553577944469, + "eval_sts-test_spearman_cosine": 0.9032290290662617, + "eval_sts-test_spearman_dot": 0.8599922398628699, + "eval_sts-test_spearman_euclidean": 0.9039456310149221, + "eval_sts-test_spearman_manhattan": 0.9035283702537087, + "eval_sts-test_spearman_max": 0.9039456310149221, + "step": 305 + }, + { + "epoch": 0.598825831702544, + "grad_norm": 3.576425790786743, + "learning_rate": 1.8166612800446927e-05, + "loss": 0.1566, + "step": 306 + }, + { + "epoch": 0.6007827788649707, + "grad_norm": 3.3370437622070312, + "learning_rate": 1.809498858405589e-05, + "loss": 0.1339, + "step": 307 + }, + { + "epoch": 0.6027397260273972, + "grad_norm": 3.3882863521575928, + "learning_rate": 1.802213891800007e-05, + "loss": 0.1474, + "step": 308 + }, + { + "epoch": 0.6046966731898239, + "grad_norm": 2.9576971530914307, + "learning_rate": 1.7948074830534535e-05, + "loss": 0.1022, + "step": 309 + } + ], + "logging_steps": 1, + "max_steps": 1022, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 103, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 320, + "trial_name": null, + "trial_params": null +}