CocoRoF's picture
Training in progress, step 405, checkpoint
bb182c7 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.200557103064067,
"eval_steps": 5,
"global_step": 405,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11142061281337047,
"eval_loss": 0.037675488740205765,
"eval_runtime": 6.7342,
"eval_samples_per_second": 222.743,
"eval_steps_per_second": 27.917,
"eval_sts_dev_pearson_cosine": 0.7494115429773479,
"eval_sts_dev_pearson_dot": 0.6583752142885668,
"eval_sts_dev_pearson_euclidean": 0.6941454281465765,
"eval_sts_dev_pearson_manhattan": 0.6964259759684527,
"eval_sts_dev_pearson_max": 0.7494115429773479,
"eval_sts_dev_spearman_cosine": 0.7470700524367354,
"eval_sts_dev_spearman_dot": 0.6497928276890669,
"eval_sts_dev_spearman_euclidean": 0.684590776689316,
"eval_sts_dev_spearman_manhattan": 0.6873610947323412,
"eval_sts_dev_spearman_max": 0.7470700524367354,
"step": 5
},
{
"epoch": 0.22284122562674094,
"grad_norm": 6.7429633140563965,
"learning_rate": 6.957731779439903e-08,
"loss": 0.6923,
"step": 10
},
{
"epoch": 0.22284122562674094,
"eval_loss": 0.0376589335501194,
"eval_runtime": 6.8403,
"eval_samples_per_second": 219.29,
"eval_steps_per_second": 27.484,
"eval_sts_dev_pearson_cosine": 0.7494940477075391,
"eval_sts_dev_pearson_dot": 0.6584328702717946,
"eval_sts_dev_pearson_euclidean": 0.6942213054869852,
"eval_sts_dev_pearson_manhattan": 0.6965001647458872,
"eval_sts_dev_pearson_max": 0.7494940477075391,
"eval_sts_dev_spearman_cosine": 0.7471377072884906,
"eval_sts_dev_spearman_dot": 0.6498755431337675,
"eval_sts_dev_spearman_euclidean": 0.6846545112671376,
"eval_sts_dev_spearman_manhattan": 0.687454500948251,
"eval_sts_dev_spearman_max": 0.7471377072884906,
"step": 10
},
{
"epoch": 0.3342618384401114,
"eval_loss": 0.03763080760836601,
"eval_runtime": 6.9686,
"eval_samples_per_second": 215.253,
"eval_steps_per_second": 26.978,
"eval_sts_dev_pearson_cosine": 0.7496395035968593,
"eval_sts_dev_pearson_dot": 0.6585292611324672,
"eval_sts_dev_pearson_euclidean": 0.6943597344549325,
"eval_sts_dev_pearson_manhattan": 0.6966356509027943,
"eval_sts_dev_pearson_max": 0.7496395035968593,
"eval_sts_dev_spearman_cosine": 0.747293071934341,
"eval_sts_dev_spearman_dot": 0.6499672916131112,
"eval_sts_dev_spearman_euclidean": 0.6848464778088699,
"eval_sts_dev_spearman_manhattan": 0.6875927784863133,
"eval_sts_dev_spearman_max": 0.747293071934341,
"step": 15
},
{
"epoch": 0.4456824512534819,
"grad_norm": 7.523725986480713,
"learning_rate": 1.3915463558879807e-07,
"loss": 0.6832,
"step": 20
},
{
"epoch": 0.4456824512534819,
"eval_loss": 0.03759082034230232,
"eval_runtime": 6.9891,
"eval_samples_per_second": 214.62,
"eval_steps_per_second": 26.899,
"eval_sts_dev_pearson_cosine": 0.7498515127163549,
"eval_sts_dev_pearson_dot": 0.6586892126695529,
"eval_sts_dev_pearson_euclidean": 0.6945632277600391,
"eval_sts_dev_pearson_manhattan": 0.6968351270246123,
"eval_sts_dev_pearson_max": 0.7498515127163549,
"eval_sts_dev_spearman_cosine": 0.7475384213284385,
"eval_sts_dev_spearman_dot": 0.6500677799755323,
"eval_sts_dev_spearman_euclidean": 0.6850767084625934,
"eval_sts_dev_spearman_manhattan": 0.6877654793239389,
"eval_sts_dev_spearman_max": 0.7475384213284385,
"step": 20
},
{
"epoch": 0.5571030640668524,
"eval_loss": 0.03754143416881561,
"eval_runtime": 6.8247,
"eval_samples_per_second": 219.79,
"eval_steps_per_second": 27.547,
"eval_sts_dev_pearson_cosine": 0.7501122656435163,
"eval_sts_dev_pearson_dot": 0.6588212748683685,
"eval_sts_dev_pearson_euclidean": 0.6948708332777139,
"eval_sts_dev_pearson_manhattan": 0.6971351224061912,
"eval_sts_dev_pearson_max": 0.7501122656435163,
"eval_sts_dev_spearman_cosine": 0.7478755024321192,
"eval_sts_dev_spearman_dot": 0.6502167543650381,
"eval_sts_dev_spearman_euclidean": 0.6854436169483377,
"eval_sts_dev_spearman_manhattan": 0.6880846722054696,
"eval_sts_dev_spearman_max": 0.7478755024321192,
"step": 25
},
{
"epoch": 0.6685236768802229,
"grad_norm": 7.176445960998535,
"learning_rate": 2.0873195338319708e-07,
"loss": 0.6787,
"step": 30
},
{
"epoch": 0.6685236768802229,
"eval_loss": 0.037479061633348465,
"eval_runtime": 6.9898,
"eval_samples_per_second": 214.598,
"eval_steps_per_second": 26.896,
"eval_sts_dev_pearson_cosine": 0.7504502235424245,
"eval_sts_dev_pearson_dot": 0.6589729935526047,
"eval_sts_dev_pearson_euclidean": 0.6952782546669927,
"eval_sts_dev_pearson_manhattan": 0.6975315748701472,
"eval_sts_dev_pearson_max": 0.7504502235424245,
"eval_sts_dev_spearman_cosine": 0.7483727549578874,
"eval_sts_dev_spearman_dot": 0.6502927839552382,
"eval_sts_dev_spearman_euclidean": 0.6858779938781956,
"eval_sts_dev_spearman_manhattan": 0.6885426870287449,
"eval_sts_dev_spearman_max": 0.7483727549578874,
"step": 30
},
{
"epoch": 0.7799442896935933,
"eval_loss": 0.03741108253598213,
"eval_runtime": 7.0272,
"eval_samples_per_second": 213.456,
"eval_steps_per_second": 26.753,
"eval_sts_dev_pearson_cosine": 0.7508317539918448,
"eval_sts_dev_pearson_dot": 0.6592089487188968,
"eval_sts_dev_pearson_euclidean": 0.6957145823768739,
"eval_sts_dev_pearson_manhattan": 0.6979566424519045,
"eval_sts_dev_pearson_max": 0.7508317539918448,
"eval_sts_dev_spearman_cosine": 0.7488095875667629,
"eval_sts_dev_spearman_dot": 0.6505123414164061,
"eval_sts_dev_spearman_euclidean": 0.6863890021142346,
"eval_sts_dev_spearman_manhattan": 0.6889574531430644,
"eval_sts_dev_spearman_max": 0.7488095875667629,
"step": 35
},
{
"epoch": 0.8913649025069638,
"grad_norm": 5.811614036560059,
"learning_rate": 2.7830927117759614e-07,
"loss": 0.6154,
"step": 40
},
{
"epoch": 0.8913649025069638,
"eval_loss": 0.03732568398118019,
"eval_runtime": 6.9675,
"eval_samples_per_second": 215.286,
"eval_steps_per_second": 26.983,
"eval_sts_dev_pearson_cosine": 0.7512943163494744,
"eval_sts_dev_pearson_dot": 0.6595252251920851,
"eval_sts_dev_pearson_euclidean": 0.6961931337237875,
"eval_sts_dev_pearson_manhattan": 0.6984244275683631,
"eval_sts_dev_pearson_max": 0.7512943163494744,
"eval_sts_dev_spearman_cosine": 0.7494136836534844,
"eval_sts_dev_spearman_dot": 0.6507477353375185,
"eval_sts_dev_spearman_euclidean": 0.6869101418254764,
"eval_sts_dev_spearman_manhattan": 0.6894841894318411,
"eval_sts_dev_spearman_max": 0.7494136836534844,
"step": 40
},
{
"epoch": 1.0222841225626742,
"eval_loss": 0.03723177686333656,
"eval_runtime": 6.807,
"eval_samples_per_second": 220.362,
"eval_steps_per_second": 27.619,
"eval_sts_dev_pearson_cosine": 0.7518064092372247,
"eval_sts_dev_pearson_dot": 0.659916579804052,
"eval_sts_dev_pearson_euclidean": 0.6966963956623822,
"eval_sts_dev_pearson_manhattan": 0.6989173818306955,
"eval_sts_dev_pearson_max": 0.7518064092372247,
"eval_sts_dev_spearman_cosine": 0.7499673977394428,
"eval_sts_dev_spearman_dot": 0.6509971484372724,
"eval_sts_dev_spearman_euclidean": 0.6874727739859278,
"eval_sts_dev_spearman_manhattan": 0.6900562750157024,
"eval_sts_dev_spearman_max": 0.7499673977394428,
"step": 45
},
{
"epoch": 1.1337047353760445,
"grad_norm": 5.788002014160156,
"learning_rate": 3.4788658897199517e-07,
"loss": 0.6231,
"step": 50
},
{
"epoch": 1.1337047353760445,
"eval_loss": 0.03712593764066696,
"eval_runtime": 6.8692,
"eval_samples_per_second": 218.366,
"eval_steps_per_second": 27.369,
"eval_sts_dev_pearson_cosine": 0.7523963675159875,
"eval_sts_dev_pearson_dot": 0.6603951554863274,
"eval_sts_dev_pearson_euclidean": 0.6972537699536556,
"eval_sts_dev_pearson_manhattan": 0.6994643420859175,
"eval_sts_dev_pearson_max": 0.7523963675159875,
"eval_sts_dev_spearman_cosine": 0.750612531997651,
"eval_sts_dev_spearman_dot": 0.6513909659491809,
"eval_sts_dev_spearman_euclidean": 0.6881037653193015,
"eval_sts_dev_spearman_manhattan": 0.690698809264565,
"eval_sts_dev_spearman_max": 0.750612531997651,
"step": 50
},
{
"epoch": 1.2451253481894151,
"eval_loss": 0.03701437637209892,
"eval_runtime": 6.9079,
"eval_samples_per_second": 217.142,
"eval_steps_per_second": 27.215,
"eval_sts_dev_pearson_cosine": 0.7530298660085821,
"eval_sts_dev_pearson_dot": 0.6608066497022934,
"eval_sts_dev_pearson_euclidean": 0.6979265726405308,
"eval_sts_dev_pearson_manhattan": 0.7001220235641434,
"eval_sts_dev_pearson_max": 0.7530298660085821,
"eval_sts_dev_spearman_cosine": 0.7512373553393066,
"eval_sts_dev_spearman_dot": 0.6517398579494034,
"eval_sts_dev_spearman_euclidean": 0.6888011095183327,
"eval_sts_dev_spearman_manhattan": 0.691414492932023,
"eval_sts_dev_spearman_max": 0.7512373553393066,
"step": 55
},
{
"epoch": 1.3565459610027855,
"grad_norm": 6.15402889251709,
"learning_rate": 4.1746390676639416e-07,
"loss": 0.6562,
"step": 60
},
{
"epoch": 1.3565459610027855,
"eval_loss": 0.03689862787723541,
"eval_runtime": 7.0618,
"eval_samples_per_second": 212.409,
"eval_steps_per_second": 26.622,
"eval_sts_dev_pearson_cosine": 0.7536755590598476,
"eval_sts_dev_pearson_dot": 0.6612397236569308,
"eval_sts_dev_pearson_euclidean": 0.6986261571902858,
"eval_sts_dev_pearson_manhattan": 0.7008037618197723,
"eval_sts_dev_pearson_max": 0.7536755590598476,
"eval_sts_dev_spearman_cosine": 0.7518746736763288,
"eval_sts_dev_spearman_dot": 0.6520463167363649,
"eval_sts_dev_spearman_euclidean": 0.6896250409475332,
"eval_sts_dev_spearman_manhattan": 0.6921595229559657,
"eval_sts_dev_spearman_max": 0.7518746736763288,
"step": 60
},
{
"epoch": 1.467966573816156,
"eval_loss": 0.03677487000823021,
"eval_runtime": 7.0563,
"eval_samples_per_second": 212.575,
"eval_steps_per_second": 26.643,
"eval_sts_dev_pearson_cosine": 0.7543139332813571,
"eval_sts_dev_pearson_dot": 0.6616585186532069,
"eval_sts_dev_pearson_euclidean": 0.6993248351610868,
"eval_sts_dev_pearson_manhattan": 0.701480628825091,
"eval_sts_dev_pearson_max": 0.7543139332813571,
"eval_sts_dev_spearman_cosine": 0.7525649831393398,
"eval_sts_dev_spearman_dot": 0.6522844686788962,
"eval_sts_dev_spearman_euclidean": 0.6904248656764869,
"eval_sts_dev_spearman_manhattan": 0.6929891697203803,
"eval_sts_dev_spearman_max": 0.7525649831393398,
"step": 65
},
{
"epoch": 1.5793871866295266,
"grad_norm": 7.177963733673096,
"learning_rate": 4.870412245607932e-07,
"loss": 0.6578,
"step": 70
},
{
"epoch": 1.5793871866295266,
"eval_loss": 0.036648884415626526,
"eval_runtime": 6.959,
"eval_samples_per_second": 215.549,
"eval_steps_per_second": 27.015,
"eval_sts_dev_pearson_cosine": 0.7550016826683398,
"eval_sts_dev_pearson_dot": 0.6621754750211006,
"eval_sts_dev_pearson_euclidean": 0.7000072525876023,
"eval_sts_dev_pearson_manhattan": 0.7021439545430929,
"eval_sts_dev_pearson_max": 0.7550016826683398,
"eval_sts_dev_spearman_cosine": 0.7533627904462605,
"eval_sts_dev_spearman_dot": 0.6528254486243784,
"eval_sts_dev_spearman_euclidean": 0.6912255844955055,
"eval_sts_dev_spearman_manhattan": 0.693756112728956,
"eval_sts_dev_spearman_max": 0.7533627904462605,
"step": 70
},
{
"epoch": 1.690807799442897,
"eval_loss": 0.03651271015405655,
"eval_runtime": 7.007,
"eval_samples_per_second": 214.073,
"eval_steps_per_second": 26.83,
"eval_sts_dev_pearson_cosine": 0.7558035417500417,
"eval_sts_dev_pearson_dot": 0.6628193044191966,
"eval_sts_dev_pearson_euclidean": 0.7007736916543533,
"eval_sts_dev_pearson_manhattan": 0.7028932537624881,
"eval_sts_dev_pearson_max": 0.7558035417500417,
"eval_sts_dev_spearman_cosine": 0.7541058046949494,
"eval_sts_dev_spearman_dot": 0.6534460767465545,
"eval_sts_dev_spearman_euclidean": 0.6920637091980477,
"eval_sts_dev_spearman_manhattan": 0.6945583716986528,
"eval_sts_dev_spearman_max": 0.7541058046949494,
"step": 75
},
{
"epoch": 1.8022284122562673,
"grad_norm": 5.0526251792907715,
"learning_rate": 5.566185423551923e-07,
"loss": 0.6669,
"step": 80
},
{
"epoch": 1.8022284122562673,
"eval_loss": 0.03638559579849243,
"eval_runtime": 6.7319,
"eval_samples_per_second": 222.821,
"eval_steps_per_second": 27.927,
"eval_sts_dev_pearson_cosine": 0.7566016073951273,
"eval_sts_dev_pearson_dot": 0.6635483139977033,
"eval_sts_dev_pearson_euclidean": 0.7014893647689773,
"eval_sts_dev_pearson_manhattan": 0.703594324322853,
"eval_sts_dev_pearson_max": 0.7566016073951273,
"eval_sts_dev_spearman_cosine": 0.7549242270132541,
"eval_sts_dev_spearman_dot": 0.6540274190985176,
"eval_sts_dev_spearman_euclidean": 0.6927527403904686,
"eval_sts_dev_spearman_manhattan": 0.695333682691011,
"eval_sts_dev_spearman_max": 0.7549242270132541,
"step": 80
},
{
"epoch": 1.9136490250696379,
"eval_loss": 0.036259058862924576,
"eval_runtime": 7.0619,
"eval_samples_per_second": 212.407,
"eval_steps_per_second": 26.622,
"eval_sts_dev_pearson_cosine": 0.7574029038481553,
"eval_sts_dev_pearson_dot": 0.6643528168117957,
"eval_sts_dev_pearson_euclidean": 0.7021549030740968,
"eval_sts_dev_pearson_manhattan": 0.7042456310839478,
"eval_sts_dev_pearson_max": 0.7574029038481553,
"eval_sts_dev_spearman_cosine": 0.7559011874733633,
"eval_sts_dev_spearman_dot": 0.654608486564625,
"eval_sts_dev_spearman_euclidean": 0.6934728497203049,
"eval_sts_dev_spearman_manhattan": 0.696009977505159,
"eval_sts_dev_spearman_max": 0.7559011874733633,
"step": 85
},
{
"epoch": 2.0445682451253484,
"grad_norm": 7.539985656738281,
"learning_rate": 6.261958601495913e-07,
"loss": 0.6428,
"step": 90
},
{
"epoch": 2.0445682451253484,
"eval_loss": 0.036127302795648575,
"eval_runtime": 7.0107,
"eval_samples_per_second": 213.958,
"eval_steps_per_second": 26.816,
"eval_sts_dev_pearson_cosine": 0.7582160683192293,
"eval_sts_dev_pearson_dot": 0.6651865048982631,
"eval_sts_dev_pearson_euclidean": 0.7028452108161203,
"eval_sts_dev_pearson_manhattan": 0.7049193272018017,
"eval_sts_dev_pearson_max": 0.7582160683192293,
"eval_sts_dev_spearman_cosine": 0.756839547083474,
"eval_sts_dev_spearman_dot": 0.6554766310741506,
"eval_sts_dev_spearman_euclidean": 0.6942368514501571,
"eval_sts_dev_spearman_manhattan": 0.6967968978437559,
"eval_sts_dev_spearman_max": 0.756839547083474,
"step": 90
},
{
"epoch": 2.1559888579387185,
"eval_loss": 0.03598429635167122,
"eval_runtime": 6.8629,
"eval_samples_per_second": 218.568,
"eval_steps_per_second": 27.394,
"eval_sts_dev_pearson_cosine": 0.7590157349155543,
"eval_sts_dev_pearson_dot": 0.6664047922354215,
"eval_sts_dev_pearson_euclidean": 0.7032597502450331,
"eval_sts_dev_pearson_manhattan": 0.7053311453976816,
"eval_sts_dev_pearson_max": 0.7590157349155543,
"eval_sts_dev_spearman_cosine": 0.7577360756559688,
"eval_sts_dev_spearman_dot": 0.6567130424552957,
"eval_sts_dev_spearman_euclidean": 0.694683268380771,
"eval_sts_dev_spearman_manhattan": 0.6973000099834088,
"eval_sts_dev_spearman_max": 0.7577360756559688,
"step": 95
},
{
"epoch": 2.267409470752089,
"grad_norm": 5.178345680236816,
"learning_rate": 6.957731779439903e-07,
"loss": 0.5854,
"step": 100
},
{
"epoch": 2.267409470752089,
"eval_loss": 0.03583008423447609,
"eval_runtime": 7.173,
"eval_samples_per_second": 209.119,
"eval_steps_per_second": 26.21,
"eval_sts_dev_pearson_cosine": 0.7597921519073876,
"eval_sts_dev_pearson_dot": 0.6678122125215467,
"eval_sts_dev_pearson_euclidean": 0.7035339087302831,
"eval_sts_dev_pearson_manhattan": 0.7056098859433702,
"eval_sts_dev_pearson_max": 0.7597921519073876,
"eval_sts_dev_spearman_cosine": 0.758602852970159,
"eval_sts_dev_spearman_dot": 0.6582467955758544,
"eval_sts_dev_spearman_euclidean": 0.6948605697617651,
"eval_sts_dev_spearman_manhattan": 0.6975703877172783,
"eval_sts_dev_spearman_max": 0.758602852970159,
"step": 100
},
{
"epoch": 2.3788300835654597,
"eval_loss": 0.035686325281858444,
"eval_runtime": 7.0872,
"eval_samples_per_second": 211.649,
"eval_steps_per_second": 26.527,
"eval_sts_dev_pearson_cosine": 0.7606568693270315,
"eval_sts_dev_pearson_dot": 0.6687514434055418,
"eval_sts_dev_pearson_euclidean": 0.704286798579542,
"eval_sts_dev_pearson_manhattan": 0.7063472791256069,
"eval_sts_dev_pearson_max": 0.7606568693270315,
"eval_sts_dev_spearman_cosine": 0.7597087921768803,
"eval_sts_dev_spearman_dot": 0.658946428183679,
"eval_sts_dev_spearman_euclidean": 0.695592274693547,
"eval_sts_dev_spearman_manhattan": 0.6983308228030709,
"eval_sts_dev_spearman_max": 0.7597087921768803,
"step": 105
},
{
"epoch": 2.4902506963788302,
"grad_norm": 5.807418346405029,
"learning_rate": 7.653504957383893e-07,
"loss": 0.6027,
"step": 110
},
{
"epoch": 2.4902506963788302,
"eval_loss": 0.035556692630052567,
"eval_runtime": 7.0493,
"eval_samples_per_second": 212.788,
"eval_steps_per_second": 26.669,
"eval_sts_dev_pearson_cosine": 0.761527279815679,
"eval_sts_dev_pearson_dot": 0.6695555734987789,
"eval_sts_dev_pearson_euclidean": 0.705168673869323,
"eval_sts_dev_pearson_manhattan": 0.7072038979059934,
"eval_sts_dev_pearson_max": 0.761527279815679,
"eval_sts_dev_spearman_cosine": 0.760744250643423,
"eval_sts_dev_spearman_dot": 0.6597526569449198,
"eval_sts_dev_spearman_euclidean": 0.6967183194293859,
"eval_sts_dev_spearman_manhattan": 0.6992467241695522,
"eval_sts_dev_spearman_max": 0.760744250643423,
"step": 110
},
{
"epoch": 2.6016713091922004,
"eval_loss": 0.03542407229542732,
"eval_runtime": 6.8156,
"eval_samples_per_second": 220.083,
"eval_steps_per_second": 27.584,
"eval_sts_dev_pearson_cosine": 0.7623648733142145,
"eval_sts_dev_pearson_dot": 0.6704772598451654,
"eval_sts_dev_pearson_euclidean": 0.7059197567148983,
"eval_sts_dev_pearson_manhattan": 0.7079355090955533,
"eval_sts_dev_pearson_max": 0.7623648733142145,
"eval_sts_dev_spearman_cosine": 0.7618131283610858,
"eval_sts_dev_spearman_dot": 0.6605908503497494,
"eval_sts_dev_spearman_euclidean": 0.6976245585578177,
"eval_sts_dev_spearman_manhattan": 0.7002055764519721,
"eval_sts_dev_spearman_max": 0.7618131283610858,
"step": 115
},
{
"epoch": 2.713091922005571,
"grad_norm": 4.760545253753662,
"learning_rate": 8.349278135327883e-07,
"loss": 0.6375,
"step": 120
},
{
"epoch": 2.713091922005571,
"eval_loss": 0.03528669476509094,
"eval_runtime": 6.9936,
"eval_samples_per_second": 214.481,
"eval_steps_per_second": 26.882,
"eval_sts_dev_pearson_cosine": 0.7631052098822656,
"eval_sts_dev_pearson_dot": 0.6714460378701741,
"eval_sts_dev_pearson_euclidean": 0.7064722681555804,
"eval_sts_dev_pearson_manhattan": 0.7084736073971417,
"eval_sts_dev_pearson_max": 0.7631052098822656,
"eval_sts_dev_spearman_cosine": 0.7627318359213398,
"eval_sts_dev_spearman_dot": 0.6614807337490313,
"eval_sts_dev_spearman_euclidean": 0.6982972981814837,
"eval_sts_dev_spearman_manhattan": 0.7008247751818659,
"eval_sts_dev_spearman_max": 0.7627318359213398,
"step": 120
},
{
"epoch": 2.8245125348189415,
"eval_loss": 0.03514046594500542,
"eval_runtime": 7.0888,
"eval_samples_per_second": 211.601,
"eval_steps_per_second": 26.521,
"eval_sts_dev_pearson_cosine": 0.7638287349941795,
"eval_sts_dev_pearson_dot": 0.6724854308235324,
"eval_sts_dev_pearson_euclidean": 0.7068315364008582,
"eval_sts_dev_pearson_manhattan": 0.7088273928548983,
"eval_sts_dev_pearson_max": 0.7638287349941795,
"eval_sts_dev_spearman_cosine": 0.7635020295116245,
"eval_sts_dev_spearman_dot": 0.6624349213377722,
"eval_sts_dev_spearman_euclidean": 0.6987404256446157,
"eval_sts_dev_spearman_manhattan": 0.7011827796563965,
"eval_sts_dev_spearman_max": 0.7635020295116245,
"step": 125
},
{
"epoch": 2.935933147632312,
"grad_norm": 4.589956760406494,
"learning_rate": 9.045051313271874e-07,
"loss": 0.6204,
"step": 130
},
{
"epoch": 2.935933147632312,
"eval_loss": 0.03499244153499603,
"eval_runtime": 7.03,
"eval_samples_per_second": 213.37,
"eval_steps_per_second": 26.742,
"eval_sts_dev_pearson_cosine": 0.7646248483324349,
"eval_sts_dev_pearson_dot": 0.6736502017218999,
"eval_sts_dev_pearson_euclidean": 0.707216661995043,
"eval_sts_dev_pearson_manhattan": 0.709212008478957,
"eval_sts_dev_pearson_max": 0.7646248483324349,
"eval_sts_dev_spearman_cosine": 0.7643307027826172,
"eval_sts_dev_spearman_dot": 0.6636213615361183,
"eval_sts_dev_spearman_euclidean": 0.6991046333767655,
"eval_sts_dev_spearman_manhattan": 0.7016301334896569,
"eval_sts_dev_spearman_max": 0.7643307027826172,
"step": 130
},
{
"epoch": 3.066852367688022,
"eval_loss": 0.0348396897315979,
"eval_runtime": 6.8191,
"eval_samples_per_second": 219.972,
"eval_steps_per_second": 27.57,
"eval_sts_dev_pearson_cosine": 0.7654784319525549,
"eval_sts_dev_pearson_dot": 0.6748874130308962,
"eval_sts_dev_pearson_euclidean": 0.707641103763532,
"eval_sts_dev_pearson_manhattan": 0.7096402166194079,
"eval_sts_dev_pearson_max": 0.7654784319525549,
"eval_sts_dev_spearman_cosine": 0.7653040232955037,
"eval_sts_dev_spearman_dot": 0.6648308844991435,
"eval_sts_dev_spearman_euclidean": 0.6995286562724882,
"eval_sts_dev_spearman_manhattan": 0.7020556361876031,
"eval_sts_dev_spearman_max": 0.7653040232955037,
"step": 135
},
{
"epoch": 3.1782729805013927,
"grad_norm": 5.2525177001953125,
"learning_rate": 9.740824491215864e-07,
"loss": 0.6077,
"step": 140
},
{
"epoch": 3.1782729805013927,
"eval_loss": 0.034706421196460724,
"eval_runtime": 6.9212,
"eval_samples_per_second": 216.726,
"eval_steps_per_second": 27.163,
"eval_sts_dev_pearson_cosine": 0.766263287304504,
"eval_sts_dev_pearson_dot": 0.675948414551205,
"eval_sts_dev_pearson_euclidean": 0.7081178470450136,
"eval_sts_dev_pearson_manhattan": 0.7101145234880011,
"eval_sts_dev_pearson_max": 0.766263287304504,
"eval_sts_dev_spearman_cosine": 0.7662688094783671,
"eval_sts_dev_spearman_dot": 0.666010980931314,
"eval_sts_dev_spearman_euclidean": 0.7000434625148538,
"eval_sts_dev_spearman_manhattan": 0.7026795019088747,
"eval_sts_dev_spearman_max": 0.7662688094783671,
"step": 140
},
{
"epoch": 3.2896935933147633,
"eval_loss": 0.03455406054854393,
"eval_runtime": 7.043,
"eval_samples_per_second": 212.977,
"eval_steps_per_second": 26.693,
"eval_sts_dev_pearson_cosine": 0.7669976344527232,
"eval_sts_dev_pearson_dot": 0.6772130506339817,
"eval_sts_dev_pearson_euclidean": 0.708340308738127,
"eval_sts_dev_pearson_manhattan": 0.7103439855119656,
"eval_sts_dev_pearson_max": 0.7669976344527232,
"eval_sts_dev_spearman_cosine": 0.7671727295922609,
"eval_sts_dev_spearman_dot": 0.6673064034335351,
"eval_sts_dev_spearman_euclidean": 0.7002877031846776,
"eval_sts_dev_spearman_manhattan": 0.702790337375281,
"eval_sts_dev_spearman_max": 0.7671727295922609,
"step": 145
},
{
"epoch": 3.401114206128134,
"grad_norm": 4.290932655334473,
"learning_rate": 1.0436597669159855e-06,
"loss": 0.5772,
"step": 150
},
{
"epoch": 3.401114206128134,
"eval_loss": 0.03440996631979942,
"eval_runtime": 6.9991,
"eval_samples_per_second": 214.314,
"eval_steps_per_second": 26.861,
"eval_sts_dev_pearson_cosine": 0.7678013334085165,
"eval_sts_dev_pearson_dot": 0.6782264568419771,
"eval_sts_dev_pearson_euclidean": 0.7088142668828982,
"eval_sts_dev_pearson_manhattan": 0.7108177316956372,
"eval_sts_dev_pearson_max": 0.7678013334085165,
"eval_sts_dev_spearman_cosine": 0.7681151208619762,
"eval_sts_dev_spearman_dot": 0.6684064347787971,
"eval_sts_dev_spearman_euclidean": 0.7007326265687747,
"eval_sts_dev_spearman_manhattan": 0.7034130912956567,
"eval_sts_dev_spearman_max": 0.7681151208619762,
"step": 150
},
{
"epoch": 3.5125348189415044,
"eval_loss": 0.03426254168152809,
"eval_runtime": 6.9148,
"eval_samples_per_second": 216.927,
"eval_steps_per_second": 27.188,
"eval_sts_dev_pearson_cosine": 0.7686124311956235,
"eval_sts_dev_pearson_dot": 0.6794161416279998,
"eval_sts_dev_pearson_euclidean": 0.709205498775362,
"eval_sts_dev_pearson_manhattan": 0.7112097548871816,
"eval_sts_dev_pearson_max": 0.7686124311956235,
"eval_sts_dev_spearman_cosine": 0.7690103394236019,
"eval_sts_dev_spearman_dot": 0.6696872196092013,
"eval_sts_dev_spearman_euclidean": 0.7011801292985436,
"eval_sts_dev_spearman_manhattan": 0.7038459216523878,
"eval_sts_dev_spearman_max": 0.7690103394236019,
"step": 155
},
{
"epoch": 3.6239554317548746,
"grad_norm": 4.819970607757568,
"learning_rate": 1.1132370847103846e-06,
"loss": 0.5793,
"step": 160
},
{
"epoch": 3.6239554317548746,
"eval_loss": 0.034103069454431534,
"eval_runtime": 6.9114,
"eval_samples_per_second": 217.034,
"eval_steps_per_second": 27.202,
"eval_sts_dev_pearson_cosine": 0.7693080146114311,
"eval_sts_dev_pearson_dot": 0.6808270827318879,
"eval_sts_dev_pearson_euclidean": 0.7092760536788121,
"eval_sts_dev_pearson_manhattan": 0.7112999822871733,
"eval_sts_dev_pearson_max": 0.7693080146114311,
"eval_sts_dev_spearman_cosine": 0.7697572707961111,
"eval_sts_dev_spearman_dot": 0.6712211078819724,
"eval_sts_dev_spearman_euclidean": 0.7012923239631217,
"eval_sts_dev_spearman_manhattan": 0.7039518721666204,
"eval_sts_dev_spearman_max": 0.7697572707961111,
"step": 160
},
{
"epoch": 3.735376044568245,
"eval_loss": 0.0339648611843586,
"eval_runtime": 7.0549,
"eval_samples_per_second": 212.619,
"eval_steps_per_second": 26.648,
"eval_sts_dev_pearson_cosine": 0.7699086602747196,
"eval_sts_dev_pearson_dot": 0.6821532615290817,
"eval_sts_dev_pearson_euclidean": 0.7093076470422841,
"eval_sts_dev_pearson_manhattan": 0.7113457981900373,
"eval_sts_dev_pearson_max": 0.7699086602747196,
"eval_sts_dev_spearman_cosine": 0.7705074022984313,
"eval_sts_dev_spearman_dot": 0.672703308451007,
"eval_sts_dev_spearman_euclidean": 0.7012209819190688,
"eval_sts_dev_spearman_manhattan": 0.7040382275601695,
"eval_sts_dev_spearman_max": 0.7705074022984313,
"step": 165
},
{
"epoch": 3.8467966573816157,
"grad_norm": 4.6819539070129395,
"learning_rate": 1.1828144025047836e-06,
"loss": 0.5807,
"step": 170
},
{
"epoch": 3.8467966573816157,
"eval_loss": 0.033846523612737656,
"eval_runtime": 7.007,
"eval_samples_per_second": 214.071,
"eval_steps_per_second": 26.83,
"eval_sts_dev_pearson_cosine": 0.7705963307037736,
"eval_sts_dev_pearson_dot": 0.6833451512490409,
"eval_sts_dev_pearson_euclidean": 0.7096001250212141,
"eval_sts_dev_pearson_manhattan": 0.711632091113557,
"eval_sts_dev_pearson_max": 0.7705963307037736,
"eval_sts_dev_spearman_cosine": 0.7712362988663569,
"eval_sts_dev_spearman_dot": 0.6739107651886832,
"eval_sts_dev_spearman_euclidean": 0.7015732848026783,
"eval_sts_dev_spearman_manhattan": 0.7044555432408592,
"eval_sts_dev_spearman_max": 0.7712362988663569,
"step": 170
},
{
"epoch": 3.958217270194986,
"eval_loss": 0.03373364359140396,
"eval_runtime": 7.0142,
"eval_samples_per_second": 213.851,
"eval_steps_per_second": 26.803,
"eval_sts_dev_pearson_cosine": 0.7712791156976446,
"eval_sts_dev_pearson_dot": 0.684640185733316,
"eval_sts_dev_pearson_euclidean": 0.7099359677821528,
"eval_sts_dev_pearson_manhattan": 0.7119586396389017,
"eval_sts_dev_pearson_max": 0.7712791156976446,
"eval_sts_dev_spearman_cosine": 0.7720608877912845,
"eval_sts_dev_spearman_dot": 0.6752035487866894,
"eval_sts_dev_spearman_euclidean": 0.702006901214985,
"eval_sts_dev_spearman_manhattan": 0.7046928888569776,
"eval_sts_dev_spearman_max": 0.7720608877912845,
"step": 175
},
{
"epoch": 4.089136490250697,
"grad_norm": 5.301053524017334,
"learning_rate": 1.2523917202991825e-06,
"loss": 0.5576,
"step": 180
},
{
"epoch": 4.089136490250697,
"eval_loss": 0.03359239175915718,
"eval_runtime": 6.9089,
"eval_samples_per_second": 217.11,
"eval_steps_per_second": 27.211,
"eval_sts_dev_pearson_cosine": 0.7720094084547121,
"eval_sts_dev_pearson_dot": 0.6861711141593462,
"eval_sts_dev_pearson_euclidean": 0.7101810374908146,
"eval_sts_dev_pearson_manhattan": 0.7122008904979185,
"eval_sts_dev_pearson_max": 0.7720094084547121,
"eval_sts_dev_spearman_cosine": 0.7729262262575222,
"eval_sts_dev_spearman_dot": 0.6768691715243821,
"eval_sts_dev_spearman_euclidean": 0.7021927194762703,
"eval_sts_dev_spearman_manhattan": 0.7048656110976538,
"eval_sts_dev_spearman_max": 0.7729262262575222,
"step": 180
},
{
"epoch": 4.2005571030640665,
"eval_loss": 0.03344343975186348,
"eval_runtime": 6.9828,
"eval_samples_per_second": 214.815,
"eval_steps_per_second": 26.923,
"eval_sts_dev_pearson_cosine": 0.7726659560898915,
"eval_sts_dev_pearson_dot": 0.687537592193805,
"eval_sts_dev_pearson_euclidean": 0.7102002040734198,
"eval_sts_dev_pearson_manhattan": 0.7122330129546837,
"eval_sts_dev_pearson_max": 0.7726659560898915,
"eval_sts_dev_spearman_cosine": 0.773425201401485,
"eval_sts_dev_spearman_dot": 0.6783853594387605,
"eval_sts_dev_spearman_euclidean": 0.7021304440842328,
"eval_sts_dev_spearman_manhattan": 0.7048113806985111,
"eval_sts_dev_spearman_max": 0.773425201401485,
"step": 185
},
{
"epoch": 4.311977715877437,
"grad_norm": 6.004096984863281,
"learning_rate": 1.3219690380935816e-06,
"loss": 0.5244,
"step": 190
},
{
"epoch": 4.311977715877437,
"eval_loss": 0.033325061202049255,
"eval_runtime": 7.0443,
"eval_samples_per_second": 212.938,
"eval_steps_per_second": 26.688,
"eval_sts_dev_pearson_cosine": 0.7733338124006317,
"eval_sts_dev_pearson_dot": 0.6884427467691409,
"eval_sts_dev_pearson_euclidean": 0.7104572948924557,
"eval_sts_dev_pearson_manhattan": 0.7124982272648852,
"eval_sts_dev_pearson_max": 0.7733338124006317,
"eval_sts_dev_spearman_cosine": 0.7740160113372371,
"eval_sts_dev_spearman_dot": 0.679314640853313,
"eval_sts_dev_spearman_euclidean": 0.7022600917103325,
"eval_sts_dev_spearman_manhattan": 0.7049775652371025,
"eval_sts_dev_spearman_max": 0.7740160113372371,
"step": 190
},
{
"epoch": 4.423398328690808,
"eval_loss": 0.03322310745716095,
"eval_runtime": 7.0234,
"eval_samples_per_second": 213.573,
"eval_steps_per_second": 26.768,
"eval_sts_dev_pearson_cosine": 0.7739695721631923,
"eval_sts_dev_pearson_dot": 0.6893517389464994,
"eval_sts_dev_pearson_euclidean": 0.7107902241882483,
"eval_sts_dev_pearson_manhattan": 0.7128377305936389,
"eval_sts_dev_pearson_max": 0.7739695721631923,
"eval_sts_dev_spearman_cosine": 0.7747804094168401,
"eval_sts_dev_spearman_dot": 0.68031602482782,
"eval_sts_dev_spearman_euclidean": 0.702670109171386,
"eval_sts_dev_spearman_manhattan": 0.7052134150159887,
"eval_sts_dev_spearman_max": 0.7747804094168401,
"step": 195
},
{
"epoch": 4.534818941504178,
"grad_norm": 4.593257427215576,
"learning_rate": 1.3915463558879807e-06,
"loss": 0.539,
"step": 200
},
{
"epoch": 4.534818941504178,
"eval_loss": 0.033111851662397385,
"eval_runtime": 6.8979,
"eval_samples_per_second": 217.459,
"eval_steps_per_second": 27.255,
"eval_sts_dev_pearson_cosine": 0.7745730498768191,
"eval_sts_dev_pearson_dot": 0.6906339428851104,
"eval_sts_dev_pearson_euclidean": 0.7109426833835167,
"eval_sts_dev_pearson_manhattan": 0.7130040632760261,
"eval_sts_dev_pearson_max": 0.7745730498768191,
"eval_sts_dev_spearman_cosine": 0.775379074216674,
"eval_sts_dev_spearman_dot": 0.6818359626434134,
"eval_sts_dev_spearman_euclidean": 0.7027815708069985,
"eval_sts_dev_spearman_manhattan": 0.7054772962806527,
"eval_sts_dev_spearman_max": 0.775379074216674,
"step": 200
},
{
"epoch": 4.646239554317549,
"eval_loss": 0.03302275016903877,
"eval_runtime": 6.926,
"eval_samples_per_second": 216.576,
"eval_steps_per_second": 27.144,
"eval_sts_dev_pearson_cosine": 0.7750720264452357,
"eval_sts_dev_pearson_dot": 0.6916453477028499,
"eval_sts_dev_pearson_euclidean": 0.7112190933233568,
"eval_sts_dev_pearson_manhattan": 0.7132769890476369,
"eval_sts_dev_pearson_max": 0.7750720264452357,
"eval_sts_dev_spearman_cosine": 0.7760014955136583,
"eval_sts_dev_spearman_dot": 0.6829098752509514,
"eval_sts_dev_spearman_euclidean": 0.7030344024642863,
"eval_sts_dev_spearman_manhattan": 0.7059066267642276,
"eval_sts_dev_spearman_max": 0.7760014955136583,
"step": 205
},
{
"epoch": 4.757660167130919,
"grad_norm": 4.035131931304932,
"learning_rate": 1.4611236736823798e-06,
"loss": 0.5517,
"step": 210
},
{
"epoch": 4.757660167130919,
"eval_loss": 0.032943133264780045,
"eval_runtime": 7.07,
"eval_samples_per_second": 212.164,
"eval_steps_per_second": 26.591,
"eval_sts_dev_pearson_cosine": 0.775436879881936,
"eval_sts_dev_pearson_dot": 0.6925918600460864,
"eval_sts_dev_pearson_euclidean": 0.7113376814593186,
"eval_sts_dev_pearson_manhattan": 0.7133931376814393,
"eval_sts_dev_pearson_max": 0.775436879881936,
"eval_sts_dev_spearman_cosine": 0.7764792385291549,
"eval_sts_dev_spearman_dot": 0.6839147456943953,
"eval_sts_dev_spearman_euclidean": 0.703141329969615,
"eval_sts_dev_spearman_manhattan": 0.7059362250994191,
"eval_sts_dev_spearman_max": 0.7764792385291549,
"step": 210
},
{
"epoch": 4.86908077994429,
"eval_loss": 0.03284618631005287,
"eval_runtime": 6.8565,
"eval_samples_per_second": 218.77,
"eval_steps_per_second": 27.419,
"eval_sts_dev_pearson_cosine": 0.775910266044483,
"eval_sts_dev_pearson_dot": 0.6934614331040406,
"eval_sts_dev_pearson_euclidean": 0.711548659602332,
"eval_sts_dev_pearson_manhattan": 0.7136048948232416,
"eval_sts_dev_pearson_max": 0.775910266044483,
"eval_sts_dev_spearman_cosine": 0.7769479258303382,
"eval_sts_dev_spearman_dot": 0.6848713805423069,
"eval_sts_dev_spearman_euclidean": 0.7033494094042918,
"eval_sts_dev_spearman_manhattan": 0.7060255698767176,
"eval_sts_dev_spearman_max": 0.7769479258303382,
"step": 215
},
{
"epoch": 4.9805013927576605,
"grad_norm": 4.164207458496094,
"learning_rate": 1.5307009914767787e-06,
"loss": 0.5265,
"step": 220
},
{
"epoch": 4.9805013927576605,
"eval_loss": 0.0327322892844677,
"eval_runtime": 6.9904,
"eval_samples_per_second": 214.58,
"eval_steps_per_second": 26.894,
"eval_sts_dev_pearson_cosine": 0.7764785690089298,
"eval_sts_dev_pearson_dot": 0.6942342520710683,
"eval_sts_dev_pearson_euclidean": 0.7119234281148877,
"eval_sts_dev_pearson_manhattan": 0.7139725405773478,
"eval_sts_dev_pearson_max": 0.7764785690089298,
"eval_sts_dev_spearman_cosine": 0.7776377175908147,
"eval_sts_dev_spearman_dot": 0.6856461394544989,
"eval_sts_dev_spearman_euclidean": 0.7037087745638393,
"eval_sts_dev_spearman_manhattan": 0.7064702298285305,
"eval_sts_dev_spearman_max": 0.7776377175908147,
"step": 220
},
{
"epoch": 5.111420612813371,
"eval_loss": 0.032635681331157684,
"eval_runtime": 6.8033,
"eval_samples_per_second": 220.482,
"eval_steps_per_second": 27.634,
"eval_sts_dev_pearson_cosine": 0.7768906949758223,
"eval_sts_dev_pearson_dot": 0.695219441450241,
"eval_sts_dev_pearson_euclidean": 0.7119427716298626,
"eval_sts_dev_pearson_manhattan": 0.7139906781614199,
"eval_sts_dev_pearson_max": 0.7768906949758223,
"eval_sts_dev_spearman_cosine": 0.7779652464100915,
"eval_sts_dev_spearman_dot": 0.6869571731826094,
"eval_sts_dev_spearman_euclidean": 0.7036077013230951,
"eval_sts_dev_spearman_manhattan": 0.7064509076431469,
"eval_sts_dev_spearman_max": 0.7779652464100915,
"step": 225
},
{
"epoch": 5.222841225626741,
"grad_norm": 4.008439064025879,
"learning_rate": 1.6002783092711777e-06,
"loss": 0.5285,
"step": 230
},
{
"epoch": 5.222841225626741,
"eval_loss": 0.03253428637981415,
"eval_runtime": 6.8983,
"eval_samples_per_second": 217.445,
"eval_steps_per_second": 27.253,
"eval_sts_dev_pearson_cosine": 0.7772382339972829,
"eval_sts_dev_pearson_dot": 0.6962971989781661,
"eval_sts_dev_pearson_euclidean": 0.7116605569376889,
"eval_sts_dev_pearson_manhattan": 0.7137176755568332,
"eval_sts_dev_pearson_max": 0.7772382339972829,
"eval_sts_dev_spearman_cosine": 0.7783426175116597,
"eval_sts_dev_spearman_dot": 0.6882750477744878,
"eval_sts_dev_spearman_euclidean": 0.7031754685029606,
"eval_sts_dev_spearman_manhattan": 0.7062052563630147,
"eval_sts_dev_spearman_max": 0.7783426175116597,
"step": 230
},
{
"epoch": 5.334261838440112,
"eval_loss": 0.032446879893541336,
"eval_runtime": 7.1024,
"eval_samples_per_second": 211.195,
"eval_steps_per_second": 26.47,
"eval_sts_dev_pearson_cosine": 0.7776669424440168,
"eval_sts_dev_pearson_dot": 0.6970405122472402,
"eval_sts_dev_pearson_euclidean": 0.7117722670287954,
"eval_sts_dev_pearson_manhattan": 0.7138312835497453,
"eval_sts_dev_pearson_max": 0.7776669424440168,
"eval_sts_dev_spearman_cosine": 0.7789160171177805,
"eval_sts_dev_spearman_dot": 0.6891076670812013,
"eval_sts_dev_spearman_euclidean": 0.7033258975002282,
"eval_sts_dev_spearman_manhattan": 0.7062752235073074,
"eval_sts_dev_spearman_max": 0.7789160171177805,
"step": 235
},
{
"epoch": 5.445682451253482,
"grad_norm": 3.6369762420654297,
"learning_rate": 1.6698556270655766e-06,
"loss": 0.4697,
"step": 240
},
{
"epoch": 5.445682451253482,
"eval_loss": 0.03234243392944336,
"eval_runtime": 6.782,
"eval_samples_per_second": 221.172,
"eval_steps_per_second": 27.72,
"eval_sts_dev_pearson_cosine": 0.7781440012528016,
"eval_sts_dev_pearson_dot": 0.6975764419235699,
"eval_sts_dev_pearson_euclidean": 0.712024820219635,
"eval_sts_dev_pearson_manhattan": 0.7140934326314853,
"eval_sts_dev_pearson_max": 0.7781440012528016,
"eval_sts_dev_spearman_cosine": 0.779282426254369,
"eval_sts_dev_spearman_dot": 0.6897740636983543,
"eval_sts_dev_spearman_euclidean": 0.7035466980830317,
"eval_sts_dev_spearman_manhattan": 0.706402706407244,
"eval_sts_dev_spearman_max": 0.779282426254369,
"step": 240
},
{
"epoch": 5.557103064066853,
"eval_loss": 0.0322665236890316,
"eval_runtime": 7.0287,
"eval_samples_per_second": 213.412,
"eval_steps_per_second": 26.748,
"eval_sts_dev_pearson_cosine": 0.7787273229273541,
"eval_sts_dev_pearson_dot": 0.6977971151317023,
"eval_sts_dev_pearson_euclidean": 0.7128704818639644,
"eval_sts_dev_pearson_manhattan": 0.7149352374625544,
"eval_sts_dev_pearson_max": 0.7787273229273541,
"eval_sts_dev_spearman_cosine": 0.77981903098488,
"eval_sts_dev_spearman_dot": 0.6899867909899472,
"eval_sts_dev_spearman_euclidean": 0.7044750738813548,
"eval_sts_dev_spearman_manhattan": 0.707203879577786,
"eval_sts_dev_spearman_max": 0.77981903098488,
"step": 245
},
{
"epoch": 5.6685236768802225,
"grad_norm": 3.939344882965088,
"learning_rate": 1.739432944859976e-06,
"loss": 0.4913,
"step": 250
},
{
"epoch": 5.6685236768802225,
"eval_loss": 0.03220539167523384,
"eval_runtime": 6.7653,
"eval_samples_per_second": 221.72,
"eval_steps_per_second": 27.789,
"eval_sts_dev_pearson_cosine": 0.7791771917276973,
"eval_sts_dev_pearson_dot": 0.6981160056071188,
"eval_sts_dev_pearson_euclidean": 0.713488315174772,
"eval_sts_dev_pearson_manhattan": 0.7155411689371374,
"eval_sts_dev_pearson_max": 0.7791771917276973,
"eval_sts_dev_spearman_cosine": 0.7803556746575578,
"eval_sts_dev_spearman_dot": 0.6902449156806119,
"eval_sts_dev_spearman_euclidean": 0.7052006351141208,
"eval_sts_dev_spearman_manhattan": 0.7079806405930662,
"eval_sts_dev_spearman_max": 0.7803556746575578,
"step": 250
},
{
"epoch": 5.779944289693593,
"eval_loss": 0.0321136973798275,
"eval_runtime": 6.8852,
"eval_samples_per_second": 217.857,
"eval_steps_per_second": 27.305,
"eval_sts_dev_pearson_cosine": 0.7795408783298017,
"eval_sts_dev_pearson_dot": 0.698621796206566,
"eval_sts_dev_pearson_euclidean": 0.713845705178594,
"eval_sts_dev_pearson_manhattan": 0.7158847989781144,
"eval_sts_dev_pearson_max": 0.7795408783298017,
"eval_sts_dev_spearman_cosine": 0.7808851254829866,
"eval_sts_dev_spearman_dot": 0.6910441279803855,
"eval_sts_dev_spearman_euclidean": 0.7057147472696849,
"eval_sts_dev_spearman_manhattan": 0.7084308417857139,
"eval_sts_dev_spearman_max": 0.7808851254829866,
"step": 255
},
{
"epoch": 5.891364902506964,
"grad_norm": 4.813522815704346,
"learning_rate": 1.8090102626543748e-06,
"loss": 0.5253,
"step": 260
},
{
"epoch": 5.891364902506964,
"eval_loss": 0.03203197568655014,
"eval_runtime": 7.0476,
"eval_samples_per_second": 212.839,
"eval_steps_per_second": 26.676,
"eval_sts_dev_pearson_cosine": 0.7799732728461426,
"eval_sts_dev_pearson_dot": 0.6992354089058229,
"eval_sts_dev_pearson_euclidean": 0.7142404896335972,
"eval_sts_dev_pearson_manhattan": 0.716270082443381,
"eval_sts_dev_pearson_max": 0.7799732728461426,
"eval_sts_dev_spearman_cosine": 0.7812777358255738,
"eval_sts_dev_spearman_dot": 0.6917093769490908,
"eval_sts_dev_spearman_euclidean": 0.7062223056881557,
"eval_sts_dev_spearman_manhattan": 0.7089598550457142,
"eval_sts_dev_spearman_max": 0.7812777358255738,
"step": 260
},
{
"epoch": 6.022284122562674,
"eval_loss": 0.03195018321275711,
"eval_runtime": 7.099,
"eval_samples_per_second": 211.299,
"eval_steps_per_second": 26.483,
"eval_sts_dev_pearson_cosine": 0.7803233438802165,
"eval_sts_dev_pearson_dot": 0.6999738035020234,
"eval_sts_dev_pearson_euclidean": 0.7143605362249807,
"eval_sts_dev_pearson_manhattan": 0.7163833317778756,
"eval_sts_dev_pearson_max": 0.7803233438802165,
"eval_sts_dev_spearman_cosine": 0.7817289518382318,
"eval_sts_dev_spearman_dot": 0.692658440982393,
"eval_sts_dev_spearman_euclidean": 0.7062913822145624,
"eval_sts_dev_spearman_manhattan": 0.7091007508962174,
"eval_sts_dev_spearman_max": 0.7817289518382318,
"step": 265
},
{
"epoch": 6.133704735376044,
"grad_norm": 3.873243570327759,
"learning_rate": 1.8785875804487739e-06,
"loss": 0.4924,
"step": 270
},
{
"epoch": 6.133704735376044,
"eval_loss": 0.031853143125772476,
"eval_runtime": 7.045,
"eval_samples_per_second": 212.918,
"eval_steps_per_second": 26.686,
"eval_sts_dev_pearson_cosine": 0.7805555688659683,
"eval_sts_dev_pearson_dot": 0.7005444051022546,
"eval_sts_dev_pearson_euclidean": 0.7142124903197049,
"eval_sts_dev_pearson_manhattan": 0.716248059084913,
"eval_sts_dev_pearson_max": 0.7805555688659683,
"eval_sts_dev_spearman_cosine": 0.7818561644430513,
"eval_sts_dev_spearman_dot": 0.6936098988133554,
"eval_sts_dev_spearman_euclidean": 0.7060309965817769,
"eval_sts_dev_spearman_manhattan": 0.7089509487437853,
"eval_sts_dev_spearman_max": 0.7818561644430513,
"step": 270
},
{
"epoch": 6.245125348189415,
"eval_loss": 0.031787075102329254,
"eval_runtime": 7.0799,
"eval_samples_per_second": 211.867,
"eval_steps_per_second": 26.554,
"eval_sts_dev_pearson_cosine": 0.7807075804252084,
"eval_sts_dev_pearson_dot": 0.7015197969666243,
"eval_sts_dev_pearson_euclidean": 0.713830705347577,
"eval_sts_dev_pearson_manhattan": 0.7158793994538133,
"eval_sts_dev_pearson_max": 0.7807075804252084,
"eval_sts_dev_spearman_cosine": 0.7819875621854264,
"eval_sts_dev_spearman_dot": 0.694826261852757,
"eval_sts_dev_spearman_euclidean": 0.7053731328646764,
"eval_sts_dev_spearman_manhattan": 0.7083527948173437,
"eval_sts_dev_spearman_max": 0.7819875621854264,
"step": 275
},
{
"epoch": 6.3565459610027855,
"grad_norm": 4.469658374786377,
"learning_rate": 1.9481648982431728e-06,
"loss": 0.4844,
"step": 280
},
{
"epoch": 6.3565459610027855,
"eval_loss": 0.031746331602334976,
"eval_runtime": 6.7748,
"eval_samples_per_second": 221.41,
"eval_steps_per_second": 27.75,
"eval_sts_dev_pearson_cosine": 0.7808289673024869,
"eval_sts_dev_pearson_dot": 0.702423126121021,
"eval_sts_dev_pearson_euclidean": 0.7134962000576563,
"eval_sts_dev_pearson_manhattan": 0.7155503733116253,
"eval_sts_dev_pearson_max": 0.7808289673024869,
"eval_sts_dev_spearman_cosine": 0.7822111314547963,
"eval_sts_dev_spearman_dot": 0.6958278382473629,
"eval_sts_dev_spearman_euclidean": 0.7049726585244658,
"eval_sts_dev_spearman_manhattan": 0.7078651037745494,
"eval_sts_dev_spearman_max": 0.7822111314547963,
"step": 280
},
{
"epoch": 6.467966573816156,
"eval_loss": 0.0316670723259449,
"eval_runtime": 7.0619,
"eval_samples_per_second": 212.406,
"eval_steps_per_second": 26.622,
"eval_sts_dev_pearson_cosine": 0.781180936397055,
"eval_sts_dev_pearson_dot": 0.7027629453006121,
"eval_sts_dev_pearson_euclidean": 0.7136902176147873,
"eval_sts_dev_pearson_manhattan": 0.715757628364657,
"eval_sts_dev_pearson_max": 0.781180936397055,
"eval_sts_dev_spearman_cosine": 0.78250079334828,
"eval_sts_dev_spearman_dot": 0.6962981450393402,
"eval_sts_dev_spearman_euclidean": 0.7051141445683561,
"eval_sts_dev_spearman_manhattan": 0.70821885209965,
"eval_sts_dev_spearman_max": 0.78250079334828,
"step": 285
},
{
"epoch": 6.579387186629527,
"grad_norm": 4.325808048248291,
"learning_rate": 2.017742216037572e-06,
"loss": 0.442,
"step": 290
},
{
"epoch": 6.579387186629527,
"eval_loss": 0.03155314922332764,
"eval_runtime": 6.933,
"eval_samples_per_second": 216.356,
"eval_steps_per_second": 27.117,
"eval_sts_dev_pearson_cosine": 0.781592834547759,
"eval_sts_dev_pearson_dot": 0.7030321075873802,
"eval_sts_dev_pearson_euclidean": 0.7138293804278546,
"eval_sts_dev_pearson_manhattan": 0.7159175761814789,
"eval_sts_dev_pearson_max": 0.781592834547759,
"eval_sts_dev_spearman_cosine": 0.7827403875693918,
"eval_sts_dev_spearman_dot": 0.6966818933630766,
"eval_sts_dev_spearman_euclidean": 0.705222522900883,
"eval_sts_dev_spearman_manhattan": 0.7082679375517423,
"eval_sts_dev_spearman_max": 0.7827403875693918,
"step": 290
},
{
"epoch": 6.690807799442897,
"eval_loss": 0.0314662829041481,
"eval_runtime": 7.0474,
"eval_samples_per_second": 212.844,
"eval_steps_per_second": 26.676,
"eval_sts_dev_pearson_cosine": 0.7820122068864954,
"eval_sts_dev_pearson_dot": 0.703421139648371,
"eval_sts_dev_pearson_euclidean": 0.7141068771656474,
"eval_sts_dev_pearson_manhattan": 0.7162068261112142,
"eval_sts_dev_pearson_max": 0.7820122068864954,
"eval_sts_dev_spearman_cosine": 0.7829970553896861,
"eval_sts_dev_spearman_dot": 0.6970113959506001,
"eval_sts_dev_spearman_euclidean": 0.7054796488454884,
"eval_sts_dev_spearman_manhattan": 0.7085587324330124,
"eval_sts_dev_spearman_max": 0.7829970553896861,
"step": 295
},
{
"epoch": 6.802228412256268,
"grad_norm": 3.6315908432006836,
"learning_rate": 2.087319533831971e-06,
"loss": 0.4665,
"step": 300
},
{
"epoch": 6.802228412256268,
"eval_loss": 0.03142312169075012,
"eval_runtime": 6.9811,
"eval_samples_per_second": 214.864,
"eval_steps_per_second": 26.93,
"eval_sts_dev_pearson_cosine": 0.7823768397963167,
"eval_sts_dev_pearson_dot": 0.7038756871911903,
"eval_sts_dev_pearson_euclidean": 0.7145009916374723,
"eval_sts_dev_pearson_manhattan": 0.7165993081434159,
"eval_sts_dev_pearson_max": 0.7823768397963167,
"eval_sts_dev_spearman_cosine": 0.7834457731278,
"eval_sts_dev_spearman_dot": 0.6973417239926998,
"eval_sts_dev_spearman_euclidean": 0.7059158400220358,
"eval_sts_dev_spearman_manhattan": 0.7090603670611569,
"eval_sts_dev_spearman_max": 0.7834457731278,
"step": 300
},
{
"epoch": 6.913649025069638,
"eval_loss": 0.03140180557966232,
"eval_runtime": 7.0935,
"eval_samples_per_second": 211.462,
"eval_steps_per_second": 26.503,
"eval_sts_dev_pearson_cosine": 0.7827189612475338,
"eval_sts_dev_pearson_dot": 0.7043799909167585,
"eval_sts_dev_pearson_euclidean": 0.715034388904346,
"eval_sts_dev_pearson_manhattan": 0.7171022025564596,
"eval_sts_dev_pearson_max": 0.7827189612475338,
"eval_sts_dev_spearman_cosine": 0.7839004976206189,
"eval_sts_dev_spearman_dot": 0.6975156259478882,
"eval_sts_dev_spearman_euclidean": 0.7065303588201288,
"eval_sts_dev_spearman_manhattan": 0.7095736568498506,
"eval_sts_dev_spearman_max": 0.7839004976206189,
"step": 305
},
{
"epoch": 7.044568245125348,
"grad_norm": 4.26026725769043,
"learning_rate": 2.15689685162637e-06,
"loss": 0.4672,
"step": 310
},
{
"epoch": 7.044568245125348,
"eval_loss": 0.03136160969734192,
"eval_runtime": 6.6776,
"eval_samples_per_second": 224.63,
"eval_steps_per_second": 28.154,
"eval_sts_dev_pearson_cosine": 0.7831698418188764,
"eval_sts_dev_pearson_dot": 0.7044122663834302,
"eval_sts_dev_pearson_euclidean": 0.7156598421085834,
"eval_sts_dev_pearson_manhattan": 0.7176890258722983,
"eval_sts_dev_pearson_max": 0.7831698418188764,
"eval_sts_dev_spearman_cosine": 0.7843284949390994,
"eval_sts_dev_spearman_dot": 0.697639093220699,
"eval_sts_dev_spearman_euclidean": 0.7073241375609828,
"eval_sts_dev_spearman_manhattan": 0.710185012169815,
"eval_sts_dev_spearman_max": 0.7843284949390994,
"step": 310
},
{
"epoch": 7.155988857938719,
"eval_loss": 0.031366512179374695,
"eval_runtime": 6.9924,
"eval_samples_per_second": 214.518,
"eval_steps_per_second": 26.886,
"eval_sts_dev_pearson_cosine": 0.7835832006721541,
"eval_sts_dev_pearson_dot": 0.7043934252027199,
"eval_sts_dev_pearson_euclidean": 0.7164264689263184,
"eval_sts_dev_pearson_manhattan": 0.7184030248845167,
"eval_sts_dev_pearson_max": 0.7835832006721541,
"eval_sts_dev_spearman_cosine": 0.7850548943796795,
"eval_sts_dev_spearman_dot": 0.6977756771302583,
"eval_sts_dev_spearman_euclidean": 0.708343725874613,
"eval_sts_dev_spearman_manhattan": 0.7111504960736558,
"eval_sts_dev_spearman_max": 0.7850548943796795,
"step": 315
},
{
"epoch": 7.2674094707520895,
"grad_norm": 3.808695077896118,
"learning_rate": 2.226474169420769e-06,
"loss": 0.4131,
"step": 320
},
{
"epoch": 7.2674094707520895,
"eval_loss": 0.03135285899043083,
"eval_runtime": 6.9057,
"eval_samples_per_second": 217.213,
"eval_steps_per_second": 27.224,
"eval_sts_dev_pearson_cosine": 0.7836045257427042,
"eval_sts_dev_pearson_dot": 0.7048735903915628,
"eval_sts_dev_pearson_euclidean": 0.7161062363729224,
"eval_sts_dev_pearson_manhattan": 0.7180798998241316,
"eval_sts_dev_pearson_max": 0.7836045257427042,
"eval_sts_dev_spearman_cosine": 0.7849975337135177,
"eval_sts_dev_spearman_dot": 0.6982899839848741,
"eval_sts_dev_spearman_euclidean": 0.7079431278357644,
"eval_sts_dev_spearman_manhattan": 0.710852480857077,
"eval_sts_dev_spearman_max": 0.7849975337135177,
"step": 320
},
{
"epoch": 7.378830083565459,
"eval_loss": 0.03127756714820862,
"eval_runtime": 6.9241,
"eval_samples_per_second": 216.634,
"eval_steps_per_second": 27.151,
"eval_sts_dev_pearson_cosine": 0.7836610063557831,
"eval_sts_dev_pearson_dot": 0.705409260823171,
"eval_sts_dev_pearson_euclidean": 0.7154023331837831,
"eval_sts_dev_pearson_manhattan": 0.717401985035912,
"eval_sts_dev_pearson_max": 0.7836610063557831,
"eval_sts_dev_spearman_cosine": 0.7848718916416149,
"eval_sts_dev_spearman_dot": 0.6991510364393221,
"eval_sts_dev_spearman_euclidean": 0.7071171759954781,
"eval_sts_dev_spearman_manhattan": 0.709827734664151,
"eval_sts_dev_spearman_max": 0.7848718916416149,
"step": 325
},
{
"epoch": 7.49025069637883,
"grad_norm": 3.8009250164031982,
"learning_rate": 2.2960514872151678e-06,
"loss": 0.4221,
"step": 330
},
{
"epoch": 7.49025069637883,
"eval_loss": 0.031188100576400757,
"eval_runtime": 7.0999,
"eval_samples_per_second": 211.272,
"eval_steps_per_second": 26.479,
"eval_sts_dev_pearson_cosine": 0.7838825238345812,
"eval_sts_dev_pearson_dot": 0.7057496676467132,
"eval_sts_dev_pearson_euclidean": 0.7150892410708943,
"eval_sts_dev_pearson_manhattan": 0.7171064711121474,
"eval_sts_dev_pearson_max": 0.7838825238345812,
"eval_sts_dev_spearman_cosine": 0.784820320759411,
"eval_sts_dev_spearman_dot": 0.6997042671311072,
"eval_sts_dev_spearman_euclidean": 0.7065608619879493,
"eval_sts_dev_spearman_manhattan": 0.7094620852598932,
"eval_sts_dev_spearman_max": 0.784820320759411,
"step": 330
},
{
"epoch": 7.6016713091922,
"eval_loss": 0.0310923233628273,
"eval_runtime": 6.9662,
"eval_samples_per_second": 215.326,
"eval_steps_per_second": 26.987,
"eval_sts_dev_pearson_cosine": 0.7843923769897447,
"eval_sts_dev_pearson_dot": 0.7058530968248947,
"eval_sts_dev_pearson_euclidean": 0.7155332189451762,
"eval_sts_dev_pearson_manhattan": 0.7175425736786123,
"eval_sts_dev_pearson_max": 0.7843923769897447,
"eval_sts_dev_spearman_cosine": 0.7853756910328091,
"eval_sts_dev_spearman_dot": 0.6999248217974418,
"eval_sts_dev_spearman_euclidean": 0.7071685659073802,
"eval_sts_dev_spearman_manhattan": 0.7099135119853421,
"eval_sts_dev_spearman_max": 0.7853756910328091,
"step": 335
},
{
"epoch": 7.713091922005571,
"grad_norm": 4.329479694366455,
"learning_rate": 2.3656288050095673e-06,
"loss": 0.4268,
"step": 340
},
{
"epoch": 7.713091922005571,
"eval_loss": 0.031015686690807343,
"eval_runtime": 6.8718,
"eval_samples_per_second": 218.283,
"eval_steps_per_second": 27.358,
"eval_sts_dev_pearson_cosine": 0.7848078944075182,
"eval_sts_dev_pearson_dot": 0.7062611613987171,
"eval_sts_dev_pearson_euclidean": 0.7156669541008578,
"eval_sts_dev_pearson_manhattan": 0.7176849379592309,
"eval_sts_dev_pearson_max": 0.7848078944075182,
"eval_sts_dev_spearman_cosine": 0.7857175803487115,
"eval_sts_dev_spearman_dot": 0.7006071388870717,
"eval_sts_dev_spearman_euclidean": 0.7074396606352066,
"eval_sts_dev_spearman_manhattan": 0.7101303213368534,
"eval_sts_dev_spearman_max": 0.7857175803487115,
"step": 340
},
{
"epoch": 7.8245125348189415,
"eval_loss": 0.030945729464292526,
"eval_runtime": 6.9722,
"eval_samples_per_second": 215.14,
"eval_steps_per_second": 26.964,
"eval_sts_dev_pearson_cosine": 0.7852280992749574,
"eval_sts_dev_pearson_dot": 0.7063015365766652,
"eval_sts_dev_pearson_euclidean": 0.71618048050416,
"eval_sts_dev_pearson_manhattan": 0.7181959951306995,
"eval_sts_dev_pearson_max": 0.7852280992749574,
"eval_sts_dev_spearman_cosine": 0.7861447827888495,
"eval_sts_dev_spearman_dot": 0.7007253260607372,
"eval_sts_dev_spearman_euclidean": 0.7080307843557273,
"eval_sts_dev_spearman_manhattan": 0.710707788624518,
"eval_sts_dev_spearman_max": 0.7861447827888495,
"step": 345
},
{
"epoch": 7.935933147632312,
"grad_norm": 4.521576881408691,
"learning_rate": 2.435206122803966e-06,
"loss": 0.4316,
"step": 350
},
{
"epoch": 7.935933147632312,
"eval_loss": 0.030903467908501625,
"eval_runtime": 6.8754,
"eval_samples_per_second": 218.169,
"eval_steps_per_second": 27.344,
"eval_sts_dev_pearson_cosine": 0.7857408106817081,
"eval_sts_dev_pearson_dot": 0.7063227803586387,
"eval_sts_dev_pearson_euclidean": 0.7171064497768416,
"eval_sts_dev_pearson_manhattan": 0.7190977579026478,
"eval_sts_dev_pearson_max": 0.7857408106817081,
"eval_sts_dev_spearman_cosine": 0.786647063435545,
"eval_sts_dev_spearman_dot": 0.7004210617791904,
"eval_sts_dev_spearman_euclidean": 0.7090060931384192,
"eval_sts_dev_spearman_manhattan": 0.7117304388117395,
"eval_sts_dev_spearman_max": 0.786647063435545,
"step": 350
},
{
"epoch": 8.066852367688023,
"eval_loss": 0.03090326115489006,
"eval_runtime": 6.7967,
"eval_samples_per_second": 220.696,
"eval_steps_per_second": 27.661,
"eval_sts_dev_pearson_cosine": 0.7860914327083659,
"eval_sts_dev_pearson_dot": 0.7067109311815922,
"eval_sts_dev_pearson_euclidean": 0.7179978723314155,
"eval_sts_dev_pearson_manhattan": 0.7199506434198831,
"eval_sts_dev_pearson_max": 0.7860914327083659,
"eval_sts_dev_spearman_cosine": 0.7871799411716375,
"eval_sts_dev_spearman_dot": 0.7005966817709771,
"eval_sts_dev_spearman_euclidean": 0.7099849983444726,
"eval_sts_dev_spearman_manhattan": 0.7126081974741519,
"eval_sts_dev_spearman_max": 0.7871799411716375,
"step": 355
},
{
"epoch": 8.178272980501394,
"grad_norm": 3.464381217956543,
"learning_rate": 2.504783440598365e-06,
"loss": 0.4277,
"step": 360
},
{
"epoch": 8.178272980501394,
"eval_loss": 0.030861668288707733,
"eval_runtime": 6.8952,
"eval_samples_per_second": 217.544,
"eval_steps_per_second": 27.265,
"eval_sts_dev_pearson_cosine": 0.7862113365203784,
"eval_sts_dev_pearson_dot": 0.7070142268847368,
"eval_sts_dev_pearson_euclidean": 0.7181137478219999,
"eval_sts_dev_pearson_manhattan": 0.7200573508948256,
"eval_sts_dev_pearson_max": 0.7862113365203784,
"eval_sts_dev_spearman_cosine": 0.7873051906331155,
"eval_sts_dev_spearman_dot": 0.700851803333668,
"eval_sts_dev_spearman_euclidean": 0.7101326235059475,
"eval_sts_dev_spearman_manhattan": 0.7126791959108771,
"eval_sts_dev_spearman_max": 0.7873051906331155,
"step": 360
},
{
"epoch": 8.289693593314762,
"eval_loss": 0.03079277276992798,
"eval_runtime": 7.0041,
"eval_samples_per_second": 214.159,
"eval_steps_per_second": 26.841,
"eval_sts_dev_pearson_cosine": 0.7861051555153227,
"eval_sts_dev_pearson_dot": 0.7077462081618229,
"eval_sts_dev_pearson_euclidean": 0.7175047036545574,
"eval_sts_dev_pearson_manhattan": 0.7194616943503004,
"eval_sts_dev_pearson_max": 0.7861051555153227,
"eval_sts_dev_spearman_cosine": 0.7869754283660466,
"eval_sts_dev_spearman_dot": 0.7018953525077267,
"eval_sts_dev_spearman_euclidean": 0.7093618435488815,
"eval_sts_dev_spearman_manhattan": 0.7120432245619701,
"eval_sts_dev_spearman_max": 0.7869754283660466,
"step": 365
},
{
"epoch": 8.401114206128133,
"grad_norm": 3.629032850265503,
"learning_rate": 2.5743607583927645e-06,
"loss": 0.3925,
"step": 370
},
{
"epoch": 8.401114206128133,
"eval_loss": 0.03077574074268341,
"eval_runtime": 6.9569,
"eval_samples_per_second": 215.613,
"eval_steps_per_second": 27.024,
"eval_sts_dev_pearson_cosine": 0.7860927703016911,
"eval_sts_dev_pearson_dot": 0.7084805810982604,
"eval_sts_dev_pearson_euclidean": 0.7171292733763057,
"eval_sts_dev_pearson_manhattan": 0.7191008391698412,
"eval_sts_dev_pearson_max": 0.7860927703016911,
"eval_sts_dev_spearman_cosine": 0.7868465023058949,
"eval_sts_dev_spearman_dot": 0.7026257860756843,
"eval_sts_dev_spearman_euclidean": 0.7087433915922463,
"eval_sts_dev_spearman_manhattan": 0.7115662090675204,
"eval_sts_dev_spearman_max": 0.7868465023058949,
"step": 370
},
{
"epoch": 8.512534818941504,
"eval_loss": 0.03077036887407303,
"eval_runtime": 6.8481,
"eval_samples_per_second": 219.038,
"eval_steps_per_second": 27.453,
"eval_sts_dev_pearson_cosine": 0.7860543259557101,
"eval_sts_dev_pearson_dot": 0.7090029747286515,
"eval_sts_dev_pearson_euclidean": 0.7168001987123229,
"eval_sts_dev_pearson_manhattan": 0.7187912798445806,
"eval_sts_dev_pearson_max": 0.7860543259557101,
"eval_sts_dev_spearman_cosine": 0.786577121013552,
"eval_sts_dev_spearman_dot": 0.7032207123703509,
"eval_sts_dev_spearman_euclidean": 0.7083026579268292,
"eval_sts_dev_spearman_manhattan": 0.7111138102646555,
"eval_sts_dev_spearman_max": 0.786577121013552,
"step": 375
},
{
"epoch": 8.623955431754874,
"grad_norm": 4.5424346923828125,
"learning_rate": 2.643938076187163e-06,
"loss": 0.4049,
"step": 380
},
{
"epoch": 8.623955431754874,
"eval_loss": 0.030785972252488136,
"eval_runtime": 6.9052,
"eval_samples_per_second": 217.228,
"eval_steps_per_second": 27.226,
"eval_sts_dev_pearson_cosine": 0.786338341456081,
"eval_sts_dev_pearson_dot": 0.7090251722360976,
"eval_sts_dev_pearson_euclidean": 0.7176375494602096,
"eval_sts_dev_pearson_manhattan": 0.7195903686388057,
"eval_sts_dev_pearson_max": 0.786338341456081,
"eval_sts_dev_spearman_cosine": 0.7869461186588641,
"eval_sts_dev_spearman_dot": 0.7030353980707192,
"eval_sts_dev_spearman_euclidean": 0.7093240329985625,
"eval_sts_dev_spearman_manhattan": 0.7120013731894795,
"eval_sts_dev_spearman_max": 0.7869461186588641,
"step": 380
},
{
"epoch": 8.735376044568245,
"eval_loss": 0.03077947534620762,
"eval_runtime": 6.94,
"eval_samples_per_second": 216.137,
"eval_steps_per_second": 27.089,
"eval_sts_dev_pearson_cosine": 0.7867836664964302,
"eval_sts_dev_pearson_dot": 0.7089649699768177,
"eval_sts_dev_pearson_euclidean": 0.7185998785212442,
"eval_sts_dev_pearson_manhattan": 0.7205256023581162,
"eval_sts_dev_pearson_max": 0.7867836664964302,
"eval_sts_dev_spearman_cosine": 0.7875195626790124,
"eval_sts_dev_spearman_dot": 0.7028351666319841,
"eval_sts_dev_spearman_euclidean": 0.7105482738364566,
"eval_sts_dev_spearman_manhattan": 0.7132642042369475,
"eval_sts_dev_spearman_max": 0.7875195626790124,
"step": 385
},
{
"epoch": 8.846796657381615,
"grad_norm": 3.7269480228424072,
"learning_rate": 2.7135153939815623e-06,
"loss": 0.3742,
"step": 390
},
{
"epoch": 8.846796657381615,
"eval_loss": 0.030757909640669823,
"eval_runtime": 6.912,
"eval_samples_per_second": 217.015,
"eval_steps_per_second": 27.199,
"eval_sts_dev_pearson_cosine": 0.7873307957198338,
"eval_sts_dev_pearson_dot": 0.7087450117938812,
"eval_sts_dev_pearson_euclidean": 0.7199394166229915,
"eval_sts_dev_pearson_manhattan": 0.7218118008402783,
"eval_sts_dev_pearson_max": 0.7873307957198338,
"eval_sts_dev_spearman_cosine": 0.7883481466120934,
"eval_sts_dev_spearman_dot": 0.702431533404311,
"eval_sts_dev_spearman_euclidean": 0.7122286167501692,
"eval_sts_dev_spearman_manhattan": 0.7149544811678771,
"eval_sts_dev_spearman_max": 0.7883481466120934,
"step": 390
},
{
"epoch": 8.958217270194986,
"eval_loss": 0.03074067085981369,
"eval_runtime": 7.0786,
"eval_samples_per_second": 211.905,
"eval_steps_per_second": 26.559,
"eval_sts_dev_pearson_cosine": 0.7875281932009626,
"eval_sts_dev_pearson_dot": 0.7091183187974348,
"eval_sts_dev_pearson_euclidean": 0.720306579358833,
"eval_sts_dev_pearson_manhattan": 0.7221545912209083,
"eval_sts_dev_pearson_max": 0.7875281932009626,
"eval_sts_dev_spearman_cosine": 0.7884911216315376,
"eval_sts_dev_spearman_dot": 0.7026504547905195,
"eval_sts_dev_spearman_euclidean": 0.7125846397557779,
"eval_sts_dev_spearman_manhattan": 0.7153917764693033,
"eval_sts_dev_spearman_max": 0.7884911216315376,
"step": 395
},
{
"epoch": 9.089136490250697,
"grad_norm": 3.8048255443573,
"learning_rate": 2.7830927117759614e-06,
"loss": 0.3498,
"step": 400
},
{
"epoch": 9.089136490250697,
"eval_loss": 0.03073756769299507,
"eval_runtime": 7.1819,
"eval_samples_per_second": 208.858,
"eval_steps_per_second": 26.177,
"eval_sts_dev_pearson_cosine": 0.7875285006609543,
"eval_sts_dev_pearson_dot": 0.709718276464936,
"eval_sts_dev_pearson_euclidean": 0.7202436438310591,
"eval_sts_dev_pearson_manhattan": 0.7220766094080024,
"eval_sts_dev_pearson_max": 0.7875285006609543,
"eval_sts_dev_spearman_cosine": 0.7885939335328866,
"eval_sts_dev_spearman_dot": 0.7032536436958657,
"eval_sts_dev_spearman_euclidean": 0.7124855846354039,
"eval_sts_dev_spearman_manhattan": 0.7153797502128406,
"eval_sts_dev_spearman_max": 0.7885939335328866,
"step": 400
},
{
"epoch": 9.200557103064067,
"eval_loss": 0.03071259893476963,
"eval_runtime": 6.8201,
"eval_samples_per_second": 219.938,
"eval_steps_per_second": 27.566,
"eval_sts_dev_pearson_cosine": 0.787184477170156,
"eval_sts_dev_pearson_dot": 0.7102603851217889,
"eval_sts_dev_pearson_euclidean": 0.7195444208609296,
"eval_sts_dev_pearson_manhattan": 0.7213936268781151,
"eval_sts_dev_pearson_max": 0.787184477170156,
"eval_sts_dev_spearman_cosine": 0.78809909542145,
"eval_sts_dev_spearman_dot": 0.7036724949513745,
"eval_sts_dev_spearman_euclidean": 0.7115938480269084,
"eval_sts_dev_spearman_manhattan": 0.7143300985487689,
"eval_sts_dev_spearman_max": 0.78809909542145,
"step": 405
}
],
"logging_steps": 10,
"max_steps": 440,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}