AraEuroBert-610M / trainer_state.json
Omartificial-Intelligence-Space's picture
upload 16 files
8567b6f verified
{
"best_metric": 0.8299749701259963,
"best_model_checkpoint": "output/marbert_simce_EuroBERT-EuroBERT-610M_16_bs_1_e/checkpoint-25000",
"epoch": 0.8139208532135841,
"eval_steps": 500,
"global_step": 29000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005613247263541959,
"grad_norm": 98309536.0,
"learning_rate": 2.8066236317709794e-06,
"loss": 7.8472,
"step": 200
},
{
"epoch": 0.011226494527083918,
"grad_norm": 18181398.0,
"learning_rate": 5.613247263541959e-06,
"loss": 1.8133,
"step": 400
},
{
"epoch": 0.014033118158854897,
"eval_loss": 4.332894802093506,
"eval_runtime": 80.7749,
"eval_samples_per_second": 81.82,
"eval_sequential_score": 0.8033767514947983,
"eval_steps_per_second": 1.288,
"eval_sts-dev-1152_pearson_cosine": 0.8060511562516744,
"eval_sts-dev-1152_spearman_cosine": 0.8033767514947983,
"eval_sts-dev-512_pearson_cosine": 0.8032161700580676,
"eval_sts-dev-512_spearman_cosine": 0.8009572819138866,
"eval_sts-dev-768_pearson_cosine": 0.8037436235908193,
"eval_sts-dev-768_spearman_cosine": 0.8010140475354219,
"eval_sts-dev-960_pearson_cosine": 0.8052857547543548,
"eval_sts-dev-960_spearman_cosine": 0.8024252040181402,
"step": 500
},
{
"epoch": 0.016839741790625876,
"grad_norm": 4002785.75,
"learning_rate": 8.41987089531294e-06,
"loss": 1.2257,
"step": 600
},
{
"epoch": 0.022452989054167836,
"grad_norm": 14975067.0,
"learning_rate": 1.1226494527083917e-05,
"loss": 1.0662,
"step": 800
},
{
"epoch": 0.028066236317709794,
"grad_norm": 8259098.0,
"learning_rate": 1.4033118158854899e-05,
"loss": 1.0452,
"step": 1000
},
{
"epoch": 0.028066236317709794,
"eval_loss": 4.0553975105285645,
"eval_runtime": 83.8854,
"eval_samples_per_second": 78.786,
"eval_sequential_score": 0.8152056130630069,
"eval_steps_per_second": 1.24,
"eval_sts-dev-1152_pearson_cosine": 0.8162027650076662,
"eval_sts-dev-1152_spearman_cosine": 0.8152056130630069,
"eval_sts-dev-512_pearson_cosine": 0.8109347704147131,
"eval_sts-dev-512_spearman_cosine": 0.811300104502657,
"eval_sts-dev-768_pearson_cosine": 0.8133183334539484,
"eval_sts-dev-768_spearman_cosine": 0.8130045482521145,
"eval_sts-dev-960_pearson_cosine": 0.8156636692837823,
"eval_sts-dev-960_spearman_cosine": 0.814950405147375,
"step": 1000
},
{
"epoch": 0.03367948358125175,
"grad_norm": 4874028.5,
"learning_rate": 1.683974179062588e-05,
"loss": 1.0306,
"step": 1200
},
{
"epoch": 0.03929273084479371,
"grad_norm": 4182119.25,
"learning_rate": 1.9646365422396855e-05,
"loss": 1.0223,
"step": 1400
},
{
"epoch": 0.042099354476564696,
"eval_loss": 4.015018939971924,
"eval_runtime": 84.6437,
"eval_samples_per_second": 78.08,
"eval_sequential_score": 0.7963919935425935,
"eval_steps_per_second": 1.229,
"eval_sts-dev-1152_pearson_cosine": 0.7984721704385993,
"eval_sts-dev-1152_spearman_cosine": 0.7963919935425935,
"eval_sts-dev-512_pearson_cosine": 0.7945396929823907,
"eval_sts-dev-512_spearman_cosine": 0.7938897942194647,
"eval_sts-dev-768_pearson_cosine": 0.796786289376193,
"eval_sts-dev-768_spearman_cosine": 0.7957412002954445,
"eval_sts-dev-960_pearson_cosine": 0.7984310473849502,
"eval_sts-dev-960_spearman_cosine": 0.7964965649245895,
"step": 1500
},
{
"epoch": 0.04490597810833567,
"grad_norm": 4491974.5,
"learning_rate": 2.2452989054167835e-05,
"loss": 0.9923,
"step": 1600
},
{
"epoch": 0.050519225371877634,
"grad_norm": 2410247.5,
"learning_rate": 2.5259612685938815e-05,
"loss": 1.007,
"step": 1800
},
{
"epoch": 0.05613247263541959,
"grad_norm": 3212723.25,
"learning_rate": 2.8066236317709798e-05,
"loss": 0.9898,
"step": 2000
},
{
"epoch": 0.05613247263541959,
"eval_loss": 4.562768459320068,
"eval_runtime": 84.3142,
"eval_samples_per_second": 78.385,
"eval_sequential_score": 0.7838450146591204,
"eval_steps_per_second": 1.233,
"eval_sts-dev-1152_pearson_cosine": 0.7765908851768635,
"eval_sts-dev-1152_spearman_cosine": 0.7838450146591204,
"eval_sts-dev-512_pearson_cosine": 0.7672097671946088,
"eval_sts-dev-512_spearman_cosine": 0.7778618805232163,
"eval_sts-dev-768_pearson_cosine": 0.7696489245819802,
"eval_sts-dev-768_spearman_cosine": 0.7793358424223233,
"eval_sts-dev-960_pearson_cosine": 0.7741670009813553,
"eval_sts-dev-960_spearman_cosine": 0.7829158933416259,
"step": 2000
},
{
"epoch": 0.06174571989896155,
"grad_norm": 1835005.25,
"learning_rate": 3.087285994948078e-05,
"loss": 1.033,
"step": 2200
},
{
"epoch": 0.0673589671625035,
"grad_norm": 2749047.0,
"learning_rate": 3.367948358125176e-05,
"loss": 1.0091,
"step": 2400
},
{
"epoch": 0.0701655907942745,
"eval_loss": 4.613296985626221,
"eval_runtime": 81.7193,
"eval_samples_per_second": 80.874,
"eval_sequential_score": 0.7854737360223817,
"eval_steps_per_second": 1.273,
"eval_sts-dev-1152_pearson_cosine": 0.7821361682028354,
"eval_sts-dev-1152_spearman_cosine": 0.7854737360223817,
"eval_sts-dev-512_pearson_cosine": 0.7723481989731885,
"eval_sts-dev-512_spearman_cosine": 0.7767002410536074,
"eval_sts-dev-768_pearson_cosine": 0.7752960500176977,
"eval_sts-dev-768_spearman_cosine": 0.7801463868858681,
"eval_sts-dev-960_pearson_cosine": 0.779209273182729,
"eval_sts-dev-960_spearman_cosine": 0.7833584025542436,
"step": 2500
},
{
"epoch": 0.07297221442604547,
"grad_norm": 2484525.5,
"learning_rate": 3.648610721302274e-05,
"loss": 1.046,
"step": 2600
},
{
"epoch": 0.07858546168958742,
"grad_norm": 2774297.5,
"learning_rate": 3.929273084479371e-05,
"loss": 1.0212,
"step": 2800
},
{
"epoch": 0.08419870895312939,
"grad_norm": 3957846.75,
"learning_rate": 4.20993544765647e-05,
"loss": 1.0923,
"step": 3000
},
{
"epoch": 0.08419870895312939,
"eval_loss": 5.038168430328369,
"eval_runtime": 81.5206,
"eval_samples_per_second": 81.071,
"eval_sequential_score": 0.769819909731699,
"eval_steps_per_second": 1.276,
"eval_sts-dev-1152_pearson_cosine": 0.7639981739632917,
"eval_sts-dev-1152_spearman_cosine": 0.769819909731699,
"eval_sts-dev-512_pearson_cosine": 0.7557090629225184,
"eval_sts-dev-512_spearman_cosine": 0.7642757401766183,
"eval_sts-dev-768_pearson_cosine": 0.7573996144114894,
"eval_sts-dev-768_spearman_cosine": 0.7655547584963449,
"eval_sts-dev-960_pearson_cosine": 0.761317538183819,
"eval_sts-dev-960_spearman_cosine": 0.7681871634317281,
"step": 3000
},
{
"epoch": 0.08981195621667135,
"grad_norm": 2094610.625,
"learning_rate": 4.490597810833567e-05,
"loss": 1.0542,
"step": 3200
},
{
"epoch": 0.0954252034802133,
"grad_norm": 1429313.5,
"learning_rate": 4.7712601740106656e-05,
"loss": 1.025,
"step": 3400
},
{
"epoch": 0.09823182711198428,
"eval_loss": 4.955362319946289,
"eval_runtime": 78.4429,
"eval_samples_per_second": 84.252,
"eval_sequential_score": 0.773224539195293,
"eval_steps_per_second": 1.326,
"eval_sts-dev-1152_pearson_cosine": 0.7657713900430686,
"eval_sts-dev-1152_spearman_cosine": 0.773224539195293,
"eval_sts-dev-512_pearson_cosine": 0.7600635376400412,
"eval_sts-dev-512_spearman_cosine": 0.7680689406591721,
"eval_sts-dev-768_pearson_cosine": 0.7604851132447141,
"eval_sts-dev-768_spearman_cosine": 0.7693172932298855,
"eval_sts-dev-960_pearson_cosine": 0.7631668495431749,
"eval_sts-dev-960_spearman_cosine": 0.7712486101715305,
"step": 3500
},
{
"epoch": 0.10103845074375527,
"grad_norm": 993321.0,
"learning_rate": 4.99423082920136e-05,
"loss": 1.0056,
"step": 3600
},
{
"epoch": 0.10665169800729722,
"grad_norm": 13974978.0,
"learning_rate": 4.963046122181682e-05,
"loss": 1.0689,
"step": 3800
},
{
"epoch": 0.11226494527083918,
"grad_norm": 6447055.0,
"learning_rate": 4.931861415162005e-05,
"loss": 1.0453,
"step": 4000
},
{
"epoch": 0.11226494527083918,
"eval_loss": 5.834151268005371,
"eval_runtime": 77.2575,
"eval_samples_per_second": 85.545,
"eval_sequential_score": 0.7656099368624604,
"eval_steps_per_second": 1.346,
"eval_sts-dev-1152_pearson_cosine": 0.7552591908965304,
"eval_sts-dev-1152_spearman_cosine": 0.7656099368624604,
"eval_sts-dev-512_pearson_cosine": 0.7502607892289657,
"eval_sts-dev-512_spearman_cosine": 0.7606979870614468,
"eval_sts-dev-768_pearson_cosine": 0.7475751433884098,
"eval_sts-dev-768_spearman_cosine": 0.7607444598882842,
"eval_sts-dev-960_pearson_cosine": 0.7529520081710266,
"eval_sts-dev-960_spearman_cosine": 0.764507705472108,
"step": 4000
},
{
"epoch": 0.11787819253438114,
"grad_norm": 1890051.125,
"learning_rate": 4.9006767081423274e-05,
"loss": 1.0874,
"step": 4200
},
{
"epoch": 0.1234914397979231,
"grad_norm": 1684058.0,
"learning_rate": 4.8694920011226495e-05,
"loss": 1.0051,
"step": 4400
},
{
"epoch": 0.12629806342969407,
"eval_loss": 5.076698303222656,
"eval_runtime": 79.9271,
"eval_samples_per_second": 82.688,
"eval_sequential_score": 0.7752075113901122,
"eval_steps_per_second": 1.301,
"eval_sts-dev-1152_pearson_cosine": 0.7719939114671934,
"eval_sts-dev-1152_spearman_cosine": 0.7752075113901122,
"eval_sts-dev-512_pearson_cosine": 0.7652586024520893,
"eval_sts-dev-512_spearman_cosine": 0.7706670879702195,
"eval_sts-dev-768_pearson_cosine": 0.7663537315286835,
"eval_sts-dev-768_spearman_cosine": 0.771176148682848,
"eval_sts-dev-960_pearson_cosine": 0.7691859699812915,
"eval_sts-dev-960_spearman_cosine": 0.7728489604857174,
"step": 4500
},
{
"epoch": 0.12910468706146505,
"grad_norm": 1788628.75,
"learning_rate": 4.838307294102972e-05,
"loss": 1.0007,
"step": 4600
},
{
"epoch": 0.134717934325007,
"grad_norm": 1246448.375,
"learning_rate": 4.807122587083295e-05,
"loss": 0.9307,
"step": 4800
},
{
"epoch": 0.140331181588549,
"grad_norm": 1078949.875,
"learning_rate": 4.775937880063617e-05,
"loss": 0.9642,
"step": 5000
},
{
"epoch": 0.140331181588549,
"eval_loss": 5.198572158813477,
"eval_runtime": 77.2033,
"eval_samples_per_second": 85.605,
"eval_sequential_score": 0.7683338860401957,
"eval_steps_per_second": 1.347,
"eval_sts-dev-1152_pearson_cosine": 0.7579755842587974,
"eval_sts-dev-1152_spearman_cosine": 0.7683338860401957,
"eval_sts-dev-512_pearson_cosine": 0.7497028852140948,
"eval_sts-dev-512_spearman_cosine": 0.761778428268311,
"eval_sts-dev-768_pearson_cosine": 0.7527948972243363,
"eval_sts-dev-768_spearman_cosine": 0.7652442137002148,
"eval_sts-dev-960_pearson_cosine": 0.7555515440882907,
"eval_sts-dev-960_spearman_cosine": 0.7666530937388959,
"step": 5000
},
{
"epoch": 0.14594442885209094,
"grad_norm": 1531033.125,
"learning_rate": 4.744753173043939e-05,
"loss": 0.9259,
"step": 5200
},
{
"epoch": 0.1515576761156329,
"grad_norm": 2660688.5,
"learning_rate": 4.713568466024262e-05,
"loss": 0.8908,
"step": 5400
},
{
"epoch": 0.15436429974740387,
"eval_loss": 5.210824966430664,
"eval_runtime": 78.072,
"eval_samples_per_second": 84.653,
"eval_sequential_score": 0.7761786001654587,
"eval_steps_per_second": 1.332,
"eval_sts-dev-1152_pearson_cosine": 0.7628982988651709,
"eval_sts-dev-1152_spearman_cosine": 0.7761786001654587,
"eval_sts-dev-512_pearson_cosine": 0.7563657029921103,
"eval_sts-dev-512_spearman_cosine": 0.7711187793718405,
"eval_sts-dev-768_pearson_cosine": 0.7545503313040781,
"eval_sts-dev-768_spearman_cosine": 0.7713354514018652,
"eval_sts-dev-960_pearson_cosine": 0.7595387577495225,
"eval_sts-dev-960_spearman_cosine": 0.774137594546881,
"step": 5500
},
{
"epoch": 0.15717092337917485,
"grad_norm": 1493755.375,
"learning_rate": 4.6823837590045846e-05,
"loss": 0.8812,
"step": 5600
},
{
"epoch": 0.1627841706427168,
"grad_norm": 1185704.125,
"learning_rate": 4.651199051984907e-05,
"loss": 0.8544,
"step": 5800
},
{
"epoch": 0.16839741790625878,
"grad_norm": 1171420.375,
"learning_rate": 4.6200143449652295e-05,
"loss": 0.8314,
"step": 6000
},
{
"epoch": 0.16839741790625878,
"eval_loss": 5.24008321762085,
"eval_runtime": 77.1769,
"eval_samples_per_second": 85.634,
"eval_sequential_score": 0.7731812487805466,
"eval_steps_per_second": 1.348,
"eval_sts-dev-1152_pearson_cosine": 0.7629915610098019,
"eval_sts-dev-1152_spearman_cosine": 0.7731812487805466,
"eval_sts-dev-512_pearson_cosine": 0.7548517081547739,
"eval_sts-dev-512_spearman_cosine": 0.7660070542727266,
"eval_sts-dev-768_pearson_cosine": 0.7552724107119573,
"eval_sts-dev-768_spearman_cosine": 0.7688482677049882,
"eval_sts-dev-960_pearson_cosine": 0.7593839964325895,
"eval_sts-dev-960_spearman_cosine": 0.7709090473847962,
"step": 6000
},
{
"epoch": 0.17401066516980074,
"grad_norm": 20138014.0,
"learning_rate": 4.588829637945552e-05,
"loss": 0.8258,
"step": 6200
},
{
"epoch": 0.1796239124333427,
"grad_norm": 936058.875,
"learning_rate": 4.557644930925874e-05,
"loss": 0.8083,
"step": 6400
},
{
"epoch": 0.18243053606511367,
"eval_loss": 5.270838260650635,
"eval_runtime": 76.7742,
"eval_samples_per_second": 86.084,
"eval_sequential_score": 0.7680196935178831,
"eval_steps_per_second": 1.355,
"eval_sts-dev-1152_pearson_cosine": 0.7542454277081971,
"eval_sts-dev-1152_spearman_cosine": 0.7680196935178831,
"eval_sts-dev-512_pearson_cosine": 0.7491425293129472,
"eval_sts-dev-512_spearman_cosine": 0.7637887014241396,
"eval_sts-dev-768_pearson_cosine": 0.7462311896234177,
"eval_sts-dev-768_spearman_cosine": 0.7621924407912392,
"eval_sts-dev-960_pearson_cosine": 0.7506447400007437,
"eval_sts-dev-960_spearman_cosine": 0.7656016598904541,
"step": 6500
},
{
"epoch": 0.18523715969688465,
"grad_norm": 15407098.0,
"learning_rate": 4.5264602239061963e-05,
"loss": 0.8373,
"step": 6600
},
{
"epoch": 0.1908504069604266,
"grad_norm": 1347636.375,
"learning_rate": 4.495275516886519e-05,
"loss": 0.8031,
"step": 6800
},
{
"epoch": 0.19646365422396855,
"grad_norm": 1346039.0,
"learning_rate": 4.464090809866842e-05,
"loss": 0.7375,
"step": 7000
},
{
"epoch": 0.19646365422396855,
"eval_loss": 5.154874801635742,
"eval_runtime": 77.428,
"eval_samples_per_second": 85.357,
"eval_sequential_score": 0.776258910277576,
"eval_steps_per_second": 1.343,
"eval_sts-dev-1152_pearson_cosine": 0.7608005418896666,
"eval_sts-dev-1152_spearman_cosine": 0.776258910277576,
"eval_sts-dev-512_pearson_cosine": 0.7589220526161604,
"eval_sts-dev-512_spearman_cosine": 0.773786883290433,
"eval_sts-dev-768_pearson_cosine": 0.7534825416262227,
"eval_sts-dev-768_spearman_cosine": 0.7718540709899384,
"eval_sts-dev-960_pearson_cosine": 0.7570206619012192,
"eval_sts-dev-960_spearman_cosine": 0.7739587544161404,
"step": 7000
},
{
"epoch": 0.20207690148751054,
"grad_norm": 1730871.5,
"learning_rate": 4.432906102847164e-05,
"loss": 0.743,
"step": 7200
},
{
"epoch": 0.2076901487510525,
"grad_norm": 610056.0,
"learning_rate": 4.4017213958274867e-05,
"loss": 0.739,
"step": 7400
},
{
"epoch": 0.21049677238282347,
"eval_loss": 4.818795204162598,
"eval_runtime": 78.238,
"eval_samples_per_second": 84.473,
"eval_sequential_score": 0.7867819228719439,
"eval_steps_per_second": 1.329,
"eval_sts-dev-1152_pearson_cosine": 0.7777460591194046,
"eval_sts-dev-1152_spearman_cosine": 0.7867819228719439,
"eval_sts-dev-512_pearson_cosine": 0.7742009652343147,
"eval_sts-dev-512_spearman_cosine": 0.7834916540309068,
"eval_sts-dev-768_pearson_cosine": 0.771136418007053,
"eval_sts-dev-768_spearman_cosine": 0.7825233109168519,
"eval_sts-dev-960_pearson_cosine": 0.7749317385070862,
"eval_sts-dev-960_spearman_cosine": 0.7849087778447466,
"step": 7500
},
{
"epoch": 0.21330339601459444,
"grad_norm": 816702.6875,
"learning_rate": 4.370536688807809e-05,
"loss": 0.7399,
"step": 7600
},
{
"epoch": 0.2189166432781364,
"grad_norm": 984849.5625,
"learning_rate": 4.3393519817881315e-05,
"loss": 0.6723,
"step": 7800
},
{
"epoch": 0.22452989054167835,
"grad_norm": 2291592.5,
"learning_rate": 4.3081672747684535e-05,
"loss": 0.6866,
"step": 8000
},
{
"epoch": 0.22452989054167835,
"eval_loss": 5.077595233917236,
"eval_runtime": 81.793,
"eval_samples_per_second": 80.802,
"eval_sequential_score": 0.7714443289674112,
"eval_steps_per_second": 1.272,
"eval_sts-dev-1152_pearson_cosine": 0.7519731587462412,
"eval_sts-dev-1152_spearman_cosine": 0.7714443289674112,
"eval_sts-dev-512_pearson_cosine": 0.74485948560521,
"eval_sts-dev-512_spearman_cosine": 0.7666552681351647,
"eval_sts-dev-768_pearson_cosine": 0.7434046002477062,
"eval_sts-dev-768_spearman_cosine": 0.7663030810953583,
"eval_sts-dev-960_pearson_cosine": 0.7477923465698348,
"eval_sts-dev-960_spearman_cosine": 0.7687824273352952,
"step": 8000
},
{
"epoch": 0.23014313780522033,
"grad_norm": 854538.25,
"learning_rate": 4.276982567748776e-05,
"loss": 0.6556,
"step": 8200
},
{
"epoch": 0.2357563850687623,
"grad_norm": 567974.3125,
"learning_rate": 4.245797860729099e-05,
"loss": 0.6886,
"step": 8400
},
{
"epoch": 0.23856300870053326,
"eval_loss": 4.77580451965332,
"eval_runtime": 81.3533,
"eval_samples_per_second": 81.238,
"eval_sequential_score": 0.7845369317488485,
"eval_steps_per_second": 1.278,
"eval_sts-dev-1152_pearson_cosine": 0.7759169223036488,
"eval_sts-dev-1152_spearman_cosine": 0.7845369317488485,
"eval_sts-dev-512_pearson_cosine": 0.7701848660015407,
"eval_sts-dev-512_spearman_cosine": 0.7810897586341158,
"eval_sts-dev-768_pearson_cosine": 0.7706226329945012,
"eval_sts-dev-768_spearman_cosine": 0.7807681848391878,
"eval_sts-dev-960_pearson_cosine": 0.7736197337766615,
"eval_sts-dev-960_spearman_cosine": 0.7827940121873471,
"step": 8500
},
{
"epoch": 0.24136963233230424,
"grad_norm": 984615.125,
"learning_rate": 4.214613153709421e-05,
"loss": 0.685,
"step": 8600
},
{
"epoch": 0.2469828795958462,
"grad_norm": 891299.0625,
"learning_rate": 4.183428446689743e-05,
"loss": 0.6401,
"step": 8800
},
{
"epoch": 0.25259612685938815,
"grad_norm": 815193.5,
"learning_rate": 4.152243739670066e-05,
"loss": 0.6617,
"step": 9000
},
{
"epoch": 0.25259612685938815,
"eval_loss": 4.602816581726074,
"eval_runtime": 78.6877,
"eval_samples_per_second": 83.99,
"eval_sequential_score": 0.7821317387888623,
"eval_steps_per_second": 1.322,
"eval_sts-dev-1152_pearson_cosine": 0.7751221600500908,
"eval_sts-dev-1152_spearman_cosine": 0.7821317387888623,
"eval_sts-dev-512_pearson_cosine": 0.7694106022891276,
"eval_sts-dev-512_spearman_cosine": 0.7776661813002868,
"eval_sts-dev-768_pearson_cosine": 0.7688507047470482,
"eval_sts-dev-768_spearman_cosine": 0.7773842606816174,
"eval_sts-dev-960_pearson_cosine": 0.772897246519623,
"eval_sts-dev-960_spearman_cosine": 0.7804687323409016,
"step": 9000
},
{
"epoch": 0.2582093741229301,
"grad_norm": 677073.1875,
"learning_rate": 4.121059032650389e-05,
"loss": 0.6208,
"step": 9200
},
{
"epoch": 0.26382262138647206,
"grad_norm": 998130.3125,
"learning_rate": 4.089874325630711e-05,
"loss": 0.6307,
"step": 9400
},
{
"epoch": 0.26662924501824303,
"eval_loss": 4.539032459259033,
"eval_runtime": 78.0206,
"eval_samples_per_second": 84.708,
"eval_sequential_score": 0.7853296979011097,
"eval_steps_per_second": 1.333,
"eval_sts-dev-1152_pearson_cosine": 0.7765914965051797,
"eval_sts-dev-1152_spearman_cosine": 0.7853296979011097,
"eval_sts-dev-512_pearson_cosine": 0.7723537190754219,
"eval_sts-dev-512_spearman_cosine": 0.7823374065712144,
"eval_sts-dev-768_pearson_cosine": 0.7715991971997825,
"eval_sts-dev-768_spearman_cosine": 0.7821482216403839,
"eval_sts-dev-960_pearson_cosine": 0.7750935828664101,
"eval_sts-dev-960_spearman_cosine": 0.7843683074967508,
"step": 9500
},
{
"epoch": 0.269435868650014,
"grad_norm": 4926974.0,
"learning_rate": 4.0586896186110335e-05,
"loss": 0.6557,
"step": 9600
},
{
"epoch": 0.275049115913556,
"grad_norm": 1349007.875,
"learning_rate": 4.027504911591356e-05,
"loss": 0.6102,
"step": 9800
},
{
"epoch": 0.280662363177098,
"grad_norm": 4552381.5,
"learning_rate": 3.996320204571678e-05,
"loss": 0.5917,
"step": 10000
},
{
"epoch": 0.280662363177098,
"eval_loss": 4.608828067779541,
"eval_runtime": 77.5783,
"eval_samples_per_second": 85.191,
"eval_sequential_score": 0.7826250058709692,
"eval_steps_per_second": 1.341,
"eval_sts-dev-1152_pearson_cosine": 0.7759367710047786,
"eval_sts-dev-1152_spearman_cosine": 0.7826250058709692,
"eval_sts-dev-512_pearson_cosine": 0.7683710236695287,
"eval_sts-dev-512_spearman_cosine": 0.7770467944017624,
"eval_sts-dev-768_pearson_cosine": 0.7700346176363122,
"eval_sts-dev-768_spearman_cosine": 0.7779312583550618,
"eval_sts-dev-960_pearson_cosine": 0.7737155642846232,
"eval_sts-dev-960_spearman_cosine": 0.7807350237124752,
"step": 10000
},
{
"epoch": 0.2862756104406399,
"grad_norm": 836897.8125,
"learning_rate": 3.9651354975520004e-05,
"loss": 0.5845,
"step": 10200
},
{
"epoch": 0.2918888577041819,
"grad_norm": 735510.0625,
"learning_rate": 3.933950790532323e-05,
"loss": 0.6018,
"step": 10400
},
{
"epoch": 0.29469548133595286,
"eval_loss": 4.563432216644287,
"eval_runtime": 77.4091,
"eval_samples_per_second": 85.378,
"eval_sequential_score": 0.7901603432215759,
"eval_steps_per_second": 1.344,
"eval_sts-dev-1152_pearson_cosine": 0.7810780782124218,
"eval_sts-dev-1152_spearman_cosine": 0.7901603432215759,
"eval_sts-dev-512_pearson_cosine": 0.7767016322826037,
"eval_sts-dev-512_spearman_cosine": 0.787071277210133,
"eval_sts-dev-768_pearson_cosine": 0.7756406085198688,
"eval_sts-dev-768_spearman_cosine": 0.786612173354875,
"eval_sts-dev-960_pearson_cosine": 0.7789839662649704,
"eval_sts-dev-960_spearman_cosine": 0.7885719593916782,
"step": 10500
},
{
"epoch": 0.29750210496772383,
"grad_norm": 1244390.5,
"learning_rate": 3.902766083512646e-05,
"loss": 0.5859,
"step": 10600
},
{
"epoch": 0.3031153522312658,
"grad_norm": 1826701.25,
"learning_rate": 3.871581376492968e-05,
"loss": 0.5933,
"step": 10800
},
{
"epoch": 0.30872859949480774,
"grad_norm": 1036837.75,
"learning_rate": 3.840396669473291e-05,
"loss": 0.5717,
"step": 11000
},
{
"epoch": 0.30872859949480774,
"eval_loss": 4.405139446258545,
"eval_runtime": 80.2702,
"eval_samples_per_second": 82.334,
"eval_sequential_score": 0.7930013510501455,
"eval_steps_per_second": 1.296,
"eval_sts-dev-1152_pearson_cosine": 0.7842500678425964,
"eval_sts-dev-1152_spearman_cosine": 0.7930013510501455,
"eval_sts-dev-512_pearson_cosine": 0.7804560477931245,
"eval_sts-dev-512_spearman_cosine": 0.790194108314055,
"eval_sts-dev-768_pearson_cosine": 0.7795286908922767,
"eval_sts-dev-768_spearman_cosine": 0.7903488938053814,
"eval_sts-dev-960_pearson_cosine": 0.7823368063838988,
"eval_sts-dev-960_spearman_cosine": 0.7916644882542199,
"step": 11000
},
{
"epoch": 0.3143418467583497,
"grad_norm": 505930.4375,
"learning_rate": 3.809211962453613e-05,
"loss": 0.5719,
"step": 11200
},
{
"epoch": 0.31995509402189165,
"grad_norm": 573586.3125,
"learning_rate": 3.7780272554339355e-05,
"loss": 0.5422,
"step": 11400
},
{
"epoch": 0.3227617176536626,
"eval_loss": 4.44298791885376,
"eval_runtime": 79.3492,
"eval_samples_per_second": 83.29,
"eval_sequential_score": 0.7942242704582879,
"eval_steps_per_second": 1.311,
"eval_sts-dev-1152_pearson_cosine": 0.7875744329285601,
"eval_sts-dev-1152_spearman_cosine": 0.7942242704582879,
"eval_sts-dev-512_pearson_cosine": 0.7840443572822089,
"eval_sts-dev-512_spearman_cosine": 0.7920209001098614,
"eval_sts-dev-768_pearson_cosine": 0.7834004871669387,
"eval_sts-dev-768_spearman_cosine": 0.7917739294182365,
"eval_sts-dev-960_pearson_cosine": 0.7857857853856216,
"eval_sts-dev-960_spearman_cosine": 0.7931912176322746,
"step": 11500
},
{
"epoch": 0.3255683412854336,
"grad_norm": 2376617.5,
"learning_rate": 3.7468425484142576e-05,
"loss": 0.527,
"step": 11600
},
{
"epoch": 0.33118158854897556,
"grad_norm": 1004661.5,
"learning_rate": 3.7156578413945803e-05,
"loss": 0.5291,
"step": 11800
},
{
"epoch": 0.33679483581251757,
"grad_norm": 681331.25,
"learning_rate": 3.684473134374903e-05,
"loss": 0.542,
"step": 12000
},
{
"epoch": 0.33679483581251757,
"eval_loss": 4.3000807762146,
"eval_runtime": 78.0165,
"eval_samples_per_second": 84.713,
"eval_sequential_score": 0.7933961511062919,
"eval_steps_per_second": 1.333,
"eval_sts-dev-1152_pearson_cosine": 0.7901723077330912,
"eval_sts-dev-1152_spearman_cosine": 0.7933961511062919,
"eval_sts-dev-512_pearson_cosine": 0.7863821340291279,
"eval_sts-dev-512_spearman_cosine": 0.7907396171652296,
"eval_sts-dev-768_pearson_cosine": 0.7861497979708226,
"eval_sts-dev-768_spearman_cosine": 0.7905983485083213,
"eval_sts-dev-960_pearson_cosine": 0.7892627814382922,
"eval_sts-dev-960_spearman_cosine": 0.7927711835546136,
"step": 12000
},
{
"epoch": 0.3424080830760595,
"grad_norm": 339770.1875,
"learning_rate": 3.653288427355225e-05,
"loss": 0.5213,
"step": 12200
},
{
"epoch": 0.3480213303396015,
"grad_norm": 16329254.0,
"learning_rate": 3.622103720335547e-05,
"loss": 0.5226,
"step": 12400
},
{
"epoch": 0.35082795397137245,
"eval_loss": 4.8531880378723145,
"eval_runtime": 78.4497,
"eval_samples_per_second": 84.245,
"eval_sequential_score": 0.7723953575556768,
"eval_steps_per_second": 1.326,
"eval_sts-dev-1152_pearson_cosine": 0.7559096984425644,
"eval_sts-dev-1152_spearman_cosine": 0.7723953575556768,
"eval_sts-dev-512_pearson_cosine": 0.7513107904922858,
"eval_sts-dev-512_spearman_cosine": 0.7687285653830139,
"eval_sts-dev-768_pearson_cosine": 0.7485837494255871,
"eval_sts-dev-768_spearman_cosine": 0.7673974329389454,
"eval_sts-dev-960_pearson_cosine": 0.7527733497014006,
"eval_sts-dev-960_spearman_cosine": 0.7700910811459013,
"step": 12500
},
{
"epoch": 0.35363457760314343,
"grad_norm": 751577.375,
"learning_rate": 3.59091901331587e-05,
"loss": 0.5111,
"step": 12600
},
{
"epoch": 0.3592478248666854,
"grad_norm": 1020267.75,
"learning_rate": 3.559734306296193e-05,
"loss": 0.51,
"step": 12800
},
{
"epoch": 0.36486107213022734,
"grad_norm": 2245223.25,
"learning_rate": 3.528549599276515e-05,
"loss": 0.5439,
"step": 13000
},
{
"epoch": 0.36486107213022734,
"eval_loss": 4.562457084655762,
"eval_runtime": 76.5025,
"eval_samples_per_second": 86.389,
"eval_sequential_score": 0.7871283857415448,
"eval_steps_per_second": 1.359,
"eval_sts-dev-1152_pearson_cosine": 0.7813516950726106,
"eval_sts-dev-1152_spearman_cosine": 0.7871283857415448,
"eval_sts-dev-512_pearson_cosine": 0.7790901108872998,
"eval_sts-dev-512_spearman_cosine": 0.785904654893658,
"eval_sts-dev-768_pearson_cosine": 0.7768149257929241,
"eval_sts-dev-768_spearman_cosine": 0.7839641875290246,
"eval_sts-dev-960_pearson_cosine": 0.7799312425424749,
"eval_sts-dev-960_spearman_cosine": 0.7859198424326749,
"step": 13000
},
{
"epoch": 0.3704743193937693,
"grad_norm": 295381.71875,
"learning_rate": 3.4973648922568375e-05,
"loss": 0.4944,
"step": 13200
},
{
"epoch": 0.37608756665731125,
"grad_norm": 258496.15625,
"learning_rate": 3.46618018523716e-05,
"loss": 0.5055,
"step": 13400
},
{
"epoch": 0.3788941902890822,
"eval_loss": 4.690097808837891,
"eval_runtime": 77.0112,
"eval_samples_per_second": 85.819,
"eval_sequential_score": 0.7716690451444436,
"eval_steps_per_second": 1.35,
"eval_sts-dev-1152_pearson_cosine": 0.7550740608400445,
"eval_sts-dev-1152_spearman_cosine": 0.7716690451444436,
"eval_sts-dev-512_pearson_cosine": 0.7512508042826231,
"eval_sts-dev-512_spearman_cosine": 0.7686861357730667,
"eval_sts-dev-768_pearson_cosine": 0.7485196351380123,
"eval_sts-dev-768_spearman_cosine": 0.7674469031229442,
"eval_sts-dev-960_pearson_cosine": 0.7531056155361794,
"eval_sts-dev-960_spearman_cosine": 0.770399492414998,
"step": 13500
},
{
"epoch": 0.3817008139208532,
"grad_norm": 1523134.625,
"learning_rate": 3.4349954782174824e-05,
"loss": 0.4914,
"step": 13600
},
{
"epoch": 0.38731406118439515,
"grad_norm": 824847.5625,
"learning_rate": 3.4038107711978044e-05,
"loss": 0.4832,
"step": 13800
},
{
"epoch": 0.3929273084479371,
"grad_norm": 942315.875,
"learning_rate": 3.372626064178127e-05,
"loss": 0.4974,
"step": 14000
},
{
"epoch": 0.3929273084479371,
"eval_loss": 4.3223676681518555,
"eval_runtime": 76.7925,
"eval_samples_per_second": 86.063,
"eval_sequential_score": 0.7833165592743722,
"eval_steps_per_second": 1.354,
"eval_sts-dev-1152_pearson_cosine": 0.7744088760069867,
"eval_sts-dev-1152_spearman_cosine": 0.7833165592743722,
"eval_sts-dev-512_pearson_cosine": 0.7698343591511576,
"eval_sts-dev-512_spearman_cosine": 0.7802103790467256,
"eval_sts-dev-768_pearson_cosine": 0.7695304076449843,
"eval_sts-dev-768_spearman_cosine": 0.780087860180158,
"eval_sts-dev-960_pearson_cosine": 0.7727501854173389,
"eval_sts-dev-960_spearman_cosine": 0.7819591085796292,
"step": 14000
},
{
"epoch": 0.3985405557114791,
"grad_norm": 755688.0625,
"learning_rate": 3.34144135715845e-05,
"loss": 0.4834,
"step": 14200
},
{
"epoch": 0.40415380297502107,
"grad_norm": 1176346.25,
"learning_rate": 3.310256650138772e-05,
"loss": 0.4526,
"step": 14400
},
{
"epoch": 0.40696042660679205,
"eval_loss": 4.603125095367432,
"eval_runtime": 77.0079,
"eval_samples_per_second": 85.822,
"eval_sequential_score": 0.7798168210294973,
"eval_steps_per_second": 1.351,
"eval_sts-dev-1152_pearson_cosine": 0.7695286357100837,
"eval_sts-dev-1152_spearman_cosine": 0.7798168210294973,
"eval_sts-dev-512_pearson_cosine": 0.7680031892756588,
"eval_sts-dev-512_spearman_cosine": 0.7793198654141038,
"eval_sts-dev-768_pearson_cosine": 0.765316531020936,
"eval_sts-dev-768_spearman_cosine": 0.7771469239892022,
"eval_sts-dev-960_pearson_cosine": 0.7678963030870366,
"eval_sts-dev-960_spearman_cosine": 0.7788543197202844,
"step": 14500
},
{
"epoch": 0.409767050238563,
"grad_norm": 555539.0625,
"learning_rate": 3.279071943119095e-05,
"loss": 0.4621,
"step": 14600
},
{
"epoch": 0.415380297502105,
"grad_norm": 293199.1875,
"learning_rate": 3.2478872360994175e-05,
"loss": 0.4483,
"step": 14800
},
{
"epoch": 0.42099354476564693,
"grad_norm": 1106311.5,
"learning_rate": 3.2167025290797396e-05,
"loss": 0.4422,
"step": 15000
},
{
"epoch": 0.42099354476564693,
"eval_loss": 4.340238094329834,
"eval_runtime": 78.5719,
"eval_samples_per_second": 84.114,
"eval_sequential_score": 0.7852896447445069,
"eval_steps_per_second": 1.324,
"eval_sts-dev-1152_pearson_cosine": 0.7790739260952181,
"eval_sts-dev-1152_spearman_cosine": 0.7852896447445069,
"eval_sts-dev-512_pearson_cosine": 0.7764628401440794,
"eval_sts-dev-512_spearman_cosine": 0.7835744067282309,
"eval_sts-dev-768_pearson_cosine": 0.7743919883086495,
"eval_sts-dev-768_spearman_cosine": 0.7820714131385429,
"eval_sts-dev-960_pearson_cosine": 0.7773513056662522,
"eval_sts-dev-960_spearman_cosine": 0.7840212510775064,
"step": 15000
},
{
"epoch": 0.4266067920291889,
"grad_norm": 610679.5625,
"learning_rate": 3.1855178220600616e-05,
"loss": 0.4144,
"step": 15200
},
{
"epoch": 0.43222003929273084,
"grad_norm": 687958.4375,
"learning_rate": 3.1543331150403844e-05,
"loss": 0.4099,
"step": 15400
},
{
"epoch": 0.4350266629245018,
"eval_loss": 4.453821182250977,
"eval_runtime": 77.8538,
"eval_samples_per_second": 84.89,
"eval_sequential_score": 0.7885469182471105,
"eval_steps_per_second": 1.336,
"eval_sts-dev-1152_pearson_cosine": 0.7803450152456344,
"eval_sts-dev-1152_spearman_cosine": 0.7885469182471105,
"eval_sts-dev-512_pearson_cosine": 0.7769126522889329,
"eval_sts-dev-512_spearman_cosine": 0.7860781907940958,
"eval_sts-dev-768_pearson_cosine": 0.7768788991918325,
"eval_sts-dev-768_spearman_cosine": 0.7860822710268899,
"eval_sts-dev-960_pearson_cosine": 0.7790539464640599,
"eval_sts-dev-960_spearman_cosine": 0.7873534072847509,
"step": 15500
},
{
"epoch": 0.4378332865562728,
"grad_norm": 465139.34375,
"learning_rate": 3.123148408020707e-05,
"loss": 0.4196,
"step": 15600
},
{
"epoch": 0.44344653381981475,
"grad_norm": 1473414.0,
"learning_rate": 3.091963701001029e-05,
"loss": 0.4273,
"step": 15800
},
{
"epoch": 0.4490597810833567,
"grad_norm": 1426157.25,
"learning_rate": 3.060778993981352e-05,
"loss": 1.9924,
"step": 16000
},
{
"epoch": 0.4490597810833567,
"eval_loss": 3.5090487003326416,
"eval_runtime": 79.6383,
"eval_samples_per_second": 82.988,
"eval_sequential_score": 0.7927361346499643,
"eval_steps_per_second": 1.306,
"eval_sts-dev-1152_pearson_cosine": 0.790420349742375,
"eval_sts-dev-1152_spearman_cosine": 0.7927361346499643,
"eval_sts-dev-512_pearson_cosine": 0.7865016349246065,
"eval_sts-dev-512_spearman_cosine": 0.7896668024220279,
"eval_sts-dev-768_pearson_cosine": 0.7865658784180989,
"eval_sts-dev-768_spearman_cosine": 0.7894374189283365,
"eval_sts-dev-960_pearson_cosine": 0.78835917820683,
"eval_sts-dev-960_spearman_cosine": 0.7907010125477784,
"step": 16000
},
{
"epoch": 0.45467302834689866,
"grad_norm": 2128073.0,
"learning_rate": 3.029594286961674e-05,
"loss": 2.0174,
"step": 16200
},
{
"epoch": 0.46028627561044066,
"grad_norm": 1494404.25,
"learning_rate": 2.9984095799419964e-05,
"loss": 1.9566,
"step": 16400
},
{
"epoch": 0.46309289924221164,
"eval_loss": 3.305478572845459,
"eval_runtime": 76.7558,
"eval_samples_per_second": 86.104,
"eval_sequential_score": 0.8035377612749389,
"eval_steps_per_second": 1.355,
"eval_sts-dev-1152_pearson_cosine": 0.7997045755932278,
"eval_sts-dev-1152_spearman_cosine": 0.8035377612749389,
"eval_sts-dev-512_pearson_cosine": 0.7962257322892368,
"eval_sts-dev-512_spearman_cosine": 0.8010941658538488,
"eval_sts-dev-768_pearson_cosine": 0.7967126438007915,
"eval_sts-dev-768_spearman_cosine": 0.8016654322418212,
"eval_sts-dev-960_pearson_cosine": 0.7985639882269593,
"eval_sts-dev-960_spearman_cosine": 0.8025996694295968,
"step": 16500
},
{
"epoch": 0.4658995228739826,
"grad_norm": 21955176.0,
"learning_rate": 2.967224872922319e-05,
"loss": 1.8733,
"step": 16600
},
{
"epoch": 0.4715127701375246,
"grad_norm": 1634486.5,
"learning_rate": 2.9360401659026416e-05,
"loss": 1.8465,
"step": 16800
},
{
"epoch": 0.4771260174010665,
"grad_norm": 1099246.25,
"learning_rate": 2.904855458882964e-05,
"loss": 1.8083,
"step": 17000
},
{
"epoch": 0.4771260174010665,
"eval_loss": 3.1462347507476807,
"eval_runtime": 76.7838,
"eval_samples_per_second": 86.073,
"eval_sequential_score": 0.8044901468397658,
"eval_steps_per_second": 1.354,
"eval_sts-dev-1152_pearson_cosine": 0.8003150062678777,
"eval_sts-dev-1152_spearman_cosine": 0.8044901468397658,
"eval_sts-dev-512_pearson_cosine": 0.7967573482929446,
"eval_sts-dev-512_spearman_cosine": 0.8015237418575695,
"eval_sts-dev-768_pearson_cosine": 0.7971621831925142,
"eval_sts-dev-768_spearman_cosine": 0.801918123779033,
"eval_sts-dev-960_pearson_cosine": 0.7989525078627395,
"eval_sts-dev-960_spearman_cosine": 0.8033675149464842,
"step": 17000
},
{
"epoch": 0.4827392646646085,
"grad_norm": 1532188.125,
"learning_rate": 2.8736707518632867e-05,
"loss": 1.7193,
"step": 17200
},
{
"epoch": 0.48835251192815043,
"grad_norm": 1495786.125,
"learning_rate": 2.8424860448436085e-05,
"loss": 1.7423,
"step": 17400
},
{
"epoch": 0.4911591355599214,
"eval_loss": 3.0544025897979736,
"eval_runtime": 77.4424,
"eval_samples_per_second": 85.341,
"eval_sequential_score": 0.8085294051469485,
"eval_steps_per_second": 1.343,
"eval_sts-dev-1152_pearson_cosine": 0.806796528773235,
"eval_sts-dev-1152_spearman_cosine": 0.8085294051469485,
"eval_sts-dev-512_pearson_cosine": 0.8042370140263899,
"eval_sts-dev-512_spearman_cosine": 0.8066761351903039,
"eval_sts-dev-768_pearson_cosine": 0.8033611078675769,
"eval_sts-dev-768_spearman_cosine": 0.8060225357046799,
"eval_sts-dev-960_pearson_cosine": 0.8053597943880864,
"eval_sts-dev-960_spearman_cosine": 0.8073196764122358,
"step": 17500
},
{
"epoch": 0.4939657591916924,
"grad_norm": 1554646.625,
"learning_rate": 2.8113013378239312e-05,
"loss": 1.6114,
"step": 17600
},
{
"epoch": 0.49957900645523434,
"grad_norm": 1555168.625,
"learning_rate": 2.7801166308042536e-05,
"loss": 1.6524,
"step": 17800
},
{
"epoch": 0.5051922537187763,
"grad_norm": 2316977.25,
"learning_rate": 2.748931923784576e-05,
"loss": 1.568,
"step": 18000
},
{
"epoch": 0.5051922537187763,
"eval_loss": 3.023185968399048,
"eval_runtime": 77.2548,
"eval_samples_per_second": 85.548,
"eval_sequential_score": 0.8160116208449028,
"eval_steps_per_second": 1.346,
"eval_sts-dev-1152_pearson_cosine": 0.8098685109159951,
"eval_sts-dev-1152_spearman_cosine": 0.8160116208449028,
"eval_sts-dev-512_pearson_cosine": 0.8067482146442919,
"eval_sts-dev-512_spearman_cosine": 0.8138083394885887,
"eval_sts-dev-768_pearson_cosine": 0.8066934851514658,
"eval_sts-dev-768_spearman_cosine": 0.8137348986202628,
"eval_sts-dev-960_pearson_cosine": 0.8086043862820518,
"eval_sts-dev-960_spearman_cosine": 0.8147544112729422,
"step": 18000
},
{
"epoch": 0.5108055009823183,
"grad_norm": 1329637.625,
"learning_rate": 2.7177472167648988e-05,
"loss": 1.5263,
"step": 18200
},
{
"epoch": 0.5164187482458602,
"grad_norm": 1966328.0,
"learning_rate": 2.6865625097452212e-05,
"loss": 1.5547,
"step": 18400
},
{
"epoch": 0.5192253718776312,
"eval_loss": 2.870816469192505,
"eval_runtime": 76.6326,
"eval_samples_per_second": 86.243,
"eval_sequential_score": 0.8176598949005246,
"eval_steps_per_second": 1.357,
"eval_sts-dev-1152_pearson_cosine": 0.814666190335647,
"eval_sts-dev-1152_spearman_cosine": 0.8176598949005246,
"eval_sts-dev-512_pearson_cosine": 0.8121636324942605,
"eval_sts-dev-512_spearman_cosine": 0.8159107971728017,
"eval_sts-dev-768_pearson_cosine": 0.8122018182896737,
"eval_sts-dev-768_spearman_cosine": 0.816248481623874,
"eval_sts-dev-960_pearson_cosine": 0.813853882558624,
"eval_sts-dev-960_spearman_cosine": 0.8171316174695632,
"step": 18500
},
{
"epoch": 0.5220319955094022,
"grad_norm": 1405333.625,
"learning_rate": 2.6553778027255433e-05,
"loss": 1.5059,
"step": 18600
},
{
"epoch": 0.5276452427729441,
"grad_norm": 2138761.0,
"learning_rate": 2.6241930957058657e-05,
"loss": 1.4385,
"step": 18800
},
{
"epoch": 0.5332584900364861,
"grad_norm": 2403917.0,
"learning_rate": 2.5930083886861884e-05,
"loss": 1.476,
"step": 19000
},
{
"epoch": 0.5332584900364861,
"eval_loss": 2.9468226432800293,
"eval_runtime": 76.877,
"eval_samples_per_second": 85.969,
"eval_sequential_score": 0.81019252718923,
"eval_steps_per_second": 1.353,
"eval_sts-dev-1152_pearson_cosine": 0.8023306627696717,
"eval_sts-dev-1152_spearman_cosine": 0.81019252718923,
"eval_sts-dev-512_pearson_cosine": 0.8017622030828988,
"eval_sts-dev-512_spearman_cosine": 0.8093692797674851,
"eval_sts-dev-768_pearson_cosine": 0.8003993423781782,
"eval_sts-dev-768_spearman_cosine": 0.8091560440850984,
"eval_sts-dev-960_pearson_cosine": 0.8019211195358515,
"eval_sts-dev-960_spearman_cosine": 0.8101363718932728,
"step": 19000
},
{
"epoch": 0.538871737300028,
"grad_norm": 1055316.25,
"learning_rate": 2.561823681666511e-05,
"loss": 1.4558,
"step": 19200
},
{
"epoch": 0.54448498456357,
"grad_norm": 1079136.125,
"learning_rate": 2.5306389746468333e-05,
"loss": 1.4557,
"step": 19400
},
{
"epoch": 0.547291608195341,
"eval_loss": 2.8981781005859375,
"eval_runtime": 79.4411,
"eval_samples_per_second": 83.194,
"eval_sequential_score": 0.8094444352298047,
"eval_steps_per_second": 1.309,
"eval_sts-dev-1152_pearson_cosine": 0.8031328356570406,
"eval_sts-dev-1152_spearman_cosine": 0.8094444352298047,
"eval_sts-dev-512_pearson_cosine": 0.8001098459602658,
"eval_sts-dev-512_spearman_cosine": 0.8070996808860118,
"eval_sts-dev-768_pearson_cosine": 0.8001671724511775,
"eval_sts-dev-768_spearman_cosine": 0.807121764709248,
"eval_sts-dev-960_pearson_cosine": 0.8020898505862861,
"eval_sts-dev-960_spearman_cosine": 0.8084113769099328,
"step": 19500
},
{
"epoch": 0.550098231827112,
"grad_norm": 1276695.375,
"learning_rate": 2.4994542676271557e-05,
"loss": 1.4552,
"step": 19600
},
{
"epoch": 0.555711479090654,
"grad_norm": 1328590.75,
"learning_rate": 2.468269560607478e-05,
"loss": 1.4342,
"step": 19800
},
{
"epoch": 0.561324726354196,
"grad_norm": 1148225.75,
"learning_rate": 2.4370848535878008e-05,
"loss": 1.4503,
"step": 20000
},
{
"epoch": 0.561324726354196,
"eval_loss": 2.807321310043335,
"eval_runtime": 77.894,
"eval_samples_per_second": 84.846,
"eval_sequential_score": 0.8179010934691895,
"eval_steps_per_second": 1.335,
"eval_sts-dev-1152_pearson_cosine": 0.8140052747106419,
"eval_sts-dev-1152_spearman_cosine": 0.8179010934691895,
"eval_sts-dev-512_pearson_cosine": 0.8113404216022915,
"eval_sts-dev-512_spearman_cosine": 0.8159118696426358,
"eval_sts-dev-768_pearson_cosine": 0.8108446270228379,
"eval_sts-dev-768_spearman_cosine": 0.815575862581075,
"eval_sts-dev-960_pearson_cosine": 0.8128306079700292,
"eval_sts-dev-960_spearman_cosine": 0.8169448135786315,
"step": 20000
},
{
"epoch": 0.5669379736177379,
"grad_norm": 1494490.25,
"learning_rate": 2.405900146568123e-05,
"loss": 1.391,
"step": 20200
},
{
"epoch": 0.5725512208812799,
"grad_norm": 1219068.5,
"learning_rate": 2.3747154395484456e-05,
"loss": 1.3529,
"step": 20400
},
{
"epoch": 0.5753578445130508,
"eval_loss": 2.697373151779175,
"eval_runtime": 77.2603,
"eval_samples_per_second": 85.542,
"eval_sequential_score": 0.818387454814582,
"eval_steps_per_second": 1.346,
"eval_sts-dev-1152_pearson_cosine": 0.814425658636787,
"eval_sts-dev-1152_spearman_cosine": 0.818387454814582,
"eval_sts-dev-512_pearson_cosine": 0.8114864131532289,
"eval_sts-dev-512_spearman_cosine": 0.816474256076468,
"eval_sts-dev-768_pearson_cosine": 0.8111751402507601,
"eval_sts-dev-768_spearman_cosine": 0.8161965909973262,
"eval_sts-dev-960_pearson_cosine": 0.8133366326386466,
"eval_sts-dev-960_spearman_cosine": 0.817509136299924,
"step": 20500
},
{
"epoch": 0.5781644681448218,
"grad_norm": 1434187.0,
"learning_rate": 2.343530732528768e-05,
"loss": 1.3428,
"step": 20600
},
{
"epoch": 0.5837777154083638,
"grad_norm": 9602563.0,
"learning_rate": 2.3123460255090905e-05,
"loss": 1.3401,
"step": 20800
},
{
"epoch": 0.5893909626719057,
"grad_norm": 1751803.25,
"learning_rate": 2.281161318489413e-05,
"loss": 1.3809,
"step": 21000
},
{
"epoch": 0.5893909626719057,
"eval_loss": 2.703920602798462,
"eval_runtime": 77.5938,
"eval_samples_per_second": 85.174,
"eval_sequential_score": 0.8188296877385577,
"eval_steps_per_second": 1.34,
"eval_sts-dev-1152_pearson_cosine": 0.8145212969625442,
"eval_sts-dev-1152_spearman_cosine": 0.8188296877385577,
"eval_sts-dev-512_pearson_cosine": 0.8121156149968475,
"eval_sts-dev-512_spearman_cosine": 0.8170665731423932,
"eval_sts-dev-768_pearson_cosine": 0.8113063721736813,
"eval_sts-dev-768_spearman_cosine": 0.8162345284905963,
"eval_sts-dev-960_pearson_cosine": 0.813392544100018,
"eval_sts-dev-960_spearman_cosine": 0.8177012935138736,
"step": 21000
},
{
"epoch": 0.5950042099354477,
"grad_norm": 1005006.3125,
"learning_rate": 2.2499766114697353e-05,
"loss": 1.3193,
"step": 21200
},
{
"epoch": 0.6006174571989896,
"grad_norm": 2510840.0,
"learning_rate": 2.2187919044500577e-05,
"loss": 1.2531,
"step": 21400
},
{
"epoch": 0.6034240808307606,
"eval_loss": 2.6553146839141846,
"eval_runtime": 76.7412,
"eval_samples_per_second": 86.121,
"eval_sequential_score": 0.8132629704436432,
"eval_steps_per_second": 1.355,
"eval_sts-dev-1152_pearson_cosine": 0.8100792978741007,
"eval_sts-dev-1152_spearman_cosine": 0.8132629704436432,
"eval_sts-dev-512_pearson_cosine": 0.8064513413918455,
"eval_sts-dev-512_spearman_cosine": 0.810722844279224,
"eval_sts-dev-768_pearson_cosine": 0.8070561573065372,
"eval_sts-dev-768_spearman_cosine": 0.8113323371597876,
"eval_sts-dev-960_pearson_cosine": 0.8088718794978185,
"eval_sts-dev-960_spearman_cosine": 0.812389308544286,
"step": 21500
},
{
"epoch": 0.6062307044625316,
"grad_norm": 919804.125,
"learning_rate": 2.18760719743038e-05,
"loss": 1.3294,
"step": 21600
},
{
"epoch": 0.6118439517260735,
"grad_norm": 1363194.375,
"learning_rate": 2.156422490410703e-05,
"loss": 1.3076,
"step": 21800
},
{
"epoch": 0.6174571989896155,
"grad_norm": 1170667.75,
"learning_rate": 2.125237783391025e-05,
"loss": 1.2634,
"step": 22000
},
{
"epoch": 0.6174571989896155,
"eval_loss": 2.615736484527588,
"eval_runtime": 77.1191,
"eval_samples_per_second": 85.699,
"eval_sequential_score": 0.8135000785581838,
"eval_steps_per_second": 1.349,
"eval_sts-dev-1152_pearson_cosine": 0.8105061703368251,
"eval_sts-dev-1152_spearman_cosine": 0.8135000785581838,
"eval_sts-dev-512_pearson_cosine": 0.8074183216846054,
"eval_sts-dev-512_spearman_cosine": 0.8123168589567965,
"eval_sts-dev-768_pearson_cosine": 0.808323777710555,
"eval_sts-dev-768_spearman_cosine": 0.8122644439071114,
"eval_sts-dev-960_pearson_cosine": 0.809931025467389,
"eval_sts-dev-960_spearman_cosine": 0.8131210124611652,
"step": 22000
},
{
"epoch": 0.6230704462531574,
"grad_norm": 2690514.75,
"learning_rate": 2.0940530763713477e-05,
"loss": 1.242,
"step": 22200
},
{
"epoch": 0.6286836935166994,
"grad_norm": 687846.625,
"learning_rate": 2.06286836935167e-05,
"loss": 1.2545,
"step": 22400
},
{
"epoch": 0.6314903171484704,
"eval_loss": 2.6083288192749023,
"eval_runtime": 76.9345,
"eval_samples_per_second": 85.904,
"eval_sequential_score": 0.8198803113021629,
"eval_steps_per_second": 1.352,
"eval_sts-dev-1152_pearson_cosine": 0.8155088381997109,
"eval_sts-dev-1152_spearman_cosine": 0.8198803113021629,
"eval_sts-dev-512_pearson_cosine": 0.8119264111774269,
"eval_sts-dev-512_spearman_cosine": 0.817475857375689,
"eval_sts-dev-768_pearson_cosine": 0.8125582626459574,
"eval_sts-dev-768_spearman_cosine": 0.8178761844864308,
"eval_sts-dev-960_pearson_cosine": 0.8147256189246097,
"eval_sts-dev-960_spearman_cosine": 0.8192721867604403,
"step": 22500
},
{
"epoch": 0.6342969407802413,
"grad_norm": 1860284.25,
"learning_rate": 2.0316836623319925e-05,
"loss": 1.2362,
"step": 22600
},
{
"epoch": 0.6399101880437833,
"grad_norm": 7078587.0,
"learning_rate": 2.000498955312315e-05,
"loss": 1.1474,
"step": 22800
},
{
"epoch": 0.6455234353073253,
"grad_norm": 1408995.125,
"learning_rate": 1.9693142482926373e-05,
"loss": 1.2125,
"step": 23000
},
{
"epoch": 0.6455234353073253,
"eval_loss": 2.5617470741271973,
"eval_runtime": 76.5184,
"eval_samples_per_second": 86.371,
"eval_sequential_score": 0.8207731554186944,
"eval_steps_per_second": 1.359,
"eval_sts-dev-1152_pearson_cosine": 0.8178959248835184,
"eval_sts-dev-1152_spearman_cosine": 0.8207731554186944,
"eval_sts-dev-512_pearson_cosine": 0.8140061268760344,
"eval_sts-dev-512_spearman_cosine": 0.8180611571485459,
"eval_sts-dev-768_pearson_cosine": 0.8150538202678299,
"eval_sts-dev-768_spearman_cosine": 0.8188299877250856,
"eval_sts-dev-960_pearson_cosine": 0.8170381399447351,
"eval_sts-dev-960_spearman_cosine": 0.8200353073212047,
"step": 23000
},
{
"epoch": 0.6511366825708672,
"grad_norm": 1224838.5,
"learning_rate": 1.9381295412729597e-05,
"loss": 1.206,
"step": 23200
},
{
"epoch": 0.6567499298344092,
"grad_norm": 855258.6875,
"learning_rate": 1.906944834253282e-05,
"loss": 1.1236,
"step": 23400
},
{
"epoch": 0.6595565534661801,
"eval_loss": 2.5598337650299072,
"eval_runtime": 77.1811,
"eval_samples_per_second": 85.63,
"eval_sequential_score": 0.8251304266663447,
"eval_steps_per_second": 1.347,
"eval_sts-dev-1152_pearson_cosine": 0.8201666525971009,
"eval_sts-dev-1152_spearman_cosine": 0.8251304266663447,
"eval_sts-dev-512_pearson_cosine": 0.8165225072856572,
"eval_sts-dev-512_spearman_cosine": 0.8231945111392679,
"eval_sts-dev-768_pearson_cosine": 0.8174329120885295,
"eval_sts-dev-768_spearman_cosine": 0.8234756819083271,
"eval_sts-dev-960_pearson_cosine": 0.8193489194006733,
"eval_sts-dev-960_spearman_cosine": 0.8246625530393321,
"step": 23500
},
{
"epoch": 0.6623631770979511,
"grad_norm": 919016.75,
"learning_rate": 1.875760127233605e-05,
"loss": 1.1785,
"step": 23600
},
{
"epoch": 0.6679764243614931,
"grad_norm": 757121.4375,
"learning_rate": 1.844575420213927e-05,
"loss": 1.1376,
"step": 23800
},
{
"epoch": 0.6735896716250351,
"grad_norm": 844441.125,
"learning_rate": 1.8133907131942497e-05,
"loss": 1.1386,
"step": 24000
},
{
"epoch": 0.6735896716250351,
"eval_loss": 2.522897958755493,
"eval_runtime": 79.1821,
"eval_samples_per_second": 83.466,
"eval_sequential_score": 0.8207794501178992,
"eval_steps_per_second": 1.313,
"eval_sts-dev-1152_pearson_cosine": 0.8186757389035366,
"eval_sts-dev-1152_spearman_cosine": 0.8207794501178992,
"eval_sts-dev-512_pearson_cosine": 0.8149579009500032,
"eval_sts-dev-512_spearman_cosine": 0.8186514991034529,
"eval_sts-dev-768_pearson_cosine": 0.8158079152733716,
"eval_sts-dev-768_spearman_cosine": 0.8186101448904235,
"eval_sts-dev-960_pearson_cosine": 0.8173995719285795,
"eval_sts-dev-960_spearman_cosine": 0.8198375969222076,
"step": 24000
},
{
"epoch": 0.6792029188885771,
"grad_norm": 2238345.25,
"learning_rate": 1.782206006174572e-05,
"loss": 1.1293,
"step": 24200
},
{
"epoch": 0.684816166152119,
"grad_norm": 1067979.625,
"learning_rate": 1.7510212991548945e-05,
"loss": 1.101,
"step": 24400
},
{
"epoch": 0.68762278978389,
"eval_loss": 2.542306661605835,
"eval_runtime": 78.4755,
"eval_samples_per_second": 84.217,
"eval_sequential_score": 0.8273983158949195,
"eval_steps_per_second": 1.325,
"eval_sts-dev-1152_pearson_cosine": 0.8233467985109526,
"eval_sts-dev-1152_spearman_cosine": 0.8273983158949195,
"eval_sts-dev-512_pearson_cosine": 0.8191791525417332,
"eval_sts-dev-512_spearman_cosine": 0.8241280875045953,
"eval_sts-dev-768_pearson_cosine": 0.8205748238510661,
"eval_sts-dev-768_spearman_cosine": 0.8253162521368189,
"eval_sts-dev-960_pearson_cosine": 0.8223154809807092,
"eval_sts-dev-960_spearman_cosine": 0.8263287206559005,
"step": 24500
},
{
"epoch": 0.690429413415661,
"grad_norm": 1075801.875,
"learning_rate": 1.719836592135217e-05,
"loss": 1.1306,
"step": 24600
},
{
"epoch": 0.696042660679203,
"grad_norm": 1117327.75,
"learning_rate": 1.6886518851155393e-05,
"loss": 1.0517,
"step": 24800
},
{
"epoch": 0.7016559079427449,
"grad_norm": 1052429.25,
"learning_rate": 1.657467178095862e-05,
"loss": 1.0617,
"step": 25000
},
{
"epoch": 0.7016559079427449,
"eval_loss": 2.4987776279449463,
"eval_runtime": 78.7361,
"eval_samples_per_second": 83.939,
"eval_sequential_score": 0.8318737027912343,
"eval_steps_per_second": 1.321,
"eval_sts-dev-1152_pearson_cosine": 0.828742240805668,
"eval_sts-dev-1152_spearman_cosine": 0.8318737027912343,
"eval_sts-dev-512_pearson_cosine": 0.8253460176937367,
"eval_sts-dev-512_spearman_cosine": 0.8296126954497343,
"eval_sts-dev-768_pearson_cosine": 0.8260069313893207,
"eval_sts-dev-768_spearman_cosine": 0.8299749701259963,
"eval_sts-dev-960_pearson_cosine": 0.8277316254972338,
"eval_sts-dev-960_spearman_cosine": 0.8309505358919553,
"step": 25000
},
{
"epoch": 0.7072691552062869,
"grad_norm": 1933842.375,
"learning_rate": 1.626282471076184e-05,
"loss": 1.0408,
"step": 25200
},
{
"epoch": 0.7128824024698288,
"grad_norm": 1313519.75,
"learning_rate": 1.595097764056507e-05,
"loss": 1.0741,
"step": 25400
},
{
"epoch": 0.7156890261015998,
"eval_loss": 2.4365484714508057,
"eval_runtime": 76.9868,
"eval_samples_per_second": 85.846,
"eval_sequential_score": 0.8275808733889652,
"eval_steps_per_second": 1.351,
"eval_sts-dev-1152_pearson_cosine": 0.8235693092621815,
"eval_sts-dev-1152_spearman_cosine": 0.8275808733889652,
"eval_sts-dev-512_pearson_cosine": 0.8200900106962739,
"eval_sts-dev-512_spearman_cosine": 0.8251150183246924,
"eval_sts-dev-768_pearson_cosine": 0.8206274312682788,
"eval_sts-dev-768_spearman_cosine": 0.8253381212332793,
"eval_sts-dev-960_pearson_cosine": 0.8226055577499416,
"eval_sts-dev-960_spearman_cosine": 0.8267093188750975,
"step": 25500
},
{
"epoch": 0.7184956497333708,
"grad_norm": 2527256.75,
"learning_rate": 1.5639130570368293e-05,
"loss": 1.0373,
"step": 25600
},
{
"epoch": 0.7241088969969127,
"grad_norm": 1548847.25,
"learning_rate": 1.5327283500171517e-05,
"loss": 1.0239,
"step": 25800
},
{
"epoch": 0.7297221442604547,
"grad_norm": 771091.3125,
"learning_rate": 1.5015436429974741e-05,
"loss": 0.9982,
"step": 26000
},
{
"epoch": 0.7297221442604547,
"eval_loss": 2.417414903640747,
"eval_runtime": 76.5981,
"eval_samples_per_second": 86.282,
"eval_sequential_score": 0.8253319450226069,
"eval_steps_per_second": 1.358,
"eval_sts-dev-1152_pearson_cosine": 0.8229695273001155,
"eval_sts-dev-1152_spearman_cosine": 0.8253319450226069,
"eval_sts-dev-512_pearson_cosine": 0.8186723943926887,
"eval_sts-dev-512_spearman_cosine": 0.8223105621184139,
"eval_sts-dev-768_pearson_cosine": 0.8198835227077859,
"eval_sts-dev-768_spearman_cosine": 0.8231779055377496,
"eval_sts-dev-960_pearson_cosine": 0.8221140652235788,
"eval_sts-dev-960_spearman_cosine": 0.8246685858536678,
"step": 26000
},
{
"epoch": 0.7353353915239966,
"grad_norm": 1151759.625,
"learning_rate": 1.4703589359777967e-05,
"loss": 0.9829,
"step": 26200
},
{
"epoch": 0.7409486387875386,
"grad_norm": 1203075.5,
"learning_rate": 1.439174228958119e-05,
"loss": 0.9758,
"step": 26400
},
{
"epoch": 0.7437552624193096,
"eval_loss": 2.421391010284424,
"eval_runtime": 78.2685,
"eval_samples_per_second": 84.44,
"eval_sequential_score": 0.8273907944767567,
"eval_steps_per_second": 1.329,
"eval_sts-dev-1152_pearson_cosine": 0.8245632242048817,
"eval_sts-dev-1152_spearman_cosine": 0.8273907944767567,
"eval_sts-dev-512_pearson_cosine": 0.821320018207968,
"eval_sts-dev-512_spearman_cosine": 0.8249051957336987,
"eval_sts-dev-768_pearson_cosine": 0.8222668265396057,
"eval_sts-dev-768_spearman_cosine": 0.8258229540823162,
"eval_sts-dev-960_pearson_cosine": 0.8239329103520061,
"eval_sts-dev-960_spearman_cosine": 0.8268971547689833,
"step": 26500
},
{
"epoch": 0.7465618860510805,
"grad_norm": 873702.5625,
"learning_rate": 1.4079895219384415e-05,
"loss": 1.0123,
"step": 26600
},
{
"epoch": 0.7521751333146225,
"grad_norm": 1191796.125,
"learning_rate": 1.3768048149187641e-05,
"loss": 1.0156,
"step": 26800
},
{
"epoch": 0.7577883805781644,
"grad_norm": 3614394.75,
"learning_rate": 1.3456201078990863e-05,
"loss": 0.9687,
"step": 27000
},
{
"epoch": 0.7577883805781644,
"eval_loss": 2.381603240966797,
"eval_runtime": 76.1545,
"eval_samples_per_second": 86.784,
"eval_sequential_score": 0.822164187171298,
"eval_steps_per_second": 1.366,
"eval_sts-dev-1152_pearson_cosine": 0.8179990780257289,
"eval_sts-dev-1152_spearman_cosine": 0.822164187171298,
"eval_sts-dev-512_pearson_cosine": 0.8141681847358285,
"eval_sts-dev-512_spearman_cosine": 0.8190636570312585,
"eval_sts-dev-768_pearson_cosine": 0.8154464750993287,
"eval_sts-dev-768_spearman_cosine": 0.8202098023000759,
"eval_sts-dev-960_pearson_cosine": 0.8171115628702965,
"eval_sts-dev-960_spearman_cosine": 0.821364481930215,
"step": 27000
},
{
"epoch": 0.7634016278417064,
"grad_norm": 1148854.5,
"learning_rate": 1.3144354008794087e-05,
"loss": 0.9569,
"step": 27200
},
{
"epoch": 0.7690148751052484,
"grad_norm": 1114442.0,
"learning_rate": 1.2832506938597313e-05,
"loss": 0.9543,
"step": 27400
},
{
"epoch": 0.7718214987370193,
"eval_loss": 2.355195999145508,
"eval_runtime": 76.6074,
"eval_samples_per_second": 86.271,
"eval_sequential_score": 0.8289062223349991,
"eval_steps_per_second": 1.358,
"eval_sts-dev-1152_pearson_cosine": 0.8234371744631814,
"eval_sts-dev-1152_spearman_cosine": 0.8289062223349991,
"eval_sts-dev-512_pearson_cosine": 0.8207871259825303,
"eval_sts-dev-512_spearman_cosine": 0.8272360451735147,
"eval_sts-dev-768_pearson_cosine": 0.8214215565214141,
"eval_sts-dev-768_spearman_cosine": 0.8275606625119857,
"eval_sts-dev-960_pearson_cosine": 0.8229212299901374,
"eval_sts-dev-960_spearman_cosine": 0.8283389770018872,
"step": 27500
},
{
"epoch": 0.7746281223687903,
"grad_norm": 1473106.125,
"learning_rate": 1.2520659868400536e-05,
"loss": 0.9453,
"step": 27600
},
{
"epoch": 0.7802413696323323,
"grad_norm": 1301560.0,
"learning_rate": 1.2208812798203761e-05,
"loss": 0.9948,
"step": 27800
},
{
"epoch": 0.7858546168958742,
"grad_norm": 689391.125,
"learning_rate": 1.1896965728006985e-05,
"loss": 0.9874,
"step": 28000
},
{
"epoch": 0.7858546168958742,
"eval_loss": 2.3867998123168945,
"eval_runtime": 77.0268,
"eval_samples_per_second": 85.801,
"eval_sequential_score": 0.8280466360566167,
"eval_steps_per_second": 1.35,
"eval_sts-dev-1152_pearson_cosine": 0.8240164476876701,
"eval_sts-dev-1152_spearman_cosine": 0.8280466360566167,
"eval_sts-dev-512_pearson_cosine": 0.8215847717028831,
"eval_sts-dev-512_spearman_cosine": 0.8264438807620287,
"eval_sts-dev-768_pearson_cosine": 0.8221793287179034,
"eval_sts-dev-768_spearman_cosine": 0.8268230282109075,
"eval_sts-dev-960_pearson_cosine": 0.8233492812551204,
"eval_sts-dev-960_spearman_cosine": 0.8274233448566846,
"step": 28000
},
{
"epoch": 0.7914678641594162,
"grad_norm": 1482000.5,
"learning_rate": 1.1585118657810211e-05,
"loss": 0.8872,
"step": 28200
},
{
"epoch": 0.7970811114229582,
"grad_norm": 11877957.0,
"learning_rate": 1.1273271587613435e-05,
"loss": 0.9327,
"step": 28400
},
{
"epoch": 0.7998877350547292,
"eval_loss": 2.3834006786346436,
"eval_runtime": 76.4589,
"eval_samples_per_second": 86.439,
"eval_sequential_score": 0.8250187145214471,
"eval_steps_per_second": 1.36,
"eval_sts-dev-1152_pearson_cosine": 0.821012856551093,
"eval_sts-dev-1152_spearman_cosine": 0.8250187145214471,
"eval_sts-dev-512_pearson_cosine": 0.8184871654411439,
"eval_sts-dev-512_spearman_cosine": 0.823483460862761,
"eval_sts-dev-768_pearson_cosine": 0.8190353483169376,
"eval_sts-dev-768_spearman_cosine": 0.8237308290024404,
"eval_sts-dev-960_pearson_cosine": 0.8205974063302174,
"eval_sts-dev-960_spearman_cosine": 0.8245795527826257,
"step": 28500
},
{
"epoch": 0.8026943586865002,
"grad_norm": 993543.625,
"learning_rate": 1.096142451741666e-05,
"loss": 0.8715,
"step": 28600
},
{
"epoch": 0.8083076059500421,
"grad_norm": 1650682.25,
"learning_rate": 1.0649577447219884e-05,
"loss": 0.9566,
"step": 28800
},
{
"epoch": 0.8139208532135841,
"grad_norm": 1086581.625,
"learning_rate": 1.0337730377023108e-05,
"loss": 0.9265,
"step": 29000
},
{
"epoch": 0.8139208532135841,
"eval_loss": 2.345508337020874,
"eval_runtime": 78.2528,
"eval_samples_per_second": 84.457,
"eval_sequential_score": 0.8306689922163598,
"eval_steps_per_second": 1.329,
"eval_sts-dev-1152_pearson_cosine": 0.8264443610084379,
"eval_sts-dev-1152_spearman_cosine": 0.8306689922163598,
"eval_sts-dev-512_pearson_cosine": 0.8238103920299558,
"eval_sts-dev-512_spearman_cosine": 0.8293245725151981,
"eval_sts-dev-768_pearson_cosine": 0.8243518007889306,
"eval_sts-dev-768_spearman_cosine": 0.8293091429698137,
"eval_sts-dev-960_pearson_cosine": 0.8258566703064338,
"eval_sts-dev-960_spearman_cosine": 0.830247434103489,
"step": 29000
}
],
"logging_steps": 200,
"max_steps": 35630,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}