{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.97880210556267,
"eval_steps": 250,
"global_step": 4390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022762839664248115,
"grad_norm": 7.620693683624268,
"learning_rate": 7.112628471851774e-09,
"loss": 0.3524,
"step": 10
},
{
"epoch": 0.04552567932849623,
"grad_norm": 6.552708625793457,
"learning_rate": 1.4225256943703548e-08,
"loss": 0.3496,
"step": 20
},
{
"epoch": 0.06828851899274435,
"grad_norm": 7.1639084815979,
"learning_rate": 2.1337885415555322e-08,
"loss": 0.3515,
"step": 30
},
{
"epoch": 0.09105135865699246,
"grad_norm": 6.914605617523193,
"learning_rate": 2.8450513887407095e-08,
"loss": 0.348,
"step": 40
},
{
"epoch": 0.11381419832124058,
"grad_norm": 6.548000812530518,
"learning_rate": 3.556314235925887e-08,
"loss": 0.3409,
"step": 50
},
{
"epoch": 0.1365770379854887,
"grad_norm": 7.1885857582092285,
"learning_rate": 4.2675770831110644e-08,
"loss": 0.347,
"step": 60
},
{
"epoch": 0.1593398776497368,
"grad_norm": 7.09645938873291,
"learning_rate": 4.978839930296241e-08,
"loss": 0.3377,
"step": 70
},
{
"epoch": 0.18210271731398492,
"grad_norm": 6.552192687988281,
"learning_rate": 5.690102777481419e-08,
"loss": 0.3317,
"step": 80
},
{
"epoch": 0.20486555697823303,
"grad_norm": 6.317521095275879,
"learning_rate": 6.401365624666596e-08,
"loss": 0.3279,
"step": 90
},
{
"epoch": 0.22762839664248116,
"grad_norm": 6.849682807922363,
"learning_rate": 7.112628471851774e-08,
"loss": 0.3264,
"step": 100
},
{
"epoch": 0.25039123630672927,
"grad_norm": 6.271164894104004,
"learning_rate": 7.823891319036951e-08,
"loss": 0.3116,
"step": 110
},
{
"epoch": 0.2731540759709774,
"grad_norm": 6.1660895347595215,
"learning_rate": 8.535154166222129e-08,
"loss": 0.3055,
"step": 120
},
{
"epoch": 0.2959169156352255,
"grad_norm": 5.520643711090088,
"learning_rate": 9.246417013407305e-08,
"loss": 0.3042,
"step": 130
},
{
"epoch": 0.3186797552994736,
"grad_norm": 5.591308116912842,
"learning_rate": 9.957679860592482e-08,
"loss": 0.2928,
"step": 140
},
{
"epoch": 0.3414425949637217,
"grad_norm": 5.716772079467773,
"learning_rate": 1.0668942707777661e-07,
"loss": 0.2835,
"step": 150
},
{
"epoch": 0.36420543462796984,
"grad_norm": 5.372351169586182,
"learning_rate": 1.1380205554962838e-07,
"loss": 0.2665,
"step": 160
},
{
"epoch": 0.386968274292218,
"grad_norm": 5.111922264099121,
"learning_rate": 1.2091468402148016e-07,
"loss": 0.2665,
"step": 170
},
{
"epoch": 0.40973111395646605,
"grad_norm": 4.756952285766602,
"learning_rate": 1.2802731249333193e-07,
"loss": 0.2486,
"step": 180
},
{
"epoch": 0.4324939536207142,
"grad_norm": 4.510580539703369,
"learning_rate": 1.351399409651837e-07,
"loss": 0.2387,
"step": 190
},
{
"epoch": 0.4552567932849623,
"grad_norm": 4.627274990081787,
"learning_rate": 1.4225256943703549e-07,
"loss": 0.2283,
"step": 200
},
{
"epoch": 0.4780196329492104,
"grad_norm": 4.426296234130859,
"learning_rate": 1.4936519790888725e-07,
"loss": 0.2237,
"step": 210
},
{
"epoch": 0.5007824726134585,
"grad_norm": 4.098926544189453,
"learning_rate": 1.5647782638073902e-07,
"loss": 0.2204,
"step": 220
},
{
"epoch": 0.5235453122777066,
"grad_norm": 3.592745065689087,
"learning_rate": 1.635904548525908e-07,
"loss": 0.205,
"step": 230
},
{
"epoch": 0.5463081519419548,
"grad_norm": 3.4670684337615967,
"learning_rate": 1.7070308332444258e-07,
"loss": 0.2002,
"step": 240
},
{
"epoch": 0.5690709916062029,
"grad_norm": 3.4649081230163574,
"learning_rate": 1.7781571179629434e-07,
"loss": 0.1904,
"step": 250
},
{
"epoch": 0.5690709916062029,
"eval_loss": 0.03304145112633705,
"eval_runtime": 3.1631,
"eval_samples_per_second": 474.224,
"eval_steps_per_second": 7.588,
"eval_sts_dev_pearson_cosine": 0.7871582161337227,
"eval_sts_dev_pearson_dot": 0.712522215037403,
"eval_sts_dev_pearson_euclidean": 0.7266842290027538,
"eval_sts_dev_pearson_manhattan": 0.7281230919411741,
"eval_sts_dev_pearson_max": 0.7871582161337227,
"eval_sts_dev_spearman_cosine": 0.7920626405951849,
"eval_sts_dev_spearman_dot": 0.7029699912720668,
"eval_sts_dev_spearman_euclidean": 0.7209681248102595,
"eval_sts_dev_spearman_manhattan": 0.7231977500622078,
"eval_sts_dev_spearman_max": 0.7920626405951849,
"step": 250
},
{
"epoch": 0.591833831270451,
"grad_norm": 2.9149794578552246,
"learning_rate": 1.849283402681461e-07,
"loss": 0.1834,
"step": 260
},
{
"epoch": 0.614596670934699,
"grad_norm": 2.7359046936035156,
"learning_rate": 1.9204096873999788e-07,
"loss": 0.1776,
"step": 270
},
{
"epoch": 0.6373595105989472,
"grad_norm": 2.24949312210083,
"learning_rate": 1.9915359721184964e-07,
"loss": 0.1665,
"step": 280
},
{
"epoch": 0.6601223502631953,
"grad_norm": 2.2236201763153076,
"learning_rate": 2.062662256837014e-07,
"loss": 0.1625,
"step": 290
},
{
"epoch": 0.6828851899274434,
"grad_norm": 1.932987928390503,
"learning_rate": 2.1337885415555323e-07,
"loss": 0.1585,
"step": 300
},
{
"epoch": 0.7056480295916916,
"grad_norm": 1.819048285484314,
"learning_rate": 2.20491482627405e-07,
"loss": 0.1522,
"step": 310
},
{
"epoch": 0.7284108692559397,
"grad_norm": 1.5515508651733398,
"learning_rate": 2.2760411109925676e-07,
"loss": 0.1552,
"step": 320
},
{
"epoch": 0.7511737089201878,
"grad_norm": 1.398522973060608,
"learning_rate": 2.3471673957110853e-07,
"loss": 0.1448,
"step": 330
},
{
"epoch": 0.773936548584436,
"grad_norm": 1.3401200771331787,
"learning_rate": 2.418293680429603e-07,
"loss": 0.1428,
"step": 340
},
{
"epoch": 0.796699388248684,
"grad_norm": 0.7522925734519958,
"learning_rate": 2.4894199651481206e-07,
"loss": 0.1401,
"step": 350
},
{
"epoch": 0.8194622279129321,
"grad_norm": 0.8474672436714172,
"learning_rate": 2.5605462498666385e-07,
"loss": 0.1399,
"step": 360
},
{
"epoch": 0.8422250675771803,
"grad_norm": 0.7029187083244324,
"learning_rate": 2.631672534585156e-07,
"loss": 0.1389,
"step": 370
},
{
"epoch": 0.8649879072414284,
"grad_norm": 0.7115994095802307,
"learning_rate": 2.702798819303674e-07,
"loss": 0.1372,
"step": 380
},
{
"epoch": 0.8877507469056765,
"grad_norm": 0.8128587007522583,
"learning_rate": 2.773925104022192e-07,
"loss": 0.1338,
"step": 390
},
{
"epoch": 0.9105135865699246,
"grad_norm": 0.694238543510437,
"learning_rate": 2.8450513887407097e-07,
"loss": 0.1361,
"step": 400
},
{
"epoch": 0.9332764262341727,
"grad_norm": 0.6458141207695007,
"learning_rate": 2.916177673459227e-07,
"loss": 0.1389,
"step": 410
},
{
"epoch": 0.9560392658984208,
"grad_norm": 0.653184711933136,
"learning_rate": 2.987303958177745e-07,
"loss": 0.1328,
"step": 420
},
{
"epoch": 0.9788021055626689,
"grad_norm": 0.7768956422805786,
"learning_rate": 3.0584302428962624e-07,
"loss": 0.1375,
"step": 430
},
{
"epoch": 1.0,
"grad_norm": 0.32018622756004333,
"learning_rate": 3.1295565276147804e-07,
"loss": 0.1266,
"step": 440
},
{
"epoch": 1.022762839664248,
"grad_norm": 0.6019369959831238,
"learning_rate": 3.200682812333298e-07,
"loss": 0.1269,
"step": 450
},
{
"epoch": 1.0455256793284962,
"grad_norm": 0.5142800211906433,
"learning_rate": 3.271809097051816e-07,
"loss": 0.1262,
"step": 460
},
{
"epoch": 1.0682885189927442,
"grad_norm": 0.5277538299560547,
"learning_rate": 3.342935381770333e-07,
"loss": 0.127,
"step": 470
},
{
"epoch": 1.0910513586569925,
"grad_norm": 0.6493993401527405,
"learning_rate": 3.4140616664888515e-07,
"loss": 0.1306,
"step": 480
},
{
"epoch": 1.1138141983212406,
"grad_norm": 0.49625104665756226,
"learning_rate": 3.485187951207369e-07,
"loss": 0.1266,
"step": 490
},
{
"epoch": 1.1365770379854887,
"grad_norm": 0.4953573942184448,
"learning_rate": 3.556314235925887e-07,
"loss": 0.1247,
"step": 500
},
{
"epoch": 1.1365770379854887,
"eval_loss": 0.04051072895526886,
"eval_runtime": 3.1508,
"eval_samples_per_second": 476.068,
"eval_steps_per_second": 7.617,
"eval_sts_dev_pearson_cosine": 0.7884134511298457,
"eval_sts_dev_pearson_dot": 0.7101637689598334,
"eval_sts_dev_pearson_euclidean": 0.7398882194893972,
"eval_sts_dev_pearson_manhattan": 0.7407996939857429,
"eval_sts_dev_pearson_max": 0.7884134511298457,
"eval_sts_dev_spearman_cosine": 0.7995145632068007,
"eval_sts_dev_spearman_dot": 0.6959219164369063,
"eval_sts_dev_spearman_euclidean": 0.7373979245023166,
"eval_sts_dev_spearman_manhattan": 0.7393614960639477,
"eval_sts_dev_spearman_max": 0.7995145632068007,
"step": 500
},
{
"epoch": 1.1593398776497368,
"grad_norm": 0.6302499771118164,
"learning_rate": 3.627440520644404e-07,
"loss": 0.1258,
"step": 510
},
{
"epoch": 1.1821027173139849,
"grad_norm": 0.559424638748169,
"learning_rate": 3.698566805362922e-07,
"loss": 0.1277,
"step": 520
},
{
"epoch": 1.204865556978233,
"grad_norm": 0.5976749658584595,
"learning_rate": 3.76969309008144e-07,
"loss": 0.13,
"step": 530
},
{
"epoch": 1.2276283966424812,
"grad_norm": 0.6781278252601624,
"learning_rate": 3.8408193747999575e-07,
"loss": 0.1291,
"step": 540
},
{
"epoch": 1.2503912363067293,
"grad_norm": 0.5501216053962708,
"learning_rate": 3.9119456595184754e-07,
"loss": 0.1287,
"step": 550
},
{
"epoch": 1.2731540759709774,
"grad_norm": 0.49228161573410034,
"learning_rate": 3.983071944236993e-07,
"loss": 0.1233,
"step": 560
},
{
"epoch": 1.2959169156352255,
"grad_norm": 0.5892780423164368,
"learning_rate": 4.054198228955511e-07,
"loss": 0.1242,
"step": 570
},
{
"epoch": 1.3186797552994736,
"grad_norm": 0.5604830980300903,
"learning_rate": 4.125324513674028e-07,
"loss": 0.1242,
"step": 580
},
{
"epoch": 1.3414425949637216,
"grad_norm": 0.46688252687454224,
"learning_rate": 4.196450798392546e-07,
"loss": 0.1227,
"step": 590
},
{
"epoch": 1.3642054346279697,
"grad_norm": 0.6248797178268433,
"learning_rate": 4.2675770831110646e-07,
"loss": 0.1201,
"step": 600
},
{
"epoch": 1.386968274292218,
"grad_norm": 0.5143482685089111,
"learning_rate": 4.338703367829582e-07,
"loss": 0.1247,
"step": 610
},
{
"epoch": 1.409731113956466,
"grad_norm": 0.50174880027771,
"learning_rate": 4.4098296525481e-07,
"loss": 0.1249,
"step": 620
},
{
"epoch": 1.4324939536207142,
"grad_norm": 0.4486837685108185,
"learning_rate": 4.4809559372666173e-07,
"loss": 0.1213,
"step": 630
},
{
"epoch": 1.4552567932849623,
"grad_norm": 0.5088754892349243,
"learning_rate": 4.552082221985135e-07,
"loss": 0.1217,
"step": 640
},
{
"epoch": 1.4780196329492103,
"grad_norm": 0.5201794505119324,
"learning_rate": 4.6232085067036526e-07,
"loss": 0.1204,
"step": 650
},
{
"epoch": 1.5007824726134587,
"grad_norm": 0.4499863088130951,
"learning_rate": 4.6943347914221705e-07,
"loss": 0.1191,
"step": 660
},
{
"epoch": 1.5235453122777067,
"grad_norm": 0.5528525114059448,
"learning_rate": 4.765461076140688e-07,
"loss": 0.1163,
"step": 670
},
{
"epoch": 1.5463081519419548,
"grad_norm": 0.475242018699646,
"learning_rate": 4.836587360859206e-07,
"loss": 0.1171,
"step": 680
},
{
"epoch": 1.569070991606203,
"grad_norm": 0.5379391312599182,
"learning_rate": 4.907713645577724e-07,
"loss": 0.1208,
"step": 690
},
{
"epoch": 1.591833831270451,
"grad_norm": 0.4778967797756195,
"learning_rate": 4.978839930296241e-07,
"loss": 0.1194,
"step": 700
},
{
"epoch": 1.614596670934699,
"grad_norm": 0.5140128135681152,
"learning_rate": 5.04996621501476e-07,
"loss": 0.1173,
"step": 710
},
{
"epoch": 1.6373595105989471,
"grad_norm": 0.448091059923172,
"learning_rate": 5.121092499733277e-07,
"loss": 0.1177,
"step": 720
},
{
"epoch": 1.6601223502631952,
"grad_norm": 0.5216450691223145,
"learning_rate": 5.192218784451794e-07,
"loss": 0.1148,
"step": 730
},
{
"epoch": 1.6828851899274433,
"grad_norm": 0.4396895170211792,
"learning_rate": 5.263345069170312e-07,
"loss": 0.1134,
"step": 740
},
{
"epoch": 1.7056480295916916,
"grad_norm": 0.4478048086166382,
"learning_rate": 5.33447135388883e-07,
"loss": 0.1167,
"step": 750
},
{
"epoch": 1.7056480295916916,
"eval_loss": 0.0421689935028553,
"eval_runtime": 3.1447,
"eval_samples_per_second": 476.988,
"eval_steps_per_second": 7.632,
"eval_sts_dev_pearson_cosine": 0.7979696362389916,
"eval_sts_dev_pearson_dot": 0.7067263755448246,
"eval_sts_dev_pearson_euclidean": 0.7541572372535862,
"eval_sts_dev_pearson_manhattan": 0.7549658870605886,
"eval_sts_dev_pearson_max": 0.7979696362389916,
"eval_sts_dev_spearman_cosine": 0.8091625743286466,
"eval_sts_dev_spearman_dot": 0.6900878688098099,
"eval_sts_dev_spearman_euclidean": 0.7547432017784844,
"eval_sts_dev_spearman_manhattan": 0.7564738854563926,
"eval_sts_dev_spearman_max": 0.8091625743286466,
"step": 750
},
{
"epoch": 1.7284108692559397,
"grad_norm": 0.5125904083251953,
"learning_rate": 5.405597638607348e-07,
"loss": 0.1145,
"step": 760
},
{
"epoch": 1.7511737089201878,
"grad_norm": 0.5108799934387207,
"learning_rate": 5.476723923325865e-07,
"loss": 0.114,
"step": 770
},
{
"epoch": 1.773936548584436,
"grad_norm": 0.6547980308532715,
"learning_rate": 5.547850208044384e-07,
"loss": 0.1136,
"step": 780
},
{
"epoch": 1.7966993882486841,
"grad_norm": 0.5986394882202148,
"learning_rate": 5.618976492762901e-07,
"loss": 0.1123,
"step": 790
},
{
"epoch": 1.8194622279129322,
"grad_norm": 0.4110976457595825,
"learning_rate": 5.690102777481419e-07,
"loss": 0.1115,
"step": 800
},
{
"epoch": 1.8422250675771803,
"grad_norm": 0.4697369933128357,
"learning_rate": 5.761229062199937e-07,
"loss": 0.1127,
"step": 810
},
{
"epoch": 1.8649879072414284,
"grad_norm": 0.44591763615608215,
"learning_rate": 5.832355346918454e-07,
"loss": 0.1137,
"step": 820
},
{
"epoch": 1.8877507469056765,
"grad_norm": 0.4944719076156616,
"learning_rate": 5.903481631636972e-07,
"loss": 0.1137,
"step": 830
},
{
"epoch": 1.9105135865699245,
"grad_norm": 0.4511684775352478,
"learning_rate": 5.97460791635549e-07,
"loss": 0.1123,
"step": 840
},
{
"epoch": 1.9332764262341726,
"grad_norm": 0.41804537177085876,
"learning_rate": 6.045734201074007e-07,
"loss": 0.1115,
"step": 850
},
{
"epoch": 1.9560392658984207,
"grad_norm": 0.4134292006492615,
"learning_rate": 6.116860485792525e-07,
"loss": 0.1105,
"step": 860
},
{
"epoch": 1.9788021055626688,
"grad_norm": 0.4234001934528351,
"learning_rate": 6.187986770511043e-07,
"loss": 0.1133,
"step": 870
},
{
"epoch": 2.0,
"grad_norm": 0.2553180754184723,
"learning_rate": 6.259113055229561e-07,
"loss": 0.1049,
"step": 880
},
{
"epoch": 2.022762839664248,
"grad_norm": 0.40301546454429626,
"learning_rate": 6.330239339948078e-07,
"loss": 0.1091,
"step": 890
},
{
"epoch": 2.045525679328496,
"grad_norm": 0.4319583773612976,
"learning_rate": 6.401365624666596e-07,
"loss": 0.111,
"step": 900
},
{
"epoch": 2.0682885189927442,
"grad_norm": 0.6076052188873291,
"learning_rate": 6.472491909385113e-07,
"loss": 0.1101,
"step": 910
},
{
"epoch": 2.0910513586569923,
"grad_norm": 0.4324122965335846,
"learning_rate": 6.543618194103632e-07,
"loss": 0.1078,
"step": 920
},
{
"epoch": 2.1138141983212404,
"grad_norm": 0.4718656837940216,
"learning_rate": 6.61474447882215e-07,
"loss": 0.1097,
"step": 930
},
{
"epoch": 2.1365770379854885,
"grad_norm": 0.42693474888801575,
"learning_rate": 6.685870763540666e-07,
"loss": 0.108,
"step": 940
},
{
"epoch": 2.159339877649737,
"grad_norm": 0.4170973300933838,
"learning_rate": 6.756997048259186e-07,
"loss": 0.1077,
"step": 950
},
{
"epoch": 2.182102717313985,
"grad_norm": 0.44675740599632263,
"learning_rate": 6.828123332977703e-07,
"loss": 0.1087,
"step": 960
},
{
"epoch": 2.204865556978233,
"grad_norm": 0.3757316470146179,
"learning_rate": 6.89924961769622e-07,
"loss": 0.1058,
"step": 970
},
{
"epoch": 2.2276283966424812,
"grad_norm": 0.3848642408847809,
"learning_rate": 6.970375902414738e-07,
"loss": 0.1071,
"step": 980
},
{
"epoch": 2.2503912363067293,
"grad_norm": 0.4413306713104248,
"learning_rate": 7.041502187133256e-07,
"loss": 0.1058,
"step": 990
},
{
"epoch": 2.2731540759709774,
"grad_norm": 0.38140761852264404,
"learning_rate": 7.112628471851774e-07,
"loss": 0.1104,
"step": 1000
},
{
"epoch": 2.2731540759709774,
"eval_loss": 0.04340244457125664,
"eval_runtime": 3.1758,
"eval_samples_per_second": 472.317,
"eval_steps_per_second": 7.557,
"eval_sts_dev_pearson_cosine": 0.8046339175494668,
"eval_sts_dev_pearson_dot": 0.7044830518743517,
"eval_sts_dev_pearson_euclidean": 0.7649588273720311,
"eval_sts_dev_pearson_manhattan": 0.7658573793768735,
"eval_sts_dev_pearson_max": 0.8046339175494668,
"eval_sts_dev_spearman_cosine": 0.8156078510955985,
"eval_sts_dev_spearman_dot": 0.686432354234728,
"eval_sts_dev_spearman_euclidean": 0.7675914883765887,
"eval_sts_dev_spearman_manhattan": 0.769330201012383,
"eval_sts_dev_spearman_max": 0.8156078510955985,
"step": 1000
},
{
"epoch": 2.2959169156352255,
"grad_norm": 0.39764779806137085,
"learning_rate": 7.183754756570291e-07,
"loss": 0.1036,
"step": 1010
},
{
"epoch": 2.3186797552994736,
"grad_norm": 0.45731303095817566,
"learning_rate": 7.254881041288809e-07,
"loss": 0.1068,
"step": 1020
},
{
"epoch": 2.3414425949637216,
"grad_norm": 0.39621132612228394,
"learning_rate": 7.326007326007327e-07,
"loss": 0.1033,
"step": 1030
},
{
"epoch": 2.3642054346279697,
"grad_norm": 0.4059266448020935,
"learning_rate": 7.397133610725844e-07,
"loss": 0.1058,
"step": 1040
},
{
"epoch": 2.386968274292218,
"grad_norm": 0.3780401945114136,
"learning_rate": 7.468259895444362e-07,
"loss": 0.105,
"step": 1050
},
{
"epoch": 2.409731113956466,
"grad_norm": 0.39316442608833313,
"learning_rate": 7.53938618016288e-07,
"loss": 0.1052,
"step": 1060
},
{
"epoch": 2.432493953620714,
"grad_norm": 0.42634522914886475,
"learning_rate": 7.610512464881398e-07,
"loss": 0.1013,
"step": 1070
},
{
"epoch": 2.4552567932849625,
"grad_norm": 0.42909443378448486,
"learning_rate": 7.681638749599915e-07,
"loss": 0.1037,
"step": 1080
},
{
"epoch": 2.4780196329492106,
"grad_norm": 0.404904842376709,
"learning_rate": 7.752765034318432e-07,
"loss": 0.1031,
"step": 1090
},
{
"epoch": 2.5007824726134587,
"grad_norm": 0.39788374304771423,
"learning_rate": 7.823891319036951e-07,
"loss": 0.1057,
"step": 1100
},
{
"epoch": 2.5235453122777067,
"grad_norm": 0.39511770009994507,
"learning_rate": 7.895017603755468e-07,
"loss": 0.1051,
"step": 1110
},
{
"epoch": 2.546308151941955,
"grad_norm": 0.3775276839733124,
"learning_rate": 7.966143888473986e-07,
"loss": 0.1019,
"step": 1120
},
{
"epoch": 2.569070991606203,
"grad_norm": 0.36302006244659424,
"learning_rate": 8.037270173192504e-07,
"loss": 0.1018,
"step": 1130
},
{
"epoch": 2.591833831270451,
"grad_norm": 0.397919625043869,
"learning_rate": 8.108396457911022e-07,
"loss": 0.1007,
"step": 1140
},
{
"epoch": 2.614596670934699,
"grad_norm": 0.39391639828681946,
"learning_rate": 8.179522742629539e-07,
"loss": 0.1035,
"step": 1150
},
{
"epoch": 2.637359510598947,
"grad_norm": 0.3966914117336273,
"learning_rate": 8.250649027348056e-07,
"loss": 0.1032,
"step": 1160
},
{
"epoch": 2.660122350263195,
"grad_norm": 0.47250422835350037,
"learning_rate": 8.321775312066576e-07,
"loss": 0.1036,
"step": 1170
},
{
"epoch": 2.6828851899274433,
"grad_norm": 0.41388604044914246,
"learning_rate": 8.392901596785092e-07,
"loss": 0.0971,
"step": 1180
},
{
"epoch": 2.705648029591692,
"grad_norm": 0.3682123124599457,
"learning_rate": 8.46402788150361e-07,
"loss": 0.1015,
"step": 1190
},
{
"epoch": 2.7284108692559395,
"grad_norm": 0.49675652384757996,
"learning_rate": 8.535154166222129e-07,
"loss": 0.104,
"step": 1200
},
{
"epoch": 2.751173708920188,
"grad_norm": 0.3492577075958252,
"learning_rate": 8.606280450940646e-07,
"loss": 0.1007,
"step": 1210
},
{
"epoch": 2.773936548584436,
"grad_norm": 0.34821173548698425,
"learning_rate": 8.677406735659164e-07,
"loss": 0.102,
"step": 1220
},
{
"epoch": 2.796699388248684,
"grad_norm": 0.37269824743270874,
"learning_rate": 8.748533020377681e-07,
"loss": 0.0994,
"step": 1230
},
{
"epoch": 2.819462227912932,
"grad_norm": 0.39633363485336304,
"learning_rate": 8.8196593050962e-07,
"loss": 0.0972,
"step": 1240
},
{
"epoch": 2.8422250675771803,
"grad_norm": 0.3534165322780609,
"learning_rate": 8.890785589814717e-07,
"loss": 0.0969,
"step": 1250
},
{
"epoch": 2.8422250675771803,
"eval_loss": 0.043736640363931656,
"eval_runtime": 3.1459,
"eval_samples_per_second": 476.804,
"eval_steps_per_second": 7.629,
"eval_sts_dev_pearson_cosine": 0.8088895757064547,
"eval_sts_dev_pearson_dot": 0.7030740598598051,
"eval_sts_dev_pearson_euclidean": 0.7723078699673291,
"eval_sts_dev_pearson_manhattan": 0.7732962792712872,
"eval_sts_dev_pearson_max": 0.8088895757064547,
"eval_sts_dev_spearman_cosine": 0.8185494576067605,
"eval_sts_dev_spearman_dot": 0.6845694825698349,
"eval_sts_dev_spearman_euclidean": 0.7756955565337718,
"eval_sts_dev_spearman_manhattan": 0.7776997356093678,
"eval_sts_dev_spearman_max": 0.8185494576067605,
"step": 1250
},
{
"epoch": 2.8649879072414284,
"grad_norm": 0.438899964094162,
"learning_rate": 8.961911874533235e-07,
"loss": 0.0968,
"step": 1260
},
{
"epoch": 2.8877507469056765,
"grad_norm": 0.3127659261226654,
"learning_rate": 9.033038159251752e-07,
"loss": 0.1003,
"step": 1270
},
{
"epoch": 2.9105135865699245,
"grad_norm": 0.41025590896606445,
"learning_rate": 9.10416444397027e-07,
"loss": 0.1036,
"step": 1280
},
{
"epoch": 2.9332764262341726,
"grad_norm": 0.3556434214115143,
"learning_rate": 9.175290728688788e-07,
"loss": 0.0969,
"step": 1290
},
{
"epoch": 2.9560392658984207,
"grad_norm": 0.3952799439430237,
"learning_rate": 9.246417013407305e-07,
"loss": 0.0965,
"step": 1300
},
{
"epoch": 2.978802105562669,
"grad_norm": 0.35889461636543274,
"learning_rate": 9.317543298125824e-07,
"loss": 0.0974,
"step": 1310
},
{
"epoch": 3.0,
"grad_norm": 0.1986362338066101,
"learning_rate": 9.388669582844341e-07,
"loss": 0.0905,
"step": 1320
},
{
"epoch": 3.022762839664248,
"grad_norm": 0.3811950981616974,
"learning_rate": 9.459795867562858e-07,
"loss": 0.1006,
"step": 1330
},
{
"epoch": 3.045525679328496,
"grad_norm": 0.3444836735725403,
"learning_rate": 9.530922152281376e-07,
"loss": 0.0952,
"step": 1340
},
{
"epoch": 3.0682885189927442,
"grad_norm": 0.38668960332870483,
"learning_rate": 9.602048436999895e-07,
"loss": 0.0971,
"step": 1350
},
{
"epoch": 3.0910513586569923,
"grad_norm": 0.3300378620624542,
"learning_rate": 9.673174721718413e-07,
"loss": 0.0943,
"step": 1360
},
{
"epoch": 3.1138141983212404,
"grad_norm": 0.35947421193122864,
"learning_rate": 9.74430100643693e-07,
"loss": 0.0996,
"step": 1370
},
{
"epoch": 3.1365770379854885,
"grad_norm": 0.33226439356803894,
"learning_rate": 9.815427291155448e-07,
"loss": 0.0971,
"step": 1380
},
{
"epoch": 3.159339877649737,
"grad_norm": 0.34526577591896057,
"learning_rate": 9.886553575873965e-07,
"loss": 0.097,
"step": 1390
},
{
"epoch": 3.182102717313985,
"grad_norm": 0.3941132128238678,
"learning_rate": 9.957679860592482e-07,
"loss": 0.0937,
"step": 1400
},
{
"epoch": 3.204865556978233,
"grad_norm": 0.3083663582801819,
"learning_rate": 1.0028806145311e-06,
"loss": 0.0955,
"step": 1410
},
{
"epoch": 3.2276283966424812,
"grad_norm": 0.33311647176742554,
"learning_rate": 1.009993243002952e-06,
"loss": 0.0963,
"step": 1420
},
{
"epoch": 3.2503912363067293,
"grad_norm": 0.35510948300361633,
"learning_rate": 1.0171058714748037e-06,
"loss": 0.0938,
"step": 1430
},
{
"epoch": 3.2731540759709774,
"grad_norm": 0.32245343923568726,
"learning_rate": 1.0242184999466554e-06,
"loss": 0.0986,
"step": 1440
},
{
"epoch": 3.2959169156352255,
"grad_norm": 0.34923267364501953,
"learning_rate": 1.0313311284185071e-06,
"loss": 0.0949,
"step": 1450
},
{
"epoch": 3.3186797552994736,
"grad_norm": 0.3278236985206604,
"learning_rate": 1.0384437568903589e-06,
"loss": 0.0932,
"step": 1460
},
{
"epoch": 3.3414425949637216,
"grad_norm": 0.3266041576862335,
"learning_rate": 1.0455563853622106e-06,
"loss": 0.096,
"step": 1470
},
{
"epoch": 3.3642054346279697,
"grad_norm": 0.3194010555744171,
"learning_rate": 1.0526690138340624e-06,
"loss": 0.0919,
"step": 1480
},
{
"epoch": 3.386968274292218,
"grad_norm": 0.2891862988471985,
"learning_rate": 1.0597816423059143e-06,
"loss": 0.093,
"step": 1490
},
{
"epoch": 3.409731113956466,
"grad_norm": 0.3133799433708191,
"learning_rate": 1.066894270777766e-06,
"loss": 0.0925,
"step": 1500
},
{
"epoch": 3.409731113956466,
"eval_loss": 0.04378899559378624,
"eval_runtime": 3.0984,
"eval_samples_per_second": 484.121,
"eval_steps_per_second": 7.746,
"eval_sts_dev_pearson_cosine": 0.811446634517756,
"eval_sts_dev_pearson_dot": 0.7028657760262548,
"eval_sts_dev_pearson_euclidean": 0.7772158914948109,
"eval_sts_dev_pearson_manhattan": 0.7782468091124333,
"eval_sts_dev_pearson_max": 0.811446634517756,
"eval_sts_dev_spearman_cosine": 0.8200700667481104,
"eval_sts_dev_spearman_dot": 0.6837762987069376,
"eval_sts_dev_spearman_euclidean": 0.7809451824112935,
"eval_sts_dev_spearman_manhattan": 0.782912075510424,
"eval_sts_dev_spearman_max": 0.8200700667481104,
"step": 1500
},
{
"epoch": 3.432493953620714,
"grad_norm": 0.35416606068611145,
"learning_rate": 1.0740068992496178e-06,
"loss": 0.0935,
"step": 1510
},
{
"epoch": 3.4552567932849625,
"grad_norm": 0.30686748027801514,
"learning_rate": 1.0811195277214695e-06,
"loss": 0.0928,
"step": 1520
},
{
"epoch": 3.4780196329492106,
"grad_norm": 0.35064470767974854,
"learning_rate": 1.0882321561933213e-06,
"loss": 0.0914,
"step": 1530
},
{
"epoch": 3.5007824726134587,
"grad_norm": 0.29766592383384705,
"learning_rate": 1.095344784665173e-06,
"loss": 0.0912,
"step": 1540
},
{
"epoch": 3.5235453122777067,
"grad_norm": 0.3228856921195984,
"learning_rate": 1.1024574131370248e-06,
"loss": 0.091,
"step": 1550
},
{
"epoch": 3.546308151941955,
"grad_norm": 0.3097136616706848,
"learning_rate": 1.1095700416088767e-06,
"loss": 0.0906,
"step": 1560
},
{
"epoch": 3.569070991606203,
"grad_norm": 0.3007184863090515,
"learning_rate": 1.1166826700807284e-06,
"loss": 0.0936,
"step": 1570
},
{
"epoch": 3.591833831270451,
"grad_norm": 0.33350813388824463,
"learning_rate": 1.1237952985525802e-06,
"loss": 0.0943,
"step": 1580
},
{
"epoch": 3.614596670934699,
"grad_norm": 0.3203691840171814,
"learning_rate": 1.130907927024432e-06,
"loss": 0.0925,
"step": 1590
},
{
"epoch": 3.637359510598947,
"grad_norm": 0.331552118062973,
"learning_rate": 1.1380205554962839e-06,
"loss": 0.0908,
"step": 1600
},
{
"epoch": 3.660122350263195,
"grad_norm": 0.3371254801750183,
"learning_rate": 1.1451331839681356e-06,
"loss": 0.0933,
"step": 1610
},
{
"epoch": 3.6828851899274433,
"grad_norm": 0.302493691444397,
"learning_rate": 1.1522458124399874e-06,
"loss": 0.0917,
"step": 1620
},
{
"epoch": 3.705648029591692,
"grad_norm": 0.3068028390407562,
"learning_rate": 1.159358440911839e-06,
"loss": 0.0887,
"step": 1630
},
{
"epoch": 3.7284108692559395,
"grad_norm": 0.3035215139389038,
"learning_rate": 1.1664710693836908e-06,
"loss": 0.0903,
"step": 1640
},
{
"epoch": 3.751173708920188,
"grad_norm": 0.30323877930641174,
"learning_rate": 1.1735836978555426e-06,
"loss": 0.0934,
"step": 1650
},
{
"epoch": 3.773936548584436,
"grad_norm": 0.3012927770614624,
"learning_rate": 1.1806963263273943e-06,
"loss": 0.0906,
"step": 1660
},
{
"epoch": 3.796699388248684,
"grad_norm": 0.30301883816719055,
"learning_rate": 1.1878089547992463e-06,
"loss": 0.0886,
"step": 1670
},
{
"epoch": 3.819462227912932,
"grad_norm": 0.33015456795692444,
"learning_rate": 1.194921583271098e-06,
"loss": 0.0915,
"step": 1680
},
{
"epoch": 3.8422250675771803,
"grad_norm": 0.3174781799316406,
"learning_rate": 1.2020342117429498e-06,
"loss": 0.0924,
"step": 1690
},
{
"epoch": 3.8649879072414284,
"grad_norm": 0.4451993405818939,
"learning_rate": 1.2091468402148015e-06,
"loss": 0.094,
"step": 1700
},
{
"epoch": 3.8877507469056765,
"grad_norm": 0.28383544087409973,
"learning_rate": 1.2162594686866532e-06,
"loss": 0.0899,
"step": 1710
},
{
"epoch": 3.9105135865699245,
"grad_norm": 0.29724323749542236,
"learning_rate": 1.223372097158505e-06,
"loss": 0.0881,
"step": 1720
},
{
"epoch": 3.9332764262341726,
"grad_norm": 0.2890814542770386,
"learning_rate": 1.2304847256303567e-06,
"loss": 0.0884,
"step": 1730
},
{
"epoch": 3.9560392658984207,
"grad_norm": 0.3286956548690796,
"learning_rate": 1.2375973541022087e-06,
"loss": 0.0894,
"step": 1740
},
{
"epoch": 3.978802105562669,
"grad_norm": 0.3125689923763275,
"learning_rate": 1.2447099825740604e-06,
"loss": 0.0892,
"step": 1750
},
{
"epoch": 3.978802105562669,
"eval_loss": 0.044054221361875534,
"eval_runtime": 3.1571,
"eval_samples_per_second": 475.117,
"eval_steps_per_second": 7.602,
"eval_sts_dev_pearson_cosine": 0.8132413514515704,
"eval_sts_dev_pearson_dot": 0.7029474825790806,
"eval_sts_dev_pearson_euclidean": 0.7809188333660187,
"eval_sts_dev_pearson_manhattan": 0.7819647523352286,
"eval_sts_dev_pearson_max": 0.8132413514515704,
"eval_sts_dev_spearman_cosine": 0.821487180003909,
"eval_sts_dev_spearman_dot": 0.6837898742130047,
"eval_sts_dev_spearman_euclidean": 0.7847115974803252,
"eval_sts_dev_spearman_manhattan": 0.7867474388257931,
"eval_sts_dev_spearman_max": 0.821487180003909,
"step": 1750
},
{
"epoch": 4.0,
"grad_norm": 0.15849126875400543,
"learning_rate": 1.2518226110459121e-06,
"loss": 0.0812,
"step": 1760
},
{
"epoch": 4.0227628396642485,
"grad_norm": 0.2848968505859375,
"learning_rate": 1.2589352395177639e-06,
"loss": 0.0878,
"step": 1770
},
{
"epoch": 4.045525679328496,
"grad_norm": 0.2904186546802521,
"learning_rate": 1.2660478679896156e-06,
"loss": 0.0869,
"step": 1780
},
{
"epoch": 4.068288518992745,
"grad_norm": 0.3993697464466095,
"learning_rate": 1.2731604964614674e-06,
"loss": 0.09,
"step": 1790
},
{
"epoch": 4.091051358656992,
"grad_norm": 0.28611546754837036,
"learning_rate": 1.280273124933319e-06,
"loss": 0.0875,
"step": 1800
},
{
"epoch": 4.113814198321241,
"grad_norm": 0.29007813334465027,
"learning_rate": 1.2873857534051708e-06,
"loss": 0.086,
"step": 1810
},
{
"epoch": 4.1365770379854885,
"grad_norm": 0.32408079504966736,
"learning_rate": 1.2944983818770226e-06,
"loss": 0.0888,
"step": 1820
},
{
"epoch": 4.159339877649737,
"grad_norm": 0.3043130934238434,
"learning_rate": 1.3016110103488747e-06,
"loss": 0.086,
"step": 1830
},
{
"epoch": 4.182102717313985,
"grad_norm": 0.27660122513771057,
"learning_rate": 1.3087236388207265e-06,
"loss": 0.0869,
"step": 1840
},
{
"epoch": 4.204865556978233,
"grad_norm": 0.28952455520629883,
"learning_rate": 1.3158362672925782e-06,
"loss": 0.0885,
"step": 1850
},
{
"epoch": 4.227628396642481,
"grad_norm": 0.3132406175136566,
"learning_rate": 1.32294889576443e-06,
"loss": 0.0891,
"step": 1860
},
{
"epoch": 4.250391236306729,
"grad_norm": 0.300589919090271,
"learning_rate": 1.3300615242362817e-06,
"loss": 0.0853,
"step": 1870
},
{
"epoch": 4.273154075970977,
"grad_norm": 0.28771117329597473,
"learning_rate": 1.3371741527081332e-06,
"loss": 0.0849,
"step": 1880
},
{
"epoch": 4.2959169156352255,
"grad_norm": 0.29580655694007874,
"learning_rate": 1.344286781179985e-06,
"loss": 0.0856,
"step": 1890
},
{
"epoch": 4.318679755299474,
"grad_norm": 0.31038954854011536,
"learning_rate": 1.3513994096518371e-06,
"loss": 0.0863,
"step": 1900
},
{
"epoch": 4.341442594963722,
"grad_norm": 0.28325986862182617,
"learning_rate": 1.3585120381236889e-06,
"loss": 0.0849,
"step": 1910
},
{
"epoch": 4.36420543462797,
"grad_norm": 0.27782177925109863,
"learning_rate": 1.3656246665955406e-06,
"loss": 0.0855,
"step": 1920
},
{
"epoch": 4.386968274292218,
"grad_norm": 0.30091577768325806,
"learning_rate": 1.3727372950673924e-06,
"loss": 0.0841,
"step": 1930
},
{
"epoch": 4.409731113956466,
"grad_norm": 0.3271619379520416,
"learning_rate": 1.379849923539244e-06,
"loss": 0.0893,
"step": 1940
},
{
"epoch": 4.432493953620714,
"grad_norm": 0.2756374478340149,
"learning_rate": 1.3869625520110958e-06,
"loss": 0.0847,
"step": 1950
},
{
"epoch": 4.4552567932849625,
"grad_norm": 0.29490333795547485,
"learning_rate": 1.3940751804829476e-06,
"loss": 0.0866,
"step": 1960
},
{
"epoch": 4.47801963294921,
"grad_norm": 0.318877637386322,
"learning_rate": 1.4011878089547993e-06,
"loss": 0.0866,
"step": 1970
},
{
"epoch": 4.500782472613459,
"grad_norm": 0.2944406270980835,
"learning_rate": 1.4083004374266513e-06,
"loss": 0.0844,
"step": 1980
},
{
"epoch": 4.523545312277706,
"grad_norm": 0.2944386601448059,
"learning_rate": 1.415413065898503e-06,
"loss": 0.0846,
"step": 1990
},
{
"epoch": 4.546308151941955,
"grad_norm": 0.33368054032325745,
"learning_rate": 1.4225256943703547e-06,
"loss": 0.0847,
"step": 2000
},
{
"epoch": 4.546308151941955,
"eval_loss": 0.043504875153303146,
"eval_runtime": 3.154,
"eval_samples_per_second": 475.589,
"eval_steps_per_second": 7.609,
"eval_sts_dev_pearson_cosine": 0.8146715006362908,
"eval_sts_dev_pearson_dot": 0.7040304873244005,
"eval_sts_dev_pearson_euclidean": 0.7833063990865667,
"eval_sts_dev_pearson_manhattan": 0.7843736809113127,
"eval_sts_dev_pearson_max": 0.8146715006362908,
"eval_sts_dev_spearman_cosine": 0.8219541620728356,
"eval_sts_dev_spearman_dot": 0.6847507056516472,
"eval_sts_dev_spearman_euclidean": 0.7871847822322454,
"eval_sts_dev_spearman_manhattan": 0.7889612573133329,
"eval_sts_dev_spearman_max": 0.8219541620728356,
"step": 2000
},
{
"epoch": 4.569070991606203,
"grad_norm": 0.27128133177757263,
"learning_rate": 1.4296383228422065e-06,
"loss": 0.0831,
"step": 2010
},
{
"epoch": 4.591833831270451,
"grad_norm": 0.34261852502822876,
"learning_rate": 1.4367509513140582e-06,
"loss": 0.0843,
"step": 2020
},
{
"epoch": 4.614596670934699,
"grad_norm": 0.35007432103157043,
"learning_rate": 1.44386357978591e-06,
"loss": 0.086,
"step": 2030
},
{
"epoch": 4.637359510598947,
"grad_norm": 0.3019850552082062,
"learning_rate": 1.4509762082577617e-06,
"loss": 0.0851,
"step": 2040
},
{
"epoch": 4.660122350263196,
"grad_norm": 0.2931526303291321,
"learning_rate": 1.4580888367296137e-06,
"loss": 0.0844,
"step": 2050
},
{
"epoch": 4.682885189927443,
"grad_norm": 0.32966339588165283,
"learning_rate": 1.4652014652014654e-06,
"loss": 0.0843,
"step": 2060
},
{
"epoch": 4.705648029591692,
"grad_norm": 0.27907735109329224,
"learning_rate": 1.4723140936733171e-06,
"loss": 0.0854,
"step": 2070
},
{
"epoch": 4.7284108692559395,
"grad_norm": 0.29893922805786133,
"learning_rate": 1.4794267221451689e-06,
"loss": 0.0851,
"step": 2080
},
{
"epoch": 4.751173708920188,
"grad_norm": 0.34036242961883545,
"learning_rate": 1.4865393506170206e-06,
"loss": 0.0822,
"step": 2090
},
{
"epoch": 4.773936548584436,
"grad_norm": 0.2901923656463623,
"learning_rate": 1.4936519790888724e-06,
"loss": 0.0859,
"step": 2100
},
{
"epoch": 4.796699388248684,
"grad_norm": 0.29854685068130493,
"learning_rate": 1.500764607560724e-06,
"loss": 0.0844,
"step": 2110
},
{
"epoch": 4.819462227912932,
"grad_norm": 0.3369494378566742,
"learning_rate": 1.507877236032576e-06,
"loss": 0.0853,
"step": 2120
},
{
"epoch": 4.84222506757718,
"grad_norm": 0.2752622067928314,
"learning_rate": 1.5149898645044278e-06,
"loss": 0.0815,
"step": 2130
},
{
"epoch": 4.864987907241428,
"grad_norm": 0.2834544777870178,
"learning_rate": 1.5221024929762795e-06,
"loss": 0.0833,
"step": 2140
},
{
"epoch": 4.8877507469056765,
"grad_norm": 0.29377996921539307,
"learning_rate": 1.5292151214481313e-06,
"loss": 0.0817,
"step": 2150
},
{
"epoch": 4.910513586569925,
"grad_norm": 0.28692808747291565,
"learning_rate": 1.536327749919983e-06,
"loss": 0.0873,
"step": 2160
},
{
"epoch": 4.933276426234173,
"grad_norm": 0.39381957054138184,
"learning_rate": 1.5434403783918347e-06,
"loss": 0.0813,
"step": 2170
},
{
"epoch": 4.956039265898421,
"grad_norm": 0.2600520849227905,
"learning_rate": 1.5505530068636865e-06,
"loss": 0.0829,
"step": 2180
},
{
"epoch": 4.978802105562669,
"grad_norm": 0.2618444859981537,
"learning_rate": 1.5576656353355384e-06,
"loss": 0.0812,
"step": 2190
},
{
"epoch": 5.0,
"grad_norm": 0.1586732417345047,
"learning_rate": 1.5647782638073902e-06,
"loss": 0.0776,
"step": 2200
},
{
"epoch": 5.0227628396642485,
"grad_norm": 0.290334016084671,
"learning_rate": 1.571890892279242e-06,
"loss": 0.083,
"step": 2210
},
{
"epoch": 5.045525679328496,
"grad_norm": 0.27750375866889954,
"learning_rate": 1.5790035207510937e-06,
"loss": 0.0821,
"step": 2220
},
{
"epoch": 5.068288518992745,
"grad_norm": 0.30981412529945374,
"learning_rate": 1.5861161492229454e-06,
"loss": 0.0806,
"step": 2230
},
{
"epoch": 5.091051358656992,
"grad_norm": 0.27481886744499207,
"learning_rate": 1.5932287776947971e-06,
"loss": 0.0809,
"step": 2240
},
{
"epoch": 5.113814198321241,
"grad_norm": 0.4338255226612091,
"learning_rate": 1.6003414061666489e-06,
"loss": 0.0814,
"step": 2250
},
{
"epoch": 5.113814198321241,
"eval_loss": 0.043074991554021835,
"eval_runtime": 3.1278,
"eval_samples_per_second": 479.564,
"eval_steps_per_second": 7.673,
"eval_sts_dev_pearson_cosine": 0.8157779253220427,
"eval_sts_dev_pearson_dot": 0.7048364440953621,
"eval_sts_dev_pearson_euclidean": 0.7856670451519477,
"eval_sts_dev_pearson_manhattan": 0.7867510343221469,
"eval_sts_dev_pearson_max": 0.8157779253220427,
"eval_sts_dev_spearman_cosine": 0.8225234805140361,
"eval_sts_dev_spearman_dot": 0.6855999639965007,
"eval_sts_dev_spearman_euclidean": 0.7894583838133404,
"eval_sts_dev_spearman_manhattan": 0.791341587911783,
"eval_sts_dev_spearman_max": 0.8225234805140361,
"step": 2250
},
{
"epoch": 5.1365770379854885,
"grad_norm": 0.28418755531311035,
"learning_rate": 1.6074540346385008e-06,
"loss": 0.0808,
"step": 2260
},
{
"epoch": 5.159339877649737,
"grad_norm": 0.2720179259777069,
"learning_rate": 1.6145666631103526e-06,
"loss": 0.0791,
"step": 2270
},
{
"epoch": 5.182102717313985,
"grad_norm": 0.24729762971401215,
"learning_rate": 1.6216792915822043e-06,
"loss": 0.0811,
"step": 2280
},
{
"epoch": 5.204865556978233,
"grad_norm": 0.2798333764076233,
"learning_rate": 1.628791920054056e-06,
"loss": 0.0805,
"step": 2290
},
{
"epoch": 5.227628396642481,
"grad_norm": 0.3200991749763489,
"learning_rate": 1.6359045485259078e-06,
"loss": 0.0817,
"step": 2300
},
{
"epoch": 5.250391236306729,
"grad_norm": 0.28277549147605896,
"learning_rate": 1.6430171769977595e-06,
"loss": 0.0772,
"step": 2310
},
{
"epoch": 5.273154075970977,
"grad_norm": 0.2859530746936798,
"learning_rate": 1.6501298054696113e-06,
"loss": 0.0799,
"step": 2320
},
{
"epoch": 5.2959169156352255,
"grad_norm": 0.30756381154060364,
"learning_rate": 1.6572424339414634e-06,
"loss": 0.0829,
"step": 2330
},
{
"epoch": 5.318679755299474,
"grad_norm": 0.280272901058197,
"learning_rate": 1.6643550624133152e-06,
"loss": 0.077,
"step": 2340
},
{
"epoch": 5.341442594963722,
"grad_norm": 0.25623443722724915,
"learning_rate": 1.671467690885167e-06,
"loss": 0.0801,
"step": 2350
},
{
"epoch": 5.36420543462797,
"grad_norm": 0.26670023798942566,
"learning_rate": 1.6785803193570184e-06,
"loss": 0.0812,
"step": 2360
},
{
"epoch": 5.386968274292218,
"grad_norm": 0.24578404426574707,
"learning_rate": 1.6856929478288702e-06,
"loss": 0.0788,
"step": 2370
},
{
"epoch": 5.409731113956466,
"grad_norm": 0.3148477375507355,
"learning_rate": 1.692805576300722e-06,
"loss": 0.0776,
"step": 2380
},
{
"epoch": 5.432493953620714,
"grad_norm": 0.2843981087207794,
"learning_rate": 1.6999182047725737e-06,
"loss": 0.0785,
"step": 2390
},
{
"epoch": 5.4552567932849625,
"grad_norm": 0.2720634639263153,
"learning_rate": 1.7070308332444258e-06,
"loss": 0.0771,
"step": 2400
},
{
"epoch": 5.47801963294921,
"grad_norm": 0.2510247826576233,
"learning_rate": 1.7141434617162776e-06,
"loss": 0.0788,
"step": 2410
},
{
"epoch": 5.500782472613459,
"grad_norm": 0.2841964662075043,
"learning_rate": 1.7212560901881293e-06,
"loss": 0.0796,
"step": 2420
},
{
"epoch": 5.523545312277706,
"grad_norm": 0.28437066078186035,
"learning_rate": 1.728368718659981e-06,
"loss": 0.0793,
"step": 2430
},
{
"epoch": 5.546308151941955,
"grad_norm": 0.26712462306022644,
"learning_rate": 1.7354813471318328e-06,
"loss": 0.0813,
"step": 2440
},
{
"epoch": 5.569070991606203,
"grad_norm": 0.28166961669921875,
"learning_rate": 1.7425939756036845e-06,
"loss": 0.0757,
"step": 2450
},
{
"epoch": 5.591833831270451,
"grad_norm": 0.31931671500205994,
"learning_rate": 1.7497066040755363e-06,
"loss": 0.079,
"step": 2460
},
{
"epoch": 5.614596670934699,
"grad_norm": 0.26701247692108154,
"learning_rate": 1.756819232547388e-06,
"loss": 0.0797,
"step": 2470
},
{
"epoch": 5.637359510598947,
"grad_norm": 0.31258830428123474,
"learning_rate": 1.76393186101924e-06,
"loss": 0.0794,
"step": 2480
},
{
"epoch": 5.660122350263196,
"grad_norm": 0.24133124947547913,
"learning_rate": 1.7710444894910917e-06,
"loss": 0.0808,
"step": 2490
},
{
"epoch": 5.682885189927443,
"grad_norm": 0.321419894695282,
"learning_rate": 1.7781571179629434e-06,
"loss": 0.0796,
"step": 2500
},
{
"epoch": 5.682885189927443,
"eval_loss": 0.042360566556453705,
"eval_runtime": 3.1328,
"eval_samples_per_second": 478.807,
"eval_steps_per_second": 7.661,
"eval_sts_dev_pearson_cosine": 0.8169384514842573,
"eval_sts_dev_pearson_dot": 0.7053659085797274,
"eval_sts_dev_pearson_euclidean": 0.7873338046578867,
"eval_sts_dev_pearson_manhattan": 0.7884276721147352,
"eval_sts_dev_pearson_max": 0.8169384514842573,
"eval_sts_dev_spearman_cosine": 0.8229950220469622,
"eval_sts_dev_spearman_dot": 0.6868513283838655,
"eval_sts_dev_spearman_euclidean": 0.7910863183517545,
"eval_sts_dev_spearman_manhattan": 0.7928925773180566,
"eval_sts_dev_spearman_max": 0.8229950220469622,
"step": 2500
},
{
"epoch": 5.705648029591692,
"grad_norm": 0.2824183404445648,
"learning_rate": 1.7852697464347952e-06,
"loss": 0.0802,
"step": 2510
},
{
"epoch": 5.7284108692559395,
"grad_norm": 0.23221443593502045,
"learning_rate": 1.792382374906647e-06,
"loss": 0.0799,
"step": 2520
},
{
"epoch": 5.751173708920188,
"grad_norm": 0.3471781313419342,
"learning_rate": 1.7994950033784986e-06,
"loss": 0.0802,
"step": 2530
},
{
"epoch": 5.773936548584436,
"grad_norm": 0.2657029628753662,
"learning_rate": 1.8066076318503504e-06,
"loss": 0.0813,
"step": 2540
},
{
"epoch": 5.796699388248684,
"grad_norm": 0.2691391706466675,
"learning_rate": 1.8137202603222023e-06,
"loss": 0.0772,
"step": 2550
},
{
"epoch": 5.819462227912932,
"grad_norm": 0.2894577383995056,
"learning_rate": 1.820832888794054e-06,
"loss": 0.0766,
"step": 2560
},
{
"epoch": 5.84222506757718,
"grad_norm": 0.3584199547767639,
"learning_rate": 1.8279455172659058e-06,
"loss": 0.0778,
"step": 2570
},
{
"epoch": 5.864987907241428,
"grad_norm": 0.29335150122642517,
"learning_rate": 1.8350581457377576e-06,
"loss": 0.076,
"step": 2580
},
{
"epoch": 5.8877507469056765,
"grad_norm": 0.3056882321834564,
"learning_rate": 1.8421707742096093e-06,
"loss": 0.0787,
"step": 2590
},
{
"epoch": 5.910513586569925,
"grad_norm": 0.2651148736476898,
"learning_rate": 1.849283402681461e-06,
"loss": 0.0794,
"step": 2600
},
{
"epoch": 5.933276426234173,
"grad_norm": 0.2723177969455719,
"learning_rate": 1.8563960311533128e-06,
"loss": 0.076,
"step": 2610
},
{
"epoch": 5.956039265898421,
"grad_norm": 0.26683980226516724,
"learning_rate": 1.8635086596251647e-06,
"loss": 0.0773,
"step": 2620
},
{
"epoch": 5.978802105562669,
"grad_norm": 0.2798727750778198,
"learning_rate": 1.8706212880970165e-06,
"loss": 0.0755,
"step": 2630
},
{
"epoch": 6.0,
"grad_norm": 0.1584087312221527,
"learning_rate": 1.8777339165688682e-06,
"loss": 0.0725,
"step": 2640
},
{
"epoch": 6.0227628396642485,
"grad_norm": 0.2422705441713333,
"learning_rate": 1.88484654504072e-06,
"loss": 0.0738,
"step": 2650
},
{
"epoch": 6.045525679328496,
"grad_norm": 0.29021552205085754,
"learning_rate": 1.8919591735125717e-06,
"loss": 0.0762,
"step": 2660
},
{
"epoch": 6.068288518992745,
"grad_norm": 0.24207890033721924,
"learning_rate": 1.8990718019844234e-06,
"loss": 0.0761,
"step": 2670
},
{
"epoch": 6.091051358656992,
"grad_norm": 0.2542373538017273,
"learning_rate": 1.9061844304562752e-06,
"loss": 0.0771,
"step": 2680
},
{
"epoch": 6.113814198321241,
"grad_norm": 0.24501535296440125,
"learning_rate": 1.913297058928127e-06,
"loss": 0.0765,
"step": 2690
},
{
"epoch": 6.1365770379854885,
"grad_norm": 0.2733090817928314,
"learning_rate": 1.920409687399979e-06,
"loss": 0.0755,
"step": 2700
},
{
"epoch": 6.159339877649737,
"grad_norm": 0.24523428082466125,
"learning_rate": 1.9275223158718306e-06,
"loss": 0.0771,
"step": 2710
},
{
"epoch": 6.182102717313985,
"grad_norm": 0.2885381877422333,
"learning_rate": 1.9346349443436826e-06,
"loss": 0.0748,
"step": 2720
},
{
"epoch": 6.204865556978233,
"grad_norm": 0.30437180399894714,
"learning_rate": 1.941747572815534e-06,
"loss": 0.0768,
"step": 2730
},
{
"epoch": 6.227628396642481,
"grad_norm": 0.26403576135635376,
"learning_rate": 1.948860201287386e-06,
"loss": 0.0766,
"step": 2740
},
{
"epoch": 6.250391236306729,
"grad_norm": 0.2586285471916199,
"learning_rate": 1.9559728297592376e-06,
"loss": 0.0766,
"step": 2750
},
{
"epoch": 6.250391236306729,
"eval_loss": 0.04222765937447548,
"eval_runtime": 3.1267,
"eval_samples_per_second": 479.737,
"eval_steps_per_second": 7.676,
"eval_sts_dev_pearson_cosine": 0.8179974050572225,
"eval_sts_dev_pearson_dot": 0.7047975315002233,
"eval_sts_dev_pearson_euclidean": 0.7884393906368459,
"eval_sts_dev_pearson_manhattan": 0.7895245481097086,
"eval_sts_dev_pearson_max": 0.8179974050572225,
"eval_sts_dev_spearman_cosine": 0.823939141053365,
"eval_sts_dev_spearman_dot": 0.6866457343802284,
"eval_sts_dev_spearman_euclidean": 0.7922252886902459,
"eval_sts_dev_spearman_manhattan": 0.7940188158576397,
"eval_sts_dev_spearman_max": 0.823939141053365,
"step": 2750
},
{
"epoch": 6.273154075970977,
"grad_norm": 0.3394823968410492,
"learning_rate": 1.9630854582310895e-06,
"loss": 0.076,
"step": 2760
},
{
"epoch": 6.2959169156352255,
"grad_norm": 0.25371795892715454,
"learning_rate": 1.9701980867029415e-06,
"loss": 0.0753,
"step": 2770
},
{
"epoch": 6.318679755299474,
"grad_norm": 0.2900475561618805,
"learning_rate": 1.977310715174793e-06,
"loss": 0.0735,
"step": 2780
},
{
"epoch": 6.341442594963722,
"grad_norm": 0.23057663440704346,
"learning_rate": 1.984423343646645e-06,
"loss": 0.0751,
"step": 2790
},
{
"epoch": 6.36420543462797,
"grad_norm": 0.3174869418144226,
"learning_rate": 1.9915359721184965e-06,
"loss": 0.0738,
"step": 2800
},
{
"epoch": 6.386968274292218,
"grad_norm": 0.2789759635925293,
"learning_rate": 1.9986486005903484e-06,
"loss": 0.0749,
"step": 2810
},
{
"epoch": 6.409731113956466,
"grad_norm": 0.27371764183044434,
"learning_rate": 2.0057612290622e-06,
"loss": 0.0753,
"step": 2820
},
{
"epoch": 6.432493953620714,
"grad_norm": 0.28073859214782715,
"learning_rate": 2.012873857534052e-06,
"loss": 0.077,
"step": 2830
},
{
"epoch": 6.4552567932849625,
"grad_norm": 0.2226971834897995,
"learning_rate": 2.019986486005904e-06,
"loss": 0.0747,
"step": 2840
},
{
"epoch": 6.47801963294921,
"grad_norm": 0.2504160404205322,
"learning_rate": 2.0270991144777554e-06,
"loss": 0.0722,
"step": 2850
},
{
"epoch": 6.500782472613459,
"grad_norm": 0.24718140065670013,
"learning_rate": 2.0342117429496073e-06,
"loss": 0.0736,
"step": 2860
},
{
"epoch": 6.523545312277706,
"grad_norm": 0.3116447627544403,
"learning_rate": 2.041324371421459e-06,
"loss": 0.073,
"step": 2870
},
{
"epoch": 6.546308151941955,
"grad_norm": 0.2456272393465042,
"learning_rate": 2.048436999893311e-06,
"loss": 0.0774,
"step": 2880
},
{
"epoch": 6.569070991606203,
"grad_norm": 0.2620035409927368,
"learning_rate": 2.0555496283651623e-06,
"loss": 0.075,
"step": 2890
},
{
"epoch": 6.591833831270451,
"grad_norm": 0.24615773558616638,
"learning_rate": 2.0626622568370143e-06,
"loss": 0.0718,
"step": 2900
},
{
"epoch": 6.614596670934699,
"grad_norm": 0.24443495273590088,
"learning_rate": 2.0697748853088662e-06,
"loss": 0.0727,
"step": 2910
},
{
"epoch": 6.637359510598947,
"grad_norm": 0.2751477062702179,
"learning_rate": 2.0768875137807178e-06,
"loss": 0.0735,
"step": 2920
},
{
"epoch": 6.660122350263196,
"grad_norm": 0.21635128557682037,
"learning_rate": 2.0840001422525697e-06,
"loss": 0.0726,
"step": 2930
},
{
"epoch": 6.682885189927443,
"grad_norm": 0.26079434156417847,
"learning_rate": 2.0911127707244213e-06,
"loss": 0.075,
"step": 2940
},
{
"epoch": 6.705648029591692,
"grad_norm": 0.2535637617111206,
"learning_rate": 2.098225399196273e-06,
"loss": 0.0728,
"step": 2950
},
{
"epoch": 6.7284108692559395,
"grad_norm": 0.28646010160446167,
"learning_rate": 2.1053380276681247e-06,
"loss": 0.0713,
"step": 2960
},
{
"epoch": 6.751173708920188,
"grad_norm": 0.24261872470378876,
"learning_rate": 2.1124506561399767e-06,
"loss": 0.0722,
"step": 2970
},
{
"epoch": 6.773936548584436,
"grad_norm": 0.3087233901023865,
"learning_rate": 2.1195632846118286e-06,
"loss": 0.0753,
"step": 2980
},
{
"epoch": 6.796699388248684,
"grad_norm": 0.26111656427383423,
"learning_rate": 2.12667591308368e-06,
"loss": 0.0733,
"step": 2990
},
{
"epoch": 6.819462227912932,
"grad_norm": 0.22123539447784424,
"learning_rate": 2.133788541555532e-06,
"loss": 0.0727,
"step": 3000
},
{
"epoch": 6.819462227912932,
"eval_loss": 0.04253983870148659,
"eval_runtime": 3.1522,
"eval_samples_per_second": 475.855,
"eval_steps_per_second": 7.614,
"eval_sts_dev_pearson_cosine": 0.8180287893647112,
"eval_sts_dev_pearson_dot": 0.7048827437986248,
"eval_sts_dev_pearson_euclidean": 0.7892330786682039,
"eval_sts_dev_pearson_manhattan": 0.7902606373541703,
"eval_sts_dev_pearson_max": 0.8180287893647112,
"eval_sts_dev_spearman_cosine": 0.8243152286673464,
"eval_sts_dev_spearman_dot": 0.6873609530160721,
"eval_sts_dev_spearman_euclidean": 0.7932823760168848,
"eval_sts_dev_spearman_manhattan": 0.7950721624458399,
"eval_sts_dev_spearman_max": 0.8243152286673464,
"step": 3000
},
{
"epoch": 6.84222506757718,
"grad_norm": 0.25969475507736206,
"learning_rate": 2.1409011700273836e-06,
"loss": 0.0729,
"step": 3010
},
{
"epoch": 6.864987907241428,
"grad_norm": 0.2635466158390045,
"learning_rate": 2.1480137984992356e-06,
"loss": 0.073,
"step": 3020
},
{
"epoch": 6.8877507469056765,
"grad_norm": 0.2308150678873062,
"learning_rate": 2.155126426971087e-06,
"loss": 0.0739,
"step": 3030
},
{
"epoch": 6.910513586569925,
"grad_norm": 0.28183940052986145,
"learning_rate": 2.162239055442939e-06,
"loss": 0.0717,
"step": 3040
},
{
"epoch": 6.933276426234173,
"grad_norm": 0.2507220506668091,
"learning_rate": 2.169351683914791e-06,
"loss": 0.0719,
"step": 3050
},
{
"epoch": 6.956039265898421,
"grad_norm": 0.24221095442771912,
"learning_rate": 2.1764643123866426e-06,
"loss": 0.0712,
"step": 3060
},
{
"epoch": 6.978802105562669,
"grad_norm": 0.2784756124019623,
"learning_rate": 2.1835769408584945e-06,
"loss": 0.0712,
"step": 3070
},
{
"epoch": 7.0,
"grad_norm": 0.146551251411438,
"learning_rate": 2.190689569330346e-06,
"loss": 0.0674,
"step": 3080
},
{
"epoch": 7.0227628396642485,
"grad_norm": 0.2597323954105377,
"learning_rate": 2.197802197802198e-06,
"loss": 0.0729,
"step": 3090
},
{
"epoch": 7.045525679328496,
"grad_norm": 0.23450802266597748,
"learning_rate": 2.2049148262740495e-06,
"loss": 0.0712,
"step": 3100
},
{
"epoch": 7.068288518992745,
"grad_norm": 0.25625136494636536,
"learning_rate": 2.2120274547459015e-06,
"loss": 0.0701,
"step": 3110
},
{
"epoch": 7.091051358656992,
"grad_norm": 0.24344876408576965,
"learning_rate": 2.2191400832177534e-06,
"loss": 0.0699,
"step": 3120
},
{
"epoch": 7.113814198321241,
"grad_norm": 0.26194125413894653,
"learning_rate": 2.2262527116896054e-06,
"loss": 0.0675,
"step": 3130
},
{
"epoch": 7.1365770379854885,
"grad_norm": 0.30849021673202515,
"learning_rate": 2.233365340161457e-06,
"loss": 0.0699,
"step": 3140
},
{
"epoch": 7.159339877649737,
"grad_norm": 0.2190207540988922,
"learning_rate": 2.2404779686333084e-06,
"loss": 0.0716,
"step": 3150
},
{
"epoch": 7.182102717313985,
"grad_norm": 0.24949057400226593,
"learning_rate": 2.2475905971051604e-06,
"loss": 0.0707,
"step": 3160
},
{
"epoch": 7.204865556978233,
"grad_norm": 0.3089491128921509,
"learning_rate": 2.254703225577012e-06,
"loss": 0.0717,
"step": 3170
},
{
"epoch": 7.227628396642481,
"grad_norm": 0.24911244213581085,
"learning_rate": 2.261815854048864e-06,
"loss": 0.0709,
"step": 3180
},
{
"epoch": 7.250391236306729,
"grad_norm": 0.24327076971530914,
"learning_rate": 2.268928482520716e-06,
"loss": 0.071,
"step": 3190
},
{
"epoch": 7.273154075970977,
"grad_norm": 0.26636022329330444,
"learning_rate": 2.2760411109925678e-06,
"loss": 0.0722,
"step": 3200
},
{
"epoch": 7.2959169156352255,
"grad_norm": 0.31015798449516296,
"learning_rate": 2.2831537394644193e-06,
"loss": 0.072,
"step": 3210
},
{
"epoch": 7.318679755299474,
"grad_norm": 0.238671213388443,
"learning_rate": 2.2902663679362712e-06,
"loss": 0.0729,
"step": 3220
},
{
"epoch": 7.341442594963722,
"grad_norm": 0.2293502241373062,
"learning_rate": 2.2973789964081228e-06,
"loss": 0.0678,
"step": 3230
},
{
"epoch": 7.36420543462797,
"grad_norm": 0.2628055810928345,
"learning_rate": 2.3044916248799747e-06,
"loss": 0.0705,
"step": 3240
},
{
"epoch": 7.386968274292218,
"grad_norm": 0.2914957106113434,
"learning_rate": 2.3116042533518262e-06,
"loss": 0.0715,
"step": 3250
},
{
"epoch": 7.386968274292218,
"eval_loss": 0.04259900003671646,
"eval_runtime": 3.1556,
"eval_samples_per_second": 475.348,
"eval_steps_per_second": 7.606,
"eval_sts_dev_pearson_cosine": 0.8190384244394546,
"eval_sts_dev_pearson_dot": 0.7052187544623498,
"eval_sts_dev_pearson_euclidean": 0.7905914669782148,
"eval_sts_dev_pearson_manhattan": 0.7915483272250368,
"eval_sts_dev_pearson_max": 0.8190384244394546,
"eval_sts_dev_spearman_cosine": 0.825567603424224,
"eval_sts_dev_spearman_dot": 0.6876636031299087,
"eval_sts_dev_spearman_euclidean": 0.7948957613160437,
"eval_sts_dev_spearman_manhattan": 0.7965685334149736,
"eval_sts_dev_spearman_max": 0.825567603424224,
"step": 3250
},
{
"epoch": 7.409731113956466,
"grad_norm": 0.2585102915763855,
"learning_rate": 2.318716881823678e-06,
"loss": 0.0703,
"step": 3260
},
{
"epoch": 7.432493953620714,
"grad_norm": 0.24229033291339874,
"learning_rate": 2.32582951029553e-06,
"loss": 0.0699,
"step": 3270
},
{
"epoch": 7.4552567932849625,
"grad_norm": 0.26663440465927124,
"learning_rate": 2.3329421387673817e-06,
"loss": 0.071,
"step": 3280
},
{
"epoch": 7.47801963294921,
"grad_norm": 0.28489962220191956,
"learning_rate": 2.3400547672392336e-06,
"loss": 0.0692,
"step": 3290
},
{
"epoch": 7.500782472613459,
"grad_norm": 0.3053910732269287,
"learning_rate": 2.347167395711085e-06,
"loss": 0.0693,
"step": 3300
},
{
"epoch": 7.523545312277706,
"grad_norm": 0.23338255286216736,
"learning_rate": 2.354280024182937e-06,
"loss": 0.0661,
"step": 3310
},
{
"epoch": 7.546308151941955,
"grad_norm": 0.4110086262226105,
"learning_rate": 2.3613926526547886e-06,
"loss": 0.0702,
"step": 3320
},
{
"epoch": 7.569070991606203,
"grad_norm": 0.23433230817317963,
"learning_rate": 2.3685052811266406e-06,
"loss": 0.0697,
"step": 3330
},
{
"epoch": 7.591833831270451,
"grad_norm": 0.26817795634269714,
"learning_rate": 2.3756179095984925e-06,
"loss": 0.072,
"step": 3340
},
{
"epoch": 7.614596670934699,
"grad_norm": 0.274307519197464,
"learning_rate": 2.382730538070344e-06,
"loss": 0.0693,
"step": 3350
},
{
"epoch": 7.637359510598947,
"grad_norm": 0.2531595230102539,
"learning_rate": 2.389843166542196e-06,
"loss": 0.0691,
"step": 3360
},
{
"epoch": 7.660122350263196,
"grad_norm": 0.23066122829914093,
"learning_rate": 2.3969557950140475e-06,
"loss": 0.0702,
"step": 3370
},
{
"epoch": 7.682885189927443,
"grad_norm": 0.26466798782348633,
"learning_rate": 2.4040684234858995e-06,
"loss": 0.0672,
"step": 3380
},
{
"epoch": 7.705648029591692,
"grad_norm": 0.27675947546958923,
"learning_rate": 2.411181051957751e-06,
"loss": 0.0698,
"step": 3390
},
{
"epoch": 7.7284108692559395,
"grad_norm": 0.2792898416519165,
"learning_rate": 2.418293680429603e-06,
"loss": 0.0687,
"step": 3400
},
{
"epoch": 7.751173708920188,
"grad_norm": 0.3003191351890564,
"learning_rate": 2.425406308901455e-06,
"loss": 0.0654,
"step": 3410
},
{
"epoch": 7.773936548584436,
"grad_norm": 0.2721976637840271,
"learning_rate": 2.4325189373733065e-06,
"loss": 0.0687,
"step": 3420
},
{
"epoch": 7.796699388248684,
"grad_norm": 0.23832115530967712,
"learning_rate": 2.4396315658451584e-06,
"loss": 0.0679,
"step": 3430
},
{
"epoch": 7.819462227912932,
"grad_norm": 0.2580774128437042,
"learning_rate": 2.44674419431701e-06,
"loss": 0.0713,
"step": 3440
},
{
"epoch": 7.84222506757718,
"grad_norm": 0.2566244900226593,
"learning_rate": 2.453856822788862e-06,
"loss": 0.0676,
"step": 3450
},
{
"epoch": 7.864987907241428,
"grad_norm": 0.22271563112735748,
"learning_rate": 2.4609694512607134e-06,
"loss": 0.0708,
"step": 3460
},
{
"epoch": 7.8877507469056765,
"grad_norm": 0.2326367348432541,
"learning_rate": 2.4680820797325654e-06,
"loss": 0.0666,
"step": 3470
},
{
"epoch": 7.910513586569925,
"grad_norm": 0.2301758974790573,
"learning_rate": 2.4751947082044173e-06,
"loss": 0.0675,
"step": 3480
},
{
"epoch": 7.933276426234173,
"grad_norm": 0.27631568908691406,
"learning_rate": 2.482307336676269e-06,
"loss": 0.0693,
"step": 3490
},
{
"epoch": 7.956039265898421,
"grad_norm": 0.2313879281282425,
"learning_rate": 2.489419965148121e-06,
"loss": 0.0688,
"step": 3500
},
{
"epoch": 7.956039265898421,
"eval_loss": 0.04267411306500435,
"eval_runtime": 3.1141,
"eval_samples_per_second": 481.677,
"eval_steps_per_second": 7.707,
"eval_sts_dev_pearson_cosine": 0.8198817503712781,
"eval_sts_dev_pearson_dot": 0.7039245984668114,
"eval_sts_dev_pearson_euclidean": 0.7917017633196954,
"eval_sts_dev_pearson_manhattan": 0.7925964604579336,
"eval_sts_dev_pearson_max": 0.8198817503712781,
"eval_sts_dev_spearman_cosine": 0.8259962866116397,
"eval_sts_dev_spearman_dot": 0.686457485674087,
"eval_sts_dev_spearman_euclidean": 0.7960724017615675,
"eval_sts_dev_spearman_manhattan": 0.7974479322296961,
"eval_sts_dev_spearman_max": 0.8259962866116397,
"step": 3500
},
{
"epoch": 7.978802105562669,
"grad_norm": 0.24964259564876556,
"learning_rate": 2.4965325936199723e-06,
"loss": 0.068,
"step": 3510
},
{
"epoch": 8.0,
"grad_norm": 0.16819854080677032,
"learning_rate": 2.5036452220918243e-06,
"loss": 0.063,
"step": 3520
},
{
"epoch": 8.022762839664248,
"grad_norm": 0.271306574344635,
"learning_rate": 2.510757850563676e-06,
"loss": 0.0659,
"step": 3530
},
{
"epoch": 8.045525679328497,
"grad_norm": 0.2654683291912079,
"learning_rate": 2.5178704790355278e-06,
"loss": 0.0639,
"step": 3540
},
{
"epoch": 8.068288518992745,
"grad_norm": 0.2875867784023285,
"learning_rate": 2.5249831075073793e-06,
"loss": 0.0678,
"step": 3550
},
{
"epoch": 8.091051358656992,
"grad_norm": 0.24303793907165527,
"learning_rate": 2.5320957359792312e-06,
"loss": 0.0689,
"step": 3560
},
{
"epoch": 8.11381419832124,
"grad_norm": 0.2647855281829834,
"learning_rate": 2.5392083644510828e-06,
"loss": 0.0687,
"step": 3570
},
{
"epoch": 8.13657703798549,
"grad_norm": 0.24669058620929718,
"learning_rate": 2.5463209929229347e-06,
"loss": 0.0672,
"step": 3580
},
{
"epoch": 8.159339877649737,
"grad_norm": 0.2413569688796997,
"learning_rate": 2.553433621394787e-06,
"loss": 0.0659,
"step": 3590
},
{
"epoch": 8.182102717313985,
"grad_norm": 0.24042648077011108,
"learning_rate": 2.560546249866638e-06,
"loss": 0.0658,
"step": 3600
},
{
"epoch": 8.204865556978232,
"grad_norm": 0.2669181525707245,
"learning_rate": 2.5676588783384906e-06,
"loss": 0.0664,
"step": 3610
},
{
"epoch": 8.227628396642482,
"grad_norm": 0.24161921441555023,
"learning_rate": 2.5747715068103417e-06,
"loss": 0.0659,
"step": 3620
},
{
"epoch": 8.25039123630673,
"grad_norm": 0.269060879945755,
"learning_rate": 2.5818841352821936e-06,
"loss": 0.0664,
"step": 3630
},
{
"epoch": 8.273154075970977,
"grad_norm": 0.27089112997055054,
"learning_rate": 2.588996763754045e-06,
"loss": 0.0652,
"step": 3640
},
{
"epoch": 8.295916915635225,
"grad_norm": 0.2416848987340927,
"learning_rate": 2.596109392225897e-06,
"loss": 0.0683,
"step": 3650
},
{
"epoch": 8.318679755299474,
"grad_norm": 0.21763497591018677,
"learning_rate": 2.6032220206977495e-06,
"loss": 0.0641,
"step": 3660
},
{
"epoch": 8.341442594963722,
"grad_norm": 0.23553740978240967,
"learning_rate": 2.6103346491696006e-06,
"loss": 0.0672,
"step": 3670
},
{
"epoch": 8.36420543462797,
"grad_norm": 0.22709369659423828,
"learning_rate": 2.617447277641453e-06,
"loss": 0.0655,
"step": 3680
},
{
"epoch": 8.386968274292219,
"grad_norm": 0.22554650902748108,
"learning_rate": 2.624559906113304e-06,
"loss": 0.0661,
"step": 3690
},
{
"epoch": 8.409731113956466,
"grad_norm": 0.22940964996814728,
"learning_rate": 2.6316725345851564e-06,
"loss": 0.0638,
"step": 3700
},
{
"epoch": 8.432493953620714,
"grad_norm": 0.22286923229694366,
"learning_rate": 2.6387851630570076e-06,
"loss": 0.0675,
"step": 3710
},
{
"epoch": 8.455256793284962,
"grad_norm": 0.2673478126525879,
"learning_rate": 2.64589779152886e-06,
"loss": 0.0648,
"step": 3720
},
{
"epoch": 8.478019632949211,
"grad_norm": 0.3129027485847473,
"learning_rate": 2.653010420000712e-06,
"loss": 0.067,
"step": 3730
},
{
"epoch": 8.500782472613459,
"grad_norm": 0.2747926712036133,
"learning_rate": 2.6601230484725634e-06,
"loss": 0.0684,
"step": 3740
},
{
"epoch": 8.523545312277706,
"grad_norm": 0.24347588419914246,
"learning_rate": 2.6672356769444154e-06,
"loss": 0.0667,
"step": 3750
},
{
"epoch": 8.523545312277706,
"eval_loss": 0.04199772700667381,
"eval_runtime": 3.2359,
"eval_samples_per_second": 463.553,
"eval_steps_per_second": 7.417,
"eval_sts_dev_pearson_cosine": 0.8204944023566688,
"eval_sts_dev_pearson_dot": 0.7040977870407604,
"eval_sts_dev_pearson_euclidean": 0.7919081914053657,
"eval_sts_dev_pearson_manhattan": 0.7927432790586826,
"eval_sts_dev_pearson_max": 0.8204944023566688,
"eval_sts_dev_spearman_cosine": 0.8268027687748516,
"eval_sts_dev_spearman_dot": 0.6869146800512241,
"eval_sts_dev_spearman_euclidean": 0.7966713636345194,
"eval_sts_dev_spearman_manhattan": 0.798229350269622,
"eval_sts_dev_spearman_max": 0.8268027687748516,
"step": 3750
},
{
"epoch": 8.546308151941954,
"grad_norm": 0.24286551773548126,
"learning_rate": 2.6743483054162665e-06,
"loss": 0.0645,
"step": 3760
},
{
"epoch": 8.569070991606203,
"grad_norm": 0.25422462821006775,
"learning_rate": 2.681460933888119e-06,
"loss": 0.0652,
"step": 3770
},
{
"epoch": 8.591833831270451,
"grad_norm": 0.22316358983516693,
"learning_rate": 2.68857356235997e-06,
"loss": 0.0633,
"step": 3780
},
{
"epoch": 8.614596670934699,
"grad_norm": 0.2212369740009308,
"learning_rate": 2.6956861908318223e-06,
"loss": 0.065,
"step": 3790
},
{
"epoch": 8.637359510598948,
"grad_norm": 0.24087338149547577,
"learning_rate": 2.7027988193036743e-06,
"loss": 0.064,
"step": 3800
},
{
"epoch": 8.660122350263196,
"grad_norm": 0.3741282522678375,
"learning_rate": 2.709911447775526e-06,
"loss": 0.0677,
"step": 3810
},
{
"epoch": 8.682885189927443,
"grad_norm": 0.3055291771888733,
"learning_rate": 2.7170240762473777e-06,
"loss": 0.0661,
"step": 3820
},
{
"epoch": 8.705648029591691,
"grad_norm": 0.25181838870048523,
"learning_rate": 2.7241367047192293e-06,
"loss": 0.0653,
"step": 3830
},
{
"epoch": 8.72841086925594,
"grad_norm": 0.21388790011405945,
"learning_rate": 2.7312493331910812e-06,
"loss": 0.0625,
"step": 3840
},
{
"epoch": 8.751173708920188,
"grad_norm": 0.23756906390190125,
"learning_rate": 2.7383619616629328e-06,
"loss": 0.0651,
"step": 3850
},
{
"epoch": 8.773936548584436,
"grad_norm": 0.2743723690509796,
"learning_rate": 2.7454745901347847e-06,
"loss": 0.0656,
"step": 3860
},
{
"epoch": 8.796699388248683,
"grad_norm": 0.25479790568351746,
"learning_rate": 2.7525872186066367e-06,
"loss": 0.0636,
"step": 3870
},
{
"epoch": 8.819462227912933,
"grad_norm": 0.23488923907279968,
"learning_rate": 2.759699847078488e-06,
"loss": 0.0655,
"step": 3880
},
{
"epoch": 8.84222506757718,
"grad_norm": 0.303478866815567,
"learning_rate": 2.76681247555034e-06,
"loss": 0.0647,
"step": 3890
},
{
"epoch": 8.864987907241428,
"grad_norm": 0.22421741485595703,
"learning_rate": 2.7739251040221917e-06,
"loss": 0.0638,
"step": 3900
},
{
"epoch": 8.887750746905677,
"grad_norm": 0.2436189502477646,
"learning_rate": 2.7810377324940436e-06,
"loss": 0.0636,
"step": 3910
},
{
"epoch": 8.910513586569925,
"grad_norm": 0.35646867752075195,
"learning_rate": 2.788150360965895e-06,
"loss": 0.0666,
"step": 3920
},
{
"epoch": 8.933276426234173,
"grad_norm": 0.2209872603416443,
"learning_rate": 2.795262989437747e-06,
"loss": 0.062,
"step": 3930
},
{
"epoch": 8.95603926589842,
"grad_norm": 0.25649750232696533,
"learning_rate": 2.8023756179095986e-06,
"loss": 0.065,
"step": 3940
},
{
"epoch": 8.97880210556267,
"grad_norm": 0.24208350479602814,
"learning_rate": 2.8094882463814506e-06,
"loss": 0.0643,
"step": 3950
},
{
"epoch": 9.0,
"grad_norm": 0.11922793090343475,
"learning_rate": 2.8166008748533025e-06,
"loss": 0.0594,
"step": 3960
},
{
"epoch": 9.022762839664248,
"grad_norm": 0.2234543114900589,
"learning_rate": 2.823713503325154e-06,
"loss": 0.0616,
"step": 3970
},
{
"epoch": 9.045525679328497,
"grad_norm": 0.21580451726913452,
"learning_rate": 2.830826131797006e-06,
"loss": 0.0638,
"step": 3980
},
{
"epoch": 9.068288518992745,
"grad_norm": 0.2051229476928711,
"learning_rate": 2.8379387602688575e-06,
"loss": 0.0625,
"step": 3990
},
{
"epoch": 9.091051358656992,
"grad_norm": 0.2237127125263214,
"learning_rate": 2.8450513887407095e-06,
"loss": 0.0665,
"step": 4000
},
{
"epoch": 9.091051358656992,
"eval_loss": 0.041355252265930176,
"eval_runtime": 3.1834,
"eval_samples_per_second": 471.199,
"eval_steps_per_second": 7.539,
"eval_sts_dev_pearson_cosine": 0.8220285778407846,
"eval_sts_dev_pearson_dot": 0.7022254885739367,
"eval_sts_dev_pearson_euclidean": 0.7933532583617332,
"eval_sts_dev_pearson_manhattan": 0.7941338912825391,
"eval_sts_dev_pearson_max": 0.8220285778407846,
"eval_sts_dev_spearman_cosine": 0.8276471334482826,
"eval_sts_dev_spearman_dot": 0.6857559655167198,
"eval_sts_dev_spearman_euclidean": 0.7981249234213611,
"eval_sts_dev_spearman_manhattan": 0.7997185742063436,
"eval_sts_dev_spearman_max": 0.8276471334482826,
"step": 4000
},
{
"epoch": 9.11381419832124,
"grad_norm": 0.23453885316848755,
"learning_rate": 2.852164017212561e-06,
"loss": 0.0624,
"step": 4010
},
{
"epoch": 9.13657703798549,
"grad_norm": 0.22881363332271576,
"learning_rate": 2.859276645684413e-06,
"loss": 0.0621,
"step": 4020
},
{
"epoch": 9.159339877649737,
"grad_norm": 0.21634767949581146,
"learning_rate": 2.866389274156265e-06,
"loss": 0.0648,
"step": 4030
},
{
"epoch": 9.182102717313985,
"grad_norm": 0.2653968334197998,
"learning_rate": 2.8735019026281164e-06,
"loss": 0.0622,
"step": 4040
},
{
"epoch": 9.204865556978232,
"grad_norm": 0.2806706726551056,
"learning_rate": 2.8806145310999684e-06,
"loss": 0.0635,
"step": 4050
},
{
"epoch": 9.227628396642482,
"grad_norm": 0.25029635429382324,
"learning_rate": 2.88772715957182e-06,
"loss": 0.061,
"step": 4060
},
{
"epoch": 9.25039123630673,
"grad_norm": 0.24983397126197815,
"learning_rate": 2.894839788043672e-06,
"loss": 0.0602,
"step": 4070
},
{
"epoch": 9.273154075970977,
"grad_norm": 0.21316730976104736,
"learning_rate": 2.9019524165155234e-06,
"loss": 0.0613,
"step": 4080
},
{
"epoch": 9.295916915635225,
"grad_norm": 0.21870028972625732,
"learning_rate": 2.9090650449873754e-06,
"loss": 0.0604,
"step": 4090
},
{
"epoch": 9.318679755299474,
"grad_norm": 0.21702495217323303,
"learning_rate": 2.9161776734592273e-06,
"loss": 0.0623,
"step": 4100
},
{
"epoch": 9.341442594963722,
"grad_norm": 0.22777798771858215,
"learning_rate": 2.923290301931079e-06,
"loss": 0.0641,
"step": 4110
},
{
"epoch": 9.36420543462797,
"grad_norm": 0.2656283378601074,
"learning_rate": 2.930402930402931e-06,
"loss": 0.0635,
"step": 4120
},
{
"epoch": 9.386968274292219,
"grad_norm": 0.23527038097381592,
"learning_rate": 2.9375155588747823e-06,
"loss": 0.0608,
"step": 4130
},
{
"epoch": 9.409731113956466,
"grad_norm": 0.21856476366519928,
"learning_rate": 2.9446281873466343e-06,
"loss": 0.0611,
"step": 4140
},
{
"epoch": 9.432493953620714,
"grad_norm": 0.23688729107379913,
"learning_rate": 2.951740815818486e-06,
"loss": 0.0607,
"step": 4150
},
{
"epoch": 9.455256793284962,
"grad_norm": 0.26457446813583374,
"learning_rate": 2.9588534442903377e-06,
"loss": 0.0631,
"step": 4160
},
{
"epoch": 9.478019632949211,
"grad_norm": 0.31578782200813293,
"learning_rate": 2.9659660727621897e-06,
"loss": 0.0618,
"step": 4170
},
{
"epoch": 9.500782472613459,
"grad_norm": 0.23187491297721863,
"learning_rate": 2.9730787012340412e-06,
"loss": 0.0609,
"step": 4180
},
{
"epoch": 9.523545312277706,
"grad_norm": 0.24577929079532623,
"learning_rate": 2.980191329705893e-06,
"loss": 0.0613,
"step": 4190
},
{
"epoch": 9.546308151941954,
"grad_norm": 0.23201169073581696,
"learning_rate": 2.9873039581777447e-06,
"loss": 0.0606,
"step": 4200
},
{
"epoch": 9.569070991606203,
"grad_norm": 0.2860512137413025,
"learning_rate": 2.9944165866495967e-06,
"loss": 0.0595,
"step": 4210
},
{
"epoch": 9.591833831270451,
"grad_norm": 0.237753763794899,
"learning_rate": 3.001529215121448e-06,
"loss": 0.0609,
"step": 4220
},
{
"epoch": 9.614596670934699,
"grad_norm": 0.23422682285308838,
"learning_rate": 3.0086418435933e-06,
"loss": 0.061,
"step": 4230
},
{
"epoch": 9.637359510598948,
"grad_norm": 0.2497267723083496,
"learning_rate": 3.015754472065152e-06,
"loss": 0.0616,
"step": 4240
},
{
"epoch": 9.660122350263196,
"grad_norm": 0.2505936622619629,
"learning_rate": 3.0228671005370036e-06,
"loss": 0.0613,
"step": 4250
},
{
"epoch": 9.660122350263196,
"eval_loss": 0.04175787419080734,
"eval_runtime": 3.1427,
"eval_samples_per_second": 477.3,
"eval_steps_per_second": 7.637,
"eval_sts_dev_pearson_cosine": 0.8220874775898197,
"eval_sts_dev_pearson_dot": 0.7010536213435227,
"eval_sts_dev_pearson_euclidean": 0.7929031352092236,
"eval_sts_dev_pearson_manhattan": 0.7936882861676204,
"eval_sts_dev_pearson_max": 0.8220874775898197,
"eval_sts_dev_spearman_cosine": 0.8282368218808581,
"eval_sts_dev_spearman_dot": 0.6844746263331734,
"eval_sts_dev_spearman_euclidean": 0.7979913252239026,
"eval_sts_dev_spearman_manhattan": 0.7996541111809876,
"eval_sts_dev_spearman_max": 0.8282368218808581,
"step": 4250
},
{
"epoch": 9.682885189927443,
"grad_norm": 0.2565889060497284,
"learning_rate": 3.0299797290088556e-06,
"loss": 0.0623,
"step": 4260
},
{
"epoch": 9.705648029591691,
"grad_norm": 0.2263515293598175,
"learning_rate": 3.037092357480707e-06,
"loss": 0.0605,
"step": 4270
},
{
"epoch": 9.72841086925594,
"grad_norm": 0.21705535054206848,
"learning_rate": 3.044204985952559e-06,
"loss": 0.0637,
"step": 4280
},
{
"epoch": 9.751173708920188,
"grad_norm": 0.21649038791656494,
"learning_rate": 3.0513176144244106e-06,
"loss": 0.0604,
"step": 4290
},
{
"epoch": 9.773936548584436,
"grad_norm": 0.22717022895812988,
"learning_rate": 3.0584302428962625e-06,
"loss": 0.0606,
"step": 4300
},
{
"epoch": 9.796699388248683,
"grad_norm": 0.23610946536064148,
"learning_rate": 3.0655428713681145e-06,
"loss": 0.0622,
"step": 4310
},
{
"epoch": 9.819462227912933,
"grad_norm": 0.2080880105495453,
"learning_rate": 3.072655499839966e-06,
"loss": 0.0598,
"step": 4320
},
{
"epoch": 9.84222506757718,
"grad_norm": 0.2862449884414673,
"learning_rate": 3.079768128311818e-06,
"loss": 0.0611,
"step": 4330
},
{
"epoch": 9.864987907241428,
"grad_norm": 0.2211073935031891,
"learning_rate": 3.0868807567836695e-06,
"loss": 0.0604,
"step": 4340
},
{
"epoch": 9.887750746905677,
"grad_norm": 0.2399899959564209,
"learning_rate": 3.0939933852555214e-06,
"loss": 0.0598,
"step": 4350
},
{
"epoch": 9.910513586569925,
"grad_norm": 0.2330579161643982,
"learning_rate": 3.101106013727373e-06,
"loss": 0.0626,
"step": 4360
},
{
"epoch": 9.933276426234173,
"grad_norm": 0.23163940012454987,
"learning_rate": 3.108218642199225e-06,
"loss": 0.0624,
"step": 4370
},
{
"epoch": 9.95603926589842,
"grad_norm": 0.2087012380361557,
"learning_rate": 3.115331270671077e-06,
"loss": 0.0617,
"step": 4380
},
{
"epoch": 9.97880210556267,
"grad_norm": 0.24286577105522156,
"learning_rate": 3.1224438991429284e-06,
"loss": 0.0603,
"step": 4390
}
],
"logging_steps": 10,
"max_steps": 4390,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 250,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}