{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.751640112464855,
"eval_steps": 250,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01874414245548266,
"grad_norm": 1.0379021167755127,
"learning_rate": 0.0007999941425412586,
"loss": 0.6752,
"step": 10
},
{
"epoch": 0.03748828491096532,
"grad_norm": 0.779513955116272,
"learning_rate": 0.000799988285082517,
"loss": 0.3775,
"step": 20
},
{
"epoch": 0.056232427366447985,
"grad_norm": 0.7819342017173767,
"learning_rate": 0.0007999824276237754,
"loss": 0.3341,
"step": 30
},
{
"epoch": 0.07497656982193064,
"grad_norm": 0.8172865509986877,
"learning_rate": 0.0007999765701650339,
"loss": 0.3026,
"step": 40
},
{
"epoch": 0.09372071227741331,
"grad_norm": 0.6789480447769165,
"learning_rate": 0.0007999707127062924,
"loss": 0.3091,
"step": 50
},
{
"epoch": 0.11246485473289597,
"grad_norm": 0.6779003143310547,
"learning_rate": 0.0007999648552475509,
"loss": 0.2934,
"step": 60
},
{
"epoch": 0.13120899718837864,
"grad_norm": 0.5085116028785706,
"learning_rate": 0.0007999589977888094,
"loss": 0.2856,
"step": 70
},
{
"epoch": 0.14995313964386128,
"grad_norm": 0.5592414140701294,
"learning_rate": 0.0007999531403300678,
"loss": 0.2736,
"step": 80
},
{
"epoch": 0.16869728209934395,
"grad_norm": 0.5976341366767883,
"learning_rate": 0.0007999472828713263,
"loss": 0.2938,
"step": 90
},
{
"epoch": 0.18744142455482662,
"grad_norm": 0.5462539196014404,
"learning_rate": 0.0007999414254125848,
"loss": 0.2597,
"step": 100
},
{
"epoch": 0.20618556701030927,
"grad_norm": 0.5744641423225403,
"learning_rate": 0.0007999355679538432,
"loss": 0.2784,
"step": 110
},
{
"epoch": 0.22492970946579194,
"grad_norm": 0.6287326812744141,
"learning_rate": 0.0007999297104951018,
"loss": 0.2616,
"step": 120
},
{
"epoch": 0.2436738519212746,
"grad_norm": 0.5516992211341858,
"learning_rate": 0.0007999238530363602,
"loss": 0.2668,
"step": 130
},
{
"epoch": 0.2624179943767573,
"grad_norm": 0.5527953505516052,
"learning_rate": 0.0007999179955776188,
"loss": 0.2623,
"step": 140
},
{
"epoch": 0.28116213683223995,
"grad_norm": 0.5433118343353271,
"learning_rate": 0.0007999121381188772,
"loss": 0.2692,
"step": 150
},
{
"epoch": 0.29990627928772257,
"grad_norm": 0.5417677760124207,
"learning_rate": 0.0007999062806601356,
"loss": 0.2435,
"step": 160
},
{
"epoch": 0.31865042174320524,
"grad_norm": 0.523895800113678,
"learning_rate": 0.0007999004232013942,
"loss": 0.2459,
"step": 170
},
{
"epoch": 0.3373945641986879,
"grad_norm": 0.7117316126823425,
"learning_rate": 0.0007998945657426526,
"loss": 0.2638,
"step": 180
},
{
"epoch": 0.3561387066541706,
"grad_norm": 0.777367353439331,
"learning_rate": 0.0007998887082839111,
"loss": 0.2423,
"step": 190
},
{
"epoch": 0.37488284910965325,
"grad_norm": 0.6237531304359436,
"learning_rate": 0.0007998828508251696,
"loss": 0.2446,
"step": 200
},
{
"epoch": 0.3936269915651359,
"grad_norm": 0.47088104486465454,
"learning_rate": 0.000799876993366428,
"loss": 0.2556,
"step": 210
},
{
"epoch": 0.41237113402061853,
"grad_norm": 0.48523184657096863,
"learning_rate": 0.0007998711359076864,
"loss": 0.2361,
"step": 220
},
{
"epoch": 0.4311152764761012,
"grad_norm": 0.5119248628616333,
"learning_rate": 0.000799865278448945,
"loss": 0.2401,
"step": 230
},
{
"epoch": 0.4498594189315839,
"grad_norm": 0.4833837151527405,
"learning_rate": 0.0007998594209902034,
"loss": 0.2412,
"step": 240
},
{
"epoch": 0.46860356138706655,
"grad_norm": 0.5014116764068604,
"learning_rate": 0.000799853563531462,
"loss": 0.2475,
"step": 250
},
{
"epoch": 0.46860356138706655,
"eval_loss": 0.11557532846927643,
"eval_pearson_cosine": 0.7298070192337036,
"eval_pearson_dot": 0.7093910574913025,
"eval_pearson_euclidean": 0.7365932464599609,
"eval_pearson_manhattan": 0.7382453680038452,
"eval_runtime": 27.2701,
"eval_samples_per_second": 55.005,
"eval_spearman_cosine": 0.7577597198343433,
"eval_spearman_dot": 0.7151991550847255,
"eval_spearman_euclidean": 0.7434473510612767,
"eval_spearman_manhattan": 0.747354805702794,
"eval_steps_per_second": 6.894,
"step": 250
},
{
"epoch": 0.4873477038425492,
"grad_norm": 0.4523856043815613,
"learning_rate": 0.0007998477060727204,
"loss": 0.2261,
"step": 260
},
{
"epoch": 0.5060918462980318,
"grad_norm": 0.5232961177825928,
"learning_rate": 0.0007998418486139788,
"loss": 0.2473,
"step": 270
},
{
"epoch": 0.5248359887535146,
"grad_norm": 0.5113686323165894,
"learning_rate": 0.0007998359911552374,
"loss": 0.2354,
"step": 280
},
{
"epoch": 0.5435801312089972,
"grad_norm": 0.458387166261673,
"learning_rate": 0.0007998301336964958,
"loss": 0.2568,
"step": 290
},
{
"epoch": 0.5623242736644799,
"grad_norm": 0.45359304547309875,
"learning_rate": 0.0007998242762377543,
"loss": 0.2415,
"step": 300
},
{
"epoch": 0.5810684161199625,
"grad_norm": 0.46073561906814575,
"learning_rate": 0.0007998184187790128,
"loss": 0.2291,
"step": 310
},
{
"epoch": 0.5998125585754451,
"grad_norm": 0.4503585696220398,
"learning_rate": 0.0007998125613202712,
"loss": 0.2368,
"step": 320
},
{
"epoch": 0.6185567010309279,
"grad_norm": 0.4221174716949463,
"learning_rate": 0.0007998067038615297,
"loss": 0.2322,
"step": 330
},
{
"epoch": 0.6373008434864105,
"grad_norm": 0.42522430419921875,
"learning_rate": 0.0007998008464027882,
"loss": 0.241,
"step": 340
},
{
"epoch": 0.6560449859418932,
"grad_norm": 0.47986653447151184,
"learning_rate": 0.0007997949889440467,
"loss": 0.2252,
"step": 350
},
{
"epoch": 0.6747891283973758,
"grad_norm": 0.6221345067024231,
"learning_rate": 0.0007997891314853051,
"loss": 0.2418,
"step": 360
},
{
"epoch": 0.6935332708528584,
"grad_norm": 0.4737911820411682,
"learning_rate": 0.0007997832740265637,
"loss": 0.2232,
"step": 370
},
{
"epoch": 0.7122774133083412,
"grad_norm": 0.47973355650901794,
"learning_rate": 0.000799777416567822,
"loss": 0.227,
"step": 380
},
{
"epoch": 0.7310215557638238,
"grad_norm": 0.4451119005680084,
"learning_rate": 0.0007997715591090806,
"loss": 0.2206,
"step": 390
},
{
"epoch": 0.7497656982193065,
"grad_norm": 0.4816949963569641,
"learning_rate": 0.0007997657016503391,
"loss": 0.221,
"step": 400
},
{
"epoch": 0.7685098406747891,
"grad_norm": 0.44739213585853577,
"learning_rate": 0.0007997598441915975,
"loss": 0.226,
"step": 410
},
{
"epoch": 0.7872539831302718,
"grad_norm": 0.4036339521408081,
"learning_rate": 0.0007997539867328561,
"loss": 0.2359,
"step": 420
},
{
"epoch": 0.8059981255857545,
"grad_norm": 0.4639866054058075,
"learning_rate": 0.0007997481292741144,
"loss": 0.2251,
"step": 430
},
{
"epoch": 0.8247422680412371,
"grad_norm": 0.4569236636161804,
"learning_rate": 0.0007997422718153729,
"loss": 0.2337,
"step": 440
},
{
"epoch": 0.8434864104967198,
"grad_norm": 0.37712669372558594,
"learning_rate": 0.0007997364143566314,
"loss": 0.2009,
"step": 450
},
{
"epoch": 0.8622305529522024,
"grad_norm": 0.3660425543785095,
"learning_rate": 0.0007997305568978899,
"loss": 0.2217,
"step": 460
},
{
"epoch": 0.8809746954076851,
"grad_norm": 0.37786588072776794,
"learning_rate": 0.0007997246994391483,
"loss": 0.2256,
"step": 470
},
{
"epoch": 0.8997188378631678,
"grad_norm": 0.34985071420669556,
"learning_rate": 0.0007997188419804069,
"loss": 0.2137,
"step": 480
},
{
"epoch": 0.9184629803186504,
"grad_norm": 0.5390796661376953,
"learning_rate": 0.0007997129845216653,
"loss": 0.2164,
"step": 490
},
{
"epoch": 0.9372071227741331,
"grad_norm": 0.45559704303741455,
"learning_rate": 0.0007997071270629237,
"loss": 0.2267,
"step": 500
},
{
"epoch": 0.9372071227741331,
"eval_loss": 0.120799720287323,
"eval_pearson_cosine": 0.739181399345398,
"eval_pearson_dot": 0.7062755823135376,
"eval_pearson_euclidean": 0.7443870306015015,
"eval_pearson_manhattan": 0.7475869655609131,
"eval_runtime": 27.6267,
"eval_samples_per_second": 54.295,
"eval_spearman_cosine": 0.7615568395698803,
"eval_spearman_dot": 0.7105088137905408,
"eval_spearman_euclidean": 0.7483334786946333,
"eval_spearman_manhattan": 0.753415014125672,
"eval_steps_per_second": 6.805,
"step": 500
},
{
"epoch": 0.9559512652296157,
"grad_norm": 0.4264763593673706,
"learning_rate": 0.0007997012696041823,
"loss": 0.1925,
"step": 510
},
{
"epoch": 0.9746954076850984,
"grad_norm": 0.3580343425273895,
"learning_rate": 0.0007996954121454407,
"loss": 0.2208,
"step": 520
},
{
"epoch": 0.993439550140581,
"grad_norm": 0.3899773359298706,
"learning_rate": 0.0007996895546866993,
"loss": 0.2219,
"step": 530
},
{
"epoch": 1.013120899718838,
"grad_norm": 0.34510987997055054,
"learning_rate": 0.0007996836972279577,
"loss": 0.1681,
"step": 540
},
{
"epoch": 1.0318650421743205,
"grad_norm": 0.52265465259552,
"learning_rate": 0.0007996778397692161,
"loss": 0.1328,
"step": 550
},
{
"epoch": 1.0506091846298031,
"grad_norm": 0.39699018001556396,
"learning_rate": 0.0007996719823104747,
"loss": 0.146,
"step": 560
},
{
"epoch": 1.069353327085286,
"grad_norm": 0.3806276023387909,
"learning_rate": 0.0007996661248517331,
"loss": 0.1394,
"step": 570
},
{
"epoch": 1.0880974695407686,
"grad_norm": 0.38312238454818726,
"learning_rate": 0.0007996602673929916,
"loss": 0.1296,
"step": 580
},
{
"epoch": 1.1068416119962512,
"grad_norm": 0.40978288650512695,
"learning_rate": 0.0007996544099342501,
"loss": 0.1328,
"step": 590
},
{
"epoch": 1.1255857544517338,
"grad_norm": 0.36600926518440247,
"learning_rate": 0.0007996485524755086,
"loss": 0.1403,
"step": 600
},
{
"epoch": 1.1443298969072164,
"grad_norm": 0.44099515676498413,
"learning_rate": 0.0007996426950167669,
"loss": 0.133,
"step": 610
},
{
"epoch": 1.1630740393626993,
"grad_norm": 0.3496938645839691,
"learning_rate": 0.0007996368375580255,
"loss": 0.1334,
"step": 620
},
{
"epoch": 1.1818181818181819,
"grad_norm": 0.3754808306694031,
"learning_rate": 0.0007996309800992839,
"loss": 0.1417,
"step": 630
},
{
"epoch": 1.2005623242736645,
"grad_norm": 0.45285704731941223,
"learning_rate": 0.0007996251226405425,
"loss": 0.1525,
"step": 640
},
{
"epoch": 1.219306466729147,
"grad_norm": 0.4499760866165161,
"learning_rate": 0.000799619265181801,
"loss": 0.1432,
"step": 650
},
{
"epoch": 1.2380506091846297,
"grad_norm": 0.3310897648334503,
"learning_rate": 0.0007996134077230593,
"loss": 0.1392,
"step": 660
},
{
"epoch": 1.2567947516401126,
"grad_norm": 0.32931098341941833,
"learning_rate": 0.0007996075502643179,
"loss": 0.1484,
"step": 670
},
{
"epoch": 1.2755388940955952,
"grad_norm": 0.32448434829711914,
"learning_rate": 0.0007996016928055763,
"loss": 0.1464,
"step": 680
},
{
"epoch": 1.2942830365510778,
"grad_norm": 0.38759011030197144,
"learning_rate": 0.0007995958353468348,
"loss": 0.1412,
"step": 690
},
{
"epoch": 1.3130271790065604,
"grad_norm": 0.29216083884239197,
"learning_rate": 0.0007995899778880933,
"loss": 0.1381,
"step": 700
},
{
"epoch": 1.331771321462043,
"grad_norm": 0.36174365878105164,
"learning_rate": 0.0007995841204293518,
"loss": 0.1429,
"step": 710
},
{
"epoch": 1.3505154639175259,
"grad_norm": 0.3533223569393158,
"learning_rate": 0.0007995782629706102,
"loss": 0.1349,
"step": 720
},
{
"epoch": 1.3692596063730085,
"grad_norm": 0.34500110149383545,
"learning_rate": 0.0007995724055118687,
"loss": 0.1445,
"step": 730
},
{
"epoch": 1.388003748828491,
"grad_norm": 0.3347356617450714,
"learning_rate": 0.0007995665480531272,
"loss": 0.1508,
"step": 740
},
{
"epoch": 1.4067478912839737,
"grad_norm": 0.5160906910896301,
"learning_rate": 0.0007995606905943857,
"loss": 0.156,
"step": 750
},
{
"epoch": 1.4067478912839737,
"eval_loss": 0.07170082628726959,
"eval_pearson_cosine": 0.7533469200134277,
"eval_pearson_dot": 0.7396403551101685,
"eval_pearson_euclidean": 0.7499958276748657,
"eval_pearson_manhattan": 0.7558424472808838,
"eval_runtime": 27.4077,
"eval_samples_per_second": 54.729,
"eval_spearman_cosine": 0.7671397694369135,
"eval_spearman_dot": 0.7444267819763823,
"eval_spearman_euclidean": 0.753108908424924,
"eval_spearman_manhattan": 0.7608183984789815,
"eval_steps_per_second": 6.859,
"step": 750
},
{
"epoch": 1.4254920337394563,
"grad_norm": 0.3751141428947449,
"learning_rate": 0.0007995548331356442,
"loss": 0.1606,
"step": 760
},
{
"epoch": 1.4442361761949392,
"grad_norm": 0.3998653292655945,
"learning_rate": 0.0007995489756769026,
"loss": 0.1447,
"step": 770
},
{
"epoch": 1.4629803186504218,
"grad_norm": 0.32710304856300354,
"learning_rate": 0.0007995431182181611,
"loss": 0.1381,
"step": 780
},
{
"epoch": 1.4817244611059044,
"grad_norm": 0.3845181167125702,
"learning_rate": 0.0007995372607594195,
"loss": 0.1488,
"step": 790
},
{
"epoch": 1.5004686035613872,
"grad_norm": 0.39582550525665283,
"learning_rate": 0.000799531403300678,
"loss": 0.1546,
"step": 800
},
{
"epoch": 1.5192127460168696,
"grad_norm": 0.38061007857322693,
"learning_rate": 0.0007995255458419366,
"loss": 0.1403,
"step": 810
},
{
"epoch": 1.5379568884723525,
"grad_norm": 0.4833431839942932,
"learning_rate": 0.000799519688383195,
"loss": 0.1582,
"step": 820
},
{
"epoch": 1.556701030927835,
"grad_norm": 0.36174824833869934,
"learning_rate": 0.0007995138309244534,
"loss": 0.1561,
"step": 830
},
{
"epoch": 1.5754451733833177,
"grad_norm": 0.4403337836265564,
"learning_rate": 0.0007995079734657119,
"loss": 0.1472,
"step": 840
},
{
"epoch": 1.5941893158388005,
"grad_norm": 0.384498655796051,
"learning_rate": 0.0007995021160069704,
"loss": 0.1458,
"step": 850
},
{
"epoch": 1.612933458294283,
"grad_norm": 0.29897651076316833,
"learning_rate": 0.0007994962585482289,
"loss": 0.1528,
"step": 860
},
{
"epoch": 1.6316776007497658,
"grad_norm": 0.3865436613559723,
"learning_rate": 0.0007994904010894874,
"loss": 0.1497,
"step": 870
},
{
"epoch": 1.6504217432052484,
"grad_norm": 0.34619590640068054,
"learning_rate": 0.0007994845436307458,
"loss": 0.1425,
"step": 880
},
{
"epoch": 1.669165885660731,
"grad_norm": 0.2863396108150482,
"learning_rate": 0.0007994786861720043,
"loss": 0.1441,
"step": 890
},
{
"epoch": 1.6879100281162138,
"grad_norm": 0.371105432510376,
"learning_rate": 0.0007994728287132628,
"loss": 0.1437,
"step": 900
},
{
"epoch": 1.7066541705716962,
"grad_norm": 0.3657528758049011,
"learning_rate": 0.0007994669712545212,
"loss": 0.1435,
"step": 910
},
{
"epoch": 1.725398313027179,
"grad_norm": 0.333408385515213,
"learning_rate": 0.0007994611137957798,
"loss": 0.1322,
"step": 920
},
{
"epoch": 1.7441424554826617,
"grad_norm": 0.34960120916366577,
"learning_rate": 0.0007994552563370382,
"loss": 0.1551,
"step": 930
},
{
"epoch": 1.7628865979381443,
"grad_norm": 0.30177751183509827,
"learning_rate": 0.0007994493988782967,
"loss": 0.1641,
"step": 940
},
{
"epoch": 1.7816307403936271,
"grad_norm": 0.39110997319221497,
"learning_rate": 0.0007994435414195552,
"loss": 0.1523,
"step": 950
},
{
"epoch": 1.8003748828491095,
"grad_norm": 0.30561545491218567,
"learning_rate": 0.0007994376839608136,
"loss": 0.1516,
"step": 960
},
{
"epoch": 1.8191190253045924,
"grad_norm": 0.32364317774772644,
"learning_rate": 0.0007994318265020722,
"loss": 0.145,
"step": 970
},
{
"epoch": 1.837863167760075,
"grad_norm": 0.36380302906036377,
"learning_rate": 0.0007994259690433306,
"loss": 0.1576,
"step": 980
},
{
"epoch": 1.8566073102155576,
"grad_norm": 0.3005361258983612,
"learning_rate": 0.0007994201115845891,
"loss": 0.1535,
"step": 990
},
{
"epoch": 1.8753514526710404,
"grad_norm": 0.33928126096725464,
"learning_rate": 0.0007994142541258476,
"loss": 0.1741,
"step": 1000
},
{
"epoch": 1.8753514526710404,
"eval_loss": 0.08202449977397919,
"eval_pearson_cosine": 0.7594348192214966,
"eval_pearson_dot": 0.7289378643035889,
"eval_pearson_euclidean": 0.7542859315872192,
"eval_pearson_manhattan": 0.7619277238845825,
"eval_runtime": 27.4437,
"eval_samples_per_second": 54.657,
"eval_spearman_cosine": 0.7679965558172381,
"eval_spearman_dot": 0.7298723962118674,
"eval_spearman_euclidean": 0.7535528712822376,
"eval_spearman_manhattan": 0.7621231744319878,
"eval_steps_per_second": 6.85,
"step": 1000
},
{
"epoch": 1.8940955951265228,
"grad_norm": 0.3324965834617615,
"learning_rate": 0.000799408396667106,
"loss": 0.1491,
"step": 1010
},
{
"epoch": 1.9128397375820057,
"grad_norm": 0.3112243711948395,
"learning_rate": 0.0007994025392083644,
"loss": 0.1622,
"step": 1020
},
{
"epoch": 1.9315838800374883,
"grad_norm": 0.3381972014904022,
"learning_rate": 0.000799396681749623,
"loss": 0.1462,
"step": 1030
},
{
"epoch": 1.9503280224929709,
"grad_norm": 0.3424859642982483,
"learning_rate": 0.0007993908242908814,
"loss": 0.1651,
"step": 1040
},
{
"epoch": 1.9690721649484537,
"grad_norm": 0.42161494493484497,
"learning_rate": 0.0007993849668321399,
"loss": 0.1521,
"step": 1050
},
{
"epoch": 1.9878163074039361,
"grad_norm": 0.3541307747364044,
"learning_rate": 0.0007993791093733984,
"loss": 0.162,
"step": 1060
},
{
"epoch": 2.007497656982193,
"grad_norm": 0.22963856160640717,
"learning_rate": 0.0007993732519146568,
"loss": 0.1297,
"step": 1070
},
{
"epoch": 2.026241799437676,
"grad_norm": 0.28242990374565125,
"learning_rate": 0.0007993673944559154,
"loss": 0.0773,
"step": 1080
},
{
"epoch": 2.044985941893158,
"grad_norm": 0.3516603112220764,
"learning_rate": 0.0007993615369971738,
"loss": 0.0799,
"step": 1090
},
{
"epoch": 2.063730084348641,
"grad_norm": 0.3558428883552551,
"learning_rate": 0.0007993556795384323,
"loss": 0.0885,
"step": 1100
},
{
"epoch": 2.082474226804124,
"grad_norm": 0.3211170732975006,
"learning_rate": 0.0007993498220796908,
"loss": 0.0825,
"step": 1110
},
{
"epoch": 2.1012183692596063,
"grad_norm": 0.20844395458698273,
"learning_rate": 0.0007993439646209492,
"loss": 0.0763,
"step": 1120
},
{
"epoch": 2.119962511715089,
"grad_norm": 0.3156029284000397,
"learning_rate": 0.0007993381071622077,
"loss": 0.0797,
"step": 1130
},
{
"epoch": 2.138706654170572,
"grad_norm": 0.3986193835735321,
"learning_rate": 0.0007993322497034662,
"loss": 0.0852,
"step": 1140
},
{
"epoch": 2.1574507966260543,
"grad_norm": 0.18681703507900238,
"learning_rate": 0.0007993263922447247,
"loss": 0.0779,
"step": 1150
},
{
"epoch": 2.176194939081537,
"grad_norm": 0.2365262657403946,
"learning_rate": 0.0007993205347859831,
"loss": 0.0833,
"step": 1160
},
{
"epoch": 2.1949390815370196,
"grad_norm": 0.25459378957748413,
"learning_rate": 0.0007993146773272417,
"loss": 0.0761,
"step": 1170
},
{
"epoch": 2.2136832239925024,
"grad_norm": 0.39024218916893005,
"learning_rate": 0.0007993088198685,
"loss": 0.0873,
"step": 1180
},
{
"epoch": 2.2324273664479852,
"grad_norm": 0.3662407100200653,
"learning_rate": 0.0007993029624097585,
"loss": 0.0842,
"step": 1190
},
{
"epoch": 2.2511715089034676,
"grad_norm": 0.30686551332473755,
"learning_rate": 0.0007992971049510171,
"loss": 0.0845,
"step": 1200
},
{
"epoch": 2.2699156513589505,
"grad_norm": 0.29860755801200867,
"learning_rate": 0.0007992912474922755,
"loss": 0.0806,
"step": 1210
},
{
"epoch": 2.288659793814433,
"grad_norm": 0.272029310464859,
"learning_rate": 0.0007992853900335341,
"loss": 0.0849,
"step": 1220
},
{
"epoch": 2.3074039362699157,
"grad_norm": 0.23034346103668213,
"learning_rate": 0.0007992795325747924,
"loss": 0.0873,
"step": 1230
},
{
"epoch": 2.3261480787253985,
"grad_norm": 0.38400229811668396,
"learning_rate": 0.0007992736751160509,
"loss": 0.0854,
"step": 1240
},
{
"epoch": 2.344892221180881,
"grad_norm": 0.2619285583496094,
"learning_rate": 0.0007992678176573094,
"loss": 0.0854,
"step": 1250
},
{
"epoch": 2.344892221180881,
"eval_loss": 0.0549059733748436,
"eval_pearson_cosine": 0.7565033435821533,
"eval_pearson_dot": 0.7438405752182007,
"eval_pearson_euclidean": 0.7398021221160889,
"eval_pearson_manhattan": 0.7514023780822754,
"eval_runtime": 27.2363,
"eval_samples_per_second": 55.074,
"eval_spearman_cosine": 0.7657686934808458,
"eval_spearman_dot": 0.7450125999969373,
"eval_spearman_euclidean": 0.7411997174627442,
"eval_spearman_manhattan": 0.754436544283217,
"eval_steps_per_second": 6.903,
"step": 1250
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.2573038935661316,
"learning_rate": 0.0007992619601985679,
"loss": 0.0812,
"step": 1260
},
{
"epoch": 2.382380506091846,
"grad_norm": 0.2684009373188019,
"learning_rate": 0.0007992561027398263,
"loss": 0.0833,
"step": 1270
},
{
"epoch": 2.401124648547329,
"grad_norm": 0.2773861289024353,
"learning_rate": 0.0007992502452810849,
"loss": 0.0902,
"step": 1280
},
{
"epoch": 2.419868791002812,
"grad_norm": 0.3180435001850128,
"learning_rate": 0.0007992443878223433,
"loss": 0.0882,
"step": 1290
},
{
"epoch": 2.438612933458294,
"grad_norm": 0.2758583426475525,
"learning_rate": 0.0007992385303636017,
"loss": 0.0815,
"step": 1300
},
{
"epoch": 2.457357075913777,
"grad_norm": 0.3327929973602295,
"learning_rate": 0.0007992326729048603,
"loss": 0.0949,
"step": 1310
},
{
"epoch": 2.4761012183692594,
"grad_norm": 0.31645268201828003,
"learning_rate": 0.0007992268154461187,
"loss": 0.0942,
"step": 1320
},
{
"epoch": 2.4948453608247423,
"grad_norm": 0.2587279975414276,
"learning_rate": 0.0007992209579873773,
"loss": 0.0889,
"step": 1330
},
{
"epoch": 2.513589503280225,
"grad_norm": 0.29799187183380127,
"learning_rate": 0.0007992151005286357,
"loss": 0.1027,
"step": 1340
},
{
"epoch": 2.5323336457357075,
"grad_norm": 0.3042343258857727,
"learning_rate": 0.0007992092430698941,
"loss": 0.0947,
"step": 1350
},
{
"epoch": 2.5510777881911904,
"grad_norm": 0.36439308524131775,
"learning_rate": 0.0007992033856111527,
"loss": 0.0887,
"step": 1360
},
{
"epoch": 2.5698219306466727,
"grad_norm": 0.24675941467285156,
"learning_rate": 0.0007991975281524111,
"loss": 0.0893,
"step": 1370
},
{
"epoch": 2.5885660731021556,
"grad_norm": 0.3232560157775879,
"learning_rate": 0.0007991916706936696,
"loss": 0.0949,
"step": 1380
},
{
"epoch": 2.6073102155576384,
"grad_norm": 0.3095908463001251,
"learning_rate": 0.0007991858132349281,
"loss": 0.0893,
"step": 1390
},
{
"epoch": 2.626054358013121,
"grad_norm": 0.24996769428253174,
"learning_rate": 0.0007991799557761866,
"loss": 0.0918,
"step": 1400
},
{
"epoch": 2.6447985004686037,
"grad_norm": 0.3013332486152649,
"learning_rate": 0.0007991740983174449,
"loss": 0.0965,
"step": 1410
},
{
"epoch": 2.663542642924086,
"grad_norm": 0.43422290682792664,
"learning_rate": 0.0007991682408587035,
"loss": 0.1144,
"step": 1420
},
{
"epoch": 2.682286785379569,
"grad_norm": 0.3462458848953247,
"learning_rate": 0.0007991623833999619,
"loss": 0.1068,
"step": 1430
},
{
"epoch": 2.7010309278350517,
"grad_norm": 0.2752937078475952,
"learning_rate": 0.0007991565259412205,
"loss": 0.1048,
"step": 1440
},
{
"epoch": 2.719775070290534,
"grad_norm": 0.33038660883903503,
"learning_rate": 0.000799150668482479,
"loss": 0.1055,
"step": 1450
},
{
"epoch": 2.738519212746017,
"grad_norm": 0.28442054986953735,
"learning_rate": 0.0007991448110237373,
"loss": 0.1053,
"step": 1460
},
{
"epoch": 2.7572633552014993,
"grad_norm": 0.25279343128204346,
"learning_rate": 0.0007991389535649959,
"loss": 0.109,
"step": 1470
},
{
"epoch": 2.776007497656982,
"grad_norm": 0.3681808114051819,
"learning_rate": 0.0007991330961062543,
"loss": 0.1092,
"step": 1480
},
{
"epoch": 2.794751640112465,
"grad_norm": 0.3884279429912567,
"learning_rate": 0.0007991272386475128,
"loss": 0.1105,
"step": 1490
},
{
"epoch": 2.8134957825679474,
"grad_norm": 0.3542380928993225,
"learning_rate": 0.0007991213811887713,
"loss": 0.109,
"step": 1500
},
{
"epoch": 2.8134957825679474,
"eval_loss": 0.06194353476166725,
"eval_pearson_cosine": 0.7544945478439331,
"eval_pearson_dot": 0.7297648787498474,
"eval_pearson_euclidean": 0.7457708120346069,
"eval_pearson_manhattan": 0.7537869215011597,
"eval_runtime": 27.28,
"eval_samples_per_second": 54.985,
"eval_spearman_cosine": 0.7677406665753612,
"eval_spearman_dot": 0.7355031880736892,
"eval_spearman_euclidean": 0.752266788615453,
"eval_spearman_manhattan": 0.7620929193607933,
"eval_steps_per_second": 6.892,
"step": 1500
},
{
"epoch": 2.8322399250234302,
"grad_norm": 0.28738659620285034,
"learning_rate": 0.0007991155237300298,
"loss": 0.1043,
"step": 1510
},
{
"epoch": 2.8509840674789126,
"grad_norm": 0.39117714762687683,
"learning_rate": 0.0007991096662712882,
"loss": 0.0993,
"step": 1520
},
{
"epoch": 2.8697282099343955,
"grad_norm": 0.3144415616989136,
"learning_rate": 0.0007991038088125467,
"loss": 0.1145,
"step": 1530
},
{
"epoch": 2.8884723523898783,
"grad_norm": 0.28154823184013367,
"learning_rate": 0.0007990979513538052,
"loss": 0.1128,
"step": 1540
},
{
"epoch": 2.9072164948453607,
"grad_norm": 0.3766768276691437,
"learning_rate": 0.0007990920938950637,
"loss": 0.1033,
"step": 1550
},
{
"epoch": 2.9259606373008435,
"grad_norm": 0.38604792952537537,
"learning_rate": 0.0007990862364363222,
"loss": 0.1044,
"step": 1560
},
{
"epoch": 2.944704779756326,
"grad_norm": 0.36833906173706055,
"learning_rate": 0.0007990803789775806,
"loss": 0.1159,
"step": 1570
},
{
"epoch": 2.963448922211809,
"grad_norm": 0.3357650935649872,
"learning_rate": 0.0007990745215188391,
"loss": 0.1185,
"step": 1580
},
{
"epoch": 2.9821930646672916,
"grad_norm": 0.30260348320007324,
"learning_rate": 0.0007990686640600976,
"loss": 0.1167,
"step": 1590
},
{
"epoch": 3.0018744142455485,
"grad_norm": 0.28110650181770325,
"learning_rate": 0.000799062806601356,
"loss": 0.1115,
"step": 1600
},
{
"epoch": 3.020618556701031,
"grad_norm": 0.32038745284080505,
"learning_rate": 0.0007990569491426146,
"loss": 0.0637,
"step": 1610
},
{
"epoch": 3.0393626991565137,
"grad_norm": 0.29342755675315857,
"learning_rate": 0.000799051091683873,
"loss": 0.0687,
"step": 1620
},
{
"epoch": 3.058106841611996,
"grad_norm": 0.33964619040489197,
"learning_rate": 0.0007990452342251314,
"loss": 0.0611,
"step": 1630
},
{
"epoch": 3.076850984067479,
"grad_norm": 0.23580531775951385,
"learning_rate": 0.0007990393767663899,
"loss": 0.0635,
"step": 1640
},
{
"epoch": 3.0955951265229618,
"grad_norm": 0.2617776393890381,
"learning_rate": 0.0007990335193076484,
"loss": 0.0709,
"step": 1650
},
{
"epoch": 3.114339268978444,
"grad_norm": 0.25627410411834717,
"learning_rate": 0.0007990276618489068,
"loss": 0.0682,
"step": 1660
},
{
"epoch": 3.133083411433927,
"grad_norm": 0.21987001597881317,
"learning_rate": 0.0007990218043901654,
"loss": 0.06,
"step": 1670
},
{
"epoch": 3.1518275538894094,
"grad_norm": 0.2657093405723572,
"learning_rate": 0.0007990159469314238,
"loss": 0.0712,
"step": 1680
},
{
"epoch": 3.170571696344892,
"grad_norm": 0.23929661512374878,
"learning_rate": 0.0007990100894726823,
"loss": 0.0566,
"step": 1690
},
{
"epoch": 3.189315838800375,
"grad_norm": 0.23572145402431488,
"learning_rate": 0.0007990042320139408,
"loss": 0.0571,
"step": 1700
},
{
"epoch": 3.2080599812558575,
"grad_norm": 0.26287132501602173,
"learning_rate": 0.0007989983745551992,
"loss": 0.067,
"step": 1710
},
{
"epoch": 3.2268041237113403,
"grad_norm": 0.24504464864730835,
"learning_rate": 0.0007989925170964578,
"loss": 0.0637,
"step": 1720
},
{
"epoch": 3.2455482661668227,
"grad_norm": 0.17006747424602509,
"learning_rate": 0.0007989866596377162,
"loss": 0.0552,
"step": 1730
},
{
"epoch": 3.2642924086223055,
"grad_norm": 0.2752683460712433,
"learning_rate": 0.0007989808021789747,
"loss": 0.0639,
"step": 1740
},
{
"epoch": 3.2830365510777884,
"grad_norm": 0.2681417465209961,
"learning_rate": 0.0007989749447202332,
"loss": 0.0705,
"step": 1750
},
{
"epoch": 3.2830365510777884,
"eval_loss": 0.0486464686691761,
"eval_pearson_cosine": 0.7632350921630859,
"eval_pearson_dot": 0.7505504488945007,
"eval_pearson_euclidean": 0.7458865642547607,
"eval_pearson_manhattan": 0.7597954273223877,
"eval_runtime": 27.3673,
"eval_samples_per_second": 54.81,
"eval_spearman_cosine": 0.7679814031707208,
"eval_spearman_dot": 0.7517654374212466,
"eval_spearman_euclidean": 0.7467275015139031,
"eval_spearman_manhattan": 0.7607208640788498,
"eval_steps_per_second": 6.87,
"step": 1750
},
{
"epoch": 3.3017806935332707,
"grad_norm": 0.24346262216567993,
"learning_rate": 0.0007989690872614916,
"loss": 0.0658,
"step": 1760
},
{
"epoch": 3.3205248359887536,
"grad_norm": 0.24957306683063507,
"learning_rate": 0.0007989632298027502,
"loss": 0.0643,
"step": 1770
},
{
"epoch": 3.3392689784442364,
"grad_norm": 0.24416255950927734,
"learning_rate": 0.0007989573723440086,
"loss": 0.0626,
"step": 1780
},
{
"epoch": 3.358013120899719,
"grad_norm": 0.2224712073802948,
"learning_rate": 0.0007989515148852671,
"loss": 0.0634,
"step": 1790
},
{
"epoch": 3.3767572633552017,
"grad_norm": 0.27588558197021484,
"learning_rate": 0.0007989456574265256,
"loss": 0.0644,
"step": 1800
},
{
"epoch": 3.395501405810684,
"grad_norm": 0.26377061009407043,
"learning_rate": 0.000798939799967784,
"loss": 0.0585,
"step": 1810
},
{
"epoch": 3.414245548266167,
"grad_norm": 0.23178541660308838,
"learning_rate": 0.0007989339425090424,
"loss": 0.0588,
"step": 1820
},
{
"epoch": 3.4329896907216497,
"grad_norm": 0.1893617808818817,
"learning_rate": 0.000798928085050301,
"loss": 0.0649,
"step": 1830
},
{
"epoch": 3.451733833177132,
"grad_norm": 0.23445335030555725,
"learning_rate": 0.0007989222275915595,
"loss": 0.0629,
"step": 1840
},
{
"epoch": 3.470477975632615,
"grad_norm": 0.457109659910202,
"learning_rate": 0.0007989163701328179,
"loss": 0.0646,
"step": 1850
},
{
"epoch": 3.4892221180880973,
"grad_norm": 0.2316947728395462,
"learning_rate": 0.0007989105126740764,
"loss": 0.0677,
"step": 1860
},
{
"epoch": 3.50796626054358,
"grad_norm": 0.26950669288635254,
"learning_rate": 0.0007989046552153348,
"loss": 0.0732,
"step": 1870
},
{
"epoch": 3.526710402999063,
"grad_norm": 0.25258171558380127,
"learning_rate": 0.0007988987977565933,
"loss": 0.0635,
"step": 1880
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.2282831370830536,
"learning_rate": 0.0007988929402978518,
"loss": 0.0766,
"step": 1890
},
{
"epoch": 3.5641986879100283,
"grad_norm": 0.3049706220626831,
"learning_rate": 0.0007988870828391103,
"loss": 0.0766,
"step": 1900
},
{
"epoch": 3.5829428303655106,
"grad_norm": 0.21556228399276733,
"learning_rate": 0.0007988812253803688,
"loss": 0.0694,
"step": 1910
},
{
"epoch": 3.6016869728209935,
"grad_norm": 0.2859863340854645,
"learning_rate": 0.0007988753679216272,
"loss": 0.0665,
"step": 1920
},
{
"epoch": 3.6204311152764763,
"grad_norm": 0.22522784769535065,
"learning_rate": 0.0007988695104628857,
"loss": 0.073,
"step": 1930
},
{
"epoch": 3.6391752577319587,
"grad_norm": 0.3301334083080292,
"learning_rate": 0.0007988636530041442,
"loss": 0.0745,
"step": 1940
},
{
"epoch": 3.6579194001874415,
"grad_norm": 0.21438319981098175,
"learning_rate": 0.0007988577955454027,
"loss": 0.0713,
"step": 1950
},
{
"epoch": 3.676663542642924,
"grad_norm": 0.3207626938819885,
"learning_rate": 0.0007988519380866611,
"loss": 0.0759,
"step": 1960
},
{
"epoch": 3.695407685098407,
"grad_norm": 0.25493231415748596,
"learning_rate": 0.0007988460806279197,
"loss": 0.0722,
"step": 1970
},
{
"epoch": 3.7141518275538896,
"grad_norm": 0.2732018530368805,
"learning_rate": 0.0007988402231691781,
"loss": 0.0773,
"step": 1980
},
{
"epoch": 3.732895970009372,
"grad_norm": 0.19611899554729462,
"learning_rate": 0.0007988343657104365,
"loss": 0.0773,
"step": 1990
},
{
"epoch": 3.751640112464855,
"grad_norm": 0.2664394676685333,
"learning_rate": 0.0007988285082516951,
"loss": 0.072,
"step": 2000
},
{
"epoch": 3.751640112464855,
"eval_loss": 0.05059043690562248,
"eval_pearson_cosine": 0.7549334764480591,
"eval_pearson_dot": 0.7364022731781006,
"eval_pearson_euclidean": 0.7430644035339355,
"eval_pearson_manhattan": 0.7528964281082153,
"eval_runtime": 27.2774,
"eval_samples_per_second": 54.991,
"eval_spearman_cosine": 0.7612361982335023,
"eval_spearman_dot": 0.7370856746295986,
"eval_spearman_euclidean": 0.7449844586260276,
"eval_spearman_manhattan": 0.7551494271561938,
"eval_steps_per_second": 6.892,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 5330,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}