emo_eager_86 / trainer_state.json
krishnakalyan3's picture
Upload 6 files
115326c verified
{
"best_metric": 0.914588417551393,
"best_model_checkpoint": "/p/scratch/ccstdl/krishna/finetuned-cosine-loss/checkpoint-650",
"epoch": 0.26455026455026454,
"eval_steps": 10,
"global_step": 650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00407000407000407,
"grad_norm": 1.8234177827835083,
"learning_rate": 3.391670058336725e-09,
"loss": 0.9272,
"step": 10
},
{
"epoch": 0.00407000407000407,
"eval_cos_sim": 0.0673384740948677,
"eval_loss": 0.9336964531158155,
"eval_runtime": 91.7154,
"eval_samples_per_second": 10.903,
"eval_steps_per_second": 0.349,
"step": 10
},
{
"epoch": 0.00814000814000814,
"grad_norm": 1.7962757349014282,
"learning_rate": 6.78334011667345e-09,
"loss": 0.9259,
"step": 20
},
{
"epoch": 0.00814000814000814,
"eval_cos_sim": 0.06735177338123322,
"eval_loss": 0.9336831488822646,
"eval_runtime": 89.447,
"eval_samples_per_second": 11.18,
"eval_steps_per_second": 0.358,
"step": 20
},
{
"epoch": 0.01221001221001221,
"grad_norm": 1.784166693687439,
"learning_rate": 1.0175010175010176e-08,
"loss": 0.9296,
"step": 30
},
{
"epoch": 0.01221001221001221,
"eval_cos_sim": 0.0673740804195404,
"eval_loss": 0.9336608658050245,
"eval_runtime": 89.5427,
"eval_samples_per_second": 11.168,
"eval_steps_per_second": 0.357,
"step": 30
},
{
"epoch": 0.01628001628001628,
"grad_norm": 1.8226275444030762,
"learning_rate": 1.35666802333469e-08,
"loss": 0.9259,
"step": 40
},
{
"epoch": 0.01628001628001628,
"eval_cos_sim": 0.06740561872720718,
"eval_loss": 0.93362934305417,
"eval_runtime": 89.3449,
"eval_samples_per_second": 11.193,
"eval_steps_per_second": 0.358,
"step": 40
},
{
"epoch": 0.02035002035002035,
"grad_norm": 1.7831834554672241,
"learning_rate": 1.6958350291683625e-08,
"loss": 0.9251,
"step": 50
},
{
"epoch": 0.02035002035002035,
"eval_cos_sim": 0.06744563579559326,
"eval_loss": 0.933589364073152,
"eval_runtime": 89.6651,
"eval_samples_per_second": 11.153,
"eval_steps_per_second": 0.357,
"step": 50
},
{
"epoch": 0.02442002442002442,
"grad_norm": 1.5201727151870728,
"learning_rate": 2.035002035002035e-08,
"loss": 0.9276,
"step": 60
},
{
"epoch": 0.02442002442002442,
"eval_cos_sim": 0.06749571859836578,
"eval_loss": 0.9335393262122816,
"eval_runtime": 89.3495,
"eval_samples_per_second": 11.192,
"eval_steps_per_second": 0.358,
"step": 60
},
{
"epoch": 0.02849002849002849,
"grad_norm": 1.813641905784607,
"learning_rate": 2.3741690408357078e-08,
"loss": 0.9303,
"step": 70
},
{
"epoch": 0.02849002849002849,
"eval_cos_sim": 0.06755577772855759,
"eval_loss": 0.9334793086265272,
"eval_runtime": 89.5359,
"eval_samples_per_second": 11.169,
"eval_steps_per_second": 0.357,
"step": 70
},
{
"epoch": 0.03256003256003256,
"grad_norm": 1.8243420124053955,
"learning_rate": 2.71333604666938e-08,
"loss": 0.9288,
"step": 80
},
{
"epoch": 0.03256003256003256,
"eval_cos_sim": 0.06762455403804779,
"eval_loss": 0.9334105701659864,
"eval_runtime": 89.5324,
"eval_samples_per_second": 11.169,
"eval_steps_per_second": 0.357,
"step": 80
},
{
"epoch": 0.03663003663003663,
"grad_norm": 1.7983627319335938,
"learning_rate": 3.052503052503053e-08,
"loss": 0.9243,
"step": 90
},
{
"epoch": 0.03663003663003663,
"eval_cos_sim": 0.06770093739032745,
"eval_loss": 0.9333342409347242,
"eval_runtime": 89.987,
"eval_samples_per_second": 11.113,
"eval_steps_per_second": 0.356,
"step": 90
},
{
"epoch": 0.0407000407000407,
"grad_norm": 1.8116332292556763,
"learning_rate": 3.391670058336725e-08,
"loss": 0.9289,
"step": 100
},
{
"epoch": 0.0407000407000407,
"eval_cos_sim": 0.06778644770383835,
"eval_loss": 0.9332487797950453,
"eval_runtime": 89.222,
"eval_samples_per_second": 11.208,
"eval_steps_per_second": 0.359,
"step": 100
},
{
"epoch": 0.04477004477004477,
"grad_norm": 1.4440973997116089,
"learning_rate": 3.730837064170397e-08,
"loss": 0.9274,
"step": 110
},
{
"epoch": 0.04477004477004477,
"eval_cos_sim": 0.06788003444671631,
"eval_loss": 0.933155252001161,
"eval_runtime": 89.3804,
"eval_samples_per_second": 11.188,
"eval_steps_per_second": 0.358,
"step": 110
},
{
"epoch": 0.04884004884004884,
"grad_norm": 1.8082635402679443,
"learning_rate": 4.07000407000407e-08,
"loss": 0.9267,
"step": 120
},
{
"epoch": 0.04884004884004884,
"eval_cos_sim": 0.06798040121793747,
"eval_loss": 0.9330549512122815,
"eval_runtime": 89.9274,
"eval_samples_per_second": 11.12,
"eval_steps_per_second": 0.356,
"step": 120
},
{
"epoch": 0.05291005291005291,
"grad_norm": 1.8302645683288574,
"learning_rate": 4.4091710758377425e-08,
"loss": 0.9244,
"step": 130
},
{
"epoch": 0.05291005291005291,
"eval_cos_sim": 0.06809354573488235,
"eval_loss": 0.932941900274629,
"eval_runtime": 89.8842,
"eval_samples_per_second": 11.125,
"eval_steps_per_second": 0.356,
"step": 130
},
{
"epoch": 0.05698005698005698,
"grad_norm": 1.5440356731414795,
"learning_rate": 4.7483380816714155e-08,
"loss": 0.9316,
"step": 140
},
{
"epoch": 0.05698005698005698,
"eval_cos_sim": 0.06821373105049133,
"eval_loss": 0.9328218155120558,
"eval_runtime": 89.7159,
"eval_samples_per_second": 11.146,
"eval_steps_per_second": 0.357,
"step": 140
},
{
"epoch": 0.06105006105006105,
"grad_norm": 1.5421152114868164,
"learning_rate": 5.087505087505087e-08,
"loss": 0.927,
"step": 150
},
{
"epoch": 0.06105006105006105,
"eval_cos_sim": 0.06834477931261063,
"eval_loss": 0.9326908560012526,
"eval_runtime": 89.7787,
"eval_samples_per_second": 11.138,
"eval_steps_per_second": 0.356,
"step": 150
},
{
"epoch": 0.06512006512006512,
"grad_norm": 1.7998121976852417,
"learning_rate": 5.42667209333876e-08,
"loss": 0.9252,
"step": 160
},
{
"epoch": 0.06512006512006512,
"eval_cos_sim": 0.06848659366369247,
"eval_loss": 0.9325491523956007,
"eval_runtime": 89.6926,
"eval_samples_per_second": 11.149,
"eval_steps_per_second": 0.357,
"step": 160
},
{
"epoch": 0.06919006919006919,
"grad_norm": 1.542074203491211,
"learning_rate": 5.7658390991724324e-08,
"loss": 0.9252,
"step": 170
},
{
"epoch": 0.06919006919006919,
"eval_cos_sim": 0.0686374306678772,
"eval_loss": 0.9323984141563123,
"eval_runtime": 89.453,
"eval_samples_per_second": 11.179,
"eval_steps_per_second": 0.358,
"step": 170
},
{
"epoch": 0.07326007326007326,
"grad_norm": 1.5359669923782349,
"learning_rate": 6.105006105006105e-08,
"loss": 0.9267,
"step": 180
},
{
"epoch": 0.07326007326007326,
"eval_cos_sim": 0.0687984898686409,
"eval_loss": 0.9322374482368178,
"eval_runtime": 89.6161,
"eval_samples_per_second": 11.159,
"eval_steps_per_second": 0.357,
"step": 180
},
{
"epoch": 0.07733007733007732,
"grad_norm": 1.8200968503952026,
"learning_rate": 6.444173110839778e-08,
"loss": 0.9235,
"step": 190
},
{
"epoch": 0.07733007733007732,
"eval_cos_sim": 0.06896985322237015,
"eval_loss": 0.9320661959861464,
"eval_runtime": 89.8737,
"eval_samples_per_second": 11.127,
"eval_steps_per_second": 0.356,
"step": 190
},
{
"epoch": 0.0814000814000814,
"grad_norm": 1.8174818754196167,
"learning_rate": 6.78334011667345e-08,
"loss": 0.9248,
"step": 200
},
{
"epoch": 0.0814000814000814,
"eval_cos_sim": 0.06914974004030228,
"eval_loss": 0.9318864355300611,
"eval_runtime": 89.8925,
"eval_samples_per_second": 11.124,
"eval_steps_per_second": 0.356,
"step": 200
},
{
"epoch": 0.08547008547008547,
"grad_norm": 1.8348275423049927,
"learning_rate": 7.122507122507124e-08,
"loss": 0.9234,
"step": 210
},
{
"epoch": 0.08547008547008547,
"eval_cos_sim": 0.06933821737766266,
"eval_loss": 0.9316981101249403,
"eval_runtime": 89.537,
"eval_samples_per_second": 11.169,
"eval_steps_per_second": 0.357,
"step": 210
},
{
"epoch": 0.08954008954008955,
"grad_norm": 1.8035074472427368,
"learning_rate": 7.461674128340795e-08,
"loss": 0.9222,
"step": 220
},
{
"epoch": 0.08954008954008955,
"eval_cos_sim": 0.06953444331884384,
"eval_loss": 0.9315020323013014,
"eval_runtime": 89.8235,
"eval_samples_per_second": 11.133,
"eval_steps_per_second": 0.356,
"step": 220
},
{
"epoch": 0.0936100936100936,
"grad_norm": 1.8179575204849243,
"learning_rate": 7.800841134174468e-08,
"loss": 0.9286,
"step": 230
},
{
"epoch": 0.0936100936100936,
"eval_cos_sim": 0.06974231451749802,
"eval_loss": 0.9312943387244886,
"eval_runtime": 89.2117,
"eval_samples_per_second": 11.209,
"eval_steps_per_second": 0.359,
"step": 230
},
{
"epoch": 0.09768009768009768,
"grad_norm": 1.7966115474700928,
"learning_rate": 8.14000814000814e-08,
"loss": 0.9255,
"step": 240
},
{
"epoch": 0.09768009768009768,
"eval_cos_sim": 0.06995726376771927,
"eval_loss": 0.931079523107881,
"eval_runtime": 89.804,
"eval_samples_per_second": 11.135,
"eval_steps_per_second": 0.356,
"step": 240
},
{
"epoch": 0.10175010175010175,
"grad_norm": 1.797987461090088,
"learning_rate": 8.479175145841813e-08,
"loss": 0.9262,
"step": 250
},
{
"epoch": 0.10175010175010175,
"eval_cos_sim": 0.07017555087804794,
"eval_loss": 0.9308614025329298,
"eval_runtime": 89.3997,
"eval_samples_per_second": 11.186,
"eval_steps_per_second": 0.358,
"step": 250
},
{
"epoch": 0.10582010582010581,
"grad_norm": 1.8064875602722168,
"learning_rate": 8.818342151675485e-08,
"loss": 0.9273,
"step": 260
},
{
"epoch": 0.10582010582010581,
"eval_cos_sim": 0.0703999400138855,
"eval_loss": 0.9306371965621656,
"eval_runtime": 89.9472,
"eval_samples_per_second": 11.118,
"eval_steps_per_second": 0.356,
"step": 260
},
{
"epoch": 0.10989010989010989,
"grad_norm": 1.8176881074905396,
"learning_rate": 9.157509157509157e-08,
"loss": 0.9206,
"step": 270
},
{
"epoch": 0.10989010989010989,
"eval_cos_sim": 0.07064417004585266,
"eval_loss": 0.9303931417678541,
"eval_runtime": 89.7373,
"eval_samples_per_second": 11.144,
"eval_steps_per_second": 0.357,
"step": 270
},
{
"epoch": 0.11396011396011396,
"grad_norm": 1.5151400566101074,
"learning_rate": 9.496676163342831e-08,
"loss": 0.9211,
"step": 280
},
{
"epoch": 0.11396011396011396,
"eval_cos_sim": 0.0709003284573555,
"eval_loss": 0.9301371760581678,
"eval_runtime": 89.9622,
"eval_samples_per_second": 11.116,
"eval_steps_per_second": 0.356,
"step": 280
},
{
"epoch": 0.11803011803011804,
"grad_norm": 1.7980380058288574,
"learning_rate": 9.835843169176503e-08,
"loss": 0.9203,
"step": 290
},
{
"epoch": 0.11803011803011804,
"eval_cos_sim": 0.07116622477769852,
"eval_loss": 0.9298714599822706,
"eval_runtime": 90.2995,
"eval_samples_per_second": 11.074,
"eval_steps_per_second": 0.354,
"step": 290
},
{
"epoch": 0.1221001221001221,
"grad_norm": 1.5477901697158813,
"learning_rate": 1.0175010175010174e-07,
"loss": 0.9216,
"step": 300
},
{
"epoch": 0.1221001221001221,
"eval_cos_sim": 0.07143931835889816,
"eval_loss": 0.9295985355590528,
"eval_runtime": 90.1186,
"eval_samples_per_second": 11.096,
"eval_steps_per_second": 0.355,
"step": 300
},
{
"epoch": 0.12617012617012616,
"grad_norm": 1.7951384782791138,
"learning_rate": 1.0514177180843848e-07,
"loss": 0.9261,
"step": 310
},
{
"epoch": 0.12617012617012616,
"eval_cos_sim": 0.07170899212360382,
"eval_loss": 0.9293290882323927,
"eval_runtime": 90.3148,
"eval_samples_per_second": 11.072,
"eval_steps_per_second": 0.354,
"step": 310
},
{
"epoch": 0.13024013024013023,
"grad_norm": 1.7869328260421753,
"learning_rate": 1.085334418667752e-07,
"loss": 0.9189,
"step": 320
},
{
"epoch": 0.13024013024013023,
"eval_cos_sim": 0.07198921591043472,
"eval_loss": 0.9290490951751417,
"eval_runtime": 89.77,
"eval_samples_per_second": 11.14,
"eval_steps_per_second": 0.356,
"step": 320
},
{
"epoch": 0.1343101343101343,
"grad_norm": 1.5212459564208984,
"learning_rate": 1.1192511192511194e-07,
"loss": 0.9254,
"step": 330
},
{
"epoch": 0.1343101343101343,
"eval_cos_sim": 0.07227852195501328,
"eval_loss": 0.9287599940513319,
"eval_runtime": 90.335,
"eval_samples_per_second": 11.07,
"eval_steps_per_second": 0.354,
"step": 330
},
{
"epoch": 0.13838013838013838,
"grad_norm": 1.8259695768356323,
"learning_rate": 1.1531678198344865e-07,
"loss": 0.924,
"step": 340
},
{
"epoch": 0.13838013838013838,
"eval_cos_sim": 0.072585329413414,
"eval_loss": 0.928453390142793,
"eval_runtime": 90.0139,
"eval_samples_per_second": 11.109,
"eval_steps_per_second": 0.356,
"step": 340
},
{
"epoch": 0.14245014245014245,
"grad_norm": 1.810530424118042,
"learning_rate": 1.1870845204178537e-07,
"loss": 0.9223,
"step": 350
},
{
"epoch": 0.14245014245014245,
"eval_cos_sim": 0.07290184497833252,
"eval_loss": 0.92813706495511,
"eval_runtime": 89.979,
"eval_samples_per_second": 11.114,
"eval_steps_per_second": 0.356,
"step": 350
},
{
"epoch": 0.14652014652014653,
"grad_norm": 1.819162368774414,
"learning_rate": 1.221001221001221e-07,
"loss": 0.9227,
"step": 360
},
{
"epoch": 0.14652014652014653,
"eval_cos_sim": 0.07322434335947037,
"eval_loss": 0.9278148160194105,
"eval_runtime": 90.5195,
"eval_samples_per_second": 11.047,
"eval_steps_per_second": 0.354,
"step": 360
},
{
"epoch": 0.1505901505901506,
"grad_norm": 1.7979774475097656,
"learning_rate": 1.2549179215845883e-07,
"loss": 0.9185,
"step": 370
},
{
"epoch": 0.1505901505901506,
"eval_cos_sim": 0.07355932146310806,
"eval_loss": 0.9274800749038404,
"eval_runtime": 90.4314,
"eval_samples_per_second": 11.058,
"eval_steps_per_second": 0.354,
"step": 370
},
{
"epoch": 0.15466015466015465,
"grad_norm": 1.784081220626831,
"learning_rate": 1.2888346221679555e-07,
"loss": 0.9175,
"step": 380
},
{
"epoch": 0.15466015466015465,
"eval_cos_sim": 0.07390536367893219,
"eval_loss": 0.9271342835639662,
"eval_runtime": 90.2731,
"eval_samples_per_second": 11.077,
"eval_steps_per_second": 0.354,
"step": 380
},
{
"epoch": 0.15873015873015872,
"grad_norm": 1.8276127576828003,
"learning_rate": 1.3227513227513228e-07,
"loss": 0.9165,
"step": 390
},
{
"epoch": 0.15873015873015872,
"eval_cos_sim": 0.07425787299871445,
"eval_loss": 0.926782009146089,
"eval_runtime": 89.6304,
"eval_samples_per_second": 11.157,
"eval_steps_per_second": 0.357,
"step": 390
},
{
"epoch": 0.1628001628001628,
"grad_norm": 1.8162034749984741,
"learning_rate": 1.35666802333469e-07,
"loss": 0.9199,
"step": 400
},
{
"epoch": 0.1628001628001628,
"eval_cos_sim": 0.07460718601942062,
"eval_loss": 0.9264329524253553,
"eval_runtime": 90.4314,
"eval_samples_per_second": 11.058,
"eval_steps_per_second": 0.354,
"step": 400
},
{
"epoch": 0.16687016687016687,
"grad_norm": 1.4985719919204712,
"learning_rate": 1.3905847239180572e-07,
"loss": 0.9224,
"step": 410
},
{
"epoch": 0.16687016687016687,
"eval_cos_sim": 0.0749591663479805,
"eval_loss": 0.9260812273238844,
"eval_runtime": 89.9689,
"eval_samples_per_second": 11.115,
"eval_steps_per_second": 0.356,
"step": 410
},
{
"epoch": 0.17094017094017094,
"grad_norm": 1.8033661842346191,
"learning_rate": 1.4245014245014247e-07,
"loss": 0.9152,
"step": 420
},
{
"epoch": 0.17094017094017094,
"eval_cos_sim": 0.07533340901136398,
"eval_loss": 0.9257072530006116,
"eval_runtime": 90.1973,
"eval_samples_per_second": 11.087,
"eval_steps_per_second": 0.355,
"step": 420
},
{
"epoch": 0.17501017501017502,
"grad_norm": 1.7833577394485474,
"learning_rate": 1.4584181250847917e-07,
"loss": 0.9187,
"step": 430
},
{
"epoch": 0.17501017501017502,
"eval_cos_sim": 0.07572542876005173,
"eval_loss": 0.9253155136321729,
"eval_runtime": 89.962,
"eval_samples_per_second": 11.116,
"eval_steps_per_second": 0.356,
"step": 430
},
{
"epoch": 0.1790801790801791,
"grad_norm": 1.7877451181411743,
"learning_rate": 1.492334825668159e-07,
"loss": 0.9175,
"step": 440
},
{
"epoch": 0.1790801790801791,
"eval_cos_sim": 0.0761212408542633,
"eval_loss": 0.9249199991439527,
"eval_runtime": 89.9831,
"eval_samples_per_second": 11.113,
"eval_steps_per_second": 0.356,
"step": 440
},
{
"epoch": 0.18315018315018314,
"grad_norm": 1.7935380935668945,
"learning_rate": 1.5262515262515264e-07,
"loss": 0.9167,
"step": 450
},
{
"epoch": 0.18315018315018314,
"eval_cos_sim": 0.07652699947357178,
"eval_loss": 0.9245145797942823,
"eval_runtime": 90.5936,
"eval_samples_per_second": 11.038,
"eval_steps_per_second": 0.353,
"step": 450
},
{
"epoch": 0.1872201872201872,
"grad_norm": 1.8291174173355103,
"learning_rate": 1.5601682268348936e-07,
"loss": 0.9127,
"step": 460
},
{
"epoch": 0.1872201872201872,
"eval_cos_sim": 0.0769369825720787,
"eval_loss": 0.9241049232696241,
"eval_runtime": 89.8268,
"eval_samples_per_second": 11.133,
"eval_steps_per_second": 0.356,
"step": 460
},
{
"epoch": 0.19129019129019129,
"grad_norm": 1.5488710403442383,
"learning_rate": 1.594084927418261e-07,
"loss": 0.9227,
"step": 470
},
{
"epoch": 0.19129019129019129,
"eval_cos_sim": 0.07736244797706604,
"eval_loss": 0.9236797976707166,
"eval_runtime": 89.9173,
"eval_samples_per_second": 11.121,
"eval_steps_per_second": 0.356,
"step": 470
},
{
"epoch": 0.19536019536019536,
"grad_norm": 1.5216211080551147,
"learning_rate": 1.628001628001628e-07,
"loss": 0.9196,
"step": 480
},
{
"epoch": 0.19536019536019536,
"eval_cos_sim": 0.07778871059417725,
"eval_loss": 0.9232538404678052,
"eval_runtime": 90.6603,
"eval_samples_per_second": 11.03,
"eval_steps_per_second": 0.353,
"step": 480
},
{
"epoch": 0.19943019943019943,
"grad_norm": 1.7952001094818115,
"learning_rate": 1.6619183285849953e-07,
"loss": 0.9113,
"step": 490
},
{
"epoch": 0.19943019943019943,
"eval_cos_sim": 0.07823511958122253,
"eval_loss": 0.922807737371797,
"eval_runtime": 90.9654,
"eval_samples_per_second": 10.993,
"eval_steps_per_second": 0.352,
"step": 490
},
{
"epoch": 0.2035002035002035,
"grad_norm": 1.5366356372833252,
"learning_rate": 1.6958350291683626e-07,
"loss": 0.9204,
"step": 500
},
{
"epoch": 0.2035002035002035,
"eval_cos_sim": 0.07868806272745132,
"eval_loss": 0.9223551345084852,
"eval_runtime": 90.5097,
"eval_samples_per_second": 11.049,
"eval_steps_per_second": 0.354,
"step": 500
},
{
"epoch": 0.20757020757020758,
"grad_norm": 1.5292208194732666,
"learning_rate": 1.7297517297517298e-07,
"loss": 0.9163,
"step": 510
},
{
"epoch": 0.20757020757020758,
"eval_cos_sim": 0.0791405662894249,
"eval_loss": 0.9219029598449415,
"eval_runtime": 90.3795,
"eval_samples_per_second": 11.064,
"eval_steps_per_second": 0.354,
"step": 510
},
{
"epoch": 0.21164021164021163,
"grad_norm": 1.8251779079437256,
"learning_rate": 1.763668430335097e-07,
"loss": 0.9177,
"step": 520
},
{
"epoch": 0.21164021164021163,
"eval_cos_sim": 0.07960349321365356,
"eval_loss": 0.921440408251161,
"eval_runtime": 90.2913,
"eval_samples_per_second": 11.075,
"eval_steps_per_second": 0.354,
"step": 520
},
{
"epoch": 0.2157102157102157,
"grad_norm": 1.8137582540512085,
"learning_rate": 1.7975851309184642e-07,
"loss": 0.9108,
"step": 530
},
{
"epoch": 0.2157102157102157,
"eval_cos_sim": 0.08007470518350601,
"eval_loss": 0.9209695620750136,
"eval_runtime": 90.4592,
"eval_samples_per_second": 11.055,
"eval_steps_per_second": 0.354,
"step": 530
},
{
"epoch": 0.21978021978021978,
"grad_norm": 1.5241363048553467,
"learning_rate": 1.8315018315018315e-07,
"loss": 0.9184,
"step": 540
},
{
"epoch": 0.21978021978021978,
"eval_cos_sim": 0.08055756241083145,
"eval_loss": 0.9204870734428113,
"eval_runtime": 91.1901,
"eval_samples_per_second": 10.966,
"eval_steps_per_second": 0.351,
"step": 540
},
{
"epoch": 0.22385022385022385,
"grad_norm": 1.8105984926223755,
"learning_rate": 1.865418532085199e-07,
"loss": 0.9157,
"step": 550
},
{
"epoch": 0.22385022385022385,
"eval_cos_sim": 0.08104187250137329,
"eval_loss": 0.9200031218742079,
"eval_runtime": 90.5659,
"eval_samples_per_second": 11.042,
"eval_steps_per_second": 0.353,
"step": 550
},
{
"epoch": 0.22792022792022792,
"grad_norm": 1.430216908454895,
"learning_rate": 1.8993352326685662e-07,
"loss": 0.9178,
"step": 560
},
{
"epoch": 0.22792022792022792,
"eval_cos_sim": 0.08153677731752396,
"eval_loss": 0.9195085845206923,
"eval_runtime": 91.0613,
"eval_samples_per_second": 10.982,
"eval_steps_per_second": 0.351,
"step": 560
},
{
"epoch": 0.231990231990232,
"grad_norm": 1.5421180725097656,
"learning_rate": 1.9332519332519332e-07,
"loss": 0.9142,
"step": 570
},
{
"epoch": 0.231990231990232,
"eval_cos_sim": 0.08204251527786255,
"eval_loss": 0.9190033140395826,
"eval_runtime": 90.396,
"eval_samples_per_second": 11.062,
"eval_steps_per_second": 0.354,
"step": 570
},
{
"epoch": 0.23606023606023607,
"grad_norm": 1.8284296989440918,
"learning_rate": 1.9671686338353007e-07,
"loss": 0.9138,
"step": 580
},
{
"epoch": 0.23606023606023607,
"eval_cos_sim": 0.08255407214164734,
"eval_loss": 0.9184921603416151,
"eval_runtime": 90.7456,
"eval_samples_per_second": 11.02,
"eval_steps_per_second": 0.353,
"step": 580
},
{
"epoch": 0.24013024013024012,
"grad_norm": 1.7948687076568604,
"learning_rate": 2.001085334418668e-07,
"loss": 0.9125,
"step": 590
},
{
"epoch": 0.24013024013024012,
"eval_cos_sim": 0.08306826651096344,
"eval_loss": 0.9179783883308119,
"eval_runtime": 90.7739,
"eval_samples_per_second": 11.016,
"eval_steps_per_second": 0.353,
"step": 590
},
{
"epoch": 0.2442002442002442,
"grad_norm": 1.7996617555618286,
"learning_rate": 2.0350020350020349e-07,
"loss": 0.9098,
"step": 600
},
{
"epoch": 0.2442002442002442,
"eval_cos_sim": 0.08360342681407928,
"eval_loss": 0.917443627378816,
"eval_runtime": 91.56,
"eval_samples_per_second": 10.922,
"eval_steps_per_second": 0.349,
"step": 600
},
{
"epoch": 0.24827024827024827,
"grad_norm": 1.5217535495758057,
"learning_rate": 2.0689187355854024e-07,
"loss": 0.9111,
"step": 610
},
{
"epoch": 0.24827024827024827,
"eval_cos_sim": 0.08415944874286652,
"eval_loss": 0.9168879828666395,
"eval_runtime": 91.0251,
"eval_samples_per_second": 10.986,
"eval_steps_per_second": 0.352,
"step": 610
},
{
"epoch": 0.2523402523402523,
"grad_norm": 1.549865961074829,
"learning_rate": 2.1028354361687696e-07,
"loss": 0.9089,
"step": 620
},
{
"epoch": 0.2523402523402523,
"eval_cos_sim": 0.08473014831542969,
"eval_loss": 0.9163176923011488,
"eval_runtime": 91.5776,
"eval_samples_per_second": 10.92,
"eval_steps_per_second": 0.349,
"step": 620
},
{
"epoch": 0.2564102564102564,
"grad_norm": 1.531300663948059,
"learning_rate": 2.136752136752137e-07,
"loss": 0.9091,
"step": 630
},
{
"epoch": 0.2564102564102564,
"eval_cos_sim": 0.08530249446630478,
"eval_loss": 0.9157457843040174,
"eval_runtime": 91.1083,
"eval_samples_per_second": 10.976,
"eval_steps_per_second": 0.351,
"step": 630
},
{
"epoch": 0.26048026048026046,
"grad_norm": 1.8052423000335693,
"learning_rate": 2.170668837335504e-07,
"loss": 0.9067,
"step": 640
},
{
"epoch": 0.26048026048026046,
"eval_cos_sim": 0.0858771950006485,
"eval_loss": 0.9151715054725355,
"eval_runtime": 92.0234,
"eval_samples_per_second": 10.867,
"eval_steps_per_second": 0.348,
"step": 640
},
{
"epoch": 0.26455026455026454,
"grad_norm": 1.513959288597107,
"learning_rate": 2.2045855379188713e-07,
"loss": 0.91,
"step": 650
},
{
"epoch": 0.26455026455026454,
"eval_cos_sim": 0.08646074682474136,
"eval_loss": 0.914588417551393,
"eval_runtime": 91.7102,
"eval_samples_per_second": 10.904,
"eval_steps_per_second": 0.349,
"step": 650
}
],
"logging_steps": 10,
"max_steps": 1474200,
"num_input_tokens_seen": 0,
"num_train_epochs": 600,
"save_steps": 10,
"total_flos": 0.0,
"train_batch_size": 160,
"trial_name": null,
"trial_params": null
}