{ "best_metric": 0.914588417551393, "best_model_checkpoint": "/p/scratch/ccstdl/krishna/finetuned-cosine-loss/checkpoint-650", "epoch": 0.26455026455026454, "eval_steps": 10, "global_step": 650, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00407000407000407, "grad_norm": 1.8234177827835083, "learning_rate": 3.391670058336725e-09, "loss": 0.9272, "step": 10 }, { "epoch": 0.00407000407000407, "eval_cos_sim": 0.0673384740948677, "eval_loss": 0.9336964531158155, "eval_runtime": 91.7154, "eval_samples_per_second": 10.903, "eval_steps_per_second": 0.349, "step": 10 }, { "epoch": 0.00814000814000814, "grad_norm": 1.7962757349014282, "learning_rate": 6.78334011667345e-09, "loss": 0.9259, "step": 20 }, { "epoch": 0.00814000814000814, "eval_cos_sim": 0.06735177338123322, "eval_loss": 0.9336831488822646, "eval_runtime": 89.447, "eval_samples_per_second": 11.18, "eval_steps_per_second": 0.358, "step": 20 }, { "epoch": 0.01221001221001221, "grad_norm": 1.784166693687439, "learning_rate": 1.0175010175010176e-08, "loss": 0.9296, "step": 30 }, { "epoch": 0.01221001221001221, "eval_cos_sim": 0.0673740804195404, "eval_loss": 0.9336608658050245, "eval_runtime": 89.5427, "eval_samples_per_second": 11.168, "eval_steps_per_second": 0.357, "step": 30 }, { "epoch": 0.01628001628001628, "grad_norm": 1.8226275444030762, "learning_rate": 1.35666802333469e-08, "loss": 0.9259, "step": 40 }, { "epoch": 0.01628001628001628, "eval_cos_sim": 0.06740561872720718, "eval_loss": 0.93362934305417, "eval_runtime": 89.3449, "eval_samples_per_second": 11.193, "eval_steps_per_second": 0.358, "step": 40 }, { "epoch": 0.02035002035002035, "grad_norm": 1.7831834554672241, "learning_rate": 1.6958350291683625e-08, "loss": 0.9251, "step": 50 }, { "epoch": 0.02035002035002035, "eval_cos_sim": 0.06744563579559326, "eval_loss": 0.933589364073152, "eval_runtime": 89.6651, "eval_samples_per_second": 11.153, "eval_steps_per_second": 0.357, "step": 50 }, { "epoch": 0.02442002442002442, "grad_norm": 1.5201727151870728, "learning_rate": 2.035002035002035e-08, "loss": 0.9276, "step": 60 }, { "epoch": 0.02442002442002442, "eval_cos_sim": 0.06749571859836578, "eval_loss": 0.9335393262122816, "eval_runtime": 89.3495, "eval_samples_per_second": 11.192, "eval_steps_per_second": 0.358, "step": 60 }, { "epoch": 0.02849002849002849, "grad_norm": 1.813641905784607, "learning_rate": 2.3741690408357078e-08, "loss": 0.9303, "step": 70 }, { "epoch": 0.02849002849002849, "eval_cos_sim": 0.06755577772855759, "eval_loss": 0.9334793086265272, "eval_runtime": 89.5359, "eval_samples_per_second": 11.169, "eval_steps_per_second": 0.357, "step": 70 }, { "epoch": 0.03256003256003256, "grad_norm": 1.8243420124053955, "learning_rate": 2.71333604666938e-08, "loss": 0.9288, "step": 80 }, { "epoch": 0.03256003256003256, "eval_cos_sim": 0.06762455403804779, "eval_loss": 0.9334105701659864, "eval_runtime": 89.5324, "eval_samples_per_second": 11.169, "eval_steps_per_second": 0.357, "step": 80 }, { "epoch": 0.03663003663003663, "grad_norm": 1.7983627319335938, "learning_rate": 3.052503052503053e-08, "loss": 0.9243, "step": 90 }, { "epoch": 0.03663003663003663, "eval_cos_sim": 0.06770093739032745, "eval_loss": 0.9333342409347242, "eval_runtime": 89.987, "eval_samples_per_second": 11.113, "eval_steps_per_second": 0.356, "step": 90 }, { "epoch": 0.0407000407000407, "grad_norm": 1.8116332292556763, "learning_rate": 3.391670058336725e-08, "loss": 0.9289, "step": 100 }, { "epoch": 0.0407000407000407, "eval_cos_sim": 0.06778644770383835, "eval_loss": 0.9332487797950453, "eval_runtime": 89.222, "eval_samples_per_second": 11.208, "eval_steps_per_second": 0.359, "step": 100 }, { "epoch": 0.04477004477004477, "grad_norm": 1.4440973997116089, "learning_rate": 3.730837064170397e-08, "loss": 0.9274, "step": 110 }, { "epoch": 0.04477004477004477, "eval_cos_sim": 0.06788003444671631, "eval_loss": 0.933155252001161, "eval_runtime": 89.3804, "eval_samples_per_second": 11.188, "eval_steps_per_second": 0.358, "step": 110 }, { "epoch": 0.04884004884004884, "grad_norm": 1.8082635402679443, "learning_rate": 4.07000407000407e-08, "loss": 0.9267, "step": 120 }, { "epoch": 0.04884004884004884, "eval_cos_sim": 0.06798040121793747, "eval_loss": 0.9330549512122815, "eval_runtime": 89.9274, "eval_samples_per_second": 11.12, "eval_steps_per_second": 0.356, "step": 120 }, { "epoch": 0.05291005291005291, "grad_norm": 1.8302645683288574, "learning_rate": 4.4091710758377425e-08, "loss": 0.9244, "step": 130 }, { "epoch": 0.05291005291005291, "eval_cos_sim": 0.06809354573488235, "eval_loss": 0.932941900274629, "eval_runtime": 89.8842, "eval_samples_per_second": 11.125, "eval_steps_per_second": 0.356, "step": 130 }, { "epoch": 0.05698005698005698, "grad_norm": 1.5440356731414795, "learning_rate": 4.7483380816714155e-08, "loss": 0.9316, "step": 140 }, { "epoch": 0.05698005698005698, "eval_cos_sim": 0.06821373105049133, "eval_loss": 0.9328218155120558, "eval_runtime": 89.7159, "eval_samples_per_second": 11.146, "eval_steps_per_second": 0.357, "step": 140 }, { "epoch": 0.06105006105006105, "grad_norm": 1.5421152114868164, "learning_rate": 5.087505087505087e-08, "loss": 0.927, "step": 150 }, { "epoch": 0.06105006105006105, "eval_cos_sim": 0.06834477931261063, "eval_loss": 0.9326908560012526, "eval_runtime": 89.7787, "eval_samples_per_second": 11.138, "eval_steps_per_second": 0.356, "step": 150 }, { "epoch": 0.06512006512006512, "grad_norm": 1.7998121976852417, "learning_rate": 5.42667209333876e-08, "loss": 0.9252, "step": 160 }, { "epoch": 0.06512006512006512, "eval_cos_sim": 0.06848659366369247, "eval_loss": 0.9325491523956007, "eval_runtime": 89.6926, "eval_samples_per_second": 11.149, "eval_steps_per_second": 0.357, "step": 160 }, { "epoch": 0.06919006919006919, "grad_norm": 1.542074203491211, "learning_rate": 5.7658390991724324e-08, "loss": 0.9252, "step": 170 }, { "epoch": 0.06919006919006919, "eval_cos_sim": 0.0686374306678772, "eval_loss": 0.9323984141563123, "eval_runtime": 89.453, "eval_samples_per_second": 11.179, "eval_steps_per_second": 0.358, "step": 170 }, { "epoch": 0.07326007326007326, "grad_norm": 1.5359669923782349, "learning_rate": 6.105006105006105e-08, "loss": 0.9267, "step": 180 }, { "epoch": 0.07326007326007326, "eval_cos_sim": 0.0687984898686409, "eval_loss": 0.9322374482368178, "eval_runtime": 89.6161, "eval_samples_per_second": 11.159, "eval_steps_per_second": 0.357, "step": 180 }, { "epoch": 0.07733007733007732, "grad_norm": 1.8200968503952026, "learning_rate": 6.444173110839778e-08, "loss": 0.9235, "step": 190 }, { "epoch": 0.07733007733007732, "eval_cos_sim": 0.06896985322237015, "eval_loss": 0.9320661959861464, "eval_runtime": 89.8737, "eval_samples_per_second": 11.127, "eval_steps_per_second": 0.356, "step": 190 }, { "epoch": 0.0814000814000814, "grad_norm": 1.8174818754196167, "learning_rate": 6.78334011667345e-08, "loss": 0.9248, "step": 200 }, { "epoch": 0.0814000814000814, "eval_cos_sim": 0.06914974004030228, "eval_loss": 0.9318864355300611, "eval_runtime": 89.8925, "eval_samples_per_second": 11.124, "eval_steps_per_second": 0.356, "step": 200 }, { "epoch": 0.08547008547008547, "grad_norm": 1.8348275423049927, "learning_rate": 7.122507122507124e-08, "loss": 0.9234, "step": 210 }, { "epoch": 0.08547008547008547, "eval_cos_sim": 0.06933821737766266, "eval_loss": 0.9316981101249403, "eval_runtime": 89.537, "eval_samples_per_second": 11.169, "eval_steps_per_second": 0.357, "step": 210 }, { "epoch": 0.08954008954008955, "grad_norm": 1.8035074472427368, "learning_rate": 7.461674128340795e-08, "loss": 0.9222, "step": 220 }, { "epoch": 0.08954008954008955, "eval_cos_sim": 0.06953444331884384, "eval_loss": 0.9315020323013014, "eval_runtime": 89.8235, "eval_samples_per_second": 11.133, "eval_steps_per_second": 0.356, "step": 220 }, { "epoch": 0.0936100936100936, "grad_norm": 1.8179575204849243, "learning_rate": 7.800841134174468e-08, "loss": 0.9286, "step": 230 }, { "epoch": 0.0936100936100936, "eval_cos_sim": 0.06974231451749802, "eval_loss": 0.9312943387244886, "eval_runtime": 89.2117, "eval_samples_per_second": 11.209, "eval_steps_per_second": 0.359, "step": 230 }, { "epoch": 0.09768009768009768, "grad_norm": 1.7966115474700928, "learning_rate": 8.14000814000814e-08, "loss": 0.9255, "step": 240 }, { "epoch": 0.09768009768009768, "eval_cos_sim": 0.06995726376771927, "eval_loss": 0.931079523107881, "eval_runtime": 89.804, "eval_samples_per_second": 11.135, "eval_steps_per_second": 0.356, "step": 240 }, { "epoch": 0.10175010175010175, "grad_norm": 1.797987461090088, "learning_rate": 8.479175145841813e-08, "loss": 0.9262, "step": 250 }, { "epoch": 0.10175010175010175, "eval_cos_sim": 0.07017555087804794, "eval_loss": 0.9308614025329298, "eval_runtime": 89.3997, "eval_samples_per_second": 11.186, "eval_steps_per_second": 0.358, "step": 250 }, { "epoch": 0.10582010582010581, "grad_norm": 1.8064875602722168, "learning_rate": 8.818342151675485e-08, "loss": 0.9273, "step": 260 }, { "epoch": 0.10582010582010581, "eval_cos_sim": 0.0703999400138855, "eval_loss": 0.9306371965621656, "eval_runtime": 89.9472, "eval_samples_per_second": 11.118, "eval_steps_per_second": 0.356, "step": 260 }, { "epoch": 0.10989010989010989, "grad_norm": 1.8176881074905396, "learning_rate": 9.157509157509157e-08, "loss": 0.9206, "step": 270 }, { "epoch": 0.10989010989010989, "eval_cos_sim": 0.07064417004585266, "eval_loss": 0.9303931417678541, "eval_runtime": 89.7373, "eval_samples_per_second": 11.144, "eval_steps_per_second": 0.357, "step": 270 }, { "epoch": 0.11396011396011396, "grad_norm": 1.5151400566101074, "learning_rate": 9.496676163342831e-08, "loss": 0.9211, "step": 280 }, { "epoch": 0.11396011396011396, "eval_cos_sim": 0.0709003284573555, "eval_loss": 0.9301371760581678, "eval_runtime": 89.9622, "eval_samples_per_second": 11.116, "eval_steps_per_second": 0.356, "step": 280 }, { "epoch": 0.11803011803011804, "grad_norm": 1.7980380058288574, "learning_rate": 9.835843169176503e-08, "loss": 0.9203, "step": 290 }, { "epoch": 0.11803011803011804, "eval_cos_sim": 0.07116622477769852, "eval_loss": 0.9298714599822706, "eval_runtime": 90.2995, "eval_samples_per_second": 11.074, "eval_steps_per_second": 0.354, "step": 290 }, { "epoch": 0.1221001221001221, "grad_norm": 1.5477901697158813, "learning_rate": 1.0175010175010174e-07, "loss": 0.9216, "step": 300 }, { "epoch": 0.1221001221001221, "eval_cos_sim": 0.07143931835889816, "eval_loss": 0.9295985355590528, "eval_runtime": 90.1186, "eval_samples_per_second": 11.096, "eval_steps_per_second": 0.355, "step": 300 }, { "epoch": 0.12617012617012616, "grad_norm": 1.7951384782791138, "learning_rate": 1.0514177180843848e-07, "loss": 0.9261, "step": 310 }, { "epoch": 0.12617012617012616, "eval_cos_sim": 0.07170899212360382, "eval_loss": 0.9293290882323927, "eval_runtime": 90.3148, "eval_samples_per_second": 11.072, "eval_steps_per_second": 0.354, "step": 310 }, { "epoch": 0.13024013024013023, "grad_norm": 1.7869328260421753, "learning_rate": 1.085334418667752e-07, "loss": 0.9189, "step": 320 }, { "epoch": 0.13024013024013023, "eval_cos_sim": 0.07198921591043472, "eval_loss": 0.9290490951751417, "eval_runtime": 89.77, "eval_samples_per_second": 11.14, "eval_steps_per_second": 0.356, "step": 320 }, { "epoch": 0.1343101343101343, "grad_norm": 1.5212459564208984, "learning_rate": 1.1192511192511194e-07, "loss": 0.9254, "step": 330 }, { "epoch": 0.1343101343101343, "eval_cos_sim": 0.07227852195501328, "eval_loss": 0.9287599940513319, "eval_runtime": 90.335, "eval_samples_per_second": 11.07, "eval_steps_per_second": 0.354, "step": 330 }, { "epoch": 0.13838013838013838, "grad_norm": 1.8259695768356323, "learning_rate": 1.1531678198344865e-07, "loss": 0.924, "step": 340 }, { "epoch": 0.13838013838013838, "eval_cos_sim": 0.072585329413414, "eval_loss": 0.928453390142793, "eval_runtime": 90.0139, "eval_samples_per_second": 11.109, "eval_steps_per_second": 0.356, "step": 340 }, { "epoch": 0.14245014245014245, "grad_norm": 1.810530424118042, "learning_rate": 1.1870845204178537e-07, "loss": 0.9223, "step": 350 }, { "epoch": 0.14245014245014245, "eval_cos_sim": 0.07290184497833252, "eval_loss": 0.92813706495511, "eval_runtime": 89.979, "eval_samples_per_second": 11.114, "eval_steps_per_second": 0.356, "step": 350 }, { "epoch": 0.14652014652014653, "grad_norm": 1.819162368774414, "learning_rate": 1.221001221001221e-07, "loss": 0.9227, "step": 360 }, { "epoch": 0.14652014652014653, "eval_cos_sim": 0.07322434335947037, "eval_loss": 0.9278148160194105, "eval_runtime": 90.5195, "eval_samples_per_second": 11.047, "eval_steps_per_second": 0.354, "step": 360 }, { "epoch": 0.1505901505901506, "grad_norm": 1.7979774475097656, "learning_rate": 1.2549179215845883e-07, "loss": 0.9185, "step": 370 }, { "epoch": 0.1505901505901506, "eval_cos_sim": 0.07355932146310806, "eval_loss": 0.9274800749038404, "eval_runtime": 90.4314, "eval_samples_per_second": 11.058, "eval_steps_per_second": 0.354, "step": 370 }, { "epoch": 0.15466015466015465, "grad_norm": 1.784081220626831, "learning_rate": 1.2888346221679555e-07, "loss": 0.9175, "step": 380 }, { "epoch": 0.15466015466015465, "eval_cos_sim": 0.07390536367893219, "eval_loss": 0.9271342835639662, "eval_runtime": 90.2731, "eval_samples_per_second": 11.077, "eval_steps_per_second": 0.354, "step": 380 }, { "epoch": 0.15873015873015872, "grad_norm": 1.8276127576828003, "learning_rate": 1.3227513227513228e-07, "loss": 0.9165, "step": 390 }, { "epoch": 0.15873015873015872, "eval_cos_sim": 0.07425787299871445, "eval_loss": 0.926782009146089, "eval_runtime": 89.6304, "eval_samples_per_second": 11.157, "eval_steps_per_second": 0.357, "step": 390 }, { "epoch": 0.1628001628001628, "grad_norm": 1.8162034749984741, "learning_rate": 1.35666802333469e-07, "loss": 0.9199, "step": 400 }, { "epoch": 0.1628001628001628, "eval_cos_sim": 0.07460718601942062, "eval_loss": 0.9264329524253553, "eval_runtime": 90.4314, "eval_samples_per_second": 11.058, "eval_steps_per_second": 0.354, "step": 400 }, { "epoch": 0.16687016687016687, "grad_norm": 1.4985719919204712, "learning_rate": 1.3905847239180572e-07, "loss": 0.9224, "step": 410 }, { "epoch": 0.16687016687016687, "eval_cos_sim": 0.0749591663479805, "eval_loss": 0.9260812273238844, "eval_runtime": 89.9689, "eval_samples_per_second": 11.115, "eval_steps_per_second": 0.356, "step": 410 }, { "epoch": 0.17094017094017094, "grad_norm": 1.8033661842346191, "learning_rate": 1.4245014245014247e-07, "loss": 0.9152, "step": 420 }, { "epoch": 0.17094017094017094, "eval_cos_sim": 0.07533340901136398, "eval_loss": 0.9257072530006116, "eval_runtime": 90.1973, "eval_samples_per_second": 11.087, "eval_steps_per_second": 0.355, "step": 420 }, { "epoch": 0.17501017501017502, "grad_norm": 1.7833577394485474, "learning_rate": 1.4584181250847917e-07, "loss": 0.9187, "step": 430 }, { "epoch": 0.17501017501017502, "eval_cos_sim": 0.07572542876005173, "eval_loss": 0.9253155136321729, "eval_runtime": 89.962, "eval_samples_per_second": 11.116, "eval_steps_per_second": 0.356, "step": 430 }, { "epoch": 0.1790801790801791, "grad_norm": 1.7877451181411743, "learning_rate": 1.492334825668159e-07, "loss": 0.9175, "step": 440 }, { "epoch": 0.1790801790801791, "eval_cos_sim": 0.0761212408542633, "eval_loss": 0.9249199991439527, "eval_runtime": 89.9831, "eval_samples_per_second": 11.113, "eval_steps_per_second": 0.356, "step": 440 }, { "epoch": 0.18315018315018314, "grad_norm": 1.7935380935668945, "learning_rate": 1.5262515262515264e-07, "loss": 0.9167, "step": 450 }, { "epoch": 0.18315018315018314, "eval_cos_sim": 0.07652699947357178, "eval_loss": 0.9245145797942823, "eval_runtime": 90.5936, "eval_samples_per_second": 11.038, "eval_steps_per_second": 0.353, "step": 450 }, { "epoch": 0.1872201872201872, "grad_norm": 1.8291174173355103, "learning_rate": 1.5601682268348936e-07, "loss": 0.9127, "step": 460 }, { "epoch": 0.1872201872201872, "eval_cos_sim": 0.0769369825720787, "eval_loss": 0.9241049232696241, "eval_runtime": 89.8268, "eval_samples_per_second": 11.133, "eval_steps_per_second": 0.356, "step": 460 }, { "epoch": 0.19129019129019129, "grad_norm": 1.5488710403442383, "learning_rate": 1.594084927418261e-07, "loss": 0.9227, "step": 470 }, { "epoch": 0.19129019129019129, "eval_cos_sim": 0.07736244797706604, "eval_loss": 0.9236797976707166, "eval_runtime": 89.9173, "eval_samples_per_second": 11.121, "eval_steps_per_second": 0.356, "step": 470 }, { "epoch": 0.19536019536019536, "grad_norm": 1.5216211080551147, "learning_rate": 1.628001628001628e-07, "loss": 0.9196, "step": 480 }, { "epoch": 0.19536019536019536, "eval_cos_sim": 0.07778871059417725, "eval_loss": 0.9232538404678052, "eval_runtime": 90.6603, "eval_samples_per_second": 11.03, "eval_steps_per_second": 0.353, "step": 480 }, { "epoch": 0.19943019943019943, "grad_norm": 1.7952001094818115, "learning_rate": 1.6619183285849953e-07, "loss": 0.9113, "step": 490 }, { "epoch": 0.19943019943019943, "eval_cos_sim": 0.07823511958122253, "eval_loss": 0.922807737371797, "eval_runtime": 90.9654, "eval_samples_per_second": 10.993, "eval_steps_per_second": 0.352, "step": 490 }, { "epoch": 0.2035002035002035, "grad_norm": 1.5366356372833252, "learning_rate": 1.6958350291683626e-07, "loss": 0.9204, "step": 500 }, { "epoch": 0.2035002035002035, "eval_cos_sim": 0.07868806272745132, "eval_loss": 0.9223551345084852, "eval_runtime": 90.5097, "eval_samples_per_second": 11.049, "eval_steps_per_second": 0.354, "step": 500 }, { "epoch": 0.20757020757020758, "grad_norm": 1.5292208194732666, "learning_rate": 1.7297517297517298e-07, "loss": 0.9163, "step": 510 }, { "epoch": 0.20757020757020758, "eval_cos_sim": 0.0791405662894249, "eval_loss": 0.9219029598449415, "eval_runtime": 90.3795, "eval_samples_per_second": 11.064, "eval_steps_per_second": 0.354, "step": 510 }, { "epoch": 0.21164021164021163, "grad_norm": 1.8251779079437256, "learning_rate": 1.763668430335097e-07, "loss": 0.9177, "step": 520 }, { "epoch": 0.21164021164021163, "eval_cos_sim": 0.07960349321365356, "eval_loss": 0.921440408251161, "eval_runtime": 90.2913, "eval_samples_per_second": 11.075, "eval_steps_per_second": 0.354, "step": 520 }, { "epoch": 0.2157102157102157, "grad_norm": 1.8137582540512085, "learning_rate": 1.7975851309184642e-07, "loss": 0.9108, "step": 530 }, { "epoch": 0.2157102157102157, "eval_cos_sim": 0.08007470518350601, "eval_loss": 0.9209695620750136, "eval_runtime": 90.4592, "eval_samples_per_second": 11.055, "eval_steps_per_second": 0.354, "step": 530 }, { "epoch": 0.21978021978021978, "grad_norm": 1.5241363048553467, "learning_rate": 1.8315018315018315e-07, "loss": 0.9184, "step": 540 }, { "epoch": 0.21978021978021978, "eval_cos_sim": 0.08055756241083145, "eval_loss": 0.9204870734428113, "eval_runtime": 91.1901, "eval_samples_per_second": 10.966, "eval_steps_per_second": 0.351, "step": 540 }, { "epoch": 0.22385022385022385, "grad_norm": 1.8105984926223755, "learning_rate": 1.865418532085199e-07, "loss": 0.9157, "step": 550 }, { "epoch": 0.22385022385022385, "eval_cos_sim": 0.08104187250137329, "eval_loss": 0.9200031218742079, "eval_runtime": 90.5659, "eval_samples_per_second": 11.042, "eval_steps_per_second": 0.353, "step": 550 }, { "epoch": 0.22792022792022792, "grad_norm": 1.430216908454895, "learning_rate": 1.8993352326685662e-07, "loss": 0.9178, "step": 560 }, { "epoch": 0.22792022792022792, "eval_cos_sim": 0.08153677731752396, "eval_loss": 0.9195085845206923, "eval_runtime": 91.0613, "eval_samples_per_second": 10.982, "eval_steps_per_second": 0.351, "step": 560 }, { "epoch": 0.231990231990232, "grad_norm": 1.5421180725097656, "learning_rate": 1.9332519332519332e-07, "loss": 0.9142, "step": 570 }, { "epoch": 0.231990231990232, "eval_cos_sim": 0.08204251527786255, "eval_loss": 0.9190033140395826, "eval_runtime": 90.396, "eval_samples_per_second": 11.062, "eval_steps_per_second": 0.354, "step": 570 }, { "epoch": 0.23606023606023607, "grad_norm": 1.8284296989440918, "learning_rate": 1.9671686338353007e-07, "loss": 0.9138, "step": 580 }, { "epoch": 0.23606023606023607, "eval_cos_sim": 0.08255407214164734, "eval_loss": 0.9184921603416151, "eval_runtime": 90.7456, "eval_samples_per_second": 11.02, "eval_steps_per_second": 0.353, "step": 580 }, { "epoch": 0.24013024013024012, "grad_norm": 1.7948687076568604, "learning_rate": 2.001085334418668e-07, "loss": 0.9125, "step": 590 }, { "epoch": 0.24013024013024012, "eval_cos_sim": 0.08306826651096344, "eval_loss": 0.9179783883308119, "eval_runtime": 90.7739, "eval_samples_per_second": 11.016, "eval_steps_per_second": 0.353, "step": 590 }, { "epoch": 0.2442002442002442, "grad_norm": 1.7996617555618286, "learning_rate": 2.0350020350020349e-07, "loss": 0.9098, "step": 600 }, { "epoch": 0.2442002442002442, "eval_cos_sim": 0.08360342681407928, "eval_loss": 0.917443627378816, "eval_runtime": 91.56, "eval_samples_per_second": 10.922, "eval_steps_per_second": 0.349, "step": 600 }, { "epoch": 0.24827024827024827, "grad_norm": 1.5217535495758057, "learning_rate": 2.0689187355854024e-07, "loss": 0.9111, "step": 610 }, { "epoch": 0.24827024827024827, "eval_cos_sim": 0.08415944874286652, "eval_loss": 0.9168879828666395, "eval_runtime": 91.0251, "eval_samples_per_second": 10.986, "eval_steps_per_second": 0.352, "step": 610 }, { "epoch": 0.2523402523402523, "grad_norm": 1.549865961074829, "learning_rate": 2.1028354361687696e-07, "loss": 0.9089, "step": 620 }, { "epoch": 0.2523402523402523, "eval_cos_sim": 0.08473014831542969, "eval_loss": 0.9163176923011488, "eval_runtime": 91.5776, "eval_samples_per_second": 10.92, "eval_steps_per_second": 0.349, "step": 620 }, { "epoch": 0.2564102564102564, "grad_norm": 1.531300663948059, "learning_rate": 2.136752136752137e-07, "loss": 0.9091, "step": 630 }, { "epoch": 0.2564102564102564, "eval_cos_sim": 0.08530249446630478, "eval_loss": 0.9157457843040174, "eval_runtime": 91.1083, "eval_samples_per_second": 10.976, "eval_steps_per_second": 0.351, "step": 630 }, { "epoch": 0.26048026048026046, "grad_norm": 1.8052423000335693, "learning_rate": 2.170668837335504e-07, "loss": 0.9067, "step": 640 }, { "epoch": 0.26048026048026046, "eval_cos_sim": 0.0858771950006485, "eval_loss": 0.9151715054725355, "eval_runtime": 92.0234, "eval_samples_per_second": 10.867, "eval_steps_per_second": 0.348, "step": 640 }, { "epoch": 0.26455026455026454, "grad_norm": 1.513959288597107, "learning_rate": 2.2045855379188713e-07, "loss": 0.91, "step": 650 }, { "epoch": 0.26455026455026454, "eval_cos_sim": 0.08646074682474136, "eval_loss": 0.914588417551393, "eval_runtime": 91.7102, "eval_samples_per_second": 10.904, "eval_steps_per_second": 0.349, "step": 650 } ], "logging_steps": 10, "max_steps": 1474200, "num_input_tokens_seen": 0, "num_train_epochs": 600, "save_steps": 10, "total_flos": 0.0, "train_batch_size": 160, "trial_name": null, "trial_params": null }