{ "best_metric": 0.7333984103416151, "best_model_checkpoint": "/p/scratch/ccstdl/krishna/finetuned-cosine-loss/checkpoint-660", "epoch": 0.2686202686202686, "eval_steps": 10, "global_step": 660, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00407000407000407, "grad_norm": 1.8124009370803833, "learning_rate": 3.391670058336725e-09, "loss": 0.732, "step": 10 }, { "epoch": 0.00407000407000407, "eval_cos_sim": 0.25005149841308594, "eval_loss": 0.7511245851730055, "eval_runtime": 89.901, "eval_samples_per_second": 11.123, "eval_steps_per_second": 0.356, "step": 10 }, { "epoch": 0.00814000814000814, "grad_norm": 1.8144397735595703, "learning_rate": 6.78334011667345e-09, "loss": 0.7341, "step": 20 }, { "epoch": 0.00814000814000814, "eval_cos_sim": 0.25006377696990967, "eval_loss": 0.7511123490546888, "eval_runtime": 88.402, "eval_samples_per_second": 11.312, "eval_steps_per_second": 0.362, "step": 20 }, { "epoch": 0.01221001221001221, "grad_norm": 1.7918556928634644, "learning_rate": 1.0175010175010176e-08, "loss": 0.7425, "step": 30 }, { "epoch": 0.01221001221001221, "eval_cos_sim": 0.2500843405723572, "eval_loss": 0.7510917806838697, "eval_runtime": 88.6959, "eval_samples_per_second": 11.274, "eval_steps_per_second": 0.361, "step": 30 }, { "epoch": 0.01628001628001628, "grad_norm": 1.804418683052063, "learning_rate": 1.35666802333469e-08, "loss": 0.7437, "step": 40 }, { "epoch": 0.01628001628001628, "eval_cos_sim": 0.25011318922042847, "eval_loss": 0.7510629492019362, "eval_runtime": 88.1716, "eval_samples_per_second": 11.342, "eval_steps_per_second": 0.363, "step": 40 }, { "epoch": 0.02035002035002035, "grad_norm": 1.794533133506775, "learning_rate": 1.6958350291683625e-08, "loss": 0.7319, "step": 50 }, { "epoch": 0.02035002035002035, "eval_cos_sim": 0.2501494884490967, "eval_loss": 0.7510266695235914, "eval_runtime": 87.5335, "eval_samples_per_second": 11.424, "eval_steps_per_second": 0.366, "step": 50 }, { "epoch": 0.02442002442002442, "grad_norm": 1.5170952081680298, "learning_rate": 2.035002035002035e-08, "loss": 0.7504, "step": 60 }, { "epoch": 0.02442002442002442, "eval_cos_sim": 0.25019460916519165, "eval_loss": 0.7509815736030286, "eval_runtime": 88.3799, "eval_samples_per_second": 11.315, "eval_steps_per_second": 0.362, "step": 60 }, { "epoch": 0.02849002849002849, "grad_norm": 1.5275336503982544, "learning_rate": 2.3741690408357078e-08, "loss": 0.7458, "step": 70 }, { "epoch": 0.02849002849002849, "eval_cos_sim": 0.2502490282058716, "eval_loss": 0.7509272141669935, "eval_runtime": 88.4986, "eval_samples_per_second": 11.3, "eval_steps_per_second": 0.362, "step": 70 }, { "epoch": 0.03256003256003256, "grad_norm": 1.7890334129333496, "learning_rate": 2.71333604666938e-08, "loss": 0.764, "step": 80 }, { "epoch": 0.03256003256003256, "eval_cos_sim": 0.2503115236759186, "eval_loss": 0.7508647556518263, "eval_runtime": 88.1347, "eval_samples_per_second": 11.346, "eval_steps_per_second": 0.363, "step": 80 }, { "epoch": 0.03663003663003663, "grad_norm": 1.8015758991241455, "learning_rate": 3.052503052503053e-08, "loss": 0.7331, "step": 90 }, { "epoch": 0.03663003663003663, "eval_cos_sim": 0.2503812611103058, "eval_loss": 0.7507950768684095, "eval_runtime": 88.5092, "eval_samples_per_second": 11.298, "eval_steps_per_second": 0.362, "step": 90 }, { "epoch": 0.0407000407000407, "grad_norm": 1.786906361579895, "learning_rate": 3.391670058336725e-08, "loss": 0.7403, "step": 100 }, { "epoch": 0.0407000407000407, "eval_cos_sim": 0.25045979022979736, "eval_loss": 0.750716588518495, "eval_runtime": 87.8285, "eval_samples_per_second": 11.386, "eval_steps_per_second": 0.364, "step": 100 }, { "epoch": 0.04477004477004477, "grad_norm": 1.5222004652023315, "learning_rate": 3.730837064170397e-08, "loss": 0.7402, "step": 110 }, { "epoch": 0.04477004477004477, "eval_cos_sim": 0.250546395778656, "eval_loss": 0.7506300463889783, "eval_runtime": 88.743, "eval_samples_per_second": 11.268, "eval_steps_per_second": 0.361, "step": 110 }, { "epoch": 0.04884004884004884, "grad_norm": 1.8194061517715454, "learning_rate": 4.07000407000407e-08, "loss": 0.7417, "step": 120 }, { "epoch": 0.04884004884004884, "eval_cos_sim": 0.2506391704082489, "eval_loss": 0.7505373349403089, "eval_runtime": 88.5658, "eval_samples_per_second": 11.291, "eval_steps_per_second": 0.361, "step": 120 }, { "epoch": 0.05291005291005291, "grad_norm": 1.8265488147735596, "learning_rate": 4.4091710758377425e-08, "loss": 0.7297, "step": 130 }, { "epoch": 0.05291005291005291, "eval_cos_sim": 0.2507420480251312, "eval_loss": 0.7504345479224866, "eval_runtime": 88.6555, "eval_samples_per_second": 11.28, "eval_steps_per_second": 0.361, "step": 130 }, { "epoch": 0.05698005698005698, "grad_norm": 1.7994428873062134, "learning_rate": 4.7483380816714155e-08, "loss": 0.7484, "step": 140 }, { "epoch": 0.05698005698005698, "eval_cos_sim": 0.2508507966995239, "eval_loss": 0.7503258481239027, "eval_runtime": 89.045, "eval_samples_per_second": 11.23, "eval_steps_per_second": 0.359, "step": 140 }, { "epoch": 0.06105006105006105, "grad_norm": 1.5162811279296875, "learning_rate": 5.087505087505087e-08, "loss": 0.7453, "step": 150 }, { "epoch": 0.06105006105006105, "eval_cos_sim": 0.2509710192680359, "eval_loss": 0.7502057595466322, "eval_runtime": 88.4537, "eval_samples_per_second": 11.305, "eval_steps_per_second": 0.362, "step": 150 }, { "epoch": 0.06512006512006512, "grad_norm": 1.5280661582946777, "learning_rate": 5.42667209333876e-08, "loss": 0.7404, "step": 160 }, { "epoch": 0.06512006512006512, "eval_cos_sim": 0.2511015236377716, "eval_loss": 0.7500753340934462, "eval_runtime": 87.8406, "eval_samples_per_second": 11.384, "eval_steps_per_second": 0.364, "step": 160 }, { "epoch": 0.06919006919006919, "grad_norm": 1.5292084217071533, "learning_rate": 5.7658390991724324e-08, "loss": 0.7317, "step": 170 }, { "epoch": 0.06919006919006919, "eval_cos_sim": 0.2512374520301819, "eval_loss": 0.7499395051215834, "eval_runtime": 88.4647, "eval_samples_per_second": 11.304, "eval_steps_per_second": 0.362, "step": 170 }, { "epoch": 0.07326007326007326, "grad_norm": 1.5250757932662964, "learning_rate": 6.105006105006105e-08, "loss": 0.7437, "step": 180 }, { "epoch": 0.07326007326007326, "eval_cos_sim": 0.25138044357299805, "eval_loss": 0.7497965970252699, "eval_runtime": 88.2947, "eval_samples_per_second": 11.326, "eval_steps_per_second": 0.362, "step": 180 }, { "epoch": 0.07733007733007732, "grad_norm": 1.8300116062164307, "learning_rate": 6.444173110839778e-08, "loss": 0.7374, "step": 190 }, { "epoch": 0.07733007733007732, "eval_cos_sim": 0.25153324007987976, "eval_loss": 0.7496438927863782, "eval_runtime": 88.4194, "eval_samples_per_second": 11.31, "eval_steps_per_second": 0.362, "step": 190 }, { "epoch": 0.0814000814000814, "grad_norm": 1.7857962846755981, "learning_rate": 6.78334011667345e-08, "loss": 0.7294, "step": 200 }, { "epoch": 0.0814000814000814, "eval_cos_sim": 0.2516980767250061, "eval_loss": 0.7494791708205885, "eval_runtime": 88.859, "eval_samples_per_second": 11.254, "eval_steps_per_second": 0.36, "step": 200 }, { "epoch": 0.08547008547008547, "grad_norm": 1.8195607662200928, "learning_rate": 7.122507122507124e-08, "loss": 0.7441, "step": 210 }, { "epoch": 0.08547008547008547, "eval_cos_sim": 0.25187191367149353, "eval_loss": 0.7493054814552015, "eval_runtime": 88.3989, "eval_samples_per_second": 11.312, "eval_steps_per_second": 0.362, "step": 210 }, { "epoch": 0.08954008954008955, "grad_norm": 1.7800198793411255, "learning_rate": 7.461674128340795e-08, "loss": 0.7395, "step": 220 }, { "epoch": 0.08954008954008955, "eval_cos_sim": 0.2520497441291809, "eval_loss": 0.7491277885650343, "eval_runtime": 88.4463, "eval_samples_per_second": 11.306, "eval_steps_per_second": 0.362, "step": 220 }, { "epoch": 0.0936100936100936, "grad_norm": 1.5236977338790894, "learning_rate": 7.800841134174468e-08, "loss": 0.7402, "step": 230 }, { "epoch": 0.0936100936100936, "eval_cos_sim": 0.25224006175994873, "eval_loss": 0.7489376101707167, "eval_runtime": 88.645, "eval_samples_per_second": 11.281, "eval_steps_per_second": 0.361, "step": 230 }, { "epoch": 0.09768009768009768, "grad_norm": 1.79439377784729, "learning_rate": 8.14000814000814e-08, "loss": 0.7432, "step": 240 }, { "epoch": 0.09768009768009768, "eval_cos_sim": 0.25243115425109863, "eval_loss": 0.7487466225837416, "eval_runtime": 88.5544, "eval_samples_per_second": 11.292, "eval_steps_per_second": 0.361, "step": 240 }, { "epoch": 0.10175010175010175, "grad_norm": 1.8155500888824463, "learning_rate": 8.479175145841813e-08, "loss": 0.7358, "step": 250 }, { "epoch": 0.10175010175010175, "eval_cos_sim": 0.25262758135795593, "eval_loss": 0.7485503211234754, "eval_runtime": 88.6192, "eval_samples_per_second": 11.284, "eval_steps_per_second": 0.361, "step": 250 }, { "epoch": 0.10582010582010581, "grad_norm": 1.8234913349151611, "learning_rate": 8.818342151675485e-08, "loss": 0.7418, "step": 260 }, { "epoch": 0.10582010582010581, "eval_cos_sim": 0.25283777713775635, "eval_loss": 0.7483402905677503, "eval_runtime": 89.1169, "eval_samples_per_second": 11.221, "eval_steps_per_second": 0.359, "step": 260 }, { "epoch": 0.10989010989010989, "grad_norm": 1.808308482170105, "learning_rate": 9.157509157509157e-08, "loss": 0.7298, "step": 270 }, { "epoch": 0.10989010989010989, "eval_cos_sim": 0.2530568242073059, "eval_loss": 0.7481214027618116, "eval_runtime": 88.155, "eval_samples_per_second": 11.344, "eval_steps_per_second": 0.363, "step": 270 }, { "epoch": 0.11396011396011396, "grad_norm": 1.7929309606552124, "learning_rate": 9.496676163342831e-08, "loss": 0.7257, "step": 280 }, { "epoch": 0.11396011396011396, "eval_cos_sim": 0.25328928232192993, "eval_loss": 0.747889132521028, "eval_runtime": 89.0622, "eval_samples_per_second": 11.228, "eval_steps_per_second": 0.359, "step": 280 }, { "epoch": 0.11803011803011804, "grad_norm": 1.4342882633209229, "learning_rate": 9.835843169176503e-08, "loss": 0.7451, "step": 290 }, { "epoch": 0.11803011803011804, "eval_cos_sim": 0.2535253167152405, "eval_loss": 0.7476532855247205, "eval_runtime": 89.396, "eval_samples_per_second": 11.186, "eval_steps_per_second": 0.358, "step": 290 }, { "epoch": 0.1221001221001221, "grad_norm": 1.7873544692993164, "learning_rate": 1.0175010175010174e-07, "loss": 0.7289, "step": 300 }, { "epoch": 0.1221001221001221, "eval_cos_sim": 0.25376906991004944, "eval_loss": 0.7474096789573378, "eval_runtime": 88.7445, "eval_samples_per_second": 11.268, "eval_steps_per_second": 0.361, "step": 300 }, { "epoch": 0.12617012617012616, "grad_norm": 1.4993253946304321, "learning_rate": 1.0514177180843848e-07, "loss": 0.7498, "step": 310 }, { "epoch": 0.12617012617012616, "eval_cos_sim": 0.2540150582790375, "eval_loss": 0.747163932344789, "eval_runtime": 88.8446, "eval_samples_per_second": 11.256, "eval_steps_per_second": 0.36, "step": 310 }, { "epoch": 0.13024013024013023, "grad_norm": 1.5209181308746338, "learning_rate": 1.085334418667752e-07, "loss": 0.7332, "step": 320 }, { "epoch": 0.13024013024013023, "eval_cos_sim": 0.25427138805389404, "eval_loss": 0.7469077882980055, "eval_runtime": 89.1231, "eval_samples_per_second": 11.22, "eval_steps_per_second": 0.359, "step": 320 }, { "epoch": 0.1343101343101343, "grad_norm": 1.8111993074417114, "learning_rate": 1.1192511192511194e-07, "loss": 0.7377, "step": 330 }, { "epoch": 0.1343101343101343, "eval_cos_sim": 0.2545357644557953, "eval_loss": 0.7466436038230604, "eval_runtime": 88.9406, "eval_samples_per_second": 11.243, "eval_steps_per_second": 0.36, "step": 330 }, { "epoch": 0.13838013838013838, "grad_norm": 1.5225635766983032, "learning_rate": 1.1531678198344865e-07, "loss": 0.7411, "step": 340 }, { "epoch": 0.13838013838013838, "eval_cos_sim": 0.25481417775154114, "eval_loss": 0.7463653974746413, "eval_runtime": 88.552, "eval_samples_per_second": 11.293, "eval_steps_per_second": 0.361, "step": 340 }, { "epoch": 0.14245014245014245, "grad_norm": 1.81794273853302, "learning_rate": 1.1870845204178537e-07, "loss": 0.7482, "step": 350 }, { "epoch": 0.14245014245014245, "eval_cos_sim": 0.2550930678844452, "eval_loss": 0.7460866894935316, "eval_runtime": 88.1664, "eval_samples_per_second": 11.342, "eval_steps_per_second": 0.363, "step": 350 }, { "epoch": 0.14652014652014653, "grad_norm": 1.4908636808395386, "learning_rate": 1.221001221001221e-07, "loss": 0.7319, "step": 360 }, { "epoch": 0.14652014652014653, "eval_cos_sim": 0.2553824186325073, "eval_loss": 0.7457975163673108, "eval_runtime": 88.5321, "eval_samples_per_second": 11.295, "eval_steps_per_second": 0.361, "step": 360 }, { "epoch": 0.1505901505901506, "grad_norm": 1.7756201028823853, "learning_rate": 1.2549179215845883e-07, "loss": 0.7245, "step": 370 }, { "epoch": 0.1505901505901506, "eval_cos_sim": 0.25568509101867676, "eval_loss": 0.7454950423453993, "eval_runtime": 88.9525, "eval_samples_per_second": 11.242, "eval_steps_per_second": 0.36, "step": 370 }, { "epoch": 0.15466015466015465, "grad_norm": 1.5107450485229492, "learning_rate": 1.2888346221679555e-07, "loss": 0.7453, "step": 380 }, { "epoch": 0.15466015466015465, "eval_cos_sim": 0.25599703192710876, "eval_loss": 0.7451833567832655, "eval_runtime": 88.5223, "eval_samples_per_second": 11.297, "eval_steps_per_second": 0.361, "step": 380 }, { "epoch": 0.15873015873015872, "grad_norm": 1.8164764642715454, "learning_rate": 1.3227513227513228e-07, "loss": 0.7226, "step": 390 }, { "epoch": 0.15873015873015872, "eval_cos_sim": 0.25630947947502136, "eval_loss": 0.7448710847114272, "eval_runtime": 89.3118, "eval_samples_per_second": 11.197, "eval_steps_per_second": 0.358, "step": 390 }, { "epoch": 0.1628001628001628, "grad_norm": 1.500030279159546, "learning_rate": 1.35666802333469e-07, "loss": 0.7354, "step": 400 }, { "epoch": 0.1628001628001628, "eval_cos_sim": 0.2566259205341339, "eval_loss": 0.7445548601363844, "eval_runtime": 89.5491, "eval_samples_per_second": 11.167, "eval_steps_per_second": 0.357, "step": 400 }, { "epoch": 0.16687016687016687, "grad_norm": 1.4817250967025757, "learning_rate": 1.3905847239180572e-07, "loss": 0.7507, "step": 410 }, { "epoch": 0.16687016687016687, "eval_cos_sim": 0.25694403052330017, "eval_loss": 0.744236885568971, "eval_runtime": 88.9828, "eval_samples_per_second": 11.238, "eval_steps_per_second": 0.36, "step": 410 }, { "epoch": 0.17094017094017094, "grad_norm": 1.510607123374939, "learning_rate": 1.4245014245014247e-07, "loss": 0.7244, "step": 420 }, { "epoch": 0.17094017094017094, "eval_cos_sim": 0.2572762966156006, "eval_loss": 0.7439048562263196, "eval_runtime": 88.9759, "eval_samples_per_second": 11.239, "eval_steps_per_second": 0.36, "step": 420 }, { "epoch": 0.17501017501017502, "grad_norm": 1.7870216369628906, "learning_rate": 1.4584181250847917e-07, "loss": 0.7468, "step": 430 }, { "epoch": 0.17501017501017502, "eval_cos_sim": 0.2576209008693695, "eval_loss": 0.7435604424689954, "eval_runtime": 89.244, "eval_samples_per_second": 11.205, "eval_steps_per_second": 0.359, "step": 430 }, { "epoch": 0.1790801790801791, "grad_norm": 1.7750380039215088, "learning_rate": 1.492334825668159e-07, "loss": 0.7291, "step": 440 }, { "epoch": 0.1790801790801791, "eval_cos_sim": 0.2579769194126129, "eval_loss": 0.7432047133659071, "eval_runtime": 88.5309, "eval_samples_per_second": 11.295, "eval_steps_per_second": 0.361, "step": 440 }, { "epoch": 0.18315018315018314, "grad_norm": 1.7759586572647095, "learning_rate": 1.5262515262515264e-07, "loss": 0.7293, "step": 450 }, { "epoch": 0.18315018315018314, "eval_cos_sim": 0.25834715366363525, "eval_loss": 0.7428347926353163, "eval_runtime": 89.2461, "eval_samples_per_second": 11.205, "eval_steps_per_second": 0.359, "step": 450 }, { "epoch": 0.1872201872201872, "grad_norm": 1.8177165985107422, "learning_rate": 1.5601682268348936e-07, "loss": 0.7326, "step": 460 }, { "epoch": 0.1872201872201872, "eval_cos_sim": 0.25872257351875305, "eval_loss": 0.7424596538757032, "eval_runtime": 89.1253, "eval_samples_per_second": 11.22, "eval_steps_per_second": 0.359, "step": 460 }, { "epoch": 0.19129019129019129, "grad_norm": 1.5271542072296143, "learning_rate": 1.594084927418261e-07, "loss": 0.7358, "step": 470 }, { "epoch": 0.19129019129019129, "eval_cos_sim": 0.25911861658096313, "eval_loss": 0.742063922426576, "eval_runtime": 88.958, "eval_samples_per_second": 11.241, "eval_steps_per_second": 0.36, "step": 470 }, { "epoch": 0.19536019536019536, "grad_norm": 1.519902229309082, "learning_rate": 1.628001628001628e-07, "loss": 0.7568, "step": 480 }, { "epoch": 0.19536019536019536, "eval_cos_sim": 0.2595009207725525, "eval_loss": 0.7416818804954237, "eval_runtime": 89.0027, "eval_samples_per_second": 11.236, "eval_steps_per_second": 0.36, "step": 480 }, { "epoch": 0.19943019943019943, "grad_norm": 1.5114340782165527, "learning_rate": 1.6619183285849953e-07, "loss": 0.7242, "step": 490 }, { "epoch": 0.19943019943019943, "eval_cos_sim": 0.2598978579044342, "eval_loss": 0.7412852273200696, "eval_runtime": 89.2389, "eval_samples_per_second": 11.206, "eval_steps_per_second": 0.359, "step": 490 }, { "epoch": 0.2035002035002035, "grad_norm": 1.78502357006073, "learning_rate": 1.6958350291683626e-07, "loss": 0.7339, "step": 500 }, { "epoch": 0.2035002035002035, "eval_cos_sim": 0.26030153036117554, "eval_loss": 0.7408818502639478, "eval_runtime": 88.3154, "eval_samples_per_second": 11.323, "eval_steps_per_second": 0.362, "step": 500 }, { "epoch": 0.20757020757020758, "grad_norm": 1.540235161781311, "learning_rate": 1.7297517297517298e-07, "loss": 0.7322, "step": 510 }, { "epoch": 0.20757020757020758, "eval_cos_sim": 0.26071667671203613, "eval_loss": 0.7404670024131483, "eval_runtime": 88.9616, "eval_samples_per_second": 11.241, "eval_steps_per_second": 0.36, "step": 510 }, { "epoch": 0.21164021164021163, "grad_norm": 1.8035300970077515, "learning_rate": 1.763668430335097e-07, "loss": 0.721, "step": 520 }, { "epoch": 0.21164021164021163, "eval_cos_sim": 0.26114267110824585, "eval_loss": 0.7400413398956007, "eval_runtime": 89.0108, "eval_samples_per_second": 11.235, "eval_steps_per_second": 0.36, "step": 520 }, { "epoch": 0.2157102157102157, "grad_norm": 1.5290201902389526, "learning_rate": 1.7975851309184642e-07, "loss": 0.7263, "step": 530 }, { "epoch": 0.2157102157102157, "eval_cos_sim": 0.2615705728530884, "eval_loss": 0.7396137051795667, "eval_runtime": 89.0214, "eval_samples_per_second": 11.233, "eval_steps_per_second": 0.359, "step": 530 }, { "epoch": 0.21978021978021978, "grad_norm": 1.5043171644210815, "learning_rate": 1.8315018315018315e-07, "loss": 0.7215, "step": 540 }, { "epoch": 0.21978021978021978, "eval_cos_sim": 0.26200759410858154, "eval_loss": 0.7391770148490614, "eval_runtime": 89.0696, "eval_samples_per_second": 11.227, "eval_steps_per_second": 0.359, "step": 540 }, { "epoch": 0.22385022385022385, "grad_norm": 1.8029693365097046, "learning_rate": 1.865418532085199e-07, "loss": 0.7221, "step": 550 }, { "epoch": 0.22385022385022385, "eval_cos_sim": 0.26245614886283875, "eval_loss": 0.7387287416671461, "eval_runtime": 89.6121, "eval_samples_per_second": 11.159, "eval_steps_per_second": 0.357, "step": 550 }, { "epoch": 0.22792022792022792, "grad_norm": 1.492769479751587, "learning_rate": 1.8993352326685662e-07, "loss": 0.7335, "step": 560 }, { "epoch": 0.22792022792022792, "eval_cos_sim": 0.2629122734069824, "eval_loss": 0.738272936842317, "eval_runtime": 89.4796, "eval_samples_per_second": 11.176, "eval_steps_per_second": 0.358, "step": 560 }, { "epoch": 0.231990231990232, "grad_norm": 1.4421427249908447, "learning_rate": 1.9332519332519332e-07, "loss": 0.7335, "step": 570 }, { "epoch": 0.231990231990232, "eval_cos_sim": 0.2633700668811798, "eval_loss": 0.7378155255531019, "eval_runtime": 89.3151, "eval_samples_per_second": 11.196, "eval_steps_per_second": 0.358, "step": 570 }, { "epoch": 0.23606023606023607, "grad_norm": 1.8092644214630127, "learning_rate": 1.9671686338353007e-07, "loss": 0.7406, "step": 580 }, { "epoch": 0.23606023606023607, "eval_cos_sim": 0.26382094621658325, "eval_loss": 0.7373649797652906, "eval_runtime": 89.6512, "eval_samples_per_second": 11.154, "eval_steps_per_second": 0.357, "step": 580 }, { "epoch": 0.24013024013024012, "grad_norm": 1.7754801511764526, "learning_rate": 2.001085334418668e-07, "loss": 0.7369, "step": 590 }, { "epoch": 0.24013024013024012, "eval_cos_sim": 0.2642780840396881, "eval_loss": 0.7369081225608534, "eval_runtime": 89.4603, "eval_samples_per_second": 11.178, "eval_steps_per_second": 0.358, "step": 590 }, { "epoch": 0.2442002442002442, "grad_norm": 1.8073511123657227, "learning_rate": 2.0350020350020349e-07, "loss": 0.723, "step": 600 }, { "epoch": 0.2442002442002442, "eval_cos_sim": 0.2647515833377838, "eval_loss": 0.7364349565719313, "eval_runtime": 89.5556, "eval_samples_per_second": 11.166, "eval_steps_per_second": 0.357, "step": 600 }, { "epoch": 0.24827024827024827, "grad_norm": 1.5272294282913208, "learning_rate": 2.0689187355854024e-07, "loss": 0.7257, "step": 610 }, { "epoch": 0.24827024827024827, "eval_cos_sim": 0.2652308940887451, "eval_loss": 0.7359559812759108, "eval_runtime": 90.0107, "eval_samples_per_second": 11.11, "eval_steps_per_second": 0.356, "step": 610 }, { "epoch": 0.2523402523402523, "grad_norm": 1.5300447940826416, "learning_rate": 2.1028354361687696e-07, "loss": 0.7159, "step": 620 }, { "epoch": 0.2523402523402523, "eval_cos_sim": 0.26572421193122864, "eval_loss": 0.7354630170081801, "eval_runtime": 89.5462, "eval_samples_per_second": 11.167, "eval_steps_per_second": 0.357, "step": 620 }, { "epoch": 0.2564102564102564, "grad_norm": 1.7922492027282715, "learning_rate": 2.136752136752137e-07, "loss": 0.7201, "step": 630 }, { "epoch": 0.2564102564102564, "eval_cos_sim": 0.2662273347377777, "eval_loss": 0.7349601850722974, "eval_runtime": 89.2139, "eval_samples_per_second": 11.209, "eval_steps_per_second": 0.359, "step": 630 }, { "epoch": 0.26048026048026046, "grad_norm": 1.800451636314392, "learning_rate": 2.170668837335504e-07, "loss": 0.7122, "step": 640 }, { "epoch": 0.26048026048026046, "eval_cos_sim": 0.2667379379272461, "eval_loss": 0.7344499821876234, "eval_runtime": 88.9978, "eval_samples_per_second": 11.236, "eval_steps_per_second": 0.36, "step": 640 }, { "epoch": 0.26455026455026454, "grad_norm": 1.5168377161026, "learning_rate": 2.2045855379188713e-07, "loss": 0.7292, "step": 650 }, { "epoch": 0.26455026455026454, "eval_cos_sim": 0.2672579288482666, "eval_loss": 0.7339303827499097, "eval_runtime": 89.422, "eval_samples_per_second": 11.183, "eval_steps_per_second": 0.358, "step": 650 }, { "epoch": 0.2686202686202686, "grad_norm": 1.7991094589233398, "learning_rate": 2.2385022385022388e-07, "loss": 0.7102, "step": 660 }, { "epoch": 0.2686202686202686, "eval_cos_sim": 0.267790287733078, "eval_loss": 0.7333984103416151, "eval_runtime": 89.5146, "eval_samples_per_second": 11.171, "eval_steps_per_second": 0.357, "step": 660 } ], "logging_steps": 10, "max_steps": 1474200, "num_input_tokens_seen": 0, "num_train_epochs": 600, "save_steps": 10, "total_flos": 0.0, "train_batch_size": 160, "trial_name": null, "trial_params": null }