{ "best_metric": 0.7681954999183362, "best_model_checkpoint": "/p/scratch/ccstdl/krishna/finetuned-cosine-loss/checkpoint-630", "epoch": 0.2564102564102564, "eval_steps": 10, "global_step": 630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00407000407000407, "grad_norm": 1.8248943090438843, "learning_rate": 3.391670058336725e-09, "loss": 0.7718, "step": 10 }, { "epoch": 0.00407000407000407, "eval_cos_sim": 0.2161455750465393, "eval_loss": 0.7850055160735792, "eval_runtime": 91.1558, "eval_samples_per_second": 10.97, "eval_steps_per_second": 0.351, "step": 10 }, { "epoch": 0.00814000814000814, "grad_norm": 1.5165057182312012, "learning_rate": 6.78334011667345e-09, "loss": 0.7694, "step": 20 }, { "epoch": 0.00814000814000814, "eval_cos_sim": 0.21615812182426453, "eval_loss": 0.7849929766868299, "eval_runtime": 88.5893, "eval_samples_per_second": 11.288, "eval_steps_per_second": 0.361, "step": 20 }, { "epoch": 0.01221001221001221, "grad_norm": 1.8078064918518066, "learning_rate": 1.0175010175010176e-08, "loss": 0.7875, "step": 30 }, { "epoch": 0.01221001221001221, "eval_cos_sim": 0.21617917716503143, "eval_loss": 0.7849719367240614, "eval_runtime": 88.6143, "eval_samples_per_second": 11.285, "eval_steps_per_second": 0.361, "step": 30 }, { "epoch": 0.01628001628001628, "grad_norm": 1.8216888904571533, "learning_rate": 1.35666802333469e-08, "loss": 0.7742, "step": 40 }, { "epoch": 0.01628001628001628, "eval_cos_sim": 0.21620890498161316, "eval_loss": 0.7849422373984999, "eval_runtime": 88.4565, "eval_samples_per_second": 11.305, "eval_steps_per_second": 0.362, "step": 40 }, { "epoch": 0.02035002035002035, "grad_norm": 1.7948490381240845, "learning_rate": 1.6958350291683625e-08, "loss": 0.7683, "step": 50 }, { "epoch": 0.02035002035002035, "eval_cos_sim": 0.2162467986345291, "eval_loss": 0.7849043712829298, "eval_runtime": 88.6129, "eval_samples_per_second": 11.285, "eval_steps_per_second": 0.361, "step": 50 }, { "epoch": 0.02442002442002442, "grad_norm": 1.5213645696640015, "learning_rate": 2.035002035002035e-08, "loss": 0.7773, "step": 60 }, { "epoch": 0.02442002442002442, "eval_cos_sim": 0.21629446744918823, "eval_loss": 0.7848567362044996, "eval_runtime": 88.6351, "eval_samples_per_second": 11.282, "eval_steps_per_second": 0.361, "step": 60 }, { "epoch": 0.02849002849002849, "grad_norm": 1.816196322441101, "learning_rate": 2.3741690408357078e-08, "loss": 0.7677, "step": 70 }, { "epoch": 0.02849002849002849, "eval_cos_sim": 0.21635092794895172, "eval_loss": 0.7848003177856153, "eval_runtime": 88.6682, "eval_samples_per_second": 11.278, "eval_steps_per_second": 0.361, "step": 70 }, { "epoch": 0.03256003256003256, "grad_norm": 1.8195900917053223, "learning_rate": 2.71333604666938e-08, "loss": 0.78, "step": 80 }, { "epoch": 0.03256003256003256, "eval_cos_sim": 0.21641571819782257, "eval_loss": 0.7847355809425062, "eval_runtime": 88.4895, "eval_samples_per_second": 11.301, "eval_steps_per_second": 0.362, "step": 80 }, { "epoch": 0.03663003663003663, "grad_norm": 1.7980706691741943, "learning_rate": 3.052503052503053e-08, "loss": 0.7655, "step": 90 }, { "epoch": 0.03663003663003663, "eval_cos_sim": 0.2164890468120575, "eval_loss": 0.7846623053764051, "eval_runtime": 88.0594, "eval_samples_per_second": 11.356, "eval_steps_per_second": 0.363, "step": 90 }, { "epoch": 0.0407000407000407, "grad_norm": 1.8171244859695435, "learning_rate": 3.391670058336725e-08, "loss": 0.7747, "step": 100 }, { "epoch": 0.0407000407000407, "eval_cos_sim": 0.2165709137916565, "eval_loss": 0.7845804929946607, "eval_runtime": 88.4211, "eval_samples_per_second": 11.31, "eval_steps_per_second": 0.362, "step": 100 }, { "epoch": 0.04477004477004477, "grad_norm": 1.529422640800476, "learning_rate": 3.730837064170397e-08, "loss": 0.776, "step": 110 }, { "epoch": 0.04477004477004477, "eval_cos_sim": 0.2166617214679718, "eval_loss": 0.7844897389625257, "eval_runtime": 88.6911, "eval_samples_per_second": 11.275, "eval_steps_per_second": 0.361, "step": 110 }, { "epoch": 0.04884004884004884, "grad_norm": 1.828736424446106, "learning_rate": 4.07000407000407e-08, "loss": 0.7757, "step": 120 }, { "epoch": 0.04884004884004884, "eval_cos_sim": 0.216759592294693, "eval_loss": 0.7843919339393324, "eval_runtime": 89.0261, "eval_samples_per_second": 11.233, "eval_steps_per_second": 0.359, "step": 120 }, { "epoch": 0.05291005291005291, "grad_norm": 1.5348294973373413, "learning_rate": 4.4091710758377425e-08, "loss": 0.7829, "step": 130 }, { "epoch": 0.05291005291005291, "eval_cos_sim": 0.21686479449272156, "eval_loss": 0.7842868261550612, "eval_runtime": 88.7305, "eval_samples_per_second": 11.27, "eval_steps_per_second": 0.361, "step": 130 }, { "epoch": 0.05698005698005698, "grad_norm": 1.8242194652557373, "learning_rate": 4.7483380816714155e-08, "loss": 0.7794, "step": 140 }, { "epoch": 0.05698005698005698, "eval_cos_sim": 0.21697954833507538, "eval_loss": 0.7841721568320936, "eval_runtime": 88.4802, "eval_samples_per_second": 11.302, "eval_steps_per_second": 0.362, "step": 140 }, { "epoch": 0.06105006105006105, "grad_norm": 1.4458454847335815, "learning_rate": 5.087505087505087e-08, "loss": 0.7751, "step": 150 }, { "epoch": 0.06105006105006105, "eval_cos_sim": 0.21710529923439026, "eval_loss": 0.7840465169166273, "eval_runtime": 89.0281, "eval_samples_per_second": 11.232, "eval_steps_per_second": 0.359, "step": 150 }, { "epoch": 0.06512006512006512, "grad_norm": 1.8203269243240356, "learning_rate": 5.42667209333876e-08, "loss": 0.7681, "step": 160 }, { "epoch": 0.06512006512006512, "eval_cos_sim": 0.21723878383636475, "eval_loss": 0.7839131169532484, "eval_runtime": 88.2381, "eval_samples_per_second": 11.333, "eval_steps_per_second": 0.363, "step": 160 }, { "epoch": 0.06919006919006919, "grad_norm": 1.5368891954421997, "learning_rate": 5.7658390991724324e-08, "loss": 0.7704, "step": 170 }, { "epoch": 0.06919006919006919, "eval_cos_sim": 0.21737951040267944, "eval_loss": 0.783772478601808, "eval_runtime": 88.519, "eval_samples_per_second": 11.297, "eval_steps_per_second": 0.362, "step": 170 }, { "epoch": 0.07326007326007326, "grad_norm": 1.5333787202835083, "learning_rate": 6.105006105006105e-08, "loss": 0.7713, "step": 180 }, { "epoch": 0.07326007326007326, "eval_cos_sim": 0.21752700209617615, "eval_loss": 0.7836251006339735, "eval_runtime": 88.1346, "eval_samples_per_second": 11.346, "eval_steps_per_second": 0.363, "step": 180 }, { "epoch": 0.07733007733007732, "grad_norm": 1.8334540128707886, "learning_rate": 6.444173110839778e-08, "loss": 0.7818, "step": 190 }, { "epoch": 0.07733007733007732, "eval_cos_sim": 0.2176828682422638, "eval_loss": 0.7834693560813611, "eval_runtime": 88.9414, "eval_samples_per_second": 11.243, "eval_steps_per_second": 0.36, "step": 190 }, { "epoch": 0.0814000814000814, "grad_norm": 1.7938237190246582, "learning_rate": 6.78334011667345e-08, "loss": 0.7756, "step": 200 }, { "epoch": 0.0814000814000814, "eval_cos_sim": 0.21784724295139313, "eval_loss": 0.7833051133369154, "eval_runtime": 88.7619, "eval_samples_per_second": 11.266, "eval_steps_per_second": 0.361, "step": 200 }, { "epoch": 0.08547008547008547, "grad_norm": 1.8335057497024536, "learning_rate": 7.122507122507124e-08, "loss": 0.7742, "step": 210 }, { "epoch": 0.08547008547008547, "eval_cos_sim": 0.21802641451358795, "eval_loss": 0.7831260886405653, "eval_runtime": 88.6351, "eval_samples_per_second": 11.282, "eval_steps_per_second": 0.361, "step": 210 }, { "epoch": 0.08954008954008955, "grad_norm": 1.7981688976287842, "learning_rate": 7.461674128340795e-08, "loss": 0.7744, "step": 220 }, { "epoch": 0.08954008954008955, "eval_cos_sim": 0.21821342408657074, "eval_loss": 0.7829392380927748, "eval_runtime": 88.6647, "eval_samples_per_second": 11.278, "eval_steps_per_second": 0.361, "step": 220 }, { "epoch": 0.0936100936100936, "grad_norm": 1.658451795578003, "learning_rate": 7.800841134174468e-08, "loss": 0.7703, "step": 230 }, { "epoch": 0.0936100936100936, "eval_cos_sim": 0.21841329336166382, "eval_loss": 0.7827395224784559, "eval_runtime": 88.7603, "eval_samples_per_second": 11.266, "eval_steps_per_second": 0.361, "step": 230 }, { "epoch": 0.09768009768009768, "grad_norm": 1.794303297996521, "learning_rate": 8.14000814000814e-08, "loss": 0.7663, "step": 240 }, { "epoch": 0.09768009768009768, "eval_cos_sim": 0.21862231194972992, "eval_loss": 0.782530607721681, "eval_runtime": 88.7986, "eval_samples_per_second": 11.261, "eval_steps_per_second": 0.36, "step": 240 }, { "epoch": 0.10175010175010175, "grad_norm": 1.7992609739303589, "learning_rate": 8.479175145841813e-08, "loss": 0.7736, "step": 250 }, { "epoch": 0.10175010175010175, "eval_cos_sim": 0.21883440017700195, "eval_loss": 0.7823186969970412, "eval_runtime": 88.8672, "eval_samples_per_second": 11.253, "eval_steps_per_second": 0.36, "step": 250 }, { "epoch": 0.10582010582010581, "grad_norm": 1.8333110809326172, "learning_rate": 8.818342151675485e-08, "loss": 0.7732, "step": 260 }, { "epoch": 0.10582010582010581, "eval_cos_sim": 0.21905279159545898, "eval_loss": 0.782100461981172, "eval_runtime": 88.9248, "eval_samples_per_second": 11.245, "eval_steps_per_second": 0.36, "step": 260 }, { "epoch": 0.10989010989010989, "grad_norm": 1.8126780986785889, "learning_rate": 9.157509157509157e-08, "loss": 0.7731, "step": 270 }, { "epoch": 0.10989010989010989, "eval_cos_sim": 0.2192821353673935, "eval_loss": 0.7818713002418226, "eval_runtime": 89.0894, "eval_samples_per_second": 11.225, "eval_steps_per_second": 0.359, "step": 270 }, { "epoch": 0.11396011396011396, "grad_norm": 1.8034828901290894, "learning_rate": 9.496676163342831e-08, "loss": 0.7653, "step": 280 }, { "epoch": 0.11396011396011396, "eval_cos_sim": 0.21952229738235474, "eval_loss": 0.781631359598512, "eval_runtime": 88.5405, "eval_samples_per_second": 11.294, "eval_steps_per_second": 0.361, "step": 280 }, { "epoch": 0.11803011803011804, "grad_norm": 1.5344951152801514, "learning_rate": 9.835843169176503e-08, "loss": 0.7647, "step": 290 }, { "epoch": 0.11803011803011804, "eval_cos_sim": 0.21977314352989197, "eval_loss": 0.7813806886886304, "eval_runtime": 89.249, "eval_samples_per_second": 11.205, "eval_steps_per_second": 0.359, "step": 290 }, { "epoch": 0.1221001221001221, "grad_norm": 1.5414931774139404, "learning_rate": 1.0175010175010174e-07, "loss": 0.7778, "step": 300 }, { "epoch": 0.1221001221001221, "eval_cos_sim": 0.2200283408164978, "eval_loss": 0.7811256456588453, "eval_runtime": 88.215, "eval_samples_per_second": 11.336, "eval_steps_per_second": 0.363, "step": 300 }, { "epoch": 0.12617012617012616, "grad_norm": 1.507014513015747, "learning_rate": 1.0514177180843848e-07, "loss": 0.7792, "step": 310 }, { "epoch": 0.12617012617012616, "eval_cos_sim": 0.22028397023677826, "eval_loss": 0.7808702659820265, "eval_runtime": 89.0436, "eval_samples_per_second": 11.23, "eval_steps_per_second": 0.359, "step": 310 }, { "epoch": 0.13024013024013023, "grad_norm": 1.8126572370529175, "learning_rate": 1.085334418667752e-07, "loss": 0.7647, "step": 320 }, { "epoch": 0.13024013024013023, "eval_cos_sim": 0.22054480016231537, "eval_loss": 0.7806096720908827, "eval_runtime": 89.0304, "eval_samples_per_second": 11.232, "eval_steps_per_second": 0.359, "step": 320 }, { "epoch": 0.1343101343101343, "grad_norm": 1.8201279640197754, "learning_rate": 1.1192511192511194e-07, "loss": 0.7791, "step": 330 }, { "epoch": 0.1343101343101343, "eval_cos_sim": 0.22081315517425537, "eval_loss": 0.7803415045951552, "eval_runtime": 89.0654, "eval_samples_per_second": 11.228, "eval_steps_per_second": 0.359, "step": 330 }, { "epoch": 0.13838013838013838, "grad_norm": 1.7973226308822632, "learning_rate": 1.1531678198344865e-07, "loss": 0.7672, "step": 340 }, { "epoch": 0.13838013838013838, "eval_cos_sim": 0.22110143303871155, "eval_loss": 0.7800534692023939, "eval_runtime": 88.2286, "eval_samples_per_second": 11.334, "eval_steps_per_second": 0.363, "step": 340 }, { "epoch": 0.14245014245014245, "grad_norm": 1.8215441703796387, "learning_rate": 1.1870845204178537e-07, "loss": 0.7699, "step": 350 }, { "epoch": 0.14245014245014245, "eval_cos_sim": 0.22139768302440643, "eval_loss": 0.7797573957656568, "eval_runtime": 88.8585, "eval_samples_per_second": 11.254, "eval_steps_per_second": 0.36, "step": 350 }, { "epoch": 0.14652014652014653, "grad_norm": 1.8159250020980835, "learning_rate": 1.221001221001221e-07, "loss": 0.7712, "step": 360 }, { "epoch": 0.14652014652014653, "eval_cos_sim": 0.2217043787240982, "eval_loss": 0.7794509191726393, "eval_runtime": 88.9767, "eval_samples_per_second": 11.239, "eval_steps_per_second": 0.36, "step": 360 }, { "epoch": 0.1505901505901506, "grad_norm": 1.7892403602600098, "learning_rate": 1.2549179215845883e-07, "loss": 0.7594, "step": 370 }, { "epoch": 0.1505901505901506, "eval_cos_sim": 0.22202090919017792, "eval_loss": 0.7791346249793715, "eval_runtime": 88.4113, "eval_samples_per_second": 11.311, "eval_steps_per_second": 0.362, "step": 370 }, { "epoch": 0.15466015466015465, "grad_norm": 1.499679446220398, "learning_rate": 1.2888346221679555e-07, "loss": 0.7702, "step": 380 }, { "epoch": 0.15466015466015465, "eval_cos_sim": 0.22234594821929932, "eval_loss": 0.7788098197196669, "eval_runtime": 89.1927, "eval_samples_per_second": 11.212, "eval_steps_per_second": 0.359, "step": 380 }, { "epoch": 0.15873015873015872, "grad_norm": 1.827500343322754, "learning_rate": 1.3227513227513228e-07, "loss": 0.7578, "step": 390 }, { "epoch": 0.15873015873015872, "eval_cos_sim": 0.22267462313175201, "eval_loss": 0.7784813823913282, "eval_runtime": 89.2341, "eval_samples_per_second": 11.206, "eval_steps_per_second": 0.359, "step": 390 }, { "epoch": 0.1628001628001628, "grad_norm": 1.8155056238174438, "learning_rate": 1.35666802333469e-07, "loss": 0.7663, "step": 400 }, { "epoch": 0.1628001628001628, "eval_cos_sim": 0.2230025976896286, "eval_loss": 0.7781536517356581, "eval_runtime": 89.0874, "eval_samples_per_second": 11.225, "eval_steps_per_second": 0.359, "step": 400 }, { "epoch": 0.16687016687016687, "grad_norm": 1.5147600173950195, "learning_rate": 1.3905847239180572e-07, "loss": 0.7685, "step": 410 }, { "epoch": 0.16687016687016687, "eval_cos_sim": 0.2233392745256424, "eval_loss": 0.7778171930526442, "eval_runtime": 89.0688, "eval_samples_per_second": 11.227, "eval_steps_per_second": 0.359, "step": 410 }, { "epoch": 0.17094017094017094, "grad_norm": 1.5169967412948608, "learning_rate": 1.4245014245014247e-07, "loss": 0.7614, "step": 420 }, { "epoch": 0.17094017094017094, "eval_cos_sim": 0.2236924022436142, "eval_loss": 0.7774643163894361, "eval_runtime": 88.9798, "eval_samples_per_second": 11.239, "eval_steps_per_second": 0.36, "step": 420 }, { "epoch": 0.17501017501017502, "grad_norm": 1.5205143690109253, "learning_rate": 1.4584181250847917e-07, "loss": 0.7646, "step": 430 }, { "epoch": 0.17501017501017502, "eval_cos_sim": 0.224058136343956, "eval_loss": 0.7770988393043227, "eval_runtime": 89.0477, "eval_samples_per_second": 11.23, "eval_steps_per_second": 0.359, "step": 430 }, { "epoch": 0.1790801790801791, "grad_norm": 1.5187054872512817, "learning_rate": 1.492334825668159e-07, "loss": 0.775, "step": 440 }, { "epoch": 0.1790801790801791, "eval_cos_sim": 0.22443027794361115, "eval_loss": 0.7767269988273329, "eval_runtime": 89.3697, "eval_samples_per_second": 11.189, "eval_steps_per_second": 0.358, "step": 440 }, { "epoch": 0.18315018315018314, "grad_norm": 1.8240047693252563, "learning_rate": 1.5262515262515264e-07, "loss": 0.7622, "step": 450 }, { "epoch": 0.18315018315018314, "eval_cos_sim": 0.22480851411819458, "eval_loss": 0.7763490748618788, "eval_runtime": 89.2014, "eval_samples_per_second": 11.211, "eval_steps_per_second": 0.359, "step": 450 }, { "epoch": 0.1872201872201872, "grad_norm": 1.8208777904510498, "learning_rate": 1.5601682268348936e-07, "loss": 0.7563, "step": 460 }, { "epoch": 0.1872201872201872, "eval_cos_sim": 0.2251981645822525, "eval_loss": 0.7759597497199721, "eval_runtime": 89.202, "eval_samples_per_second": 11.211, "eval_steps_per_second": 0.359, "step": 460 }, { "epoch": 0.19129019129019129, "grad_norm": 1.523047924041748, "learning_rate": 1.594084927418261e-07, "loss": 0.7613, "step": 470 }, { "epoch": 0.19129019129019129, "eval_cos_sim": 0.2256050556898117, "eval_loss": 0.775553204080934, "eval_runtime": 88.9659, "eval_samples_per_second": 11.24, "eval_steps_per_second": 0.36, "step": 470 }, { "epoch": 0.19536019536019536, "grad_norm": 1.7896461486816406, "learning_rate": 1.628001628001628e-07, "loss": 0.7571, "step": 480 }, { "epoch": 0.19536019536019536, "eval_cos_sim": 0.2260132133960724, "eval_loss": 0.7751453356956189, "eval_runtime": 89.3097, "eval_samples_per_second": 11.197, "eval_steps_per_second": 0.358, "step": 480 }, { "epoch": 0.19943019943019943, "grad_norm": 1.802852749824524, "learning_rate": 1.6619183285849953e-07, "loss": 0.7666, "step": 490 }, { "epoch": 0.19943019943019943, "eval_cos_sim": 0.2264304757118225, "eval_loss": 0.7747283859466261, "eval_runtime": 89.0947, "eval_samples_per_second": 11.224, "eval_steps_per_second": 0.359, "step": 490 }, { "epoch": 0.2035002035002035, "grad_norm": 1.821495532989502, "learning_rate": 1.6958350291683626e-07, "loss": 0.7624, "step": 500 }, { "epoch": 0.2035002035002035, "eval_cos_sim": 0.22685033082962036, "eval_loss": 0.7743088460181898, "eval_runtime": 89.2762, "eval_samples_per_second": 11.201, "eval_steps_per_second": 0.358, "step": 500 }, { "epoch": 0.20757020757020758, "grad_norm": 1.545665979385376, "learning_rate": 1.7297517297517298e-07, "loss": 0.7694, "step": 510 }, { "epoch": 0.20757020757020758, "eval_cos_sim": 0.22728146612644196, "eval_loss": 0.7738780179237074, "eval_runtime": 88.7471, "eval_samples_per_second": 11.268, "eval_steps_per_second": 0.361, "step": 510 }, { "epoch": 0.21164021164021163, "grad_norm": 1.7993839979171753, "learning_rate": 1.763668430335097e-07, "loss": 0.7601, "step": 520 }, { "epoch": 0.21164021164021163, "eval_cos_sim": 0.2277180701494217, "eval_loss": 0.7734417824958509, "eval_runtime": 89.3566, "eval_samples_per_second": 11.191, "eval_steps_per_second": 0.358, "step": 520 }, { "epoch": 0.2157102157102157, "grad_norm": 1.7981739044189453, "learning_rate": 1.7975851309184642e-07, "loss": 0.7572, "step": 530 }, { "epoch": 0.2157102157102157, "eval_cos_sim": 0.22816509008407593, "eval_loss": 0.7729950847839063, "eval_runtime": 89.1907, "eval_samples_per_second": 11.212, "eval_steps_per_second": 0.359, "step": 530 }, { "epoch": 0.21978021978021978, "grad_norm": 1.4281421899795532, "learning_rate": 1.8315018315018315e-07, "loss": 0.7713, "step": 540 }, { "epoch": 0.21978021978021978, "eval_cos_sim": 0.2286159247159958, "eval_loss": 0.7725445466254895, "eval_runtime": 88.9936, "eval_samples_per_second": 11.237, "eval_steps_per_second": 0.36, "step": 540 }, { "epoch": 0.22385022385022385, "grad_norm": 1.800079107284546, "learning_rate": 1.865418532085199e-07, "loss": 0.7637, "step": 550 }, { "epoch": 0.22385022385022385, "eval_cos_sim": 0.22906461358070374, "eval_loss": 0.772096163771028, "eval_runtime": 88.9615, "eval_samples_per_second": 11.241, "eval_steps_per_second": 0.36, "step": 550 }, { "epoch": 0.22792022792022792, "grad_norm": 1.5776735544204712, "learning_rate": 1.8993352326685662e-07, "loss": 0.7699, "step": 560 }, { "epoch": 0.22792022792022792, "eval_cos_sim": 0.2295254021883011, "eval_loss": 0.771635666391725, "eval_runtime": 88.9524, "eval_samples_per_second": 11.242, "eval_steps_per_second": 0.36, "step": 560 }, { "epoch": 0.231990231990232, "grad_norm": 1.8183437585830688, "learning_rate": 1.9332519332519332e-07, "loss": 0.7634, "step": 570 }, { "epoch": 0.231990231990232, "eval_cos_sim": 0.22998037934303284, "eval_loss": 0.7711810879920667, "eval_runtime": 88.9474, "eval_samples_per_second": 11.243, "eval_steps_per_second": 0.36, "step": 570 }, { "epoch": 0.23606023606023607, "grad_norm": 1.8223985433578491, "learning_rate": 1.9671686338353007e-07, "loss": 0.7772, "step": 580 }, { "epoch": 0.23606023606023607, "eval_cos_sim": 0.23044081032276154, "eval_loss": 0.7707209568237012, "eval_runtime": 88.7661, "eval_samples_per_second": 11.266, "eval_steps_per_second": 0.36, "step": 580 }, { "epoch": 0.24013024013024012, "grad_norm": 1.7877051830291748, "learning_rate": 2.001085334418668e-07, "loss": 0.7534, "step": 590 }, { "epoch": 0.24013024013024012, "eval_cos_sim": 0.23091764748096466, "eval_loss": 0.770244472048158, "eval_runtime": 89.1681, "eval_samples_per_second": 11.215, "eval_steps_per_second": 0.359, "step": 590 }, { "epoch": 0.2442002442002442, "grad_norm": 1.6694035530090332, "learning_rate": 2.0350020350020349e-07, "loss": 0.764, "step": 600 }, { "epoch": 0.2442002442002442, "eval_cos_sim": 0.23141798377037048, "eval_loss": 0.769744483969087, "eval_runtime": 89.1824, "eval_samples_per_second": 11.213, "eval_steps_per_second": 0.359, "step": 600 }, { "epoch": 0.24827024827024827, "grad_norm": 1.5185260772705078, "learning_rate": 2.0689187355854024e-07, "loss": 0.7641, "step": 610 }, { "epoch": 0.24827024827024827, "eval_cos_sim": 0.2319241613149643, "eval_loss": 0.7692386307929701, "eval_runtime": 89.2776, "eval_samples_per_second": 11.201, "eval_steps_per_second": 0.358, "step": 610 }, { "epoch": 0.2523402523402523, "grad_norm": 1.8109288215637207, "learning_rate": 2.1028354361687696e-07, "loss": 0.7554, "step": 620 }, { "epoch": 0.2523402523402523, "eval_cos_sim": 0.23244358599185944, "eval_loss": 0.7687196593497938, "eval_runtime": 89.197, "eval_samples_per_second": 11.211, "eval_steps_per_second": 0.359, "step": 620 }, { "epoch": 0.2564102564102564, "grad_norm": 1.8006572723388672, "learning_rate": 2.136752136752137e-07, "loss": 0.7547, "step": 630 }, { "epoch": 0.2564102564102564, "eval_cos_sim": 0.23296819627285004, "eval_loss": 0.7681954999183362, "eval_runtime": 89.2344, "eval_samples_per_second": 11.206, "eval_steps_per_second": 0.359, "step": 630 } ], "logging_steps": 10, "max_steps": 1474200, "num_input_tokens_seen": 0, "num_train_epochs": 600, "save_steps": 10, "total_flos": 0.0, "train_batch_size": 160, "trial_name": null, "trial_params": null }