{ "best_metric": 0.121661689779634, "best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-1280", "epoch": 0.128, "eval_steps": 10, "global_step": 1280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001, "grad_norm": 0.11198576539754868, "learning_rate": 1e-05, "loss": 0.126, "step": 10 }, { "epoch": 0.001, "eval_cos_sim": 0.8696296215057373, "eval_loss": 0.13132101871716445, "eval_runtime": 191.9539, "eval_samples_per_second": 20.838, "eval_steps_per_second": 1.302, "step": 10 }, { "epoch": 0.002, "grad_norm": 0.19444850087165833, "learning_rate": 2e-05, "loss": 0.1267, "step": 20 }, { "epoch": 0.002, "eval_cos_sim": 0.8698329329490662, "eval_loss": 0.1311149292205519, "eval_runtime": 177.5098, "eval_samples_per_second": 22.534, "eval_steps_per_second": 1.408, "step": 20 }, { "epoch": 0.003, "grad_norm": 0.12954622507095337, "learning_rate": 3e-05, "loss": 0.1271, "step": 30 }, { "epoch": 0.003, "eval_cos_sim": 0.8700494766235352, "eval_loss": 0.1309011602615065, "eval_runtime": 179.7068, "eval_samples_per_second": 22.258, "eval_steps_per_second": 1.391, "step": 30 }, { "epoch": 0.004, "grad_norm": 0.11514733731746674, "learning_rate": 4e-05, "loss": 0.1265, "step": 40 }, { "epoch": 0.004, "eval_cos_sim": 0.870728075504303, "eval_loss": 0.13021534349667496, "eval_runtime": 174.4918, "eval_samples_per_second": 22.924, "eval_steps_per_second": 1.433, "step": 40 }, { "epoch": 0.005, "grad_norm": 0.34224584698677063, "learning_rate": 5e-05, "loss": 0.1273, "step": 50 }, { "epoch": 0.005, "eval_cos_sim": 0.8705285787582397, "eval_loss": 0.1304176144813246, "eval_runtime": 175.5157, "eval_samples_per_second": 22.79, "eval_steps_per_second": 1.424, "step": 50 }, { "epoch": 0.006, "grad_norm": 0.1085827499628067, "learning_rate": 4.517892759404963e-05, "loss": 0.125, "step": 60 }, { "epoch": 0.006, "eval_cos_sim": 0.8709338903427124, "eval_loss": 0.130007851145143, "eval_runtime": 173.9237, "eval_samples_per_second": 22.999, "eval_steps_per_second": 1.437, "step": 60 }, { "epoch": 0.007, "grad_norm": 0.11786766350269318, "learning_rate": 3.257512950767182e-05, "loss": 0.1291, "step": 70 }, { "epoch": 0.007, "eval_cos_sim": 0.8714690208435059, "eval_loss": 0.12946533443676894, "eval_runtime": 177.0345, "eval_samples_per_second": 22.594, "eval_steps_per_second": 1.412, "step": 70 }, { "epoch": 0.008, "grad_norm": 0.10741184651851654, "learning_rate": 1.7049711594019046e-05, "loss": 0.1285, "step": 80 }, { "epoch": 0.008, "eval_cos_sim": 0.8719983696937561, "eval_loss": 0.1289418597434706, "eval_runtime": 178.6566, "eval_samples_per_second": 22.389, "eval_steps_per_second": 1.399, "step": 80 }, { "epoch": 0.009, "grad_norm": 0.12072350829839706, "learning_rate": 4.590606964640023e-06, "loss": 0.125, "step": 90 }, { "epoch": 0.009, "eval_cos_sim": 0.8721248507499695, "eval_loss": 0.12881728055226274, "eval_runtime": 181.5969, "eval_samples_per_second": 22.027, "eval_steps_per_second": 1.377, "step": 90 }, { "epoch": 0.01, "grad_norm": 0.11123672872781754, "learning_rate": 4.999688473794144e-05, "loss": 0.1249, "step": 100 }, { "epoch": 0.01, "eval_cos_sim": 0.8721336722373962, "eval_loss": 0.12880885388600297, "eval_runtime": 174.6097, "eval_samples_per_second": 22.908, "eval_steps_per_second": 1.432, "step": 100 }, { "epoch": 0.011, "grad_norm": 0.11100038141012192, "learning_rate": 4.494343314093799e-05, "loss": 0.1246, "step": 110 }, { "epoch": 0.011, "eval_cos_sim": 0.8723854422569275, "eval_loss": 0.1285583892081923, "eval_runtime": 180.7772, "eval_samples_per_second": 22.127, "eval_steps_per_second": 1.383, "step": 110 }, { "epoch": 0.012, "grad_norm": 0.11933281272649765, "learning_rate": 3.219808272827916e-05, "loss": 0.1265, "step": 120 }, { "epoch": 0.012, "eval_cos_sim": 0.8727645874023438, "eval_loss": 0.12819017722355788, "eval_runtime": 176.8881, "eval_samples_per_second": 22.613, "eval_steps_per_second": 1.413, "step": 120 }, { "epoch": 0.013, "grad_norm": 0.11295568197965622, "learning_rate": 1.667653407425597e-05, "loss": 0.1256, "step": 130 }, { "epoch": 0.013, "eval_cos_sim": 0.8724489808082581, "eval_loss": 0.12850400116192764, "eval_runtime": 176.2937, "eval_samples_per_second": 22.689, "eval_steps_per_second": 1.418, "step": 130 }, { "epoch": 0.014, "grad_norm": 0.10013717412948608, "learning_rate": 4.365227971950606e-06, "loss": 0.1252, "step": 140 }, { "epoch": 0.014, "eval_cos_sim": 0.8726389408111572, "eval_loss": 0.1283098426078505, "eval_runtime": 175.1837, "eval_samples_per_second": 22.833, "eval_steps_per_second": 1.427, "step": 140 }, { "epoch": 0.015, "grad_norm": 0.08663387596607208, "learning_rate": 4.998753972815435e-05, "loss": 0.1252, "step": 150 }, { "epoch": 0.015, "eval_cos_sim": 0.8726971745491028, "eval_loss": 0.12825069954144425, "eval_runtime": 179.297, "eval_samples_per_second": 22.309, "eval_steps_per_second": 1.394, "step": 150 }, { "epoch": 0.016, "grad_norm": 0.10253303498029709, "learning_rate": 4.47029683661798e-05, "loss": 0.1258, "step": 160 }, { "epoch": 0.016, "eval_cos_sim": 0.8739002346992493, "eval_loss": 0.12703985621678301, "eval_runtime": 175.0922, "eval_samples_per_second": 22.845, "eval_steps_per_second": 1.428, "step": 160 }, { "epoch": 0.017, "grad_norm": 0.11590978503227234, "learning_rate": 3.1819242035765096e-05, "loss": 0.1219, "step": 170 }, { "epoch": 0.017, "eval_cos_sim": 0.8737954497337341, "eval_loss": 0.12715704419362017, "eval_runtime": 180.7326, "eval_samples_per_second": 22.132, "eval_steps_per_second": 1.383, "step": 170 }, { "epoch": 0.018, "grad_norm": 0.09687651693820953, "learning_rate": 1.6305430936700428e-05, "loss": 0.1244, "step": 180 }, { "epoch": 0.018, "eval_cos_sim": 0.8735443353652954, "eval_loss": 0.12740902497517534, "eval_runtime": 177.9084, "eval_samples_per_second": 22.483, "eval_steps_per_second": 1.405, "step": 180 }, { "epoch": 0.019, "grad_norm": 0.10086172819137573, "learning_rate": 4.144991597052059e-06, "loss": 0.1258, "step": 190 }, { "epoch": 0.019, "eval_cos_sim": 0.8735744953155518, "eval_loss": 0.12737621738659807, "eval_runtime": 174.0483, "eval_samples_per_second": 22.982, "eval_steps_per_second": 1.436, "step": 190 }, { "epoch": 0.02, "grad_norm": 0.09316889941692352, "learning_rate": 4.9971967299611097e-05, "loss": 0.122, "step": 200 }, { "epoch": 0.02, "eval_cos_sim": 0.8735851645469666, "eval_loss": 0.12736523821103043, "eval_runtime": 176.3327, "eval_samples_per_second": 22.684, "eval_steps_per_second": 1.418, "step": 200 }, { "epoch": 0.021, "grad_norm": 0.10805534571409225, "learning_rate": 4.4457593198638246e-05, "loss": 0.1256, "step": 210 }, { "epoch": 0.021, "eval_cos_sim": 0.8735992312431335, "eval_loss": 0.12734888651120133, "eval_runtime": 177.4342, "eval_samples_per_second": 22.544, "eval_steps_per_second": 1.409, "step": 210 }, { "epoch": 0.022, "grad_norm": 0.14335550367832184, "learning_rate": 3.143870184517241e-05, "loss": 0.1228, "step": 220 }, { "epoch": 0.022, "eval_cos_sim": 0.8742734789848328, "eval_loss": 0.1266735837672896, "eval_runtime": 174.698, "eval_samples_per_second": 22.897, "eval_steps_per_second": 1.431, "step": 220 }, { "epoch": 0.023, "grad_norm": 0.10455214232206345, "learning_rate": 1.5936494668034417e-05, "loss": 0.1235, "step": 230 }, { "epoch": 0.023, "eval_cos_sim": 0.874700129032135, "eval_loss": 0.12624898936497636, "eval_runtime": 175.2174, "eval_samples_per_second": 22.829, "eval_steps_per_second": 1.427, "step": 230 }, { "epoch": 0.024, "grad_norm": 0.10344243049621582, "learning_rate": 3.9299527274662355e-06, "loss": 0.1258, "step": 240 }, { "epoch": 0.024, "eval_cos_sim": 0.8746932148933411, "eval_loss": 0.1262588949416823, "eval_runtime": 178.5496, "eval_samples_per_second": 22.403, "eval_steps_per_second": 1.4, "step": 240 }, { "epoch": 0.025, "grad_norm": 0.1515665352344513, "learning_rate": 4.9950171333287335e-05, "loss": 0.1259, "step": 250 }, { "epoch": 0.025, "eval_cos_sim": 0.8746062517166138, "eval_loss": 0.1263456218455977, "eval_runtime": 181.2208, "eval_samples_per_second": 22.073, "eval_steps_per_second": 1.38, "step": 250 }, { "epoch": 0.026, "grad_norm": 0.08521851152181625, "learning_rate": 4.420736879094929e-05, "loss": 0.123, "step": 260 }, { "epoch": 0.026, "eval_cos_sim": 0.8742081522941589, "eval_loss": 0.1267440173839278, "eval_runtime": 172.3377, "eval_samples_per_second": 23.21, "eval_steps_per_second": 1.451, "step": 260 }, { "epoch": 0.027, "grad_norm": 0.24638278782367706, "learning_rate": 3.105655699509455e-05, "loss": 0.1246, "step": 270 }, { "epoch": 0.027, "eval_cos_sim": 0.8748664259910583, "eval_loss": 0.12609003236042926, "eval_runtime": 175.6344, "eval_samples_per_second": 22.775, "eval_steps_per_second": 1.423, "step": 270 }, { "epoch": 0.028, "grad_norm": 0.09267835319042206, "learning_rate": 1.5569817214910634e-05, "loss": 0.1246, "step": 280 }, { "epoch": 0.028, "eval_cos_sim": 0.8748399615287781, "eval_loss": 0.12611397721516557, "eval_runtime": 175.9072, "eval_samples_per_second": 22.739, "eval_steps_per_second": 1.421, "step": 280 }, { "epoch": 0.029, "grad_norm": 0.1712462306022644, "learning_rate": 3.720164955387656e-06, "loss": 0.1243, "step": 290 }, { "epoch": 0.029, "eval_cos_sim": 0.8749127388000488, "eval_loss": 0.1260433347438521, "eval_runtime": 176.0561, "eval_samples_per_second": 22.72, "eval_steps_per_second": 1.42, "step": 290 }, { "epoch": 0.03, "grad_norm": 0.07719286531209946, "learning_rate": 4.992215726119483e-05, "loss": 0.1227, "step": 300 }, { "epoch": 0.03, "eval_cos_sim": 0.8748821020126343, "eval_loss": 0.1260761695121474, "eval_runtime": 174.2263, "eval_samples_per_second": 22.959, "eval_steps_per_second": 1.435, "step": 300 }, { "epoch": 0.031, "grad_norm": 0.08637545257806778, "learning_rate": 4.395235750428112e-05, "loss": 0.1222, "step": 310 }, { "epoch": 0.031, "eval_cos_sim": 0.8745994567871094, "eval_loss": 0.12635979654538104, "eval_runtime": 179.4806, "eval_samples_per_second": 22.287, "eval_steps_per_second": 1.393, "step": 310 }, { "epoch": 0.032, "grad_norm": 0.0923767164349556, "learning_rate": 3.0672902724039794e-05, "loss": 0.1232, "step": 320 }, { "epoch": 0.032, "eval_cos_sim": 0.8750612735748291, "eval_loss": 0.1258947375034041, "eval_runtime": 181.1338, "eval_samples_per_second": 22.083, "eval_steps_per_second": 1.38, "step": 320 }, { "epoch": 0.033, "grad_norm": 0.08724959194660187, "learning_rate": 1.5205489961037645e-05, "loss": 0.1236, "step": 330 }, { "epoch": 0.033, "eval_cos_sim": 0.8755974173545837, "eval_loss": 0.125363212845201, "eval_runtime": 198.751, "eval_samples_per_second": 20.126, "eval_steps_per_second": 1.258, "step": 330 }, { "epoch": 0.034, "grad_norm": 0.07283046841621399, "learning_rate": 3.5156805643271896e-06, "loss": 0.1239, "step": 340 }, { "epoch": 0.034, "eval_cos_sim": 0.8756656646728516, "eval_loss": 0.12529714014279317, "eval_runtime": 187.9639, "eval_samples_per_second": 21.281, "eval_steps_per_second": 1.33, "step": 340 }, { "epoch": 0.035, "grad_norm": 0.15486685931682587, "learning_rate": 4.9887932065027656e-05, "loss": 0.1231, "step": 350 }, { "epoch": 0.035, "eval_cos_sim": 0.8756564259529114, "eval_loss": 0.12530613209950398, "eval_runtime": 194.2503, "eval_samples_per_second": 20.592, "eval_steps_per_second": 1.287, "step": 350 }, { "epoch": 0.036, "grad_norm": 0.07505682110786438, "learning_rate": 4.369262289279271e-05, "loss": 0.1233, "step": 360 }, { "epoch": 0.036, "eval_cos_sim": 0.8755001425743103, "eval_loss": 0.12546515204655598, "eval_runtime": 194.8309, "eval_samples_per_second": 20.531, "eval_steps_per_second": 1.283, "step": 360 }, { "epoch": 0.037, "grad_norm": 0.09688587486743927, "learning_rate": 3.0287834646695457e-05, "loss": 0.1259, "step": 370 }, { "epoch": 0.037, "eval_cos_sim": 0.8756394386291504, "eval_loss": 0.1253258285735793, "eval_runtime": 188.2216, "eval_samples_per_second": 21.252, "eval_steps_per_second": 1.328, "step": 370 }, { "epoch": 0.038, "grad_norm": 0.07268425822257996, "learning_rate": 1.4843603704405253e-05, "loss": 0.1247, "step": 380 }, { "epoch": 0.038, "eval_cos_sim": 0.8758111596107483, "eval_loss": 0.12515661337124775, "eval_runtime": 189.0095, "eval_samples_per_second": 21.163, "eval_steps_per_second": 1.323, "step": 380 }, { "epoch": 0.039, "grad_norm": 0.09875091165304184, "learning_rate": 3.316550516082126e-06, "loss": 0.1229, "step": 390 }, { "epoch": 0.039, "eval_cos_sim": 0.8758672475814819, "eval_loss": 0.12509912636029194, "eval_runtime": 235.6105, "eval_samples_per_second": 16.977, "eval_steps_per_second": 1.061, "step": 390 }, { "epoch": 0.04, "grad_norm": 0.0792056992650032, "learning_rate": 4.98475042744222e-05, "loss": 0.1246, "step": 400 }, { "epoch": 0.04, "eval_cos_sim": 0.8759932518005371, "eval_loss": 0.12497495915638873, "eval_runtime": 200.3436, "eval_samples_per_second": 19.966, "eval_steps_per_second": 1.248, "step": 400 }, { "epoch": 0.041, "grad_norm": 0.10644775629043579, "learning_rate": 4.3428229687794505e-05, "loss": 0.1224, "step": 410 }, { "epoch": 0.041, "eval_cos_sim": 0.8761371374130249, "eval_loss": 0.12483511426197956, "eval_runtime": 197.5074, "eval_samples_per_second": 20.252, "eval_steps_per_second": 1.266, "step": 410 }, { "epoch": 0.042, "grad_norm": 0.09292006492614746, "learning_rate": 2.9901448730099503e-05, "loss": 0.1239, "step": 420 }, { "epoch": 0.042, "eval_cos_sim": 0.876413881778717, "eval_loss": 0.12455732419239948, "eval_runtime": 187.5784, "eval_samples_per_second": 21.324, "eval_steps_per_second": 1.333, "step": 420 }, { "epoch": 0.043, "grad_norm": 0.08105887472629547, "learning_rate": 1.448424863465538e-05, "loss": 0.1231, "step": 430 }, { "epoch": 0.043, "eval_cos_sim": 0.876311719417572, "eval_loss": 0.12465796377407977, "eval_runtime": 203.0598, "eval_samples_per_second": 19.699, "eval_steps_per_second": 1.231, "step": 430 }, { "epoch": 0.044, "grad_norm": 0.15435349941253662, "learning_rate": 3.1228244380351547e-06, "loss": 0.1225, "step": 440 }, { "epoch": 0.044, "eval_cos_sim": 0.8762248754501343, "eval_loss": 0.12474570634114215, "eval_runtime": 199.1025, "eval_samples_per_second": 20.09, "eval_steps_per_second": 1.256, "step": 440 }, { "epoch": 0.045, "grad_norm": 0.09370752424001694, "learning_rate": 4.980088396483146e-05, "loss": 0.1228, "step": 450 }, { "epoch": 0.045, "eval_cos_sim": 0.8761196136474609, "eval_loss": 0.12484796597706745, "eval_runtime": 192.1246, "eval_samples_per_second": 20.82, "eval_steps_per_second": 1.301, "step": 450 }, { "epoch": 0.046, "grad_norm": 0.08999752253293991, "learning_rate": 4.3159243781616026e-05, "loss": 0.1229, "step": 460 }, { "epoch": 0.046, "eval_cos_sim": 0.8762247562408447, "eval_loss": 0.12473729922520588, "eval_runtime": 196.5532, "eval_samples_per_second": 20.351, "eval_steps_per_second": 1.272, "step": 460 }, { "epoch": 0.047, "grad_norm": 0.0809365063905716, "learning_rate": 2.9513841269722613e-05, "loss": 0.124, "step": 470 }, { "epoch": 0.047, "eval_cos_sim": 0.8765152096748352, "eval_loss": 0.12444968440281817, "eval_runtime": 204.1545, "eval_samples_per_second": 19.593, "eval_steps_per_second": 1.225, "step": 470 }, { "epoch": 0.048, "grad_norm": 0.08176057785749435, "learning_rate": 1.4127514310605238e-05, "loss": 0.123, "step": 480 }, { "epoch": 0.048, "eval_cos_sim": 0.876448929309845, "eval_loss": 0.12451095607029865, "eval_runtime": 198.7286, "eval_samples_per_second": 20.128, "eval_steps_per_second": 1.258, "step": 480 }, { "epoch": 0.049, "grad_norm": 0.09636738151311874, "learning_rate": 2.934550610786291e-06, "loss": 0.1236, "step": 490 }, { "epoch": 0.049, "eval_cos_sim": 0.8765274882316589, "eval_loss": 0.12443248560177753, "eval_runtime": 196.3413, "eval_samples_per_second": 20.373, "eval_steps_per_second": 1.273, "step": 490 }, { "epoch": 0.05, "grad_norm": 0.08814109116792679, "learning_rate": 4.974808275501392e-05, "loss": 0.123, "step": 500 }, { "epoch": 0.05, "eval_cos_sim": 0.8765753507614136, "eval_loss": 0.12438686539875934, "eval_runtime": 191.2687, "eval_samples_per_second": 20.913, "eval_steps_per_second": 1.307, "step": 500 }, { "epoch": 0.051, "grad_norm": 0.08511923253536224, "learning_rate": 4.2885732211184324e-05, "loss": 0.1246, "step": 510 }, { "epoch": 0.051, "eval_cos_sim": 0.8767162561416626, "eval_loss": 0.12425224568592975, "eval_runtime": 173.2088, "eval_samples_per_second": 23.094, "eval_steps_per_second": 1.443, "step": 510 }, { "epoch": 0.052, "grad_norm": 0.0837215781211853, "learning_rate": 2.9125108865470048e-05, "loss": 0.1221, "step": 520 }, { "epoch": 0.052, "eval_cos_sim": 0.876861572265625, "eval_loss": 0.1241044213985152, "eval_runtime": 174.8239, "eval_samples_per_second": 22.88, "eval_steps_per_second": 1.43, "step": 520 }, { "epoch": 0.053, "grad_norm": 0.09207245707511902, "learning_rate": 1.3773489637927061e-05, "loss": 0.1229, "step": 530 }, { "epoch": 0.053, "eval_cos_sim": 0.8767414093017578, "eval_loss": 0.12421691825138996, "eval_runtime": 173.8268, "eval_samples_per_second": 23.011, "eval_steps_per_second": 1.438, "step": 530 }, { "epoch": 0.054, "grad_norm": 0.0655718669295311, "learning_rate": 2.7517759561205253e-06, "loss": 0.1221, "step": 540 }, { "epoch": 0.054, "eval_cos_sim": 0.8767919540405273, "eval_loss": 0.1241676082824416, "eval_runtime": 179.6327, "eval_samples_per_second": 22.268, "eval_steps_per_second": 1.392, "step": 540 }, { "epoch": 0.055, "grad_norm": 0.21964910626411438, "learning_rate": 4.968911380413809e-05, "loss": 0.1243, "step": 550 }, { "epoch": 0.055, "eval_cos_sim": 0.8768623471260071, "eval_loss": 0.12409912397610615, "eval_runtime": 172.7843, "eval_samples_per_second": 23.15, "eval_steps_per_second": 1.447, "step": 550 }, { "epoch": 0.056, "grad_norm": 0.08817338943481445, "learning_rate": 4.260776314131676e-05, "loss": 0.1222, "step": 560 }, { "epoch": 0.056, "eval_cos_sim": 0.8767062425613403, "eval_loss": 0.12425821544873188, "eval_runtime": 172.6396, "eval_samples_per_second": 23.17, "eval_steps_per_second": 1.448, "step": 560 }, { "epoch": 0.057, "grad_norm": 0.06475117802619934, "learning_rate": 2.873534839760646e-05, "loss": 0.1232, "step": 570 }, { "epoch": 0.057, "eval_cos_sim": 0.8768667578697205, "eval_loss": 0.12410461117970416, "eval_runtime": 172.7054, "eval_samples_per_second": 23.161, "eval_steps_per_second": 1.448, "step": 570 }, { "epoch": 0.058, "grad_norm": 0.07474437355995178, "learning_rate": 1.342226284699138e-05, "loss": 0.1227, "step": 580 }, { "epoch": 0.058, "eval_cos_sim": 0.8771414160728455, "eval_loss": 0.12382852866398761, "eval_runtime": 175.1422, "eval_samples_per_second": 22.839, "eval_steps_per_second": 1.427, "step": 580 }, { "epoch": 0.059, "grad_norm": 0.07362603396177292, "learning_rate": 2.5745460253134484e-06, "loss": 0.1234, "step": 590 }, { "epoch": 0.059, "eval_cos_sim": 0.8771759271621704, "eval_loss": 0.12379106380688618, "eval_runtime": 174.7169, "eval_samples_per_second": 22.894, "eval_steps_per_second": 1.431, "step": 590 }, { "epoch": 0.06, "grad_norm": 0.07593993842601776, "learning_rate": 4.962399180850275e-05, "loss": 0.1232, "step": 600 }, { "epoch": 0.06, "eval_cos_sim": 0.877038300037384, "eval_loss": 0.12392904116856525, "eval_runtime": 172.4786, "eval_samples_per_second": 23.191, "eval_steps_per_second": 1.449, "step": 600 }, { "epoch": 0.061, "grad_norm": 0.07887241989374161, "learning_rate": 4.2325405847733254e-05, "loss": 0.1235, "step": 610 }, { "epoch": 0.061, "eval_cos_sim": 0.8767529726028442, "eval_loss": 0.12422390153157184, "eval_runtime": 173.6696, "eval_samples_per_second": 23.032, "eval_steps_per_second": 1.44, "step": 610 }, { "epoch": 0.062, "grad_norm": 0.17296281456947327, "learning_rate": 2.834465700261192e-05, "loss": 0.1204, "step": 620 }, { "epoch": 0.062, "eval_cos_sim": 0.8772019743919373, "eval_loss": 0.12377139737355183, "eval_runtime": 179.9864, "eval_samples_per_second": 22.224, "eval_steps_per_second": 1.389, "step": 620 }, { "epoch": 0.063, "grad_norm": 0.06920995563268661, "learning_rate": 1.3073921470877709e-05, "loss": 0.1245, "step": 630 }, { "epoch": 0.063, "eval_cos_sim": 0.8773365616798401, "eval_loss": 0.12363236000287008, "eval_runtime": 173.1204, "eval_samples_per_second": 23.105, "eval_steps_per_second": 1.444, "step": 630 }, { "epoch": 0.064, "grad_norm": 0.08347232639789581, "learning_rate": 2.4029049877794472e-06, "loss": 0.1217, "step": 640 }, { "epoch": 0.064, "eval_cos_sim": 0.8773410320281982, "eval_loss": 0.12362796523320149, "eval_runtime": 172.0713, "eval_samples_per_second": 23.246, "eval_steps_per_second": 1.453, "step": 640 }, { "epoch": 0.065, "grad_norm": 0.07459770888090134, "learning_rate": 4.955273299787453e-05, "loss": 0.1223, "step": 650 }, { "epoch": 0.065, "eval_cos_sim": 0.8773767948150635, "eval_loss": 0.12359384205090472, "eval_runtime": 173.2149, "eval_samples_per_second": 23.093, "eval_steps_per_second": 1.443, "step": 650 }, { "epoch": 0.066, "grad_norm": 0.0831998735666275, "learning_rate": 4.203873069979081e-05, "loss": 0.1231, "step": 660 }, { "epoch": 0.066, "eval_cos_sim": 0.8774532675743103, "eval_loss": 0.12351777221905659, "eval_runtime": 171.902, "eval_samples_per_second": 23.269, "eval_steps_per_second": 1.454, "step": 660 }, { "epoch": 0.067, "grad_norm": 0.07724840193986893, "learning_rate": 2.7953132048972646e-05, "loss": 0.122, "step": 670 }, { "epoch": 0.067, "eval_cos_sim": 0.877151608467102, "eval_loss": 0.12382214214550921, "eval_runtime": 173.6766, "eval_samples_per_second": 23.031, "eval_steps_per_second": 1.439, "step": 670 }, { "epoch": 0.068, "grad_norm": 0.0648268312215805, "learning_rate": 1.2728552323560239e-05, "loss": 0.1227, "step": 680 }, { "epoch": 0.068, "eval_cos_sim": 0.8769506216049194, "eval_loss": 0.12402295615422199, "eval_runtime": 171.7424, "eval_samples_per_second": 23.291, "eval_steps_per_second": 1.456, "step": 680 }, { "epoch": 0.069, "grad_norm": 0.08475865423679352, "learning_rate": 2.2368956200634283e-06, "loss": 0.1274, "step": 690 }, { "epoch": 0.069, "eval_cos_sim": 0.8771329522132874, "eval_loss": 0.12383969738232563, "eval_runtime": 174.2776, "eval_samples_per_second": 22.952, "eval_steps_per_second": 1.434, "step": 690 }, { "epoch": 0.07, "grad_norm": 0.06382860988378525, "learning_rate": 4.947535513144286e-05, "loss": 0.122, "step": 700 }, { "epoch": 0.07, "eval_cos_sim": 0.8775114417076111, "eval_loss": 0.12346241619336079, "eval_runtime": 185.1334, "eval_samples_per_second": 21.606, "eval_steps_per_second": 1.35, "step": 700 }, { "epoch": 0.071, "grad_norm": 0.07273228466510773, "learning_rate": 4.174780914294635e-05, "loss": 0.1228, "step": 710 }, { "epoch": 0.071, "eval_cos_sim": 0.8777372241020203, "eval_loss": 0.12323929693448019, "eval_runtime": 170.2151, "eval_samples_per_second": 23.5, "eval_steps_per_second": 1.469, "step": 710 }, { "epoch": 0.072, "grad_norm": 0.08377543836832047, "learning_rate": 2.756087111291529e-05, "loss": 0.1209, "step": 720 }, { "epoch": 0.072, "eval_cos_sim": 0.8776744604110718, "eval_loss": 0.12329552843319844, "eval_runtime": 173.1907, "eval_samples_per_second": 23.096, "eval_steps_per_second": 1.443, "step": 720 }, { "epoch": 0.073, "grad_norm": 0.08579932153224945, "learning_rate": 1.2386241478270527e-05, "loss": 0.1234, "step": 730 }, { "epoch": 0.073, "eval_cos_sim": 0.8776343464851379, "eval_loss": 0.12333650018917988, "eval_runtime": 172.2784, "eval_samples_per_second": 23.218, "eval_steps_per_second": 1.451, "step": 730 }, { "epoch": 0.074, "grad_norm": 0.07494545727968216, "learning_rate": 2.0765592951802664e-06, "loss": 0.1209, "step": 740 }, { "epoch": 0.074, "eval_cos_sim": 0.8777279853820801, "eval_loss": 0.12324421884762715, "eval_runtime": 172.9417, "eval_samples_per_second": 23.129, "eval_steps_per_second": 1.446, "step": 740 }, { "epoch": 0.075, "grad_norm": 0.07511463761329651, "learning_rate": 4.9391877493394335e-05, "loss": 0.1222, "step": 750 }, { "epoch": 0.075, "eval_cos_sim": 0.8777404427528381, "eval_loss": 0.12323040797459553, "eval_runtime": 173.813, "eval_samples_per_second": 23.013, "eval_steps_per_second": 1.438, "step": 750 }, { "epoch": 0.076, "grad_norm": 0.08240217715501785, "learning_rate": 4.1452713680951016e-05, "loss": 0.1237, "step": 760 }, { "epoch": 0.076, "eval_cos_sim": 0.8776569366455078, "eval_loss": 0.1233164258216567, "eval_runtime": 173.6453, "eval_samples_per_second": 23.035, "eval_steps_per_second": 1.44, "step": 760 }, { "epoch": 0.077, "grad_norm": 0.07817904651165009, "learning_rate": 2.716797195408887e-05, "loss": 0.1215, "step": 770 }, { "epoch": 0.077, "eval_cos_sim": 0.8779506683349609, "eval_loss": 0.12303087331997822, "eval_runtime": 198.4978, "eval_samples_per_second": 20.151, "eval_steps_per_second": 1.259, "step": 770 }, { "epoch": 0.078, "grad_norm": 0.06472489982843399, "learning_rate": 1.2047074246048157e-05, "loss": 0.1222, "step": 780 }, { "epoch": 0.078, "eval_cos_sim": 0.8780341148376465, "eval_loss": 0.12294723345982503, "eval_runtime": 187.0246, "eval_samples_per_second": 21.388, "eval_steps_per_second": 1.337, "step": 780 }, { "epoch": 0.079, "grad_norm": 0.06511878967285156, "learning_rate": 1.921935972303521e-06, "loss": 0.1211, "step": 790 }, { "epoch": 0.079, "eval_cos_sim": 0.8780234456062317, "eval_loss": 0.1229577579711623, "eval_runtime": 170.8199, "eval_samples_per_second": 23.416, "eval_steps_per_second": 1.464, "step": 790 }, { "epoch": 0.08, "grad_norm": 0.08275925368070602, "learning_rate": 4.9302320888106454e-05, "loss": 0.1234, "step": 800 }, { "epoch": 0.08, "eval_cos_sim": 0.8778801560401917, "eval_loss": 0.1230986237739272, "eval_runtime": 175.6448, "eval_samples_per_second": 22.773, "eval_steps_per_second": 1.423, "step": 800 }, { "epoch": 0.081, "grad_norm": 0.06466321647167206, "learning_rate": 4.115351785778022e-05, "loss": 0.1215, "step": 810 }, { "epoch": 0.081, "eval_cos_sim": 0.877547025680542, "eval_loss": 0.12342484547841023, "eval_runtime": 173.845, "eval_samples_per_second": 23.009, "eval_steps_per_second": 1.438, "step": 810 }, { "epoch": 0.082, "grad_norm": 0.060175709426403046, "learning_rate": 2.6774532491200373e-05, "loss": 0.1237, "step": 820 }, { "epoch": 0.082, "eval_cos_sim": 0.8778981566429138, "eval_loss": 0.1230772545551009, "eval_runtime": 174.1784, "eval_samples_per_second": 22.965, "eval_steps_per_second": 1.435, "step": 820 }, { "epoch": 0.083, "grad_norm": 0.06948266923427582, "learning_rate": 1.1711135154477437e-05, "loss": 0.1213, "step": 830 }, { "epoch": 0.083, "eval_cos_sim": 0.8779332041740417, "eval_loss": 0.12304716589199971, "eval_runtime": 171.7677, "eval_samples_per_second": 23.287, "eval_steps_per_second": 1.455, "step": 830 }, { "epoch": 0.084, "grad_norm": 0.0633857399225235, "learning_rate": 1.7730641868067276e-06, "loss": 0.1212, "step": 840 }, { "epoch": 0.084, "eval_cos_sim": 0.8779239058494568, "eval_loss": 0.12305730154263447, "eval_runtime": 172.6941, "eval_samples_per_second": 23.162, "eval_steps_per_second": 1.448, "step": 840 }, { "epoch": 0.085, "grad_norm": 0.07013432681560516, "learning_rate": 4.9206707634962714e-05, "loss": 0.1219, "step": 850 }, { "epoch": 0.085, "eval_cos_sim": 0.8781536221504211, "eval_loss": 0.12283129765736531, "eval_runtime": 178.3382, "eval_samples_per_second": 22.429, "eval_steps_per_second": 1.402, "step": 850 }, { "epoch": 0.086, "grad_norm": 0.0714387595653534, "learning_rate": 4.085029623930606e-05, "loss": 0.1214, "step": 860 }, { "epoch": 0.086, "eval_cos_sim": 0.8783000111579895, "eval_loss": 0.12268445636975239, "eval_runtime": 180.4291, "eval_samples_per_second": 22.169, "eval_steps_per_second": 1.386, "step": 860 }, { "epoch": 0.087, "grad_norm": 0.07285313308238983, "learning_rate": 2.638065077761282e-05, "loss": 0.1211, "step": 870 }, { "epoch": 0.087, "eval_cos_sim": 0.8782742619514465, "eval_loss": 0.12271090867268514, "eval_runtime": 174.6757, "eval_samples_per_second": 22.9, "eval_steps_per_second": 1.431, "step": 870 }, { "epoch": 0.088, "grad_norm": 0.1114286258816719, "learning_rate": 1.1378507926623341e-05, "loss": 0.1203, "step": 880 }, { "epoch": 0.088, "eval_cos_sim": 0.8782421946525574, "eval_loss": 0.12274044944989156, "eval_runtime": 173.5126, "eval_samples_per_second": 23.053, "eval_steps_per_second": 1.441, "step": 880 }, { "epoch": 0.089, "grad_norm": 0.07392691820859909, "learning_rate": 1.6299810406600836e-06, "loss": 0.1222, "step": 890 }, { "epoch": 0.089, "eval_cos_sim": 0.8782600164413452, "eval_loss": 0.12272232272374105, "eval_runtime": 173.9745, "eval_samples_per_second": 22.992, "eval_steps_per_second": 1.437, "step": 890 }, { "epoch": 0.09, "grad_norm": 0.1509944051504135, "learning_rate": 4.9105061562790325e-05, "loss": 0.1211, "step": 900 }, { "epoch": 0.09, "eval_cos_sim": 0.8785330653190613, "eval_loss": 0.12244940116154622, "eval_runtime": 174.6529, "eval_samples_per_second": 22.903, "eval_steps_per_second": 1.431, "step": 900 }, { "epoch": 0.091, "grad_norm": 0.07572964578866959, "learning_rate": 4.0543124394712475e-05, "loss": 0.1234, "step": 910 }, { "epoch": 0.091, "eval_cos_sim": 0.8782286643981934, "eval_loss": 0.1227607171748824, "eval_runtime": 174.4786, "eval_samples_per_second": 22.925, "eval_steps_per_second": 1.433, "step": 910 }, { "epoch": 0.092, "grad_norm": 0.07199128717184067, "learning_rate": 2.5986424976906166e-05, "loss": 0.1202, "step": 920 }, { "epoch": 0.092, "eval_cos_sim": 0.8780964612960815, "eval_loss": 0.12288942649113606, "eval_runtime": 175.9134, "eval_samples_per_second": 22.738, "eval_steps_per_second": 1.421, "step": 920 }, { "epoch": 0.093, "grad_norm": 0.07497607171535492, "learning_rate": 1.1049275460163872e-05, "loss": 0.123, "step": 930 }, { "epoch": 0.093, "eval_cos_sim": 0.8781337141990662, "eval_loss": 0.12284465791928242, "eval_runtime": 174.1009, "eval_samples_per_second": 22.975, "eval_steps_per_second": 1.436, "step": 930 }, { "epoch": 0.094, "grad_norm": 0.056581463664770126, "learning_rate": 1.4927221931830576e-06, "loss": 0.1218, "step": 940 }, { "epoch": 0.094, "eval_cos_sim": 0.8781940340995789, "eval_loss": 0.12278383018719624, "eval_runtime": 180.3511, "eval_samples_per_second": 22.179, "eval_steps_per_second": 1.386, "step": 940 }, { "epoch": 0.095, "grad_norm": 0.06227719038724899, "learning_rate": 4.8997408003921384e-05, "loss": 0.1216, "step": 950 }, { "epoch": 0.095, "eval_cos_sim": 0.8782709836959839, "eval_loss": 0.12271020819889973, "eval_runtime": 174.3195, "eval_samples_per_second": 22.946, "eval_steps_per_second": 1.434, "step": 950 }, { "epoch": 0.096, "grad_norm": 0.07964574545621872, "learning_rate": 4.02320788776628e-05, "loss": 0.1205, "step": 960 }, { "epoch": 0.096, "eval_cos_sim": 0.8782918453216553, "eval_loss": 0.12269965698468159, "eval_runtime": 171.8922, "eval_samples_per_second": 23.27, "eval_steps_per_second": 1.454, "step": 960 }, { "epoch": 0.097, "grad_norm": 0.059999242424964905, "learning_rate": 2.559195333841573e-05, "loss": 0.1224, "step": 970 }, { "epoch": 0.097, "eval_cos_sim": 0.8782675862312317, "eval_loss": 0.12272447182881306, "eval_runtime": 178.4336, "eval_samples_per_second": 22.417, "eval_steps_per_second": 1.401, "step": 970 }, { "epoch": 0.098, "grad_norm": 0.07078584283590317, "learning_rate": 1.0723519806732741e-05, "loss": 0.1226, "step": 980 }, { "epoch": 0.098, "eval_cos_sim": 0.8782561421394348, "eval_loss": 0.12273399831997822, "eval_runtime": 172.0171, "eval_samples_per_second": 23.254, "eval_steps_per_second": 1.453, "step": 980 }, { "epoch": 0.099, "grad_norm": 0.0700722336769104, "learning_rate": 1.3613218521583647e-06, "loss": 0.1189, "step": 990 }, { "epoch": 0.099, "eval_cos_sim": 0.8782747387886047, "eval_loss": 0.1227147035812087, "eval_runtime": 174.8389, "eval_samples_per_second": 22.878, "eval_steps_per_second": 1.43, "step": 990 }, { "epoch": 0.1, "grad_norm": 0.06270556151866913, "learning_rate": 4.888377378787991e-05, "loss": 0.1209, "step": 1000 }, { "epoch": 0.1, "eval_cos_sim": 0.8783043622970581, "eval_loss": 0.12268760301815938, "eval_runtime": 171.6574, "eval_samples_per_second": 23.302, "eval_steps_per_second": 1.456, "step": 1000 }, { "epoch": 0.101, "grad_norm": 0.059303585439920425, "learning_rate": 3.9917237207221514e-05, "loss": 0.1206, "step": 1010 }, { "epoch": 0.101, "eval_cos_sim": 0.8785374760627747, "eval_loss": 0.12245997311818074, "eval_runtime": 173.2279, "eval_samples_per_second": 23.091, "eval_steps_per_second": 1.443, "step": 1010 }, { "epoch": 0.102, "grad_norm": 0.06463504582643509, "learning_rate": 2.519733417274297e-05, "loss": 0.122, "step": 1020 }, { "epoch": 0.102, "eval_cos_sim": 0.8785625100135803, "eval_loss": 0.12243694259869527, "eval_runtime": 179.8429, "eval_samples_per_second": 22.242, "eval_steps_per_second": 1.39, "step": 1020 }, { "epoch": 0.103, "grad_norm": 0.06594408303499222, "learning_rate": 1.0401322151467458e-05, "loss": 0.1226, "step": 1030 }, { "epoch": 0.103, "eval_cos_sim": 0.8784922361373901, "eval_loss": 0.1225029034827895, "eval_runtime": 171.8585, "eval_samples_per_second": 23.275, "eval_steps_per_second": 1.455, "step": 1030 }, { "epoch": 0.104, "grad_norm": 0.061140164732933044, "learning_rate": 1.2358127653053858e-06, "loss": 0.122, "step": 1040 }, { "epoch": 0.104, "eval_cos_sim": 0.8785346746444702, "eval_loss": 0.12245874931561421, "eval_runtime": 170.3116, "eval_samples_per_second": 23.486, "eval_steps_per_second": 1.468, "step": 1040 }, { "epoch": 0.105, "grad_norm": 0.06770511716604233, "learning_rate": 4.876418723469453e-05, "loss": 0.1196, "step": 1050 }, { "epoch": 0.105, "eval_cos_sim": 0.878551721572876, "eval_loss": 0.12243552591549825, "eval_runtime": 173.9331, "eval_samples_per_second": 22.997, "eval_steps_per_second": 1.437, "step": 1050 }, { "epoch": 0.106, "grad_norm": 0.06050929054617882, "learning_rate": 3.959867784853255e-05, "loss": 0.1219, "step": 1060 }, { "epoch": 0.106, "eval_cos_sim": 0.8784484267234802, "eval_loss": 0.12253486802327107, "eval_runtime": 175.2374, "eval_samples_per_second": 22.826, "eval_steps_per_second": 1.427, "step": 1060 }, { "epoch": 0.107, "grad_norm": 0.07329047471284866, "learning_rate": 2.4802665827257035e-05, "loss": 0.1214, "step": 1070 }, { "epoch": 0.107, "eval_cos_sim": 0.8785268068313599, "eval_loss": 0.12246101453053426, "eval_runtime": 172.381, "eval_samples_per_second": 23.204, "eval_steps_per_second": 1.45, "step": 1070 }, { "epoch": 0.108, "grad_norm": 0.061687979847192764, "learning_rate": 1.0082762792778497e-05, "loss": 0.1206, "step": 1080 }, { "epoch": 0.108, "eval_cos_sim": 0.8787024617195129, "eval_loss": 0.12228504302250813, "eval_runtime": 171.0068, "eval_samples_per_second": 23.391, "eval_steps_per_second": 1.462, "step": 1080 }, { "epoch": 0.109, "grad_norm": 0.06697102636098862, "learning_rate": 1.1162262121200917e-06, "loss": 0.1216, "step": 1090 }, { "epoch": 0.109, "eval_cos_sim": 0.8787557482719421, "eval_loss": 0.12223189308392476, "eval_runtime": 172.5647, "eval_samples_per_second": 23.18, "eval_steps_per_second": 1.449, "step": 1090 }, { "epoch": 0.11, "grad_norm": 0.06245901808142662, "learning_rate": 4.8638678147841726e-05, "loss": 0.1224, "step": 1100 }, { "epoch": 0.11, "eval_cos_sim": 0.878864049911499, "eval_loss": 0.12212434603917073, "eval_runtime": 177.5612, "eval_samples_per_second": 22.527, "eval_steps_per_second": 1.408, "step": 1100 }, { "epoch": 0.111, "grad_norm": 0.07445187121629715, "learning_rate": 3.9276480193267495e-05, "loss": 0.1226, "step": 1110 }, { "epoch": 0.111, "eval_cos_sim": 0.8787615895271301, "eval_loss": 0.12223191478001545, "eval_runtime": 170.2386, "eval_samples_per_second": 23.496, "eval_steps_per_second": 1.469, "step": 1110 }, { "epoch": 0.112, "grad_norm": 0.06328488141298294, "learning_rate": 2.4408046661584553e-05, "loss": 0.1205, "step": 1120 }, { "epoch": 0.112, "eval_cos_sim": 0.8786949515342712, "eval_loss": 0.12229911091076802, "eval_runtime": 173.6977, "eval_samples_per_second": 23.029, "eval_steps_per_second": 1.439, "step": 1120 }, { "epoch": 0.113, "grad_norm": 0.1140422523021698, "learning_rate": 9.767921122337203e-06, "loss": 0.1213, "step": 1130 }, { "epoch": 0.113, "eval_cos_sim": 0.8787314295768738, "eval_loss": 0.12225894191014242, "eval_runtime": 176.5254, "eval_samples_per_second": 22.66, "eval_steps_per_second": 1.416, "step": 1130 }, { "epoch": 0.114, "grad_norm": 0.07940120995044708, "learning_rate": 1.0025919960786169e-06, "loss": 0.1216, "step": 1140 }, { "epoch": 0.114, "eval_cos_sim": 0.878764271736145, "eval_loss": 0.12222567083584737, "eval_runtime": 173.6241, "eval_samples_per_second": 23.038, "eval_steps_per_second": 1.44, "step": 1140 }, { "epoch": 0.115, "grad_norm": 0.06326926499605179, "learning_rate": 4.850727780681685e-05, "loss": 0.121, "step": 1150 }, { "epoch": 0.115, "eval_cos_sim": 0.8787913918495178, "eval_loss": 0.1222020423625655, "eval_runtime": 197.6043, "eval_samples_per_second": 20.242, "eval_steps_per_second": 1.265, "step": 1150 }, { "epoch": 0.116, "grad_norm": 0.06304363161325455, "learning_rate": 3.89507245398359e-05, "loss": 0.1212, "step": 1160 }, { "epoch": 0.116, "eval_cos_sim": 0.8788431286811829, "eval_loss": 0.1221448552821822, "eval_runtime": 180.7769, "eval_samples_per_second": 22.127, "eval_steps_per_second": 1.383, "step": 1160 }, { "epoch": 0.117, "grad_norm": 0.06048878654837608, "learning_rate": 2.4013575023093562e-05, "loss": 0.121, "step": 1170 }, { "epoch": 0.117, "eval_cos_sim": 0.8789100050926208, "eval_loss": 0.12207724287259053, "eval_runtime": 175.5012, "eval_samples_per_second": 22.792, "eval_steps_per_second": 1.424, "step": 1170 }, { "epoch": 0.118, "grad_norm": 0.060076240450143814, "learning_rate": 9.456875605287529e-06, "loss": 0.1208, "step": 1180 }, { "epoch": 0.118, "eval_cos_sim": 0.8789265751838684, "eval_loss": 0.12206284239041279, "eval_runtime": 179.6264, "eval_samples_per_second": 22.268, "eval_steps_per_second": 1.392, "step": 1180 }, { "epoch": 0.119, "grad_norm": 0.06535797566175461, "learning_rate": 8.949384372096747e-07, "loss": 0.1224, "step": 1190 }, { "epoch": 0.119, "eval_cos_sim": 0.8789151310920715, "eval_loss": 0.12207536175000142, "eval_runtime": 173.573, "eval_samples_per_second": 23.045, "eval_steps_per_second": 1.44, "step": 1190 }, { "epoch": 0.12, "grad_norm": 0.051111843436956406, "learning_rate": 4.8370018959339916e-05, "loss": 0.1216, "step": 1200 }, { "epoch": 0.12, "eval_cos_sim": 0.878704845905304, "eval_loss": 0.1222877917503066, "eval_runtime": 170.7747, "eval_samples_per_second": 23.423, "eval_steps_per_second": 1.464, "step": 1200 }, { "epoch": 0.121, "grad_norm": 0.07394807785749435, "learning_rate": 3.862149207337666e-05, "loss": 0.1227, "step": 1210 }, { "epoch": 0.121, "eval_cos_sim": 0.8786987662315369, "eval_loss": 0.12228692223774862, "eval_runtime": 172.7735, "eval_samples_per_second": 23.152, "eval_steps_per_second": 1.447, "step": 1210 }, { "epoch": 0.122, "grad_norm": 0.06019896641373634, "learning_rate": 2.3619349222387182e-05, "loss": 0.1194, "step": 1220 }, { "epoch": 0.122, "eval_cos_sim": 0.8791972398757935, "eval_loss": 0.12178870942341757, "eval_runtime": 171.5715, "eval_samples_per_second": 23.314, "eval_steps_per_second": 1.457, "step": 1220 }, { "epoch": 0.123, "grad_norm": 0.05350535735487938, "learning_rate": 9.149703760694162e-06, "loss": 0.1214, "step": 1230 }, { "epoch": 0.123, "eval_cos_sim": 0.8792542219161987, "eval_loss": 0.12173621847378684, "eval_runtime": 173.1804, "eval_samples_per_second": 23.097, "eval_steps_per_second": 1.444, "step": 1230 }, { "epoch": 0.124, "grad_norm": 0.06338366866111755, "learning_rate": 7.932923650373624e-07, "loss": 0.1194, "step": 1240 }, { "epoch": 0.124, "eval_cos_sim": 0.8792427182197571, "eval_loss": 0.12174849869954062, "eval_runtime": 172.0716, "eval_samples_per_second": 23.246, "eval_steps_per_second": 1.453, "step": 1240 }, { "epoch": 0.125, "grad_norm": 0.052142199128866196, "learning_rate": 4.822693581319333e-05, "loss": 0.12, "step": 1250 }, { "epoch": 0.125, "eval_cos_sim": 0.8787649869918823, "eval_loss": 0.1222243664478011, "eval_runtime": 172.6696, "eval_samples_per_second": 23.166, "eval_steps_per_second": 1.448, "step": 1250 }, { "epoch": 0.126, "grad_norm": 0.0695052519440651, "learning_rate": 3.828886484552269e-05, "loss": 0.1213, "step": 1260 }, { "epoch": 0.126, "eval_cos_sim": 0.8785125017166138, "eval_loss": 0.12247128774868916, "eval_runtime": 182.4937, "eval_samples_per_second": 21.919, "eval_steps_per_second": 1.37, "step": 1260 }, { "epoch": 0.127, "grad_norm": 0.07181504368782043, "learning_rate": 2.3225467508799494e-05, "loss": 0.1216, "step": 1270 }, { "epoch": 0.127, "eval_cos_sim": 0.8791427612304688, "eval_loss": 0.12184033658253621, "eval_runtime": 172.8353, "eval_samples_per_second": 23.143, "eval_steps_per_second": 1.446, "step": 1270 }, { "epoch": 0.128, "grad_norm": 0.06035405769944191, "learning_rate": 8.846482142219678e-06, "loss": 0.12, "step": 1280 }, { "epoch": 0.128, "eval_cos_sim": 0.8793256282806396, "eval_loss": 0.121661689779634, "eval_runtime": 173.4166, "eval_samples_per_second": 23.066, "eval_steps_per_second": 1.442, "step": 1280 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 110, "trial_name": null, "trial_params": null }