| { | |
| "best_metric": 0.121661689779634, | |
| "best_model_checkpoint": "/workspace/disk2/krishna/checkpoints/checkpoint-1280", | |
| "epoch": 0.128, | |
| "eval_steps": 10, | |
| "global_step": 1280, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 0.11198576539754868, | |
| "learning_rate": 1e-05, | |
| "loss": 0.126, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "eval_cos_sim": 0.8696296215057373, | |
| "eval_loss": 0.13132101871716445, | |
| "eval_runtime": 191.9539, | |
| "eval_samples_per_second": 20.838, | |
| "eval_steps_per_second": 1.302, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "grad_norm": 0.19444850087165833, | |
| "learning_rate": 2e-05, | |
| "loss": 0.1267, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.002, | |
| "eval_cos_sim": 0.8698329329490662, | |
| "eval_loss": 0.1311149292205519, | |
| "eval_runtime": 177.5098, | |
| "eval_samples_per_second": 22.534, | |
| "eval_steps_per_second": 1.408, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "grad_norm": 0.12954622507095337, | |
| "learning_rate": 3e-05, | |
| "loss": 0.1271, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.003, | |
| "eval_cos_sim": 0.8700494766235352, | |
| "eval_loss": 0.1309011602615065, | |
| "eval_runtime": 179.7068, | |
| "eval_samples_per_second": 22.258, | |
| "eval_steps_per_second": 1.391, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "grad_norm": 0.11514733731746674, | |
| "learning_rate": 4e-05, | |
| "loss": 0.1265, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.004, | |
| "eval_cos_sim": 0.870728075504303, | |
| "eval_loss": 0.13021534349667496, | |
| "eval_runtime": 174.4918, | |
| "eval_samples_per_second": 22.924, | |
| "eval_steps_per_second": 1.433, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "grad_norm": 0.34224584698677063, | |
| "learning_rate": 5e-05, | |
| "loss": 0.1273, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.005, | |
| "eval_cos_sim": 0.8705285787582397, | |
| "eval_loss": 0.1304176144813246, | |
| "eval_runtime": 175.5157, | |
| "eval_samples_per_second": 22.79, | |
| "eval_steps_per_second": 1.424, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "grad_norm": 0.1085827499628067, | |
| "learning_rate": 4.517892759404963e-05, | |
| "loss": 0.125, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.006, | |
| "eval_cos_sim": 0.8709338903427124, | |
| "eval_loss": 0.130007851145143, | |
| "eval_runtime": 173.9237, | |
| "eval_samples_per_second": 22.999, | |
| "eval_steps_per_second": 1.437, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "grad_norm": 0.11786766350269318, | |
| "learning_rate": 3.257512950767182e-05, | |
| "loss": 0.1291, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.007, | |
| "eval_cos_sim": 0.8714690208435059, | |
| "eval_loss": 0.12946533443676894, | |
| "eval_runtime": 177.0345, | |
| "eval_samples_per_second": 22.594, | |
| "eval_steps_per_second": 1.412, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "grad_norm": 0.10741184651851654, | |
| "learning_rate": 1.7049711594019046e-05, | |
| "loss": 0.1285, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.008, | |
| "eval_cos_sim": 0.8719983696937561, | |
| "eval_loss": 0.1289418597434706, | |
| "eval_runtime": 178.6566, | |
| "eval_samples_per_second": 22.389, | |
| "eval_steps_per_second": 1.399, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "grad_norm": 0.12072350829839706, | |
| "learning_rate": 4.590606964640023e-06, | |
| "loss": 0.125, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.009, | |
| "eval_cos_sim": 0.8721248507499695, | |
| "eval_loss": 0.12881728055226274, | |
| "eval_runtime": 181.5969, | |
| "eval_samples_per_second": 22.027, | |
| "eval_steps_per_second": 1.377, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 0.11123672872781754, | |
| "learning_rate": 4.999688473794144e-05, | |
| "loss": 0.1249, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "eval_cos_sim": 0.8721336722373962, | |
| "eval_loss": 0.12880885388600297, | |
| "eval_runtime": 174.6097, | |
| "eval_samples_per_second": 22.908, | |
| "eval_steps_per_second": 1.432, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "grad_norm": 0.11100038141012192, | |
| "learning_rate": 4.494343314093799e-05, | |
| "loss": 0.1246, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.011, | |
| "eval_cos_sim": 0.8723854422569275, | |
| "eval_loss": 0.1285583892081923, | |
| "eval_runtime": 180.7772, | |
| "eval_samples_per_second": 22.127, | |
| "eval_steps_per_second": 1.383, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "grad_norm": 0.11933281272649765, | |
| "learning_rate": 3.219808272827916e-05, | |
| "loss": 0.1265, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.012, | |
| "eval_cos_sim": 0.8727645874023438, | |
| "eval_loss": 0.12819017722355788, | |
| "eval_runtime": 176.8881, | |
| "eval_samples_per_second": 22.613, | |
| "eval_steps_per_second": 1.413, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "grad_norm": 0.11295568197965622, | |
| "learning_rate": 1.667653407425597e-05, | |
| "loss": 0.1256, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.013, | |
| "eval_cos_sim": 0.8724489808082581, | |
| "eval_loss": 0.12850400116192764, | |
| "eval_runtime": 176.2937, | |
| "eval_samples_per_second": 22.689, | |
| "eval_steps_per_second": 1.418, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "grad_norm": 0.10013717412948608, | |
| "learning_rate": 4.365227971950606e-06, | |
| "loss": 0.1252, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.014, | |
| "eval_cos_sim": 0.8726389408111572, | |
| "eval_loss": 0.1283098426078505, | |
| "eval_runtime": 175.1837, | |
| "eval_samples_per_second": 22.833, | |
| "eval_steps_per_second": 1.427, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "grad_norm": 0.08663387596607208, | |
| "learning_rate": 4.998753972815435e-05, | |
| "loss": 0.1252, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.015, | |
| "eval_cos_sim": 0.8726971745491028, | |
| "eval_loss": 0.12825069954144425, | |
| "eval_runtime": 179.297, | |
| "eval_samples_per_second": 22.309, | |
| "eval_steps_per_second": 1.394, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "grad_norm": 0.10253303498029709, | |
| "learning_rate": 4.47029683661798e-05, | |
| "loss": 0.1258, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.016, | |
| "eval_cos_sim": 0.8739002346992493, | |
| "eval_loss": 0.12703985621678301, | |
| "eval_runtime": 175.0922, | |
| "eval_samples_per_second": 22.845, | |
| "eval_steps_per_second": 1.428, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "grad_norm": 0.11590978503227234, | |
| "learning_rate": 3.1819242035765096e-05, | |
| "loss": 0.1219, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.017, | |
| "eval_cos_sim": 0.8737954497337341, | |
| "eval_loss": 0.12715704419362017, | |
| "eval_runtime": 180.7326, | |
| "eval_samples_per_second": 22.132, | |
| "eval_steps_per_second": 1.383, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "grad_norm": 0.09687651693820953, | |
| "learning_rate": 1.6305430936700428e-05, | |
| "loss": 0.1244, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.018, | |
| "eval_cos_sim": 0.8735443353652954, | |
| "eval_loss": 0.12740902497517534, | |
| "eval_runtime": 177.9084, | |
| "eval_samples_per_second": 22.483, | |
| "eval_steps_per_second": 1.405, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "grad_norm": 0.10086172819137573, | |
| "learning_rate": 4.144991597052059e-06, | |
| "loss": 0.1258, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.019, | |
| "eval_cos_sim": 0.8735744953155518, | |
| "eval_loss": 0.12737621738659807, | |
| "eval_runtime": 174.0483, | |
| "eval_samples_per_second": 22.982, | |
| "eval_steps_per_second": 1.436, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 0.09316889941692352, | |
| "learning_rate": 4.9971967299611097e-05, | |
| "loss": 0.122, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_cos_sim": 0.8735851645469666, | |
| "eval_loss": 0.12736523821103043, | |
| "eval_runtime": 176.3327, | |
| "eval_samples_per_second": 22.684, | |
| "eval_steps_per_second": 1.418, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "grad_norm": 0.10805534571409225, | |
| "learning_rate": 4.4457593198638246e-05, | |
| "loss": 0.1256, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.021, | |
| "eval_cos_sim": 0.8735992312431335, | |
| "eval_loss": 0.12734888651120133, | |
| "eval_runtime": 177.4342, | |
| "eval_samples_per_second": 22.544, | |
| "eval_steps_per_second": 1.409, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "grad_norm": 0.14335550367832184, | |
| "learning_rate": 3.143870184517241e-05, | |
| "loss": 0.1228, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.022, | |
| "eval_cos_sim": 0.8742734789848328, | |
| "eval_loss": 0.1266735837672896, | |
| "eval_runtime": 174.698, | |
| "eval_samples_per_second": 22.897, | |
| "eval_steps_per_second": 1.431, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "grad_norm": 0.10455214232206345, | |
| "learning_rate": 1.5936494668034417e-05, | |
| "loss": 0.1235, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.023, | |
| "eval_cos_sim": 0.874700129032135, | |
| "eval_loss": 0.12624898936497636, | |
| "eval_runtime": 175.2174, | |
| "eval_samples_per_second": 22.829, | |
| "eval_steps_per_second": 1.427, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "grad_norm": 0.10344243049621582, | |
| "learning_rate": 3.9299527274662355e-06, | |
| "loss": 0.1258, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.024, | |
| "eval_cos_sim": 0.8746932148933411, | |
| "eval_loss": 0.1262588949416823, | |
| "eval_runtime": 178.5496, | |
| "eval_samples_per_second": 22.403, | |
| "eval_steps_per_second": 1.4, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 0.1515665352344513, | |
| "learning_rate": 4.9950171333287335e-05, | |
| "loss": 0.1259, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "eval_cos_sim": 0.8746062517166138, | |
| "eval_loss": 0.1263456218455977, | |
| "eval_runtime": 181.2208, | |
| "eval_samples_per_second": 22.073, | |
| "eval_steps_per_second": 1.38, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "grad_norm": 0.08521851152181625, | |
| "learning_rate": 4.420736879094929e-05, | |
| "loss": 0.123, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.026, | |
| "eval_cos_sim": 0.8742081522941589, | |
| "eval_loss": 0.1267440173839278, | |
| "eval_runtime": 172.3377, | |
| "eval_samples_per_second": 23.21, | |
| "eval_steps_per_second": 1.451, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "grad_norm": 0.24638278782367706, | |
| "learning_rate": 3.105655699509455e-05, | |
| "loss": 0.1246, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.027, | |
| "eval_cos_sim": 0.8748664259910583, | |
| "eval_loss": 0.12609003236042926, | |
| "eval_runtime": 175.6344, | |
| "eval_samples_per_second": 22.775, | |
| "eval_steps_per_second": 1.423, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "grad_norm": 0.09267835319042206, | |
| "learning_rate": 1.5569817214910634e-05, | |
| "loss": 0.1246, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.028, | |
| "eval_cos_sim": 0.8748399615287781, | |
| "eval_loss": 0.12611397721516557, | |
| "eval_runtime": 175.9072, | |
| "eval_samples_per_second": 22.739, | |
| "eval_steps_per_second": 1.421, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "grad_norm": 0.1712462306022644, | |
| "learning_rate": 3.720164955387656e-06, | |
| "loss": 0.1243, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.029, | |
| "eval_cos_sim": 0.8749127388000488, | |
| "eval_loss": 0.1260433347438521, | |
| "eval_runtime": 176.0561, | |
| "eval_samples_per_second": 22.72, | |
| "eval_steps_per_second": 1.42, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 0.07719286531209946, | |
| "learning_rate": 4.992215726119483e-05, | |
| "loss": 0.1227, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "eval_cos_sim": 0.8748821020126343, | |
| "eval_loss": 0.1260761695121474, | |
| "eval_runtime": 174.2263, | |
| "eval_samples_per_second": 22.959, | |
| "eval_steps_per_second": 1.435, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "grad_norm": 0.08637545257806778, | |
| "learning_rate": 4.395235750428112e-05, | |
| "loss": 0.1222, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.031, | |
| "eval_cos_sim": 0.8745994567871094, | |
| "eval_loss": 0.12635979654538104, | |
| "eval_runtime": 179.4806, | |
| "eval_samples_per_second": 22.287, | |
| "eval_steps_per_second": 1.393, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "grad_norm": 0.0923767164349556, | |
| "learning_rate": 3.0672902724039794e-05, | |
| "loss": 0.1232, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.032, | |
| "eval_cos_sim": 0.8750612735748291, | |
| "eval_loss": 0.1258947375034041, | |
| "eval_runtime": 181.1338, | |
| "eval_samples_per_second": 22.083, | |
| "eval_steps_per_second": 1.38, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "grad_norm": 0.08724959194660187, | |
| "learning_rate": 1.5205489961037645e-05, | |
| "loss": 0.1236, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.033, | |
| "eval_cos_sim": 0.8755974173545837, | |
| "eval_loss": 0.125363212845201, | |
| "eval_runtime": 198.751, | |
| "eval_samples_per_second": 20.126, | |
| "eval_steps_per_second": 1.258, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "grad_norm": 0.07283046841621399, | |
| "learning_rate": 3.5156805643271896e-06, | |
| "loss": 0.1239, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.034, | |
| "eval_cos_sim": 0.8756656646728516, | |
| "eval_loss": 0.12529714014279317, | |
| "eval_runtime": 187.9639, | |
| "eval_samples_per_second": 21.281, | |
| "eval_steps_per_second": 1.33, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "grad_norm": 0.15486685931682587, | |
| "learning_rate": 4.9887932065027656e-05, | |
| "loss": 0.1231, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.035, | |
| "eval_cos_sim": 0.8756564259529114, | |
| "eval_loss": 0.12530613209950398, | |
| "eval_runtime": 194.2503, | |
| "eval_samples_per_second": 20.592, | |
| "eval_steps_per_second": 1.287, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "grad_norm": 0.07505682110786438, | |
| "learning_rate": 4.369262289279271e-05, | |
| "loss": 0.1233, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.036, | |
| "eval_cos_sim": 0.8755001425743103, | |
| "eval_loss": 0.12546515204655598, | |
| "eval_runtime": 194.8309, | |
| "eval_samples_per_second": 20.531, | |
| "eval_steps_per_second": 1.283, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "grad_norm": 0.09688587486743927, | |
| "learning_rate": 3.0287834646695457e-05, | |
| "loss": 0.1259, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.037, | |
| "eval_cos_sim": 0.8756394386291504, | |
| "eval_loss": 0.1253258285735793, | |
| "eval_runtime": 188.2216, | |
| "eval_samples_per_second": 21.252, | |
| "eval_steps_per_second": 1.328, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "grad_norm": 0.07268425822257996, | |
| "learning_rate": 1.4843603704405253e-05, | |
| "loss": 0.1247, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.038, | |
| "eval_cos_sim": 0.8758111596107483, | |
| "eval_loss": 0.12515661337124775, | |
| "eval_runtime": 189.0095, | |
| "eval_samples_per_second": 21.163, | |
| "eval_steps_per_second": 1.323, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "grad_norm": 0.09875091165304184, | |
| "learning_rate": 3.316550516082126e-06, | |
| "loss": 0.1229, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.039, | |
| "eval_cos_sim": 0.8758672475814819, | |
| "eval_loss": 0.12509912636029194, | |
| "eval_runtime": 235.6105, | |
| "eval_samples_per_second": 16.977, | |
| "eval_steps_per_second": 1.061, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 0.0792056992650032, | |
| "learning_rate": 4.98475042744222e-05, | |
| "loss": 0.1246, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_cos_sim": 0.8759932518005371, | |
| "eval_loss": 0.12497495915638873, | |
| "eval_runtime": 200.3436, | |
| "eval_samples_per_second": 19.966, | |
| "eval_steps_per_second": 1.248, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "grad_norm": 0.10644775629043579, | |
| "learning_rate": 4.3428229687794505e-05, | |
| "loss": 0.1224, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.041, | |
| "eval_cos_sim": 0.8761371374130249, | |
| "eval_loss": 0.12483511426197956, | |
| "eval_runtime": 197.5074, | |
| "eval_samples_per_second": 20.252, | |
| "eval_steps_per_second": 1.266, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "grad_norm": 0.09292006492614746, | |
| "learning_rate": 2.9901448730099503e-05, | |
| "loss": 0.1239, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.042, | |
| "eval_cos_sim": 0.876413881778717, | |
| "eval_loss": 0.12455732419239948, | |
| "eval_runtime": 187.5784, | |
| "eval_samples_per_second": 21.324, | |
| "eval_steps_per_second": 1.333, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "grad_norm": 0.08105887472629547, | |
| "learning_rate": 1.448424863465538e-05, | |
| "loss": 0.1231, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.043, | |
| "eval_cos_sim": 0.876311719417572, | |
| "eval_loss": 0.12465796377407977, | |
| "eval_runtime": 203.0598, | |
| "eval_samples_per_second": 19.699, | |
| "eval_steps_per_second": 1.231, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "grad_norm": 0.15435349941253662, | |
| "learning_rate": 3.1228244380351547e-06, | |
| "loss": 0.1225, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.044, | |
| "eval_cos_sim": 0.8762248754501343, | |
| "eval_loss": 0.12474570634114215, | |
| "eval_runtime": 199.1025, | |
| "eval_samples_per_second": 20.09, | |
| "eval_steps_per_second": 1.256, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "grad_norm": 0.09370752424001694, | |
| "learning_rate": 4.980088396483146e-05, | |
| "loss": 0.1228, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.045, | |
| "eval_cos_sim": 0.8761196136474609, | |
| "eval_loss": 0.12484796597706745, | |
| "eval_runtime": 192.1246, | |
| "eval_samples_per_second": 20.82, | |
| "eval_steps_per_second": 1.301, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "grad_norm": 0.08999752253293991, | |
| "learning_rate": 4.3159243781616026e-05, | |
| "loss": 0.1229, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.046, | |
| "eval_cos_sim": 0.8762247562408447, | |
| "eval_loss": 0.12473729922520588, | |
| "eval_runtime": 196.5532, | |
| "eval_samples_per_second": 20.351, | |
| "eval_steps_per_second": 1.272, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "grad_norm": 0.0809365063905716, | |
| "learning_rate": 2.9513841269722613e-05, | |
| "loss": 0.124, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.047, | |
| "eval_cos_sim": 0.8765152096748352, | |
| "eval_loss": 0.12444968440281817, | |
| "eval_runtime": 204.1545, | |
| "eval_samples_per_second": 19.593, | |
| "eval_steps_per_second": 1.225, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "grad_norm": 0.08176057785749435, | |
| "learning_rate": 1.4127514310605238e-05, | |
| "loss": 0.123, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.048, | |
| "eval_cos_sim": 0.876448929309845, | |
| "eval_loss": 0.12451095607029865, | |
| "eval_runtime": 198.7286, | |
| "eval_samples_per_second": 20.128, | |
| "eval_steps_per_second": 1.258, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "grad_norm": 0.09636738151311874, | |
| "learning_rate": 2.934550610786291e-06, | |
| "loss": 0.1236, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.049, | |
| "eval_cos_sim": 0.8765274882316589, | |
| "eval_loss": 0.12443248560177753, | |
| "eval_runtime": 196.3413, | |
| "eval_samples_per_second": 20.373, | |
| "eval_steps_per_second": 1.273, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.08814109116792679, | |
| "learning_rate": 4.974808275501392e-05, | |
| "loss": 0.123, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_cos_sim": 0.8765753507614136, | |
| "eval_loss": 0.12438686539875934, | |
| "eval_runtime": 191.2687, | |
| "eval_samples_per_second": 20.913, | |
| "eval_steps_per_second": 1.307, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "grad_norm": 0.08511923253536224, | |
| "learning_rate": 4.2885732211184324e-05, | |
| "loss": 0.1246, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.051, | |
| "eval_cos_sim": 0.8767162561416626, | |
| "eval_loss": 0.12425224568592975, | |
| "eval_runtime": 173.2088, | |
| "eval_samples_per_second": 23.094, | |
| "eval_steps_per_second": 1.443, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "grad_norm": 0.0837215781211853, | |
| "learning_rate": 2.9125108865470048e-05, | |
| "loss": 0.1221, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.052, | |
| "eval_cos_sim": 0.876861572265625, | |
| "eval_loss": 0.1241044213985152, | |
| "eval_runtime": 174.8239, | |
| "eval_samples_per_second": 22.88, | |
| "eval_steps_per_second": 1.43, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "grad_norm": 0.09207245707511902, | |
| "learning_rate": 1.3773489637927061e-05, | |
| "loss": 0.1229, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.053, | |
| "eval_cos_sim": 0.8767414093017578, | |
| "eval_loss": 0.12421691825138996, | |
| "eval_runtime": 173.8268, | |
| "eval_samples_per_second": 23.011, | |
| "eval_steps_per_second": 1.438, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "grad_norm": 0.0655718669295311, | |
| "learning_rate": 2.7517759561205253e-06, | |
| "loss": 0.1221, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.054, | |
| "eval_cos_sim": 0.8767919540405273, | |
| "eval_loss": 0.1241676082824416, | |
| "eval_runtime": 179.6327, | |
| "eval_samples_per_second": 22.268, | |
| "eval_steps_per_second": 1.392, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "grad_norm": 0.21964910626411438, | |
| "learning_rate": 4.968911380413809e-05, | |
| "loss": 0.1243, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.055, | |
| "eval_cos_sim": 0.8768623471260071, | |
| "eval_loss": 0.12409912397610615, | |
| "eval_runtime": 172.7843, | |
| "eval_samples_per_second": 23.15, | |
| "eval_steps_per_second": 1.447, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "grad_norm": 0.08817338943481445, | |
| "learning_rate": 4.260776314131676e-05, | |
| "loss": 0.1222, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.056, | |
| "eval_cos_sim": 0.8767062425613403, | |
| "eval_loss": 0.12425821544873188, | |
| "eval_runtime": 172.6396, | |
| "eval_samples_per_second": 23.17, | |
| "eval_steps_per_second": 1.448, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "grad_norm": 0.06475117802619934, | |
| "learning_rate": 2.873534839760646e-05, | |
| "loss": 0.1232, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.057, | |
| "eval_cos_sim": 0.8768667578697205, | |
| "eval_loss": 0.12410461117970416, | |
| "eval_runtime": 172.7054, | |
| "eval_samples_per_second": 23.161, | |
| "eval_steps_per_second": 1.448, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "grad_norm": 0.07474437355995178, | |
| "learning_rate": 1.342226284699138e-05, | |
| "loss": 0.1227, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.058, | |
| "eval_cos_sim": 0.8771414160728455, | |
| "eval_loss": 0.12382852866398761, | |
| "eval_runtime": 175.1422, | |
| "eval_samples_per_second": 22.839, | |
| "eval_steps_per_second": 1.427, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "grad_norm": 0.07362603396177292, | |
| "learning_rate": 2.5745460253134484e-06, | |
| "loss": 0.1234, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.059, | |
| "eval_cos_sim": 0.8771759271621704, | |
| "eval_loss": 0.12379106380688618, | |
| "eval_runtime": 174.7169, | |
| "eval_samples_per_second": 22.894, | |
| "eval_steps_per_second": 1.431, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 0.07593993842601776, | |
| "learning_rate": 4.962399180850275e-05, | |
| "loss": 0.1232, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_cos_sim": 0.877038300037384, | |
| "eval_loss": 0.12392904116856525, | |
| "eval_runtime": 172.4786, | |
| "eval_samples_per_second": 23.191, | |
| "eval_steps_per_second": 1.449, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "grad_norm": 0.07887241989374161, | |
| "learning_rate": 4.2325405847733254e-05, | |
| "loss": 0.1235, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.061, | |
| "eval_cos_sim": 0.8767529726028442, | |
| "eval_loss": 0.12422390153157184, | |
| "eval_runtime": 173.6696, | |
| "eval_samples_per_second": 23.032, | |
| "eval_steps_per_second": 1.44, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "grad_norm": 0.17296281456947327, | |
| "learning_rate": 2.834465700261192e-05, | |
| "loss": 0.1204, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.062, | |
| "eval_cos_sim": 0.8772019743919373, | |
| "eval_loss": 0.12377139737355183, | |
| "eval_runtime": 179.9864, | |
| "eval_samples_per_second": 22.224, | |
| "eval_steps_per_second": 1.389, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "grad_norm": 0.06920995563268661, | |
| "learning_rate": 1.3073921470877709e-05, | |
| "loss": 0.1245, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.063, | |
| "eval_cos_sim": 0.8773365616798401, | |
| "eval_loss": 0.12363236000287008, | |
| "eval_runtime": 173.1204, | |
| "eval_samples_per_second": 23.105, | |
| "eval_steps_per_second": 1.444, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "grad_norm": 0.08347232639789581, | |
| "learning_rate": 2.4029049877794472e-06, | |
| "loss": 0.1217, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.064, | |
| "eval_cos_sim": 0.8773410320281982, | |
| "eval_loss": 0.12362796523320149, | |
| "eval_runtime": 172.0713, | |
| "eval_samples_per_second": 23.246, | |
| "eval_steps_per_second": 1.453, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "grad_norm": 0.07459770888090134, | |
| "learning_rate": 4.955273299787453e-05, | |
| "loss": 0.1223, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.065, | |
| "eval_cos_sim": 0.8773767948150635, | |
| "eval_loss": 0.12359384205090472, | |
| "eval_runtime": 173.2149, | |
| "eval_samples_per_second": 23.093, | |
| "eval_steps_per_second": 1.443, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "grad_norm": 0.0831998735666275, | |
| "learning_rate": 4.203873069979081e-05, | |
| "loss": 0.1231, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.066, | |
| "eval_cos_sim": 0.8774532675743103, | |
| "eval_loss": 0.12351777221905659, | |
| "eval_runtime": 171.902, | |
| "eval_samples_per_second": 23.269, | |
| "eval_steps_per_second": 1.454, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "grad_norm": 0.07724840193986893, | |
| "learning_rate": 2.7953132048972646e-05, | |
| "loss": 0.122, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.067, | |
| "eval_cos_sim": 0.877151608467102, | |
| "eval_loss": 0.12382214214550921, | |
| "eval_runtime": 173.6766, | |
| "eval_samples_per_second": 23.031, | |
| "eval_steps_per_second": 1.439, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "grad_norm": 0.0648268312215805, | |
| "learning_rate": 1.2728552323560239e-05, | |
| "loss": 0.1227, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.068, | |
| "eval_cos_sim": 0.8769506216049194, | |
| "eval_loss": 0.12402295615422199, | |
| "eval_runtime": 171.7424, | |
| "eval_samples_per_second": 23.291, | |
| "eval_steps_per_second": 1.456, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "grad_norm": 0.08475865423679352, | |
| "learning_rate": 2.2368956200634283e-06, | |
| "loss": 0.1274, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.069, | |
| "eval_cos_sim": 0.8771329522132874, | |
| "eval_loss": 0.12383969738232563, | |
| "eval_runtime": 174.2776, | |
| "eval_samples_per_second": 22.952, | |
| "eval_steps_per_second": 1.434, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 0.06382860988378525, | |
| "learning_rate": 4.947535513144286e-05, | |
| "loss": 0.122, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "eval_cos_sim": 0.8775114417076111, | |
| "eval_loss": 0.12346241619336079, | |
| "eval_runtime": 185.1334, | |
| "eval_samples_per_second": 21.606, | |
| "eval_steps_per_second": 1.35, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "grad_norm": 0.07273228466510773, | |
| "learning_rate": 4.174780914294635e-05, | |
| "loss": 0.1228, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.071, | |
| "eval_cos_sim": 0.8777372241020203, | |
| "eval_loss": 0.12323929693448019, | |
| "eval_runtime": 170.2151, | |
| "eval_samples_per_second": 23.5, | |
| "eval_steps_per_second": 1.469, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "grad_norm": 0.08377543836832047, | |
| "learning_rate": 2.756087111291529e-05, | |
| "loss": 0.1209, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.072, | |
| "eval_cos_sim": 0.8776744604110718, | |
| "eval_loss": 0.12329552843319844, | |
| "eval_runtime": 173.1907, | |
| "eval_samples_per_second": 23.096, | |
| "eval_steps_per_second": 1.443, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "grad_norm": 0.08579932153224945, | |
| "learning_rate": 1.2386241478270527e-05, | |
| "loss": 0.1234, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.073, | |
| "eval_cos_sim": 0.8776343464851379, | |
| "eval_loss": 0.12333650018917988, | |
| "eval_runtime": 172.2784, | |
| "eval_samples_per_second": 23.218, | |
| "eval_steps_per_second": 1.451, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "grad_norm": 0.07494545727968216, | |
| "learning_rate": 2.0765592951802664e-06, | |
| "loss": 0.1209, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.074, | |
| "eval_cos_sim": 0.8777279853820801, | |
| "eval_loss": 0.12324421884762715, | |
| "eval_runtime": 172.9417, | |
| "eval_samples_per_second": 23.129, | |
| "eval_steps_per_second": 1.446, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.07511463761329651, | |
| "learning_rate": 4.9391877493394335e-05, | |
| "loss": 0.1222, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "eval_cos_sim": 0.8777404427528381, | |
| "eval_loss": 0.12323040797459553, | |
| "eval_runtime": 173.813, | |
| "eval_samples_per_second": 23.013, | |
| "eval_steps_per_second": 1.438, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "grad_norm": 0.08240217715501785, | |
| "learning_rate": 4.1452713680951016e-05, | |
| "loss": 0.1237, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.076, | |
| "eval_cos_sim": 0.8776569366455078, | |
| "eval_loss": 0.1233164258216567, | |
| "eval_runtime": 173.6453, | |
| "eval_samples_per_second": 23.035, | |
| "eval_steps_per_second": 1.44, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "grad_norm": 0.07817904651165009, | |
| "learning_rate": 2.716797195408887e-05, | |
| "loss": 0.1215, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.077, | |
| "eval_cos_sim": 0.8779506683349609, | |
| "eval_loss": 0.12303087331997822, | |
| "eval_runtime": 198.4978, | |
| "eval_samples_per_second": 20.151, | |
| "eval_steps_per_second": 1.259, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "grad_norm": 0.06472489982843399, | |
| "learning_rate": 1.2047074246048157e-05, | |
| "loss": 0.1222, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.078, | |
| "eval_cos_sim": 0.8780341148376465, | |
| "eval_loss": 0.12294723345982503, | |
| "eval_runtime": 187.0246, | |
| "eval_samples_per_second": 21.388, | |
| "eval_steps_per_second": 1.337, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "grad_norm": 0.06511878967285156, | |
| "learning_rate": 1.921935972303521e-06, | |
| "loss": 0.1211, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.079, | |
| "eval_cos_sim": 0.8780234456062317, | |
| "eval_loss": 0.1229577579711623, | |
| "eval_runtime": 170.8199, | |
| "eval_samples_per_second": 23.416, | |
| "eval_steps_per_second": 1.464, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 0.08275925368070602, | |
| "learning_rate": 4.9302320888106454e-05, | |
| "loss": 0.1234, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_cos_sim": 0.8778801560401917, | |
| "eval_loss": 0.1230986237739272, | |
| "eval_runtime": 175.6448, | |
| "eval_samples_per_second": 22.773, | |
| "eval_steps_per_second": 1.423, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "grad_norm": 0.06466321647167206, | |
| "learning_rate": 4.115351785778022e-05, | |
| "loss": 0.1215, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.081, | |
| "eval_cos_sim": 0.877547025680542, | |
| "eval_loss": 0.12342484547841023, | |
| "eval_runtime": 173.845, | |
| "eval_samples_per_second": 23.009, | |
| "eval_steps_per_second": 1.438, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "grad_norm": 0.060175709426403046, | |
| "learning_rate": 2.6774532491200373e-05, | |
| "loss": 0.1237, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.082, | |
| "eval_cos_sim": 0.8778981566429138, | |
| "eval_loss": 0.1230772545551009, | |
| "eval_runtime": 174.1784, | |
| "eval_samples_per_second": 22.965, | |
| "eval_steps_per_second": 1.435, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "grad_norm": 0.06948266923427582, | |
| "learning_rate": 1.1711135154477437e-05, | |
| "loss": 0.1213, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.083, | |
| "eval_cos_sim": 0.8779332041740417, | |
| "eval_loss": 0.12304716589199971, | |
| "eval_runtime": 171.7677, | |
| "eval_samples_per_second": 23.287, | |
| "eval_steps_per_second": 1.455, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "grad_norm": 0.0633857399225235, | |
| "learning_rate": 1.7730641868067276e-06, | |
| "loss": 0.1212, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.084, | |
| "eval_cos_sim": 0.8779239058494568, | |
| "eval_loss": 0.12305730154263447, | |
| "eval_runtime": 172.6941, | |
| "eval_samples_per_second": 23.162, | |
| "eval_steps_per_second": 1.448, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "grad_norm": 0.07013432681560516, | |
| "learning_rate": 4.9206707634962714e-05, | |
| "loss": 0.1219, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.085, | |
| "eval_cos_sim": 0.8781536221504211, | |
| "eval_loss": 0.12283129765736531, | |
| "eval_runtime": 178.3382, | |
| "eval_samples_per_second": 22.429, | |
| "eval_steps_per_second": 1.402, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "grad_norm": 0.0714387595653534, | |
| "learning_rate": 4.085029623930606e-05, | |
| "loss": 0.1214, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.086, | |
| "eval_cos_sim": 0.8783000111579895, | |
| "eval_loss": 0.12268445636975239, | |
| "eval_runtime": 180.4291, | |
| "eval_samples_per_second": 22.169, | |
| "eval_steps_per_second": 1.386, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "grad_norm": 0.07285313308238983, | |
| "learning_rate": 2.638065077761282e-05, | |
| "loss": 0.1211, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.087, | |
| "eval_cos_sim": 0.8782742619514465, | |
| "eval_loss": 0.12271090867268514, | |
| "eval_runtime": 174.6757, | |
| "eval_samples_per_second": 22.9, | |
| "eval_steps_per_second": 1.431, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "grad_norm": 0.1114286258816719, | |
| "learning_rate": 1.1378507926623341e-05, | |
| "loss": 0.1203, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.088, | |
| "eval_cos_sim": 0.8782421946525574, | |
| "eval_loss": 0.12274044944989156, | |
| "eval_runtime": 173.5126, | |
| "eval_samples_per_second": 23.053, | |
| "eval_steps_per_second": 1.441, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "grad_norm": 0.07392691820859909, | |
| "learning_rate": 1.6299810406600836e-06, | |
| "loss": 0.1222, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.089, | |
| "eval_cos_sim": 0.8782600164413452, | |
| "eval_loss": 0.12272232272374105, | |
| "eval_runtime": 173.9745, | |
| "eval_samples_per_second": 22.992, | |
| "eval_steps_per_second": 1.437, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 0.1509944051504135, | |
| "learning_rate": 4.9105061562790325e-05, | |
| "loss": 0.1211, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "eval_cos_sim": 0.8785330653190613, | |
| "eval_loss": 0.12244940116154622, | |
| "eval_runtime": 174.6529, | |
| "eval_samples_per_second": 22.903, | |
| "eval_steps_per_second": 1.431, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "grad_norm": 0.07572964578866959, | |
| "learning_rate": 4.0543124394712475e-05, | |
| "loss": 0.1234, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.091, | |
| "eval_cos_sim": 0.8782286643981934, | |
| "eval_loss": 0.1227607171748824, | |
| "eval_runtime": 174.4786, | |
| "eval_samples_per_second": 22.925, | |
| "eval_steps_per_second": 1.433, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "grad_norm": 0.07199128717184067, | |
| "learning_rate": 2.5986424976906166e-05, | |
| "loss": 0.1202, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.092, | |
| "eval_cos_sim": 0.8780964612960815, | |
| "eval_loss": 0.12288942649113606, | |
| "eval_runtime": 175.9134, | |
| "eval_samples_per_second": 22.738, | |
| "eval_steps_per_second": 1.421, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "grad_norm": 0.07497607171535492, | |
| "learning_rate": 1.1049275460163872e-05, | |
| "loss": 0.123, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.093, | |
| "eval_cos_sim": 0.8781337141990662, | |
| "eval_loss": 0.12284465791928242, | |
| "eval_runtime": 174.1009, | |
| "eval_samples_per_second": 22.975, | |
| "eval_steps_per_second": 1.436, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "grad_norm": 0.056581463664770126, | |
| "learning_rate": 1.4927221931830576e-06, | |
| "loss": 0.1218, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.094, | |
| "eval_cos_sim": 0.8781940340995789, | |
| "eval_loss": 0.12278383018719624, | |
| "eval_runtime": 180.3511, | |
| "eval_samples_per_second": 22.179, | |
| "eval_steps_per_second": 1.386, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "grad_norm": 0.06227719038724899, | |
| "learning_rate": 4.8997408003921384e-05, | |
| "loss": 0.1216, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.095, | |
| "eval_cos_sim": 0.8782709836959839, | |
| "eval_loss": 0.12271020819889973, | |
| "eval_runtime": 174.3195, | |
| "eval_samples_per_second": 22.946, | |
| "eval_steps_per_second": 1.434, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "grad_norm": 0.07964574545621872, | |
| "learning_rate": 4.02320788776628e-05, | |
| "loss": 0.1205, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.096, | |
| "eval_cos_sim": 0.8782918453216553, | |
| "eval_loss": 0.12269965698468159, | |
| "eval_runtime": 171.8922, | |
| "eval_samples_per_second": 23.27, | |
| "eval_steps_per_second": 1.454, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "grad_norm": 0.059999242424964905, | |
| "learning_rate": 2.559195333841573e-05, | |
| "loss": 0.1224, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.097, | |
| "eval_cos_sim": 0.8782675862312317, | |
| "eval_loss": 0.12272447182881306, | |
| "eval_runtime": 178.4336, | |
| "eval_samples_per_second": 22.417, | |
| "eval_steps_per_second": 1.401, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "grad_norm": 0.07078584283590317, | |
| "learning_rate": 1.0723519806732741e-05, | |
| "loss": 0.1226, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.098, | |
| "eval_cos_sim": 0.8782561421394348, | |
| "eval_loss": 0.12273399831997822, | |
| "eval_runtime": 172.0171, | |
| "eval_samples_per_second": 23.254, | |
| "eval_steps_per_second": 1.453, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "grad_norm": 0.0700722336769104, | |
| "learning_rate": 1.3613218521583647e-06, | |
| "loss": 0.1189, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.099, | |
| "eval_cos_sim": 0.8782747387886047, | |
| "eval_loss": 0.1227147035812087, | |
| "eval_runtime": 174.8389, | |
| "eval_samples_per_second": 22.878, | |
| "eval_steps_per_second": 1.43, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.06270556151866913, | |
| "learning_rate": 4.888377378787991e-05, | |
| "loss": 0.1209, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_cos_sim": 0.8783043622970581, | |
| "eval_loss": 0.12268760301815938, | |
| "eval_runtime": 171.6574, | |
| "eval_samples_per_second": 23.302, | |
| "eval_steps_per_second": 1.456, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "grad_norm": 0.059303585439920425, | |
| "learning_rate": 3.9917237207221514e-05, | |
| "loss": 0.1206, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.101, | |
| "eval_cos_sim": 0.8785374760627747, | |
| "eval_loss": 0.12245997311818074, | |
| "eval_runtime": 173.2279, | |
| "eval_samples_per_second": 23.091, | |
| "eval_steps_per_second": 1.443, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "grad_norm": 0.06463504582643509, | |
| "learning_rate": 2.519733417274297e-05, | |
| "loss": 0.122, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.102, | |
| "eval_cos_sim": 0.8785625100135803, | |
| "eval_loss": 0.12243694259869527, | |
| "eval_runtime": 179.8429, | |
| "eval_samples_per_second": 22.242, | |
| "eval_steps_per_second": 1.39, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "grad_norm": 0.06594408303499222, | |
| "learning_rate": 1.0401322151467458e-05, | |
| "loss": 0.1226, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.103, | |
| "eval_cos_sim": 0.8784922361373901, | |
| "eval_loss": 0.1225029034827895, | |
| "eval_runtime": 171.8585, | |
| "eval_samples_per_second": 23.275, | |
| "eval_steps_per_second": 1.455, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "grad_norm": 0.061140164732933044, | |
| "learning_rate": 1.2358127653053858e-06, | |
| "loss": 0.122, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.104, | |
| "eval_cos_sim": 0.8785346746444702, | |
| "eval_loss": 0.12245874931561421, | |
| "eval_runtime": 170.3116, | |
| "eval_samples_per_second": 23.486, | |
| "eval_steps_per_second": 1.468, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "grad_norm": 0.06770511716604233, | |
| "learning_rate": 4.876418723469453e-05, | |
| "loss": 0.1196, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.105, | |
| "eval_cos_sim": 0.878551721572876, | |
| "eval_loss": 0.12243552591549825, | |
| "eval_runtime": 173.9331, | |
| "eval_samples_per_second": 22.997, | |
| "eval_steps_per_second": 1.437, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "grad_norm": 0.06050929054617882, | |
| "learning_rate": 3.959867784853255e-05, | |
| "loss": 0.1219, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.106, | |
| "eval_cos_sim": 0.8784484267234802, | |
| "eval_loss": 0.12253486802327107, | |
| "eval_runtime": 175.2374, | |
| "eval_samples_per_second": 22.826, | |
| "eval_steps_per_second": 1.427, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "grad_norm": 0.07329047471284866, | |
| "learning_rate": 2.4802665827257035e-05, | |
| "loss": 0.1214, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.107, | |
| "eval_cos_sim": 0.8785268068313599, | |
| "eval_loss": 0.12246101453053426, | |
| "eval_runtime": 172.381, | |
| "eval_samples_per_second": 23.204, | |
| "eval_steps_per_second": 1.45, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "grad_norm": 0.061687979847192764, | |
| "learning_rate": 1.0082762792778497e-05, | |
| "loss": 0.1206, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.108, | |
| "eval_cos_sim": 0.8787024617195129, | |
| "eval_loss": 0.12228504302250813, | |
| "eval_runtime": 171.0068, | |
| "eval_samples_per_second": 23.391, | |
| "eval_steps_per_second": 1.462, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "grad_norm": 0.06697102636098862, | |
| "learning_rate": 1.1162262121200917e-06, | |
| "loss": 0.1216, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.109, | |
| "eval_cos_sim": 0.8787557482719421, | |
| "eval_loss": 0.12223189308392476, | |
| "eval_runtime": 172.5647, | |
| "eval_samples_per_second": 23.18, | |
| "eval_steps_per_second": 1.449, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 0.06245901808142662, | |
| "learning_rate": 4.8638678147841726e-05, | |
| "loss": 0.1224, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "eval_cos_sim": 0.878864049911499, | |
| "eval_loss": 0.12212434603917073, | |
| "eval_runtime": 177.5612, | |
| "eval_samples_per_second": 22.527, | |
| "eval_steps_per_second": 1.408, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "grad_norm": 0.07445187121629715, | |
| "learning_rate": 3.9276480193267495e-05, | |
| "loss": 0.1226, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.111, | |
| "eval_cos_sim": 0.8787615895271301, | |
| "eval_loss": 0.12223191478001545, | |
| "eval_runtime": 170.2386, | |
| "eval_samples_per_second": 23.496, | |
| "eval_steps_per_second": 1.469, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "grad_norm": 0.06328488141298294, | |
| "learning_rate": 2.4408046661584553e-05, | |
| "loss": 0.1205, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.112, | |
| "eval_cos_sim": 0.8786949515342712, | |
| "eval_loss": 0.12229911091076802, | |
| "eval_runtime": 173.6977, | |
| "eval_samples_per_second": 23.029, | |
| "eval_steps_per_second": 1.439, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "grad_norm": 0.1140422523021698, | |
| "learning_rate": 9.767921122337203e-06, | |
| "loss": 0.1213, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.113, | |
| "eval_cos_sim": 0.8787314295768738, | |
| "eval_loss": 0.12225894191014242, | |
| "eval_runtime": 176.5254, | |
| "eval_samples_per_second": 22.66, | |
| "eval_steps_per_second": 1.416, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "grad_norm": 0.07940120995044708, | |
| "learning_rate": 1.0025919960786169e-06, | |
| "loss": 0.1216, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.114, | |
| "eval_cos_sim": 0.878764271736145, | |
| "eval_loss": 0.12222567083584737, | |
| "eval_runtime": 173.6241, | |
| "eval_samples_per_second": 23.038, | |
| "eval_steps_per_second": 1.44, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "grad_norm": 0.06326926499605179, | |
| "learning_rate": 4.850727780681685e-05, | |
| "loss": 0.121, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.115, | |
| "eval_cos_sim": 0.8787913918495178, | |
| "eval_loss": 0.1222020423625655, | |
| "eval_runtime": 197.6043, | |
| "eval_samples_per_second": 20.242, | |
| "eval_steps_per_second": 1.265, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "grad_norm": 0.06304363161325455, | |
| "learning_rate": 3.89507245398359e-05, | |
| "loss": 0.1212, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.116, | |
| "eval_cos_sim": 0.8788431286811829, | |
| "eval_loss": 0.1221448552821822, | |
| "eval_runtime": 180.7769, | |
| "eval_samples_per_second": 22.127, | |
| "eval_steps_per_second": 1.383, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "grad_norm": 0.06048878654837608, | |
| "learning_rate": 2.4013575023093562e-05, | |
| "loss": 0.121, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.117, | |
| "eval_cos_sim": 0.8789100050926208, | |
| "eval_loss": 0.12207724287259053, | |
| "eval_runtime": 175.5012, | |
| "eval_samples_per_second": 22.792, | |
| "eval_steps_per_second": 1.424, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "grad_norm": 0.060076240450143814, | |
| "learning_rate": 9.456875605287529e-06, | |
| "loss": 0.1208, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.118, | |
| "eval_cos_sim": 0.8789265751838684, | |
| "eval_loss": 0.12206284239041279, | |
| "eval_runtime": 179.6264, | |
| "eval_samples_per_second": 22.268, | |
| "eval_steps_per_second": 1.392, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "grad_norm": 0.06535797566175461, | |
| "learning_rate": 8.949384372096747e-07, | |
| "loss": 0.1224, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.119, | |
| "eval_cos_sim": 0.8789151310920715, | |
| "eval_loss": 0.12207536175000142, | |
| "eval_runtime": 173.573, | |
| "eval_samples_per_second": 23.045, | |
| "eval_steps_per_second": 1.44, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 0.051111843436956406, | |
| "learning_rate": 4.8370018959339916e-05, | |
| "loss": 0.1216, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_cos_sim": 0.878704845905304, | |
| "eval_loss": 0.1222877917503066, | |
| "eval_runtime": 170.7747, | |
| "eval_samples_per_second": 23.423, | |
| "eval_steps_per_second": 1.464, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.121, | |
| "grad_norm": 0.07394807785749435, | |
| "learning_rate": 3.862149207337666e-05, | |
| "loss": 0.1227, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.121, | |
| "eval_cos_sim": 0.8786987662315369, | |
| "eval_loss": 0.12228692223774862, | |
| "eval_runtime": 172.7735, | |
| "eval_samples_per_second": 23.152, | |
| "eval_steps_per_second": 1.447, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "grad_norm": 0.06019896641373634, | |
| "learning_rate": 2.3619349222387182e-05, | |
| "loss": 0.1194, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.122, | |
| "eval_cos_sim": 0.8791972398757935, | |
| "eval_loss": 0.12178870942341757, | |
| "eval_runtime": 171.5715, | |
| "eval_samples_per_second": 23.314, | |
| "eval_steps_per_second": 1.457, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.123, | |
| "grad_norm": 0.05350535735487938, | |
| "learning_rate": 9.149703760694162e-06, | |
| "loss": 0.1214, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.123, | |
| "eval_cos_sim": 0.8792542219161987, | |
| "eval_loss": 0.12173621847378684, | |
| "eval_runtime": 173.1804, | |
| "eval_samples_per_second": 23.097, | |
| "eval_steps_per_second": 1.444, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "grad_norm": 0.06338366866111755, | |
| "learning_rate": 7.932923650373624e-07, | |
| "loss": 0.1194, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.124, | |
| "eval_cos_sim": 0.8792427182197571, | |
| "eval_loss": 0.12174849869954062, | |
| "eval_runtime": 172.0716, | |
| "eval_samples_per_second": 23.246, | |
| "eval_steps_per_second": 1.453, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.052142199128866196, | |
| "learning_rate": 4.822693581319333e-05, | |
| "loss": 0.12, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "eval_cos_sim": 0.8787649869918823, | |
| "eval_loss": 0.1222243664478011, | |
| "eval_runtime": 172.6696, | |
| "eval_samples_per_second": 23.166, | |
| "eval_steps_per_second": 1.448, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "grad_norm": 0.0695052519440651, | |
| "learning_rate": 3.828886484552269e-05, | |
| "loss": 0.1213, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.126, | |
| "eval_cos_sim": 0.8785125017166138, | |
| "eval_loss": 0.12247128774868916, | |
| "eval_runtime": 182.4937, | |
| "eval_samples_per_second": 21.919, | |
| "eval_steps_per_second": 1.37, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.127, | |
| "grad_norm": 0.07181504368782043, | |
| "learning_rate": 2.3225467508799494e-05, | |
| "loss": 0.1216, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.127, | |
| "eval_cos_sim": 0.8791427612304688, | |
| "eval_loss": 0.12184033658253621, | |
| "eval_runtime": 172.8353, | |
| "eval_samples_per_second": 23.143, | |
| "eval_steps_per_second": 1.446, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "grad_norm": 0.06035405769944191, | |
| "learning_rate": 8.846482142219678e-06, | |
| "loss": 0.12, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.128, | |
| "eval_cos_sim": 0.8793256282806396, | |
| "eval_loss": 0.121661689779634, | |
| "eval_runtime": 173.4166, | |
| "eval_samples_per_second": 23.066, | |
| "eval_steps_per_second": 1.442, | |
| "step": 1280 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 110, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |