{ "best_metric": 0.15673477947711945, "best_model_checkpoint": "experiments/SFT-all-MiniLM-L12-v2-WordNetNoun-MixedHop-RandomNegatives/checkpoint-2500", "epoch": 3.0, "eval_steps": 500, "global_step": 8802, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03408316291751875, "grad_norm": 0.3132801949977875, "learning_rate": 9.886389456941606e-06, "loss": 0.4223, "step": 100 }, { "epoch": 0.0681663258350375, "grad_norm": 1.3860148191452026, "learning_rate": 9.77277891388321e-06, "loss": 0.2764, "step": 200 }, { "epoch": 0.10224948875255624, "grad_norm": 1.7058498859405518, "learning_rate": 9.659168370824814e-06, "loss": 0.1906, "step": 300 }, { "epoch": 0.136332651670075, "grad_norm": 1.2867127656936646, "learning_rate": 9.545557827766417e-06, "loss": 0.159, "step": 400 }, { "epoch": 0.17041581458759372, "grad_norm": 1.7491652965545654, "learning_rate": 9.431947284708022e-06, "loss": 0.1487, "step": 500 }, { "epoch": 0.17041581458759372, "eval_loss": 0.16311119496822357, "eval_runtime": 135.6265, "eval_samples_per_second": 2690.662, "eval_steps_per_second": 5.257, "step": 500 }, { "epoch": 0.20449897750511248, "grad_norm": 2.1244304180145264, "learning_rate": 9.318336741649626e-06, "loss": 0.1361, "step": 600 }, { "epoch": 0.23858214042263123, "grad_norm": 1.6412886381149292, "learning_rate": 9.20472619859123e-06, "loss": 0.139, "step": 700 }, { "epoch": 0.27266530334015, "grad_norm": 1.912682056427002, "learning_rate": 9.091115655532834e-06, "loss": 0.1371, "step": 800 }, { "epoch": 0.3067484662576687, "grad_norm": 1.6108455657958984, "learning_rate": 8.97750511247444e-06, "loss": 0.1323, "step": 900 }, { "epoch": 0.34083162917518744, "grad_norm": 1.5050699710845947, "learning_rate": 8.863894569416042e-06, "loss": 0.1286, "step": 1000 }, { "epoch": 0.34083162917518744, "eval_loss": 0.17418159544467926, "eval_runtime": 137.6524, "eval_samples_per_second": 2651.061, "eval_steps_per_second": 5.18, "step": 1000 }, { "epoch": 0.37491479209270623, "grad_norm": 1.8735222816467285, "learning_rate": 8.750284026357647e-06, "loss": 0.1291, "step": 1100 }, { "epoch": 0.40899795501022496, "grad_norm": 1.3617902994155884, "learning_rate": 8.63667348329925e-06, "loss": 0.1274, "step": 1200 }, { "epoch": 0.4430811179277437, "grad_norm": 1.449449896812439, "learning_rate": 8.523062940240855e-06, "loss": 0.121, "step": 1300 }, { "epoch": 0.47716428084526247, "grad_norm": 1.5432512760162354, "learning_rate": 8.40945239718246e-06, "loss": 0.1252, "step": 1400 }, { "epoch": 0.5112474437627812, "grad_norm": 1.3983290195465088, "learning_rate": 8.295841854124063e-06, "loss": 0.1211, "step": 1500 }, { "epoch": 0.5112474437627812, "eval_loss": 0.16112419962882996, "eval_runtime": 133.378, "eval_samples_per_second": 2736.022, "eval_steps_per_second": 5.346, "step": 1500 }, { "epoch": 0.5453306066803, "grad_norm": 1.8421082496643066, "learning_rate": 8.182231311065668e-06, "loss": 0.1192, "step": 1600 }, { "epoch": 0.5794137695978187, "grad_norm": 1.6170974969863892, "learning_rate": 8.068620768007271e-06, "loss": 0.116, "step": 1700 }, { "epoch": 0.6134969325153374, "grad_norm": 1.4847702980041504, "learning_rate": 7.955010224948876e-06, "loss": 0.1184, "step": 1800 }, { "epoch": 0.6475800954328562, "grad_norm": 2.074660301208496, "learning_rate": 7.84139968189048e-06, "loss": 0.1153, "step": 1900 }, { "epoch": 0.6816632583503749, "grad_norm": 1.3638675212860107, "learning_rate": 7.727789138832085e-06, "loss": 0.1151, "step": 2000 }, { "epoch": 0.6816632583503749, "eval_loss": 0.16037538647651672, "eval_runtime": 135.7872, "eval_samples_per_second": 2687.476, "eval_steps_per_second": 5.251, "step": 2000 }, { "epoch": 0.7157464212678937, "grad_norm": 2.7780377864837646, "learning_rate": 7.614178595773688e-06, "loss": 0.1136, "step": 2100 }, { "epoch": 0.7498295841854125, "grad_norm": 1.0972269773483276, "learning_rate": 7.500568052715293e-06, "loss": 0.112, "step": 2200 }, { "epoch": 0.7839127471029311, "grad_norm": 1.5648587942123413, "learning_rate": 7.386957509656897e-06, "loss": 0.1152, "step": 2300 }, { "epoch": 0.8179959100204499, "grad_norm": 1.1397899389266968, "learning_rate": 7.273346966598501e-06, "loss": 0.1122, "step": 2400 }, { "epoch": 0.8520790729379687, "grad_norm": 1.6574677228927612, "learning_rate": 7.1597364235401045e-06, "loss": 0.1116, "step": 2500 }, { "epoch": 0.8520790729379687, "eval_loss": 0.15673477947711945, "eval_runtime": 140.649, "eval_samples_per_second": 2594.58, "eval_steps_per_second": 5.069, "step": 2500 }, { "epoch": 0.8861622358554874, "grad_norm": 1.6420308351516724, "learning_rate": 7.046125880481709e-06, "loss": 0.1158, "step": 2600 }, { "epoch": 0.9202453987730062, "grad_norm": 1.954850673675537, "learning_rate": 6.932515337423313e-06, "loss": 0.1139, "step": 2700 }, { "epoch": 0.9543285616905249, "grad_norm": 1.0990276336669922, "learning_rate": 6.818904794364918e-06, "loss": 0.1096, "step": 2800 }, { "epoch": 0.9884117246080436, "grad_norm": 1.3278498649597168, "learning_rate": 6.705294251306522e-06, "loss": 0.1107, "step": 2900 }, { "epoch": 1.0224948875255624, "grad_norm": 1.6887524127960205, "learning_rate": 6.591683708248125e-06, "loss": 0.1016, "step": 3000 }, { "epoch": 1.0224948875255624, "eval_loss": 0.1628647744655609, "eval_runtime": 139.478, "eval_samples_per_second": 2616.362, "eval_steps_per_second": 5.112, "step": 3000 }, { "epoch": 1.056578050443081, "grad_norm": 1.6014968156814575, "learning_rate": 6.47807316518973e-06, "loss": 0.1081, "step": 3100 }, { "epoch": 1.0906612133606, "grad_norm": 1.05489182472229, "learning_rate": 6.364462622131334e-06, "loss": 0.1027, "step": 3200 }, { "epoch": 1.1247443762781186, "grad_norm": 1.5599446296691895, "learning_rate": 6.250852079072939e-06, "loss": 0.1013, "step": 3300 }, { "epoch": 1.1588275391956373, "grad_norm": 1.6163196563720703, "learning_rate": 6.137241536014543e-06, "loss": 0.1, "step": 3400 }, { "epoch": 1.1929107021131562, "grad_norm": 1.601138710975647, "learning_rate": 6.023630992956147e-06, "loss": 0.1043, "step": 3500 }, { "epoch": 1.1929107021131562, "eval_loss": 0.1665213406085968, "eval_runtime": 141.7559, "eval_samples_per_second": 2574.319, "eval_steps_per_second": 5.03, "step": 3500 }, { "epoch": 1.2269938650306749, "grad_norm": 1.882118582725525, "learning_rate": 5.910020449897751e-06, "loss": 0.0932, "step": 3600 }, { "epoch": 1.2610770279481935, "grad_norm": 1.9329348802566528, "learning_rate": 5.796409906839356e-06, "loss": 0.0998, "step": 3700 }, { "epoch": 1.2951601908657122, "grad_norm": 1.7485824823379517, "learning_rate": 5.68279936378096e-06, "loss": 0.1005, "step": 3800 }, { "epoch": 1.329243353783231, "grad_norm": 1.1488587856292725, "learning_rate": 5.569188820722563e-06, "loss": 0.1039, "step": 3900 }, { "epoch": 1.3633265167007498, "grad_norm": 1.6326992511749268, "learning_rate": 5.455578277664168e-06, "loss": 0.1024, "step": 4000 }, { "epoch": 1.3633265167007498, "eval_loss": 0.1776169091463089, "eval_runtime": 136.9413, "eval_samples_per_second": 2664.827, "eval_steps_per_second": 5.207, "step": 4000 }, { "epoch": 1.3974096796182685, "grad_norm": 1.085883617401123, "learning_rate": 5.341967734605772e-06, "loss": 0.103, "step": 4100 }, { "epoch": 1.4314928425357873, "grad_norm": 1.6013509035110474, "learning_rate": 5.2283571915473764e-06, "loss": 0.1003, "step": 4200 }, { "epoch": 1.465576005453306, "grad_norm": 1.25858473777771, "learning_rate": 5.11474664848898e-06, "loss": 0.097, "step": 4300 }, { "epoch": 1.4996591683708247, "grad_norm": 1.6914137601852417, "learning_rate": 5.001136105430584e-06, "loss": 0.097, "step": 4400 }, { "epoch": 1.5337423312883436, "grad_norm": 1.6514638662338257, "learning_rate": 4.887525562372188e-06, "loss": 0.1011, "step": 4500 }, { "epoch": 1.5337423312883436, "eval_loss": 0.16166538000106812, "eval_runtime": 137.2738, "eval_samples_per_second": 2658.374, "eval_steps_per_second": 5.194, "step": 4500 }, { "epoch": 1.5678254942058623, "grad_norm": 1.554187536239624, "learning_rate": 4.773915019313793e-06, "loss": 0.105, "step": 4600 }, { "epoch": 1.601908657123381, "grad_norm": 1.0056004524230957, "learning_rate": 4.660304476255396e-06, "loss": 0.1015, "step": 4700 }, { "epoch": 1.6359918200408998, "grad_norm": 1.3594541549682617, "learning_rate": 4.546693933197001e-06, "loss": 0.1025, "step": 4800 }, { "epoch": 1.6700749829584185, "grad_norm": 1.210564136505127, "learning_rate": 4.433083390138605e-06, "loss": 0.0971, "step": 4900 }, { "epoch": 1.7041581458759372, "grad_norm": 1.5984232425689697, "learning_rate": 4.319472847080209e-06, "loss": 0.0994, "step": 5000 }, { "epoch": 1.7041581458759372, "eval_loss": 0.16965465247631073, "eval_runtime": 136.9881, "eval_samples_per_second": 2663.917, "eval_steps_per_second": 5.205, "step": 5000 }, { "epoch": 1.738241308793456, "grad_norm": 1.4730218648910522, "learning_rate": 4.205862304021814e-06, "loss": 0.1043, "step": 5100 }, { "epoch": 1.7723244717109747, "grad_norm": 1.4520608186721802, "learning_rate": 4.092251760963418e-06, "loss": 0.0964, "step": 5200 }, { "epoch": 1.8064076346284934, "grad_norm": 0.9586948156356812, "learning_rate": 3.978641217905022e-06, "loss": 0.1076, "step": 5300 }, { "epoch": 1.8404907975460123, "grad_norm": 1.1026732921600342, "learning_rate": 3.865030674846626e-06, "loss": 0.0944, "step": 5400 }, { "epoch": 1.874573960463531, "grad_norm": 1.3042621612548828, "learning_rate": 3.7514201317882303e-06, "loss": 0.0991, "step": 5500 }, { "epoch": 1.874573960463531, "eval_loss": 0.17953361570835114, "eval_runtime": 135.448, "eval_samples_per_second": 2694.207, "eval_steps_per_second": 5.264, "step": 5500 }, { "epoch": 1.9086571233810496, "grad_norm": 2.037799835205078, "learning_rate": 3.6378095887298343e-06, "loss": 0.103, "step": 5600 }, { "epoch": 1.9427402862985685, "grad_norm": 0.8908063173294067, "learning_rate": 3.5241990456714387e-06, "loss": 0.1001, "step": 5700 }, { "epoch": 1.9768234492160872, "grad_norm": 1.693176031112671, "learning_rate": 3.4105885026130427e-06, "loss": 0.0999, "step": 5800 }, { "epoch": 2.010906612133606, "grad_norm": 1.9925082921981812, "learning_rate": 3.296977959554647e-06, "loss": 0.0984, "step": 5900 }, { "epoch": 2.044989775051125, "grad_norm": 1.3392109870910645, "learning_rate": 3.1833674164962515e-06, "loss": 0.0947, "step": 6000 }, { "epoch": 2.044989775051125, "eval_loss": 0.170347198843956, "eval_runtime": 137.8512, "eval_samples_per_second": 2647.238, "eval_steps_per_second": 5.172, "step": 6000 }, { "epoch": 2.0790729379686437, "grad_norm": 1.497381329536438, "learning_rate": 3.069756873437855e-06, "loss": 0.0962, "step": 6100 }, { "epoch": 2.113156100886162, "grad_norm": 0.9153881072998047, "learning_rate": 2.95614633037946e-06, "loss": 0.0953, "step": 6200 }, { "epoch": 2.147239263803681, "grad_norm": 1.077412486076355, "learning_rate": 2.8425357873210634e-06, "loss": 0.0918, "step": 6300 }, { "epoch": 2.1813224267212, "grad_norm": 1.3187857866287231, "learning_rate": 2.728925244262668e-06, "loss": 0.0894, "step": 6400 }, { "epoch": 2.2154055896387184, "grad_norm": 1.6318199634552002, "learning_rate": 2.6153147012042718e-06, "loss": 0.0928, "step": 6500 }, { "epoch": 2.2154055896387184, "eval_loss": 0.17857030034065247, "eval_runtime": 134.8345, "eval_samples_per_second": 2706.465, "eval_steps_per_second": 5.288, "step": 6500 }, { "epoch": 2.2494887525562373, "grad_norm": 1.3745598793029785, "learning_rate": 2.501704158145876e-06, "loss": 0.0909, "step": 6600 }, { "epoch": 2.283571915473756, "grad_norm": 1.7317062616348267, "learning_rate": 2.38809361508748e-06, "loss": 0.0978, "step": 6700 }, { "epoch": 2.3176550783912746, "grad_norm": 1.3482944965362549, "learning_rate": 2.2744830720290846e-06, "loss": 0.0945, "step": 6800 }, { "epoch": 2.3517382413087935, "grad_norm": 1.982127070426941, "learning_rate": 2.1608725289706885e-06, "loss": 0.0943, "step": 6900 }, { "epoch": 2.3858214042263124, "grad_norm": 1.0882318019866943, "learning_rate": 2.047261985912293e-06, "loss": 0.0931, "step": 7000 }, { "epoch": 2.3858214042263124, "eval_loss": 0.1807963252067566, "eval_runtime": 136.6363, "eval_samples_per_second": 2670.777, "eval_steps_per_second": 5.218, "step": 7000 }, { "epoch": 2.419904567143831, "grad_norm": 1.5541267395019531, "learning_rate": 1.933651442853897e-06, "loss": 0.0931, "step": 7100 }, { "epoch": 2.4539877300613497, "grad_norm": 1.3314629793167114, "learning_rate": 1.8200408997955013e-06, "loss": 0.0968, "step": 7200 }, { "epoch": 2.4880708929788686, "grad_norm": 1.3885324001312256, "learning_rate": 1.7064303567371055e-06, "loss": 0.0915, "step": 7300 }, { "epoch": 2.522154055896387, "grad_norm": 1.093854546546936, "learning_rate": 1.5928198136787095e-06, "loss": 0.0927, "step": 7400 }, { "epoch": 2.556237218813906, "grad_norm": 1.4065414667129517, "learning_rate": 1.4792092706203137e-06, "loss": 0.0919, "step": 7500 }, { "epoch": 2.556237218813906, "eval_loss": 0.1740484982728958, "eval_runtime": 137.3804, "eval_samples_per_second": 2656.31, "eval_steps_per_second": 5.19, "step": 7500 }, { "epoch": 2.5903203817314244, "grad_norm": 1.5335302352905273, "learning_rate": 1.3655987275619179e-06, "loss": 0.0922, "step": 7600 }, { "epoch": 2.6244035446489433, "grad_norm": 1.5776997804641724, "learning_rate": 1.251988184503522e-06, "loss": 0.0938, "step": 7700 }, { "epoch": 2.658486707566462, "grad_norm": 1.4402474164962769, "learning_rate": 1.138377641445126e-06, "loss": 0.0978, "step": 7800 }, { "epoch": 2.6925698704839807, "grad_norm": 1.450110912322998, "learning_rate": 1.0247670983867305e-06, "loss": 0.0916, "step": 7900 }, { "epoch": 2.7266530334014996, "grad_norm": 1.5721951723098755, "learning_rate": 9.111565553283345e-07, "loss": 0.0892, "step": 8000 }, { "epoch": 2.7266530334014996, "eval_loss": 0.176322802901268, "eval_runtime": 131.8065, "eval_samples_per_second": 2768.642, "eval_steps_per_second": 5.409, "step": 8000 }, { "epoch": 2.7607361963190185, "grad_norm": 1.1707704067230225, "learning_rate": 7.975460122699387e-07, "loss": 0.0927, "step": 8100 }, { "epoch": 2.794819359236537, "grad_norm": 1.1209170818328857, "learning_rate": 6.839354692115428e-07, "loss": 0.094, "step": 8200 }, { "epoch": 2.828902522154056, "grad_norm": 2.1155459880828857, "learning_rate": 5.70324926153147e-07, "loss": 0.0911, "step": 8300 }, { "epoch": 2.8629856850715747, "grad_norm": 1.1937357187271118, "learning_rate": 4.5671438309475126e-07, "loss": 0.0936, "step": 8400 }, { "epoch": 2.897068847989093, "grad_norm": 2.0643107891082764, "learning_rate": 3.431038400363554e-07, "loss": 0.0892, "step": 8500 }, { "epoch": 2.897068847989093, "eval_loss": 0.17550139129161835, "eval_runtime": 133.7198, "eval_samples_per_second": 2729.028, "eval_steps_per_second": 5.332, "step": 8500 }, { "epoch": 2.931152010906612, "grad_norm": 1.0680649280548096, "learning_rate": 2.2949329697795956e-07, "loss": 0.0926, "step": 8600 }, { "epoch": 2.965235173824131, "grad_norm": 1.1272894144058228, "learning_rate": 1.1588275391956375e-07, "loss": 0.0963, "step": 8700 }, { "epoch": 2.9993183367416494, "grad_norm": 1.5708293914794922, "learning_rate": 2.2722108611679166e-09, "loss": 0.0891, "step": 8800 } ], "logging_steps": 100, "max_steps": 8802, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4490386148766588.0, "train_batch_size": 256, "trial_name": null, "trial_params": null }