| { | |
| "best_metric": 0.15673477947711945, | |
| "best_model_checkpoint": "experiments/SFT-all-MiniLM-L12-v2-WordNetNoun-MixedHop-RandomNegatives/checkpoint-2500", | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 8802, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03408316291751875, | |
| "grad_norm": 0.3132801949977875, | |
| "learning_rate": 9.886389456941606e-06, | |
| "loss": 0.4223, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0681663258350375, | |
| "grad_norm": 1.3860148191452026, | |
| "learning_rate": 9.77277891388321e-06, | |
| "loss": 0.2764, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10224948875255624, | |
| "grad_norm": 1.7058498859405518, | |
| "learning_rate": 9.659168370824814e-06, | |
| "loss": 0.1906, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.136332651670075, | |
| "grad_norm": 1.2867127656936646, | |
| "learning_rate": 9.545557827766417e-06, | |
| "loss": 0.159, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.17041581458759372, | |
| "grad_norm": 1.7491652965545654, | |
| "learning_rate": 9.431947284708022e-06, | |
| "loss": 0.1487, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.17041581458759372, | |
| "eval_loss": 0.16311119496822357, | |
| "eval_runtime": 135.6265, | |
| "eval_samples_per_second": 2690.662, | |
| "eval_steps_per_second": 5.257, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.20449897750511248, | |
| "grad_norm": 2.1244304180145264, | |
| "learning_rate": 9.318336741649626e-06, | |
| "loss": 0.1361, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.23858214042263123, | |
| "grad_norm": 1.6412886381149292, | |
| "learning_rate": 9.20472619859123e-06, | |
| "loss": 0.139, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.27266530334015, | |
| "grad_norm": 1.912682056427002, | |
| "learning_rate": 9.091115655532834e-06, | |
| "loss": 0.1371, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.3067484662576687, | |
| "grad_norm": 1.6108455657958984, | |
| "learning_rate": 8.97750511247444e-06, | |
| "loss": 0.1323, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.34083162917518744, | |
| "grad_norm": 1.5050699710845947, | |
| "learning_rate": 8.863894569416042e-06, | |
| "loss": 0.1286, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.34083162917518744, | |
| "eval_loss": 0.17418159544467926, | |
| "eval_runtime": 137.6524, | |
| "eval_samples_per_second": 2651.061, | |
| "eval_steps_per_second": 5.18, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.37491479209270623, | |
| "grad_norm": 1.8735222816467285, | |
| "learning_rate": 8.750284026357647e-06, | |
| "loss": 0.1291, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.40899795501022496, | |
| "grad_norm": 1.3617902994155884, | |
| "learning_rate": 8.63667348329925e-06, | |
| "loss": 0.1274, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.4430811179277437, | |
| "grad_norm": 1.449449896812439, | |
| "learning_rate": 8.523062940240855e-06, | |
| "loss": 0.121, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.47716428084526247, | |
| "grad_norm": 1.5432512760162354, | |
| "learning_rate": 8.40945239718246e-06, | |
| "loss": 0.1252, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.5112474437627812, | |
| "grad_norm": 1.3983290195465088, | |
| "learning_rate": 8.295841854124063e-06, | |
| "loss": 0.1211, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5112474437627812, | |
| "eval_loss": 0.16112419962882996, | |
| "eval_runtime": 133.378, | |
| "eval_samples_per_second": 2736.022, | |
| "eval_steps_per_second": 5.346, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.5453306066803, | |
| "grad_norm": 1.8421082496643066, | |
| "learning_rate": 8.182231311065668e-06, | |
| "loss": 0.1192, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.5794137695978187, | |
| "grad_norm": 1.6170974969863892, | |
| "learning_rate": 8.068620768007271e-06, | |
| "loss": 0.116, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.6134969325153374, | |
| "grad_norm": 1.4847702980041504, | |
| "learning_rate": 7.955010224948876e-06, | |
| "loss": 0.1184, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.6475800954328562, | |
| "grad_norm": 2.074660301208496, | |
| "learning_rate": 7.84139968189048e-06, | |
| "loss": 0.1153, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.6816632583503749, | |
| "grad_norm": 1.3638675212860107, | |
| "learning_rate": 7.727789138832085e-06, | |
| "loss": 0.1151, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.6816632583503749, | |
| "eval_loss": 0.16037538647651672, | |
| "eval_runtime": 135.7872, | |
| "eval_samples_per_second": 2687.476, | |
| "eval_steps_per_second": 5.251, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.7157464212678937, | |
| "grad_norm": 2.7780377864837646, | |
| "learning_rate": 7.614178595773688e-06, | |
| "loss": 0.1136, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.7498295841854125, | |
| "grad_norm": 1.0972269773483276, | |
| "learning_rate": 7.500568052715293e-06, | |
| "loss": 0.112, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.7839127471029311, | |
| "grad_norm": 1.5648587942123413, | |
| "learning_rate": 7.386957509656897e-06, | |
| "loss": 0.1152, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.8179959100204499, | |
| "grad_norm": 1.1397899389266968, | |
| "learning_rate": 7.273346966598501e-06, | |
| "loss": 0.1122, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.8520790729379687, | |
| "grad_norm": 1.6574677228927612, | |
| "learning_rate": 7.1597364235401045e-06, | |
| "loss": 0.1116, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8520790729379687, | |
| "eval_loss": 0.15673477947711945, | |
| "eval_runtime": 140.649, | |
| "eval_samples_per_second": 2594.58, | |
| "eval_steps_per_second": 5.069, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.8861622358554874, | |
| "grad_norm": 1.6420308351516724, | |
| "learning_rate": 7.046125880481709e-06, | |
| "loss": 0.1158, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.9202453987730062, | |
| "grad_norm": 1.954850673675537, | |
| "learning_rate": 6.932515337423313e-06, | |
| "loss": 0.1139, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.9543285616905249, | |
| "grad_norm": 1.0990276336669922, | |
| "learning_rate": 6.818904794364918e-06, | |
| "loss": 0.1096, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.9884117246080436, | |
| "grad_norm": 1.3278498649597168, | |
| "learning_rate": 6.705294251306522e-06, | |
| "loss": 0.1107, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.0224948875255624, | |
| "grad_norm": 1.6887524127960205, | |
| "learning_rate": 6.591683708248125e-06, | |
| "loss": 0.1016, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.0224948875255624, | |
| "eval_loss": 0.1628647744655609, | |
| "eval_runtime": 139.478, | |
| "eval_samples_per_second": 2616.362, | |
| "eval_steps_per_second": 5.112, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.056578050443081, | |
| "grad_norm": 1.6014968156814575, | |
| "learning_rate": 6.47807316518973e-06, | |
| "loss": 0.1081, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.0906612133606, | |
| "grad_norm": 1.05489182472229, | |
| "learning_rate": 6.364462622131334e-06, | |
| "loss": 0.1027, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 1.1247443762781186, | |
| "grad_norm": 1.5599446296691895, | |
| "learning_rate": 6.250852079072939e-06, | |
| "loss": 0.1013, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 1.1588275391956373, | |
| "grad_norm": 1.6163196563720703, | |
| "learning_rate": 6.137241536014543e-06, | |
| "loss": 0.1, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 1.1929107021131562, | |
| "grad_norm": 1.601138710975647, | |
| "learning_rate": 6.023630992956147e-06, | |
| "loss": 0.1043, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.1929107021131562, | |
| "eval_loss": 0.1665213406085968, | |
| "eval_runtime": 141.7559, | |
| "eval_samples_per_second": 2574.319, | |
| "eval_steps_per_second": 5.03, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 1.2269938650306749, | |
| "grad_norm": 1.882118582725525, | |
| "learning_rate": 5.910020449897751e-06, | |
| "loss": 0.0932, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 1.2610770279481935, | |
| "grad_norm": 1.9329348802566528, | |
| "learning_rate": 5.796409906839356e-06, | |
| "loss": 0.0998, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 1.2951601908657122, | |
| "grad_norm": 1.7485824823379517, | |
| "learning_rate": 5.68279936378096e-06, | |
| "loss": 0.1005, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 1.329243353783231, | |
| "grad_norm": 1.1488587856292725, | |
| "learning_rate": 5.569188820722563e-06, | |
| "loss": 0.1039, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 1.3633265167007498, | |
| "grad_norm": 1.6326992511749268, | |
| "learning_rate": 5.455578277664168e-06, | |
| "loss": 0.1024, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3633265167007498, | |
| "eval_loss": 0.1776169091463089, | |
| "eval_runtime": 136.9413, | |
| "eval_samples_per_second": 2664.827, | |
| "eval_steps_per_second": 5.207, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 1.3974096796182685, | |
| "grad_norm": 1.085883617401123, | |
| "learning_rate": 5.341967734605772e-06, | |
| "loss": 0.103, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.4314928425357873, | |
| "grad_norm": 1.6013509035110474, | |
| "learning_rate": 5.2283571915473764e-06, | |
| "loss": 0.1003, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.465576005453306, | |
| "grad_norm": 1.25858473777771, | |
| "learning_rate": 5.11474664848898e-06, | |
| "loss": 0.097, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.4996591683708247, | |
| "grad_norm": 1.6914137601852417, | |
| "learning_rate": 5.001136105430584e-06, | |
| "loss": 0.097, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.5337423312883436, | |
| "grad_norm": 1.6514638662338257, | |
| "learning_rate": 4.887525562372188e-06, | |
| "loss": 0.1011, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5337423312883436, | |
| "eval_loss": 0.16166538000106812, | |
| "eval_runtime": 137.2738, | |
| "eval_samples_per_second": 2658.374, | |
| "eval_steps_per_second": 5.194, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.5678254942058623, | |
| "grad_norm": 1.554187536239624, | |
| "learning_rate": 4.773915019313793e-06, | |
| "loss": 0.105, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.601908657123381, | |
| "grad_norm": 1.0056004524230957, | |
| "learning_rate": 4.660304476255396e-06, | |
| "loss": 0.1015, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.6359918200408998, | |
| "grad_norm": 1.3594541549682617, | |
| "learning_rate": 4.546693933197001e-06, | |
| "loss": 0.1025, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.6700749829584185, | |
| "grad_norm": 1.210564136505127, | |
| "learning_rate": 4.433083390138605e-06, | |
| "loss": 0.0971, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.7041581458759372, | |
| "grad_norm": 1.5984232425689697, | |
| "learning_rate": 4.319472847080209e-06, | |
| "loss": 0.0994, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.7041581458759372, | |
| "eval_loss": 0.16965465247631073, | |
| "eval_runtime": 136.9881, | |
| "eval_samples_per_second": 2663.917, | |
| "eval_steps_per_second": 5.205, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.738241308793456, | |
| "grad_norm": 1.4730218648910522, | |
| "learning_rate": 4.205862304021814e-06, | |
| "loss": 0.1043, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.7723244717109747, | |
| "grad_norm": 1.4520608186721802, | |
| "learning_rate": 4.092251760963418e-06, | |
| "loss": 0.0964, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.8064076346284934, | |
| "grad_norm": 0.9586948156356812, | |
| "learning_rate": 3.978641217905022e-06, | |
| "loss": 0.1076, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.8404907975460123, | |
| "grad_norm": 1.1026732921600342, | |
| "learning_rate": 3.865030674846626e-06, | |
| "loss": 0.0944, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.874573960463531, | |
| "grad_norm": 1.3042621612548828, | |
| "learning_rate": 3.7514201317882303e-06, | |
| "loss": 0.0991, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.874573960463531, | |
| "eval_loss": 0.17953361570835114, | |
| "eval_runtime": 135.448, | |
| "eval_samples_per_second": 2694.207, | |
| "eval_steps_per_second": 5.264, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.9086571233810496, | |
| "grad_norm": 2.037799835205078, | |
| "learning_rate": 3.6378095887298343e-06, | |
| "loss": 0.103, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.9427402862985685, | |
| "grad_norm": 0.8908063173294067, | |
| "learning_rate": 3.5241990456714387e-06, | |
| "loss": 0.1001, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.9768234492160872, | |
| "grad_norm": 1.693176031112671, | |
| "learning_rate": 3.4105885026130427e-06, | |
| "loss": 0.0999, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 2.010906612133606, | |
| "grad_norm": 1.9925082921981812, | |
| "learning_rate": 3.296977959554647e-06, | |
| "loss": 0.0984, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "grad_norm": 1.3392109870910645, | |
| "learning_rate": 3.1833674164962515e-06, | |
| "loss": 0.0947, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "eval_loss": 0.170347198843956, | |
| "eval_runtime": 137.8512, | |
| "eval_samples_per_second": 2647.238, | |
| "eval_steps_per_second": 5.172, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 2.0790729379686437, | |
| "grad_norm": 1.497381329536438, | |
| "learning_rate": 3.069756873437855e-06, | |
| "loss": 0.0962, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 2.113156100886162, | |
| "grad_norm": 0.9153881072998047, | |
| "learning_rate": 2.95614633037946e-06, | |
| "loss": 0.0953, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 2.147239263803681, | |
| "grad_norm": 1.077412486076355, | |
| "learning_rate": 2.8425357873210634e-06, | |
| "loss": 0.0918, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 2.1813224267212, | |
| "grad_norm": 1.3187857866287231, | |
| "learning_rate": 2.728925244262668e-06, | |
| "loss": 0.0894, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 2.2154055896387184, | |
| "grad_norm": 1.6318199634552002, | |
| "learning_rate": 2.6153147012042718e-06, | |
| "loss": 0.0928, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2154055896387184, | |
| "eval_loss": 0.17857030034065247, | |
| "eval_runtime": 134.8345, | |
| "eval_samples_per_second": 2706.465, | |
| "eval_steps_per_second": 5.288, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 2.2494887525562373, | |
| "grad_norm": 1.3745598793029785, | |
| "learning_rate": 2.501704158145876e-06, | |
| "loss": 0.0909, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 2.283571915473756, | |
| "grad_norm": 1.7317062616348267, | |
| "learning_rate": 2.38809361508748e-06, | |
| "loss": 0.0978, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 2.3176550783912746, | |
| "grad_norm": 1.3482944965362549, | |
| "learning_rate": 2.2744830720290846e-06, | |
| "loss": 0.0945, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 2.3517382413087935, | |
| "grad_norm": 1.982127070426941, | |
| "learning_rate": 2.1608725289706885e-06, | |
| "loss": 0.0943, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 2.3858214042263124, | |
| "grad_norm": 1.0882318019866943, | |
| "learning_rate": 2.047261985912293e-06, | |
| "loss": 0.0931, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.3858214042263124, | |
| "eval_loss": 0.1807963252067566, | |
| "eval_runtime": 136.6363, | |
| "eval_samples_per_second": 2670.777, | |
| "eval_steps_per_second": 5.218, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 2.419904567143831, | |
| "grad_norm": 1.5541267395019531, | |
| "learning_rate": 1.933651442853897e-06, | |
| "loss": 0.0931, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 2.4539877300613497, | |
| "grad_norm": 1.3314629793167114, | |
| "learning_rate": 1.8200408997955013e-06, | |
| "loss": 0.0968, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 2.4880708929788686, | |
| "grad_norm": 1.3885324001312256, | |
| "learning_rate": 1.7064303567371055e-06, | |
| "loss": 0.0915, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 2.522154055896387, | |
| "grad_norm": 1.093854546546936, | |
| "learning_rate": 1.5928198136787095e-06, | |
| "loss": 0.0927, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 2.556237218813906, | |
| "grad_norm": 1.4065414667129517, | |
| "learning_rate": 1.4792092706203137e-06, | |
| "loss": 0.0919, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.556237218813906, | |
| "eval_loss": 0.1740484982728958, | |
| "eval_runtime": 137.3804, | |
| "eval_samples_per_second": 2656.31, | |
| "eval_steps_per_second": 5.19, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 2.5903203817314244, | |
| "grad_norm": 1.5335302352905273, | |
| "learning_rate": 1.3655987275619179e-06, | |
| "loss": 0.0922, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 2.6244035446489433, | |
| "grad_norm": 1.5776997804641724, | |
| "learning_rate": 1.251988184503522e-06, | |
| "loss": 0.0938, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 2.658486707566462, | |
| "grad_norm": 1.4402474164962769, | |
| "learning_rate": 1.138377641445126e-06, | |
| "loss": 0.0978, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 2.6925698704839807, | |
| "grad_norm": 1.450110912322998, | |
| "learning_rate": 1.0247670983867305e-06, | |
| "loss": 0.0916, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 2.7266530334014996, | |
| "grad_norm": 1.5721951723098755, | |
| "learning_rate": 9.111565553283345e-07, | |
| "loss": 0.0892, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.7266530334014996, | |
| "eval_loss": 0.176322802901268, | |
| "eval_runtime": 131.8065, | |
| "eval_samples_per_second": 2768.642, | |
| "eval_steps_per_second": 5.409, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 2.7607361963190185, | |
| "grad_norm": 1.1707704067230225, | |
| "learning_rate": 7.975460122699387e-07, | |
| "loss": 0.0927, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 2.794819359236537, | |
| "grad_norm": 1.1209170818328857, | |
| "learning_rate": 6.839354692115428e-07, | |
| "loss": 0.094, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 2.828902522154056, | |
| "grad_norm": 2.1155459880828857, | |
| "learning_rate": 5.70324926153147e-07, | |
| "loss": 0.0911, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 2.8629856850715747, | |
| "grad_norm": 1.1937357187271118, | |
| "learning_rate": 4.5671438309475126e-07, | |
| "loss": 0.0936, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 2.897068847989093, | |
| "grad_norm": 2.0643107891082764, | |
| "learning_rate": 3.431038400363554e-07, | |
| "loss": 0.0892, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.897068847989093, | |
| "eval_loss": 0.17550139129161835, | |
| "eval_runtime": 133.7198, | |
| "eval_samples_per_second": 2729.028, | |
| "eval_steps_per_second": 5.332, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 2.931152010906612, | |
| "grad_norm": 1.0680649280548096, | |
| "learning_rate": 2.2949329697795956e-07, | |
| "loss": 0.0926, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 2.965235173824131, | |
| "grad_norm": 1.1272894144058228, | |
| "learning_rate": 1.1588275391956375e-07, | |
| "loss": 0.0963, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 2.9993183367416494, | |
| "grad_norm": 1.5708293914794922, | |
| "learning_rate": 2.2722108611679166e-09, | |
| "loss": 0.0891, | |
| "step": 8800 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 8802, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4490386148766588.0, | |
| "train_batch_size": 256, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |