{
"best_metric": 0.15673477947711945,
"best_model_checkpoint": "experiments/SFT-all-MiniLM-L12-v2-WordNetNoun-MixedHop-RandomNegatives/checkpoint-2500",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 8802,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03408316291751875,
"grad_norm": 0.3132801949977875,
"learning_rate": 9.886389456941606e-06,
"loss": 0.4223,
"step": 100
},
{
"epoch": 0.0681663258350375,
"grad_norm": 1.3860148191452026,
"learning_rate": 9.77277891388321e-06,
"loss": 0.2764,
"step": 200
},
{
"epoch": 0.10224948875255624,
"grad_norm": 1.7058498859405518,
"learning_rate": 9.659168370824814e-06,
"loss": 0.1906,
"step": 300
},
{
"epoch": 0.136332651670075,
"grad_norm": 1.2867127656936646,
"learning_rate": 9.545557827766417e-06,
"loss": 0.159,
"step": 400
},
{
"epoch": 0.17041581458759372,
"grad_norm": 1.7491652965545654,
"learning_rate": 9.431947284708022e-06,
"loss": 0.1487,
"step": 500
},
{
"epoch": 0.17041581458759372,
"eval_loss": 0.16311119496822357,
"eval_runtime": 135.6265,
"eval_samples_per_second": 2690.662,
"eval_steps_per_second": 5.257,
"step": 500
},
{
"epoch": 0.20449897750511248,
"grad_norm": 2.1244304180145264,
"learning_rate": 9.318336741649626e-06,
"loss": 0.1361,
"step": 600
},
{
"epoch": 0.23858214042263123,
"grad_norm": 1.6412886381149292,
"learning_rate": 9.20472619859123e-06,
"loss": 0.139,
"step": 700
},
{
"epoch": 0.27266530334015,
"grad_norm": 1.912682056427002,
"learning_rate": 9.091115655532834e-06,
"loss": 0.1371,
"step": 800
},
{
"epoch": 0.3067484662576687,
"grad_norm": 1.6108455657958984,
"learning_rate": 8.97750511247444e-06,
"loss": 0.1323,
"step": 900
},
{
"epoch": 0.34083162917518744,
"grad_norm": 1.5050699710845947,
"learning_rate": 8.863894569416042e-06,
"loss": 0.1286,
"step": 1000
},
{
"epoch": 0.34083162917518744,
"eval_loss": 0.17418159544467926,
"eval_runtime": 137.6524,
"eval_samples_per_second": 2651.061,
"eval_steps_per_second": 5.18,
"step": 1000
},
{
"epoch": 0.37491479209270623,
"grad_norm": 1.8735222816467285,
"learning_rate": 8.750284026357647e-06,
"loss": 0.1291,
"step": 1100
},
{
"epoch": 0.40899795501022496,
"grad_norm": 1.3617902994155884,
"learning_rate": 8.63667348329925e-06,
"loss": 0.1274,
"step": 1200
},
{
"epoch": 0.4430811179277437,
"grad_norm": 1.449449896812439,
"learning_rate": 8.523062940240855e-06,
"loss": 0.121,
"step": 1300
},
{
"epoch": 0.47716428084526247,
"grad_norm": 1.5432512760162354,
"learning_rate": 8.40945239718246e-06,
"loss": 0.1252,
"step": 1400
},
{
"epoch": 0.5112474437627812,
"grad_norm": 1.3983290195465088,
"learning_rate": 8.295841854124063e-06,
"loss": 0.1211,
"step": 1500
},
{
"epoch": 0.5112474437627812,
"eval_loss": 0.16112419962882996,
"eval_runtime": 133.378,
"eval_samples_per_second": 2736.022,
"eval_steps_per_second": 5.346,
"step": 1500
},
{
"epoch": 0.5453306066803,
"grad_norm": 1.8421082496643066,
"learning_rate": 8.182231311065668e-06,
"loss": 0.1192,
"step": 1600
},
{
"epoch": 0.5794137695978187,
"grad_norm": 1.6170974969863892,
"learning_rate": 8.068620768007271e-06,
"loss": 0.116,
"step": 1700
},
{
"epoch": 0.6134969325153374,
"grad_norm": 1.4847702980041504,
"learning_rate": 7.955010224948876e-06,
"loss": 0.1184,
"step": 1800
},
{
"epoch": 0.6475800954328562,
"grad_norm": 2.074660301208496,
"learning_rate": 7.84139968189048e-06,
"loss": 0.1153,
"step": 1900
},
{
"epoch": 0.6816632583503749,
"grad_norm": 1.3638675212860107,
"learning_rate": 7.727789138832085e-06,
"loss": 0.1151,
"step": 2000
},
{
"epoch": 0.6816632583503749,
"eval_loss": 0.16037538647651672,
"eval_runtime": 135.7872,
"eval_samples_per_second": 2687.476,
"eval_steps_per_second": 5.251,
"step": 2000
},
{
"epoch": 0.7157464212678937,
"grad_norm": 2.7780377864837646,
"learning_rate": 7.614178595773688e-06,
"loss": 0.1136,
"step": 2100
},
{
"epoch": 0.7498295841854125,
"grad_norm": 1.0972269773483276,
"learning_rate": 7.500568052715293e-06,
"loss": 0.112,
"step": 2200
},
{
"epoch": 0.7839127471029311,
"grad_norm": 1.5648587942123413,
"learning_rate": 7.386957509656897e-06,
"loss": 0.1152,
"step": 2300
},
{
"epoch": 0.8179959100204499,
"grad_norm": 1.1397899389266968,
"learning_rate": 7.273346966598501e-06,
"loss": 0.1122,
"step": 2400
},
{
"epoch": 0.8520790729379687,
"grad_norm": 1.6574677228927612,
"learning_rate": 7.1597364235401045e-06,
"loss": 0.1116,
"step": 2500
},
{
"epoch": 0.8520790729379687,
"eval_loss": 0.15673477947711945,
"eval_runtime": 140.649,
"eval_samples_per_second": 2594.58,
"eval_steps_per_second": 5.069,
"step": 2500
},
{
"epoch": 0.8861622358554874,
"grad_norm": 1.6420308351516724,
"learning_rate": 7.046125880481709e-06,
"loss": 0.1158,
"step": 2600
},
{
"epoch": 0.9202453987730062,
"grad_norm": 1.954850673675537,
"learning_rate": 6.932515337423313e-06,
"loss": 0.1139,
"step": 2700
},
{
"epoch": 0.9543285616905249,
"grad_norm": 1.0990276336669922,
"learning_rate": 6.818904794364918e-06,
"loss": 0.1096,
"step": 2800
},
{
"epoch": 0.9884117246080436,
"grad_norm": 1.3278498649597168,
"learning_rate": 6.705294251306522e-06,
"loss": 0.1107,
"step": 2900
},
{
"epoch": 1.0224948875255624,
"grad_norm": 1.6887524127960205,
"learning_rate": 6.591683708248125e-06,
"loss": 0.1016,
"step": 3000
},
{
"epoch": 1.0224948875255624,
"eval_loss": 0.1628647744655609,
"eval_runtime": 139.478,
"eval_samples_per_second": 2616.362,
"eval_steps_per_second": 5.112,
"step": 3000
},
{
"epoch": 1.056578050443081,
"grad_norm": 1.6014968156814575,
"learning_rate": 6.47807316518973e-06,
"loss": 0.1081,
"step": 3100
},
{
"epoch": 1.0906612133606,
"grad_norm": 1.05489182472229,
"learning_rate": 6.364462622131334e-06,
"loss": 0.1027,
"step": 3200
},
{
"epoch": 1.1247443762781186,
"grad_norm": 1.5599446296691895,
"learning_rate": 6.250852079072939e-06,
"loss": 0.1013,
"step": 3300
},
{
"epoch": 1.1588275391956373,
"grad_norm": 1.6163196563720703,
"learning_rate": 6.137241536014543e-06,
"loss": 0.1,
"step": 3400
},
{
"epoch": 1.1929107021131562,
"grad_norm": 1.601138710975647,
"learning_rate": 6.023630992956147e-06,
"loss": 0.1043,
"step": 3500
},
{
"epoch": 1.1929107021131562,
"eval_loss": 0.1665213406085968,
"eval_runtime": 141.7559,
"eval_samples_per_second": 2574.319,
"eval_steps_per_second": 5.03,
"step": 3500
},
{
"epoch": 1.2269938650306749,
"grad_norm": 1.882118582725525,
"learning_rate": 5.910020449897751e-06,
"loss": 0.0932,
"step": 3600
},
{
"epoch": 1.2610770279481935,
"grad_norm": 1.9329348802566528,
"learning_rate": 5.796409906839356e-06,
"loss": 0.0998,
"step": 3700
},
{
"epoch": 1.2951601908657122,
"grad_norm": 1.7485824823379517,
"learning_rate": 5.68279936378096e-06,
"loss": 0.1005,
"step": 3800
},
{
"epoch": 1.329243353783231,
"grad_norm": 1.1488587856292725,
"learning_rate": 5.569188820722563e-06,
"loss": 0.1039,
"step": 3900
},
{
"epoch": 1.3633265167007498,
"grad_norm": 1.6326992511749268,
"learning_rate": 5.455578277664168e-06,
"loss": 0.1024,
"step": 4000
},
{
"epoch": 1.3633265167007498,
"eval_loss": 0.1776169091463089,
"eval_runtime": 136.9413,
"eval_samples_per_second": 2664.827,
"eval_steps_per_second": 5.207,
"step": 4000
},
{
"epoch": 1.3974096796182685,
"grad_norm": 1.085883617401123,
"learning_rate": 5.341967734605772e-06,
"loss": 0.103,
"step": 4100
},
{
"epoch": 1.4314928425357873,
"grad_norm": 1.6013509035110474,
"learning_rate": 5.2283571915473764e-06,
"loss": 0.1003,
"step": 4200
},
{
"epoch": 1.465576005453306,
"grad_norm": 1.25858473777771,
"learning_rate": 5.11474664848898e-06,
"loss": 0.097,
"step": 4300
},
{
"epoch": 1.4996591683708247,
"grad_norm": 1.6914137601852417,
"learning_rate": 5.001136105430584e-06,
"loss": 0.097,
"step": 4400
},
{
"epoch": 1.5337423312883436,
"grad_norm": 1.6514638662338257,
"learning_rate": 4.887525562372188e-06,
"loss": 0.1011,
"step": 4500
},
{
"epoch": 1.5337423312883436,
"eval_loss": 0.16166538000106812,
"eval_runtime": 137.2738,
"eval_samples_per_second": 2658.374,
"eval_steps_per_second": 5.194,
"step": 4500
},
{
"epoch": 1.5678254942058623,
"grad_norm": 1.554187536239624,
"learning_rate": 4.773915019313793e-06,
"loss": 0.105,
"step": 4600
},
{
"epoch": 1.601908657123381,
"grad_norm": 1.0056004524230957,
"learning_rate": 4.660304476255396e-06,
"loss": 0.1015,
"step": 4700
},
{
"epoch": 1.6359918200408998,
"grad_norm": 1.3594541549682617,
"learning_rate": 4.546693933197001e-06,
"loss": 0.1025,
"step": 4800
},
{
"epoch": 1.6700749829584185,
"grad_norm": 1.210564136505127,
"learning_rate": 4.433083390138605e-06,
"loss": 0.0971,
"step": 4900
},
{
"epoch": 1.7041581458759372,
"grad_norm": 1.5984232425689697,
"learning_rate": 4.319472847080209e-06,
"loss": 0.0994,
"step": 5000
},
{
"epoch": 1.7041581458759372,
"eval_loss": 0.16965465247631073,
"eval_runtime": 136.9881,
"eval_samples_per_second": 2663.917,
"eval_steps_per_second": 5.205,
"step": 5000
},
{
"epoch": 1.738241308793456,
"grad_norm": 1.4730218648910522,
"learning_rate": 4.205862304021814e-06,
"loss": 0.1043,
"step": 5100
},
{
"epoch": 1.7723244717109747,
"grad_norm": 1.4520608186721802,
"learning_rate": 4.092251760963418e-06,
"loss": 0.0964,
"step": 5200
},
{
"epoch": 1.8064076346284934,
"grad_norm": 0.9586948156356812,
"learning_rate": 3.978641217905022e-06,
"loss": 0.1076,
"step": 5300
},
{
"epoch": 1.8404907975460123,
"grad_norm": 1.1026732921600342,
"learning_rate": 3.865030674846626e-06,
"loss": 0.0944,
"step": 5400
},
{
"epoch": 1.874573960463531,
"grad_norm": 1.3042621612548828,
"learning_rate": 3.7514201317882303e-06,
"loss": 0.0991,
"step": 5500
},
{
"epoch": 1.874573960463531,
"eval_loss": 0.17953361570835114,
"eval_runtime": 135.448,
"eval_samples_per_second": 2694.207,
"eval_steps_per_second": 5.264,
"step": 5500
},
{
"epoch": 1.9086571233810496,
"grad_norm": 2.037799835205078,
"learning_rate": 3.6378095887298343e-06,
"loss": 0.103,
"step": 5600
},
{
"epoch": 1.9427402862985685,
"grad_norm": 0.8908063173294067,
"learning_rate": 3.5241990456714387e-06,
"loss": 0.1001,
"step": 5700
},
{
"epoch": 1.9768234492160872,
"grad_norm": 1.693176031112671,
"learning_rate": 3.4105885026130427e-06,
"loss": 0.0999,
"step": 5800
},
{
"epoch": 2.010906612133606,
"grad_norm": 1.9925082921981812,
"learning_rate": 3.296977959554647e-06,
"loss": 0.0984,
"step": 5900
},
{
"epoch": 2.044989775051125,
"grad_norm": 1.3392109870910645,
"learning_rate": 3.1833674164962515e-06,
"loss": 0.0947,
"step": 6000
},
{
"epoch": 2.044989775051125,
"eval_loss": 0.170347198843956,
"eval_runtime": 137.8512,
"eval_samples_per_second": 2647.238,
"eval_steps_per_second": 5.172,
"step": 6000
},
{
"epoch": 2.0790729379686437,
"grad_norm": 1.497381329536438,
"learning_rate": 3.069756873437855e-06,
"loss": 0.0962,
"step": 6100
},
{
"epoch": 2.113156100886162,
"grad_norm": 0.9153881072998047,
"learning_rate": 2.95614633037946e-06,
"loss": 0.0953,
"step": 6200
},
{
"epoch": 2.147239263803681,
"grad_norm": 1.077412486076355,
"learning_rate": 2.8425357873210634e-06,
"loss": 0.0918,
"step": 6300
},
{
"epoch": 2.1813224267212,
"grad_norm": 1.3187857866287231,
"learning_rate": 2.728925244262668e-06,
"loss": 0.0894,
"step": 6400
},
{
"epoch": 2.2154055896387184,
"grad_norm": 1.6318199634552002,
"learning_rate": 2.6153147012042718e-06,
"loss": 0.0928,
"step": 6500
},
{
"epoch": 2.2154055896387184,
"eval_loss": 0.17857030034065247,
"eval_runtime": 134.8345,
"eval_samples_per_second": 2706.465,
"eval_steps_per_second": 5.288,
"step": 6500
},
{
"epoch": 2.2494887525562373,
"grad_norm": 1.3745598793029785,
"learning_rate": 2.501704158145876e-06,
"loss": 0.0909,
"step": 6600
},
{
"epoch": 2.283571915473756,
"grad_norm": 1.7317062616348267,
"learning_rate": 2.38809361508748e-06,
"loss": 0.0978,
"step": 6700
},
{
"epoch": 2.3176550783912746,
"grad_norm": 1.3482944965362549,
"learning_rate": 2.2744830720290846e-06,
"loss": 0.0945,
"step": 6800
},
{
"epoch": 2.3517382413087935,
"grad_norm": 1.982127070426941,
"learning_rate": 2.1608725289706885e-06,
"loss": 0.0943,
"step": 6900
},
{
"epoch": 2.3858214042263124,
"grad_norm": 1.0882318019866943,
"learning_rate": 2.047261985912293e-06,
"loss": 0.0931,
"step": 7000
},
{
"epoch": 2.3858214042263124,
"eval_loss": 0.1807963252067566,
"eval_runtime": 136.6363,
"eval_samples_per_second": 2670.777,
"eval_steps_per_second": 5.218,
"step": 7000
},
{
"epoch": 2.419904567143831,
"grad_norm": 1.5541267395019531,
"learning_rate": 1.933651442853897e-06,
"loss": 0.0931,
"step": 7100
},
{
"epoch": 2.4539877300613497,
"grad_norm": 1.3314629793167114,
"learning_rate": 1.8200408997955013e-06,
"loss": 0.0968,
"step": 7200
},
{
"epoch": 2.4880708929788686,
"grad_norm": 1.3885324001312256,
"learning_rate": 1.7064303567371055e-06,
"loss": 0.0915,
"step": 7300
},
{
"epoch": 2.522154055896387,
"grad_norm": 1.093854546546936,
"learning_rate": 1.5928198136787095e-06,
"loss": 0.0927,
"step": 7400
},
{
"epoch": 2.556237218813906,
"grad_norm": 1.4065414667129517,
"learning_rate": 1.4792092706203137e-06,
"loss": 0.0919,
"step": 7500
},
{
"epoch": 2.556237218813906,
"eval_loss": 0.1740484982728958,
"eval_runtime": 137.3804,
"eval_samples_per_second": 2656.31,
"eval_steps_per_second": 5.19,
"step": 7500
},
{
"epoch": 2.5903203817314244,
"grad_norm": 1.5335302352905273,
"learning_rate": 1.3655987275619179e-06,
"loss": 0.0922,
"step": 7600
},
{
"epoch": 2.6244035446489433,
"grad_norm": 1.5776997804641724,
"learning_rate": 1.251988184503522e-06,
"loss": 0.0938,
"step": 7700
},
{
"epoch": 2.658486707566462,
"grad_norm": 1.4402474164962769,
"learning_rate": 1.138377641445126e-06,
"loss": 0.0978,
"step": 7800
},
{
"epoch": 2.6925698704839807,
"grad_norm": 1.450110912322998,
"learning_rate": 1.0247670983867305e-06,
"loss": 0.0916,
"step": 7900
},
{
"epoch": 2.7266530334014996,
"grad_norm": 1.5721951723098755,
"learning_rate": 9.111565553283345e-07,
"loss": 0.0892,
"step": 8000
},
{
"epoch": 2.7266530334014996,
"eval_loss": 0.176322802901268,
"eval_runtime": 131.8065,
"eval_samples_per_second": 2768.642,
"eval_steps_per_second": 5.409,
"step": 8000
},
{
"epoch": 2.7607361963190185,
"grad_norm": 1.1707704067230225,
"learning_rate": 7.975460122699387e-07,
"loss": 0.0927,
"step": 8100
},
{
"epoch": 2.794819359236537,
"grad_norm": 1.1209170818328857,
"learning_rate": 6.839354692115428e-07,
"loss": 0.094,
"step": 8200
},
{
"epoch": 2.828902522154056,
"grad_norm": 2.1155459880828857,
"learning_rate": 5.70324926153147e-07,
"loss": 0.0911,
"step": 8300
},
{
"epoch": 2.8629856850715747,
"grad_norm": 1.1937357187271118,
"learning_rate": 4.5671438309475126e-07,
"loss": 0.0936,
"step": 8400
},
{
"epoch": 2.897068847989093,
"grad_norm": 2.0643107891082764,
"learning_rate": 3.431038400363554e-07,
"loss": 0.0892,
"step": 8500
},
{
"epoch": 2.897068847989093,
"eval_loss": 0.17550139129161835,
"eval_runtime": 133.7198,
"eval_samples_per_second": 2729.028,
"eval_steps_per_second": 5.332,
"step": 8500
},
{
"epoch": 2.931152010906612,
"grad_norm": 1.0680649280548096,
"learning_rate": 2.2949329697795956e-07,
"loss": 0.0926,
"step": 8600
},
{
"epoch": 2.965235173824131,
"grad_norm": 1.1272894144058228,
"learning_rate": 1.1588275391956375e-07,
"loss": 0.0963,
"step": 8700
},
{
"epoch": 2.9993183367416494,
"grad_norm": 1.5708293914794922,
"learning_rate": 2.2722108611679166e-09,
"loss": 0.0891,
"step": 8800
}
],
"logging_steps": 100,
"max_steps": 8802,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4490386148766588.0,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}