CocoRoF's picture
Training in progress, step 300, checkpoint
221e5a3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.802228412256268,
"eval_steps": 30,
"global_step": 300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.22284122562674094,
"grad_norm": 0.2863571047782898,
"learning_rate": 2.785515320334262e-07,
"loss": 0.0285,
"step": 10
},
{
"epoch": 0.4456824512534819,
"grad_norm": 0.2500404119491577,
"learning_rate": 5.571030640668524e-07,
"loss": 0.0396,
"step": 20
},
{
"epoch": 0.6685236768802229,
"grad_norm": 0.3944888710975647,
"learning_rate": 8.356545961002786e-07,
"loss": 0.0396,
"step": 30
},
{
"epoch": 0.6685236768802229,
"eval_loss": 0.037624359130859375,
"eval_runtime": 8.938,
"eval_samples_per_second": 167.823,
"eval_steps_per_second": 10.517,
"eval_sts_dev_pearson_cosine": 0.764955664513056,
"eval_sts_dev_pearson_dot": 0.589803108822307,
"eval_sts_dev_pearson_euclidean": 0.7219234025187748,
"eval_sts_dev_pearson_manhattan": 0.72145381158843,
"eval_sts_dev_pearson_max": 0.764955664513056,
"eval_sts_dev_spearman_cosine": 0.7647154884722602,
"eval_sts_dev_spearman_dot": 0.5877288570418023,
"eval_sts_dev_spearman_euclidean": 0.714094450799924,
"eval_sts_dev_spearman_manhattan": 0.7137187115899585,
"eval_sts_dev_spearman_max": 0.7647154884722602,
"step": 30
},
{
"epoch": 0.8913649025069638,
"grad_norm": 0.5294222831726074,
"learning_rate": 1.1142061281337048e-06,
"loss": 0.0594,
"step": 40
},
{
"epoch": 1.1337047353760445,
"grad_norm": 0.2601313292980194,
"learning_rate": 1.392757660167131e-06,
"loss": 0.0438,
"step": 50
},
{
"epoch": 1.3565459610027855,
"grad_norm": 0.3371785879135132,
"learning_rate": 1.6713091922005572e-06,
"loss": 0.0302,
"step": 60
},
{
"epoch": 1.3565459610027855,
"eval_loss": 0.03583640605211258,
"eval_runtime": 8.8794,
"eval_samples_per_second": 168.931,
"eval_steps_per_second": 10.586,
"eval_sts_dev_pearson_cosine": 0.7705187018593216,
"eval_sts_dev_pearson_dot": 0.6076573663583646,
"eval_sts_dev_pearson_euclidean": 0.7228065665910366,
"eval_sts_dev_pearson_manhattan": 0.7223378022429527,
"eval_sts_dev_pearson_max": 0.7705187018593216,
"eval_sts_dev_spearman_cosine": 0.7723120443468319,
"eval_sts_dev_spearman_dot": 0.6075378373677961,
"eval_sts_dev_spearman_euclidean": 0.7150378636996941,
"eval_sts_dev_spearman_manhattan": 0.7147730157418715,
"eval_sts_dev_spearman_max": 0.7723120443468319,
"step": 60
},
{
"epoch": 1.5793871866295266,
"grad_norm": 0.3295373022556305,
"learning_rate": 1.9498607242339835e-06,
"loss": 0.0398,
"step": 70
},
{
"epoch": 1.8022284122562673,
"grad_norm": 0.35398605465888977,
"learning_rate": 2.2284122562674097e-06,
"loss": 0.0457,
"step": 80
},
{
"epoch": 2.0445682451253484,
"grad_norm": 0.2262781709432602,
"learning_rate": 2.506963788300836e-06,
"loss": 0.0464,
"step": 90
},
{
"epoch": 2.0445682451253484,
"eval_loss": 0.03468449041247368,
"eval_runtime": 8.8139,
"eval_samples_per_second": 170.185,
"eval_steps_per_second": 10.665,
"eval_sts_dev_pearson_cosine": 0.7745596534667405,
"eval_sts_dev_pearson_dot": 0.6202842788366623,
"eval_sts_dev_pearson_euclidean": 0.7230641708414006,
"eval_sts_dev_pearson_manhattan": 0.7225653526096391,
"eval_sts_dev_pearson_max": 0.7745596534667405,
"eval_sts_dev_spearman_cosine": 0.7804674349325152,
"eval_sts_dev_spearman_dot": 0.6269246663218206,
"eval_sts_dev_spearman_euclidean": 0.7154276381242246,
"eval_sts_dev_spearman_manhattan": 0.715244268630065,
"eval_sts_dev_spearman_max": 0.7804674349325152,
"step": 90
},
{
"epoch": 2.267409470752089,
"grad_norm": 0.2479257434606552,
"learning_rate": 2.785515320334262e-06,
"loss": 0.026,
"step": 100
},
{
"epoch": 2.4902506963788302,
"grad_norm": 0.18173329532146454,
"learning_rate": 3.064066852367688e-06,
"loss": 0.0331,
"step": 110
},
{
"epoch": 2.713091922005571,
"grad_norm": 0.34261173009872437,
"learning_rate": 3.3426183844011143e-06,
"loss": 0.0318,
"step": 120
},
{
"epoch": 2.713091922005571,
"eval_loss": 0.032896921038627625,
"eval_runtime": 8.9155,
"eval_samples_per_second": 168.247,
"eval_steps_per_second": 10.543,
"eval_sts_dev_pearson_cosine": 0.7787682019715072,
"eval_sts_dev_pearson_dot": 0.6287420367838928,
"eval_sts_dev_pearson_euclidean": 0.7232103414826321,
"eval_sts_dev_pearson_manhattan": 0.7226983341404614,
"eval_sts_dev_pearson_max": 0.7787682019715072,
"eval_sts_dev_spearman_cosine": 0.7836888215928643,
"eval_sts_dev_spearman_dot": 0.6396485003600499,
"eval_sts_dev_spearman_euclidean": 0.7152705156495667,
"eval_sts_dev_spearman_manhattan": 0.7150050400356657,
"eval_sts_dev_spearman_max": 0.7836888215928643,
"step": 120
},
{
"epoch": 2.935933147632312,
"grad_norm": 0.32490110397338867,
"learning_rate": 3.6211699164345405e-06,
"loss": 0.0399,
"step": 130
},
{
"epoch": 3.1782729805013927,
"grad_norm": 0.2102952003479004,
"learning_rate": 3.899721448467967e-06,
"loss": 0.0264,
"step": 140
},
{
"epoch": 3.401114206128134,
"grad_norm": 0.29396915435791016,
"learning_rate": 4.178272980501394e-06,
"loss": 0.0268,
"step": 150
},
{
"epoch": 3.401114206128134,
"eval_loss": 0.03317762911319733,
"eval_runtime": 9.3558,
"eval_samples_per_second": 160.329,
"eval_steps_per_second": 10.047,
"eval_sts_dev_pearson_cosine": 0.7818829552806165,
"eval_sts_dev_pearson_dot": 0.6407030863223679,
"eval_sts_dev_pearson_euclidean": 0.7251646010881555,
"eval_sts_dev_pearson_manhattan": 0.724664700544263,
"eval_sts_dev_pearson_max": 0.7818829552806165,
"eval_sts_dev_spearman_cosine": 0.7883531423647104,
"eval_sts_dev_spearman_dot": 0.6500865416788887,
"eval_sts_dev_spearman_euclidean": 0.7177286550860219,
"eval_sts_dev_spearman_manhattan": 0.717444216928271,
"eval_sts_dev_spearman_max": 0.7883531423647104,
"step": 150
},
{
"epoch": 3.6239554317548746,
"grad_norm": 0.18695248663425446,
"learning_rate": 4.456824512534819e-06,
"loss": 0.0241,
"step": 160
},
{
"epoch": 3.8467966573816157,
"grad_norm": 0.2890850007534027,
"learning_rate": 4.735376044568246e-06,
"loss": 0.0309,
"step": 170
},
{
"epoch": 4.089136490250697,
"grad_norm": 0.1940670907497406,
"learning_rate": 5.013927576601672e-06,
"loss": 0.0263,
"step": 180
},
{
"epoch": 4.089136490250697,
"eval_loss": 0.03255865350365639,
"eval_runtime": 9.7386,
"eval_samples_per_second": 154.026,
"eval_steps_per_second": 9.652,
"eval_sts_dev_pearson_cosine": 0.7866026930184947,
"eval_sts_dev_pearson_dot": 0.6395121549516076,
"eval_sts_dev_pearson_euclidean": 0.7276962342001664,
"eval_sts_dev_pearson_manhattan": 0.7271401753294904,
"eval_sts_dev_pearson_max": 0.7866026930184947,
"eval_sts_dev_spearman_cosine": 0.7917956870568429,
"eval_sts_dev_spearman_dot": 0.6487597753154516,
"eval_sts_dev_spearman_euclidean": 0.7209760720407495,
"eval_sts_dev_spearman_manhattan": 0.720422142644627,
"eval_sts_dev_spearman_max": 0.7917956870568429,
"step": 180
},
{
"epoch": 4.311977715877437,
"grad_norm": 0.18042121827602386,
"learning_rate": 5.292479108635098e-06,
"loss": 0.0164,
"step": 190
},
{
"epoch": 4.534818941504178,
"grad_norm": 0.15695632994174957,
"learning_rate": 5.571030640668524e-06,
"loss": 0.0226,
"step": 200
},
{
"epoch": 4.757660167130919,
"grad_norm": 0.22100669145584106,
"learning_rate": 5.849582172701951e-06,
"loss": 0.0196,
"step": 210
},
{
"epoch": 4.757660167130919,
"eval_loss": 0.03137701749801636,
"eval_runtime": 9.3122,
"eval_samples_per_second": 161.08,
"eval_steps_per_second": 10.094,
"eval_sts_dev_pearson_cosine": 0.7870442789973777,
"eval_sts_dev_pearson_dot": 0.6305251777821479,
"eval_sts_dev_pearson_euclidean": 0.7283553144330797,
"eval_sts_dev_pearson_manhattan": 0.7277896396418712,
"eval_sts_dev_pearson_max": 0.7870442789973777,
"eval_sts_dev_spearman_cosine": 0.7896056186150787,
"eval_sts_dev_spearman_dot": 0.6435332170567882,
"eval_sts_dev_spearman_euclidean": 0.7218272927999315,
"eval_sts_dev_spearman_manhattan": 0.7214883882970341,
"eval_sts_dev_spearman_max": 0.7896056186150787,
"step": 210
},
{
"epoch": 4.9805013927576605,
"grad_norm": 0.23286058008670807,
"learning_rate": 6.128133704735376e-06,
"loss": 0.0217,
"step": 220
},
{
"epoch": 5.222841225626741,
"grad_norm": 0.17037726938724518,
"learning_rate": 6.406685236768803e-06,
"loss": 0.0134,
"step": 230
},
{
"epoch": 5.445682451253482,
"grad_norm": 0.18465109169483185,
"learning_rate": 6.685236768802229e-06,
"loss": 0.0157,
"step": 240
},
{
"epoch": 5.445682451253482,
"eval_loss": 0.03204584866762161,
"eval_runtime": 9.2367,
"eval_samples_per_second": 162.396,
"eval_steps_per_second": 10.177,
"eval_sts_dev_pearson_cosine": 0.7879215371135696,
"eval_sts_dev_pearson_dot": 0.6400228300721227,
"eval_sts_dev_pearson_euclidean": 0.7281787730138891,
"eval_sts_dev_pearson_manhattan": 0.7275982374933561,
"eval_sts_dev_pearson_max": 0.7879215371135696,
"eval_sts_dev_spearman_cosine": 0.7910741437085514,
"eval_sts_dev_spearman_dot": 0.6477074504092454,
"eval_sts_dev_spearman_euclidean": 0.7215931954085069,
"eval_sts_dev_spearman_manhattan": 0.7212215794866459,
"eval_sts_dev_spearman_max": 0.7910741437085514,
"step": 240
},
{
"epoch": 5.6685236768802225,
"grad_norm": 0.2040845900774002,
"learning_rate": 6.963788300835655e-06,
"loss": 0.0136,
"step": 250
},
{
"epoch": 5.891364902506964,
"grad_norm": 0.2258893996477127,
"learning_rate": 7.242339832869081e-06,
"loss": 0.0143,
"step": 260
},
{
"epoch": 6.133704735376044,
"grad_norm": 0.11912677437067032,
"learning_rate": 7.5208913649025075e-06,
"loss": 0.0114,
"step": 270
},
{
"epoch": 6.133704735376044,
"eval_loss": 0.03218431398272514,
"eval_runtime": 9.1387,
"eval_samples_per_second": 164.137,
"eval_steps_per_second": 10.286,
"eval_sts_dev_pearson_cosine": 0.7875077785532312,
"eval_sts_dev_pearson_dot": 0.6389110741592561,
"eval_sts_dev_pearson_euclidean": 0.7278472242426817,
"eval_sts_dev_pearson_manhattan": 0.7272084663171942,
"eval_sts_dev_pearson_max": 0.7875077785532312,
"eval_sts_dev_spearman_cosine": 0.7906616591294269,
"eval_sts_dev_spearman_dot": 0.6460173261770129,
"eval_sts_dev_spearman_euclidean": 0.721550246781466,
"eval_sts_dev_spearman_manhattan": 0.7213414755418661,
"eval_sts_dev_spearman_max": 0.7906616591294269,
"step": 270
},
{
"epoch": 6.3565459610027855,
"grad_norm": 0.151308074593544,
"learning_rate": 7.799442896935934e-06,
"loss": 0.0077,
"step": 280
},
{
"epoch": 6.579387186629527,
"grad_norm": 0.12630996108055115,
"learning_rate": 8.07799442896936e-06,
"loss": 0.0116,
"step": 290
},
{
"epoch": 6.802228412256268,
"grad_norm": 0.13274052739143372,
"learning_rate": 8.356545961002787e-06,
"loss": 0.0087,
"step": 300
},
{
"epoch": 6.802228412256268,
"eval_loss": 0.03128228709101677,
"eval_runtime": 9.1866,
"eval_samples_per_second": 163.282,
"eval_steps_per_second": 10.232,
"eval_sts_dev_pearson_cosine": 0.7861496812877409,
"eval_sts_dev_pearson_dot": 0.6275374593246463,
"eval_sts_dev_pearson_euclidean": 0.7262636157490319,
"eval_sts_dev_pearson_manhattan": 0.7255528346187912,
"eval_sts_dev_pearson_max": 0.7861496812877409,
"eval_sts_dev_spearman_cosine": 0.7868100521872967,
"eval_sts_dev_spearman_dot": 0.6389952669242227,
"eval_sts_dev_spearman_euclidean": 0.7203939324818445,
"eval_sts_dev_spearman_manhattan": 0.7200129773124266,
"eval_sts_dev_spearman_max": 0.7868100521872967,
"step": 300
}
],
"logging_steps": 10,
"max_steps": 440,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}