Bespoke_17k_lora_checkpoint-225 / trainer_state.json
tongliuphysics's picture
Upload folder using huggingface_hub
84434a3 verified
{
"best_global_step": 736,
"best_metric": 0.5167202949523926,
"best_model_checkpoint": "/dss/dssfs05/pn39qo/pn39qo-dss-0001/tong/efficient_reasoning/extraction-vs-summary-efficient-cot-reasoning-perspective---Experiment-main/output/lora/Bespoke_17k_lora/checkpoint-704",
"epoch": 3.0,
"eval_steps": 32,
"global_step": 747,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06449987402368355,
"grad_norm": 0.3072740435600281,
"learning_rate": 1e-05,
"loss": 0.8782,
"step": 16
},
{
"epoch": 0.1289997480473671,
"grad_norm": 0.17370416224002838,
"learning_rate": 2.0666666666666666e-05,
"loss": 0.8425,
"step": 32
},
{
"epoch": 0.1289997480473671,
"eval_loss": 0.7647674679756165,
"eval_runtime": 287.3499,
"eval_samples_per_second": 2.909,
"eval_steps_per_second": 0.727,
"step": 32
},
{
"epoch": 0.19349962207105065,
"grad_norm": 0.08820519596338272,
"learning_rate": 3.1333333333333334e-05,
"loss": 0.7825,
"step": 48
},
{
"epoch": 0.2579994960947342,
"grad_norm": 0.07885795831680298,
"learning_rate": 4.2e-05,
"loss": 0.7261,
"step": 64
},
{
"epoch": 0.2579994960947342,
"eval_loss": 0.6591677665710449,
"eval_runtime": 286.8893,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.729,
"step": 64
},
{
"epoch": 0.3224993701184177,
"grad_norm": 0.07239190489053726,
"learning_rate": 4.999562902281866e-05,
"loss": 0.6921,
"step": 80
},
{
"epoch": 0.3869992441421013,
"grad_norm": 0.07642391324043274,
"learning_rate": 4.989080197352834e-05,
"loss": 0.6559,
"step": 96
},
{
"epoch": 0.3869992441421013,
"eval_loss": 0.598283588886261,
"eval_runtime": 286.906,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 96
},
{
"epoch": 0.4514991181657848,
"grad_norm": 0.08931587636470795,
"learning_rate": 4.96467754629559e-05,
"loss": 0.6412,
"step": 112
},
{
"epoch": 0.5159989921894684,
"grad_norm": 0.08870087563991547,
"learning_rate": 4.9264914186334775e-05,
"loss": 0.6316,
"step": 128
},
{
"epoch": 0.5159989921894684,
"eval_loss": 0.5706672668457031,
"eval_runtime": 286.9187,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 128
},
{
"epoch": 0.5804988662131519,
"grad_norm": 0.10332474857568741,
"learning_rate": 4.874735366682115e-05,
"loss": 0.6045,
"step": 144
},
{
"epoch": 0.6449987402368355,
"grad_norm": 0.10315942764282227,
"learning_rate": 4.8096988312782174e-05,
"loss": 0.6236,
"step": 160
},
{
"epoch": 0.6449987402368355,
"eval_loss": 0.555696964263916,
"eval_runtime": 286.9017,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 160
},
{
"epoch": 0.709498614260519,
"grad_norm": 0.09106426686048508,
"learning_rate": 4.731745523109029e-05,
"loss": 0.6111,
"step": 176
},
{
"epoch": 0.7739984882842026,
"grad_norm": 0.11765453964471817,
"learning_rate": 4.641311388694629e-05,
"loss": 0.6061,
"step": 192
},
{
"epoch": 0.7739984882842026,
"eval_loss": 0.5462843775749207,
"eval_runtime": 286.9865,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 192
},
{
"epoch": 0.8384983623078861,
"grad_norm": 0.1019754558801651,
"learning_rate": 4.538902172398151e-05,
"loss": 0.5923,
"step": 208
},
{
"epoch": 0.9029982363315696,
"grad_norm": 0.13166609406471252,
"learning_rate": 4.4250905880981574e-05,
"loss": 0.593,
"step": 224
},
{
"epoch": 0.9029982363315696,
"eval_loss": 0.5395550727844238,
"eval_runtime": 286.9332,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 224
},
{
"epoch": 0.9674981103552532,
"grad_norm": 0.11405035853385925,
"learning_rate": 4.300513116340317e-05,
"loss": 0.6011,
"step": 240
},
{
"epoch": 1.0282186948853616,
"grad_norm": 0.1464119553565979,
"learning_rate": 4.16586644488001e-05,
"loss": 0.5771,
"step": 256
},
{
"epoch": 1.0282186948853616,
"eval_loss": 0.5375078320503235,
"eval_runtime": 286.961,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 256
},
{
"epoch": 1.092718568909045,
"grad_norm": 0.11681631207466125,
"learning_rate": 4.021903572521802e-05,
"loss": 0.5877,
"step": 272
},
{
"epoch": 1.1572184429327286,
"grad_norm": 0.11041384935379028,
"learning_rate": 3.869429598044679e-05,
"loss": 0.5953,
"step": 288
},
{
"epoch": 1.1572184429327286,
"eval_loss": 0.5316164493560791,
"eval_runtime": 286.9111,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 288
},
{
"epoch": 1.2217183169564123,
"grad_norm": 0.10784497857093811,
"learning_rate": 3.7092972177631e-05,
"loss": 0.5784,
"step": 304
},
{
"epoch": 1.2862181909800958,
"grad_norm": 0.121210016310215,
"learning_rate": 3.542401956903321e-05,
"loss": 0.5735,
"step": 320
},
{
"epoch": 1.2862181909800958,
"eval_loss": 0.5288791060447693,
"eval_runtime": 286.9172,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 320
},
{
"epoch": 1.3507180650037793,
"grad_norm": 0.12248364090919495,
"learning_rate": 3.369677161463068e-05,
"loss": 0.5902,
"step": 336
},
{
"epoch": 1.4152179390274628,
"grad_norm": 0.12172559648752213,
"learning_rate": 3.1920887785621235e-05,
"loss": 0.5752,
"step": 352
},
{
"epoch": 1.4152179390274628,
"eval_loss": 0.5264384150505066,
"eval_runtime": 286.9223,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 352
},
{
"epoch": 1.4797178130511464,
"grad_norm": 0.1209440752863884,
"learning_rate": 3.010629954474201e-05,
"loss": 0.5812,
"step": 368
},
{
"epoch": 1.54421768707483,
"grad_norm": 0.12822891771793365,
"learning_rate": 2.8263154805501297e-05,
"loss": 0.5903,
"step": 384
},
{
"epoch": 1.54421768707483,
"eval_loss": 0.5242487192153931,
"eval_runtime": 286.9295,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 384
},
{
"epoch": 1.6087175610985134,
"grad_norm": 0.11404039710760117,
"learning_rate": 2.6401761180929797e-05,
"loss": 0.5774,
"step": 400
},
{
"epoch": 1.6732174351221971,
"grad_norm": 0.13033507764339447,
"learning_rate": 2.4532528339227452e-05,
"loss": 0.5662,
"step": 416
},
{
"epoch": 1.6732174351221971,
"eval_loss": 0.5225337743759155,
"eval_runtime": 286.941,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 416
},
{
"epoch": 1.7377173091458806,
"grad_norm": 0.1214585080742836,
"learning_rate": 2.2665909788676237e-05,
"loss": 0.5605,
"step": 432
},
{
"epoch": 1.8022171831695641,
"grad_norm": 0.1181873306632042,
"learning_rate": 2.0812344417381595e-05,
"loss": 0.5656,
"step": 448
},
{
"epoch": 1.8022171831695641,
"eval_loss": 0.5209087133407593,
"eval_runtime": 286.9336,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 448
},
{
"epoch": 1.8667170571932477,
"grad_norm": 0.13211221992969513,
"learning_rate": 1.8982198114775682e-05,
"loss": 0.5652,
"step": 464
},
{
"epoch": 1.9312169312169312,
"grad_norm": 0.12655970454216003,
"learning_rate": 1.7185705801358892e-05,
"loss": 0.574,
"step": 480
},
{
"epoch": 1.9312169312169312,
"eval_loss": 0.5198933482170105,
"eval_runtime": 286.946,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 480
},
{
"epoch": 1.995716805240615,
"grad_norm": 0.13482671976089478,
"learning_rate": 1.5432914190872757e-05,
"loss": 0.5746,
"step": 496
},
{
"epoch": 2.056437389770723,
"grad_norm": 0.13910339772701263,
"learning_rate": 1.3733625605001365e-05,
"loss": 0.5692,
"step": 512
},
{
"epoch": 2.056437389770723,
"eval_loss": 0.519320011138916,
"eval_runtime": 286.9501,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 512
},
{
"epoch": 2.120937263794407,
"grad_norm": 0.11855538934469223,
"learning_rate": 1.2097343154812332e-05,
"loss": 0.5617,
"step": 528
},
{
"epoch": 2.18543713781809,
"grad_norm": 0.11745280772447586,
"learning_rate": 1.0533217595504858e-05,
"loss": 0.5656,
"step": 544
},
{
"epoch": 2.18543713781809,
"eval_loss": 0.5182603001594543,
"eval_runtime": 286.9435,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 544
},
{
"epoch": 2.249937011841774,
"grad_norm": 0.10972374677658081,
"learning_rate": 9.049996151674789e-06,
"loss": 0.5678,
"step": 560
},
{
"epoch": 2.314436885865457,
"grad_norm": 0.11525531113147736,
"learning_rate": 7.65597359928646e-06,
"loss": 0.5654,
"step": 576
},
{
"epoch": 2.314436885865457,
"eval_loss": 0.5176821351051331,
"eval_runtime": 286.9603,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 576
},
{
"epoch": 2.378936759889141,
"grad_norm": 0.11849993467330933,
"learning_rate": 6.358945877920861e-06,
"loss": 0.5639,
"step": 592
},
{
"epoch": 2.4434366339128246,
"grad_norm": 0.11330319941043854,
"learning_rate": 5.166166492719124e-06,
"loss": 0.5664,
"step": 608
},
{
"epoch": 2.4434366339128246,
"eval_loss": 0.5173108577728271,
"eval_runtime": 286.9091,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 608
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.11044891923666,
"learning_rate": 4.0843059498395065e-06,
"loss": 0.5657,
"step": 624
},
{
"epoch": 2.5724363819601916,
"grad_norm": 0.10681544989347458,
"learning_rate": 3.119414452281158e-06,
"loss": 0.5714,
"step": 640
},
{
"epoch": 2.5724363819601916,
"eval_loss": 0.5169517993927002,
"eval_runtime": 286.937,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 640
},
{
"epoch": 2.636936255983875,
"grad_norm": 0.11138761043548584,
"learning_rate": 2.2768880646947268e-06,
"loss": 0.5679,
"step": 656
},
{
"epoch": 2.7014361300075587,
"grad_norm": 0.10976061224937439,
"learning_rate": 1.5614385364000228e-06,
"loss": 0.5656,
"step": 672
},
{
"epoch": 2.7014361300075587,
"eval_loss": 0.51683509349823,
"eval_runtime": 286.9027,
"eval_samples_per_second": 2.914,
"eval_steps_per_second": 0.728,
"step": 672
},
{
"epoch": 2.765936004031242,
"grad_norm": 0.10790491104125977,
"learning_rate": 9.770669513725128e-07,
"loss": 0.566,
"step": 688
},
{
"epoch": 2.8304358780549257,
"grad_norm": 0.1116618812084198,
"learning_rate": 5.270413525587909e-07,
"loss": 0.5681,
"step": 704
},
{
"epoch": 2.8304358780549257,
"eval_loss": 0.516765296459198,
"eval_runtime": 286.9412,
"eval_samples_per_second": 2.913,
"eval_steps_per_second": 0.728,
"step": 704
},
{
"epoch": 2.8949357520786094,
"grad_norm": 0.11122512072324753,
"learning_rate": 2.1387846565474045e-07,
"loss": 0.567,
"step": 720
},
{
"epoch": 2.9594356261022927,
"grad_norm": 0.12087109684944153,
"learning_rate": 3.9329624554584884e-08,
"loss": 0.5541,
"step": 736
},
{
"epoch": 2.9594356261022927,
"eval_loss": 0.5167202949523926,
"eval_runtime": 287.1433,
"eval_samples_per_second": 2.911,
"eval_steps_per_second": 0.728,
"step": 736
},
{
"epoch": 3.0,
"step": 747,
"total_flos": 1.0904530297886343e+19,
"train_loss": 0.6036630449205677,
"train_runtime": 44345.4162,
"train_samples_per_second": 1.074,
"train_steps_per_second": 0.017
}
],
"logging_steps": 16,
"max_steps": 747,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0904530297886343e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}