{ "best_global_step": 736, "best_metric": 0.5167202949523926, "best_model_checkpoint": "/dss/dssfs05/pn39qo/pn39qo-dss-0001/tong/efficient_reasoning/extraction-vs-summary-efficient-cot-reasoning-perspective---Experiment-main/output/lora/Bespoke_17k_lora/checkpoint-704", "epoch": 3.0, "eval_steps": 32, "global_step": 747, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06449987402368355, "grad_norm": 0.3072740435600281, "learning_rate": 1e-05, "loss": 0.8782, "step": 16 }, { "epoch": 0.1289997480473671, "grad_norm": 0.17370416224002838, "learning_rate": 2.0666666666666666e-05, "loss": 0.8425, "step": 32 }, { "epoch": 0.1289997480473671, "eval_loss": 0.7647674679756165, "eval_runtime": 287.3499, "eval_samples_per_second": 2.909, "eval_steps_per_second": 0.727, "step": 32 }, { "epoch": 0.19349962207105065, "grad_norm": 0.08820519596338272, "learning_rate": 3.1333333333333334e-05, "loss": 0.7825, "step": 48 }, { "epoch": 0.2579994960947342, "grad_norm": 0.07885795831680298, "learning_rate": 4.2e-05, "loss": 0.7261, "step": 64 }, { "epoch": 0.2579994960947342, "eval_loss": 0.6591677665710449, "eval_runtime": 286.8893, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 64 }, { "epoch": 0.3224993701184177, "grad_norm": 0.07239190489053726, "learning_rate": 4.999562902281866e-05, "loss": 0.6921, "step": 80 }, { "epoch": 0.3869992441421013, "grad_norm": 0.07642391324043274, "learning_rate": 4.989080197352834e-05, "loss": 0.6559, "step": 96 }, { "epoch": 0.3869992441421013, "eval_loss": 0.598283588886261, "eval_runtime": 286.906, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 96 }, { "epoch": 0.4514991181657848, "grad_norm": 0.08931587636470795, "learning_rate": 4.96467754629559e-05, "loss": 0.6412, "step": 112 }, { "epoch": 0.5159989921894684, "grad_norm": 0.08870087563991547, "learning_rate": 4.9264914186334775e-05, "loss": 0.6316, "step": 128 }, { "epoch": 0.5159989921894684, "eval_loss": 0.5706672668457031, "eval_runtime": 286.9187, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 128 }, { "epoch": 0.5804988662131519, "grad_norm": 0.10332474857568741, "learning_rate": 4.874735366682115e-05, "loss": 0.6045, "step": 144 }, { "epoch": 0.6449987402368355, "grad_norm": 0.10315942764282227, "learning_rate": 4.8096988312782174e-05, "loss": 0.6236, "step": 160 }, { "epoch": 0.6449987402368355, "eval_loss": 0.555696964263916, "eval_runtime": 286.9017, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 160 }, { "epoch": 0.709498614260519, "grad_norm": 0.09106426686048508, "learning_rate": 4.731745523109029e-05, "loss": 0.6111, "step": 176 }, { "epoch": 0.7739984882842026, "grad_norm": 0.11765453964471817, "learning_rate": 4.641311388694629e-05, "loss": 0.6061, "step": 192 }, { "epoch": 0.7739984882842026, "eval_loss": 0.5462843775749207, "eval_runtime": 286.9865, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 192 }, { "epoch": 0.8384983623078861, "grad_norm": 0.1019754558801651, "learning_rate": 4.538902172398151e-05, "loss": 0.5923, "step": 208 }, { "epoch": 0.9029982363315696, "grad_norm": 0.13166609406471252, "learning_rate": 4.4250905880981574e-05, "loss": 0.593, "step": 224 }, { "epoch": 0.9029982363315696, "eval_loss": 0.5395550727844238, "eval_runtime": 286.9332, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 224 }, { "epoch": 0.9674981103552532, "grad_norm": 0.11405035853385925, "learning_rate": 4.300513116340317e-05, "loss": 0.6011, "step": 240 }, { "epoch": 1.0282186948853616, "grad_norm": 0.1464119553565979, "learning_rate": 4.16586644488001e-05, "loss": 0.5771, "step": 256 }, { "epoch": 1.0282186948853616, "eval_loss": 0.5375078320503235, "eval_runtime": 286.961, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 256 }, { "epoch": 1.092718568909045, "grad_norm": 0.11681631207466125, "learning_rate": 4.021903572521802e-05, "loss": 0.5877, "step": 272 }, { "epoch": 1.1572184429327286, "grad_norm": 0.11041384935379028, "learning_rate": 3.869429598044679e-05, "loss": 0.5953, "step": 288 }, { "epoch": 1.1572184429327286, "eval_loss": 0.5316164493560791, "eval_runtime": 286.9111, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 288 }, { "epoch": 1.2217183169564123, "grad_norm": 0.10784497857093811, "learning_rate": 3.7092972177631e-05, "loss": 0.5784, "step": 304 }, { "epoch": 1.2862181909800958, "grad_norm": 0.121210016310215, "learning_rate": 3.542401956903321e-05, "loss": 0.5735, "step": 320 }, { "epoch": 1.2862181909800958, "eval_loss": 0.5288791060447693, "eval_runtime": 286.9172, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 320 }, { "epoch": 1.3507180650037793, "grad_norm": 0.12248364090919495, "learning_rate": 3.369677161463068e-05, "loss": 0.5902, "step": 336 }, { "epoch": 1.4152179390274628, "grad_norm": 0.12172559648752213, "learning_rate": 3.1920887785621235e-05, "loss": 0.5752, "step": 352 }, { "epoch": 1.4152179390274628, "eval_loss": 0.5264384150505066, "eval_runtime": 286.9223, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 352 }, { "epoch": 1.4797178130511464, "grad_norm": 0.1209440752863884, "learning_rate": 3.010629954474201e-05, "loss": 0.5812, "step": 368 }, { "epoch": 1.54421768707483, "grad_norm": 0.12822891771793365, "learning_rate": 2.8263154805501297e-05, "loss": 0.5903, "step": 384 }, { "epoch": 1.54421768707483, "eval_loss": 0.5242487192153931, "eval_runtime": 286.9295, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 384 }, { "epoch": 1.6087175610985134, "grad_norm": 0.11404039710760117, "learning_rate": 2.6401761180929797e-05, "loss": 0.5774, "step": 400 }, { "epoch": 1.6732174351221971, "grad_norm": 0.13033507764339447, "learning_rate": 2.4532528339227452e-05, "loss": 0.5662, "step": 416 }, { "epoch": 1.6732174351221971, "eval_loss": 0.5225337743759155, "eval_runtime": 286.941, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 416 }, { "epoch": 1.7377173091458806, "grad_norm": 0.1214585080742836, "learning_rate": 2.2665909788676237e-05, "loss": 0.5605, "step": 432 }, { "epoch": 1.8022171831695641, "grad_norm": 0.1181873306632042, "learning_rate": 2.0812344417381595e-05, "loss": 0.5656, "step": 448 }, { "epoch": 1.8022171831695641, "eval_loss": 0.5209087133407593, "eval_runtime": 286.9336, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 448 }, { "epoch": 1.8667170571932477, "grad_norm": 0.13211221992969513, "learning_rate": 1.8982198114775682e-05, "loss": 0.5652, "step": 464 }, { "epoch": 1.9312169312169312, "grad_norm": 0.12655970454216003, "learning_rate": 1.7185705801358892e-05, "loss": 0.574, "step": 480 }, { "epoch": 1.9312169312169312, "eval_loss": 0.5198933482170105, "eval_runtime": 286.946, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 480 }, { "epoch": 1.995716805240615, "grad_norm": 0.13482671976089478, "learning_rate": 1.5432914190872757e-05, "loss": 0.5746, "step": 496 }, { "epoch": 2.056437389770723, "grad_norm": 0.13910339772701263, "learning_rate": 1.3733625605001365e-05, "loss": 0.5692, "step": 512 }, { "epoch": 2.056437389770723, "eval_loss": 0.519320011138916, "eval_runtime": 286.9501, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 512 }, { "epoch": 2.120937263794407, "grad_norm": 0.11855538934469223, "learning_rate": 1.2097343154812332e-05, "loss": 0.5617, "step": 528 }, { "epoch": 2.18543713781809, "grad_norm": 0.11745280772447586, "learning_rate": 1.0533217595504858e-05, "loss": 0.5656, "step": 544 }, { "epoch": 2.18543713781809, "eval_loss": 0.5182603001594543, "eval_runtime": 286.9435, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 544 }, { "epoch": 2.249937011841774, "grad_norm": 0.10972374677658081, "learning_rate": 9.049996151674789e-06, "loss": 0.5678, "step": 560 }, { "epoch": 2.314436885865457, "grad_norm": 0.11525531113147736, "learning_rate": 7.65597359928646e-06, "loss": 0.5654, "step": 576 }, { "epoch": 2.314436885865457, "eval_loss": 0.5176821351051331, "eval_runtime": 286.9603, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 576 }, { "epoch": 2.378936759889141, "grad_norm": 0.11849993467330933, "learning_rate": 6.358945877920861e-06, "loss": 0.5639, "step": 592 }, { "epoch": 2.4434366339128246, "grad_norm": 0.11330319941043854, "learning_rate": 5.166166492719124e-06, "loss": 0.5664, "step": 608 }, { "epoch": 2.4434366339128246, "eval_loss": 0.5173108577728271, "eval_runtime": 286.9091, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 608 }, { "epoch": 2.507936507936508, "grad_norm": 0.11044891923666, "learning_rate": 4.0843059498395065e-06, "loss": 0.5657, "step": 624 }, { "epoch": 2.5724363819601916, "grad_norm": 0.10681544989347458, "learning_rate": 3.119414452281158e-06, "loss": 0.5714, "step": 640 }, { "epoch": 2.5724363819601916, "eval_loss": 0.5169517993927002, "eval_runtime": 286.937, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 640 }, { "epoch": 2.636936255983875, "grad_norm": 0.11138761043548584, "learning_rate": 2.2768880646947268e-06, "loss": 0.5679, "step": 656 }, { "epoch": 2.7014361300075587, "grad_norm": 0.10976061224937439, "learning_rate": 1.5614385364000228e-06, "loss": 0.5656, "step": 672 }, { "epoch": 2.7014361300075587, "eval_loss": 0.51683509349823, "eval_runtime": 286.9027, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 672 }, { "epoch": 2.765936004031242, "grad_norm": 0.10790491104125977, "learning_rate": 9.770669513725128e-07, "loss": 0.566, "step": 688 }, { "epoch": 2.8304358780549257, "grad_norm": 0.1116618812084198, "learning_rate": 5.270413525587909e-07, "loss": 0.5681, "step": 704 }, { "epoch": 2.8304358780549257, "eval_loss": 0.516765296459198, "eval_runtime": 286.9412, "eval_samples_per_second": 2.913, "eval_steps_per_second": 0.728, "step": 704 }, { "epoch": 2.8949357520786094, "grad_norm": 0.11122512072324753, "learning_rate": 2.1387846565474045e-07, "loss": 0.567, "step": 720 }, { "epoch": 2.9594356261022927, "grad_norm": 0.12087109684944153, "learning_rate": 3.9329624554584884e-08, "loss": 0.5541, "step": 736 }, { "epoch": 2.9594356261022927, "eval_loss": 0.5167202949523926, "eval_runtime": 287.1433, "eval_samples_per_second": 2.911, "eval_steps_per_second": 0.728, "step": 736 }, { "epoch": 3.0, "step": 747, "total_flos": 1.0904530297886343e+19, "train_loss": 0.6036630449205677, "train_runtime": 44345.4162, "train_samples_per_second": 1.074, "train_steps_per_second": 0.017 } ], "logging_steps": 16, "max_steps": 747, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 64, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0904530297886343e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }