{
  "best_global_step": 736,
  "best_metric": 0.5167202949523926,
  "best_model_checkpoint": "/dss/dssfs05/pn39qo/pn39qo-dss-0001/tong/efficient_reasoning/extraction-vs-summary-efficient-cot-reasoning-perspective---Experiment-main/output/lora/Bespoke_17k_lora/checkpoint-704",
  "epoch": 3.0,
  "eval_steps": 32,
  "global_step": 747,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06449987402368355,
      "grad_norm": 0.3072740435600281,
      "learning_rate": 1e-05,
      "loss": 0.8782,
      "step": 16
    },
    {
      "epoch": 0.1289997480473671,
      "grad_norm": 0.17370416224002838,
      "learning_rate": 2.0666666666666666e-05,
      "loss": 0.8425,
      "step": 32
    },
    {
      "epoch": 0.1289997480473671,
      "eval_loss": 0.7647674679756165,
      "eval_runtime": 287.3499,
      "eval_samples_per_second": 2.909,
      "eval_steps_per_second": 0.727,
      "step": 32
    },
    {
      "epoch": 0.19349962207105065,
      "grad_norm": 0.08820519596338272,
      "learning_rate": 3.1333333333333334e-05,
      "loss": 0.7825,
      "step": 48
    },
    {
      "epoch": 0.2579994960947342,
      "grad_norm": 0.07885795831680298,
      "learning_rate": 4.2e-05,
      "loss": 0.7261,
      "step": 64
    },
    {
      "epoch": 0.2579994960947342,
      "eval_loss": 0.6591677665710449,
      "eval_runtime": 286.8893,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.729,
      "step": 64
    },
    {
      "epoch": 0.3224993701184177,
      "grad_norm": 0.07239190489053726,
      "learning_rate": 4.999562902281866e-05,
      "loss": 0.6921,
      "step": 80
    },
    {
      "epoch": 0.3869992441421013,
      "grad_norm": 0.07642391324043274,
      "learning_rate": 4.989080197352834e-05,
      "loss": 0.6559,
      "step": 96
    },
    {
      "epoch": 0.3869992441421013,
      "eval_loss": 0.598283588886261,
      "eval_runtime": 286.906,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 96
    },
    {
      "epoch": 0.4514991181657848,
      "grad_norm": 0.08931587636470795,
      "learning_rate": 4.96467754629559e-05,
      "loss": 0.6412,
      "step": 112
    },
    {
      "epoch": 0.5159989921894684,
      "grad_norm": 0.08870087563991547,
      "learning_rate": 4.9264914186334775e-05,
      "loss": 0.6316,
      "step": 128
    },
    {
      "epoch": 0.5159989921894684,
      "eval_loss": 0.5706672668457031,
      "eval_runtime": 286.9187,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 128
    },
    {
      "epoch": 0.5804988662131519,
      "grad_norm": 0.10332474857568741,
      "learning_rate": 4.874735366682115e-05,
      "loss": 0.6045,
      "step": 144
    },
    {
      "epoch": 0.6449987402368355,
      "grad_norm": 0.10315942764282227,
      "learning_rate": 4.8096988312782174e-05,
      "loss": 0.6236,
      "step": 160
    },
    {
      "epoch": 0.6449987402368355,
      "eval_loss": 0.555696964263916,
      "eval_runtime": 286.9017,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 160
    },
    {
      "epoch": 0.709498614260519,
      "grad_norm": 0.09106426686048508,
      "learning_rate": 4.731745523109029e-05,
      "loss": 0.6111,
      "step": 176
    },
    {
      "epoch": 0.7739984882842026,
      "grad_norm": 0.11765453964471817,
      "learning_rate": 4.641311388694629e-05,
      "loss": 0.6061,
      "step": 192
    },
    {
      "epoch": 0.7739984882842026,
      "eval_loss": 0.5462843775749207,
      "eval_runtime": 286.9865,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 192
    },
    {
      "epoch": 0.8384983623078861,
      "grad_norm": 0.1019754558801651,
      "learning_rate": 4.538902172398151e-05,
      "loss": 0.5923,
      "step": 208
    },
    {
      "epoch": 0.9029982363315696,
      "grad_norm": 0.13166609406471252,
      "learning_rate": 4.4250905880981574e-05,
      "loss": 0.593,
      "step": 224
    },
    {
      "epoch": 0.9029982363315696,
      "eval_loss": 0.5395550727844238,
      "eval_runtime": 286.9332,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 224
    },
    {
      "epoch": 0.9674981103552532,
      "grad_norm": 0.11405035853385925,
      "learning_rate": 4.300513116340317e-05,
      "loss": 0.6011,
      "step": 240
    },
    {
      "epoch": 1.0282186948853616,
      "grad_norm": 0.1464119553565979,
      "learning_rate": 4.16586644488001e-05,
      "loss": 0.5771,
      "step": 256
    },
    {
      "epoch": 1.0282186948853616,
      "eval_loss": 0.5375078320503235,
      "eval_runtime": 286.961,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 256
    },
    {
      "epoch": 1.092718568909045,
      "grad_norm": 0.11681631207466125,
      "learning_rate": 4.021903572521802e-05,
      "loss": 0.5877,
      "step": 272
    },
    {
      "epoch": 1.1572184429327286,
      "grad_norm": 0.11041384935379028,
      "learning_rate": 3.869429598044679e-05,
      "loss": 0.5953,
      "step": 288
    },
    {
      "epoch": 1.1572184429327286,
      "eval_loss": 0.5316164493560791,
      "eval_runtime": 286.9111,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 288
    },
    {
      "epoch": 1.2217183169564123,
      "grad_norm": 0.10784497857093811,
      "learning_rate": 3.7092972177631e-05,
      "loss": 0.5784,
      "step": 304
    },
    {
      "epoch": 1.2862181909800958,
      "grad_norm": 0.121210016310215,
      "learning_rate": 3.542401956903321e-05,
      "loss": 0.5735,
      "step": 320
    },
    {
      "epoch": 1.2862181909800958,
      "eval_loss": 0.5288791060447693,
      "eval_runtime": 286.9172,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 320
    },
    {
      "epoch": 1.3507180650037793,
      "grad_norm": 0.12248364090919495,
      "learning_rate": 3.369677161463068e-05,
      "loss": 0.5902,
      "step": 336
    },
    {
      "epoch": 1.4152179390274628,
      "grad_norm": 0.12172559648752213,
      "learning_rate": 3.1920887785621235e-05,
      "loss": 0.5752,
      "step": 352
    },
    {
      "epoch": 1.4152179390274628,
      "eval_loss": 0.5264384150505066,
      "eval_runtime": 286.9223,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 352
    },
    {
      "epoch": 1.4797178130511464,
      "grad_norm": 0.1209440752863884,
      "learning_rate": 3.010629954474201e-05,
      "loss": 0.5812,
      "step": 368
    },
    {
      "epoch": 1.54421768707483,
      "grad_norm": 0.12822891771793365,
      "learning_rate": 2.8263154805501297e-05,
      "loss": 0.5903,
      "step": 384
    },
    {
      "epoch": 1.54421768707483,
      "eval_loss": 0.5242487192153931,
      "eval_runtime": 286.9295,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 384
    },
    {
      "epoch": 1.6087175610985134,
      "grad_norm": 0.11404039710760117,
      "learning_rate": 2.6401761180929797e-05,
      "loss": 0.5774,
      "step": 400
    },
    {
      "epoch": 1.6732174351221971,
      "grad_norm": 0.13033507764339447,
      "learning_rate": 2.4532528339227452e-05,
      "loss": 0.5662,
      "step": 416
    },
    {
      "epoch": 1.6732174351221971,
      "eval_loss": 0.5225337743759155,
      "eval_runtime": 286.941,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 416
    },
    {
      "epoch": 1.7377173091458806,
      "grad_norm": 0.1214585080742836,
      "learning_rate": 2.2665909788676237e-05,
      "loss": 0.5605,
      "step": 432
    },
    {
      "epoch": 1.8022171831695641,
      "grad_norm": 0.1181873306632042,
      "learning_rate": 2.0812344417381595e-05,
      "loss": 0.5656,
      "step": 448
    },
    {
      "epoch": 1.8022171831695641,
      "eval_loss": 0.5209087133407593,
      "eval_runtime": 286.9336,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 448
    },
    {
      "epoch": 1.8667170571932477,
      "grad_norm": 0.13211221992969513,
      "learning_rate": 1.8982198114775682e-05,
      "loss": 0.5652,
      "step": 464
    },
    {
      "epoch": 1.9312169312169312,
      "grad_norm": 0.12655970454216003,
      "learning_rate": 1.7185705801358892e-05,
      "loss": 0.574,
      "step": 480
    },
    {
      "epoch": 1.9312169312169312,
      "eval_loss": 0.5198933482170105,
      "eval_runtime": 286.946,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 480
    },
    {
      "epoch": 1.995716805240615,
      "grad_norm": 0.13482671976089478,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 0.5746,
      "step": 496
    },
    {
      "epoch": 2.056437389770723,
      "grad_norm": 0.13910339772701263,
      "learning_rate": 1.3733625605001365e-05,
      "loss": 0.5692,
      "step": 512
    },
    {
      "epoch": 2.056437389770723,
      "eval_loss": 0.519320011138916,
      "eval_runtime": 286.9501,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 512
    },
    {
      "epoch": 2.120937263794407,
      "grad_norm": 0.11855538934469223,
      "learning_rate": 1.2097343154812332e-05,
      "loss": 0.5617,
      "step": 528
    },
    {
      "epoch": 2.18543713781809,
      "grad_norm": 0.11745280772447586,
      "learning_rate": 1.0533217595504858e-05,
      "loss": 0.5656,
      "step": 544
    },
    {
      "epoch": 2.18543713781809,
      "eval_loss": 0.5182603001594543,
      "eval_runtime": 286.9435,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 544
    },
    {
      "epoch": 2.249937011841774,
      "grad_norm": 0.10972374677658081,
      "learning_rate": 9.049996151674789e-06,
      "loss": 0.5678,
      "step": 560
    },
    {
      "epoch": 2.314436885865457,
      "grad_norm": 0.11525531113147736,
      "learning_rate": 7.65597359928646e-06,
      "loss": 0.5654,
      "step": 576
    },
    {
      "epoch": 2.314436885865457,
      "eval_loss": 0.5176821351051331,
      "eval_runtime": 286.9603,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 576
    },
    {
      "epoch": 2.378936759889141,
      "grad_norm": 0.11849993467330933,
      "learning_rate": 6.358945877920861e-06,
      "loss": 0.5639,
      "step": 592
    },
    {
      "epoch": 2.4434366339128246,
      "grad_norm": 0.11330319941043854,
      "learning_rate": 5.166166492719124e-06,
      "loss": 0.5664,
      "step": 608
    },
    {
      "epoch": 2.4434366339128246,
      "eval_loss": 0.5173108577728271,
      "eval_runtime": 286.9091,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 608
    },
    {
      "epoch": 2.507936507936508,
      "grad_norm": 0.11044891923666,
      "learning_rate": 4.0843059498395065e-06,
      "loss": 0.5657,
      "step": 624
    },
    {
      "epoch": 2.5724363819601916,
      "grad_norm": 0.10681544989347458,
      "learning_rate": 3.119414452281158e-06,
      "loss": 0.5714,
      "step": 640
    },
    {
      "epoch": 2.5724363819601916,
      "eval_loss": 0.5169517993927002,
      "eval_runtime": 286.937,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 640
    },
    {
      "epoch": 2.636936255983875,
      "grad_norm": 0.11138761043548584,
      "learning_rate": 2.2768880646947268e-06,
      "loss": 0.5679,
      "step": 656
    },
    {
      "epoch": 2.7014361300075587,
      "grad_norm": 0.10976061224937439,
      "learning_rate": 1.5614385364000228e-06,
      "loss": 0.5656,
      "step": 672
    },
    {
      "epoch": 2.7014361300075587,
      "eval_loss": 0.51683509349823,
      "eval_runtime": 286.9027,
      "eval_samples_per_second": 2.914,
      "eval_steps_per_second": 0.728,
      "step": 672
    },
    {
      "epoch": 2.765936004031242,
      "grad_norm": 0.10790491104125977,
      "learning_rate": 9.770669513725128e-07,
      "loss": 0.566,
      "step": 688
    },
    {
      "epoch": 2.8304358780549257,
      "grad_norm": 0.1116618812084198,
      "learning_rate": 5.270413525587909e-07,
      "loss": 0.5681,
      "step": 704
    },
    {
      "epoch": 2.8304358780549257,
      "eval_loss": 0.516765296459198,
      "eval_runtime": 286.9412,
      "eval_samples_per_second": 2.913,
      "eval_steps_per_second": 0.728,
      "step": 704
    },
    {
      "epoch": 2.8949357520786094,
      "grad_norm": 0.11122512072324753,
      "learning_rate": 2.1387846565474045e-07,
      "loss": 0.567,
      "step": 720
    },
    {
      "epoch": 2.9594356261022927,
      "grad_norm": 0.12087109684944153,
      "learning_rate": 3.9329624554584884e-08,
      "loss": 0.5541,
      "step": 736
    },
    {
      "epoch": 2.9594356261022927,
      "eval_loss": 0.5167202949523926,
      "eval_runtime": 287.1433,
      "eval_samples_per_second": 2.911,
      "eval_steps_per_second": 0.728,
      "step": 736
    },
    {
      "epoch": 3.0,
      "step": 747,
      "total_flos": 1.0904530297886343e+19,
      "train_loss": 0.6036630449205677,
      "train_runtime": 44345.4162,
      "train_samples_per_second": 1.074,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 16,
  "max_steps": 747,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 64,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0904530297886343e+19,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}