{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 24591, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0609979260705136, "grad_norm": 2.1482937335968018, "learning_rate": 4.8985401163027127e-05, "loss": 5.1096, "step": 500 }, { "epoch": 0.1219958521410272, "grad_norm": 1.8602643013000488, "learning_rate": 4.79687690618519e-05, "loss": 4.2904, "step": 1000 }, { "epoch": 0.1829937782115408, "grad_norm": 2.2239928245544434, "learning_rate": 4.6952136960676673e-05, "loss": 3.9507, "step": 1500 }, { "epoch": 0.2439917042820544, "grad_norm": 1.9908227920532227, "learning_rate": 4.593550485950145e-05, "loss": 3.7767, "step": 2000 }, { "epoch": 0.304989630352568, "grad_norm": 2.2968549728393555, "learning_rate": 4.491887275832622e-05, "loss": 3.6462, "step": 2500 }, { "epoch": 0.3659875564230816, "grad_norm": 1.7772067785263062, "learning_rate": 4.3902240657150994e-05, "loss": 3.5216, "step": 3000 }, { "epoch": 0.4269854824935952, "grad_norm": 1.9565205574035645, "learning_rate": 4.288560855597577e-05, "loss": 3.4266, "step": 3500 }, { "epoch": 0.4879834085641088, "grad_norm": 2.3488683700561523, "learning_rate": 4.186897645480054e-05, "loss": 3.348, "step": 4000 }, { "epoch": 0.5489813346346224, "grad_norm": 2.3590667247772217, "learning_rate": 4.085234435362531e-05, "loss": 3.2803, "step": 4500 }, { "epoch": 0.609979260705136, "grad_norm": 1.8940634727478027, "learning_rate": 3.983571225245009e-05, "loss": 3.2199, "step": 5000 }, { "epoch": 0.6709771867756497, "grad_norm": 2.2232987880706787, "learning_rate": 3.8819080151274854e-05, "loss": 3.1632, "step": 5500 }, { "epoch": 0.7319751128461632, "grad_norm": 1.9332739114761353, "learning_rate": 3.7802448050099634e-05, "loss": 3.1074, "step": 6000 }, { "epoch": 0.7929730389166768, "grad_norm": 1.869598150253296, "learning_rate": 3.678581594892441e-05, "loss": 3.0732, "step": 6500 }, { "epoch": 0.8539709649871904, "grad_norm": 1.6995704174041748, "learning_rate": 3.5769183847749174e-05, "loss": 3.0224, "step": 7000 }, { "epoch": 0.9149688910577041, "grad_norm": 2.039504051208496, "learning_rate": 3.4752551746573955e-05, "loss": 2.9645, "step": 7500 }, { "epoch": 0.9759668171282176, "grad_norm": 1.8474594354629517, "learning_rate": 3.373591964539872e-05, "loss": 2.9575, "step": 8000 }, { "epoch": 1.0369647431987312, "grad_norm": 1.9620344638824463, "learning_rate": 3.2719287544223495e-05, "loss": 2.8944, "step": 8500 }, { "epoch": 1.0979626692692448, "grad_norm": 1.8038884401321411, "learning_rate": 3.1702655443048275e-05, "loss": 2.877, "step": 9000 }, { "epoch": 1.1589605953397584, "grad_norm": 1.9445528984069824, "learning_rate": 3.068602334187304e-05, "loss": 2.8539, "step": 9500 }, { "epoch": 1.219958521410272, "grad_norm": 1.7721298933029175, "learning_rate": 2.966939124069782e-05, "loss": 2.8176, "step": 10000 }, { "epoch": 1.2809564474807855, "grad_norm": 2.3092963695526123, "learning_rate": 2.865275913952259e-05, "loss": 2.7872, "step": 10500 }, { "epoch": 1.3419543735512993, "grad_norm": 2.0396852493286133, "learning_rate": 2.7636127038347365e-05, "loss": 2.7764, "step": 11000 }, { "epoch": 1.402952299621813, "grad_norm": 1.755911946296692, "learning_rate": 2.6619494937172135e-05, "loss": 2.7607, "step": 11500 }, { "epoch": 1.4639502256923265, "grad_norm": 2.1026928424835205, "learning_rate": 2.560286283599691e-05, "loss": 2.7469, "step": 12000 }, { "epoch": 1.52494815176284, "grad_norm": 1.655788540840149, "learning_rate": 2.4586230734821682e-05, "loss": 2.7264, "step": 12500 }, { "epoch": 1.5859460778333536, "grad_norm": 2.534583568572998, "learning_rate": 2.3569598633646456e-05, "loss": 2.7102, "step": 13000 }, { "epoch": 1.6469440039038674, "grad_norm": 1.9951140880584717, "learning_rate": 2.2552966532471232e-05, "loss": 2.7136, "step": 13500 }, { "epoch": 1.707941929974381, "grad_norm": 2.0365281105041504, "learning_rate": 2.1536334431296002e-05, "loss": 2.6968, "step": 14000 }, { "epoch": 1.7689398560448946, "grad_norm": 1.7324215173721313, "learning_rate": 2.0519702330120776e-05, "loss": 2.6767, "step": 14500 }, { "epoch": 1.8299377821154081, "grad_norm": 1.9328315258026123, "learning_rate": 1.950307022894555e-05, "loss": 2.6868, "step": 15000 }, { "epoch": 1.8909357081859217, "grad_norm": 1.9608515501022339, "learning_rate": 1.8486438127770323e-05, "loss": 2.6585, "step": 15500 }, { "epoch": 1.9519336342564353, "grad_norm": 2.448692560195923, "learning_rate": 1.7469806026595096e-05, "loss": 2.6456, "step": 16000 }, { "epoch": 2.012931560326949, "grad_norm": 2.166044235229492, "learning_rate": 1.645317392541987e-05, "loss": 2.6379, "step": 16500 }, { "epoch": 2.0739294863974624, "grad_norm": 2.6067957878112793, "learning_rate": 1.5436541824244643e-05, "loss": 2.6017, "step": 17000 }, { "epoch": 2.134927412467976, "grad_norm": 2.173795700073242, "learning_rate": 1.4419909723069416e-05, "loss": 2.6037, "step": 17500 }, { "epoch": 2.1959253385384896, "grad_norm": 2.3912558555603027, "learning_rate": 1.340327762189419e-05, "loss": 2.5863, "step": 18000 }, { "epoch": 2.256923264609003, "grad_norm": 2.6069979667663574, "learning_rate": 1.2386645520718963e-05, "loss": 2.5806, "step": 18500 }, { "epoch": 2.3179211906795167, "grad_norm": 2.0667498111724854, "learning_rate": 1.1370013419543737e-05, "loss": 2.5738, "step": 19000 }, { "epoch": 2.3789191167500303, "grad_norm": 2.186269760131836, "learning_rate": 1.0353381318368509e-05, "loss": 2.5831, "step": 19500 }, { "epoch": 2.439917042820544, "grad_norm": 1.7841541767120361, "learning_rate": 9.336749217193284e-06, "loss": 2.561, "step": 20000 }, { "epoch": 2.500914968891058, "grad_norm": 2.1247878074645996, "learning_rate": 8.320117116018055e-06, "loss": 2.5534, "step": 20500 }, { "epoch": 2.561912894961571, "grad_norm": 1.7261439561843872, "learning_rate": 7.303485014842829e-06, "loss": 2.5288, "step": 21000 }, { "epoch": 2.622910821032085, "grad_norm": 1.6930060386657715, "learning_rate": 6.286852913667603e-06, "loss": 2.5449, "step": 21500 }, { "epoch": 2.6839087471025986, "grad_norm": 1.9172097444534302, "learning_rate": 5.270220812492376e-06, "loss": 2.5482, "step": 22000 }, { "epoch": 2.744906673173112, "grad_norm": 1.56685209274292, "learning_rate": 4.253588711317149e-06, "loss": 2.5493, "step": 22500 }, { "epoch": 2.805904599243626, "grad_norm": 2.165651321411133, "learning_rate": 3.2369566101419217e-06, "loss": 2.5481, "step": 23000 }, { "epoch": 2.8669025253141394, "grad_norm": 2.2414863109588623, "learning_rate": 2.220324508966695e-06, "loss": 2.5216, "step": 23500 }, { "epoch": 2.927900451384653, "grad_norm": 2.2040302753448486, "learning_rate": 1.2036924077914685e-06, "loss": 2.5239, "step": 24000 }, { "epoch": 2.9888983774551665, "grad_norm": 2.3587329387664795, "learning_rate": 1.8706030661624173e-07, "loss": 2.5358, "step": 24500 }, { "epoch": 3.0, "step": 24591, "total_flos": 4.51806634222551e+16, "train_loss": 2.926649949844495, "train_runtime": 7468.4511, "train_samples_per_second": 26.34, "train_steps_per_second": 3.293 } ], "logging_steps": 500, "max_steps": 24591, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.51806634222551e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }