{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 50,
  "global_step": 477,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 1.0740249156951904,
      "learning_rate": 7.6e-05,
      "loss": 2.0357,
      "step": 20
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.6201758980751038,
      "learning_rate": 0.00015600000000000002,
      "loss": 0.9997,
      "step": 40
    },
    {
      "epoch": 0.31496062992125984,
      "eval_loss": 0.8496084213256836,
      "eval_runtime": 43.5003,
      "eval_samples_per_second": 14.598,
      "eval_steps_per_second": 7.31,
      "step": 50
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 0.5872485637664795,
      "learning_rate": 0.00019578454332552694,
      "loss": 0.8539,
      "step": 60
    },
    {
      "epoch": 0.5039370078740157,
      "grad_norm": 0.5724177956581116,
      "learning_rate": 0.0001864168618266979,
      "loss": 0.7978,
      "step": 80
    },
    {
      "epoch": 0.6299212598425197,
      "grad_norm": 0.5841088891029358,
      "learning_rate": 0.00017704918032786885,
      "loss": 0.7641,
      "step": 100
    },
    {
      "epoch": 0.6299212598425197,
      "eval_loss": 0.7325109839439392,
      "eval_runtime": 43.098,
      "eval_samples_per_second": 14.734,
      "eval_steps_per_second": 7.379,
      "step": 100
    },
    {
      "epoch": 0.7559055118110236,
      "grad_norm": 0.5532050728797913,
      "learning_rate": 0.00016768149882903982,
      "loss": 0.7371,
      "step": 120
    },
    {
      "epoch": 0.8818897637795275,
      "grad_norm": 0.5918635725975037,
      "learning_rate": 0.00015831381733021077,
      "loss": 0.7189,
      "step": 140
    },
    {
      "epoch": 0.9448818897637795,
      "eval_loss": 0.6780820488929749,
      "eval_runtime": 42.7868,
      "eval_samples_per_second": 14.841,
      "eval_steps_per_second": 7.432,
      "step": 150
    },
    {
      "epoch": 1.0062992125984251,
      "grad_norm": 0.5814059972763062,
      "learning_rate": 0.00014894613583138174,
      "loss": 0.6968,
      "step": 160
    },
    {
      "epoch": 1.132283464566929,
      "grad_norm": 0.6082066297531128,
      "learning_rate": 0.0001395784543325527,
      "loss": 0.589,
      "step": 180
    },
    {
      "epoch": 1.258267716535433,
      "grad_norm": 0.6297890543937683,
      "learning_rate": 0.00013021077283372365,
      "loss": 0.5802,
      "step": 200
    },
    {
      "epoch": 1.258267716535433,
      "eval_loss": 0.6524496674537659,
      "eval_runtime": 42.7436,
      "eval_samples_per_second": 14.856,
      "eval_steps_per_second": 7.44,
      "step": 200
    },
    {
      "epoch": 1.384251968503937,
      "grad_norm": 0.6982228755950928,
      "learning_rate": 0.00012084309133489463,
      "loss": 0.5729,
      "step": 220
    },
    {
      "epoch": 1.510236220472441,
      "grad_norm": 0.6589049696922302,
      "learning_rate": 0.00011147540983606557,
      "loss": 0.5635,
      "step": 240
    },
    {
      "epoch": 1.573228346456693,
      "eval_loss": 0.6260292530059814,
      "eval_runtime": 42.9705,
      "eval_samples_per_second": 14.778,
      "eval_steps_per_second": 7.4,
      "step": 250
    },
    {
      "epoch": 1.6362204724409448,
      "grad_norm": 0.7309821844100952,
      "learning_rate": 0.00010210772833723654,
      "loss": 0.5753,
      "step": 260
    },
    {
      "epoch": 1.762204724409449,
      "grad_norm": 0.6762523055076599,
      "learning_rate": 9.27400468384075e-05,
      "loss": 0.5653,
      "step": 280
    },
    {
      "epoch": 1.8881889763779527,
      "grad_norm": 0.6731918454170227,
      "learning_rate": 8.337236533957846e-05,
      "loss": 0.5458,
      "step": 300
    },
    {
      "epoch": 1.8881889763779527,
      "eval_loss": 0.6061463952064514,
      "eval_runtime": 43.3473,
      "eval_samples_per_second": 14.649,
      "eval_steps_per_second": 7.336,
      "step": 300
    },
    {
      "epoch": 2.0125984251968503,
      "grad_norm": 0.659310519695282,
      "learning_rate": 7.400468384074943e-05,
      "loss": 0.5314,
      "step": 320
    },
    {
      "epoch": 2.1385826771653544,
      "grad_norm": 0.7076399326324463,
      "learning_rate": 6.463700234192038e-05,
      "loss": 0.4378,
      "step": 340
    },
    {
      "epoch": 2.2015748031496063,
      "eval_loss": 0.6155605912208557,
      "eval_runtime": 43.0746,
      "eval_samples_per_second": 14.742,
      "eval_steps_per_second": 7.383,
      "step": 350
    },
    {
      "epoch": 2.264566929133858,
      "grad_norm": 0.7343379259109497,
      "learning_rate": 5.5269320843091335e-05,
      "loss": 0.4302,
      "step": 360
    },
    {
      "epoch": 2.3905511811023623,
      "grad_norm": 0.7805226445198059,
      "learning_rate": 4.59016393442623e-05,
      "loss": 0.4258,
      "step": 380
    },
    {
      "epoch": 2.516535433070866,
      "grad_norm": 0.7747156620025635,
      "learning_rate": 3.6533957845433256e-05,
      "loss": 0.4345,
      "step": 400
    },
    {
      "epoch": 2.516535433070866,
      "eval_loss": 0.5991191864013672,
      "eval_runtime": 43.4425,
      "eval_samples_per_second": 14.617,
      "eval_steps_per_second": 7.32,
      "step": 400
    },
    {
      "epoch": 2.64251968503937,
      "grad_norm": 0.7755193114280701,
      "learning_rate": 2.716627634660422e-05,
      "loss": 0.4222,
      "step": 420
    },
    {
      "epoch": 2.768503937007874,
      "grad_norm": 0.7862898707389832,
      "learning_rate": 1.7798594847775178e-05,
      "loss": 0.4169,
      "step": 440
    },
    {
      "epoch": 2.8314960629921258,
      "eval_loss": 0.5949774980545044,
      "eval_runtime": 43.2416,
      "eval_samples_per_second": 14.685,
      "eval_steps_per_second": 7.354,
      "step": 450
    },
    {
      "epoch": 2.894488188976378,
      "grad_norm": 0.7334442138671875,
      "learning_rate": 8.430913348946136e-06,
      "loss": 0.4191,
      "step": 460
    }
  ],
  "logging_steps": 20,
  "max_steps": 477,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.721630568922931e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}