{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0417287630402385, "eval_steps": 50, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.029806259314456036, "grad_norm": 2.8659629606409e-05, "learning_rate": 1.3333333333333333e-05, "loss": 1.0016, "step": 10 }, { "epoch": 0.05961251862891207, "grad_norm": 1.3777846106677316e-05, "learning_rate": 2.814814814814815e-05, "loss": 0.59, "step": 20 }, { "epoch": 0.08941877794336811, "grad_norm": 7.97840766608715e-06, "learning_rate": 4.296296296296296e-05, "loss": 0.6772, "step": 30 }, { "epoch": 0.11922503725782414, "grad_norm": 6.855416359030642e-06, "learning_rate": 5.7777777777777776e-05, "loss": 0.6207, "step": 40 }, { "epoch": 0.14903129657228018, "grad_norm": 8.57202394399792e-06, "learning_rate": 7.25925925925926e-05, "loss": 0.6395, "step": 50 }, { "epoch": 0.14903129657228018, "eval_loss": 0.5846871733665466, "eval_runtime": 87.5403, "eval_samples_per_second": 3.233, "eval_steps_per_second": 1.622, "step": 50 }, { "epoch": 0.17883755588673622, "grad_norm": 3.3820235785242403e-06, "learning_rate": 8.740740740740741e-05, "loss": 0.4937, "step": 60 }, { "epoch": 0.20864381520119224, "grad_norm": 5.906346359552117e-06, "learning_rate": 0.00010222222222222222, "loss": 0.467, "step": 70 }, { "epoch": 0.23845007451564829, "grad_norm": 5.968211553408764e-06, "learning_rate": 0.00011703703703703704, "loss": 0.6052, "step": 80 }, { "epoch": 0.26825633383010433, "grad_norm": 6.654247954429593e-06, "learning_rate": 0.00013185185185185186, "loss": 0.4269, "step": 90 }, { "epoch": 0.29806259314456035, "grad_norm": 1.4780930541746784e-05, "learning_rate": 0.00014666666666666666, "loss": 0.4415, "step": 100 }, { "epoch": 0.29806259314456035, "eval_loss": 0.49561959505081177, "eval_runtime": 86.7616, "eval_samples_per_second": 3.262, "eval_steps_per_second": 1.637, "step": 100 }, { "epoch": 0.32786885245901637, "grad_norm": 5.26007761436631e-06, "learning_rate": 0.0001614814814814815, "loss": 0.4201, "step": 110 }, { "epoch": 0.35767511177347244, "grad_norm": 1.0745498912001494e-05, "learning_rate": 0.0001762962962962963, "loss": 0.5687, "step": 120 }, { "epoch": 0.38748137108792846, "grad_norm": 9.13943767955061e-06, "learning_rate": 0.00019111111111111114, "loss": 0.4544, "step": 130 }, { "epoch": 0.4172876304023845, "grad_norm": 9.74733575276332e-06, "learning_rate": 0.00019999459826567048, "loss": 0.3984, "step": 140 }, { "epoch": 0.44709388971684055, "grad_norm": 7.19069566912367e-06, "learning_rate": 0.00019993383545625465, "loss": 0.3551, "step": 150 }, { "epoch": 0.44709388971684055, "eval_loss": 0.44105133414268494, "eval_runtime": 86.7076, "eval_samples_per_second": 3.264, "eval_steps_per_second": 1.638, "step": 150 }, { "epoch": 0.47690014903129657, "grad_norm": 5.66791504752473e-06, "learning_rate": 0.00019980559883241722, "loss": 0.3437, "step": 160 }, { "epoch": 0.5067064083457526, "grad_norm": 9.916246199281886e-06, "learning_rate": 0.0001996099749775874, "loss": 0.533, "step": 170 }, { "epoch": 0.5365126676602087, "grad_norm": 6.7572823354566935e-06, "learning_rate": 0.00019934709597403352, "loss": 0.4875, "step": 180 }, { "epoch": 0.5663189269746647, "grad_norm": 2.7519972718437202e-05, "learning_rate": 0.00019901713931368332, "loss": 0.4088, "step": 190 }, { "epoch": 0.5961251862891207, "grad_norm": 6.2564413383370265e-06, "learning_rate": 0.00019862032777828405, "loss": 0.3734, "step": 200 }, { "epoch": 0.5961251862891207, "eval_loss": 0.416751503944397, "eval_runtime": 86.855, "eval_samples_per_second": 3.258, "eval_steps_per_second": 1.635, "step": 200 }, { "epoch": 0.6259314456035767, "grad_norm": 6.829235189798055e-06, "learning_rate": 0.00019815692928898347, "loss": 0.4013, "step": 210 }, { "epoch": 0.6557377049180327, "grad_norm": 2.6349375730205793e-06, "learning_rate": 0.00019762725672543371, "loss": 0.439, "step": 220 }, { "epoch": 0.6855439642324889, "grad_norm": 6.470134849223541e-06, "learning_rate": 0.00019703166771453952, "loss": 0.3611, "step": 230 }, { "epoch": 0.7153502235469449, "grad_norm": 4.913066732115112e-06, "learning_rate": 0.0001963705643889941, "loss": 0.3843, "step": 240 }, { "epoch": 0.7451564828614009, "grad_norm": 8.839782822178677e-06, "learning_rate": 0.00019564439311576512, "loss": 0.4593, "step": 250 }, { "epoch": 0.7451564828614009, "eval_loss": 0.3998318612575531, "eval_runtime": 86.6219, "eval_samples_per_second": 3.267, "eval_steps_per_second": 1.639, "step": 250 }, { "epoch": 0.7749627421758569, "grad_norm": 6.905674126755912e-06, "learning_rate": 0.00019485364419471454, "loss": 0.3549, "step": 260 }, { "epoch": 0.8047690014903129, "grad_norm": 1.1015033123840112e-05, "learning_rate": 0.00019399885152755558, "loss": 0.33, "step": 270 }, { "epoch": 0.834575260804769, "grad_norm": 7.4981344369007275e-06, "learning_rate": 0.00019308059225737014, "loss": 0.464, "step": 280 }, { "epoch": 0.8643815201192251, "grad_norm": 1.2525980309874285e-05, "learning_rate": 0.00019209948637893088, "loss": 0.4893, "step": 290 }, { "epoch": 0.8941877794336811, "grad_norm": 8.533593245374504e-06, "learning_rate": 0.00019105619632008982, "loss": 0.3002, "step": 300 }, { "epoch": 0.8941877794336811, "eval_loss": 0.38659366965293884, "eval_runtime": 86.718, "eval_samples_per_second": 3.263, "eval_steps_per_second": 1.637, "step": 300 }, { "epoch": 0.9239940387481371, "grad_norm": 6.511543233500561e-06, "learning_rate": 0.0001899514264945173, "loss": 0.3519, "step": 310 }, { "epoch": 0.9538002980625931, "grad_norm": 9.529394446872175e-06, "learning_rate": 0.00018878592282609228, "loss": 0.2376, "step": 320 }, { "epoch": 0.9836065573770492, "grad_norm": 5.924814558966318e-06, "learning_rate": 0.00018756047224526606, "loss": 0.3868, "step": 330 }, { "epoch": 1.0119225037257824, "grad_norm": 7.456989806087222e-06, "learning_rate": 0.0001862759021577385, "loss": 0.4924, "step": 340 }, { "epoch": 1.0417287630402385, "grad_norm": 8.004604751477018e-06, "learning_rate": 0.00018493307988580652, "loss": 0.3768, "step": 350 }, { "epoch": 1.0417287630402385, "eval_loss": 0.3797300159931183, "eval_runtime": 86.5928, "eval_samples_per_second": 3.268, "eval_steps_per_second": 1.64, "step": 350 } ], "logging_steps": 10, "max_steps": 1344, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.971853201788314e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }