{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3751, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.026659557451346308, "grad_norm": 7.0857120383857755, "learning_rate": 2.6329787234042554e-06, "loss": 1.0624, "step": 100 }, { "epoch": 0.053319114902692616, "grad_norm": 6.581000668568491, "learning_rate": 5.292553191489362e-06, "loss": 0.7422, "step": 200 }, { "epoch": 0.07997867235403892, "grad_norm": 7.4656274686679085, "learning_rate": 7.95212765957447e-06, "loss": 0.7649, "step": 300 }, { "epoch": 0.10663822980538523, "grad_norm": 4.554989486482939, "learning_rate": 9.998854140728647e-06, "loss": 0.7662, "step": 400 }, { "epoch": 0.13329778725673153, "grad_norm": 4.023821679311659, "learning_rate": 9.967263823916638e-06, "loss": 0.784, "step": 500 }, { "epoch": 0.15995734470807785, "grad_norm": 4.221747608519819, "learning_rate": 9.892664857121854e-06, "loss": 0.79, "step": 600 }, { "epoch": 0.18661690215942414, "grad_norm": 3.838998761862227, "learning_rate": 9.775703149433419e-06, "loss": 0.7811, "step": 700 }, { "epoch": 0.21327645961077046, "grad_norm": 4.346355062560185, "learning_rate": 9.617391404288412e-06, "loss": 0.8124, "step": 800 }, { "epoch": 0.23993601706211676, "grad_norm": 4.985043330591158, "learning_rate": 9.41910035106149e-06, "loss": 0.7959, "step": 900 }, { "epoch": 0.26659557451346305, "grad_norm": 4.43851554256146, "learning_rate": 9.18254687671603e-06, "loss": 0.8065, "step": 1000 }, { "epoch": 0.2932551319648094, "grad_norm": 4.6639168705369425, "learning_rate": 8.909779160277951e-06, "loss": 0.7854, "step": 1100 }, { "epoch": 0.3199146894161557, "grad_norm": 3.7379632914857375, "learning_rate": 8.603158938844122e-06, "loss": 0.7729, "step": 1200 }, { "epoch": 0.346574246867502, "grad_norm": 4.820935299394203, "learning_rate": 8.265341058673722e-06, "loss": 0.7831, "step": 1300 }, { "epoch": 0.3732338043188483, "grad_norm": 3.6917931938034148, "learning_rate": 7.899250488417746e-06, "loss": 0.7967, "step": 1400 }, { "epoch": 0.39989336177019463, "grad_norm": 4.773594475659506, "learning_rate": 7.5080569935157375e-06, "loss": 0.7979, "step": 1500 }, { "epoch": 0.4265529192215409, "grad_norm": 4.447857956208416, "learning_rate": 7.095147691039425e-06, "loss": 0.7843, "step": 1600 }, { "epoch": 0.4532124766728872, "grad_norm": 4.389149312164256, "learning_rate": 6.664097722614934e-06, "loss": 0.7721, "step": 1700 }, { "epoch": 0.4798720341242335, "grad_norm": 3.915206329218288, "learning_rate": 6.218639299349676e-06, "loss": 0.7526, "step": 1800 }, { "epoch": 0.5065315915755798, "grad_norm": 5.180289969908035, "learning_rate": 5.7626293867858985e-06, "loss": 0.7854, "step": 1900 }, { "epoch": 0.5331911490269261, "grad_norm": 4.139436495632767, "learning_rate": 5.300016309678104e-06, "loss": 0.7381, "step": 2000 }, { "epoch": 0.5598507064782725, "grad_norm": 4.936713870803515, "learning_rate": 4.834805565744173e-06, "loss": 0.7471, "step": 2100 }, { "epoch": 0.5865102639296188, "grad_norm": 4.5976706617804, "learning_rate": 4.371025144389e-06, "loss": 0.7611, "step": 2200 }, { "epoch": 0.6131698213809651, "grad_norm": 5.820707950034038, "learning_rate": 3.912690650685726e-06, "loss": 0.7374, "step": 2300 }, { "epoch": 0.6398293788323114, "grad_norm": 3.951105419131897, "learning_rate": 3.4637705365856666e-06, "loss": 0.7444, "step": 2400 }, { "epoch": 0.6664889362836577, "grad_norm": 4.529901890166711, "learning_rate": 3.0281517403997245e-06, "loss": 0.7452, "step": 2500 }, { "epoch": 0.693148493735004, "grad_norm": 2.851635488548725, "learning_rate": 2.6096060320590393e-06, "loss": 0.7345, "step": 2600 }, { "epoch": 0.7198080511863503, "grad_norm": 3.6802521221701947, "learning_rate": 2.2117573555516774e-06, "loss": 0.7348, "step": 2700 }, { "epoch": 0.7464676086376966, "grad_norm": 3.6788331738320075, "learning_rate": 1.8380504512982329e-06, "loss": 0.7352, "step": 2800 }, { "epoch": 0.773127166089043, "grad_norm": 3.664797807064228, "learning_rate": 1.491721030146963e-06, "loss": 0.7299, "step": 2900 }, { "epoch": 0.7997867235403893, "grad_norm": 3.9751097792558783, "learning_rate": 1.1757677572344577e-06, "loss": 0.7353, "step": 3000 }, { "epoch": 0.8264462809917356, "grad_norm": 3.7953764140303985, "learning_rate": 8.929262882873524e-07, "loss": 0.7297, "step": 3100 }, { "epoch": 0.8531058384430819, "grad_norm": 3.4294084377480814, "learning_rate": 6.456455831696234e-07, "loss": 0.7151, "step": 3200 }, { "epoch": 0.8797653958944281, "grad_norm": 4.751586217200114, "learning_rate": 4.3606670176271014e-07, "loss": 0.695, "step": 3300 }, { "epoch": 0.9064249533457744, "grad_norm": 4.643644454272238, "learning_rate": 2.660042657725931e-07, "loss": 0.7423, "step": 3400 }, { "epoch": 0.9330845107971207, "grad_norm": 4.371360024084083, "learning_rate": 1.3693074697528231e-07, "loss": 0.7068, "step": 3500 }, { "epoch": 0.959744068248467, "grad_norm": 5.224648142017166, "learning_rate": 4.996371793965837e-08, "loss": 0.7121, "step": 3600 }, { "epoch": 0.9864036256998134, "grad_norm": 4.815750775150759, "learning_rate": 5.8561756162400785e-09, "loss": 0.7437, "step": 3700 }, { "epoch": 1.0, "step": 3751, "total_flos": 84620534874112.0, "train_loss": 0.7639887226768826, "train_runtime": 3882.0564, "train_samples_per_second": 7.728, "train_steps_per_second": 0.966 } ], "logging_steps": 100, "max_steps": 3751, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 84620534874112.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }