{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024067388688327317, "grad_norm": 4.96875, "learning_rate": 3.6000000000000003e-06, "loss": 2.9962, "memory/device_reserved (GiB)": 32.53, "memory/max_active (GiB)": 31.83, "memory/max_allocated (GiB)": 31.83, "step": 10, "tokens_per_second_per_gpu": 2526.79 }, { "epoch": 0.048134777376654635, "grad_norm": 4.375, "learning_rate": 7.600000000000001e-06, "loss": 3.0077, "memory/device_reserved (GiB)": 32.54, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 20, "tokens_per_second_per_gpu": 2746.26 }, { "epoch": 0.07220216606498195, "grad_norm": 4.21875, "learning_rate": 1.16e-05, "loss": 2.8836, "memory/device_reserved (GiB)": 32.54, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 30, "tokens_per_second_per_gpu": 2704.02 }, { "epoch": 0.09626955475330927, "grad_norm": 4.15625, "learning_rate": 1.5600000000000003e-05, "loss": 2.7347, "memory/device_reserved (GiB)": 32.54, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 40, "tokens_per_second_per_gpu": 2751.94 }, { "epoch": 0.12033694344163658, "grad_norm": 13.8125, "learning_rate": 1.9600000000000002e-05, "loss": 2.6617, "memory/device_reserved (GiB)": 32.55, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 50, "tokens_per_second_per_gpu": 12.16 }, { "epoch": 0.1444043321299639, "grad_norm": 2.953125, "learning_rate": 1.9970175264485268e-05, "loss": 2.6474, "memory/device_reserved (GiB)": 32.9, "memory/max_active (GiB)": 32.02, "memory/max_allocated (GiB)": 32.02, "step": 60, "tokens_per_second_per_gpu": 3420.31 }, { "epoch": 0.1684717208182912, "grad_norm": 3.359375, "learning_rate": 1.9867305793119814e-05, "loss": 2.5291, "memory/device_reserved (GiB)": 32.9, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 70, "tokens_per_second_per_gpu": 2842.63 }, { "epoch": 0.19253910950661854, "grad_norm": 3.53125, "learning_rate": 1.9691780654392538e-05, "loss": 2.5769, "memory/device_reserved (GiB)": 32.9, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 80, "tokens_per_second_per_gpu": 2713.62 }, { "epoch": 0.21660649819494585, "grad_norm": 4.0, "learning_rate": 1.9444892287836614e-05, "loss": 2.509, "memory/device_reserved (GiB)": 32.9, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 90, "tokens_per_second_per_gpu": 2652.2 }, { "epoch": 0.24067388688327315, "grad_norm": 9.6875, "learning_rate": 1.9128458599921358e-05, "loss": 2.3728, "memory/device_reserved (GiB)": 32.9, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 100, "tokens_per_second_per_gpu": 15.95 }, { "epoch": 0.2647412755716005, "grad_norm": 3.25, "learning_rate": 1.8744809578310398e-05, "loss": 2.5744, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.92, "memory/max_allocated (GiB)": 31.92, "step": 110, "tokens_per_second_per_gpu": 3061.35 }, { "epoch": 0.2888086642599278, "grad_norm": 3.203125, "learning_rate": 1.829677013552619e-05, "loss": 2.4197, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 120, "tokens_per_second_per_gpu": 2742.16 }, { "epoch": 0.3128760529482551, "grad_norm": 3.6875, "learning_rate": 1.778763930834761e-05, "loss": 2.3961, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 130, "tokens_per_second_per_gpu": 2790.74 }, { "epoch": 0.3369434416365824, "grad_norm": 4.15625, "learning_rate": 1.7221165966101163e-05, "loss": 2.4885, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 140, "tokens_per_second_per_gpu": 2710.33 }, { "epoch": 0.36101083032490977, "grad_norm": 8.5, "learning_rate": 1.660152120671232e-05, "loss": 2.3189, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 150, "tokens_per_second_per_gpu": 13.9 }, { "epoch": 0.3850782190132371, "grad_norm": 3.109375, "learning_rate": 1.593326764377232e-05, "loss": 2.4858, "memory/device_reserved (GiB)": 32.82, "memory/max_active (GiB)": 31.96, "memory/max_allocated (GiB)": 31.96, "step": 160, "tokens_per_second_per_gpu": 3034.65 }, { "epoch": 0.4091456077015644, "grad_norm": 3.296875, "learning_rate": 1.5221325810768251e-05, "loss": 2.3984, "memory/device_reserved (GiB)": 32.82, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 170, "tokens_per_second_per_gpu": 2794.98 }, { "epoch": 0.4332129963898917, "grad_norm": 3.765625, "learning_rate": 1.4470937929851142e-05, "loss": 2.4386, "memory/device_reserved (GiB)": 32.82, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 180, "tokens_per_second_per_gpu": 2640.62 }, { "epoch": 0.457280385078219, "grad_norm": 3.9375, "learning_rate": 1.3687629311922604e-05, "loss": 2.3784, "memory/device_reserved (GiB)": 32.82, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 190, "tokens_per_second_per_gpu": 2688.2 }, { "epoch": 0.4813477737665463, "grad_norm": 6.375, "learning_rate": 1.287716767226167e-05, "loss": 2.254, "memory/device_reserved (GiB)": 32.82, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 200, "tokens_per_second_per_gpu": 16.53 }, { "epoch": 0.5054151624548736, "grad_norm": 3.125, "learning_rate": 1.2045520661262011e-05, "loss": 2.5127, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.96, "memory/max_allocated (GiB)": 31.96, "step": 210, "tokens_per_second_per_gpu": 3069.32 }, { "epoch": 0.529482551143201, "grad_norm": 3.328125, "learning_rate": 1.1198811922992274e-05, "loss": 2.4208, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 220, "tokens_per_second_per_gpu": 2758.53 }, { "epoch": 0.5535499398315282, "grad_norm": 3.390625, "learning_rate": 1.0343276005132436e-05, "loss": 2.3724, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 230, "tokens_per_second_per_gpu": 2799.61 }, { "epoch": 0.5776173285198556, "grad_norm": 3.96875, "learning_rate": 9.485212452296535e-06, "loss": 2.3658, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 240, "tokens_per_second_per_gpu": 2669.67 }, { "epoch": 0.601684717208183, "grad_norm": 7.875, "learning_rate": 8.630939420765247e-06, "loss": 2.2695, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 250, "tokens_per_second_per_gpu": 16.97 }, { "epoch": 0.6257521058965102, "grad_norm": 3.0625, "learning_rate": 7.786747156175675e-06, "loss": 2.4657, "memory/device_reserved (GiB)": 32.64, "memory/max_active (GiB)": 31.99, "memory/max_allocated (GiB)": 31.99, "step": 260, "tokens_per_second_per_gpu": 3091.19 }, { "epoch": 0.6498194945848376, "grad_norm": 3.234375, "learning_rate": 6.958851676724823e-06, "loss": 2.4131, "memory/device_reserved (GiB)": 32.64, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 270, "tokens_per_second_per_gpu": 2802.37 }, { "epoch": 0.6738868832731648, "grad_norm": 3.421875, "learning_rate": 6.153349002929988e-06, "loss": 2.3716, "memory/device_reserved (GiB)": 32.64, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 280, "tokens_per_second_per_gpu": 2645.5 }, { "epoch": 0.6979542719614922, "grad_norm": 4.125, "learning_rate": 5.3761702709648555e-06, "loss": 2.3805, "memory/device_reserved (GiB)": 32.64, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 290, "tokens_per_second_per_gpu": 2761.33 }, { "epoch": 0.7220216606498195, "grad_norm": 11.75, "learning_rate": 4.633038060083996e-06, "loss": 2.1915, "memory/device_reserved (GiB)": 32.65, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 300, "tokens_per_second_per_gpu": 14.43 }, { "epoch": 0.7460890493381468, "grad_norm": 3.078125, "learning_rate": 3.929424255708999e-06, "loss": 2.4542, "memory/device_reserved (GiB)": 33.33, "memory/max_active (GiB)": 31.99, "memory/max_allocated (GiB)": 31.99, "step": 310, "tokens_per_second_per_gpu": 3503.84 }, { "epoch": 0.7701564380264742, "grad_norm": 3.28125, "learning_rate": 3.2705097584416712e-06, "loss": 2.4018, "memory/device_reserved (GiB)": 33.33, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 320, "tokens_per_second_per_gpu": 2889.9 }, { "epoch": 0.7942238267148014, "grad_norm": 3.53125, "learning_rate": 2.66114633567801e-06, "loss": 2.4133, "memory/device_reserved (GiB)": 33.33, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 330, "tokens_per_second_per_gpu": 2775.32 }, { "epoch": 0.8182912154031288, "grad_norm": 4.09375, "learning_rate": 2.1058208967198046e-06, "loss": 2.3389, "memory/device_reserved (GiB)": 33.33, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 340, "tokens_per_second_per_gpu": 2850.91 }, { "epoch": 0.8423586040914561, "grad_norm": 12.9375, "learning_rate": 1.6086224544360617e-06, "loss": 2.2469, "memory/device_reserved (GiB)": 33.33, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 350, "tokens_per_second_per_gpu": 13.75 }, { "epoch": 0.8664259927797834, "grad_norm": 3.015625, "learning_rate": 1.1732120167445248e-06, "loss": 2.4261, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.92, "memory/max_allocated (GiB)": 31.92, "step": 360, "tokens_per_second_per_gpu": 3009.01 }, { "epoch": 0.8904933814681107, "grad_norm": 3.28125, "learning_rate": 8.027956296105355e-07, "loss": 2.411, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 370, "tokens_per_second_per_gpu": 2749.41 }, { "epoch": 0.914560770156438, "grad_norm": 3.453125, "learning_rate": 5.001007700549898e-07, "loss": 2.3689, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 380, "tokens_per_second_per_gpu": 2769.43 }, { "epoch": 0.9386281588447654, "grad_norm": 3.84375, "learning_rate": 2.6735626299617456e-07, "loss": 2.3496, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.81, "memory/max_allocated (GiB)": 31.81, "step": 390, "tokens_per_second_per_gpu": 2799.6 }, { "epoch": 0.9626955475330926, "grad_norm": 10.625, "learning_rate": 1.0627586980317073e-07, "loss": 2.1982, "memory/device_reserved (GiB)": 32.94, "memory/max_active (GiB)": 31.8, "memory/max_allocated (GiB)": 31.8, "step": 400, "tokens_per_second_per_gpu": 14.1 }, { "epoch": 0.98676293622142, "grad_norm": 3.75, "learning_rate": 1.8045669402859678e-08, "loss": 2.473, "memory/device_reserved (GiB)": 32.76, "memory/max_active (GiB)": 31.86, "memory/max_allocated (GiB)": 31.86, "step": 410, "tokens_per_second_per_gpu": 3564.04 } ], "logging_steps": 10, "max_steps": 416, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6853339156578304e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }