{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25103830179972314, "eval_steps": 500, "global_step": 34, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007383479464697739, "grad_norm": 1.9591929912567139, "learning_rate": 0.0, "loss": 1.6228, "memory/device_mem_reserved(gib)": 21.61, "memory/max_mem_active(gib)": 21.2, "memory/max_mem_allocated(gib)": 21.2, "step": 1 }, { "epoch": 0.014766958929395477, "grad_norm": 1.4523507356643677, "learning_rate": 1.5384615384615387e-05, "loss": 1.5769, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 2 }, { "epoch": 0.022150438394093218, "grad_norm": 1.1918187141418457, "learning_rate": 3.0769230769230774e-05, "loss": 1.5435, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 3 }, { "epoch": 0.029533917858790955, "grad_norm": 0.8260876536369324, "learning_rate": 4.615384615384616e-05, "loss": 1.6523, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 4 }, { "epoch": 0.03691739732348869, "grad_norm": 0.8584926128387451, "learning_rate": 6.153846153846155e-05, "loss": 1.5745, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 5 }, { "epoch": 0.044300876788186436, "grad_norm": 0.6466429829597473, "learning_rate": 7.692307692307693e-05, "loss": 1.4759, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 6 }, { "epoch": 0.05168435625288417, "grad_norm": 0.5014482140541077, "learning_rate": 9.230769230769232e-05, "loss": 1.602, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 7 }, { "epoch": 0.05906783571758191, "grad_norm": 0.6017433404922485, "learning_rate": 0.0001076923076923077, "loss": 1.4176, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 8 }, { "epoch": 0.06645131518227965, "grad_norm": 0.4612258970737457, "learning_rate": 0.0001230769230769231, "loss": 1.5819, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 9 }, { "epoch": 0.07383479464697738, "grad_norm": 0.4430214464664459, "learning_rate": 0.00013846153846153847, "loss": 1.561, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 10 }, { "epoch": 0.08121827411167512, "grad_norm": 0.3746771216392517, "learning_rate": 0.00015384615384615385, "loss": 1.6744, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 11 }, { "epoch": 0.08860175357637287, "grad_norm": 0.38248857855796814, "learning_rate": 0.00016923076923076923, "loss": 1.5629, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 12 }, { "epoch": 0.09598523304107061, "grad_norm": 0.515844464302063, "learning_rate": 0.00018461538461538463, "loss": 1.5264, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 13 }, { "epoch": 0.10336871250576835, "grad_norm": 0.3964424431324005, "learning_rate": 0.0002, "loss": 1.5398, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 14 }, { "epoch": 0.11075219197046608, "grad_norm": 0.4010593891143799, "learning_rate": 0.0001999668467514313, "loss": 1.4618, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 15 }, { "epoch": 0.11813567143516382, "grad_norm": 0.3192802965641022, "learning_rate": 0.00019986740898848306, "loss": 1.6994, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 16 }, { "epoch": 0.12551915089986157, "grad_norm": 0.410099059343338, "learning_rate": 0.00019970175264485266, "loss": 1.5913, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 17 }, { "epoch": 0.1329026303645593, "grad_norm": 0.312429815530777, "learning_rate": 0.0001994699875614589, "loss": 1.5701, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 18 }, { "epoch": 0.14028610982925704, "grad_norm": 0.2831230163574219, "learning_rate": 0.00019917226741361015, "loss": 1.5744, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 19 }, { "epoch": 0.14766958929395477, "grad_norm": 0.3618868291378021, "learning_rate": 0.00019880878960910772, "loss": 1.5185, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 20 }, { "epoch": 0.15505306875865252, "grad_norm": 0.3151628077030182, "learning_rate": 0.00019837979515735166, "loss": 1.5086, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 21 }, { "epoch": 0.16243654822335024, "grad_norm": 0.31955838203430176, "learning_rate": 0.0001978855685095358, "loss": 1.6329, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 22 }, { "epoch": 0.169820027688048, "grad_norm": 0.3030437231063843, "learning_rate": 0.00019732643737003827, "loss": 1.6697, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 23 }, { "epoch": 0.17720350715274574, "grad_norm": 0.41288134455680847, "learning_rate": 0.00019670277247913205, "loss": 1.7094, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 24 }, { "epoch": 0.18458698661744347, "grad_norm": 0.2887294888496399, "learning_rate": 0.00019601498736716017, "loss": 1.5554, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 25 }, { "epoch": 0.19197046608214122, "grad_norm": 0.3173791170120239, "learning_rate": 0.00019526353808033825, "loss": 1.4404, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 26 }, { "epoch": 0.19935394554683894, "grad_norm": 0.2877439558506012, "learning_rate": 0.00019444892287836613, "loss": 1.4766, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 27 }, { "epoch": 0.2067374250115367, "grad_norm": 0.29286038875579834, "learning_rate": 0.00019357168190404936, "loss": 1.5156, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 28 }, { "epoch": 0.2141209044762344, "grad_norm": 0.27713659405708313, "learning_rate": 0.00019263239682514952, "loss": 1.5153, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 29 }, { "epoch": 0.22150438394093216, "grad_norm": 0.29187655448913574, "learning_rate": 0.0001916316904487005, "loss": 1.6036, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 30 }, { "epoch": 0.22888786340562992, "grad_norm": 0.2671583890914917, "learning_rate": 0.00019057022630804716, "loss": 1.4675, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 31 }, { "epoch": 0.23627134287032764, "grad_norm": 0.2679831087589264, "learning_rate": 0.00018944870822287956, "loss": 1.581, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 32 }, { "epoch": 0.2436548223350254, "grad_norm": 0.26359617710113525, "learning_rate": 0.00018826787983255473, "loss": 1.4674, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 33 }, { "epoch": 0.25103830179972314, "grad_norm": 0.30446046590805054, "learning_rate": 0.00018702852410301554, "loss": 1.5038, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 34 } ], "logging_steps": 1, "max_steps": 135, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 34, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.17292722381783e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }