{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5020766035994463, "eval_steps": 500, "global_step": 68, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007383479464697739, "grad_norm": 1.9591929912567139, "learning_rate": 0.0, "loss": 1.6228, "memory/device_mem_reserved(gib)": 21.61, "memory/max_mem_active(gib)": 21.2, "memory/max_mem_allocated(gib)": 21.2, "step": 1 }, { "epoch": 0.014766958929395477, "grad_norm": 1.4523507356643677, "learning_rate": 1.5384615384615387e-05, "loss": 1.5769, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 2 }, { "epoch": 0.022150438394093218, "grad_norm": 1.1918187141418457, "learning_rate": 3.0769230769230774e-05, "loss": 1.5435, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 3 }, { "epoch": 0.029533917858790955, "grad_norm": 0.8260876536369324, "learning_rate": 4.615384615384616e-05, "loss": 1.6523, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 4 }, { "epoch": 0.03691739732348869, "grad_norm": 0.8584926128387451, "learning_rate": 6.153846153846155e-05, "loss": 1.5745, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 5 }, { "epoch": 0.044300876788186436, "grad_norm": 0.6466429829597473, "learning_rate": 7.692307692307693e-05, "loss": 1.4759, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 6 }, { "epoch": 0.05168435625288417, "grad_norm": 0.5014482140541077, "learning_rate": 9.230769230769232e-05, "loss": 1.602, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 7 }, { "epoch": 0.05906783571758191, "grad_norm": 0.6017433404922485, "learning_rate": 0.0001076923076923077, "loss": 1.4176, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 8 }, { "epoch": 0.06645131518227965, "grad_norm": 0.4612258970737457, "learning_rate": 0.0001230769230769231, "loss": 1.5819, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 9 }, { "epoch": 0.07383479464697738, "grad_norm": 0.4430214464664459, "learning_rate": 0.00013846153846153847, "loss": 1.561, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 10 }, { "epoch": 0.08121827411167512, "grad_norm": 0.3746771216392517, "learning_rate": 0.00015384615384615385, "loss": 1.6744, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 11 }, { "epoch": 0.08860175357637287, "grad_norm": 0.38248857855796814, "learning_rate": 0.00016923076923076923, "loss": 1.5629, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 12 }, { "epoch": 0.09598523304107061, "grad_norm": 0.515844464302063, "learning_rate": 0.00018461538461538463, "loss": 1.5264, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 13 }, { "epoch": 0.10336871250576835, "grad_norm": 0.3964424431324005, "learning_rate": 0.0002, "loss": 1.5398, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 14 }, { "epoch": 0.11075219197046608, "grad_norm": 0.4010593891143799, "learning_rate": 0.0001999668467514313, "loss": 1.4618, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 15 }, { "epoch": 0.11813567143516382, "grad_norm": 0.3192802965641022, "learning_rate": 0.00019986740898848306, "loss": 1.6994, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 16 }, { "epoch": 0.12551915089986157, "grad_norm": 0.410099059343338, "learning_rate": 0.00019970175264485266, "loss": 1.5913, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 17 }, { "epoch": 0.1329026303645593, "grad_norm": 0.312429815530777, "learning_rate": 0.0001994699875614589, "loss": 1.5701, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 18 }, { "epoch": 0.14028610982925704, "grad_norm": 0.2831230163574219, "learning_rate": 0.00019917226741361015, "loss": 1.5744, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 19 }, { "epoch": 0.14766958929395477, "grad_norm": 0.3618868291378021, "learning_rate": 0.00019880878960910772, "loss": 1.5185, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 20 }, { "epoch": 0.15505306875865252, "grad_norm": 0.3151628077030182, "learning_rate": 0.00019837979515735166, "loss": 1.5086, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 21 }, { "epoch": 0.16243654822335024, "grad_norm": 0.31955838203430176, "learning_rate": 0.0001978855685095358, "loss": 1.6329, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 22 }, { "epoch": 0.169820027688048, "grad_norm": 0.3030437231063843, "learning_rate": 0.00019732643737003827, "loss": 1.6697, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 23 }, { "epoch": 0.17720350715274574, "grad_norm": 0.41288134455680847, "learning_rate": 0.00019670277247913205, "loss": 1.7094, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 24 }, { "epoch": 0.18458698661744347, "grad_norm": 0.2887294888496399, "learning_rate": 0.00019601498736716017, "loss": 1.5554, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 25 }, { "epoch": 0.19197046608214122, "grad_norm": 0.3173791170120239, "learning_rate": 0.00019526353808033825, "loss": 1.4404, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 26 }, { "epoch": 0.19935394554683894, "grad_norm": 0.2877439558506012, "learning_rate": 0.00019444892287836613, "loss": 1.4766, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 27 }, { "epoch": 0.2067374250115367, "grad_norm": 0.29286038875579834, "learning_rate": 0.00019357168190404936, "loss": 1.5156, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 28 }, { "epoch": 0.2141209044762344, "grad_norm": 0.27713659405708313, "learning_rate": 0.00019263239682514952, "loss": 1.5153, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 29 }, { "epoch": 0.22150438394093216, "grad_norm": 0.29187655448913574, "learning_rate": 0.0001916316904487005, "loss": 1.6036, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 30 }, { "epoch": 0.22888786340562992, "grad_norm": 0.2671583890914917, "learning_rate": 0.00019057022630804716, "loss": 1.4675, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 31 }, { "epoch": 0.23627134287032764, "grad_norm": 0.2679831087589264, "learning_rate": 0.00018944870822287956, "loss": 1.581, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 32 }, { "epoch": 0.2436548223350254, "grad_norm": 0.26359617710113525, "learning_rate": 0.00018826787983255473, "loss": 1.4674, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 33 }, { "epoch": 0.25103830179972314, "grad_norm": 0.30446046590805054, "learning_rate": 0.00018702852410301554, "loss": 1.5038, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 34 }, { "epoch": 0.25842178126442084, "grad_norm": 0.3004315197467804, "learning_rate": 0.00018573146280763324, "loss": 1.4024, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 35 }, { "epoch": 0.2658052607291186, "grad_norm": 0.27353399991989136, "learning_rate": 0.00018437755598231856, "loss": 1.3652, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 36 }, { "epoch": 0.27318874019381634, "grad_norm": 0.2659265995025635, "learning_rate": 0.0001829677013552619, "loss": 1.4905, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 37 }, { "epoch": 0.2805722196585141, "grad_norm": 0.2703750431537628, "learning_rate": 0.00018150283375168114, "loss": 1.3298, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 38 }, { "epoch": 0.28795569912321184, "grad_norm": 0.29322877526283264, "learning_rate": 0.00017998392447397197, "loss": 1.3639, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 39 }, { "epoch": 0.29533917858790953, "grad_norm": 0.26927265524864197, "learning_rate": 0.00017841198065767107, "loss": 1.4849, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 40 }, { "epoch": 0.3027226580526073, "grad_norm": 0.26683205366134644, "learning_rate": 0.00017678804460366, "loss": 1.4523, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 41 }, { "epoch": 0.31010613751730504, "grad_norm": 0.26909339427948, "learning_rate": 0.00017511319308705198, "loss": 1.6047, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 42 }, { "epoch": 0.3174896169820028, "grad_norm": 0.2938016355037689, "learning_rate": 0.00017338853664321992, "loss": 1.3858, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 43 }, { "epoch": 0.3248730964467005, "grad_norm": 0.28290048241615295, "learning_rate": 0.00017161521883143934, "loss": 1.3671, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 44 }, { "epoch": 0.33225657591139823, "grad_norm": 0.2870519161224365, "learning_rate": 0.00016979441547663435, "loss": 1.5334, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 45 }, { "epoch": 0.339640055376096, "grad_norm": 0.26375913619995117, "learning_rate": 0.00016792733388972932, "loss": 1.5627, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 46 }, { "epoch": 0.34702353484079373, "grad_norm": 0.26975446939468384, "learning_rate": 0.00016601521206712318, "loss": 1.4987, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 47 }, { "epoch": 0.3544070143054915, "grad_norm": 0.30166226625442505, "learning_rate": 0.00016405931786981755, "loss": 1.5128, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 48 }, { "epoch": 0.3617904937701892, "grad_norm": 0.29190436005592346, "learning_rate": 0.00016206094818274229, "loss": 1.4914, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 49 }, { "epoch": 0.36917397323488693, "grad_norm": 0.2847207486629486, "learning_rate": 0.00016002142805483685, "loss": 1.4755, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 50 }, { "epoch": 0.3765574526995847, "grad_norm": 0.2877140939235687, "learning_rate": 0.00015794210982045636, "loss": 1.422, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 51 }, { "epoch": 0.38394093216428243, "grad_norm": 0.2705255150794983, "learning_rate": 0.00015582437220268647, "loss": 1.4822, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 52 }, { "epoch": 0.3913244116289802, "grad_norm": 0.2597866654396057, "learning_rate": 0.00015366961939916008, "loss": 1.5088, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 53 }, { "epoch": 0.3987078910936779, "grad_norm": 0.2965547740459442, "learning_rate": 0.0001514792801509831, "loss": 1.5346, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 54 }, { "epoch": 0.40609137055837563, "grad_norm": 0.2670862078666687, "learning_rate": 0.00014925480679538647, "loss": 1.5695, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 55 }, { "epoch": 0.4134748500230734, "grad_norm": 0.30679062008857727, "learning_rate": 0.000146997674302732, "loss": 1.6911, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 56 }, { "epoch": 0.42085832948777113, "grad_norm": 0.2812528908252716, "learning_rate": 0.0001447093792985114, "loss": 1.664, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 57 }, { "epoch": 0.4282418089524688, "grad_norm": 0.26079821586608887, "learning_rate": 0.0001423914390709861, "loss": 1.3911, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 58 }, { "epoch": 0.4356252884171666, "grad_norm": 0.3026648461818695, "learning_rate": 0.00014004539056512667, "loss": 1.5685, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 59 }, { "epoch": 0.44300876788186433, "grad_norm": 0.28435277938842773, "learning_rate": 0.00013767278936351854, "loss": 1.3599, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 60 }, { "epoch": 0.4503922473465621, "grad_norm": 0.29877325892448425, "learning_rate": 0.0001352752086549095, "loss": 1.5173, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 61 }, { "epoch": 0.45777572681125983, "grad_norm": 0.29666033387184143, "learning_rate": 0.0001328542381910835, "loss": 1.5017, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 62 }, { "epoch": 0.4651592062759575, "grad_norm": 0.26934438943862915, "learning_rate": 0.0001304114832327518, "loss": 1.4674, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 63 }, { "epoch": 0.4725426857406553, "grad_norm": 0.30038923025131226, "learning_rate": 0.00012794856348516095, "loss": 1.5306, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 64 }, { "epoch": 0.47992616520535303, "grad_norm": 0.2799171805381775, "learning_rate": 0.00012546711202412287, "loss": 1.4696, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 65 }, { "epoch": 0.4873096446700508, "grad_norm": 0.313919335603714, "learning_rate": 0.0001229687742131796, "loss": 1.5927, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 66 }, { "epoch": 0.4946931241347485, "grad_norm": 0.27579808235168457, "learning_rate": 0.0001204552066126201, "loss": 1.4654, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 67 }, { "epoch": 0.5020766035994463, "grad_norm": 0.28434908390045166, "learning_rate": 0.00011792807588107357, "loss": 1.5039, "memory/device_mem_reserved(gib)": 21.8, "memory/max_mem_active(gib)": 21.55, "memory/max_mem_allocated(gib)": 21.55, "step": 68 } ], "logging_steps": 1, "max_steps": 135, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 34, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.234585444763566e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }