{ "best_global_step": 100, "best_metric": 3.08596992, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 174, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005747126436781609, "grad_norm": 54.19905417297271, "learning_rate": 1.111111111111111e-06, "loss": 1.7543538808822632, "memory(GiB)": 10.22, "step": 1, "token_acc": 0.5277777777777778, "train_speed(iter/s)": 0.067004 }, { "epoch": 0.028735632183908046, "grad_norm": 49.497954919310544, "learning_rate": 5.555555555555557e-06, "loss": 1.7060493230819702, "memory(GiB)": 10.22, "step": 5, "token_acc": 0.4930555555555556, "train_speed(iter/s)": 0.248559 }, { "epoch": 0.05747126436781609, "grad_norm": 14.633223810699512, "learning_rate": 1.1111111111111113e-05, "loss": 1.3003526687622071, "memory(GiB)": 10.22, "step": 10, "token_acc": 0.65, "train_speed(iter/s)": 0.379406 }, { "epoch": 0.08620689655172414, "grad_norm": 17.504341368414668, "learning_rate": 1.6666666666666667e-05, "loss": 1.1191390037536622, "memory(GiB)": 10.22, "step": 15, "token_acc": 0.7055555555555556, "train_speed(iter/s)": 0.457972 }, { "epoch": 0.11494252873563218, "grad_norm": 5.415303862567768, "learning_rate": 1.9991889981715696e-05, "loss": 1.0228230476379394, "memory(GiB)": 10.22, "step": 20, "token_acc": 0.7388888888888889, "train_speed(iter/s)": 0.514038 }, { "epoch": 0.14367816091954022, "grad_norm": 4.302409750602581, "learning_rate": 1.9900803279611643e-05, "loss": 0.8856360435485839, "memory(GiB)": 10.22, "step": 25, "token_acc": 0.7777777777777778, "train_speed(iter/s)": 0.554758 }, { "epoch": 0.1724137931034483, "grad_norm": 5.504310065306082, "learning_rate": 1.9709418174260523e-05, "loss": 0.8048258781433105, "memory(GiB)": 10.22, "step": 30, "token_acc": 0.8055555555555556, "train_speed(iter/s)": 0.585649 }, { "epoch": 0.20114942528735633, "grad_norm": 9.75691173960977, "learning_rate": 1.9419673459912652e-05, "loss": 0.7117091178894043, "memory(GiB)": 10.22, "step": 35, "token_acc": 0.7833333333333333, "train_speed(iter/s)": 0.609091 }, { "epoch": 0.22988505747126436, "grad_norm": 4.470052209724094, "learning_rate": 1.9034504346103825e-05, "loss": 0.659924840927124, "memory(GiB)": 10.22, "step": 40, "token_acc": 0.7597765363128491, "train_speed(iter/s)": 0.628438 }, { "epoch": 0.25862068965517243, "grad_norm": 7.40586681091568, "learning_rate": 1.8557812723014476e-05, "loss": 0.5936851024627685, "memory(GiB)": 10.22, "step": 45, "token_acc": 0.8166666666666667, "train_speed(iter/s)": 0.645078 }, { "epoch": 0.28735632183908044, "grad_norm": 3.894185239193535, "learning_rate": 1.7994427634035016e-05, "loss": 0.5902361869812012, "memory(GiB)": 10.22, "step": 50, "token_acc": 0.8100558659217877, "train_speed(iter/s)": 0.658944 }, { "epoch": 0.3160919540229885, "grad_norm": 6.535408253273618, "learning_rate": 1.7350056355963287e-05, "loss": 0.5443444728851319, "memory(GiB)": 10.22, "step": 55, "token_acc": 0.8491620111731844, "train_speed(iter/s)": 0.666067 }, { "epoch": 0.3448275862068966, "grad_norm": 4.884017101989966, "learning_rate": 1.6631226582407954e-05, "loss": 0.4963292121887207, "memory(GiB)": 10.22, "step": 60, "token_acc": 0.8435754189944135, "train_speed(iter/s)": 0.676319 }, { "epoch": 0.3735632183908046, "grad_norm": 5.325652389779561, "learning_rate": 1.584522029609889e-05, "loss": 0.5059492111206054, "memory(GiB)": 10.22, "step": 65, "token_acc": 0.8379888268156425, "train_speed(iter/s)": 0.685075 }, { "epoch": 0.40229885057471265, "grad_norm": 5.744137888836345, "learning_rate": 1.5000000000000002e-05, "loss": 0.48285999298095705, "memory(GiB)": 10.22, "step": 70, "token_acc": 0.8277777777777777, "train_speed(iter/s)": 0.692846 }, { "epoch": 0.43103448275862066, "grad_norm": 5.1841000179766406, "learning_rate": 1.410412805452757e-05, "loss": 0.44365577697753905, "memory(GiB)": 10.22, "step": 75, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.70009 }, { "epoch": 0.45977011494252873, "grad_norm": 5.4044860823804015, "learning_rate": 1.3166679938014728e-05, "loss": 0.44077281951904296, "memory(GiB)": 10.22, "step": 80, "token_acc": 0.8547486033519553, "train_speed(iter/s)": 0.706774 }, { "epoch": 0.4885057471264368, "grad_norm": 3.814192170889284, "learning_rate": 1.2197152309122173e-05, "loss": 0.41083593368530275, "memory(GiB)": 10.22, "step": 85, "token_acc": 0.8777777777777778, "train_speed(iter/s)": 0.712107 }, { "epoch": 0.5172413793103449, "grad_norm": 6.230484574533733, "learning_rate": 1.1205366802553231e-05, "loss": 0.4023772716522217, "memory(GiB)": 10.22, "step": 90, "token_acc": 0.8491620111731844, "train_speed(iter/s)": 0.716314 }, { "epoch": 0.5459770114942529, "grad_norm": 5.5433724273249965, "learning_rate": 1.0201370532654404e-05, "loss": 0.3939186096191406, "memory(GiB)": 10.22, "step": 95, "token_acc": 0.8715083798882681, "train_speed(iter/s)": 0.720953 }, { "epoch": 0.5747126436781609, "grad_norm": 4.744446848913436, "learning_rate": 9.195334312832742e-06, "loss": 0.36634814739227295, "memory(GiB)": 10.22, "step": 100, "token_acc": 0.8944444444444445, "train_speed(iter/s)": 0.725326 }, { "epoch": 0.5747126436781609, "eval_loss": 3.085969924926758, "eval_runtime": 8.8084, "eval_samples_per_second": 116.253, "eval_steps_per_second": 2.498, "eval_token_acc": 0.7783402536829532, "step": 100 }, { "epoch": 0.603448275862069, "grad_norm": 3.926014775478041, "learning_rate": 8.197449621860944e-06, "loss": 0.35011539459228513, "memory(GiB)": 15.92, "step": 105, "token_acc": 0.7667682926829268, "train_speed(iter/s)": 0.678189 }, { "epoch": 0.632183908045977, "grad_norm": 4.246402025826811, "learning_rate": 7.217825360835475e-06, "loss": 0.33618762493133547, "memory(GiB)": 15.92, "step": 110, "token_acc": 0.8659217877094972, "train_speed(iter/s)": 0.677226 }, { "epoch": 0.6609195402298851, "grad_norm": 4.310168084132965, "learning_rate": 6.266385446673791e-06, "loss": 0.31383814811706545, "memory(GiB)": 15.92, "step": 115, "token_acc": 0.9055555555555556, "train_speed(iter/s)": 0.67988 }, { "epoch": 0.6896551724137931, "grad_norm": 4.774968229122785, "learning_rate": 5.352768279562315e-06, "loss": 0.2903557300567627, "memory(GiB)": 15.92, "step": 120, "token_acc": 0.9, "train_speed(iter/s)": 0.684505 }, { "epoch": 0.7183908045977011, "grad_norm": 4.0187528651074835, "learning_rate": 4.486229102783084e-06, "loss": 0.2903176784515381, "memory(GiB)": 15.92, "step": 125, "token_acc": 0.888268156424581, "train_speed(iter/s)": 0.688844 }, { "epoch": 0.7471264367816092, "grad_norm": 4.23665356199129, "learning_rate": 3.6755462440462288e-06, "loss": 0.25630669593811034, "memory(GiB)": 15.92, "step": 130, "token_acc": 0.9111111111111111, "train_speed(iter/s)": 0.693005 }, { "epoch": 0.7758620689655172, "grad_norm": 5.164302726059258, "learning_rate": 2.9289321881345257e-06, "loss": 0.26200544834136963, "memory(GiB)": 15.92, "step": 135, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.696714 }, { "epoch": 0.8045977011494253, "grad_norm": 4.025947370412893, "learning_rate": 2.2539503817234553e-06, "loss": 0.24404802322387695, "memory(GiB)": 15.92, "step": 140, "token_acc": 0.9222222222222223, "train_speed(iter/s)": 0.700315 }, { "epoch": 0.8333333333333334, "grad_norm": 4.029452854747659, "learning_rate": 1.6574386131713872e-06, "loss": 0.2302267074584961, "memory(GiB)": 15.92, "step": 145, "token_acc": 0.9, "train_speed(iter/s)": 0.703785 }, { "epoch": 0.8620689655172413, "grad_norm": 4.111326617944181, "learning_rate": 1.1454397434679022e-06, "loss": 0.21495592594146729, "memory(GiB)": 15.92, "step": 150, "token_acc": 0.9333333333333333, "train_speed(iter/s)": 0.706914 }, { "epoch": 0.8908045977011494, "grad_norm": 3.6705370041668646, "learning_rate": 7.231404900585714e-07, "loss": 0.21858932971954345, "memory(GiB)": 15.92, "step": 155, "token_acc": 0.88268156424581, "train_speed(iter/s)": 0.709691 }, { "epoch": 0.9195402298850575, "grad_norm": 3.4777493199480354, "learning_rate": 3.9481888368627764e-07, "loss": 0.21195197105407715, "memory(GiB)": 15.92, "step": 160, "token_acc": 0.9162011173184358, "train_speed(iter/s)": 0.712557 }, { "epoch": 0.9482758620689655, "grad_norm": 3.8081053430996246, "learning_rate": 1.6380093052856482e-07, "loss": 0.20168027877807618, "memory(GiB)": 15.92, "step": 165, "token_acc": 0.9277777777777778, "train_speed(iter/s)": 0.715106 }, { "epoch": 0.9770114942528736, "grad_norm": 3.75099140436243, "learning_rate": 3.242691865790071e-08, "loss": 0.2000497817993164, "memory(GiB)": 15.92, "step": 170, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.716937 }, { "epoch": 1.0, "eval_loss": 3.787536144256592, "eval_runtime": 5.2095, "eval_samples_per_second": 196.563, "eval_steps_per_second": 4.223, "eval_token_acc": 0.7779989761674535, "step": 174 } ], "logging_steps": 5, "max_steps": 174, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 220, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.470457873998807e+17, "train_batch_size": 6, "trial_name": null, "trial_params": null }