{
  "best_global_step": 100,
  "best_metric": 3.08596992,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 100,
  "global_step": 174,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005747126436781609,
      "grad_norm": 54.19905417297271,
      "learning_rate": 1.111111111111111e-06,
      "loss": 1.7543538808822632,
      "memory(GiB)": 10.22,
      "step": 1,
      "token_acc": 0.5277777777777778,
      "train_speed(iter/s)": 0.067004
    },
    {
      "epoch": 0.028735632183908046,
      "grad_norm": 49.497954919310544,
      "learning_rate": 5.555555555555557e-06,
      "loss": 1.7060493230819702,
      "memory(GiB)": 10.22,
      "step": 5,
      "token_acc": 0.4930555555555556,
      "train_speed(iter/s)": 0.248559
    },
    {
      "epoch": 0.05747126436781609,
      "grad_norm": 14.633223810699512,
      "learning_rate": 1.1111111111111113e-05,
      "loss": 1.3003526687622071,
      "memory(GiB)": 10.22,
      "step": 10,
      "token_acc": 0.65,
      "train_speed(iter/s)": 0.379406
    },
    {
      "epoch": 0.08620689655172414,
      "grad_norm": 17.504341368414668,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.1191390037536622,
      "memory(GiB)": 10.22,
      "step": 15,
      "token_acc": 0.7055555555555556,
      "train_speed(iter/s)": 0.457972
    },
    {
      "epoch": 0.11494252873563218,
      "grad_norm": 5.415303862567768,
      "learning_rate": 1.9991889981715696e-05,
      "loss": 1.0228230476379394,
      "memory(GiB)": 10.22,
      "step": 20,
      "token_acc": 0.7388888888888889,
      "train_speed(iter/s)": 0.514038
    },
    {
      "epoch": 0.14367816091954022,
      "grad_norm": 4.302409750602581,
      "learning_rate": 1.9900803279611643e-05,
      "loss": 0.8856360435485839,
      "memory(GiB)": 10.22,
      "step": 25,
      "token_acc": 0.7777777777777778,
      "train_speed(iter/s)": 0.554758
    },
    {
      "epoch": 0.1724137931034483,
      "grad_norm": 5.504310065306082,
      "learning_rate": 1.9709418174260523e-05,
      "loss": 0.8048258781433105,
      "memory(GiB)": 10.22,
      "step": 30,
      "token_acc": 0.8055555555555556,
      "train_speed(iter/s)": 0.585649
    },
    {
      "epoch": 0.20114942528735633,
      "grad_norm": 9.75691173960977,
      "learning_rate": 1.9419673459912652e-05,
      "loss": 0.7117091178894043,
      "memory(GiB)": 10.22,
      "step": 35,
      "token_acc": 0.7833333333333333,
      "train_speed(iter/s)": 0.609091
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 4.470052209724094,
      "learning_rate": 1.9034504346103825e-05,
      "loss": 0.659924840927124,
      "memory(GiB)": 10.22,
      "step": 40,
      "token_acc": 0.7597765363128491,
      "train_speed(iter/s)": 0.628438
    },
    {
      "epoch": 0.25862068965517243,
      "grad_norm": 7.40586681091568,
      "learning_rate": 1.8557812723014476e-05,
      "loss": 0.5936851024627685,
      "memory(GiB)": 10.22,
      "step": 45,
      "token_acc": 0.8166666666666667,
      "train_speed(iter/s)": 0.645078
    },
    {
      "epoch": 0.28735632183908044,
      "grad_norm": 3.894185239193535,
      "learning_rate": 1.7994427634035016e-05,
      "loss": 0.5902361869812012,
      "memory(GiB)": 10.22,
      "step": 50,
      "token_acc": 0.8100558659217877,
      "train_speed(iter/s)": 0.658944
    },
    {
      "epoch": 0.3160919540229885,
      "grad_norm": 6.535408253273618,
      "learning_rate": 1.7350056355963287e-05,
      "loss": 0.5443444728851319,
      "memory(GiB)": 10.22,
      "step": 55,
      "token_acc": 0.8491620111731844,
      "train_speed(iter/s)": 0.666067
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 4.884017101989966,
      "learning_rate": 1.6631226582407954e-05,
      "loss": 0.4963292121887207,
      "memory(GiB)": 10.22,
      "step": 60,
      "token_acc": 0.8435754189944135,
      "train_speed(iter/s)": 0.676319
    },
    {
      "epoch": 0.3735632183908046,
      "grad_norm": 5.325652389779561,
      "learning_rate": 1.584522029609889e-05,
      "loss": 0.5059492111206054,
      "memory(GiB)": 10.22,
      "step": 65,
      "token_acc": 0.8379888268156425,
      "train_speed(iter/s)": 0.685075
    },
    {
      "epoch": 0.40229885057471265,
      "grad_norm": 5.744137888836345,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.48285999298095705,
      "memory(GiB)": 10.22,
      "step": 70,
      "token_acc": 0.8277777777777777,
      "train_speed(iter/s)": 0.692846
    },
    {
      "epoch": 0.43103448275862066,
      "grad_norm": 5.1841000179766406,
      "learning_rate": 1.410412805452757e-05,
      "loss": 0.44365577697753905,
      "memory(GiB)": 10.22,
      "step": 75,
      "token_acc": 0.8333333333333334,
      "train_speed(iter/s)": 0.70009
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 5.4044860823804015,
      "learning_rate": 1.3166679938014728e-05,
      "loss": 0.44077281951904296,
      "memory(GiB)": 10.22,
      "step": 80,
      "token_acc": 0.8547486033519553,
      "train_speed(iter/s)": 0.706774
    },
    {
      "epoch": 0.4885057471264368,
      "grad_norm": 3.814192170889284,
      "learning_rate": 1.2197152309122173e-05,
      "loss": 0.41083593368530275,
      "memory(GiB)": 10.22,
      "step": 85,
      "token_acc": 0.8777777777777778,
      "train_speed(iter/s)": 0.712107
    },
    {
      "epoch": 0.5172413793103449,
      "grad_norm": 6.230484574533733,
      "learning_rate": 1.1205366802553231e-05,
      "loss": 0.4023772716522217,
      "memory(GiB)": 10.22,
      "step": 90,
      "token_acc": 0.8491620111731844,
      "train_speed(iter/s)": 0.716314
    },
    {
      "epoch": 0.5459770114942529,
      "grad_norm": 5.5433724273249965,
      "learning_rate": 1.0201370532654404e-05,
      "loss": 0.3939186096191406,
      "memory(GiB)": 10.22,
      "step": 95,
      "token_acc": 0.8715083798882681,
      "train_speed(iter/s)": 0.720953
    },
    {
      "epoch": 0.5747126436781609,
      "grad_norm": 4.744446848913436,
      "learning_rate": 9.195334312832742e-06,
      "loss": 0.36634814739227295,
      "memory(GiB)": 10.22,
      "step": 100,
      "token_acc": 0.8944444444444445,
      "train_speed(iter/s)": 0.725326
    },
    {
      "epoch": 0.5747126436781609,
      "eval_loss": 3.085969924926758,
      "eval_runtime": 8.8084,
      "eval_samples_per_second": 116.253,
      "eval_steps_per_second": 2.498,
      "eval_token_acc": 0.7783402536829532,
      "step": 100
    },
    {
      "epoch": 0.603448275862069,
      "grad_norm": 3.926014775478041,
      "learning_rate": 8.197449621860944e-06,
      "loss": 0.35011539459228513,
      "memory(GiB)": 15.92,
      "step": 105,
      "token_acc": 0.7667682926829268,
      "train_speed(iter/s)": 0.678189
    },
    {
      "epoch": 0.632183908045977,
      "grad_norm": 4.246402025826811,
      "learning_rate": 7.217825360835475e-06,
      "loss": 0.33618762493133547,
      "memory(GiB)": 15.92,
      "step": 110,
      "token_acc": 0.8659217877094972,
      "train_speed(iter/s)": 0.677226
    },
    {
      "epoch": 0.6609195402298851,
      "grad_norm": 4.310168084132965,
      "learning_rate": 6.266385446673791e-06,
      "loss": 0.31383814811706545,
      "memory(GiB)": 15.92,
      "step": 115,
      "token_acc": 0.9055555555555556,
      "train_speed(iter/s)": 0.67988
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 4.774968229122785,
      "learning_rate": 5.352768279562315e-06,
      "loss": 0.2903557300567627,
      "memory(GiB)": 15.92,
      "step": 120,
      "token_acc": 0.9,
      "train_speed(iter/s)": 0.684505
    },
    {
      "epoch": 0.7183908045977011,
      "grad_norm": 4.0187528651074835,
      "learning_rate": 4.486229102783084e-06,
      "loss": 0.2903176784515381,
      "memory(GiB)": 15.92,
      "step": 125,
      "token_acc": 0.888268156424581,
      "train_speed(iter/s)": 0.688844
    },
    {
      "epoch": 0.7471264367816092,
      "grad_norm": 4.23665356199129,
      "learning_rate": 3.6755462440462288e-06,
      "loss": 0.25630669593811034,
      "memory(GiB)": 15.92,
      "step": 130,
      "token_acc": 0.9111111111111111,
      "train_speed(iter/s)": 0.693005
    },
    {
      "epoch": 0.7758620689655172,
      "grad_norm": 5.164302726059258,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 0.26200544834136963,
      "memory(GiB)": 15.92,
      "step": 135,
      "token_acc": 0.8888888888888888,
      "train_speed(iter/s)": 0.696714
    },
    {
      "epoch": 0.8045977011494253,
      "grad_norm": 4.025947370412893,
      "learning_rate": 2.2539503817234553e-06,
      "loss": 0.24404802322387695,
      "memory(GiB)": 15.92,
      "step": 140,
      "token_acc": 0.9222222222222223,
      "train_speed(iter/s)": 0.700315
    },
    {
      "epoch": 0.8333333333333334,
      "grad_norm": 4.029452854747659,
      "learning_rate": 1.6574386131713872e-06,
      "loss": 0.2302267074584961,
      "memory(GiB)": 15.92,
      "step": 145,
      "token_acc": 0.9,
      "train_speed(iter/s)": 0.703785
    },
    {
      "epoch": 0.8620689655172413,
      "grad_norm": 4.111326617944181,
      "learning_rate": 1.1454397434679022e-06,
      "loss": 0.21495592594146729,
      "memory(GiB)": 15.92,
      "step": 150,
      "token_acc": 0.9333333333333333,
      "train_speed(iter/s)": 0.706914
    },
    {
      "epoch": 0.8908045977011494,
      "grad_norm": 3.6705370041668646,
      "learning_rate": 7.231404900585714e-07,
      "loss": 0.21858932971954345,
      "memory(GiB)": 15.92,
      "step": 155,
      "token_acc": 0.88268156424581,
      "train_speed(iter/s)": 0.709691
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 3.4777493199480354,
      "learning_rate": 3.9481888368627764e-07,
      "loss": 0.21195197105407715,
      "memory(GiB)": 15.92,
      "step": 160,
      "token_acc": 0.9162011173184358,
      "train_speed(iter/s)": 0.712557
    },
    {
      "epoch": 0.9482758620689655,
      "grad_norm": 3.8081053430996246,
      "learning_rate": 1.6380093052856482e-07,
      "loss": 0.20168027877807618,
      "memory(GiB)": 15.92,
      "step": 165,
      "token_acc": 0.9277777777777778,
      "train_speed(iter/s)": 0.715106
    },
    {
      "epoch": 0.9770114942528736,
      "grad_norm": 3.75099140436243,
      "learning_rate": 3.242691865790071e-08,
      "loss": 0.2000497817993164,
      "memory(GiB)": 15.92,
      "step": 170,
      "token_acc": 0.8888888888888888,
      "train_speed(iter/s)": 0.716937
    },
    {
      "epoch": 1.0,
      "eval_loss": 3.787536144256592,
      "eval_runtime": 5.2095,
      "eval_samples_per_second": 196.563,
      "eval_steps_per_second": 4.223,
      "eval_token_acc": 0.7779989761674535,
      "step": 174
    }
  ],
  "logging_steps": 5,
  "max_steps": 174,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 220,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.470457873998807e+17,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null
}