{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.923076923076923, "eval_steps": 25, "global_step": 1001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14792899408284024, "grad_norm": 0.98828125, "learning_rate": 0.0001951951951951952, "loss": 0.9391, "step": 25 }, { "epoch": 0.14792899408284024, "eval_loss": 0.6652668118476868, "eval_runtime": 5.3863, "eval_samples_per_second": 16.523, "eval_steps_per_second": 2.228, "step": 25 }, { "epoch": 0.2958579881656805, "grad_norm": 0.654296875, "learning_rate": 0.0001901901901901902, "loss": 0.6138, "step": 50 }, { "epoch": 0.2958579881656805, "eval_loss": 0.6126009225845337, "eval_runtime": 5.4512, "eval_samples_per_second": 16.327, "eval_steps_per_second": 2.201, "step": 50 }, { "epoch": 0.4437869822485207, "grad_norm": 0.7314453125, "learning_rate": 0.0001851851851851852, "loss": 0.6039, "step": 75 }, { "epoch": 0.4437869822485207, "eval_loss": 0.6061152219772339, "eval_runtime": 5.4459, "eval_samples_per_second": 16.343, "eval_steps_per_second": 2.203, "step": 75 }, { "epoch": 0.591715976331361, "grad_norm": 0.65869140625, "learning_rate": 0.00018018018018018018, "loss": 0.5927, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.5998491644859314, "eval_runtime": 5.4564, "eval_samples_per_second": 16.311, "eval_steps_per_second": 2.199, "step": 100 }, { "epoch": 0.7396449704142012, "grad_norm": 0.67333984375, "learning_rate": 0.0001751751751751752, "loss": 0.5973, "step": 125 }, { "epoch": 0.7396449704142012, "eval_loss": 0.594585120677948, "eval_runtime": 5.4901, "eval_samples_per_second": 16.211, "eval_steps_per_second": 2.186, "step": 125 }, { "epoch": 0.8875739644970414, "grad_norm": 0.65380859375, "learning_rate": 0.0001701701701701702, "loss": 0.602, "step": 150 }, { "epoch": 0.8875739644970414, "eval_loss": 0.5942851305007935, "eval_runtime": 5.4552, "eval_samples_per_second": 16.315, "eval_steps_per_second": 2.2, "step": 150 }, { "epoch": 1.0355029585798816, "grad_norm": 0.58544921875, "learning_rate": 0.00016516516516516518, "loss": 0.547, "step": 175 }, { "epoch": 1.0355029585798816, "eval_loss": 0.6319454312324524, "eval_runtime": 5.4449, "eval_samples_per_second": 16.345, "eval_steps_per_second": 2.204, "step": 175 }, { "epoch": 1.183431952662722, "grad_norm": 0.62158203125, "learning_rate": 0.00016016016016016018, "loss": 0.4239, "step": 200 }, { "epoch": 1.183431952662722, "eval_loss": 0.6168724894523621, "eval_runtime": 5.4499, "eval_samples_per_second": 16.331, "eval_steps_per_second": 2.202, "step": 200 }, { "epoch": 1.331360946745562, "grad_norm": 0.71240234375, "learning_rate": 0.00015515515515515516, "loss": 0.4301, "step": 225 }, { "epoch": 1.331360946745562, "eval_loss": 0.615761935710907, "eval_runtime": 5.4932, "eval_samples_per_second": 16.202, "eval_steps_per_second": 2.185, "step": 225 }, { "epoch": 1.4792899408284024, "grad_norm": 0.6865234375, "learning_rate": 0.00015015015015015014, "loss": 0.4176, "step": 250 }, { "epoch": 1.4792899408284024, "eval_loss": 0.6192708611488342, "eval_runtime": 5.4548, "eval_samples_per_second": 16.316, "eval_steps_per_second": 2.2, "step": 250 }, { "epoch": 1.6272189349112427, "grad_norm": 0.8076171875, "learning_rate": 0.00014514514514514515, "loss": 0.4295, "step": 275 }, { "epoch": 1.6272189349112427, "eval_loss": 0.6242427229881287, "eval_runtime": 5.4583, "eval_samples_per_second": 16.305, "eval_steps_per_second": 2.198, "step": 275 }, { "epoch": 1.7751479289940828, "grad_norm": 0.66796875, "learning_rate": 0.00014014014014014013, "loss": 0.4252, "step": 300 }, { "epoch": 1.7751479289940828, "eval_loss": 0.6264795660972595, "eval_runtime": 5.4513, "eval_samples_per_second": 16.326, "eval_steps_per_second": 2.201, "step": 300 }, { "epoch": 1.9230769230769231, "grad_norm": 0.720703125, "learning_rate": 0.00013513513513513514, "loss": 0.4252, "step": 325 }, { "epoch": 1.9230769230769231, "eval_loss": 0.6264156103134155, "eval_runtime": 5.4759, "eval_samples_per_second": 16.253, "eval_steps_per_second": 2.191, "step": 325 }, { "epoch": 2.0710059171597632, "grad_norm": 0.76611328125, "learning_rate": 0.00013013013013013014, "loss": 0.3591, "step": 350 }, { "epoch": 2.0710059171597632, "eval_loss": 0.6893021464347839, "eval_runtime": 5.4744, "eval_samples_per_second": 16.258, "eval_steps_per_second": 2.192, "step": 350 }, { "epoch": 2.2189349112426036, "grad_norm": 0.74169921875, "learning_rate": 0.00012512512512512512, "loss": 0.2758, "step": 375 }, { "epoch": 2.2189349112426036, "eval_loss": 0.7153319716453552, "eval_runtime": 5.504, "eval_samples_per_second": 16.17, "eval_steps_per_second": 2.18, "step": 375 }, { "epoch": 2.366863905325444, "grad_norm": 0.69384765625, "learning_rate": 0.00012012012012012013, "loss": 0.2702, "step": 400 }, { "epoch": 2.366863905325444, "eval_loss": 0.7170297503471375, "eval_runtime": 5.4565, "eval_samples_per_second": 16.311, "eval_steps_per_second": 2.199, "step": 400 }, { "epoch": 2.5147928994082838, "grad_norm": 0.806640625, "learning_rate": 0.00011511511511511512, "loss": 0.2797, "step": 425 }, { "epoch": 2.5147928994082838, "eval_loss": 0.7173412442207336, "eval_runtime": 5.4741, "eval_samples_per_second": 16.258, "eval_steps_per_second": 2.192, "step": 425 }, { "epoch": 2.662721893491124, "grad_norm": 0.77099609375, "learning_rate": 0.00011011011011011012, "loss": 0.2727, "step": 450 }, { "epoch": 2.662721893491124, "eval_loss": 0.7144489288330078, "eval_runtime": 5.5009, "eval_samples_per_second": 16.179, "eval_steps_per_second": 2.181, "step": 450 }, { "epoch": 2.8106508875739644, "grad_norm": 42.5625, "learning_rate": 0.00010510510510510511, "loss": 0.2817, "step": 475 }, { "epoch": 2.8106508875739644, "eval_loss": 0.7168906331062317, "eval_runtime": 5.4533, "eval_samples_per_second": 16.32, "eval_steps_per_second": 2.201, "step": 475 }, { "epoch": 2.9585798816568047, "grad_norm": 0.724609375, "learning_rate": 0.00010010010010010012, "loss": 0.2798, "step": 500 }, { "epoch": 2.9585798816568047, "eval_loss": 0.7015586495399475, "eval_runtime": 5.467, "eval_samples_per_second": 16.28, "eval_steps_per_second": 2.195, "step": 500 }, { "epoch": 3.106508875739645, "grad_norm": 0.6162109375, "learning_rate": 9.50950950950951e-05, "loss": 0.1922, "step": 525 }, { "epoch": 3.106508875739645, "eval_loss": 0.8090196847915649, "eval_runtime": 5.458, "eval_samples_per_second": 16.306, "eval_steps_per_second": 2.199, "step": 525 }, { "epoch": 3.2544378698224854, "grad_norm": 0.80517578125, "learning_rate": 9.009009009009009e-05, "loss": 0.16, "step": 550 }, { "epoch": 3.2544378698224854, "eval_loss": 0.8372513651847839, "eval_runtime": 5.4975, "eval_samples_per_second": 16.189, "eval_steps_per_second": 2.183, "step": 550 }, { "epoch": 3.4023668639053253, "grad_norm": 0.71728515625, "learning_rate": 8.50850850850851e-05, "loss": 0.1623, "step": 575 }, { "epoch": 3.4023668639053253, "eval_loss": 0.8371546864509583, "eval_runtime": 5.4897, "eval_samples_per_second": 16.212, "eval_steps_per_second": 2.186, "step": 575 }, { "epoch": 3.5502958579881656, "grad_norm": 0.775390625, "learning_rate": 8.008008008008009e-05, "loss": 0.1632, "step": 600 }, { "epoch": 3.5502958579881656, "eval_loss": 0.8401942849159241, "eval_runtime": 5.4525, "eval_samples_per_second": 16.323, "eval_steps_per_second": 2.201, "step": 600 }, { "epoch": 3.698224852071006, "grad_norm": 0.96337890625, "learning_rate": 7.507507507507507e-05, "loss": 0.1618, "step": 625 }, { "epoch": 3.698224852071006, "eval_loss": 0.8558365106582642, "eval_runtime": 5.4558, "eval_samples_per_second": 16.313, "eval_steps_per_second": 2.199, "step": 625 }, { "epoch": 3.8461538461538463, "grad_norm": 0.80322265625, "learning_rate": 7.007007007007007e-05, "loss": 0.1732, "step": 650 }, { "epoch": 3.8461538461538463, "eval_loss": 0.8581485748291016, "eval_runtime": 5.4935, "eval_samples_per_second": 16.201, "eval_steps_per_second": 2.184, "step": 650 }, { "epoch": 3.994082840236686, "grad_norm": 0.85498046875, "learning_rate": 6.506506506506507e-05, "loss": 0.1687, "step": 675 }, { "epoch": 3.994082840236686, "eval_loss": 0.8611082434654236, "eval_runtime": 5.4485, "eval_samples_per_second": 16.335, "eval_steps_per_second": 2.202, "step": 675 }, { "epoch": 4.1420118343195265, "grad_norm": 0.5654296875, "learning_rate": 6.0060060060060066e-05, "loss": 0.0961, "step": 700 }, { "epoch": 4.1420118343195265, "eval_loss": 0.9902079105377197, "eval_runtime": 5.519, "eval_samples_per_second": 16.126, "eval_steps_per_second": 2.174, "step": 700 }, { "epoch": 4.289940828402367, "grad_norm": 0.560546875, "learning_rate": 5.505505505505506e-05, "loss": 0.0879, "step": 725 }, { "epoch": 4.289940828402367, "eval_loss": 1.0101935863494873, "eval_runtime": 5.4771, "eval_samples_per_second": 16.25, "eval_steps_per_second": 2.191, "step": 725 }, { "epoch": 4.437869822485207, "grad_norm": 0.76611328125, "learning_rate": 5.005005005005006e-05, "loss": 0.0899, "step": 750 }, { "epoch": 4.437869822485207, "eval_loss": 1.0344929695129395, "eval_runtime": 5.4997, "eval_samples_per_second": 16.183, "eval_steps_per_second": 2.182, "step": 750 }, { "epoch": 4.585798816568047, "grad_norm": 0.595703125, "learning_rate": 4.5045045045045046e-05, "loss": 0.0899, "step": 775 }, { "epoch": 4.585798816568047, "eval_loss": 1.0255744457244873, "eval_runtime": 5.4646, "eval_samples_per_second": 16.287, "eval_steps_per_second": 2.196, "step": 775 }, { "epoch": 4.733727810650888, "grad_norm": 0.5869140625, "learning_rate": 4.0040040040040046e-05, "loss": 0.0882, "step": 800 }, { "epoch": 4.733727810650888, "eval_loss": 1.0273164510726929, "eval_runtime": 5.4989, "eval_samples_per_second": 16.185, "eval_steps_per_second": 2.182, "step": 800 }, { "epoch": 4.881656804733728, "grad_norm": 0.720703125, "learning_rate": 3.503503503503503e-05, "loss": 0.0893, "step": 825 }, { "epoch": 4.881656804733728, "eval_loss": 1.0559364557266235, "eval_runtime": 5.4574, "eval_samples_per_second": 16.308, "eval_steps_per_second": 2.199, "step": 825 }, { "epoch": 5.029585798816568, "grad_norm": 0.4755859375, "learning_rate": 3.0030030030030033e-05, "loss": 0.0824, "step": 850 }, { "epoch": 5.029585798816568, "eval_loss": 1.0753172636032104, "eval_runtime": 5.5098, "eval_samples_per_second": 16.153, "eval_steps_per_second": 2.178, "step": 850 }, { "epoch": 5.177514792899408, "grad_norm": 0.50439453125, "learning_rate": 2.502502502502503e-05, "loss": 0.052, "step": 875 }, { "epoch": 5.177514792899408, "eval_loss": 1.158236026763916, "eval_runtime": 5.4641, "eval_samples_per_second": 16.288, "eval_steps_per_second": 2.196, "step": 875 }, { "epoch": 5.325443786982248, "grad_norm": 0.468994140625, "learning_rate": 2.0020020020020023e-05, "loss": 0.052, "step": 900 }, { "epoch": 5.325443786982248, "eval_loss": 1.164330005645752, "eval_runtime": 5.4588, "eval_samples_per_second": 16.304, "eval_steps_per_second": 2.198, "step": 900 }, { "epoch": 5.4733727810650885, "grad_norm": 0.5849609375, "learning_rate": 1.5015015015015016e-05, "loss": 0.0526, "step": 925 }, { "epoch": 5.4733727810650885, "eval_loss": 1.1923322677612305, "eval_runtime": 5.5009, "eval_samples_per_second": 16.179, "eval_steps_per_second": 2.181, "step": 925 }, { "epoch": 5.621301775147929, "grad_norm": 0.52783203125, "learning_rate": 1.0010010010010011e-05, "loss": 0.0497, "step": 950 }, { "epoch": 5.621301775147929, "eval_loss": 1.175872802734375, "eval_runtime": 5.4976, "eval_samples_per_second": 16.189, "eval_steps_per_second": 2.183, "step": 950 }, { "epoch": 5.769230769230769, "grad_norm": 0.461669921875, "learning_rate": 5.005005005005006e-06, "loss": 0.0496, "step": 975 }, { "epoch": 5.769230769230769, "eval_loss": 1.1811896562576294, "eval_runtime": 5.4611, "eval_samples_per_second": 16.297, "eval_steps_per_second": 2.197, "step": 975 }, { "epoch": 5.9171597633136095, "grad_norm": 0.487548828125, "learning_rate": 0.0, "loss": 0.0477, "step": 1000 }, { "epoch": 5.9171597633136095, "eval_loss": 1.1831614971160889, "eval_runtime": 5.452, "eval_samples_per_second": 16.324, "eval_steps_per_second": 2.201, "step": 1000 }, { "epoch": 5.923076923076923, "step": 1001, "total_flos": 1.7606154086724403e+17, "train_loss": 4.579017792905604e-05, "train_runtime": 1.6348, "train_samples_per_second": 2446.747, "train_steps_per_second": 611.687 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "total_flos": 1.7606154086724403e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }