| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.923076923076923, | |
| "eval_steps": 25, | |
| "global_step": 1001, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.14792899408284024, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 0.0001951951951951952, | |
| "loss": 0.9391, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.14792899408284024, | |
| "eval_loss": 0.6652668118476868, | |
| "eval_runtime": 5.3863, | |
| "eval_samples_per_second": 16.523, | |
| "eval_steps_per_second": 2.228, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.2958579881656805, | |
| "grad_norm": 0.654296875, | |
| "learning_rate": 0.0001901901901901902, | |
| "loss": 0.6138, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.2958579881656805, | |
| "eval_loss": 0.6126009225845337, | |
| "eval_runtime": 5.4512, | |
| "eval_samples_per_second": 16.327, | |
| "eval_steps_per_second": 2.201, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.4437869822485207, | |
| "grad_norm": 0.7314453125, | |
| "learning_rate": 0.0001851851851851852, | |
| "loss": 0.6039, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.4437869822485207, | |
| "eval_loss": 0.6061152219772339, | |
| "eval_runtime": 5.4459, | |
| "eval_samples_per_second": 16.343, | |
| "eval_steps_per_second": 2.203, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.591715976331361, | |
| "grad_norm": 0.65869140625, | |
| "learning_rate": 0.00018018018018018018, | |
| "loss": 0.5927, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.591715976331361, | |
| "eval_loss": 0.5998491644859314, | |
| "eval_runtime": 5.4564, | |
| "eval_samples_per_second": 16.311, | |
| "eval_steps_per_second": 2.199, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.7396449704142012, | |
| "grad_norm": 0.67333984375, | |
| "learning_rate": 0.0001751751751751752, | |
| "loss": 0.5973, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.7396449704142012, | |
| "eval_loss": 0.594585120677948, | |
| "eval_runtime": 5.4901, | |
| "eval_samples_per_second": 16.211, | |
| "eval_steps_per_second": 2.186, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.8875739644970414, | |
| "grad_norm": 0.65380859375, | |
| "learning_rate": 0.0001701701701701702, | |
| "loss": 0.602, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8875739644970414, | |
| "eval_loss": 0.5942851305007935, | |
| "eval_runtime": 5.4552, | |
| "eval_samples_per_second": 16.315, | |
| "eval_steps_per_second": 2.2, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 1.0355029585798816, | |
| "grad_norm": 0.58544921875, | |
| "learning_rate": 0.00016516516516516518, | |
| "loss": 0.547, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.0355029585798816, | |
| "eval_loss": 0.6319454312324524, | |
| "eval_runtime": 5.4449, | |
| "eval_samples_per_second": 16.345, | |
| "eval_steps_per_second": 2.204, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 1.183431952662722, | |
| "grad_norm": 0.62158203125, | |
| "learning_rate": 0.00016016016016016018, | |
| "loss": 0.4239, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.183431952662722, | |
| "eval_loss": 0.6168724894523621, | |
| "eval_runtime": 5.4499, | |
| "eval_samples_per_second": 16.331, | |
| "eval_steps_per_second": 2.202, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.331360946745562, | |
| "grad_norm": 0.71240234375, | |
| "learning_rate": 0.00015515515515515516, | |
| "loss": 0.4301, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.331360946745562, | |
| "eval_loss": 0.615761935710907, | |
| "eval_runtime": 5.4932, | |
| "eval_samples_per_second": 16.202, | |
| "eval_steps_per_second": 2.185, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.4792899408284024, | |
| "grad_norm": 0.6865234375, | |
| "learning_rate": 0.00015015015015015014, | |
| "loss": 0.4176, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.4792899408284024, | |
| "eval_loss": 0.6192708611488342, | |
| "eval_runtime": 5.4548, | |
| "eval_samples_per_second": 16.316, | |
| "eval_steps_per_second": 2.2, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.6272189349112427, | |
| "grad_norm": 0.8076171875, | |
| "learning_rate": 0.00014514514514514515, | |
| "loss": 0.4295, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.6272189349112427, | |
| "eval_loss": 0.6242427229881287, | |
| "eval_runtime": 5.4583, | |
| "eval_samples_per_second": 16.305, | |
| "eval_steps_per_second": 2.198, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.7751479289940828, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.00014014014014014013, | |
| "loss": 0.4252, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.7751479289940828, | |
| "eval_loss": 0.6264795660972595, | |
| "eval_runtime": 5.4513, | |
| "eval_samples_per_second": 16.326, | |
| "eval_steps_per_second": 2.201, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "grad_norm": 0.720703125, | |
| "learning_rate": 0.00013513513513513514, | |
| "loss": 0.4252, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.9230769230769231, | |
| "eval_loss": 0.6264156103134155, | |
| "eval_runtime": 5.4759, | |
| "eval_samples_per_second": 16.253, | |
| "eval_steps_per_second": 2.191, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 2.0710059171597632, | |
| "grad_norm": 0.76611328125, | |
| "learning_rate": 0.00013013013013013014, | |
| "loss": 0.3591, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.0710059171597632, | |
| "eval_loss": 0.6893021464347839, | |
| "eval_runtime": 5.4744, | |
| "eval_samples_per_second": 16.258, | |
| "eval_steps_per_second": 2.192, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.2189349112426036, | |
| "grad_norm": 0.74169921875, | |
| "learning_rate": 0.00012512512512512512, | |
| "loss": 0.2758, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.2189349112426036, | |
| "eval_loss": 0.7153319716453552, | |
| "eval_runtime": 5.504, | |
| "eval_samples_per_second": 16.17, | |
| "eval_steps_per_second": 2.18, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 2.366863905325444, | |
| "grad_norm": 0.69384765625, | |
| "learning_rate": 0.00012012012012012013, | |
| "loss": 0.2702, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.366863905325444, | |
| "eval_loss": 0.7170297503471375, | |
| "eval_runtime": 5.4565, | |
| "eval_samples_per_second": 16.311, | |
| "eval_steps_per_second": 2.199, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.5147928994082838, | |
| "grad_norm": 0.806640625, | |
| "learning_rate": 0.00011511511511511512, | |
| "loss": 0.2797, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.5147928994082838, | |
| "eval_loss": 0.7173412442207336, | |
| "eval_runtime": 5.4741, | |
| "eval_samples_per_second": 16.258, | |
| "eval_steps_per_second": 2.192, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 2.662721893491124, | |
| "grad_norm": 0.77099609375, | |
| "learning_rate": 0.00011011011011011012, | |
| "loss": 0.2727, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.662721893491124, | |
| "eval_loss": 0.7144489288330078, | |
| "eval_runtime": 5.5009, | |
| "eval_samples_per_second": 16.179, | |
| "eval_steps_per_second": 2.181, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.8106508875739644, | |
| "grad_norm": 42.5625, | |
| "learning_rate": 0.00010510510510510511, | |
| "loss": 0.2817, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.8106508875739644, | |
| "eval_loss": 0.7168906331062317, | |
| "eval_runtime": 5.4533, | |
| "eval_samples_per_second": 16.32, | |
| "eval_steps_per_second": 2.201, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 2.9585798816568047, | |
| "grad_norm": 0.724609375, | |
| "learning_rate": 0.00010010010010010012, | |
| "loss": 0.2798, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.9585798816568047, | |
| "eval_loss": 0.7015586495399475, | |
| "eval_runtime": 5.467, | |
| "eval_samples_per_second": 16.28, | |
| "eval_steps_per_second": 2.195, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 3.106508875739645, | |
| "grad_norm": 0.6162109375, | |
| "learning_rate": 9.50950950950951e-05, | |
| "loss": 0.1922, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.106508875739645, | |
| "eval_loss": 0.8090196847915649, | |
| "eval_runtime": 5.458, | |
| "eval_samples_per_second": 16.306, | |
| "eval_steps_per_second": 2.199, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 3.2544378698224854, | |
| "grad_norm": 0.80517578125, | |
| "learning_rate": 9.009009009009009e-05, | |
| "loss": 0.16, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.2544378698224854, | |
| "eval_loss": 0.8372513651847839, | |
| "eval_runtime": 5.4975, | |
| "eval_samples_per_second": 16.189, | |
| "eval_steps_per_second": 2.183, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 3.4023668639053253, | |
| "grad_norm": 0.71728515625, | |
| "learning_rate": 8.50850850850851e-05, | |
| "loss": 0.1623, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 3.4023668639053253, | |
| "eval_loss": 0.8371546864509583, | |
| "eval_runtime": 5.4897, | |
| "eval_samples_per_second": 16.212, | |
| "eval_steps_per_second": 2.186, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 3.5502958579881656, | |
| "grad_norm": 0.775390625, | |
| "learning_rate": 8.008008008008009e-05, | |
| "loss": 0.1632, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.5502958579881656, | |
| "eval_loss": 0.8401942849159241, | |
| "eval_runtime": 5.4525, | |
| "eval_samples_per_second": 16.323, | |
| "eval_steps_per_second": 2.201, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.698224852071006, | |
| "grad_norm": 0.96337890625, | |
| "learning_rate": 7.507507507507507e-05, | |
| "loss": 0.1618, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 3.698224852071006, | |
| "eval_loss": 0.8558365106582642, | |
| "eval_runtime": 5.4558, | |
| "eval_samples_per_second": 16.313, | |
| "eval_steps_per_second": 2.199, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 3.8461538461538463, | |
| "grad_norm": 0.80322265625, | |
| "learning_rate": 7.007007007007007e-05, | |
| "loss": 0.1732, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.8461538461538463, | |
| "eval_loss": 0.8581485748291016, | |
| "eval_runtime": 5.4935, | |
| "eval_samples_per_second": 16.201, | |
| "eval_steps_per_second": 2.184, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.994082840236686, | |
| "grad_norm": 0.85498046875, | |
| "learning_rate": 6.506506506506507e-05, | |
| "loss": 0.1687, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 3.994082840236686, | |
| "eval_loss": 0.8611082434654236, | |
| "eval_runtime": 5.4485, | |
| "eval_samples_per_second": 16.335, | |
| "eval_steps_per_second": 2.202, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 4.1420118343195265, | |
| "grad_norm": 0.5654296875, | |
| "learning_rate": 6.0060060060060066e-05, | |
| "loss": 0.0961, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.1420118343195265, | |
| "eval_loss": 0.9902079105377197, | |
| "eval_runtime": 5.519, | |
| "eval_samples_per_second": 16.126, | |
| "eval_steps_per_second": 2.174, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 4.289940828402367, | |
| "grad_norm": 0.560546875, | |
| "learning_rate": 5.505505505505506e-05, | |
| "loss": 0.0879, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 4.289940828402367, | |
| "eval_loss": 1.0101935863494873, | |
| "eval_runtime": 5.4771, | |
| "eval_samples_per_second": 16.25, | |
| "eval_steps_per_second": 2.191, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 4.437869822485207, | |
| "grad_norm": 0.76611328125, | |
| "learning_rate": 5.005005005005006e-05, | |
| "loss": 0.0899, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.437869822485207, | |
| "eval_loss": 1.0344929695129395, | |
| "eval_runtime": 5.4997, | |
| "eval_samples_per_second": 16.183, | |
| "eval_steps_per_second": 2.182, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 4.585798816568047, | |
| "grad_norm": 0.595703125, | |
| "learning_rate": 4.5045045045045046e-05, | |
| "loss": 0.0899, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 4.585798816568047, | |
| "eval_loss": 1.0255744457244873, | |
| "eval_runtime": 5.4646, | |
| "eval_samples_per_second": 16.287, | |
| "eval_steps_per_second": 2.196, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 4.733727810650888, | |
| "grad_norm": 0.5869140625, | |
| "learning_rate": 4.0040040040040046e-05, | |
| "loss": 0.0882, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.733727810650888, | |
| "eval_loss": 1.0273164510726929, | |
| "eval_runtime": 5.4989, | |
| "eval_samples_per_second": 16.185, | |
| "eval_steps_per_second": 2.182, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.881656804733728, | |
| "grad_norm": 0.720703125, | |
| "learning_rate": 3.503503503503503e-05, | |
| "loss": 0.0893, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 4.881656804733728, | |
| "eval_loss": 1.0559364557266235, | |
| "eval_runtime": 5.4574, | |
| "eval_samples_per_second": 16.308, | |
| "eval_steps_per_second": 2.199, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 5.029585798816568, | |
| "grad_norm": 0.4755859375, | |
| "learning_rate": 3.0030030030030033e-05, | |
| "loss": 0.0824, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 5.029585798816568, | |
| "eval_loss": 1.0753172636032104, | |
| "eval_runtime": 5.5098, | |
| "eval_samples_per_second": 16.153, | |
| "eval_steps_per_second": 2.178, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 5.177514792899408, | |
| "grad_norm": 0.50439453125, | |
| "learning_rate": 2.502502502502503e-05, | |
| "loss": 0.052, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 5.177514792899408, | |
| "eval_loss": 1.158236026763916, | |
| "eval_runtime": 5.4641, | |
| "eval_samples_per_second": 16.288, | |
| "eval_steps_per_second": 2.196, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 5.325443786982248, | |
| "grad_norm": 0.468994140625, | |
| "learning_rate": 2.0020020020020023e-05, | |
| "loss": 0.052, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 5.325443786982248, | |
| "eval_loss": 1.164330005645752, | |
| "eval_runtime": 5.4588, | |
| "eval_samples_per_second": 16.304, | |
| "eval_steps_per_second": 2.198, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 5.4733727810650885, | |
| "grad_norm": 0.5849609375, | |
| "learning_rate": 1.5015015015015016e-05, | |
| "loss": 0.0526, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 5.4733727810650885, | |
| "eval_loss": 1.1923322677612305, | |
| "eval_runtime": 5.5009, | |
| "eval_samples_per_second": 16.179, | |
| "eval_steps_per_second": 2.181, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 5.621301775147929, | |
| "grad_norm": 0.52783203125, | |
| "learning_rate": 1.0010010010010011e-05, | |
| "loss": 0.0497, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.621301775147929, | |
| "eval_loss": 1.175872802734375, | |
| "eval_runtime": 5.4976, | |
| "eval_samples_per_second": 16.189, | |
| "eval_steps_per_second": 2.183, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 5.769230769230769, | |
| "grad_norm": 0.461669921875, | |
| "learning_rate": 5.005005005005006e-06, | |
| "loss": 0.0496, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 5.769230769230769, | |
| "eval_loss": 1.1811896562576294, | |
| "eval_runtime": 5.4611, | |
| "eval_samples_per_second": 16.297, | |
| "eval_steps_per_second": 2.197, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 5.9171597633136095, | |
| "grad_norm": 0.487548828125, | |
| "learning_rate": 0.0, | |
| "loss": 0.0477, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.9171597633136095, | |
| "eval_loss": 1.1831614971160889, | |
| "eval_runtime": 5.452, | |
| "eval_samples_per_second": 16.324, | |
| "eval_steps_per_second": 2.201, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.923076923076923, | |
| "step": 1001, | |
| "total_flos": 1.7606154086724403e+17, | |
| "train_loss": 4.579017792905604e-05, | |
| "train_runtime": 1.6348, | |
| "train_samples_per_second": 2446.747, | |
| "train_steps_per_second": 611.687 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 1000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 25, | |
| "total_flos": 1.7606154086724403e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |