{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.923076923076923, "eval_steps": 25, "global_step": 1001, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14792899408284024, "grad_norm": 0.7053780555725098, "learning_rate": 0.0001951951951951952, "loss": 0.9474, "step": 25 }, { "epoch": 0.14792899408284024, "eval_loss": 0.6950487494468689, "eval_runtime": 31.2023, "eval_samples_per_second": 2.852, "eval_steps_per_second": 0.385, "step": 25 }, { "epoch": 0.2958579881656805, "grad_norm": 0.6385655403137207, "learning_rate": 0.0001901901901901902, "loss": 0.6403, "step": 50 }, { "epoch": 0.2958579881656805, "eval_loss": 0.6290514469146729, "eval_runtime": 31.6217, "eval_samples_per_second": 2.815, "eval_steps_per_second": 0.379, "step": 50 }, { "epoch": 0.4437869822485207, "grad_norm": 0.7015706896781921, "learning_rate": 0.0001851851851851852, "loss": 0.6184, "step": 75 }, { "epoch": 0.4437869822485207, "eval_loss": 0.6191244125366211, "eval_runtime": 31.7723, "eval_samples_per_second": 2.801, "eval_steps_per_second": 0.378, "step": 75 }, { "epoch": 0.591715976331361, "grad_norm": 0.6220183372497559, "learning_rate": 0.00018018018018018018, "loss": 0.6041, "step": 100 }, { "epoch": 0.591715976331361, "eval_loss": 0.608259379863739, "eval_runtime": 31.7817, "eval_samples_per_second": 2.8, "eval_steps_per_second": 0.378, "step": 100 }, { "epoch": 0.7396449704142012, "grad_norm": 0.6809254884719849, "learning_rate": 0.0001751751751751752, "loss": 0.6023, "step": 125 }, { "epoch": 0.7396449704142012, "eval_loss": 0.604732871055603, "eval_runtime": 31.6489, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 125 }, { "epoch": 0.8875739644970414, "grad_norm": 0.6159196496009827, "learning_rate": 0.0001701701701701702, "loss": 0.6086, "step": 150 }, { "epoch": 0.8875739644970414, "eval_loss": 0.6013623476028442, "eval_runtime": 31.6492, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 150 }, { "epoch": 1.0355029585798816, "grad_norm": 0.5559250116348267, "learning_rate": 0.00016516516516516518, "loss": 0.5545, "step": 175 }, { "epoch": 1.0355029585798816, "eval_loss": 0.6374889612197876, "eval_runtime": 31.6397, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 175 }, { "epoch": 1.183431952662722, "grad_norm": 0.5993044972419739, "learning_rate": 0.00016016016016016018, "loss": 0.4278, "step": 200 }, { "epoch": 1.183431952662722, "eval_loss": 0.6234655380249023, "eval_runtime": 31.6312, "eval_samples_per_second": 2.814, "eval_steps_per_second": 0.379, "step": 200 }, { "epoch": 1.331360946745562, "grad_norm": 0.689406156539917, "learning_rate": 0.00015515515515515516, "loss": 0.4297, "step": 225 }, { "epoch": 1.331360946745562, "eval_loss": 0.6275980472564697, "eval_runtime": 31.6369, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 225 }, { "epoch": 1.4792899408284024, "grad_norm": 0.7030369639396667, "learning_rate": 0.00015015015015015014, "loss": 0.4172, "step": 250 }, { "epoch": 1.4792899408284024, "eval_loss": 0.6314178705215454, "eval_runtime": 31.6517, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 250 }, { "epoch": 1.6272189349112427, "grad_norm": 0.7457050085067749, "learning_rate": 0.00014514514514514515, "loss": 0.4273, "step": 275 }, { "epoch": 1.6272189349112427, "eval_loss": 0.6374988555908203, "eval_runtime": 31.6395, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 275 }, { "epoch": 1.7751479289940828, "grad_norm": 0.6606324315071106, "learning_rate": 0.00014014014014014013, "loss": 0.4264, "step": 300 }, { "epoch": 1.7751479289940828, "eval_loss": 0.6344882845878601, "eval_runtime": 31.6619, "eval_samples_per_second": 2.811, "eval_steps_per_second": 0.379, "step": 300 }, { "epoch": 1.9230769230769231, "grad_norm": 0.675614058971405, "learning_rate": 0.00013513513513513514, "loss": 0.428, "step": 325 }, { "epoch": 1.9230769230769231, "eval_loss": 0.6296113133430481, "eval_runtime": 31.6627, "eval_samples_per_second": 2.811, "eval_steps_per_second": 0.379, "step": 325 }, { "epoch": 2.0710059171597632, "grad_norm": 0.7311059832572937, "learning_rate": 0.00013013013013013014, "loss": 0.3524, "step": 350 }, { "epoch": 2.0710059171597632, "eval_loss": 0.7067192792892456, "eval_runtime": 31.651, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 350 }, { "epoch": 2.2189349112426036, "grad_norm": 0.7563914060592651, "learning_rate": 0.00012512512512512512, "loss": 0.2697, "step": 375 }, { "epoch": 2.2189349112426036, "eval_loss": 0.7313967943191528, "eval_runtime": 31.645, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 375 }, { "epoch": 2.366863905325444, "grad_norm": 0.6278096437454224, "learning_rate": 0.00012012012012012013, "loss": 0.2645, "step": 400 }, { "epoch": 2.366863905325444, "eval_loss": 0.725497841835022, "eval_runtime": 31.6442, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 400 }, { "epoch": 2.5147928994082838, "grad_norm": 0.782738447189331, "learning_rate": 0.00011511511511511512, "loss": 0.2778, "step": 425 }, { "epoch": 2.5147928994082838, "eval_loss": 0.7220944166183472, "eval_runtime": 31.6506, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 425 }, { "epoch": 2.662721893491124, "grad_norm": 0.7897526025772095, "learning_rate": 0.00011011011011011012, "loss": 0.2687, "step": 450 }, { "epoch": 2.662721893491124, "eval_loss": 0.7373032569885254, "eval_runtime": 31.668, "eval_samples_per_second": 2.81, "eval_steps_per_second": 0.379, "step": 450 }, { "epoch": 2.8106508875739644, "grad_norm": 0.8417075276374817, "learning_rate": 0.00010510510510510511, "loss": 0.2748, "step": 475 }, { "epoch": 2.8106508875739644, "eval_loss": 0.7250338196754456, "eval_runtime": 31.7591, "eval_samples_per_second": 2.802, "eval_steps_per_second": 0.378, "step": 475 }, { "epoch": 2.9585798816568047, "grad_norm": 0.672287106513977, "learning_rate": 0.00010010010010010012, "loss": 0.2737, "step": 500 }, { "epoch": 2.9585798816568047, "eval_loss": 0.7213594913482666, "eval_runtime": 31.6416, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 500 }, { "epoch": 3.106508875739645, "grad_norm": 0.658898115158081, "learning_rate": 9.50950950950951e-05, "loss": 0.1848, "step": 525 }, { "epoch": 3.106508875739645, "eval_loss": 0.8414345979690552, "eval_runtime": 31.6513, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 525 }, { "epoch": 3.2544378698224854, "grad_norm": 0.6711578369140625, "learning_rate": 9.009009009009009e-05, "loss": 0.1535, "step": 550 }, { "epoch": 3.2544378698224854, "eval_loss": 0.8437659740447998, "eval_runtime": 31.6588, "eval_samples_per_second": 2.811, "eval_steps_per_second": 0.379, "step": 550 }, { "epoch": 3.4023668639053253, "grad_norm": 0.6709449887275696, "learning_rate": 8.50850850850851e-05, "loss": 0.1565, "step": 575 }, { "epoch": 3.4023668639053253, "eval_loss": 0.847898006439209, "eval_runtime": 31.6337, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 575 }, { "epoch": 3.5502958579881656, "grad_norm": 0.7851375937461853, "learning_rate": 8.008008008008009e-05, "loss": 0.1583, "step": 600 }, { "epoch": 3.5502958579881656, "eval_loss": 0.8719269633293152, "eval_runtime": 31.6367, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 600 }, { "epoch": 3.698224852071006, "grad_norm": 0.7410476207733154, "learning_rate": 7.507507507507507e-05, "loss": 0.1537, "step": 625 }, { "epoch": 3.698224852071006, "eval_loss": 0.8496631383895874, "eval_runtime": 31.6614, "eval_samples_per_second": 2.811, "eval_steps_per_second": 0.379, "step": 625 }, { "epoch": 3.8461538461538463, "grad_norm": 0.7157964110374451, "learning_rate": 7.007007007007007e-05, "loss": 0.1611, "step": 650 }, { "epoch": 3.8461538461538463, "eval_loss": 0.8586809039115906, "eval_runtime": 31.6397, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 650 }, { "epoch": 3.994082840236686, "grad_norm": 0.7757616639137268, "learning_rate": 6.506506506506507e-05, "loss": 0.16, "step": 675 }, { "epoch": 3.994082840236686, "eval_loss": 0.8713619112968445, "eval_runtime": 31.6525, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 675 }, { "epoch": 4.1420118343195265, "grad_norm": 0.5490134358406067, "learning_rate": 6.0060060060060066e-05, "loss": 0.0913, "step": 700 }, { "epoch": 4.1420118343195265, "eval_loss": 1.0193753242492676, "eval_runtime": 31.652, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 700 }, { "epoch": 4.289940828402367, "grad_norm": 0.667753279209137, "learning_rate": 5.505505505505506e-05, "loss": 0.0841, "step": 725 }, { "epoch": 4.289940828402367, "eval_loss": 1.0428720712661743, "eval_runtime": 31.6444, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 725 }, { "epoch": 4.437869822485207, "grad_norm": 0.5531997084617615, "learning_rate": 5.005005005005006e-05, "loss": 0.0836, "step": 750 }, { "epoch": 4.437869822485207, "eval_loss": 1.0543982982635498, "eval_runtime": 31.6468, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 750 }, { "epoch": 4.585798816568047, "grad_norm": 0.6152017712593079, "learning_rate": 4.5045045045045046e-05, "loss": 0.0842, "step": 775 }, { "epoch": 4.585798816568047, "eval_loss": 1.0431654453277588, "eval_runtime": 31.6364, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 775 }, { "epoch": 4.733727810650888, "grad_norm": 0.5139034986495972, "learning_rate": 4.0040040040040046e-05, "loss": 0.081, "step": 800 }, { "epoch": 4.733727810650888, "eval_loss": 1.0386168956756592, "eval_runtime": 31.7013, "eval_samples_per_second": 2.807, "eval_steps_per_second": 0.379, "step": 800 }, { "epoch": 4.881656804733728, "grad_norm": 0.7744113206863403, "learning_rate": 3.503503503503503e-05, "loss": 0.0848, "step": 825 }, { "epoch": 4.881656804733728, "eval_loss": 1.0704792737960815, "eval_runtime": 31.6705, "eval_samples_per_second": 2.81, "eval_steps_per_second": 0.379, "step": 825 }, { "epoch": 5.029585798816568, "grad_norm": 0.4454633593559265, "learning_rate": 3.0030030030030033e-05, "loss": 0.0776, "step": 850 }, { "epoch": 5.029585798816568, "eval_loss": 1.0772627592086792, "eval_runtime": 31.7163, "eval_samples_per_second": 2.806, "eval_steps_per_second": 0.378, "step": 850 }, { "epoch": 5.177514792899408, "grad_norm": 0.45951634645462036, "learning_rate": 2.502502502502503e-05, "loss": 0.0485, "step": 875 }, { "epoch": 5.177514792899408, "eval_loss": 1.190962553024292, "eval_runtime": 31.6581, "eval_samples_per_second": 2.811, "eval_steps_per_second": 0.379, "step": 875 }, { "epoch": 5.325443786982248, "grad_norm": 0.49803122878074646, "learning_rate": 2.0020020020020023e-05, "loss": 0.0481, "step": 900 }, { "epoch": 5.325443786982248, "eval_loss": 1.1799925565719604, "eval_runtime": 31.6457, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 900 }, { "epoch": 5.4733727810650885, "grad_norm": 0.48808640241622925, "learning_rate": 1.5015015015015016e-05, "loss": 0.0481, "step": 925 }, { "epoch": 5.4733727810650885, "eval_loss": 1.1915431022644043, "eval_runtime": 31.641, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 925 }, { "epoch": 5.621301775147929, "grad_norm": 0.4711610972881317, "learning_rate": 1.0010010010010011e-05, "loss": 0.0467, "step": 950 }, { "epoch": 5.621301775147929, "eval_loss": 1.1996334791183472, "eval_runtime": 31.634, "eval_samples_per_second": 2.813, "eval_steps_per_second": 0.379, "step": 950 }, { "epoch": 5.769230769230769, "grad_norm": 0.4745465815067291, "learning_rate": 5.005005005005006e-06, "loss": 0.0468, "step": 975 }, { "epoch": 5.769230769230769, "eval_loss": 1.1960943937301636, "eval_runtime": 31.6459, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 975 }, { "epoch": 5.9171597633136095, "grad_norm": 0.44335442781448364, "learning_rate": 0.0, "loss": 0.0449, "step": 1000 }, { "epoch": 5.9171597633136095, "eval_loss": 1.2003010511398315, "eval_runtime": 31.6446, "eval_samples_per_second": 2.812, "eval_steps_per_second": 0.379, "step": 1000 }, { "epoch": 5.923076923076923, "step": 1001, "total_flos": 1.7606154086724403e+17, "train_loss": 4.533969319902815e-05, "train_runtime": 4.4394, "train_samples_per_second": 901.017, "train_steps_per_second": 225.254 } ], "logging_steps": 25, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 25, "total_flos": 1.7606154086724403e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }