{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.923076923076923,
  "eval_steps": 25,
  "global_step": 1001,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.14792899408284024,
      "grad_norm": 0.7053780555725098,
      "learning_rate": 0.0001951951951951952,
      "loss": 0.9474,
      "step": 25
    },
    {
      "epoch": 0.14792899408284024,
      "eval_loss": 0.6950487494468689,
      "eval_runtime": 31.2023,
      "eval_samples_per_second": 2.852,
      "eval_steps_per_second": 0.385,
      "step": 25
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 0.6385655403137207,
      "learning_rate": 0.0001901901901901902,
      "loss": 0.6403,
      "step": 50
    },
    {
      "epoch": 0.2958579881656805,
      "eval_loss": 0.6290514469146729,
      "eval_runtime": 31.6217,
      "eval_samples_per_second": 2.815,
      "eval_steps_per_second": 0.379,
      "step": 50
    },
    {
      "epoch": 0.4437869822485207,
      "grad_norm": 0.7015706896781921,
      "learning_rate": 0.0001851851851851852,
      "loss": 0.6184,
      "step": 75
    },
    {
      "epoch": 0.4437869822485207,
      "eval_loss": 0.6191244125366211,
      "eval_runtime": 31.7723,
      "eval_samples_per_second": 2.801,
      "eval_steps_per_second": 0.378,
      "step": 75
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.6220183372497559,
      "learning_rate": 0.00018018018018018018,
      "loss": 0.6041,
      "step": 100
    },
    {
      "epoch": 0.591715976331361,
      "eval_loss": 0.608259379863739,
      "eval_runtime": 31.7817,
      "eval_samples_per_second": 2.8,
      "eval_steps_per_second": 0.378,
      "step": 100
    },
    {
      "epoch": 0.7396449704142012,
      "grad_norm": 0.6809254884719849,
      "learning_rate": 0.0001751751751751752,
      "loss": 0.6023,
      "step": 125
    },
    {
      "epoch": 0.7396449704142012,
      "eval_loss": 0.604732871055603,
      "eval_runtime": 31.6489,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 125
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.6159196496009827,
      "learning_rate": 0.0001701701701701702,
      "loss": 0.6086,
      "step": 150
    },
    {
      "epoch": 0.8875739644970414,
      "eval_loss": 0.6013623476028442,
      "eval_runtime": 31.6492,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 150
    },
    {
      "epoch": 1.0355029585798816,
      "grad_norm": 0.5559250116348267,
      "learning_rate": 0.00016516516516516518,
      "loss": 0.5545,
      "step": 175
    },
    {
      "epoch": 1.0355029585798816,
      "eval_loss": 0.6374889612197876,
      "eval_runtime": 31.6397,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 175
    },
    {
      "epoch": 1.183431952662722,
      "grad_norm": 0.5993044972419739,
      "learning_rate": 0.00016016016016016018,
      "loss": 0.4278,
      "step": 200
    },
    {
      "epoch": 1.183431952662722,
      "eval_loss": 0.6234655380249023,
      "eval_runtime": 31.6312,
      "eval_samples_per_second": 2.814,
      "eval_steps_per_second": 0.379,
      "step": 200
    },
    {
      "epoch": 1.331360946745562,
      "grad_norm": 0.689406156539917,
      "learning_rate": 0.00015515515515515516,
      "loss": 0.4297,
      "step": 225
    },
    {
      "epoch": 1.331360946745562,
      "eval_loss": 0.6275980472564697,
      "eval_runtime": 31.6369,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 225
    },
    {
      "epoch": 1.4792899408284024,
      "grad_norm": 0.7030369639396667,
      "learning_rate": 0.00015015015015015014,
      "loss": 0.4172,
      "step": 250
    },
    {
      "epoch": 1.4792899408284024,
      "eval_loss": 0.6314178705215454,
      "eval_runtime": 31.6517,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 250
    },
    {
      "epoch": 1.6272189349112427,
      "grad_norm": 0.7457050085067749,
      "learning_rate": 0.00014514514514514515,
      "loss": 0.4273,
      "step": 275
    },
    {
      "epoch": 1.6272189349112427,
      "eval_loss": 0.6374988555908203,
      "eval_runtime": 31.6395,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 275
    },
    {
      "epoch": 1.7751479289940828,
      "grad_norm": 0.6606324315071106,
      "learning_rate": 0.00014014014014014013,
      "loss": 0.4264,
      "step": 300
    },
    {
      "epoch": 1.7751479289940828,
      "eval_loss": 0.6344882845878601,
      "eval_runtime": 31.6619,
      "eval_samples_per_second": 2.811,
      "eval_steps_per_second": 0.379,
      "step": 300
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.675614058971405,
      "learning_rate": 0.00013513513513513514,
      "loss": 0.428,
      "step": 325
    },
    {
      "epoch": 1.9230769230769231,
      "eval_loss": 0.6296113133430481,
      "eval_runtime": 31.6627,
      "eval_samples_per_second": 2.811,
      "eval_steps_per_second": 0.379,
      "step": 325
    },
    {
      "epoch": 2.0710059171597632,
      "grad_norm": 0.7311059832572937,
      "learning_rate": 0.00013013013013013014,
      "loss": 0.3524,
      "step": 350
    },
    {
      "epoch": 2.0710059171597632,
      "eval_loss": 0.7067192792892456,
      "eval_runtime": 31.651,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 350
    },
    {
      "epoch": 2.2189349112426036,
      "grad_norm": 0.7563914060592651,
      "learning_rate": 0.00012512512512512512,
      "loss": 0.2697,
      "step": 375
    },
    {
      "epoch": 2.2189349112426036,
      "eval_loss": 0.7313967943191528,
      "eval_runtime": 31.645,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 375
    },
    {
      "epoch": 2.366863905325444,
      "grad_norm": 0.6278096437454224,
      "learning_rate": 0.00012012012012012013,
      "loss": 0.2645,
      "step": 400
    },
    {
      "epoch": 2.366863905325444,
      "eval_loss": 0.725497841835022,
      "eval_runtime": 31.6442,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 400
    },
    {
      "epoch": 2.5147928994082838,
      "grad_norm": 0.782738447189331,
      "learning_rate": 0.00011511511511511512,
      "loss": 0.2778,
      "step": 425
    },
    {
      "epoch": 2.5147928994082838,
      "eval_loss": 0.7220944166183472,
      "eval_runtime": 31.6506,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 425
    },
    {
      "epoch": 2.662721893491124,
      "grad_norm": 0.7897526025772095,
      "learning_rate": 0.00011011011011011012,
      "loss": 0.2687,
      "step": 450
    },
    {
      "epoch": 2.662721893491124,
      "eval_loss": 0.7373032569885254,
      "eval_runtime": 31.668,
      "eval_samples_per_second": 2.81,
      "eval_steps_per_second": 0.379,
      "step": 450
    },
    {
      "epoch": 2.8106508875739644,
      "grad_norm": 0.8417075276374817,
      "learning_rate": 0.00010510510510510511,
      "loss": 0.2748,
      "step": 475
    },
    {
      "epoch": 2.8106508875739644,
      "eval_loss": 0.7250338196754456,
      "eval_runtime": 31.7591,
      "eval_samples_per_second": 2.802,
      "eval_steps_per_second": 0.378,
      "step": 475
    },
    {
      "epoch": 2.9585798816568047,
      "grad_norm": 0.672287106513977,
      "learning_rate": 0.00010010010010010012,
      "loss": 0.2737,
      "step": 500
    },
    {
      "epoch": 2.9585798816568047,
      "eval_loss": 0.7213594913482666,
      "eval_runtime": 31.6416,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 500
    },
    {
      "epoch": 3.106508875739645,
      "grad_norm": 0.658898115158081,
      "learning_rate": 9.50950950950951e-05,
      "loss": 0.1848,
      "step": 525
    },
    {
      "epoch": 3.106508875739645,
      "eval_loss": 0.8414345979690552,
      "eval_runtime": 31.6513,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 525
    },
    {
      "epoch": 3.2544378698224854,
      "grad_norm": 0.6711578369140625,
      "learning_rate": 9.009009009009009e-05,
      "loss": 0.1535,
      "step": 550
    },
    {
      "epoch": 3.2544378698224854,
      "eval_loss": 0.8437659740447998,
      "eval_runtime": 31.6588,
      "eval_samples_per_second": 2.811,
      "eval_steps_per_second": 0.379,
      "step": 550
    },
    {
      "epoch": 3.4023668639053253,
      "grad_norm": 0.6709449887275696,
      "learning_rate": 8.50850850850851e-05,
      "loss": 0.1565,
      "step": 575
    },
    {
      "epoch": 3.4023668639053253,
      "eval_loss": 0.847898006439209,
      "eval_runtime": 31.6337,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 575
    },
    {
      "epoch": 3.5502958579881656,
      "grad_norm": 0.7851375937461853,
      "learning_rate": 8.008008008008009e-05,
      "loss": 0.1583,
      "step": 600
    },
    {
      "epoch": 3.5502958579881656,
      "eval_loss": 0.8719269633293152,
      "eval_runtime": 31.6367,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 600
    },
    {
      "epoch": 3.698224852071006,
      "grad_norm": 0.7410476207733154,
      "learning_rate": 7.507507507507507e-05,
      "loss": 0.1537,
      "step": 625
    },
    {
      "epoch": 3.698224852071006,
      "eval_loss": 0.8496631383895874,
      "eval_runtime": 31.6614,
      "eval_samples_per_second": 2.811,
      "eval_steps_per_second": 0.379,
      "step": 625
    },
    {
      "epoch": 3.8461538461538463,
      "grad_norm": 0.7157964110374451,
      "learning_rate": 7.007007007007007e-05,
      "loss": 0.1611,
      "step": 650
    },
    {
      "epoch": 3.8461538461538463,
      "eval_loss": 0.8586809039115906,
      "eval_runtime": 31.6397,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 650
    },
    {
      "epoch": 3.994082840236686,
      "grad_norm": 0.7757616639137268,
      "learning_rate": 6.506506506506507e-05,
      "loss": 0.16,
      "step": 675
    },
    {
      "epoch": 3.994082840236686,
      "eval_loss": 0.8713619112968445,
      "eval_runtime": 31.6525,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 675
    },
    {
      "epoch": 4.1420118343195265,
      "grad_norm": 0.5490134358406067,
      "learning_rate": 6.0060060060060066e-05,
      "loss": 0.0913,
      "step": 700
    },
    {
      "epoch": 4.1420118343195265,
      "eval_loss": 1.0193753242492676,
      "eval_runtime": 31.652,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 700
    },
    {
      "epoch": 4.289940828402367,
      "grad_norm": 0.667753279209137,
      "learning_rate": 5.505505505505506e-05,
      "loss": 0.0841,
      "step": 725
    },
    {
      "epoch": 4.289940828402367,
      "eval_loss": 1.0428720712661743,
      "eval_runtime": 31.6444,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 725
    },
    {
      "epoch": 4.437869822485207,
      "grad_norm": 0.5531997084617615,
      "learning_rate": 5.005005005005006e-05,
      "loss": 0.0836,
      "step": 750
    },
    {
      "epoch": 4.437869822485207,
      "eval_loss": 1.0543982982635498,
      "eval_runtime": 31.6468,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 750
    },
    {
      "epoch": 4.585798816568047,
      "grad_norm": 0.6152017712593079,
      "learning_rate": 4.5045045045045046e-05,
      "loss": 0.0842,
      "step": 775
    },
    {
      "epoch": 4.585798816568047,
      "eval_loss": 1.0431654453277588,
      "eval_runtime": 31.6364,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 775
    },
    {
      "epoch": 4.733727810650888,
      "grad_norm": 0.5139034986495972,
      "learning_rate": 4.0040040040040046e-05,
      "loss": 0.081,
      "step": 800
    },
    {
      "epoch": 4.733727810650888,
      "eval_loss": 1.0386168956756592,
      "eval_runtime": 31.7013,
      "eval_samples_per_second": 2.807,
      "eval_steps_per_second": 0.379,
      "step": 800
    },
    {
      "epoch": 4.881656804733728,
      "grad_norm": 0.7744113206863403,
      "learning_rate": 3.503503503503503e-05,
      "loss": 0.0848,
      "step": 825
    },
    {
      "epoch": 4.881656804733728,
      "eval_loss": 1.0704792737960815,
      "eval_runtime": 31.6705,
      "eval_samples_per_second": 2.81,
      "eval_steps_per_second": 0.379,
      "step": 825
    },
    {
      "epoch": 5.029585798816568,
      "grad_norm": 0.4454633593559265,
      "learning_rate": 3.0030030030030033e-05,
      "loss": 0.0776,
      "step": 850
    },
    {
      "epoch": 5.029585798816568,
      "eval_loss": 1.0772627592086792,
      "eval_runtime": 31.7163,
      "eval_samples_per_second": 2.806,
      "eval_steps_per_second": 0.378,
      "step": 850
    },
    {
      "epoch": 5.177514792899408,
      "grad_norm": 0.45951634645462036,
      "learning_rate": 2.502502502502503e-05,
      "loss": 0.0485,
      "step": 875
    },
    {
      "epoch": 5.177514792899408,
      "eval_loss": 1.190962553024292,
      "eval_runtime": 31.6581,
      "eval_samples_per_second": 2.811,
      "eval_steps_per_second": 0.379,
      "step": 875
    },
    {
      "epoch": 5.325443786982248,
      "grad_norm": 0.49803122878074646,
      "learning_rate": 2.0020020020020023e-05,
      "loss": 0.0481,
      "step": 900
    },
    {
      "epoch": 5.325443786982248,
      "eval_loss": 1.1799925565719604,
      "eval_runtime": 31.6457,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 900
    },
    {
      "epoch": 5.4733727810650885,
      "grad_norm": 0.48808640241622925,
      "learning_rate": 1.5015015015015016e-05,
      "loss": 0.0481,
      "step": 925
    },
    {
      "epoch": 5.4733727810650885,
      "eval_loss": 1.1915431022644043,
      "eval_runtime": 31.641,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 925
    },
    {
      "epoch": 5.621301775147929,
      "grad_norm": 0.4711610972881317,
      "learning_rate": 1.0010010010010011e-05,
      "loss": 0.0467,
      "step": 950
    },
    {
      "epoch": 5.621301775147929,
      "eval_loss": 1.1996334791183472,
      "eval_runtime": 31.634,
      "eval_samples_per_second": 2.813,
      "eval_steps_per_second": 0.379,
      "step": 950
    },
    {
      "epoch": 5.769230769230769,
      "grad_norm": 0.4745465815067291,
      "learning_rate": 5.005005005005006e-06,
      "loss": 0.0468,
      "step": 975
    },
    {
      "epoch": 5.769230769230769,
      "eval_loss": 1.1960943937301636,
      "eval_runtime": 31.6459,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 975
    },
    {
      "epoch": 5.9171597633136095,
      "grad_norm": 0.44335442781448364,
      "learning_rate": 0.0,
      "loss": 0.0449,
      "step": 1000
    },
    {
      "epoch": 5.9171597633136095,
      "eval_loss": 1.2003010511398315,
      "eval_runtime": 31.6446,
      "eval_samples_per_second": 2.812,
      "eval_steps_per_second": 0.379,
      "step": 1000
    },
    {
      "epoch": 5.923076923076923,
      "step": 1001,
      "total_flos": 1.7606154086724403e+17,
      "train_loss": 4.533969319902815e-05,
      "train_runtime": 4.4394,
      "train_samples_per_second": 901.017,
      "train_steps_per_second": 225.254
    }
  ],
  "logging_steps": 25,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 25,
  "total_flos": 1.7606154086724403e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}