| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 5.923076923076923, |
| "eval_steps": 25, |
| "global_step": 1001, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.14792899408284024, |
| "grad_norm": 0.7053780555725098, |
| "learning_rate": 0.0001951951951951952, |
| "loss": 0.9474, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.14792899408284024, |
| "eval_loss": 0.6950487494468689, |
| "eval_runtime": 31.2023, |
| "eval_samples_per_second": 2.852, |
| "eval_steps_per_second": 0.385, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.2958579881656805, |
| "grad_norm": 0.6385655403137207, |
| "learning_rate": 0.0001901901901901902, |
| "loss": 0.6403, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.2958579881656805, |
| "eval_loss": 0.6290514469146729, |
| "eval_runtime": 31.6217, |
| "eval_samples_per_second": 2.815, |
| "eval_steps_per_second": 0.379, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.4437869822485207, |
| "grad_norm": 0.7015706896781921, |
| "learning_rate": 0.0001851851851851852, |
| "loss": 0.6184, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.4437869822485207, |
| "eval_loss": 0.6191244125366211, |
| "eval_runtime": 31.7723, |
| "eval_samples_per_second": 2.801, |
| "eval_steps_per_second": 0.378, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.591715976331361, |
| "grad_norm": 0.6220183372497559, |
| "learning_rate": 0.00018018018018018018, |
| "loss": 0.6041, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.591715976331361, |
| "eval_loss": 0.608259379863739, |
| "eval_runtime": 31.7817, |
| "eval_samples_per_second": 2.8, |
| "eval_steps_per_second": 0.378, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.7396449704142012, |
| "grad_norm": 0.6809254884719849, |
| "learning_rate": 0.0001751751751751752, |
| "loss": 0.6023, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.7396449704142012, |
| "eval_loss": 0.604732871055603, |
| "eval_runtime": 31.6489, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.8875739644970414, |
| "grad_norm": 0.6159196496009827, |
| "learning_rate": 0.0001701701701701702, |
| "loss": 0.6086, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.8875739644970414, |
| "eval_loss": 0.6013623476028442, |
| "eval_runtime": 31.6492, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 150 |
| }, |
| { |
| "epoch": 1.0355029585798816, |
| "grad_norm": 0.5559250116348267, |
| "learning_rate": 0.00016516516516516518, |
| "loss": 0.5545, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.0355029585798816, |
| "eval_loss": 0.6374889612197876, |
| "eval_runtime": 31.6397, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.183431952662722, |
| "grad_norm": 0.5993044972419739, |
| "learning_rate": 0.00016016016016016018, |
| "loss": 0.4278, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.183431952662722, |
| "eval_loss": 0.6234655380249023, |
| "eval_runtime": 31.6312, |
| "eval_samples_per_second": 2.814, |
| "eval_steps_per_second": 0.379, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.331360946745562, |
| "grad_norm": 0.689406156539917, |
| "learning_rate": 0.00015515515515515516, |
| "loss": 0.4297, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.331360946745562, |
| "eval_loss": 0.6275980472564697, |
| "eval_runtime": 31.6369, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.4792899408284024, |
| "grad_norm": 0.7030369639396667, |
| "learning_rate": 0.00015015015015015014, |
| "loss": 0.4172, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.4792899408284024, |
| "eval_loss": 0.6314178705215454, |
| "eval_runtime": 31.6517, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.6272189349112427, |
| "grad_norm": 0.7457050085067749, |
| "learning_rate": 0.00014514514514514515, |
| "loss": 0.4273, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.6272189349112427, |
| "eval_loss": 0.6374988555908203, |
| "eval_runtime": 31.6395, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.7751479289940828, |
| "grad_norm": 0.6606324315071106, |
| "learning_rate": 0.00014014014014014013, |
| "loss": 0.4264, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.7751479289940828, |
| "eval_loss": 0.6344882845878601, |
| "eval_runtime": 31.6619, |
| "eval_samples_per_second": 2.811, |
| "eval_steps_per_second": 0.379, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "grad_norm": 0.675614058971405, |
| "learning_rate": 0.00013513513513513514, |
| "loss": 0.428, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.9230769230769231, |
| "eval_loss": 0.6296113133430481, |
| "eval_runtime": 31.6627, |
| "eval_samples_per_second": 2.811, |
| "eval_steps_per_second": 0.379, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.0710059171597632, |
| "grad_norm": 0.7311059832572937, |
| "learning_rate": 0.00013013013013013014, |
| "loss": 0.3524, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.0710059171597632, |
| "eval_loss": 0.7067192792892456, |
| "eval_runtime": 31.651, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.2189349112426036, |
| "grad_norm": 0.7563914060592651, |
| "learning_rate": 0.00012512512512512512, |
| "loss": 0.2697, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.2189349112426036, |
| "eval_loss": 0.7313967943191528, |
| "eval_runtime": 31.645, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.366863905325444, |
| "grad_norm": 0.6278096437454224, |
| "learning_rate": 0.00012012012012012013, |
| "loss": 0.2645, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.366863905325444, |
| "eval_loss": 0.725497841835022, |
| "eval_runtime": 31.6442, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.5147928994082838, |
| "grad_norm": 0.782738447189331, |
| "learning_rate": 0.00011511511511511512, |
| "loss": 0.2778, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.5147928994082838, |
| "eval_loss": 0.7220944166183472, |
| "eval_runtime": 31.6506, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.662721893491124, |
| "grad_norm": 0.7897526025772095, |
| "learning_rate": 0.00011011011011011012, |
| "loss": 0.2687, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.662721893491124, |
| "eval_loss": 0.7373032569885254, |
| "eval_runtime": 31.668, |
| "eval_samples_per_second": 2.81, |
| "eval_steps_per_second": 0.379, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.8106508875739644, |
| "grad_norm": 0.8417075276374817, |
| "learning_rate": 0.00010510510510510511, |
| "loss": 0.2748, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.8106508875739644, |
| "eval_loss": 0.7250338196754456, |
| "eval_runtime": 31.7591, |
| "eval_samples_per_second": 2.802, |
| "eval_steps_per_second": 0.378, |
| "step": 475 |
| }, |
| { |
| "epoch": 2.9585798816568047, |
| "grad_norm": 0.672287106513977, |
| "learning_rate": 0.00010010010010010012, |
| "loss": 0.2737, |
| "step": 500 |
| }, |
| { |
| "epoch": 2.9585798816568047, |
| "eval_loss": 0.7213594913482666, |
| "eval_runtime": 31.6416, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.106508875739645, |
| "grad_norm": 0.658898115158081, |
| "learning_rate": 9.50950950950951e-05, |
| "loss": 0.1848, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.106508875739645, |
| "eval_loss": 0.8414345979690552, |
| "eval_runtime": 31.6513, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.2544378698224854, |
| "grad_norm": 0.6711578369140625, |
| "learning_rate": 9.009009009009009e-05, |
| "loss": 0.1535, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.2544378698224854, |
| "eval_loss": 0.8437659740447998, |
| "eval_runtime": 31.6588, |
| "eval_samples_per_second": 2.811, |
| "eval_steps_per_second": 0.379, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.4023668639053253, |
| "grad_norm": 0.6709449887275696, |
| "learning_rate": 8.50850850850851e-05, |
| "loss": 0.1565, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.4023668639053253, |
| "eval_loss": 0.847898006439209, |
| "eval_runtime": 31.6337, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.5502958579881656, |
| "grad_norm": 0.7851375937461853, |
| "learning_rate": 8.008008008008009e-05, |
| "loss": 0.1583, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.5502958579881656, |
| "eval_loss": 0.8719269633293152, |
| "eval_runtime": 31.6367, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.698224852071006, |
| "grad_norm": 0.7410476207733154, |
| "learning_rate": 7.507507507507507e-05, |
| "loss": 0.1537, |
| "step": 625 |
| }, |
| { |
| "epoch": 3.698224852071006, |
| "eval_loss": 0.8496631383895874, |
| "eval_runtime": 31.6614, |
| "eval_samples_per_second": 2.811, |
| "eval_steps_per_second": 0.379, |
| "step": 625 |
| }, |
| { |
| "epoch": 3.8461538461538463, |
| "grad_norm": 0.7157964110374451, |
| "learning_rate": 7.007007007007007e-05, |
| "loss": 0.1611, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.8461538461538463, |
| "eval_loss": 0.8586809039115906, |
| "eval_runtime": 31.6397, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 650 |
| }, |
| { |
| "epoch": 3.994082840236686, |
| "grad_norm": 0.7757616639137268, |
| "learning_rate": 6.506506506506507e-05, |
| "loss": 0.16, |
| "step": 675 |
| }, |
| { |
| "epoch": 3.994082840236686, |
| "eval_loss": 0.8713619112968445, |
| "eval_runtime": 31.6525, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 675 |
| }, |
| { |
| "epoch": 4.1420118343195265, |
| "grad_norm": 0.5490134358406067, |
| "learning_rate": 6.0060060060060066e-05, |
| "loss": 0.0913, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.1420118343195265, |
| "eval_loss": 1.0193753242492676, |
| "eval_runtime": 31.652, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.289940828402367, |
| "grad_norm": 0.667753279209137, |
| "learning_rate": 5.505505505505506e-05, |
| "loss": 0.0841, |
| "step": 725 |
| }, |
| { |
| "epoch": 4.289940828402367, |
| "eval_loss": 1.0428720712661743, |
| "eval_runtime": 31.6444, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 725 |
| }, |
| { |
| "epoch": 4.437869822485207, |
| "grad_norm": 0.5531997084617615, |
| "learning_rate": 5.005005005005006e-05, |
| "loss": 0.0836, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.437869822485207, |
| "eval_loss": 1.0543982982635498, |
| "eval_runtime": 31.6468, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.585798816568047, |
| "grad_norm": 0.6152017712593079, |
| "learning_rate": 4.5045045045045046e-05, |
| "loss": 0.0842, |
| "step": 775 |
| }, |
| { |
| "epoch": 4.585798816568047, |
| "eval_loss": 1.0431654453277588, |
| "eval_runtime": 31.6364, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 775 |
| }, |
| { |
| "epoch": 4.733727810650888, |
| "grad_norm": 0.5139034986495972, |
| "learning_rate": 4.0040040040040046e-05, |
| "loss": 0.081, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.733727810650888, |
| "eval_loss": 1.0386168956756592, |
| "eval_runtime": 31.7013, |
| "eval_samples_per_second": 2.807, |
| "eval_steps_per_second": 0.379, |
| "step": 800 |
| }, |
| { |
| "epoch": 4.881656804733728, |
| "grad_norm": 0.7744113206863403, |
| "learning_rate": 3.503503503503503e-05, |
| "loss": 0.0848, |
| "step": 825 |
| }, |
| { |
| "epoch": 4.881656804733728, |
| "eval_loss": 1.0704792737960815, |
| "eval_runtime": 31.6705, |
| "eval_samples_per_second": 2.81, |
| "eval_steps_per_second": 0.379, |
| "step": 825 |
| }, |
| { |
| "epoch": 5.029585798816568, |
| "grad_norm": 0.4454633593559265, |
| "learning_rate": 3.0030030030030033e-05, |
| "loss": 0.0776, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.029585798816568, |
| "eval_loss": 1.0772627592086792, |
| "eval_runtime": 31.7163, |
| "eval_samples_per_second": 2.806, |
| "eval_steps_per_second": 0.378, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.177514792899408, |
| "grad_norm": 0.45951634645462036, |
| "learning_rate": 2.502502502502503e-05, |
| "loss": 0.0485, |
| "step": 875 |
| }, |
| { |
| "epoch": 5.177514792899408, |
| "eval_loss": 1.190962553024292, |
| "eval_runtime": 31.6581, |
| "eval_samples_per_second": 2.811, |
| "eval_steps_per_second": 0.379, |
| "step": 875 |
| }, |
| { |
| "epoch": 5.325443786982248, |
| "grad_norm": 0.49803122878074646, |
| "learning_rate": 2.0020020020020023e-05, |
| "loss": 0.0481, |
| "step": 900 |
| }, |
| { |
| "epoch": 5.325443786982248, |
| "eval_loss": 1.1799925565719604, |
| "eval_runtime": 31.6457, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 900 |
| }, |
| { |
| "epoch": 5.4733727810650885, |
| "grad_norm": 0.48808640241622925, |
| "learning_rate": 1.5015015015015016e-05, |
| "loss": 0.0481, |
| "step": 925 |
| }, |
| { |
| "epoch": 5.4733727810650885, |
| "eval_loss": 1.1915431022644043, |
| "eval_runtime": 31.641, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 925 |
| }, |
| { |
| "epoch": 5.621301775147929, |
| "grad_norm": 0.4711610972881317, |
| "learning_rate": 1.0010010010010011e-05, |
| "loss": 0.0467, |
| "step": 950 |
| }, |
| { |
| "epoch": 5.621301775147929, |
| "eval_loss": 1.1996334791183472, |
| "eval_runtime": 31.634, |
| "eval_samples_per_second": 2.813, |
| "eval_steps_per_second": 0.379, |
| "step": 950 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "grad_norm": 0.4745465815067291, |
| "learning_rate": 5.005005005005006e-06, |
| "loss": 0.0468, |
| "step": 975 |
| }, |
| { |
| "epoch": 5.769230769230769, |
| "eval_loss": 1.1960943937301636, |
| "eval_runtime": 31.6459, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 975 |
| }, |
| { |
| "epoch": 5.9171597633136095, |
| "grad_norm": 0.44335442781448364, |
| "learning_rate": 0.0, |
| "loss": 0.0449, |
| "step": 1000 |
| }, |
| { |
| "epoch": 5.9171597633136095, |
| "eval_loss": 1.2003010511398315, |
| "eval_runtime": 31.6446, |
| "eval_samples_per_second": 2.812, |
| "eval_steps_per_second": 0.379, |
| "step": 1000 |
| }, |
| { |
| "epoch": 5.923076923076923, |
| "step": 1001, |
| "total_flos": 1.7606154086724403e+17, |
| "train_loss": 4.533969319902815e-05, |
| "train_runtime": 4.6313, |
| "train_samples_per_second": 863.692, |
| "train_steps_per_second": 215.923 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 25, |
| "total_flos": 1.7606154086724403e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|