{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.9171597633136095,
  "eval_steps": 25,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.14792899408284024,
      "grad_norm": 0.83642578125,
      "learning_rate": 0.0001951951951951952,
      "loss": 0.9391,
      "step": 25
    },
    {
      "epoch": 0.14792899408284024,
      "eval_loss": 0.6647412776947021,
      "eval_runtime": 5.4263,
      "eval_samples_per_second": 16.402,
      "eval_steps_per_second": 2.211,
      "step": 25
    },
    {
      "epoch": 0.2958579881656805,
      "grad_norm": 0.64208984375,
      "learning_rate": 0.0001901901901901902,
      "loss": 0.6132,
      "step": 50
    },
    {
      "epoch": 0.2958579881656805,
      "eval_loss": 0.6130561232566833,
      "eval_runtime": 5.4068,
      "eval_samples_per_second": 16.461,
      "eval_steps_per_second": 2.219,
      "step": 50
    },
    {
      "epoch": 0.4437869822485207,
      "grad_norm": 0.7255859375,
      "learning_rate": 0.0001851851851851852,
      "loss": 0.6037,
      "step": 75
    },
    {
      "epoch": 0.4437869822485207,
      "eval_loss": 0.605912983417511,
      "eval_runtime": 5.4769,
      "eval_samples_per_second": 16.25,
      "eval_steps_per_second": 2.191,
      "step": 75
    },
    {
      "epoch": 0.591715976331361,
      "grad_norm": 0.6396484375,
      "learning_rate": 0.00018018018018018018,
      "loss": 0.5924,
      "step": 100
    },
    {
      "epoch": 0.591715976331361,
      "eval_loss": 0.5997055768966675,
      "eval_runtime": 5.4232,
      "eval_samples_per_second": 16.411,
      "eval_steps_per_second": 2.213,
      "step": 100
    },
    {
      "epoch": 0.7396449704142012,
      "grad_norm": 0.6826171875,
      "learning_rate": 0.0001751751751751752,
      "loss": 0.5962,
      "step": 125
    },
    {
      "epoch": 0.7396449704142012,
      "eval_loss": 0.5943592190742493,
      "eval_runtime": 5.4562,
      "eval_samples_per_second": 16.312,
      "eval_steps_per_second": 2.199,
      "step": 125
    },
    {
      "epoch": 0.8875739644970414,
      "grad_norm": 0.65771484375,
      "learning_rate": 0.0001701701701701702,
      "loss": 0.6017,
      "step": 150
    },
    {
      "epoch": 0.8875739644970414,
      "eval_loss": 0.5939959287643433,
      "eval_runtime": 5.4411,
      "eval_samples_per_second": 16.357,
      "eval_steps_per_second": 2.205,
      "step": 150
    },
    {
      "epoch": 1.0355029585798816,
      "grad_norm": 0.5908203125,
      "learning_rate": 0.00016516516516516518,
      "loss": 0.546,
      "step": 175
    },
    {
      "epoch": 1.0355029585798816,
      "eval_loss": 0.6260668039321899,
      "eval_runtime": 5.4895,
      "eval_samples_per_second": 16.213,
      "eval_steps_per_second": 2.186,
      "step": 175
    },
    {
      "epoch": 1.183431952662722,
      "grad_norm": 0.638671875,
      "learning_rate": 0.00016016016016016018,
      "loss": 0.4233,
      "step": 200
    },
    {
      "epoch": 1.183431952662722,
      "eval_loss": 0.6178574562072754,
      "eval_runtime": 5.5065,
      "eval_samples_per_second": 16.163,
      "eval_steps_per_second": 2.179,
      "step": 200
    },
    {
      "epoch": 1.331360946745562,
      "grad_norm": 0.70947265625,
      "learning_rate": 0.00015515515515515516,
      "loss": 0.4295,
      "step": 225
    },
    {
      "epoch": 1.331360946745562,
      "eval_loss": 0.6140432953834534,
      "eval_runtime": 5.468,
      "eval_samples_per_second": 16.276,
      "eval_steps_per_second": 2.195,
      "step": 225
    },
    {
      "epoch": 1.4792899408284024,
      "grad_norm": 0.685546875,
      "learning_rate": 0.00015015015015015014,
      "loss": 0.4165,
      "step": 250
    },
    {
      "epoch": 1.4792899408284024,
      "eval_loss": 0.6192346811294556,
      "eval_runtime": 5.4874,
      "eval_samples_per_second": 16.219,
      "eval_steps_per_second": 2.187,
      "step": 250
    },
    {
      "epoch": 1.6272189349112427,
      "grad_norm": 0.82958984375,
      "learning_rate": 0.00014514514514514515,
      "loss": 0.428,
      "step": 275
    },
    {
      "epoch": 1.6272189349112427,
      "eval_loss": 0.6221323609352112,
      "eval_runtime": 5.4947,
      "eval_samples_per_second": 16.197,
      "eval_steps_per_second": 2.184,
      "step": 275
    },
    {
      "epoch": 1.7751479289940828,
      "grad_norm": 0.6923828125,
      "learning_rate": 0.00014014014014014013,
      "loss": 0.4256,
      "step": 300
    },
    {
      "epoch": 1.7751479289940828,
      "eval_loss": 0.6257030367851257,
      "eval_runtime": 5.4444,
      "eval_samples_per_second": 16.347,
      "eval_steps_per_second": 2.204,
      "step": 300
    },
    {
      "epoch": 1.9230769230769231,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00013513513513513514,
      "loss": 0.4254,
      "step": 325
    },
    {
      "epoch": 1.9230769230769231,
      "eval_loss": 0.6234580874443054,
      "eval_runtime": 5.4501,
      "eval_samples_per_second": 16.33,
      "eval_steps_per_second": 2.202,
      "step": 325
    },
    {
      "epoch": 2.0710059171597632,
      "grad_norm": 0.7587890625,
      "learning_rate": 0.00013013013013013014,
      "loss": 0.3561,
      "step": 350
    },
    {
      "epoch": 2.0710059171597632,
      "eval_loss": 0.6810835599899292,
      "eval_runtime": 5.4991,
      "eval_samples_per_second": 16.184,
      "eval_steps_per_second": 2.182,
      "step": 350
    },
    {
      "epoch": 2.2189349112426036,
      "grad_norm": 0.72998046875,
      "learning_rate": 0.00012512512512512512,
      "loss": 0.274,
      "step": 375
    },
    {
      "epoch": 2.2189349112426036,
      "eval_loss": 0.7050045132637024,
      "eval_runtime": 5.4732,
      "eval_samples_per_second": 16.261,
      "eval_steps_per_second": 2.193,
      "step": 375
    },
    {
      "epoch": 2.366863905325444,
      "grad_norm": 0.67041015625,
      "learning_rate": 0.00012012012012012013,
      "loss": 0.2681,
      "step": 400
    },
    {
      "epoch": 2.366863905325444,
      "eval_loss": 0.7119271159172058,
      "eval_runtime": 5.4471,
      "eval_samples_per_second": 16.339,
      "eval_steps_per_second": 2.203,
      "step": 400
    },
    {
      "epoch": 2.5147928994082838,
      "grad_norm": 0.841796875,
      "learning_rate": 0.00011511511511511512,
      "loss": 0.2792,
      "step": 425
    },
    {
      "epoch": 2.5147928994082838,
      "eval_loss": 0.7170438170433044,
      "eval_runtime": 5.4708,
      "eval_samples_per_second": 16.268,
      "eval_steps_per_second": 2.193,
      "step": 425
    },
    {
      "epoch": 2.662721893491124,
      "grad_norm": 0.75341796875,
      "learning_rate": 0.00011011011011011012,
      "loss": 0.2738,
      "step": 450
    },
    {
      "epoch": 2.662721893491124,
      "eval_loss": 0.7100588083267212,
      "eval_runtime": 5.4533,
      "eval_samples_per_second": 16.32,
      "eval_steps_per_second": 2.201,
      "step": 450
    },
    {
      "epoch": 2.8106508875739644,
      "grad_norm": 0.8544921875,
      "learning_rate": 0.00010510510510510511,
      "loss": 0.2787,
      "step": 475
    },
    {
      "epoch": 2.8106508875739644,
      "eval_loss": 0.7120051383972168,
      "eval_runtime": 5.4731,
      "eval_samples_per_second": 16.261,
      "eval_steps_per_second": 2.193,
      "step": 475
    },
    {
      "epoch": 2.9585798816568047,
      "grad_norm": 0.6982421875,
      "learning_rate": 0.00010010010010010012,
      "loss": 0.2794,
      "step": 500
    },
    {
      "epoch": 2.9585798816568047,
      "eval_loss": 0.7013982534408569,
      "eval_runtime": 5.4689,
      "eval_samples_per_second": 16.274,
      "eval_steps_per_second": 2.194,
      "step": 500
    },
    {
      "epoch": 3.106508875739645,
      "grad_norm": 0.61669921875,
      "learning_rate": 9.50950950950951e-05,
      "loss": 0.1912,
      "step": 525
    },
    {
      "epoch": 3.106508875739645,
      "eval_loss": 0.8123311400413513,
      "eval_runtime": 5.4645,
      "eval_samples_per_second": 16.287,
      "eval_steps_per_second": 2.196,
      "step": 525
    },
    {
      "epoch": 3.2544378698224854,
      "grad_norm": 0.72412109375,
      "learning_rate": 9.009009009009009e-05,
      "loss": 0.1592,
      "step": 550
    },
    {
      "epoch": 3.2544378698224854,
      "eval_loss": 0.8475367426872253,
      "eval_runtime": 5.4516,
      "eval_samples_per_second": 16.325,
      "eval_steps_per_second": 2.201,
      "step": 550
    },
    {
      "epoch": 3.4023668639053253,
      "grad_norm": 0.75341796875,
      "learning_rate": 8.50850850850851e-05,
      "loss": 0.1614,
      "step": 575
    },
    {
      "epoch": 3.4023668639053253,
      "eval_loss": 0.8443621397018433,
      "eval_runtime": 5.4523,
      "eval_samples_per_second": 16.323,
      "eval_steps_per_second": 2.201,
      "step": 575
    },
    {
      "epoch": 3.5502958579881656,
      "grad_norm": 0.7294921875,
      "learning_rate": 8.008008008008009e-05,
      "loss": 0.1628,
      "step": 600
    },
    {
      "epoch": 3.5502958579881656,
      "eval_loss": 0.8429920077323914,
      "eval_runtime": 5.4655,
      "eval_samples_per_second": 16.284,
      "eval_steps_per_second": 2.196,
      "step": 600
    },
    {
      "epoch": 3.698224852071006,
      "grad_norm": 0.90869140625,
      "learning_rate": 7.507507507507507e-05,
      "loss": 0.1599,
      "step": 625
    },
    {
      "epoch": 3.698224852071006,
      "eval_loss": 0.8433002829551697,
      "eval_runtime": 5.4723,
      "eval_samples_per_second": 16.264,
      "eval_steps_per_second": 2.193,
      "step": 625
    },
    {
      "epoch": 3.8461538461538463,
      "grad_norm": 0.9658203125,
      "learning_rate": 7.007007007007007e-05,
      "loss": 0.1667,
      "step": 650
    },
    {
      "epoch": 3.8461538461538463,
      "eval_loss": 0.8468146324157715,
      "eval_runtime": 5.4573,
      "eval_samples_per_second": 16.308,
      "eval_steps_per_second": 2.199,
      "step": 650
    },
    {
      "epoch": 3.994082840236686,
      "grad_norm": 0.806640625,
      "learning_rate": 6.506506506506507e-05,
      "loss": 0.1643,
      "step": 675
    },
    {
      "epoch": 3.994082840236686,
      "eval_loss": 0.858656644821167,
      "eval_runtime": 5.4565,
      "eval_samples_per_second": 16.311,
      "eval_steps_per_second": 2.199,
      "step": 675
    },
    {
      "epoch": 4.1420118343195265,
      "grad_norm": 0.55029296875,
      "learning_rate": 6.0060060060060066e-05,
      "loss": 0.0958,
      "step": 700
    },
    {
      "epoch": 4.1420118343195265,
      "eval_loss": 0.9950948357582092,
      "eval_runtime": 5.5007,
      "eval_samples_per_second": 16.18,
      "eval_steps_per_second": 2.182,
      "step": 700
    },
    {
      "epoch": 4.289940828402367,
      "grad_norm": 0.6220703125,
      "learning_rate": 5.505505505505506e-05,
      "loss": 0.0872,
      "step": 725
    },
    {
      "epoch": 4.289940828402367,
      "eval_loss": 1.022282600402832,
      "eval_runtime": 5.4552,
      "eval_samples_per_second": 16.315,
      "eval_steps_per_second": 2.2,
      "step": 725
    },
    {
      "epoch": 4.437869822485207,
      "grad_norm": 0.74267578125,
      "learning_rate": 5.005005005005006e-05,
      "loss": 0.0892,
      "step": 750
    },
    {
      "epoch": 4.437869822485207,
      "eval_loss": 1.0246338844299316,
      "eval_runtime": 5.4786,
      "eval_samples_per_second": 16.245,
      "eval_steps_per_second": 2.19,
      "step": 750
    },
    {
      "epoch": 4.585798816568047,
      "grad_norm": 0.69970703125,
      "learning_rate": 4.5045045045045046e-05,
      "loss": 0.0892,
      "step": 775
    },
    {
      "epoch": 4.585798816568047,
      "eval_loss": 1.037864327430725,
      "eval_runtime": 5.5314,
      "eval_samples_per_second": 16.09,
      "eval_steps_per_second": 2.169,
      "step": 775
    },
    {
      "epoch": 4.733727810650888,
      "grad_norm": 0.5244140625,
      "learning_rate": 4.0040040040040046e-05,
      "loss": 0.086,
      "step": 800
    },
    {
      "epoch": 4.733727810650888,
      "eval_loss": 1.0276521444320679,
      "eval_runtime": 5.5056,
      "eval_samples_per_second": 16.165,
      "eval_steps_per_second": 2.18,
      "step": 800
    },
    {
      "epoch": 4.881656804733728,
      "grad_norm": 0.7646484375,
      "learning_rate": 3.503503503503503e-05,
      "loss": 0.0871,
      "step": 825
    },
    {
      "epoch": 4.881656804733728,
      "eval_loss": 1.058767318725586,
      "eval_runtime": 5.4583,
      "eval_samples_per_second": 16.305,
      "eval_steps_per_second": 2.198,
      "step": 825
    },
    {
      "epoch": 5.029585798816568,
      "grad_norm": 0.45458984375,
      "learning_rate": 3.0030030030030033e-05,
      "loss": 0.0808,
      "step": 850
    },
    {
      "epoch": 5.029585798816568,
      "eval_loss": 1.081122636795044,
      "eval_runtime": 5.4567,
      "eval_samples_per_second": 16.31,
      "eval_steps_per_second": 2.199,
      "step": 850
    },
    {
      "epoch": 5.177514792899408,
      "grad_norm": 0.5234375,
      "learning_rate": 2.502502502502503e-05,
      "loss": 0.0512,
      "step": 875
    },
    {
      "epoch": 5.177514792899408,
      "eval_loss": 1.168296217918396,
      "eval_runtime": 5.4565,
      "eval_samples_per_second": 16.311,
      "eval_steps_per_second": 2.199,
      "step": 875
    },
    {
      "epoch": 5.325443786982248,
      "grad_norm": 0.43701171875,
      "learning_rate": 2.0020020020020023e-05,
      "loss": 0.0512,
      "step": 900
    },
    {
      "epoch": 5.325443786982248,
      "eval_loss": 1.161507487297058,
      "eval_runtime": 5.5102,
      "eval_samples_per_second": 16.152,
      "eval_steps_per_second": 2.178,
      "step": 900
    },
    {
      "epoch": 5.4733727810650885,
      "grad_norm": 0.56298828125,
      "learning_rate": 1.5015015015015016e-05,
      "loss": 0.051,
      "step": 925
    },
    {
      "epoch": 5.4733727810650885,
      "eval_loss": 1.1937472820281982,
      "eval_runtime": 5.5151,
      "eval_samples_per_second": 16.138,
      "eval_steps_per_second": 2.176,
      "step": 925
    },
    {
      "epoch": 5.621301775147929,
      "grad_norm": 0.56396484375,
      "learning_rate": 1.0010010010010011e-05,
      "loss": 0.0493,
      "step": 950
    },
    {
      "epoch": 5.621301775147929,
      "eval_loss": 1.184690237045288,
      "eval_runtime": 5.4656,
      "eval_samples_per_second": 16.284,
      "eval_steps_per_second": 2.196,
      "step": 950
    },
    {
      "epoch": 5.769230769230769,
      "grad_norm": 0.43017578125,
      "learning_rate": 5.005005005005006e-06,
      "loss": 0.0496,
      "step": 975
    },
    {
      "epoch": 5.769230769230769,
      "eval_loss": 1.1814072132110596,
      "eval_runtime": 5.4875,
      "eval_samples_per_second": 16.219,
      "eval_steps_per_second": 2.187,
      "step": 975
    },
    {
      "epoch": 5.9171597633136095,
      "grad_norm": 0.56103515625,
      "learning_rate": 0.0,
      "loss": 0.0469,
      "step": 1000
    },
    {
      "epoch": 5.9171597633136095,
      "eval_loss": 1.1828829050064087,
      "eval_runtime": 5.4573,
      "eval_samples_per_second": 16.308,
      "eval_steps_per_second": 2.199,
      "step": 1000
    },
    {
      "epoch": 5.9171597633136095,
      "step": 1000,
      "total_flos": 1.75885655212032e+17,
      "train_loss": 0.27826149678230283,
      "train_runtime": 1217.2703,
      "train_samples_per_second": 3.286,
      "train_steps_per_second": 0.822
    }
  ],
  "logging_steps": 25,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 6,
  "save_steps": 25,
  "total_flos": 1.75885655212032e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}