| { | |
| "best_global_step": 1050, | |
| "best_metric": 4.7181901931762695, | |
| "best_model_checkpoint": ".../training_output/checkpoint-800", | |
| "epoch": 2.0, | |
| "eval_steps": 50, | |
| "global_step": 1140, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.017543859649122806, | |
| "grad_norm": 0.8221026659011841, | |
| "learning_rate": 7.894736842105263e-07, | |
| "loss": 4.957, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.03508771929824561, | |
| "grad_norm": 0.8544751405715942, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 4.9467, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.05263157894736842, | |
| "grad_norm": 0.964083731174469, | |
| "learning_rate": 2.5438596491228075e-06, | |
| "loss": 4.9452, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.07017543859649122, | |
| "grad_norm": 0.9615139365196228, | |
| "learning_rate": 3.421052631578948e-06, | |
| "loss": 4.9325, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08771929824561403, | |
| "grad_norm": 1.156923770904541, | |
| "learning_rate": 4.298245614035088e-06, | |
| "loss": 4.9056, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08771929824561403, | |
| "eval_q2q_data_loss": 4.880394458770752, | |
| "eval_q2q_data_runtime": 5.5966, | |
| "eval_q2q_data_samples_per_second": 314.295, | |
| "eval_q2q_data_steps_per_second": 19.655, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08771929824561403, | |
| "eval_q2p_data_loss": 4.922183990478516, | |
| "eval_q2p_data_runtime": 7.55, | |
| "eval_q2p_data_samples_per_second": 53.775, | |
| "eval_q2p_data_steps_per_second": 3.444, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10526315789473684, | |
| "grad_norm": 1.2874988317489624, | |
| "learning_rate": 5.175438596491229e-06, | |
| "loss": 4.9041, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12280701754385964, | |
| "grad_norm": 1.5450624227523804, | |
| "learning_rate": 6.0526315789473685e-06, | |
| "loss": 4.8866, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.14035087719298245, | |
| "grad_norm": 1.8990825414657593, | |
| "learning_rate": 6.92982456140351e-06, | |
| "loss": 4.844, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.15789473684210525, | |
| "grad_norm": 2.0947864055633545, | |
| "learning_rate": 7.80701754385965e-06, | |
| "loss": 4.8064, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.17543859649122806, | |
| "grad_norm": 2.2433862686157227, | |
| "learning_rate": 8.68421052631579e-06, | |
| "loss": 4.8182, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17543859649122806, | |
| "eval_q2q_data_loss": 4.724327087402344, | |
| "eval_q2q_data_runtime": 5.5749, | |
| "eval_q2q_data_samples_per_second": 315.523, | |
| "eval_q2q_data_steps_per_second": 19.731, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.17543859649122806, | |
| "eval_q2p_data_loss": 4.865963459014893, | |
| "eval_q2p_data_runtime": 7.5397, | |
| "eval_q2p_data_samples_per_second": 53.849, | |
| "eval_q2p_data_steps_per_second": 3.448, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.19298245614035087, | |
| "grad_norm": 2.198146104812622, | |
| "learning_rate": 9.56140350877193e-06, | |
| "loss": 4.7791, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.21052631578947367, | |
| "grad_norm": 2.6786892414093018, | |
| "learning_rate": 9.951267056530215e-06, | |
| "loss": 4.7659, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.22807017543859648, | |
| "grad_norm": 2.485137462615967, | |
| "learning_rate": 9.853801169590644e-06, | |
| "loss": 4.7572, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.24561403508771928, | |
| "grad_norm": 2.5113883018493652, | |
| "learning_rate": 9.756335282651072e-06, | |
| "loss": 4.7234, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 3.184298276901245, | |
| "learning_rate": 9.658869395711503e-06, | |
| "loss": 4.726, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "eval_q2q_data_loss": 4.626772403717041, | |
| "eval_q2q_data_runtime": 5.5905, | |
| "eval_q2q_data_samples_per_second": 314.638, | |
| "eval_q2q_data_steps_per_second": 19.676, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "eval_q2p_data_loss": 4.871231555938721, | |
| "eval_q2p_data_runtime": 7.5434, | |
| "eval_q2p_data_samples_per_second": 53.822, | |
| "eval_q2p_data_steps_per_second": 3.447, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.2807017543859649, | |
| "grad_norm": 3.1563026905059814, | |
| "learning_rate": 9.56140350877193e-06, | |
| "loss": 4.6932, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.2982456140350877, | |
| "grad_norm": 3.4077727794647217, | |
| "learning_rate": 9.463937621832359e-06, | |
| "loss": 4.6654, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.3157894736842105, | |
| "grad_norm": 3.617626428604126, | |
| "learning_rate": 9.366471734892788e-06, | |
| "loss": 4.6776, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3333333333333333, | |
| "grad_norm": 4.701232433319092, | |
| "learning_rate": 9.269005847953217e-06, | |
| "loss": 4.6617, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3508771929824561, | |
| "grad_norm": 7.48028564453125, | |
| "learning_rate": 9.171539961013646e-06, | |
| "loss": 4.6928, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3508771929824561, | |
| "eval_q2q_data_loss": 4.558098793029785, | |
| "eval_q2q_data_runtime": 5.5778, | |
| "eval_q2q_data_samples_per_second": 315.355, | |
| "eval_q2q_data_steps_per_second": 19.721, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3508771929824561, | |
| "eval_q2p_data_loss": 4.881445407867432, | |
| "eval_q2p_data_runtime": 7.5112, | |
| "eval_q2p_data_samples_per_second": 54.053, | |
| "eval_q2p_data_steps_per_second": 3.462, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3684210526315789, | |
| "grad_norm": 4.592555522918701, | |
| "learning_rate": 9.074074074074075e-06, | |
| "loss": 4.6497, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.38596491228070173, | |
| "grad_norm": 4.758955478668213, | |
| "learning_rate": 8.976608187134503e-06, | |
| "loss": 4.677, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.40350877192982454, | |
| "grad_norm": 4.005542278289795, | |
| "learning_rate": 8.879142300194934e-06, | |
| "loss": 4.6344, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.42105263157894735, | |
| "grad_norm": 5.429654598236084, | |
| "learning_rate": 8.781676413255361e-06, | |
| "loss": 4.6612, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.43859649122807015, | |
| "grad_norm": 5.14253044128418, | |
| "learning_rate": 8.68421052631579e-06, | |
| "loss": 4.6274, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.43859649122807015, | |
| "eval_q2q_data_loss": 4.515370845794678, | |
| "eval_q2q_data_runtime": 5.5777, | |
| "eval_q2q_data_samples_per_second": 315.36, | |
| "eval_q2q_data_steps_per_second": 19.721, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.43859649122807015, | |
| "eval_q2p_data_loss": 4.839608669281006, | |
| "eval_q2p_data_runtime": 7.5286, | |
| "eval_q2p_data_samples_per_second": 53.928, | |
| "eval_q2p_data_steps_per_second": 3.454, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.45614035087719296, | |
| "grad_norm": 4.397937774658203, | |
| "learning_rate": 8.586744639376219e-06, | |
| "loss": 4.6556, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.47368421052631576, | |
| "grad_norm": 6.12044095993042, | |
| "learning_rate": 8.489278752436648e-06, | |
| "loss": 4.6382, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.49122807017543857, | |
| "grad_norm": 8.43116283416748, | |
| "learning_rate": 8.391812865497077e-06, | |
| "loss": 4.6053, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5087719298245614, | |
| "grad_norm": 7.88032341003418, | |
| "learning_rate": 8.294346978557506e-06, | |
| "loss": 4.6131, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "grad_norm": 6.561196804046631, | |
| "learning_rate": 8.196881091617934e-06, | |
| "loss": 4.6453, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "eval_q2q_data_loss": 4.495702743530273, | |
| "eval_q2q_data_runtime": 5.5691, | |
| "eval_q2q_data_samples_per_second": 315.85, | |
| "eval_q2q_data_steps_per_second": 19.752, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.5263157894736842, | |
| "eval_q2p_data_loss": 4.831414222717285, | |
| "eval_q2p_data_runtime": 7.5076, | |
| "eval_q2p_data_samples_per_second": 54.079, | |
| "eval_q2p_data_steps_per_second": 3.463, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.543859649122807, | |
| "grad_norm": 7.7354536056518555, | |
| "learning_rate": 8.099415204678363e-06, | |
| "loss": 4.5819, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.5614035087719298, | |
| "grad_norm": 6.592026233673096, | |
| "learning_rate": 8.001949317738792e-06, | |
| "loss": 4.5948, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.5789473684210527, | |
| "grad_norm": 8.176568031311035, | |
| "learning_rate": 7.904483430799221e-06, | |
| "loss": 4.5288, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5964912280701754, | |
| "grad_norm": 8.80689811706543, | |
| "learning_rate": 7.80701754385965e-06, | |
| "loss": 4.6152, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6140350877192983, | |
| "grad_norm": 6.051924228668213, | |
| "learning_rate": 7.70955165692008e-06, | |
| "loss": 4.5831, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6140350877192983, | |
| "eval_q2q_data_loss": 4.4657182693481445, | |
| "eval_q2q_data_runtime": 5.5705, | |
| "eval_q2q_data_samples_per_second": 315.77, | |
| "eval_q2q_data_steps_per_second": 19.747, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.6140350877192983, | |
| "eval_q2p_data_loss": 4.795331001281738, | |
| "eval_q2p_data_runtime": 7.5177, | |
| "eval_q2p_data_samples_per_second": 54.006, | |
| "eval_q2p_data_steps_per_second": 3.458, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.631578947368421, | |
| "grad_norm": 6.087244510650635, | |
| "learning_rate": 7.612085769980507e-06, | |
| "loss": 4.5507, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.6491228070175439, | |
| "grad_norm": 8.209424018859863, | |
| "learning_rate": 7.5146198830409365e-06, | |
| "loss": 4.5718, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 11.899641990661621, | |
| "learning_rate": 7.417153996101365e-06, | |
| "loss": 4.6269, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.6842105263157895, | |
| "grad_norm": 10.490060806274414, | |
| "learning_rate": 7.319688109161795e-06, | |
| "loss": 4.6017, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "grad_norm": 6.545611381530762, | |
| "learning_rate": 7.222222222222223e-06, | |
| "loss": 4.5155, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "eval_q2q_data_loss": 4.439589500427246, | |
| "eval_q2q_data_runtime": 5.563, | |
| "eval_q2q_data_samples_per_second": 316.195, | |
| "eval_q2q_data_steps_per_second": 19.773, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7017543859649122, | |
| "eval_q2p_data_loss": 4.769360542297363, | |
| "eval_q2p_data_runtime": 7.5013, | |
| "eval_q2p_data_samples_per_second": 54.124, | |
| "eval_q2p_data_steps_per_second": 3.466, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.7192982456140351, | |
| "grad_norm": 9.658538818359375, | |
| "learning_rate": 7.124756335282652e-06, | |
| "loss": 4.5055, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.7368421052631579, | |
| "grad_norm": 11.859044075012207, | |
| "learning_rate": 7.02729044834308e-06, | |
| "loss": 4.534, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.7543859649122807, | |
| "grad_norm": 6.311577320098877, | |
| "learning_rate": 6.92982456140351e-06, | |
| "loss": 4.5358, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.7719298245614035, | |
| "grad_norm": 15.303114891052246, | |
| "learning_rate": 6.832358674463938e-06, | |
| "loss": 4.5443, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "grad_norm": 7.770440101623535, | |
| "learning_rate": 6.7348927875243675e-06, | |
| "loss": 4.5309, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "eval_q2q_data_loss": 4.418254852294922, | |
| "eval_q2q_data_runtime": 5.5809, | |
| "eval_q2q_data_samples_per_second": 315.182, | |
| "eval_q2q_data_steps_per_second": 19.71, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7894736842105263, | |
| "eval_q2p_data_loss": 4.7750725746154785, | |
| "eval_q2p_data_runtime": 7.5356, | |
| "eval_q2p_data_samples_per_second": 53.878, | |
| "eval_q2p_data_steps_per_second": 3.45, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.8070175438596491, | |
| "grad_norm": 10.787198066711426, | |
| "learning_rate": 6.637426900584796e-06, | |
| "loss": 4.5952, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.8245614035087719, | |
| "grad_norm": 6.622506141662598, | |
| "learning_rate": 6.539961013645225e-06, | |
| "loss": 4.5561, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.8421052631578947, | |
| "grad_norm": 9.452810287475586, | |
| "learning_rate": 6.442495126705654e-06, | |
| "loss": 4.5191, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.8596491228070176, | |
| "grad_norm": 8.921065330505371, | |
| "learning_rate": 6.345029239766083e-06, | |
| "loss": 4.5066, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.8771929824561403, | |
| "grad_norm": 6.36785364151001, | |
| "learning_rate": 6.247563352826511e-06, | |
| "loss": 4.4875, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8771929824561403, | |
| "eval_q2q_data_loss": 4.413846015930176, | |
| "eval_q2q_data_runtime": 5.5964, | |
| "eval_q2q_data_samples_per_second": 314.308, | |
| "eval_q2q_data_steps_per_second": 19.655, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8771929824561403, | |
| "eval_q2p_data_loss": 4.819548606872559, | |
| "eval_q2p_data_runtime": 7.5407, | |
| "eval_q2p_data_samples_per_second": 53.841, | |
| "eval_q2p_data_steps_per_second": 3.448, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8947368421052632, | |
| "grad_norm": 8.613053321838379, | |
| "learning_rate": 6.15009746588694e-06, | |
| "loss": 4.5051, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.9122807017543859, | |
| "grad_norm": 6.249648571014404, | |
| "learning_rate": 6.0526315789473685e-06, | |
| "loss": 4.4872, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.9298245614035088, | |
| "grad_norm": 14.66945743560791, | |
| "learning_rate": 5.9551656920077984e-06, | |
| "loss": 4.4918, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.9473684210526315, | |
| "grad_norm": 13.305913925170898, | |
| "learning_rate": 5.857699805068227e-06, | |
| "loss": 4.5357, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.9649122807017544, | |
| "grad_norm": 10.659647941589355, | |
| "learning_rate": 5.760233918128656e-06, | |
| "loss": 4.4898, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9649122807017544, | |
| "eval_q2q_data_loss": 4.375401020050049, | |
| "eval_q2q_data_runtime": 5.5712, | |
| "eval_q2q_data_samples_per_second": 315.731, | |
| "eval_q2q_data_steps_per_second": 19.744, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9649122807017544, | |
| "eval_q2p_data_loss": 4.779933929443359, | |
| "eval_q2p_data_runtime": 7.4961, | |
| "eval_q2p_data_samples_per_second": 54.162, | |
| "eval_q2p_data_steps_per_second": 3.468, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.9824561403508771, | |
| "grad_norm": 7.730218410491943, | |
| "learning_rate": 5.662768031189084e-06, | |
| "loss": 4.5742, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 9.418205261230469, | |
| "learning_rate": 5.565302144249514e-06, | |
| "loss": 4.5461, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.0175438596491229, | |
| "grad_norm": 10.373188972473145, | |
| "learning_rate": 5.467836257309942e-06, | |
| "loss": 4.5505, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.0350877192982457, | |
| "grad_norm": 11.559415817260742, | |
| "learning_rate": 5.370370370370371e-06, | |
| "loss": 4.5027, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "grad_norm": 18.346025466918945, | |
| "learning_rate": 5.2729044834307995e-06, | |
| "loss": 4.5747, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "eval_q2q_data_loss": 4.405951499938965, | |
| "eval_q2q_data_runtime": 5.5358, | |
| "eval_q2q_data_samples_per_second": 317.749, | |
| "eval_q2q_data_steps_per_second": 19.871, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0526315789473684, | |
| "eval_q2p_data_loss": 4.791478633880615, | |
| "eval_q2p_data_runtime": 7.389, | |
| "eval_q2p_data_samples_per_second": 54.947, | |
| "eval_q2p_data_steps_per_second": 3.519, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.0701754385964912, | |
| "grad_norm": 8.272171020507812, | |
| "learning_rate": 5.175438596491229e-06, | |
| "loss": 4.5296, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.087719298245614, | |
| "grad_norm": 8.837151527404785, | |
| "learning_rate": 5.077972709551658e-06, | |
| "loss": 4.4262, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.1052631578947367, | |
| "grad_norm": 13.43027400970459, | |
| "learning_rate": 4.980506822612086e-06, | |
| "loss": 4.5415, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.1228070175438596, | |
| "grad_norm": 8.466143608093262, | |
| "learning_rate": 4.883040935672515e-06, | |
| "loss": 4.5386, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.1403508771929824, | |
| "grad_norm": 10.755400657653809, | |
| "learning_rate": 4.785575048732944e-06, | |
| "loss": 4.4552, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1403508771929824, | |
| "eval_q2q_data_loss": 4.363187789916992, | |
| "eval_q2q_data_runtime": 5.5237, | |
| "eval_q2q_data_samples_per_second": 318.449, | |
| "eval_q2q_data_steps_per_second": 19.914, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1403508771929824, | |
| "eval_q2p_data_loss": 4.810464382171631, | |
| "eval_q2p_data_runtime": 7.469, | |
| "eval_q2p_data_samples_per_second": 54.358, | |
| "eval_q2p_data_steps_per_second": 3.481, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.1578947368421053, | |
| "grad_norm": 8.030132293701172, | |
| "learning_rate": 4.688109161793373e-06, | |
| "loss": 4.4473, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.1754385964912282, | |
| "grad_norm": 8.19764518737793, | |
| "learning_rate": 4.590643274853801e-06, | |
| "loss": 4.5069, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.1929824561403508, | |
| "grad_norm": 11.119821548461914, | |
| "learning_rate": 4.4931773879142305e-06, | |
| "loss": 4.5129, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.2105263157894737, | |
| "grad_norm": 9.186931610107422, | |
| "learning_rate": 4.3957115009746595e-06, | |
| "loss": 4.4611, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.2280701754385965, | |
| "grad_norm": 7.6313042640686035, | |
| "learning_rate": 4.298245614035088e-06, | |
| "loss": 4.5104, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2280701754385965, | |
| "eval_q2q_data_loss": 4.353029727935791, | |
| "eval_q2q_data_runtime": 5.559, | |
| "eval_q2q_data_samples_per_second": 316.425, | |
| "eval_q2q_data_steps_per_second": 19.788, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2280701754385965, | |
| "eval_q2p_data_loss": 4.787461757659912, | |
| "eval_q2p_data_runtime": 7.5053, | |
| "eval_q2p_data_samples_per_second": 54.095, | |
| "eval_q2p_data_steps_per_second": 3.464, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.2456140350877192, | |
| "grad_norm": 12.636022567749023, | |
| "learning_rate": 4.200779727095517e-06, | |
| "loss": 4.4742, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.263157894736842, | |
| "grad_norm": 16.598079681396484, | |
| "learning_rate": 4.103313840155946e-06, | |
| "loss": 4.4887, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.280701754385965, | |
| "grad_norm": 6.5720038414001465, | |
| "learning_rate": 4.005847953216375e-06, | |
| "loss": 4.406, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.2982456140350878, | |
| "grad_norm": 10.550318717956543, | |
| "learning_rate": 3.908382066276803e-06, | |
| "loss": 4.4049, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "grad_norm": 16.054428100585938, | |
| "learning_rate": 3.8109161793372323e-06, | |
| "loss": 4.4165, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "eval_q2q_data_loss": 4.348443031311035, | |
| "eval_q2q_data_runtime": 5.5669, | |
| "eval_q2q_data_samples_per_second": 315.976, | |
| "eval_q2q_data_steps_per_second": 19.76, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3157894736842106, | |
| "eval_q2p_data_loss": 4.786614894866943, | |
| "eval_q2p_data_runtime": 7.508, | |
| "eval_q2p_data_samples_per_second": 54.076, | |
| "eval_q2p_data_steps_per_second": 3.463, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.3333333333333333, | |
| "grad_norm": 10.04055404663086, | |
| "learning_rate": 3.713450292397661e-06, | |
| "loss": 4.4274, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.3508771929824561, | |
| "grad_norm": 12.780068397521973, | |
| "learning_rate": 3.61598440545809e-06, | |
| "loss": 4.4855, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.368421052631579, | |
| "grad_norm": 10.54061222076416, | |
| "learning_rate": 3.5185185185185187e-06, | |
| "loss": 4.4571, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.3859649122807016, | |
| "grad_norm": 5.75900936126709, | |
| "learning_rate": 3.421052631578948e-06, | |
| "loss": 4.4307, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.4035087719298245, | |
| "grad_norm": 10.625808715820312, | |
| "learning_rate": 3.3235867446393765e-06, | |
| "loss": 4.4387, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4035087719298245, | |
| "eval_q2q_data_loss": 4.345006465911865, | |
| "eval_q2q_data_runtime": 5.5698, | |
| "eval_q2q_data_samples_per_second": 315.808, | |
| "eval_q2q_data_steps_per_second": 19.749, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4035087719298245, | |
| "eval_q2p_data_loss": 4.762818813323975, | |
| "eval_q2p_data_runtime": 7.5368, | |
| "eval_q2p_data_samples_per_second": 53.869, | |
| "eval_q2p_data_steps_per_second": 3.45, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.4210526315789473, | |
| "grad_norm": 9.662367820739746, | |
| "learning_rate": 3.2261208576998056e-06, | |
| "loss": 4.4592, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.4385964912280702, | |
| "grad_norm": 14.999639511108398, | |
| "learning_rate": 3.1286549707602342e-06, | |
| "loss": 4.4368, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.456140350877193, | |
| "grad_norm": 17.007898330688477, | |
| "learning_rate": 3.0311890838206633e-06, | |
| "loss": 4.4863, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.4736842105263157, | |
| "grad_norm": 14.116398811340332, | |
| "learning_rate": 2.933723196881092e-06, | |
| "loss": 4.463, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.4912280701754386, | |
| "grad_norm": 5.4955315589904785, | |
| "learning_rate": 2.8362573099415206e-06, | |
| "loss": 4.4113, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.4912280701754386, | |
| "eval_q2q_data_loss": 4.325167655944824, | |
| "eval_q2q_data_runtime": 5.5814, | |
| "eval_q2q_data_samples_per_second": 315.156, | |
| "eval_q2q_data_steps_per_second": 19.708, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.4912280701754386, | |
| "eval_q2p_data_loss": 4.761044979095459, | |
| "eval_q2p_data_runtime": 7.4985, | |
| "eval_q2p_data_samples_per_second": 54.144, | |
| "eval_q2p_data_steps_per_second": 3.467, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.5087719298245614, | |
| "grad_norm": 13.653097152709961, | |
| "learning_rate": 2.7387914230019497e-06, | |
| "loss": 4.4368, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.526315789473684, | |
| "grad_norm": 13.720170974731445, | |
| "learning_rate": 2.6413255360623784e-06, | |
| "loss": 4.4738, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.543859649122807, | |
| "grad_norm": 15.261076927185059, | |
| "learning_rate": 2.5438596491228075e-06, | |
| "loss": 4.4195, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.5614035087719298, | |
| "grad_norm": 10.974407196044922, | |
| "learning_rate": 2.446393762183236e-06, | |
| "loss": 4.4478, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "grad_norm": 10.83484935760498, | |
| "learning_rate": 2.3489278752436648e-06, | |
| "loss": 4.3849, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "eval_q2q_data_loss": 4.314022064208984, | |
| "eval_q2q_data_runtime": 5.5727, | |
| "eval_q2q_data_samples_per_second": 315.646, | |
| "eval_q2q_data_steps_per_second": 19.739, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5789473684210527, | |
| "eval_q2p_data_loss": 4.751864910125732, | |
| "eval_q2p_data_runtime": 7.4934, | |
| "eval_q2p_data_samples_per_second": 54.181, | |
| "eval_q2p_data_steps_per_second": 3.47, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.5964912280701755, | |
| "grad_norm": 21.77918243408203, | |
| "learning_rate": 2.2514619883040934e-06, | |
| "loss": 4.4896, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.6140350877192984, | |
| "grad_norm": 7.528986930847168, | |
| "learning_rate": 2.1539961013645225e-06, | |
| "loss": 4.4301, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.631578947368421, | |
| "grad_norm": 8.18942928314209, | |
| "learning_rate": 2.056530214424951e-06, | |
| "loss": 4.4142, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.6491228070175439, | |
| "grad_norm": 10.001923561096191, | |
| "learning_rate": 1.9590643274853803e-06, | |
| "loss": 4.4582, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 10.730441093444824, | |
| "learning_rate": 1.861598440545809e-06, | |
| "loss": 4.5075, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "eval_q2q_data_loss": 4.3189191818237305, | |
| "eval_q2q_data_runtime": 5.5874, | |
| "eval_q2q_data_samples_per_second": 314.816, | |
| "eval_q2q_data_steps_per_second": 19.687, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "eval_q2p_data_loss": 4.725940704345703, | |
| "eval_q2p_data_runtime": 7.514, | |
| "eval_q2p_data_samples_per_second": 54.033, | |
| "eval_q2p_data_steps_per_second": 3.46, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.6842105263157894, | |
| "grad_norm": 9.174509048461914, | |
| "learning_rate": 1.7641325536062378e-06, | |
| "loss": 4.4454, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.7017543859649122, | |
| "grad_norm": 11.805915832519531, | |
| "learning_rate": 1.6666666666666667e-06, | |
| "loss": 4.3547, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.719298245614035, | |
| "grad_norm": 9.230790138244629, | |
| "learning_rate": 1.5692007797270955e-06, | |
| "loss": 4.4016, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.736842105263158, | |
| "grad_norm": 13.292176246643066, | |
| "learning_rate": 1.4717348927875244e-06, | |
| "loss": 4.4064, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "grad_norm": 9.294161796569824, | |
| "learning_rate": 1.3742690058479533e-06, | |
| "loss": 4.4356, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "eval_q2q_data_loss": 4.3151326179504395, | |
| "eval_q2q_data_runtime": 5.5534, | |
| "eval_q2q_data_samples_per_second": 316.742, | |
| "eval_q2q_data_steps_per_second": 19.808, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "eval_q2p_data_loss": 4.727615833282471, | |
| "eval_q2p_data_runtime": 7.5335, | |
| "eval_q2p_data_samples_per_second": 53.893, | |
| "eval_q2p_data_steps_per_second": 3.451, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.7719298245614035, | |
| "grad_norm": 12.539956092834473, | |
| "learning_rate": 1.2768031189083821e-06, | |
| "loss": 4.4105, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.7894736842105263, | |
| "grad_norm": 15.329697608947754, | |
| "learning_rate": 1.179337231968811e-06, | |
| "loss": 4.4067, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.807017543859649, | |
| "grad_norm": 7.712077617645264, | |
| "learning_rate": 1.0818713450292399e-06, | |
| "loss": 4.4296, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.8245614035087718, | |
| "grad_norm": 7.909111976623535, | |
| "learning_rate": 9.844054580896685e-07, | |
| "loss": 4.4147, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "grad_norm": 16.474355697631836, | |
| "learning_rate": 8.869395711500975e-07, | |
| "loss": 4.3743, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "eval_q2q_data_loss": 4.313626289367676, | |
| "eval_q2q_data_runtime": 5.5976, | |
| "eval_q2q_data_samples_per_second": 314.244, | |
| "eval_q2q_data_steps_per_second": 19.651, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8421052631578947, | |
| "eval_q2p_data_loss": 4.7181901931762695, | |
| "eval_q2p_data_runtime": 7.5322, | |
| "eval_q2p_data_samples_per_second": 53.902, | |
| "eval_q2p_data_steps_per_second": 3.452, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.8596491228070176, | |
| "grad_norm": 11.424127578735352, | |
| "learning_rate": 7.894736842105263e-07, | |
| "loss": 4.4065, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.8771929824561404, | |
| "grad_norm": 8.293243408203125, | |
| "learning_rate": 6.920077972709552e-07, | |
| "loss": 4.4025, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.8947368421052633, | |
| "grad_norm": 11.082077026367188, | |
| "learning_rate": 5.94541910331384e-07, | |
| "loss": 4.3912, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.912280701754386, | |
| "grad_norm": 13.221600532531738, | |
| "learning_rate": 4.970760233918129e-07, | |
| "loss": 4.3731, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.9298245614035088, | |
| "grad_norm": 16.041154861450195, | |
| "learning_rate": 3.996101364522417e-07, | |
| "loss": 4.3817, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9298245614035088, | |
| "eval_q2q_data_loss": 4.311989784240723, | |
| "eval_q2q_data_runtime": 5.5712, | |
| "eval_q2q_data_samples_per_second": 315.734, | |
| "eval_q2q_data_steps_per_second": 19.745, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9298245614035088, | |
| "eval_q2p_data_loss": 4.735711097717285, | |
| "eval_q2p_data_runtime": 7.4899, | |
| "eval_q2p_data_samples_per_second": 54.207, | |
| "eval_q2p_data_steps_per_second": 3.471, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.9473684210526314, | |
| "grad_norm": 8.501266479492188, | |
| "learning_rate": 3.021442495126706e-07, | |
| "loss": 4.4305, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.9649122807017543, | |
| "grad_norm": 7.625467300415039, | |
| "learning_rate": 2.046783625730994e-07, | |
| "loss": 4.3914, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.9824561403508771, | |
| "grad_norm": 11.992876052856445, | |
| "learning_rate": 1.0721247563352827e-07, | |
| "loss": 4.4753, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 5.4963908195495605, | |
| "learning_rate": 9.746588693957116e-09, | |
| "loss": 4.4536, | |
| "step": 1140 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1140, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 200, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 36, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |