{ "best_global_step": 1050, "best_metric": 4.7181901931762695, "best_model_checkpoint": ".../training_output/checkpoint-800", "epoch": 2.0, "eval_steps": 50, "global_step": 1140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017543859649122806, "grad_norm": 0.8221026659011841, "learning_rate": 7.894736842105263e-07, "loss": 4.957, "step": 10 }, { "epoch": 0.03508771929824561, "grad_norm": 0.8544751405715942, "learning_rate": 1.6666666666666667e-06, "loss": 4.9467, "step": 20 }, { "epoch": 0.05263157894736842, "grad_norm": 0.964083731174469, "learning_rate": 2.5438596491228075e-06, "loss": 4.9452, "step": 30 }, { "epoch": 0.07017543859649122, "grad_norm": 0.9615139365196228, "learning_rate": 3.421052631578948e-06, "loss": 4.9325, "step": 40 }, { "epoch": 0.08771929824561403, "grad_norm": 1.156923770904541, "learning_rate": 4.298245614035088e-06, "loss": 4.9056, "step": 50 }, { "epoch": 0.08771929824561403, "eval_q2q_data_loss": 4.880394458770752, "eval_q2q_data_runtime": 5.5966, "eval_q2q_data_samples_per_second": 314.295, "eval_q2q_data_steps_per_second": 19.655, "step": 50 }, { "epoch": 0.08771929824561403, "eval_q2p_data_loss": 4.922183990478516, "eval_q2p_data_runtime": 7.55, "eval_q2p_data_samples_per_second": 53.775, "eval_q2p_data_steps_per_second": 3.444, "step": 50 }, { "epoch": 0.10526315789473684, "grad_norm": 1.2874988317489624, "learning_rate": 5.175438596491229e-06, "loss": 4.9041, "step": 60 }, { "epoch": 0.12280701754385964, "grad_norm": 1.5450624227523804, "learning_rate": 6.0526315789473685e-06, "loss": 4.8866, "step": 70 }, { "epoch": 0.14035087719298245, "grad_norm": 1.8990825414657593, "learning_rate": 6.92982456140351e-06, "loss": 4.844, "step": 80 }, { "epoch": 0.15789473684210525, "grad_norm": 2.0947864055633545, "learning_rate": 7.80701754385965e-06, "loss": 4.8064, "step": 90 }, { "epoch": 0.17543859649122806, "grad_norm": 2.2433862686157227, "learning_rate": 8.68421052631579e-06, "loss": 4.8182, "step": 100 }, { "epoch": 0.17543859649122806, "eval_q2q_data_loss": 4.724327087402344, "eval_q2q_data_runtime": 5.5749, "eval_q2q_data_samples_per_second": 315.523, "eval_q2q_data_steps_per_second": 19.731, "step": 100 }, { "epoch": 0.17543859649122806, "eval_q2p_data_loss": 4.865963459014893, "eval_q2p_data_runtime": 7.5397, "eval_q2p_data_samples_per_second": 53.849, "eval_q2p_data_steps_per_second": 3.448, "step": 100 }, { "epoch": 0.19298245614035087, "grad_norm": 2.198146104812622, "learning_rate": 9.56140350877193e-06, "loss": 4.7791, "step": 110 }, { "epoch": 0.21052631578947367, "grad_norm": 2.6786892414093018, "learning_rate": 9.951267056530215e-06, "loss": 4.7659, "step": 120 }, { "epoch": 0.22807017543859648, "grad_norm": 2.485137462615967, "learning_rate": 9.853801169590644e-06, "loss": 4.7572, "step": 130 }, { "epoch": 0.24561403508771928, "grad_norm": 2.5113883018493652, "learning_rate": 9.756335282651072e-06, "loss": 4.7234, "step": 140 }, { "epoch": 0.2631578947368421, "grad_norm": 3.184298276901245, "learning_rate": 9.658869395711503e-06, "loss": 4.726, "step": 150 }, { "epoch": 0.2631578947368421, "eval_q2q_data_loss": 4.626772403717041, "eval_q2q_data_runtime": 5.5905, "eval_q2q_data_samples_per_second": 314.638, "eval_q2q_data_steps_per_second": 19.676, "step": 150 }, { "epoch": 0.2631578947368421, "eval_q2p_data_loss": 4.871231555938721, "eval_q2p_data_runtime": 7.5434, "eval_q2p_data_samples_per_second": 53.822, "eval_q2p_data_steps_per_second": 3.447, "step": 150 }, { "epoch": 0.2807017543859649, "grad_norm": 3.1563026905059814, "learning_rate": 9.56140350877193e-06, "loss": 4.6932, "step": 160 }, { "epoch": 0.2982456140350877, "grad_norm": 3.4077727794647217, "learning_rate": 9.463937621832359e-06, "loss": 4.6654, "step": 170 }, { "epoch": 0.3157894736842105, "grad_norm": 3.617626428604126, "learning_rate": 9.366471734892788e-06, "loss": 4.6776, "step": 180 }, { "epoch": 0.3333333333333333, "grad_norm": 4.701232433319092, "learning_rate": 9.269005847953217e-06, "loss": 4.6617, "step": 190 }, { "epoch": 0.3508771929824561, "grad_norm": 7.48028564453125, "learning_rate": 9.171539961013646e-06, "loss": 4.6928, "step": 200 }, { "epoch": 0.3508771929824561, "eval_q2q_data_loss": 4.558098793029785, "eval_q2q_data_runtime": 5.5778, "eval_q2q_data_samples_per_second": 315.355, "eval_q2q_data_steps_per_second": 19.721, "step": 200 }, { "epoch": 0.3508771929824561, "eval_q2p_data_loss": 4.881445407867432, "eval_q2p_data_runtime": 7.5112, "eval_q2p_data_samples_per_second": 54.053, "eval_q2p_data_steps_per_second": 3.462, "step": 200 }, { "epoch": 0.3684210526315789, "grad_norm": 4.592555522918701, "learning_rate": 9.074074074074075e-06, "loss": 4.6497, "step": 210 }, { "epoch": 0.38596491228070173, "grad_norm": 4.758955478668213, "learning_rate": 8.976608187134503e-06, "loss": 4.677, "step": 220 }, { "epoch": 0.40350877192982454, "grad_norm": 4.005542278289795, "learning_rate": 8.879142300194934e-06, "loss": 4.6344, "step": 230 }, { "epoch": 0.42105263157894735, "grad_norm": 5.429654598236084, "learning_rate": 8.781676413255361e-06, "loss": 4.6612, "step": 240 }, { "epoch": 0.43859649122807015, "grad_norm": 5.14253044128418, "learning_rate": 8.68421052631579e-06, "loss": 4.6274, "step": 250 }, { "epoch": 0.43859649122807015, "eval_q2q_data_loss": 4.515370845794678, "eval_q2q_data_runtime": 5.5777, "eval_q2q_data_samples_per_second": 315.36, "eval_q2q_data_steps_per_second": 19.721, "step": 250 }, { "epoch": 0.43859649122807015, "eval_q2p_data_loss": 4.839608669281006, "eval_q2p_data_runtime": 7.5286, "eval_q2p_data_samples_per_second": 53.928, "eval_q2p_data_steps_per_second": 3.454, "step": 250 }, { "epoch": 0.45614035087719296, "grad_norm": 4.397937774658203, "learning_rate": 8.586744639376219e-06, "loss": 4.6556, "step": 260 }, { "epoch": 0.47368421052631576, "grad_norm": 6.12044095993042, "learning_rate": 8.489278752436648e-06, "loss": 4.6382, "step": 270 }, { "epoch": 0.49122807017543857, "grad_norm": 8.43116283416748, "learning_rate": 8.391812865497077e-06, "loss": 4.6053, "step": 280 }, { "epoch": 0.5087719298245614, "grad_norm": 7.88032341003418, "learning_rate": 8.294346978557506e-06, "loss": 4.6131, "step": 290 }, { "epoch": 0.5263157894736842, "grad_norm": 6.561196804046631, "learning_rate": 8.196881091617934e-06, "loss": 4.6453, "step": 300 }, { "epoch": 0.5263157894736842, "eval_q2q_data_loss": 4.495702743530273, "eval_q2q_data_runtime": 5.5691, "eval_q2q_data_samples_per_second": 315.85, "eval_q2q_data_steps_per_second": 19.752, "step": 300 }, { "epoch": 0.5263157894736842, "eval_q2p_data_loss": 4.831414222717285, "eval_q2p_data_runtime": 7.5076, "eval_q2p_data_samples_per_second": 54.079, "eval_q2p_data_steps_per_second": 3.463, "step": 300 }, { "epoch": 0.543859649122807, "grad_norm": 7.7354536056518555, "learning_rate": 8.099415204678363e-06, "loss": 4.5819, "step": 310 }, { "epoch": 0.5614035087719298, "grad_norm": 6.592026233673096, "learning_rate": 8.001949317738792e-06, "loss": 4.5948, "step": 320 }, { "epoch": 0.5789473684210527, "grad_norm": 8.176568031311035, "learning_rate": 7.904483430799221e-06, "loss": 4.5288, "step": 330 }, { "epoch": 0.5964912280701754, "grad_norm": 8.80689811706543, "learning_rate": 7.80701754385965e-06, "loss": 4.6152, "step": 340 }, { "epoch": 0.6140350877192983, "grad_norm": 6.051924228668213, "learning_rate": 7.70955165692008e-06, "loss": 4.5831, "step": 350 }, { "epoch": 0.6140350877192983, "eval_q2q_data_loss": 4.4657182693481445, "eval_q2q_data_runtime": 5.5705, "eval_q2q_data_samples_per_second": 315.77, "eval_q2q_data_steps_per_second": 19.747, "step": 350 }, { "epoch": 0.6140350877192983, "eval_q2p_data_loss": 4.795331001281738, "eval_q2p_data_runtime": 7.5177, "eval_q2p_data_samples_per_second": 54.006, "eval_q2p_data_steps_per_second": 3.458, "step": 350 }, { "epoch": 0.631578947368421, "grad_norm": 6.087244510650635, "learning_rate": 7.612085769980507e-06, "loss": 4.5507, "step": 360 }, { "epoch": 0.6491228070175439, "grad_norm": 8.209424018859863, "learning_rate": 7.5146198830409365e-06, "loss": 4.5718, "step": 370 }, { "epoch": 0.6666666666666666, "grad_norm": 11.899641990661621, "learning_rate": 7.417153996101365e-06, "loss": 4.6269, "step": 380 }, { "epoch": 0.6842105263157895, "grad_norm": 10.490060806274414, "learning_rate": 7.319688109161795e-06, "loss": 4.6017, "step": 390 }, { "epoch": 0.7017543859649122, "grad_norm": 6.545611381530762, "learning_rate": 7.222222222222223e-06, "loss": 4.5155, "step": 400 }, { "epoch": 0.7017543859649122, "eval_q2q_data_loss": 4.439589500427246, "eval_q2q_data_runtime": 5.563, "eval_q2q_data_samples_per_second": 316.195, "eval_q2q_data_steps_per_second": 19.773, "step": 400 }, { "epoch": 0.7017543859649122, "eval_q2p_data_loss": 4.769360542297363, "eval_q2p_data_runtime": 7.5013, "eval_q2p_data_samples_per_second": 54.124, "eval_q2p_data_steps_per_second": 3.466, "step": 400 }, { "epoch": 0.7192982456140351, "grad_norm": 9.658538818359375, "learning_rate": 7.124756335282652e-06, "loss": 4.5055, "step": 410 }, { "epoch": 0.7368421052631579, "grad_norm": 11.859044075012207, "learning_rate": 7.02729044834308e-06, "loss": 4.534, "step": 420 }, { "epoch": 0.7543859649122807, "grad_norm": 6.311577320098877, "learning_rate": 6.92982456140351e-06, "loss": 4.5358, "step": 430 }, { "epoch": 0.7719298245614035, "grad_norm": 15.303114891052246, "learning_rate": 6.832358674463938e-06, "loss": 4.5443, "step": 440 }, { "epoch": 0.7894736842105263, "grad_norm": 7.770440101623535, "learning_rate": 6.7348927875243675e-06, "loss": 4.5309, "step": 450 }, { "epoch": 0.7894736842105263, "eval_q2q_data_loss": 4.418254852294922, "eval_q2q_data_runtime": 5.5809, "eval_q2q_data_samples_per_second": 315.182, "eval_q2q_data_steps_per_second": 19.71, "step": 450 }, { "epoch": 0.7894736842105263, "eval_q2p_data_loss": 4.7750725746154785, "eval_q2p_data_runtime": 7.5356, "eval_q2p_data_samples_per_second": 53.878, "eval_q2p_data_steps_per_second": 3.45, "step": 450 }, { "epoch": 0.8070175438596491, "grad_norm": 10.787198066711426, "learning_rate": 6.637426900584796e-06, "loss": 4.5952, "step": 460 }, { "epoch": 0.8245614035087719, "grad_norm": 6.622506141662598, "learning_rate": 6.539961013645225e-06, "loss": 4.5561, "step": 470 }, { "epoch": 0.8421052631578947, "grad_norm": 9.452810287475586, "learning_rate": 6.442495126705654e-06, "loss": 4.5191, "step": 480 }, { "epoch": 0.8596491228070176, "grad_norm": 8.921065330505371, "learning_rate": 6.345029239766083e-06, "loss": 4.5066, "step": 490 }, { "epoch": 0.8771929824561403, "grad_norm": 6.36785364151001, "learning_rate": 6.247563352826511e-06, "loss": 4.4875, "step": 500 }, { "epoch": 0.8771929824561403, "eval_q2q_data_loss": 4.413846015930176, "eval_q2q_data_runtime": 5.5964, "eval_q2q_data_samples_per_second": 314.308, "eval_q2q_data_steps_per_second": 19.655, "step": 500 }, { "epoch": 0.8771929824561403, "eval_q2p_data_loss": 4.819548606872559, "eval_q2p_data_runtime": 7.5407, "eval_q2p_data_samples_per_second": 53.841, "eval_q2p_data_steps_per_second": 3.448, "step": 500 }, { "epoch": 0.8947368421052632, "grad_norm": 8.613053321838379, "learning_rate": 6.15009746588694e-06, "loss": 4.5051, "step": 510 }, { "epoch": 0.9122807017543859, "grad_norm": 6.249648571014404, "learning_rate": 6.0526315789473685e-06, "loss": 4.4872, "step": 520 }, { "epoch": 0.9298245614035088, "grad_norm": 14.66945743560791, "learning_rate": 5.9551656920077984e-06, "loss": 4.4918, "step": 530 }, { "epoch": 0.9473684210526315, "grad_norm": 13.305913925170898, "learning_rate": 5.857699805068227e-06, "loss": 4.5357, "step": 540 }, { "epoch": 0.9649122807017544, "grad_norm": 10.659647941589355, "learning_rate": 5.760233918128656e-06, "loss": 4.4898, "step": 550 }, { "epoch": 0.9649122807017544, "eval_q2q_data_loss": 4.375401020050049, "eval_q2q_data_runtime": 5.5712, "eval_q2q_data_samples_per_second": 315.731, "eval_q2q_data_steps_per_second": 19.744, "step": 550 }, { "epoch": 0.9649122807017544, "eval_q2p_data_loss": 4.779933929443359, "eval_q2p_data_runtime": 7.4961, "eval_q2p_data_samples_per_second": 54.162, "eval_q2p_data_steps_per_second": 3.468, "step": 550 }, { "epoch": 0.9824561403508771, "grad_norm": 7.730218410491943, "learning_rate": 5.662768031189084e-06, "loss": 4.5742, "step": 560 }, { "epoch": 1.0, "grad_norm": 9.418205261230469, "learning_rate": 5.565302144249514e-06, "loss": 4.5461, "step": 570 }, { "epoch": 1.0175438596491229, "grad_norm": 10.373188972473145, "learning_rate": 5.467836257309942e-06, "loss": 4.5505, "step": 580 }, { "epoch": 1.0350877192982457, "grad_norm": 11.559415817260742, "learning_rate": 5.370370370370371e-06, "loss": 4.5027, "step": 590 }, { "epoch": 1.0526315789473684, "grad_norm": 18.346025466918945, "learning_rate": 5.2729044834307995e-06, "loss": 4.5747, "step": 600 }, { "epoch": 1.0526315789473684, "eval_q2q_data_loss": 4.405951499938965, "eval_q2q_data_runtime": 5.5358, "eval_q2q_data_samples_per_second": 317.749, "eval_q2q_data_steps_per_second": 19.871, "step": 600 }, { "epoch": 1.0526315789473684, "eval_q2p_data_loss": 4.791478633880615, "eval_q2p_data_runtime": 7.389, "eval_q2p_data_samples_per_second": 54.947, "eval_q2p_data_steps_per_second": 3.519, "step": 600 }, { "epoch": 1.0701754385964912, "grad_norm": 8.272171020507812, "learning_rate": 5.175438596491229e-06, "loss": 4.5296, "step": 610 }, { "epoch": 1.087719298245614, "grad_norm": 8.837151527404785, "learning_rate": 5.077972709551658e-06, "loss": 4.4262, "step": 620 }, { "epoch": 1.1052631578947367, "grad_norm": 13.43027400970459, "learning_rate": 4.980506822612086e-06, "loss": 4.5415, "step": 630 }, { "epoch": 1.1228070175438596, "grad_norm": 8.466143608093262, "learning_rate": 4.883040935672515e-06, "loss": 4.5386, "step": 640 }, { "epoch": 1.1403508771929824, "grad_norm": 10.755400657653809, "learning_rate": 4.785575048732944e-06, "loss": 4.4552, "step": 650 }, { "epoch": 1.1403508771929824, "eval_q2q_data_loss": 4.363187789916992, "eval_q2q_data_runtime": 5.5237, "eval_q2q_data_samples_per_second": 318.449, "eval_q2q_data_steps_per_second": 19.914, "step": 650 }, { "epoch": 1.1403508771929824, "eval_q2p_data_loss": 4.810464382171631, "eval_q2p_data_runtime": 7.469, "eval_q2p_data_samples_per_second": 54.358, "eval_q2p_data_steps_per_second": 3.481, "step": 650 }, { "epoch": 1.1578947368421053, "grad_norm": 8.030132293701172, "learning_rate": 4.688109161793373e-06, "loss": 4.4473, "step": 660 }, { "epoch": 1.1754385964912282, "grad_norm": 8.19764518737793, "learning_rate": 4.590643274853801e-06, "loss": 4.5069, "step": 670 }, { "epoch": 1.1929824561403508, "grad_norm": 11.119821548461914, "learning_rate": 4.4931773879142305e-06, "loss": 4.5129, "step": 680 }, { "epoch": 1.2105263157894737, "grad_norm": 9.186931610107422, "learning_rate": 4.3957115009746595e-06, "loss": 4.4611, "step": 690 }, { "epoch": 1.2280701754385965, "grad_norm": 7.6313042640686035, "learning_rate": 4.298245614035088e-06, "loss": 4.5104, "step": 700 }, { "epoch": 1.2280701754385965, "eval_q2q_data_loss": 4.353029727935791, "eval_q2q_data_runtime": 5.559, "eval_q2q_data_samples_per_second": 316.425, "eval_q2q_data_steps_per_second": 19.788, "step": 700 }, { "epoch": 1.2280701754385965, "eval_q2p_data_loss": 4.787461757659912, "eval_q2p_data_runtime": 7.5053, "eval_q2p_data_samples_per_second": 54.095, "eval_q2p_data_steps_per_second": 3.464, "step": 700 }, { "epoch": 1.2456140350877192, "grad_norm": 12.636022567749023, "learning_rate": 4.200779727095517e-06, "loss": 4.4742, "step": 710 }, { "epoch": 1.263157894736842, "grad_norm": 16.598079681396484, "learning_rate": 4.103313840155946e-06, "loss": 4.4887, "step": 720 }, { "epoch": 1.280701754385965, "grad_norm": 6.5720038414001465, "learning_rate": 4.005847953216375e-06, "loss": 4.406, "step": 730 }, { "epoch": 1.2982456140350878, "grad_norm": 10.550318717956543, "learning_rate": 3.908382066276803e-06, "loss": 4.4049, "step": 740 }, { "epoch": 1.3157894736842106, "grad_norm": 16.054428100585938, "learning_rate": 3.8109161793372323e-06, "loss": 4.4165, "step": 750 }, { "epoch": 1.3157894736842106, "eval_q2q_data_loss": 4.348443031311035, "eval_q2q_data_runtime": 5.5669, "eval_q2q_data_samples_per_second": 315.976, "eval_q2q_data_steps_per_second": 19.76, "step": 750 }, { "epoch": 1.3157894736842106, "eval_q2p_data_loss": 4.786614894866943, "eval_q2p_data_runtime": 7.508, "eval_q2p_data_samples_per_second": 54.076, "eval_q2p_data_steps_per_second": 3.463, "step": 750 }, { "epoch": 1.3333333333333333, "grad_norm": 10.04055404663086, "learning_rate": 3.713450292397661e-06, "loss": 4.4274, "step": 760 }, { "epoch": 1.3508771929824561, "grad_norm": 12.780068397521973, "learning_rate": 3.61598440545809e-06, "loss": 4.4855, "step": 770 }, { "epoch": 1.368421052631579, "grad_norm": 10.54061222076416, "learning_rate": 3.5185185185185187e-06, "loss": 4.4571, "step": 780 }, { "epoch": 1.3859649122807016, "grad_norm": 5.75900936126709, "learning_rate": 3.421052631578948e-06, "loss": 4.4307, "step": 790 }, { "epoch": 1.4035087719298245, "grad_norm": 10.625808715820312, "learning_rate": 3.3235867446393765e-06, "loss": 4.4387, "step": 800 }, { "epoch": 1.4035087719298245, "eval_q2q_data_loss": 4.345006465911865, "eval_q2q_data_runtime": 5.5698, "eval_q2q_data_samples_per_second": 315.808, "eval_q2q_data_steps_per_second": 19.749, "step": 800 }, { "epoch": 1.4035087719298245, "eval_q2p_data_loss": 4.762818813323975, "eval_q2p_data_runtime": 7.5368, "eval_q2p_data_samples_per_second": 53.869, "eval_q2p_data_steps_per_second": 3.45, "step": 800 }, { "epoch": 1.4210526315789473, "grad_norm": 9.662367820739746, "learning_rate": 3.2261208576998056e-06, "loss": 4.4592, "step": 810 }, { "epoch": 1.4385964912280702, "grad_norm": 14.999639511108398, "learning_rate": 3.1286549707602342e-06, "loss": 4.4368, "step": 820 }, { "epoch": 1.456140350877193, "grad_norm": 17.007898330688477, "learning_rate": 3.0311890838206633e-06, "loss": 4.4863, "step": 830 }, { "epoch": 1.4736842105263157, "grad_norm": 14.116398811340332, "learning_rate": 2.933723196881092e-06, "loss": 4.463, "step": 840 }, { "epoch": 1.4912280701754386, "grad_norm": 5.4955315589904785, "learning_rate": 2.8362573099415206e-06, "loss": 4.4113, "step": 850 }, { "epoch": 1.4912280701754386, "eval_q2q_data_loss": 4.325167655944824, "eval_q2q_data_runtime": 5.5814, "eval_q2q_data_samples_per_second": 315.156, "eval_q2q_data_steps_per_second": 19.708, "step": 850 }, { "epoch": 1.4912280701754386, "eval_q2p_data_loss": 4.761044979095459, "eval_q2p_data_runtime": 7.4985, "eval_q2p_data_samples_per_second": 54.144, "eval_q2p_data_steps_per_second": 3.467, "step": 850 }, { "epoch": 1.5087719298245614, "grad_norm": 13.653097152709961, "learning_rate": 2.7387914230019497e-06, "loss": 4.4368, "step": 860 }, { "epoch": 1.526315789473684, "grad_norm": 13.720170974731445, "learning_rate": 2.6413255360623784e-06, "loss": 4.4738, "step": 870 }, { "epoch": 1.543859649122807, "grad_norm": 15.261076927185059, "learning_rate": 2.5438596491228075e-06, "loss": 4.4195, "step": 880 }, { "epoch": 1.5614035087719298, "grad_norm": 10.974407196044922, "learning_rate": 2.446393762183236e-06, "loss": 4.4478, "step": 890 }, { "epoch": 1.5789473684210527, "grad_norm": 10.83484935760498, "learning_rate": 2.3489278752436648e-06, "loss": 4.3849, "step": 900 }, { "epoch": 1.5789473684210527, "eval_q2q_data_loss": 4.314022064208984, "eval_q2q_data_runtime": 5.5727, "eval_q2q_data_samples_per_second": 315.646, "eval_q2q_data_steps_per_second": 19.739, "step": 900 }, { "epoch": 1.5789473684210527, "eval_q2p_data_loss": 4.751864910125732, "eval_q2p_data_runtime": 7.4934, "eval_q2p_data_samples_per_second": 54.181, "eval_q2p_data_steps_per_second": 3.47, "step": 900 }, { "epoch": 1.5964912280701755, "grad_norm": 21.77918243408203, "learning_rate": 2.2514619883040934e-06, "loss": 4.4896, "step": 910 }, { "epoch": 1.6140350877192984, "grad_norm": 7.528986930847168, "learning_rate": 2.1539961013645225e-06, "loss": 4.4301, "step": 920 }, { "epoch": 1.631578947368421, "grad_norm": 8.18942928314209, "learning_rate": 2.056530214424951e-06, "loss": 4.4142, "step": 930 }, { "epoch": 1.6491228070175439, "grad_norm": 10.001923561096191, "learning_rate": 1.9590643274853803e-06, "loss": 4.4582, "step": 940 }, { "epoch": 1.6666666666666665, "grad_norm": 10.730441093444824, "learning_rate": 1.861598440545809e-06, "loss": 4.5075, "step": 950 }, { "epoch": 1.6666666666666665, "eval_q2q_data_loss": 4.3189191818237305, "eval_q2q_data_runtime": 5.5874, "eval_q2q_data_samples_per_second": 314.816, "eval_q2q_data_steps_per_second": 19.687, "step": 950 }, { "epoch": 1.6666666666666665, "eval_q2p_data_loss": 4.725940704345703, "eval_q2p_data_runtime": 7.514, "eval_q2p_data_samples_per_second": 54.033, "eval_q2p_data_steps_per_second": 3.46, "step": 950 }, { "epoch": 1.6842105263157894, "grad_norm": 9.174509048461914, "learning_rate": 1.7641325536062378e-06, "loss": 4.4454, "step": 960 }, { "epoch": 1.7017543859649122, "grad_norm": 11.805915832519531, "learning_rate": 1.6666666666666667e-06, "loss": 4.3547, "step": 970 }, { "epoch": 1.719298245614035, "grad_norm": 9.230790138244629, "learning_rate": 1.5692007797270955e-06, "loss": 4.4016, "step": 980 }, { "epoch": 1.736842105263158, "grad_norm": 13.292176246643066, "learning_rate": 1.4717348927875244e-06, "loss": 4.4064, "step": 990 }, { "epoch": 1.7543859649122808, "grad_norm": 9.294161796569824, "learning_rate": 1.3742690058479533e-06, "loss": 4.4356, "step": 1000 }, { "epoch": 1.7543859649122808, "eval_q2q_data_loss": 4.3151326179504395, "eval_q2q_data_runtime": 5.5534, "eval_q2q_data_samples_per_second": 316.742, "eval_q2q_data_steps_per_second": 19.808, "step": 1000 }, { "epoch": 1.7543859649122808, "eval_q2p_data_loss": 4.727615833282471, "eval_q2p_data_runtime": 7.5335, "eval_q2p_data_samples_per_second": 53.893, "eval_q2p_data_steps_per_second": 3.451, "step": 1000 }, { "epoch": 1.7719298245614035, "grad_norm": 12.539956092834473, "learning_rate": 1.2768031189083821e-06, "loss": 4.4105, "step": 1010 }, { "epoch": 1.7894736842105263, "grad_norm": 15.329697608947754, "learning_rate": 1.179337231968811e-06, "loss": 4.4067, "step": 1020 }, { "epoch": 1.807017543859649, "grad_norm": 7.712077617645264, "learning_rate": 1.0818713450292399e-06, "loss": 4.4296, "step": 1030 }, { "epoch": 1.8245614035087718, "grad_norm": 7.909111976623535, "learning_rate": 9.844054580896685e-07, "loss": 4.4147, "step": 1040 }, { "epoch": 1.8421052631578947, "grad_norm": 16.474355697631836, "learning_rate": 8.869395711500975e-07, "loss": 4.3743, "step": 1050 }, { "epoch": 1.8421052631578947, "eval_q2q_data_loss": 4.313626289367676, "eval_q2q_data_runtime": 5.5976, "eval_q2q_data_samples_per_second": 314.244, "eval_q2q_data_steps_per_second": 19.651, "step": 1050 }, { "epoch": 1.8421052631578947, "eval_q2p_data_loss": 4.7181901931762695, "eval_q2p_data_runtime": 7.5322, "eval_q2p_data_samples_per_second": 53.902, "eval_q2p_data_steps_per_second": 3.452, "step": 1050 }, { "epoch": 1.8596491228070176, "grad_norm": 11.424127578735352, "learning_rate": 7.894736842105263e-07, "loss": 4.4065, "step": 1060 }, { "epoch": 1.8771929824561404, "grad_norm": 8.293243408203125, "learning_rate": 6.920077972709552e-07, "loss": 4.4025, "step": 1070 }, { "epoch": 1.8947368421052633, "grad_norm": 11.082077026367188, "learning_rate": 5.94541910331384e-07, "loss": 4.3912, "step": 1080 }, { "epoch": 1.912280701754386, "grad_norm": 13.221600532531738, "learning_rate": 4.970760233918129e-07, "loss": 4.3731, "step": 1090 }, { "epoch": 1.9298245614035088, "grad_norm": 16.041154861450195, "learning_rate": 3.996101364522417e-07, "loss": 4.3817, "step": 1100 }, { "epoch": 1.9298245614035088, "eval_q2q_data_loss": 4.311989784240723, "eval_q2q_data_runtime": 5.5712, "eval_q2q_data_samples_per_second": 315.734, "eval_q2q_data_steps_per_second": 19.745, "step": 1100 }, { "epoch": 1.9298245614035088, "eval_q2p_data_loss": 4.735711097717285, "eval_q2p_data_runtime": 7.4899, "eval_q2p_data_samples_per_second": 54.207, "eval_q2p_data_steps_per_second": 3.471, "step": 1100 }, { "epoch": 1.9473684210526314, "grad_norm": 8.501266479492188, "learning_rate": 3.021442495126706e-07, "loss": 4.4305, "step": 1110 }, { "epoch": 1.9649122807017543, "grad_norm": 7.625467300415039, "learning_rate": 2.046783625730994e-07, "loss": 4.3914, "step": 1120 }, { "epoch": 1.9824561403508771, "grad_norm": 11.992876052856445, "learning_rate": 1.0721247563352827e-07, "loss": 4.4753, "step": 1130 }, { "epoch": 2.0, "grad_norm": 5.4963908195495605, "learning_rate": 9.746588693957116e-09, "loss": 4.4536, "step": 1140 } ], "logging_steps": 10, "max_steps": 1140, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 36, "trial_name": null, "trial_params": null }