{
"best_global_step": 1050,
"best_metric": 4.7181901931762695,
"best_model_checkpoint": ".../training_output/checkpoint-800",
"epoch": 2.0,
"eval_steps": 50,
"global_step": 1140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017543859649122806,
"grad_norm": 0.8221026659011841,
"learning_rate": 7.894736842105263e-07,
"loss": 4.957,
"step": 10
},
{
"epoch": 0.03508771929824561,
"grad_norm": 0.8544751405715942,
"learning_rate": 1.6666666666666667e-06,
"loss": 4.9467,
"step": 20
},
{
"epoch": 0.05263157894736842,
"grad_norm": 0.964083731174469,
"learning_rate": 2.5438596491228075e-06,
"loss": 4.9452,
"step": 30
},
{
"epoch": 0.07017543859649122,
"grad_norm": 0.9615139365196228,
"learning_rate": 3.421052631578948e-06,
"loss": 4.9325,
"step": 40
},
{
"epoch": 0.08771929824561403,
"grad_norm": 1.156923770904541,
"learning_rate": 4.298245614035088e-06,
"loss": 4.9056,
"step": 50
},
{
"epoch": 0.08771929824561403,
"eval_q2q_data_loss": 4.880394458770752,
"eval_q2q_data_runtime": 5.5966,
"eval_q2q_data_samples_per_second": 314.295,
"eval_q2q_data_steps_per_second": 19.655,
"step": 50
},
{
"epoch": 0.08771929824561403,
"eval_q2p_data_loss": 4.922183990478516,
"eval_q2p_data_runtime": 7.55,
"eval_q2p_data_samples_per_second": 53.775,
"eval_q2p_data_steps_per_second": 3.444,
"step": 50
},
{
"epoch": 0.10526315789473684,
"grad_norm": 1.2874988317489624,
"learning_rate": 5.175438596491229e-06,
"loss": 4.9041,
"step": 60
},
{
"epoch": 0.12280701754385964,
"grad_norm": 1.5450624227523804,
"learning_rate": 6.0526315789473685e-06,
"loss": 4.8866,
"step": 70
},
{
"epoch": 0.14035087719298245,
"grad_norm": 1.8990825414657593,
"learning_rate": 6.92982456140351e-06,
"loss": 4.844,
"step": 80
},
{
"epoch": 0.15789473684210525,
"grad_norm": 2.0947864055633545,
"learning_rate": 7.80701754385965e-06,
"loss": 4.8064,
"step": 90
},
{
"epoch": 0.17543859649122806,
"grad_norm": 2.2433862686157227,
"learning_rate": 8.68421052631579e-06,
"loss": 4.8182,
"step": 100
},
{
"epoch": 0.17543859649122806,
"eval_q2q_data_loss": 4.724327087402344,
"eval_q2q_data_runtime": 5.5749,
"eval_q2q_data_samples_per_second": 315.523,
"eval_q2q_data_steps_per_second": 19.731,
"step": 100
},
{
"epoch": 0.17543859649122806,
"eval_q2p_data_loss": 4.865963459014893,
"eval_q2p_data_runtime": 7.5397,
"eval_q2p_data_samples_per_second": 53.849,
"eval_q2p_data_steps_per_second": 3.448,
"step": 100
},
{
"epoch": 0.19298245614035087,
"grad_norm": 2.198146104812622,
"learning_rate": 9.56140350877193e-06,
"loss": 4.7791,
"step": 110
},
{
"epoch": 0.21052631578947367,
"grad_norm": 2.6786892414093018,
"learning_rate": 9.951267056530215e-06,
"loss": 4.7659,
"step": 120
},
{
"epoch": 0.22807017543859648,
"grad_norm": 2.485137462615967,
"learning_rate": 9.853801169590644e-06,
"loss": 4.7572,
"step": 130
},
{
"epoch": 0.24561403508771928,
"grad_norm": 2.5113883018493652,
"learning_rate": 9.756335282651072e-06,
"loss": 4.7234,
"step": 140
},
{
"epoch": 0.2631578947368421,
"grad_norm": 3.184298276901245,
"learning_rate": 9.658869395711503e-06,
"loss": 4.726,
"step": 150
},
{
"epoch": 0.2631578947368421,
"eval_q2q_data_loss": 4.626772403717041,
"eval_q2q_data_runtime": 5.5905,
"eval_q2q_data_samples_per_second": 314.638,
"eval_q2q_data_steps_per_second": 19.676,
"step": 150
},
{
"epoch": 0.2631578947368421,
"eval_q2p_data_loss": 4.871231555938721,
"eval_q2p_data_runtime": 7.5434,
"eval_q2p_data_samples_per_second": 53.822,
"eval_q2p_data_steps_per_second": 3.447,
"step": 150
},
{
"epoch": 0.2807017543859649,
"grad_norm": 3.1563026905059814,
"learning_rate": 9.56140350877193e-06,
"loss": 4.6932,
"step": 160
},
{
"epoch": 0.2982456140350877,
"grad_norm": 3.4077727794647217,
"learning_rate": 9.463937621832359e-06,
"loss": 4.6654,
"step": 170
},
{
"epoch": 0.3157894736842105,
"grad_norm": 3.617626428604126,
"learning_rate": 9.366471734892788e-06,
"loss": 4.6776,
"step": 180
},
{
"epoch": 0.3333333333333333,
"grad_norm": 4.701232433319092,
"learning_rate": 9.269005847953217e-06,
"loss": 4.6617,
"step": 190
},
{
"epoch": 0.3508771929824561,
"grad_norm": 7.48028564453125,
"learning_rate": 9.171539961013646e-06,
"loss": 4.6928,
"step": 200
},
{
"epoch": 0.3508771929824561,
"eval_q2q_data_loss": 4.558098793029785,
"eval_q2q_data_runtime": 5.5778,
"eval_q2q_data_samples_per_second": 315.355,
"eval_q2q_data_steps_per_second": 19.721,
"step": 200
},
{
"epoch": 0.3508771929824561,
"eval_q2p_data_loss": 4.881445407867432,
"eval_q2p_data_runtime": 7.5112,
"eval_q2p_data_samples_per_second": 54.053,
"eval_q2p_data_steps_per_second": 3.462,
"step": 200
},
{
"epoch": 0.3684210526315789,
"grad_norm": 4.592555522918701,
"learning_rate": 9.074074074074075e-06,
"loss": 4.6497,
"step": 210
},
{
"epoch": 0.38596491228070173,
"grad_norm": 4.758955478668213,
"learning_rate": 8.976608187134503e-06,
"loss": 4.677,
"step": 220
},
{
"epoch": 0.40350877192982454,
"grad_norm": 4.005542278289795,
"learning_rate": 8.879142300194934e-06,
"loss": 4.6344,
"step": 230
},
{
"epoch": 0.42105263157894735,
"grad_norm": 5.429654598236084,
"learning_rate": 8.781676413255361e-06,
"loss": 4.6612,
"step": 240
},
{
"epoch": 0.43859649122807015,
"grad_norm": 5.14253044128418,
"learning_rate": 8.68421052631579e-06,
"loss": 4.6274,
"step": 250
},
{
"epoch": 0.43859649122807015,
"eval_q2q_data_loss": 4.515370845794678,
"eval_q2q_data_runtime": 5.5777,
"eval_q2q_data_samples_per_second": 315.36,
"eval_q2q_data_steps_per_second": 19.721,
"step": 250
},
{
"epoch": 0.43859649122807015,
"eval_q2p_data_loss": 4.839608669281006,
"eval_q2p_data_runtime": 7.5286,
"eval_q2p_data_samples_per_second": 53.928,
"eval_q2p_data_steps_per_second": 3.454,
"step": 250
},
{
"epoch": 0.45614035087719296,
"grad_norm": 4.397937774658203,
"learning_rate": 8.586744639376219e-06,
"loss": 4.6556,
"step": 260
},
{
"epoch": 0.47368421052631576,
"grad_norm": 6.12044095993042,
"learning_rate": 8.489278752436648e-06,
"loss": 4.6382,
"step": 270
},
{
"epoch": 0.49122807017543857,
"grad_norm": 8.43116283416748,
"learning_rate": 8.391812865497077e-06,
"loss": 4.6053,
"step": 280
},
{
"epoch": 0.5087719298245614,
"grad_norm": 7.88032341003418,
"learning_rate": 8.294346978557506e-06,
"loss": 4.6131,
"step": 290
},
{
"epoch": 0.5263157894736842,
"grad_norm": 6.561196804046631,
"learning_rate": 8.196881091617934e-06,
"loss": 4.6453,
"step": 300
},
{
"epoch": 0.5263157894736842,
"eval_q2q_data_loss": 4.495702743530273,
"eval_q2q_data_runtime": 5.5691,
"eval_q2q_data_samples_per_second": 315.85,
"eval_q2q_data_steps_per_second": 19.752,
"step": 300
},
{
"epoch": 0.5263157894736842,
"eval_q2p_data_loss": 4.831414222717285,
"eval_q2p_data_runtime": 7.5076,
"eval_q2p_data_samples_per_second": 54.079,
"eval_q2p_data_steps_per_second": 3.463,
"step": 300
},
{
"epoch": 0.543859649122807,
"grad_norm": 7.7354536056518555,
"learning_rate": 8.099415204678363e-06,
"loss": 4.5819,
"step": 310
},
{
"epoch": 0.5614035087719298,
"grad_norm": 6.592026233673096,
"learning_rate": 8.001949317738792e-06,
"loss": 4.5948,
"step": 320
},
{
"epoch": 0.5789473684210527,
"grad_norm": 8.176568031311035,
"learning_rate": 7.904483430799221e-06,
"loss": 4.5288,
"step": 330
},
{
"epoch": 0.5964912280701754,
"grad_norm": 8.80689811706543,
"learning_rate": 7.80701754385965e-06,
"loss": 4.6152,
"step": 340
},
{
"epoch": 0.6140350877192983,
"grad_norm": 6.051924228668213,
"learning_rate": 7.70955165692008e-06,
"loss": 4.5831,
"step": 350
},
{
"epoch": 0.6140350877192983,
"eval_q2q_data_loss": 4.4657182693481445,
"eval_q2q_data_runtime": 5.5705,
"eval_q2q_data_samples_per_second": 315.77,
"eval_q2q_data_steps_per_second": 19.747,
"step": 350
},
{
"epoch": 0.6140350877192983,
"eval_q2p_data_loss": 4.795331001281738,
"eval_q2p_data_runtime": 7.5177,
"eval_q2p_data_samples_per_second": 54.006,
"eval_q2p_data_steps_per_second": 3.458,
"step": 350
},
{
"epoch": 0.631578947368421,
"grad_norm": 6.087244510650635,
"learning_rate": 7.612085769980507e-06,
"loss": 4.5507,
"step": 360
},
{
"epoch": 0.6491228070175439,
"grad_norm": 8.209424018859863,
"learning_rate": 7.5146198830409365e-06,
"loss": 4.5718,
"step": 370
},
{
"epoch": 0.6666666666666666,
"grad_norm": 11.899641990661621,
"learning_rate": 7.417153996101365e-06,
"loss": 4.6269,
"step": 380
},
{
"epoch": 0.6842105263157895,
"grad_norm": 10.490060806274414,
"learning_rate": 7.319688109161795e-06,
"loss": 4.6017,
"step": 390
},
{
"epoch": 0.7017543859649122,
"grad_norm": 6.545611381530762,
"learning_rate": 7.222222222222223e-06,
"loss": 4.5155,
"step": 400
},
{
"epoch": 0.7017543859649122,
"eval_q2q_data_loss": 4.439589500427246,
"eval_q2q_data_runtime": 5.563,
"eval_q2q_data_samples_per_second": 316.195,
"eval_q2q_data_steps_per_second": 19.773,
"step": 400
},
{
"epoch": 0.7017543859649122,
"eval_q2p_data_loss": 4.769360542297363,
"eval_q2p_data_runtime": 7.5013,
"eval_q2p_data_samples_per_second": 54.124,
"eval_q2p_data_steps_per_second": 3.466,
"step": 400
},
{
"epoch": 0.7192982456140351,
"grad_norm": 9.658538818359375,
"learning_rate": 7.124756335282652e-06,
"loss": 4.5055,
"step": 410
},
{
"epoch": 0.7368421052631579,
"grad_norm": 11.859044075012207,
"learning_rate": 7.02729044834308e-06,
"loss": 4.534,
"step": 420
},
{
"epoch": 0.7543859649122807,
"grad_norm": 6.311577320098877,
"learning_rate": 6.92982456140351e-06,
"loss": 4.5358,
"step": 430
},
{
"epoch": 0.7719298245614035,
"grad_norm": 15.303114891052246,
"learning_rate": 6.832358674463938e-06,
"loss": 4.5443,
"step": 440
},
{
"epoch": 0.7894736842105263,
"grad_norm": 7.770440101623535,
"learning_rate": 6.7348927875243675e-06,
"loss": 4.5309,
"step": 450
},
{
"epoch": 0.7894736842105263,
"eval_q2q_data_loss": 4.418254852294922,
"eval_q2q_data_runtime": 5.5809,
"eval_q2q_data_samples_per_second": 315.182,
"eval_q2q_data_steps_per_second": 19.71,
"step": 450
},
{
"epoch": 0.7894736842105263,
"eval_q2p_data_loss": 4.7750725746154785,
"eval_q2p_data_runtime": 7.5356,
"eval_q2p_data_samples_per_second": 53.878,
"eval_q2p_data_steps_per_second": 3.45,
"step": 450
},
{
"epoch": 0.8070175438596491,
"grad_norm": 10.787198066711426,
"learning_rate": 6.637426900584796e-06,
"loss": 4.5952,
"step": 460
},
{
"epoch": 0.8245614035087719,
"grad_norm": 6.622506141662598,
"learning_rate": 6.539961013645225e-06,
"loss": 4.5561,
"step": 470
},
{
"epoch": 0.8421052631578947,
"grad_norm": 9.452810287475586,
"learning_rate": 6.442495126705654e-06,
"loss": 4.5191,
"step": 480
},
{
"epoch": 0.8596491228070176,
"grad_norm": 8.921065330505371,
"learning_rate": 6.345029239766083e-06,
"loss": 4.5066,
"step": 490
},
{
"epoch": 0.8771929824561403,
"grad_norm": 6.36785364151001,
"learning_rate": 6.247563352826511e-06,
"loss": 4.4875,
"step": 500
},
{
"epoch": 0.8771929824561403,
"eval_q2q_data_loss": 4.413846015930176,
"eval_q2q_data_runtime": 5.5964,
"eval_q2q_data_samples_per_second": 314.308,
"eval_q2q_data_steps_per_second": 19.655,
"step": 500
},
{
"epoch": 0.8771929824561403,
"eval_q2p_data_loss": 4.819548606872559,
"eval_q2p_data_runtime": 7.5407,
"eval_q2p_data_samples_per_second": 53.841,
"eval_q2p_data_steps_per_second": 3.448,
"step": 500
},
{
"epoch": 0.8947368421052632,
"grad_norm": 8.613053321838379,
"learning_rate": 6.15009746588694e-06,
"loss": 4.5051,
"step": 510
},
{
"epoch": 0.9122807017543859,
"grad_norm": 6.249648571014404,
"learning_rate": 6.0526315789473685e-06,
"loss": 4.4872,
"step": 520
},
{
"epoch": 0.9298245614035088,
"grad_norm": 14.66945743560791,
"learning_rate": 5.9551656920077984e-06,
"loss": 4.4918,
"step": 530
},
{
"epoch": 0.9473684210526315,
"grad_norm": 13.305913925170898,
"learning_rate": 5.857699805068227e-06,
"loss": 4.5357,
"step": 540
},
{
"epoch": 0.9649122807017544,
"grad_norm": 10.659647941589355,
"learning_rate": 5.760233918128656e-06,
"loss": 4.4898,
"step": 550
},
{
"epoch": 0.9649122807017544,
"eval_q2q_data_loss": 4.375401020050049,
"eval_q2q_data_runtime": 5.5712,
"eval_q2q_data_samples_per_second": 315.731,
"eval_q2q_data_steps_per_second": 19.744,
"step": 550
},
{
"epoch": 0.9649122807017544,
"eval_q2p_data_loss": 4.779933929443359,
"eval_q2p_data_runtime": 7.4961,
"eval_q2p_data_samples_per_second": 54.162,
"eval_q2p_data_steps_per_second": 3.468,
"step": 550
},
{
"epoch": 0.9824561403508771,
"grad_norm": 7.730218410491943,
"learning_rate": 5.662768031189084e-06,
"loss": 4.5742,
"step": 560
},
{
"epoch": 1.0,
"grad_norm": 9.418205261230469,
"learning_rate": 5.565302144249514e-06,
"loss": 4.5461,
"step": 570
},
{
"epoch": 1.0175438596491229,
"grad_norm": 10.373188972473145,
"learning_rate": 5.467836257309942e-06,
"loss": 4.5505,
"step": 580
},
{
"epoch": 1.0350877192982457,
"grad_norm": 11.559415817260742,
"learning_rate": 5.370370370370371e-06,
"loss": 4.5027,
"step": 590
},
{
"epoch": 1.0526315789473684,
"grad_norm": 18.346025466918945,
"learning_rate": 5.2729044834307995e-06,
"loss": 4.5747,
"step": 600
},
{
"epoch": 1.0526315789473684,
"eval_q2q_data_loss": 4.405951499938965,
"eval_q2q_data_runtime": 5.5358,
"eval_q2q_data_samples_per_second": 317.749,
"eval_q2q_data_steps_per_second": 19.871,
"step": 600
},
{
"epoch": 1.0526315789473684,
"eval_q2p_data_loss": 4.791478633880615,
"eval_q2p_data_runtime": 7.389,
"eval_q2p_data_samples_per_second": 54.947,
"eval_q2p_data_steps_per_second": 3.519,
"step": 600
},
{
"epoch": 1.0701754385964912,
"grad_norm": 8.272171020507812,
"learning_rate": 5.175438596491229e-06,
"loss": 4.5296,
"step": 610
},
{
"epoch": 1.087719298245614,
"grad_norm": 8.837151527404785,
"learning_rate": 5.077972709551658e-06,
"loss": 4.4262,
"step": 620
},
{
"epoch": 1.1052631578947367,
"grad_norm": 13.43027400970459,
"learning_rate": 4.980506822612086e-06,
"loss": 4.5415,
"step": 630
},
{
"epoch": 1.1228070175438596,
"grad_norm": 8.466143608093262,
"learning_rate": 4.883040935672515e-06,
"loss": 4.5386,
"step": 640
},
{
"epoch": 1.1403508771929824,
"grad_norm": 10.755400657653809,
"learning_rate": 4.785575048732944e-06,
"loss": 4.4552,
"step": 650
},
{
"epoch": 1.1403508771929824,
"eval_q2q_data_loss": 4.363187789916992,
"eval_q2q_data_runtime": 5.5237,
"eval_q2q_data_samples_per_second": 318.449,
"eval_q2q_data_steps_per_second": 19.914,
"step": 650
},
{
"epoch": 1.1403508771929824,
"eval_q2p_data_loss": 4.810464382171631,
"eval_q2p_data_runtime": 7.469,
"eval_q2p_data_samples_per_second": 54.358,
"eval_q2p_data_steps_per_second": 3.481,
"step": 650
},
{
"epoch": 1.1578947368421053,
"grad_norm": 8.030132293701172,
"learning_rate": 4.688109161793373e-06,
"loss": 4.4473,
"step": 660
},
{
"epoch": 1.1754385964912282,
"grad_norm": 8.19764518737793,
"learning_rate": 4.590643274853801e-06,
"loss": 4.5069,
"step": 670
},
{
"epoch": 1.1929824561403508,
"grad_norm": 11.119821548461914,
"learning_rate": 4.4931773879142305e-06,
"loss": 4.5129,
"step": 680
},
{
"epoch": 1.2105263157894737,
"grad_norm": 9.186931610107422,
"learning_rate": 4.3957115009746595e-06,
"loss": 4.4611,
"step": 690
},
{
"epoch": 1.2280701754385965,
"grad_norm": 7.6313042640686035,
"learning_rate": 4.298245614035088e-06,
"loss": 4.5104,
"step": 700
},
{
"epoch": 1.2280701754385965,
"eval_q2q_data_loss": 4.353029727935791,
"eval_q2q_data_runtime": 5.559,
"eval_q2q_data_samples_per_second": 316.425,
"eval_q2q_data_steps_per_second": 19.788,
"step": 700
},
{
"epoch": 1.2280701754385965,
"eval_q2p_data_loss": 4.787461757659912,
"eval_q2p_data_runtime": 7.5053,
"eval_q2p_data_samples_per_second": 54.095,
"eval_q2p_data_steps_per_second": 3.464,
"step": 700
},
{
"epoch": 1.2456140350877192,
"grad_norm": 12.636022567749023,
"learning_rate": 4.200779727095517e-06,
"loss": 4.4742,
"step": 710
},
{
"epoch": 1.263157894736842,
"grad_norm": 16.598079681396484,
"learning_rate": 4.103313840155946e-06,
"loss": 4.4887,
"step": 720
},
{
"epoch": 1.280701754385965,
"grad_norm": 6.5720038414001465,
"learning_rate": 4.005847953216375e-06,
"loss": 4.406,
"step": 730
},
{
"epoch": 1.2982456140350878,
"grad_norm": 10.550318717956543,
"learning_rate": 3.908382066276803e-06,
"loss": 4.4049,
"step": 740
},
{
"epoch": 1.3157894736842106,
"grad_norm": 16.054428100585938,
"learning_rate": 3.8109161793372323e-06,
"loss": 4.4165,
"step": 750
},
{
"epoch": 1.3157894736842106,
"eval_q2q_data_loss": 4.348443031311035,
"eval_q2q_data_runtime": 5.5669,
"eval_q2q_data_samples_per_second": 315.976,
"eval_q2q_data_steps_per_second": 19.76,
"step": 750
},
{
"epoch": 1.3157894736842106,
"eval_q2p_data_loss": 4.786614894866943,
"eval_q2p_data_runtime": 7.508,
"eval_q2p_data_samples_per_second": 54.076,
"eval_q2p_data_steps_per_second": 3.463,
"step": 750
},
{
"epoch": 1.3333333333333333,
"grad_norm": 10.04055404663086,
"learning_rate": 3.713450292397661e-06,
"loss": 4.4274,
"step": 760
},
{
"epoch": 1.3508771929824561,
"grad_norm": 12.780068397521973,
"learning_rate": 3.61598440545809e-06,
"loss": 4.4855,
"step": 770
},
{
"epoch": 1.368421052631579,
"grad_norm": 10.54061222076416,
"learning_rate": 3.5185185185185187e-06,
"loss": 4.4571,
"step": 780
},
{
"epoch": 1.3859649122807016,
"grad_norm": 5.75900936126709,
"learning_rate": 3.421052631578948e-06,
"loss": 4.4307,
"step": 790
},
{
"epoch": 1.4035087719298245,
"grad_norm": 10.625808715820312,
"learning_rate": 3.3235867446393765e-06,
"loss": 4.4387,
"step": 800
},
{
"epoch": 1.4035087719298245,
"eval_q2q_data_loss": 4.345006465911865,
"eval_q2q_data_runtime": 5.5698,
"eval_q2q_data_samples_per_second": 315.808,
"eval_q2q_data_steps_per_second": 19.749,
"step": 800
},
{
"epoch": 1.4035087719298245,
"eval_q2p_data_loss": 4.762818813323975,
"eval_q2p_data_runtime": 7.5368,
"eval_q2p_data_samples_per_second": 53.869,
"eval_q2p_data_steps_per_second": 3.45,
"step": 800
},
{
"epoch": 1.4210526315789473,
"grad_norm": 9.662367820739746,
"learning_rate": 3.2261208576998056e-06,
"loss": 4.4592,
"step": 810
},
{
"epoch": 1.4385964912280702,
"grad_norm": 14.999639511108398,
"learning_rate": 3.1286549707602342e-06,
"loss": 4.4368,
"step": 820
},
{
"epoch": 1.456140350877193,
"grad_norm": 17.007898330688477,
"learning_rate": 3.0311890838206633e-06,
"loss": 4.4863,
"step": 830
},
{
"epoch": 1.4736842105263157,
"grad_norm": 14.116398811340332,
"learning_rate": 2.933723196881092e-06,
"loss": 4.463,
"step": 840
},
{
"epoch": 1.4912280701754386,
"grad_norm": 5.4955315589904785,
"learning_rate": 2.8362573099415206e-06,
"loss": 4.4113,
"step": 850
},
{
"epoch": 1.4912280701754386,
"eval_q2q_data_loss": 4.325167655944824,
"eval_q2q_data_runtime": 5.5814,
"eval_q2q_data_samples_per_second": 315.156,
"eval_q2q_data_steps_per_second": 19.708,
"step": 850
},
{
"epoch": 1.4912280701754386,
"eval_q2p_data_loss": 4.761044979095459,
"eval_q2p_data_runtime": 7.4985,
"eval_q2p_data_samples_per_second": 54.144,
"eval_q2p_data_steps_per_second": 3.467,
"step": 850
},
{
"epoch": 1.5087719298245614,
"grad_norm": 13.653097152709961,
"learning_rate": 2.7387914230019497e-06,
"loss": 4.4368,
"step": 860
},
{
"epoch": 1.526315789473684,
"grad_norm": 13.720170974731445,
"learning_rate": 2.6413255360623784e-06,
"loss": 4.4738,
"step": 870
},
{
"epoch": 1.543859649122807,
"grad_norm": 15.261076927185059,
"learning_rate": 2.5438596491228075e-06,
"loss": 4.4195,
"step": 880
},
{
"epoch": 1.5614035087719298,
"grad_norm": 10.974407196044922,
"learning_rate": 2.446393762183236e-06,
"loss": 4.4478,
"step": 890
},
{
"epoch": 1.5789473684210527,
"grad_norm": 10.83484935760498,
"learning_rate": 2.3489278752436648e-06,
"loss": 4.3849,
"step": 900
},
{
"epoch": 1.5789473684210527,
"eval_q2q_data_loss": 4.314022064208984,
"eval_q2q_data_runtime": 5.5727,
"eval_q2q_data_samples_per_second": 315.646,
"eval_q2q_data_steps_per_second": 19.739,
"step": 900
},
{
"epoch": 1.5789473684210527,
"eval_q2p_data_loss": 4.751864910125732,
"eval_q2p_data_runtime": 7.4934,
"eval_q2p_data_samples_per_second": 54.181,
"eval_q2p_data_steps_per_second": 3.47,
"step": 900
},
{
"epoch": 1.5964912280701755,
"grad_norm": 21.77918243408203,
"learning_rate": 2.2514619883040934e-06,
"loss": 4.4896,
"step": 910
},
{
"epoch": 1.6140350877192984,
"grad_norm": 7.528986930847168,
"learning_rate": 2.1539961013645225e-06,
"loss": 4.4301,
"step": 920
},
{
"epoch": 1.631578947368421,
"grad_norm": 8.18942928314209,
"learning_rate": 2.056530214424951e-06,
"loss": 4.4142,
"step": 930
},
{
"epoch": 1.6491228070175439,
"grad_norm": 10.001923561096191,
"learning_rate": 1.9590643274853803e-06,
"loss": 4.4582,
"step": 940
},
{
"epoch": 1.6666666666666665,
"grad_norm": 10.730441093444824,
"learning_rate": 1.861598440545809e-06,
"loss": 4.5075,
"step": 950
},
{
"epoch": 1.6666666666666665,
"eval_q2q_data_loss": 4.3189191818237305,
"eval_q2q_data_runtime": 5.5874,
"eval_q2q_data_samples_per_second": 314.816,
"eval_q2q_data_steps_per_second": 19.687,
"step": 950
},
{
"epoch": 1.6666666666666665,
"eval_q2p_data_loss": 4.725940704345703,
"eval_q2p_data_runtime": 7.514,
"eval_q2p_data_samples_per_second": 54.033,
"eval_q2p_data_steps_per_second": 3.46,
"step": 950
},
{
"epoch": 1.6842105263157894,
"grad_norm": 9.174509048461914,
"learning_rate": 1.7641325536062378e-06,
"loss": 4.4454,
"step": 960
},
{
"epoch": 1.7017543859649122,
"grad_norm": 11.805915832519531,
"learning_rate": 1.6666666666666667e-06,
"loss": 4.3547,
"step": 970
},
{
"epoch": 1.719298245614035,
"grad_norm": 9.230790138244629,
"learning_rate": 1.5692007797270955e-06,
"loss": 4.4016,
"step": 980
},
{
"epoch": 1.736842105263158,
"grad_norm": 13.292176246643066,
"learning_rate": 1.4717348927875244e-06,
"loss": 4.4064,
"step": 990
},
{
"epoch": 1.7543859649122808,
"grad_norm": 9.294161796569824,
"learning_rate": 1.3742690058479533e-06,
"loss": 4.4356,
"step": 1000
},
{
"epoch": 1.7543859649122808,
"eval_q2q_data_loss": 4.3151326179504395,
"eval_q2q_data_runtime": 5.5534,
"eval_q2q_data_samples_per_second": 316.742,
"eval_q2q_data_steps_per_second": 19.808,
"step": 1000
},
{
"epoch": 1.7543859649122808,
"eval_q2p_data_loss": 4.727615833282471,
"eval_q2p_data_runtime": 7.5335,
"eval_q2p_data_samples_per_second": 53.893,
"eval_q2p_data_steps_per_second": 3.451,
"step": 1000
},
{
"epoch": 1.7719298245614035,
"grad_norm": 12.539956092834473,
"learning_rate": 1.2768031189083821e-06,
"loss": 4.4105,
"step": 1010
},
{
"epoch": 1.7894736842105263,
"grad_norm": 15.329697608947754,
"learning_rate": 1.179337231968811e-06,
"loss": 4.4067,
"step": 1020
},
{
"epoch": 1.807017543859649,
"grad_norm": 7.712077617645264,
"learning_rate": 1.0818713450292399e-06,
"loss": 4.4296,
"step": 1030
},
{
"epoch": 1.8245614035087718,
"grad_norm": 7.909111976623535,
"learning_rate": 9.844054580896685e-07,
"loss": 4.4147,
"step": 1040
},
{
"epoch": 1.8421052631578947,
"grad_norm": 16.474355697631836,
"learning_rate": 8.869395711500975e-07,
"loss": 4.3743,
"step": 1050
},
{
"epoch": 1.8421052631578947,
"eval_q2q_data_loss": 4.313626289367676,
"eval_q2q_data_runtime": 5.5976,
"eval_q2q_data_samples_per_second": 314.244,
"eval_q2q_data_steps_per_second": 19.651,
"step": 1050
},
{
"epoch": 1.8421052631578947,
"eval_q2p_data_loss": 4.7181901931762695,
"eval_q2p_data_runtime": 7.5322,
"eval_q2p_data_samples_per_second": 53.902,
"eval_q2p_data_steps_per_second": 3.452,
"step": 1050
},
{
"epoch": 1.8596491228070176,
"grad_norm": 11.424127578735352,
"learning_rate": 7.894736842105263e-07,
"loss": 4.4065,
"step": 1060
},
{
"epoch": 1.8771929824561404,
"grad_norm": 8.293243408203125,
"learning_rate": 6.920077972709552e-07,
"loss": 4.4025,
"step": 1070
},
{
"epoch": 1.8947368421052633,
"grad_norm": 11.082077026367188,
"learning_rate": 5.94541910331384e-07,
"loss": 4.3912,
"step": 1080
},
{
"epoch": 1.912280701754386,
"grad_norm": 13.221600532531738,
"learning_rate": 4.970760233918129e-07,
"loss": 4.3731,
"step": 1090
},
{
"epoch": 1.9298245614035088,
"grad_norm": 16.041154861450195,
"learning_rate": 3.996101364522417e-07,
"loss": 4.3817,
"step": 1100
},
{
"epoch": 1.9298245614035088,
"eval_q2q_data_loss": 4.311989784240723,
"eval_q2q_data_runtime": 5.5712,
"eval_q2q_data_samples_per_second": 315.734,
"eval_q2q_data_steps_per_second": 19.745,
"step": 1100
},
{
"epoch": 1.9298245614035088,
"eval_q2p_data_loss": 4.735711097717285,
"eval_q2p_data_runtime": 7.4899,
"eval_q2p_data_samples_per_second": 54.207,
"eval_q2p_data_steps_per_second": 3.471,
"step": 1100
},
{
"epoch": 1.9473684210526314,
"grad_norm": 8.501266479492188,
"learning_rate": 3.021442495126706e-07,
"loss": 4.4305,
"step": 1110
},
{
"epoch": 1.9649122807017543,
"grad_norm": 7.625467300415039,
"learning_rate": 2.046783625730994e-07,
"loss": 4.3914,
"step": 1120
},
{
"epoch": 1.9824561403508771,
"grad_norm": 11.992876052856445,
"learning_rate": 1.0721247563352827e-07,
"loss": 4.4753,
"step": 1130
},
{
"epoch": 2.0,
"grad_norm": 5.4963908195495605,
"learning_rate": 9.746588693957116e-09,
"loss": 4.4536,
"step": 1140
}
],
"logging_steps": 10,
"max_steps": 1140,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 36,
"trial_name": null,
"trial_params": null
}