sledopyt_embedder_with_neg / trainer_state.json
George2002's picture
Upload model checkpoint
7b14f2b verified
{
"best_global_step": 2550,
"best_metric": 4.99726676940918,
"best_model_checkpoint": ".../training_output/checkpoint-1000",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 3129,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009587727708533078,
"grad_norm": 1.2602713108062744,
"learning_rate": 2.875399361022364e-07,
"loss": 5.0879,
"step": 10
},
{
"epoch": 0.019175455417066157,
"grad_norm": 1.1363953351974487,
"learning_rate": 6.070287539936103e-07,
"loss": 5.1046,
"step": 20
},
{
"epoch": 0.028763183125599234,
"grad_norm": 1.1238548755645752,
"learning_rate": 9.265175718849841e-07,
"loss": 5.0837,
"step": 30
},
{
"epoch": 0.038350910834132314,
"grad_norm": 1.0674521923065186,
"learning_rate": 1.2460063897763578e-06,
"loss": 5.0778,
"step": 40
},
{
"epoch": 0.04793863854266539,
"grad_norm": 1.0108286142349243,
"learning_rate": 1.565495207667732e-06,
"loss": 5.0643,
"step": 50
},
{
"epoch": 0.04793863854266539,
"eval_q2q_data_loss": 5.071373462677002,
"eval_q2q_data_runtime": 8.6567,
"eval_q2q_data_samples_per_second": 312.475,
"eval_q2q_data_steps_per_second": 19.638,
"step": 50
},
{
"epoch": 0.04793863854266539,
"eval_q2p_data_loss": 5.046911239624023,
"eval_q2p_data_runtime": 15.4129,
"eval_q2p_data_samples_per_second": 52.683,
"eval_q2p_data_steps_per_second": 3.309,
"step": 50
},
{
"epoch": 0.05752636625119847,
"grad_norm": 1.051458477973938,
"learning_rate": 1.8849840255591056e-06,
"loss": 5.0424,
"step": 60
},
{
"epoch": 0.06711409395973154,
"grad_norm": 1.123085856437683,
"learning_rate": 2.2044728434504793e-06,
"loss": 5.0255,
"step": 70
},
{
"epoch": 0.07670182166826463,
"grad_norm": 0.8094280362129211,
"learning_rate": 2.5239616613418532e-06,
"loss": 5.0099,
"step": 80
},
{
"epoch": 0.0862895493767977,
"grad_norm": 1.4995239973068237,
"learning_rate": 2.8434504792332267e-06,
"loss": 5.0063,
"step": 90
},
{
"epoch": 0.09587727708533078,
"grad_norm": 0.6668018698692322,
"learning_rate": 3.162939297124601e-06,
"loss": 5.0033,
"step": 100
},
{
"epoch": 0.09587727708533078,
"eval_q2q_data_loss": 5.014667510986328,
"eval_q2q_data_runtime": 8.6334,
"eval_q2q_data_samples_per_second": 313.318,
"eval_q2q_data_steps_per_second": 19.691,
"step": 100
},
{
"epoch": 0.09587727708533078,
"eval_q2p_data_loss": 5.0004682540893555,
"eval_q2p_data_runtime": 15.4405,
"eval_q2p_data_samples_per_second": 52.589,
"eval_q2p_data_steps_per_second": 3.303,
"step": 100
},
{
"epoch": 0.10546500479386385,
"grad_norm": 0.811168909072876,
"learning_rate": 3.482428115015975e-06,
"loss": 5.003,
"step": 110
},
{
"epoch": 0.11505273250239693,
"grad_norm": 1.420505404472351,
"learning_rate": 3.8019169329073485e-06,
"loss": 4.9967,
"step": 120
},
{
"epoch": 0.12464046021093,
"grad_norm": 5.024260520935059,
"learning_rate": 4.121405750798722e-06,
"loss": 4.998,
"step": 130
},
{
"epoch": 0.1342281879194631,
"grad_norm": 4.843268394470215,
"learning_rate": 4.440894568690096e-06,
"loss": 5.0012,
"step": 140
},
{
"epoch": 0.14381591562799617,
"grad_norm": 0.6666759848594666,
"learning_rate": 4.76038338658147e-06,
"loss": 4.9989,
"step": 150
},
{
"epoch": 0.14381591562799617,
"eval_q2q_data_loss": 5.009535312652588,
"eval_q2q_data_runtime": 8.5717,
"eval_q2q_data_samples_per_second": 315.574,
"eval_q2q_data_steps_per_second": 19.833,
"step": 150
},
{
"epoch": 0.14381591562799617,
"eval_q2p_data_loss": 4.942420959472656,
"eval_q2p_data_runtime": 15.4905,
"eval_q2p_data_samples_per_second": 52.419,
"eval_q2p_data_steps_per_second": 3.292,
"step": 150
},
{
"epoch": 0.15340364333652926,
"grad_norm": 0.6130227446556091,
"learning_rate": 5.079872204472844e-06,
"loss": 4.9908,
"step": 160
},
{
"epoch": 0.1629913710450623,
"grad_norm": 0.7333933711051941,
"learning_rate": 5.399361022364218e-06,
"loss": 4.9735,
"step": 170
},
{
"epoch": 0.1725790987535954,
"grad_norm": 2.2645883560180664,
"learning_rate": 5.718849840255591e-06,
"loss": 4.9965,
"step": 180
},
{
"epoch": 0.18216682646212848,
"grad_norm": 0.6750437617301941,
"learning_rate": 6.038338658146965e-06,
"loss": 4.9825,
"step": 190
},
{
"epoch": 0.19175455417066156,
"grad_norm": 8.299290657043457,
"learning_rate": 6.35782747603834e-06,
"loss": 4.9514,
"step": 200
},
{
"epoch": 0.19175455417066156,
"eval_q2q_data_loss": 5.007415294647217,
"eval_q2q_data_runtime": 8.6664,
"eval_q2q_data_samples_per_second": 312.126,
"eval_q2q_data_steps_per_second": 19.616,
"step": 200
},
{
"epoch": 0.19175455417066156,
"eval_q2p_data_loss": 4.874378204345703,
"eval_q2p_data_runtime": 15.5099,
"eval_q2p_data_samples_per_second": 52.354,
"eval_q2p_data_steps_per_second": 3.288,
"step": 200
},
{
"epoch": 0.20134228187919462,
"grad_norm": 1.9930428266525269,
"learning_rate": 6.677316293929713e-06,
"loss": 4.9521,
"step": 210
},
{
"epoch": 0.2109300095877277,
"grad_norm": 4.539638042449951,
"learning_rate": 6.996805111821087e-06,
"loss": 4.968,
"step": 220
},
{
"epoch": 0.22051773729626079,
"grad_norm": 0.5192278027534485,
"learning_rate": 7.316293929712461e-06,
"loss": 4.96,
"step": 230
},
{
"epoch": 0.23010546500479387,
"grad_norm": 4.190878868103027,
"learning_rate": 7.635782747603835e-06,
"loss": 4.9758,
"step": 240
},
{
"epoch": 0.23969319271332695,
"grad_norm": 0.7492648959159851,
"learning_rate": 7.955271565495208e-06,
"loss": 4.9834,
"step": 250
},
{
"epoch": 0.23969319271332695,
"eval_q2q_data_loss": 5.00647497177124,
"eval_q2q_data_runtime": 8.6319,
"eval_q2q_data_samples_per_second": 313.372,
"eval_q2q_data_steps_per_second": 19.694,
"step": 250
},
{
"epoch": 0.23969319271332695,
"eval_q2p_data_loss": 4.842836856842041,
"eval_q2p_data_runtime": 15.4423,
"eval_q2p_data_samples_per_second": 52.583,
"eval_q2p_data_steps_per_second": 3.303,
"step": 250
},
{
"epoch": 0.24928092042186,
"grad_norm": 1.2294269800186157,
"learning_rate": 8.274760383386582e-06,
"loss": 4.9273,
"step": 260
},
{
"epoch": 0.2588686481303931,
"grad_norm": 1.7497507333755493,
"learning_rate": 8.594249201277956e-06,
"loss": 4.9796,
"step": 270
},
{
"epoch": 0.2684563758389262,
"grad_norm": 5.415214538574219,
"learning_rate": 8.91373801916933e-06,
"loss": 4.9517,
"step": 280
},
{
"epoch": 0.27804410354745923,
"grad_norm": 2.2691502571105957,
"learning_rate": 9.233226837060704e-06,
"loss": 4.9763,
"step": 290
},
{
"epoch": 0.28763183125599234,
"grad_norm": 5.458872318267822,
"learning_rate": 9.552715654952077e-06,
"loss": 4.9372,
"step": 300
},
{
"epoch": 0.28763183125599234,
"eval_q2q_data_loss": 5.0056328773498535,
"eval_q2q_data_runtime": 8.5076,
"eval_q2q_data_samples_per_second": 317.952,
"eval_q2q_data_steps_per_second": 19.982,
"step": 300
},
{
"epoch": 0.28763183125599234,
"eval_q2p_data_loss": 4.825343608856201,
"eval_q2p_data_runtime": 15.402,
"eval_q2p_data_samples_per_second": 52.72,
"eval_q2p_data_steps_per_second": 3.311,
"step": 300
},
{
"epoch": 0.2972195589645254,
"grad_norm": 4.435003757476807,
"learning_rate": 9.87220447284345e-06,
"loss": 4.9325,
"step": 310
},
{
"epoch": 0.3068072866730585,
"grad_norm": 0.34137386083602905,
"learning_rate": 9.978693181818183e-06,
"loss": 4.9477,
"step": 320
},
{
"epoch": 0.31639501438159157,
"grad_norm": 1.3951576948165894,
"learning_rate": 9.943181818181819e-06,
"loss": 4.9455,
"step": 330
},
{
"epoch": 0.3259827420901246,
"grad_norm": 8.795852661132812,
"learning_rate": 9.907670454545455e-06,
"loss": 4.9258,
"step": 340
},
{
"epoch": 0.33557046979865773,
"grad_norm": 0.4223299026489258,
"learning_rate": 9.872159090909091e-06,
"loss": 4.9799,
"step": 350
},
{
"epoch": 0.33557046979865773,
"eval_q2q_data_loss": 5.004530429840088,
"eval_q2q_data_runtime": 8.523,
"eval_q2q_data_samples_per_second": 317.375,
"eval_q2q_data_steps_per_second": 19.946,
"step": 350
},
{
"epoch": 0.33557046979865773,
"eval_q2p_data_loss": 4.843413352966309,
"eval_q2p_data_runtime": 15.444,
"eval_q2p_data_samples_per_second": 52.577,
"eval_q2p_data_steps_per_second": 3.302,
"step": 350
},
{
"epoch": 0.3451581975071908,
"grad_norm": 0.3708871006965637,
"learning_rate": 9.836647727272728e-06,
"loss": 4.9791,
"step": 360
},
{
"epoch": 0.3547459252157239,
"grad_norm": 0.3105733096599579,
"learning_rate": 9.801136363636364e-06,
"loss": 4.9437,
"step": 370
},
{
"epoch": 0.36433365292425696,
"grad_norm": 0.3218185007572174,
"learning_rate": 9.765625e-06,
"loss": 4.9873,
"step": 380
},
{
"epoch": 0.37392138063279,
"grad_norm": 0.29383164644241333,
"learning_rate": 9.730113636363636e-06,
"loss": 4.9425,
"step": 390
},
{
"epoch": 0.3835091083413231,
"grad_norm": 4.873048305511475,
"learning_rate": 9.694602272727274e-06,
"loss": 4.9837,
"step": 400
},
{
"epoch": 0.3835091083413231,
"eval_q2q_data_loss": 5.004254341125488,
"eval_q2q_data_runtime": 8.5135,
"eval_q2q_data_samples_per_second": 317.73,
"eval_q2q_data_steps_per_second": 19.968,
"step": 400
},
{
"epoch": 0.3835091083413231,
"eval_q2p_data_loss": 4.841865539550781,
"eval_q2p_data_runtime": 15.3899,
"eval_q2p_data_samples_per_second": 52.762,
"eval_q2p_data_steps_per_second": 3.314,
"step": 400
},
{
"epoch": 0.3930968360498562,
"grad_norm": 0.3491421639919281,
"learning_rate": 9.65909090909091e-06,
"loss": 5.0006,
"step": 410
},
{
"epoch": 0.40268456375838924,
"grad_norm": 5.751034259796143,
"learning_rate": 9.623579545454547e-06,
"loss": 4.9831,
"step": 420
},
{
"epoch": 0.41227229146692235,
"grad_norm": 0.34302422404289246,
"learning_rate": 9.588068181818183e-06,
"loss": 4.9531,
"step": 430
},
{
"epoch": 0.4218600191754554,
"grad_norm": 0.4230528771877289,
"learning_rate": 9.552556818181818e-06,
"loss": 4.9856,
"step": 440
},
{
"epoch": 0.4314477468839885,
"grad_norm": 17.237260818481445,
"learning_rate": 9.517045454545454e-06,
"loss": 4.8996,
"step": 450
},
{
"epoch": 0.4314477468839885,
"eval_q2q_data_loss": 5.005645751953125,
"eval_q2q_data_runtime": 8.483,
"eval_q2q_data_samples_per_second": 318.872,
"eval_q2q_data_steps_per_second": 20.04,
"step": 450
},
{
"epoch": 0.4314477468839885,
"eval_q2p_data_loss": 4.865195274353027,
"eval_q2p_data_runtime": 15.3699,
"eval_q2p_data_samples_per_second": 52.83,
"eval_q2p_data_steps_per_second": 3.318,
"step": 450
},
{
"epoch": 0.44103547459252157,
"grad_norm": 2.0367865562438965,
"learning_rate": 9.481534090909092e-06,
"loss": 4.9467,
"step": 460
},
{
"epoch": 0.4506232023010546,
"grad_norm": 0.41367027163505554,
"learning_rate": 9.446022727272728e-06,
"loss": 4.9724,
"step": 470
},
{
"epoch": 0.46021093000958774,
"grad_norm": 11.92837142944336,
"learning_rate": 9.410511363636365e-06,
"loss": 4.9797,
"step": 480
},
{
"epoch": 0.4697986577181208,
"grad_norm": 0.38374051451683044,
"learning_rate": 9.375000000000001e-06,
"loss": 4.9735,
"step": 490
},
{
"epoch": 0.4793863854266539,
"grad_norm": 5.73974609375,
"learning_rate": 9.339488636363637e-06,
"loss": 4.8765,
"step": 500
},
{
"epoch": 0.4793863854266539,
"eval_q2q_data_loss": 5.003554821014404,
"eval_q2q_data_runtime": 8.5075,
"eval_q2q_data_samples_per_second": 317.954,
"eval_q2q_data_steps_per_second": 19.982,
"step": 500
},
{
"epoch": 0.4793863854266539,
"eval_q2p_data_loss": 4.845742225646973,
"eval_q2p_data_runtime": 15.4131,
"eval_q2p_data_samples_per_second": 52.682,
"eval_q2p_data_steps_per_second": 3.309,
"step": 500
},
{
"epoch": 0.48897411313518696,
"grad_norm": 0.673588216304779,
"learning_rate": 9.303977272727273e-06,
"loss": 4.9136,
"step": 510
},
{
"epoch": 0.49856184084372,
"grad_norm": 0.6867577433586121,
"learning_rate": 9.26846590909091e-06,
"loss": 4.9688,
"step": 520
},
{
"epoch": 0.5081495685522531,
"grad_norm": 0.5350639224052429,
"learning_rate": 9.232954545454546e-06,
"loss": 4.9436,
"step": 530
},
{
"epoch": 0.5177372962607862,
"grad_norm": 0.4116136133670807,
"learning_rate": 9.197443181818184e-06,
"loss": 5.0017,
"step": 540
},
{
"epoch": 0.5273250239693192,
"grad_norm": 10.749342918395996,
"learning_rate": 9.161931818181818e-06,
"loss": 4.9867,
"step": 550
},
{
"epoch": 0.5273250239693192,
"eval_q2q_data_loss": 5.004271507263184,
"eval_q2q_data_runtime": 8.4877,
"eval_q2q_data_samples_per_second": 318.695,
"eval_q2q_data_steps_per_second": 20.029,
"step": 550
},
{
"epoch": 0.5273250239693192,
"eval_q2p_data_loss": 4.860942363739014,
"eval_q2p_data_runtime": 15.3408,
"eval_q2p_data_samples_per_second": 52.931,
"eval_q2p_data_steps_per_second": 3.324,
"step": 550
},
{
"epoch": 0.5369127516778524,
"grad_norm": 0.3119679093360901,
"learning_rate": 9.126420454545455e-06,
"loss": 4.9716,
"step": 560
},
{
"epoch": 0.5465004793863855,
"grad_norm": 0.2090018391609192,
"learning_rate": 9.090909090909091e-06,
"loss": 4.9338,
"step": 570
},
{
"epoch": 0.5560882070949185,
"grad_norm": 0.2094723880290985,
"learning_rate": 9.055397727272727e-06,
"loss": 4.9975,
"step": 580
},
{
"epoch": 0.5656759348034516,
"grad_norm": 0.16981257498264313,
"learning_rate": 9.019886363636364e-06,
"loss": 4.9485,
"step": 590
},
{
"epoch": 0.5752636625119847,
"grad_norm": 15.281989097595215,
"learning_rate": 8.984375000000002e-06,
"loss": 4.8959,
"step": 600
},
{
"epoch": 0.5752636625119847,
"eval_q2q_data_loss": 5.002608299255371,
"eval_q2q_data_runtime": 8.4635,
"eval_q2q_data_samples_per_second": 319.608,
"eval_q2q_data_steps_per_second": 20.086,
"step": 600
},
{
"epoch": 0.5752636625119847,
"eval_q2p_data_loss": 4.780869483947754,
"eval_q2p_data_runtime": 15.3652,
"eval_q2p_data_samples_per_second": 52.847,
"eval_q2p_data_steps_per_second": 3.319,
"step": 600
},
{
"epoch": 0.5848513902205177,
"grad_norm": 16.331180572509766,
"learning_rate": 8.948863636363638e-06,
"loss": 4.9769,
"step": 610
},
{
"epoch": 0.5944391179290508,
"grad_norm": 0.17700470983982086,
"learning_rate": 8.913352272727274e-06,
"loss": 4.9407,
"step": 620
},
{
"epoch": 0.6040268456375839,
"grad_norm": 6.958109378814697,
"learning_rate": 8.87784090909091e-06,
"loss": 4.9941,
"step": 630
},
{
"epoch": 0.613614573346117,
"grad_norm": 5.405721664428711,
"learning_rate": 8.842329545454547e-06,
"loss": 4.976,
"step": 640
},
{
"epoch": 0.62320230105465,
"grad_norm": 0.2884855270385742,
"learning_rate": 8.806818181818183e-06,
"loss": 4.986,
"step": 650
},
{
"epoch": 0.62320230105465,
"eval_q2q_data_loss": 5.003030776977539,
"eval_q2q_data_runtime": 8.5486,
"eval_q2q_data_samples_per_second": 316.425,
"eval_q2q_data_steps_per_second": 19.886,
"step": 650
},
{
"epoch": 0.62320230105465,
"eval_q2p_data_loss": 4.810172080993652,
"eval_q2p_data_runtime": 15.3666,
"eval_q2p_data_samples_per_second": 52.842,
"eval_q2p_data_steps_per_second": 3.319,
"step": 650
},
{
"epoch": 0.6327900287631831,
"grad_norm": 0.44038277864456177,
"learning_rate": 8.77130681818182e-06,
"loss": 4.94,
"step": 660
},
{
"epoch": 0.6423777564717162,
"grad_norm": 0.35095784068107605,
"learning_rate": 8.735795454545455e-06,
"loss": 4.9917,
"step": 670
},
{
"epoch": 0.6519654841802492,
"grad_norm": 0.7992573976516724,
"learning_rate": 8.700284090909092e-06,
"loss": 4.9938,
"step": 680
},
{
"epoch": 0.6615532118887824,
"grad_norm": 12.68810749053955,
"learning_rate": 8.664772727272728e-06,
"loss": 4.9373,
"step": 690
},
{
"epoch": 0.6711409395973155,
"grad_norm": 8.244370460510254,
"learning_rate": 8.629261363636364e-06,
"loss": 5.0235,
"step": 700
},
{
"epoch": 0.6711409395973155,
"eval_q2q_data_loss": 5.032140254974365,
"eval_q2q_data_runtime": 8.4755,
"eval_q2q_data_samples_per_second": 319.155,
"eval_q2q_data_steps_per_second": 20.058,
"step": 700
},
{
"epoch": 0.6711409395973155,
"eval_q2p_data_loss": 4.879370212554932,
"eval_q2p_data_runtime": 15.3816,
"eval_q2p_data_samples_per_second": 52.79,
"eval_q2p_data_steps_per_second": 3.316,
"step": 700
},
{
"epoch": 0.6807286673058485,
"grad_norm": 12.066866874694824,
"learning_rate": 8.59375e-06,
"loss": 4.939,
"step": 710
},
{
"epoch": 0.6903163950143816,
"grad_norm": 15.054842948913574,
"learning_rate": 8.558238636363637e-06,
"loss": 4.9682,
"step": 720
},
{
"epoch": 0.6999041227229147,
"grad_norm": 1.6012367010116577,
"learning_rate": 8.522727272727273e-06,
"loss": 4.9813,
"step": 730
},
{
"epoch": 0.7094918504314478,
"grad_norm": 6.062280654907227,
"learning_rate": 8.48721590909091e-06,
"loss": 4.9442,
"step": 740
},
{
"epoch": 0.7190795781399808,
"grad_norm": 0.4181146025657654,
"learning_rate": 8.451704545454547e-06,
"loss": 4.9354,
"step": 750
},
{
"epoch": 0.7190795781399808,
"eval_q2q_data_loss": 5.002427577972412,
"eval_q2q_data_runtime": 8.4867,
"eval_q2q_data_samples_per_second": 318.733,
"eval_q2q_data_steps_per_second": 20.031,
"step": 750
},
{
"epoch": 0.7190795781399808,
"eval_q2p_data_loss": 4.805325508117676,
"eval_q2p_data_runtime": 15.3619,
"eval_q2p_data_samples_per_second": 52.858,
"eval_q2p_data_steps_per_second": 3.32,
"step": 750
},
{
"epoch": 0.7286673058485139,
"grad_norm": 0.23768964409828186,
"learning_rate": 8.416193181818184e-06,
"loss": 4.9105,
"step": 760
},
{
"epoch": 0.738255033557047,
"grad_norm": 1.1970841884613037,
"learning_rate": 8.380681818181818e-06,
"loss": 4.9271,
"step": 770
},
{
"epoch": 0.74784276126558,
"grad_norm": 0.22903920710086823,
"learning_rate": 8.345170454545454e-06,
"loss": 4.9476,
"step": 780
},
{
"epoch": 0.7574304889741131,
"grad_norm": 9.315869331359863,
"learning_rate": 8.30965909090909e-06,
"loss": 4.8887,
"step": 790
},
{
"epoch": 0.7670182166826462,
"grad_norm": 0.27411147952079773,
"learning_rate": 8.274147727272727e-06,
"loss": 4.9576,
"step": 800
},
{
"epoch": 0.7670182166826462,
"eval_q2q_data_loss": 5.001960754394531,
"eval_q2q_data_runtime": 8.5354,
"eval_q2q_data_samples_per_second": 316.917,
"eval_q2q_data_steps_per_second": 19.917,
"step": 800
},
{
"epoch": 0.7670182166826462,
"eval_q2p_data_loss": 4.739698886871338,
"eval_q2p_data_runtime": 15.3694,
"eval_q2p_data_samples_per_second": 52.832,
"eval_q2p_data_steps_per_second": 3.318,
"step": 800
},
{
"epoch": 0.7766059443911792,
"grad_norm": 11.00167465209961,
"learning_rate": 8.238636363636365e-06,
"loss": 4.9577,
"step": 810
},
{
"epoch": 0.7861936720997124,
"grad_norm": 0.460358589887619,
"learning_rate": 8.203125000000001e-06,
"loss": 4.8974,
"step": 820
},
{
"epoch": 0.7957813998082455,
"grad_norm": 10.619705200195312,
"learning_rate": 8.167613636363637e-06,
"loss": 5.0033,
"step": 830
},
{
"epoch": 0.8053691275167785,
"grad_norm": 0.5667484998703003,
"learning_rate": 8.132102272727274e-06,
"loss": 4.976,
"step": 840
},
{
"epoch": 0.8149568552253116,
"grad_norm": 12.914066314697266,
"learning_rate": 8.09659090909091e-06,
"loss": 4.9915,
"step": 850
},
{
"epoch": 0.8149568552253116,
"eval_q2q_data_loss": 5.042208194732666,
"eval_q2q_data_runtime": 8.496,
"eval_q2q_data_samples_per_second": 318.386,
"eval_q2q_data_steps_per_second": 20.009,
"step": 850
},
{
"epoch": 0.8149568552253116,
"eval_q2p_data_loss": 4.936696529388428,
"eval_q2p_data_runtime": 15.4165,
"eval_q2p_data_samples_per_second": 52.671,
"eval_q2p_data_steps_per_second": 3.308,
"step": 850
},
{
"epoch": 0.8245445829338447,
"grad_norm": 7.874532699584961,
"learning_rate": 8.061079545454546e-06,
"loss": 4.9856,
"step": 860
},
{
"epoch": 0.8341323106423778,
"grad_norm": 3.6945109367370605,
"learning_rate": 8.025568181818183e-06,
"loss": 4.9566,
"step": 870
},
{
"epoch": 0.8437200383509108,
"grad_norm": 34.59883117675781,
"learning_rate": 7.990056818181819e-06,
"loss": 4.8738,
"step": 880
},
{
"epoch": 0.8533077660594439,
"grad_norm": 1.2880325317382812,
"learning_rate": 7.954545454545455e-06,
"loss": 4.9258,
"step": 890
},
{
"epoch": 0.862895493767977,
"grad_norm": 5.390997886657715,
"learning_rate": 7.919034090909091e-06,
"loss": 4.9118,
"step": 900
},
{
"epoch": 0.862895493767977,
"eval_q2q_data_loss": 5.003294944763184,
"eval_q2q_data_runtime": 8.4963,
"eval_q2q_data_samples_per_second": 318.375,
"eval_q2q_data_steps_per_second": 20.009,
"step": 900
},
{
"epoch": 0.862895493767977,
"eval_q2p_data_loss": 4.794476509094238,
"eval_q2p_data_runtime": 15.3667,
"eval_q2p_data_samples_per_second": 52.842,
"eval_q2p_data_steps_per_second": 3.319,
"step": 900
},
{
"epoch": 0.87248322147651,
"grad_norm": 3.2997488975524902,
"learning_rate": 7.883522727272728e-06,
"loss": 4.9782,
"step": 910
},
{
"epoch": 0.8820709491850431,
"grad_norm": 10.71391773223877,
"learning_rate": 7.848011363636364e-06,
"loss": 4.8659,
"step": 920
},
{
"epoch": 0.8916586768935763,
"grad_norm": 0.14661180973052979,
"learning_rate": 7.8125e-06,
"loss": 4.9197,
"step": 930
},
{
"epoch": 0.9012464046021093,
"grad_norm": 0.1432102769613266,
"learning_rate": 7.776988636363636e-06,
"loss": 4.9281,
"step": 940
},
{
"epoch": 0.9108341323106424,
"grad_norm": 0.13064274191856384,
"learning_rate": 7.741477272727274e-06,
"loss": 4.9427,
"step": 950
},
{
"epoch": 0.9108341323106424,
"eval_q2q_data_loss": 5.002143383026123,
"eval_q2q_data_runtime": 8.5053,
"eval_q2q_data_samples_per_second": 318.036,
"eval_q2q_data_steps_per_second": 19.988,
"step": 950
},
{
"epoch": 0.9108341323106424,
"eval_q2p_data_loss": 4.785708427429199,
"eval_q2p_data_runtime": 15.3288,
"eval_q2p_data_samples_per_second": 52.972,
"eval_q2p_data_steps_per_second": 3.327,
"step": 950
},
{
"epoch": 0.9204218600191755,
"grad_norm": 19.881868362426758,
"learning_rate": 7.70596590909091e-06,
"loss": 4.8966,
"step": 960
},
{
"epoch": 0.9300095877277086,
"grad_norm": 0.11643442511558533,
"learning_rate": 7.670454545454547e-06,
"loss": 4.9657,
"step": 970
},
{
"epoch": 0.9395973154362416,
"grad_norm": 0.20641827583312988,
"learning_rate": 7.634943181818183e-06,
"loss": 4.9597,
"step": 980
},
{
"epoch": 0.9491850431447747,
"grad_norm": 0.1226697638630867,
"learning_rate": 7.599431818181819e-06,
"loss": 4.9627,
"step": 990
},
{
"epoch": 0.9587727708533078,
"grad_norm": 0.17849154770374298,
"learning_rate": 7.563920454545455e-06,
"loss": 4.8603,
"step": 1000
},
{
"epoch": 0.9587727708533078,
"eval_q2q_data_loss": 5.001661777496338,
"eval_q2q_data_runtime": 8.4763,
"eval_q2q_data_samples_per_second": 319.123,
"eval_q2q_data_steps_per_second": 20.056,
"step": 1000
},
{
"epoch": 0.9587727708533078,
"eval_q2p_data_loss": 4.801548004150391,
"eval_q2p_data_runtime": 15.3711,
"eval_q2p_data_samples_per_second": 52.827,
"eval_q2p_data_steps_per_second": 3.318,
"step": 1000
},
{
"epoch": 0.9683604985618408,
"grad_norm": 0.11723767966032028,
"learning_rate": 7.528409090909091e-06,
"loss": 4.9817,
"step": 1010
},
{
"epoch": 0.9779482262703739,
"grad_norm": 0.14676721394062042,
"learning_rate": 7.4928977272727274e-06,
"loss": 4.813,
"step": 1020
},
{
"epoch": 0.987535953978907,
"grad_norm": 0.18476560711860657,
"learning_rate": 7.4573863636363646e-06,
"loss": 4.9688,
"step": 1030
},
{
"epoch": 0.99712368168744,
"grad_norm": 12.572381019592285,
"learning_rate": 7.421875000000001e-06,
"loss": 4.9802,
"step": 1040
},
{
"epoch": 1.0067114093959733,
"grad_norm": 30.89609146118164,
"learning_rate": 7.386363636363637e-06,
"loss": 4.8651,
"step": 1050
},
{
"epoch": 1.0067114093959733,
"eval_q2q_data_loss": 5.00149393081665,
"eval_q2q_data_runtime": 8.4886,
"eval_q2q_data_samples_per_second": 318.661,
"eval_q2q_data_steps_per_second": 20.027,
"step": 1050
},
{
"epoch": 1.0067114093959733,
"eval_q2p_data_loss": 4.796145439147949,
"eval_q2p_data_runtime": 15.3888,
"eval_q2p_data_samples_per_second": 52.766,
"eval_q2p_data_steps_per_second": 3.314,
"step": 1050
},
{
"epoch": 1.0162991371045063,
"grad_norm": 15.047320365905762,
"learning_rate": 7.350852272727273e-06,
"loss": 4.9286,
"step": 1060
},
{
"epoch": 1.0258868648130393,
"grad_norm": 0.20640498399734497,
"learning_rate": 7.31534090909091e-06,
"loss": 4.9124,
"step": 1070
},
{
"epoch": 1.0354745925215725,
"grad_norm": 5.841845989227295,
"learning_rate": 7.279829545454547e-06,
"loss": 4.9927,
"step": 1080
},
{
"epoch": 1.0450623202301055,
"grad_norm": 8.321894645690918,
"learning_rate": 7.244318181818183e-06,
"loss": 4.9769,
"step": 1090
},
{
"epoch": 1.0546500479386385,
"grad_norm": 0.8191462755203247,
"learning_rate": 7.2088068181818185e-06,
"loss": 5.0158,
"step": 1100
},
{
"epoch": 1.0546500479386385,
"eval_q2q_data_loss": 5.004606246948242,
"eval_q2q_data_runtime": 8.4874,
"eval_q2q_data_samples_per_second": 318.708,
"eval_q2q_data_steps_per_second": 20.03,
"step": 1100
},
{
"epoch": 1.0546500479386385,
"eval_q2p_data_loss": 5.120335102081299,
"eval_q2p_data_runtime": 15.3988,
"eval_q2p_data_samples_per_second": 52.731,
"eval_q2p_data_steps_per_second": 3.312,
"step": 1100
},
{
"epoch": 1.0642377756471717,
"grad_norm": 6.462870121002197,
"learning_rate": 7.173295454545455e-06,
"loss": 5.0234,
"step": 1110
},
{
"epoch": 1.0738255033557047,
"grad_norm": 19.973081588745117,
"learning_rate": 7.137784090909091e-06,
"loss": 4.9903,
"step": 1120
},
{
"epoch": 1.0834132310642377,
"grad_norm": 6.040268898010254,
"learning_rate": 7.102272727272727e-06,
"loss": 5.008,
"step": 1130
},
{
"epoch": 1.093000958772771,
"grad_norm": 64.06867218017578,
"learning_rate": 7.066761363636364e-06,
"loss": 4.9987,
"step": 1140
},
{
"epoch": 1.102588686481304,
"grad_norm": 51.97669982910156,
"learning_rate": 7.031250000000001e-06,
"loss": 5.0091,
"step": 1150
},
{
"epoch": 1.102588686481304,
"eval_q2q_data_loss": 5.01547384262085,
"eval_q2q_data_runtime": 8.5407,
"eval_q2q_data_samples_per_second": 316.718,
"eval_q2q_data_steps_per_second": 19.905,
"step": 1150
},
{
"epoch": 1.102588686481304,
"eval_q2p_data_loss": 5.103107929229736,
"eval_q2p_data_runtime": 15.3784,
"eval_q2p_data_samples_per_second": 52.801,
"eval_q2p_data_steps_per_second": 3.316,
"step": 1150
},
{
"epoch": 1.112176414189837,
"grad_norm": 10.005661010742188,
"learning_rate": 6.995738636363637e-06,
"loss": 5.0562,
"step": 1160
},
{
"epoch": 1.1217641418983701,
"grad_norm": 10.467660903930664,
"learning_rate": 6.960227272727273e-06,
"loss": 5.0129,
"step": 1170
},
{
"epoch": 1.1313518696069031,
"grad_norm": 7.998090744018555,
"learning_rate": 6.92471590909091e-06,
"loss": 5.0033,
"step": 1180
},
{
"epoch": 1.1409395973154361,
"grad_norm": 3.380247116088867,
"learning_rate": 6.889204545454547e-06,
"loss": 4.9961,
"step": 1190
},
{
"epoch": 1.1505273250239694,
"grad_norm": 8.895610809326172,
"learning_rate": 6.853693181818183e-06,
"loss": 4.988,
"step": 1200
},
{
"epoch": 1.1505273250239694,
"eval_q2q_data_loss": 5.000478744506836,
"eval_q2q_data_runtime": 8.5322,
"eval_q2q_data_samples_per_second": 317.034,
"eval_q2q_data_steps_per_second": 19.924,
"step": 1200
},
{
"epoch": 1.1505273250239694,
"eval_q2p_data_loss": 5.002507209777832,
"eval_q2p_data_runtime": 15.3615,
"eval_q2p_data_samples_per_second": 52.859,
"eval_q2p_data_steps_per_second": 3.32,
"step": 1200
},
{
"epoch": 1.1601150527325024,
"grad_norm": 6.491428852081299,
"learning_rate": 6.818181818181818e-06,
"loss": 4.9687,
"step": 1210
},
{
"epoch": 1.1697027804410354,
"grad_norm": 4.309035778045654,
"learning_rate": 6.7826704545454545e-06,
"loss": 4.9824,
"step": 1220
},
{
"epoch": 1.1792905081495686,
"grad_norm": 2.331423759460449,
"learning_rate": 6.747159090909091e-06,
"loss": 4.9955,
"step": 1230
},
{
"epoch": 1.1888782358581016,
"grad_norm": 3.439713954925537,
"learning_rate": 6.711647727272728e-06,
"loss": 4.9943,
"step": 1240
},
{
"epoch": 1.1984659635666346,
"grad_norm": 7.992236137390137,
"learning_rate": 6.676136363636364e-06,
"loss": 5.0552,
"step": 1250
},
{
"epoch": 1.1984659635666346,
"eval_q2q_data_loss": 5.000186920166016,
"eval_q2q_data_runtime": 8.5162,
"eval_q2q_data_samples_per_second": 317.629,
"eval_q2q_data_steps_per_second": 19.962,
"step": 1250
},
{
"epoch": 1.1984659635666346,
"eval_q2p_data_loss": 5.000546932220459,
"eval_q2p_data_runtime": 15.3961,
"eval_q2p_data_samples_per_second": 52.741,
"eval_q2p_data_steps_per_second": 3.313,
"step": 1250
},
{
"epoch": 1.2080536912751678,
"grad_norm": 3.6224541664123535,
"learning_rate": 6.6406250000000005e-06,
"loss": 5.0073,
"step": 1260
},
{
"epoch": 1.2176414189837008,
"grad_norm": 1.0430936813354492,
"learning_rate": 6.605113636363637e-06,
"loss": 4.9928,
"step": 1270
},
{
"epoch": 1.2272291466922338,
"grad_norm": 3.0630106925964355,
"learning_rate": 6.569602272727274e-06,
"loss": 5.0183,
"step": 1280
},
{
"epoch": 1.236816874400767,
"grad_norm": 4.258161544799805,
"learning_rate": 6.53409090909091e-06,
"loss": 4.9932,
"step": 1290
},
{
"epoch": 1.2464046021093,
"grad_norm": 2.9531047344207764,
"learning_rate": 6.498579545454546e-06,
"loss": 4.9737,
"step": 1300
},
{
"epoch": 1.2464046021093,
"eval_q2q_data_loss": 5.000265121459961,
"eval_q2q_data_runtime": 8.5548,
"eval_q2q_data_samples_per_second": 316.198,
"eval_q2q_data_steps_per_second": 19.872,
"step": 1300
},
{
"epoch": 1.2464046021093,
"eval_q2p_data_loss": 5.00175142288208,
"eval_q2p_data_runtime": 15.3988,
"eval_q2p_data_samples_per_second": 52.731,
"eval_q2p_data_steps_per_second": 3.312,
"step": 1300
},
{
"epoch": 1.255992329817833,
"grad_norm": 7.634608745574951,
"learning_rate": 6.463068181818183e-06,
"loss": 5.012,
"step": 1310
},
{
"epoch": 1.2655800575263663,
"grad_norm": 10.259374618530273,
"learning_rate": 6.427556818181818e-06,
"loss": 5.0138,
"step": 1320
},
{
"epoch": 1.2751677852348993,
"grad_norm": 10.425176620483398,
"learning_rate": 6.392045454545454e-06,
"loss": 5.0107,
"step": 1330
},
{
"epoch": 1.2847555129434325,
"grad_norm": 3.6952784061431885,
"learning_rate": 6.3565340909090915e-06,
"loss": 5.0226,
"step": 1340
},
{
"epoch": 1.2943432406519655,
"grad_norm": 2.3303303718566895,
"learning_rate": 6.321022727272728e-06,
"loss": 4.9827,
"step": 1350
},
{
"epoch": 1.2943432406519655,
"eval_q2q_data_loss": 5.000885009765625,
"eval_q2q_data_runtime": 8.4946,
"eval_q2q_data_samples_per_second": 318.436,
"eval_q2q_data_steps_per_second": 20.013,
"step": 1350
},
{
"epoch": 1.2943432406519655,
"eval_q2p_data_loss": 5.002125263214111,
"eval_q2p_data_runtime": 15.3928,
"eval_q2p_data_samples_per_second": 52.752,
"eval_q2p_data_steps_per_second": 3.313,
"step": 1350
},
{
"epoch": 1.3039309683604985,
"grad_norm": 1.1437593698501587,
"learning_rate": 6.285511363636364e-06,
"loss": 5.0089,
"step": 1360
},
{
"epoch": 1.3135186960690317,
"grad_norm": 3.3491806983947754,
"learning_rate": 6.25e-06,
"loss": 4.9869,
"step": 1370
},
{
"epoch": 1.3231064237775647,
"grad_norm": 4.804921627044678,
"learning_rate": 6.2144886363636366e-06,
"loss": 5.0178,
"step": 1380
},
{
"epoch": 1.332694151486098,
"grad_norm": 3.649508476257324,
"learning_rate": 6.178977272727274e-06,
"loss": 5.0038,
"step": 1390
},
{
"epoch": 1.342281879194631,
"grad_norm": 3.105538845062256,
"learning_rate": 6.14346590909091e-06,
"loss": 4.9761,
"step": 1400
},
{
"epoch": 1.342281879194631,
"eval_q2q_data_loss": 5.000288963317871,
"eval_q2q_data_runtime": 8.4946,
"eval_q2q_data_samples_per_second": 318.436,
"eval_q2q_data_steps_per_second": 20.013,
"step": 1400
},
{
"epoch": 1.342281879194631,
"eval_q2p_data_loss": 5.000768184661865,
"eval_q2p_data_runtime": 15.3448,
"eval_q2p_data_samples_per_second": 52.917,
"eval_q2p_data_steps_per_second": 3.324,
"step": 1400
},
{
"epoch": 1.351869606903164,
"grad_norm": 5.388565540313721,
"learning_rate": 6.107954545454546e-06,
"loss": 5.0025,
"step": 1410
},
{
"epoch": 1.3614573346116972,
"grad_norm": 4.318077564239502,
"learning_rate": 6.0724431818181825e-06,
"loss": 4.9973,
"step": 1420
},
{
"epoch": 1.3710450623202302,
"grad_norm": 5.794456481933594,
"learning_rate": 6.036931818181818e-06,
"loss": 4.9911,
"step": 1430
},
{
"epoch": 1.3806327900287632,
"grad_norm": 7.113480567932129,
"learning_rate": 6.001420454545455e-06,
"loss": 5.0088,
"step": 1440
},
{
"epoch": 1.3902205177372964,
"grad_norm": 4.235409736633301,
"learning_rate": 5.965909090909091e-06,
"loss": 4.986,
"step": 1450
},
{
"epoch": 1.3902205177372964,
"eval_q2q_data_loss": 5.0001349449157715,
"eval_q2q_data_runtime": 8.5502,
"eval_q2q_data_samples_per_second": 316.366,
"eval_q2q_data_steps_per_second": 19.883,
"step": 1450
},
{
"epoch": 1.3902205177372964,
"eval_q2p_data_loss": 5.000503063201904,
"eval_q2p_data_runtime": 15.3601,
"eval_q2p_data_samples_per_second": 52.864,
"eval_q2p_data_steps_per_second": 3.32,
"step": 1450
},
{
"epoch": 1.3998082454458294,
"grad_norm": 0.9855827689170837,
"learning_rate": 5.930397727272728e-06,
"loss": 5.0025,
"step": 1460
},
{
"epoch": 1.4093959731543624,
"grad_norm": 4.243587017059326,
"learning_rate": 5.894886363636364e-06,
"loss": 4.9907,
"step": 1470
},
{
"epoch": 1.4189837008628956,
"grad_norm": 9.807540893554688,
"learning_rate": 5.859375e-06,
"loss": 5.0012,
"step": 1480
},
{
"epoch": 1.4285714285714286,
"grad_norm": 3.3579766750335693,
"learning_rate": 5.823863636363637e-06,
"loss": 4.9928,
"step": 1490
},
{
"epoch": 1.4381591562799616,
"grad_norm": 2.363482713699341,
"learning_rate": 5.7883522727272735e-06,
"loss": 4.9955,
"step": 1500
},
{
"epoch": 1.4381591562799616,
"eval_q2q_data_loss": 5.000216960906982,
"eval_q2q_data_runtime": 8.5231,
"eval_q2q_data_samples_per_second": 317.374,
"eval_q2q_data_steps_per_second": 19.946,
"step": 1500
},
{
"epoch": 1.4381591562799616,
"eval_q2p_data_loss": 5.000642776489258,
"eval_q2p_data_runtime": 15.3802,
"eval_q2p_data_samples_per_second": 52.795,
"eval_q2p_data_steps_per_second": 3.316,
"step": 1500
},
{
"epoch": 1.4477468839884948,
"grad_norm": 2.8971104621887207,
"learning_rate": 5.75284090909091e-06,
"loss": 4.9952,
"step": 1510
},
{
"epoch": 1.4573346116970278,
"grad_norm": 4.56306266784668,
"learning_rate": 5.717329545454546e-06,
"loss": 4.9875,
"step": 1520
},
{
"epoch": 1.4669223394055608,
"grad_norm": 3.592824935913086,
"learning_rate": 5.681818181818183e-06,
"loss": 5.0027,
"step": 1530
},
{
"epoch": 1.476510067114094,
"grad_norm": 6.926996231079102,
"learning_rate": 5.646306818181818e-06,
"loss": 4.963,
"step": 1540
},
{
"epoch": 1.486097794822627,
"grad_norm": 8.679203987121582,
"learning_rate": 5.610795454545455e-06,
"loss": 4.9662,
"step": 1550
},
{
"epoch": 1.486097794822627,
"eval_q2q_data_loss": 5.001591205596924,
"eval_q2q_data_runtime": 8.4686,
"eval_q2q_data_samples_per_second": 319.414,
"eval_q2q_data_steps_per_second": 20.074,
"step": 1550
},
{
"epoch": 1.486097794822627,
"eval_q2p_data_loss": 5.006067276000977,
"eval_q2p_data_runtime": 15.3614,
"eval_q2p_data_samples_per_second": 52.86,
"eval_q2p_data_steps_per_second": 3.32,
"step": 1550
},
{
"epoch": 1.49568552253116,
"grad_norm": 11.07398796081543,
"learning_rate": 5.575284090909091e-06,
"loss": 4.9284,
"step": 1560
},
{
"epoch": 1.5052732502396933,
"grad_norm": 13.813140869140625,
"learning_rate": 5.539772727272727e-06,
"loss": 4.9773,
"step": 1570
},
{
"epoch": 1.5148609779482263,
"grad_norm": 32.947540283203125,
"learning_rate": 5.504261363636364e-06,
"loss": 5.0154,
"step": 1580
},
{
"epoch": 1.5244487056567593,
"grad_norm": 57.005271911621094,
"learning_rate": 5.468750000000001e-06,
"loss": 4.9956,
"step": 1590
},
{
"epoch": 1.5340364333652925,
"grad_norm": 21.25840187072754,
"learning_rate": 5.433238636363637e-06,
"loss": 5.0147,
"step": 1600
},
{
"epoch": 1.5340364333652925,
"eval_q2q_data_loss": 5.015188694000244,
"eval_q2q_data_runtime": 8.4996,
"eval_q2q_data_samples_per_second": 318.25,
"eval_q2q_data_steps_per_second": 20.001,
"step": 1600
},
{
"epoch": 1.5340364333652925,
"eval_q2p_data_loss": 5.062190532684326,
"eval_q2p_data_runtime": 15.3191,
"eval_q2p_data_samples_per_second": 53.006,
"eval_q2p_data_steps_per_second": 3.329,
"step": 1600
},
{
"epoch": 1.5436241610738255,
"grad_norm": 23.927370071411133,
"learning_rate": 5.397727272727273e-06,
"loss": 5.0216,
"step": 1610
},
{
"epoch": 1.5532118887823585,
"grad_norm": 29.68376350402832,
"learning_rate": 5.36221590909091e-06,
"loss": 5.0276,
"step": 1620
},
{
"epoch": 1.5627996164908917,
"grad_norm": 56.62722396850586,
"learning_rate": 5.326704545454546e-06,
"loss": 5.0115,
"step": 1630
},
{
"epoch": 1.5723873441994247,
"grad_norm": 30.375343322753906,
"learning_rate": 5.291193181818183e-06,
"loss": 4.9836,
"step": 1640
},
{
"epoch": 1.5819750719079577,
"grad_norm": 7.980493068695068,
"learning_rate": 5.255681818181818e-06,
"loss": 5.0171,
"step": 1650
},
{
"epoch": 1.5819750719079577,
"eval_q2q_data_loss": 5.000085353851318,
"eval_q2q_data_runtime": 8.4882,
"eval_q2q_data_samples_per_second": 318.678,
"eval_q2q_data_steps_per_second": 20.028,
"step": 1650
},
{
"epoch": 1.5819750719079577,
"eval_q2p_data_loss": 5.002185821533203,
"eval_q2p_data_runtime": 15.3825,
"eval_q2p_data_samples_per_second": 52.787,
"eval_q2p_data_steps_per_second": 3.315,
"step": 1650
},
{
"epoch": 1.591562799616491,
"grad_norm": 12.629569053649902,
"learning_rate": 5.220170454545455e-06,
"loss": 5.0266,
"step": 1660
},
{
"epoch": 1.601150527325024,
"grad_norm": 26.266088485717773,
"learning_rate": 5.184659090909091e-06,
"loss": 4.9617,
"step": 1670
},
{
"epoch": 1.610738255033557,
"grad_norm": 12.034894943237305,
"learning_rate": 5.149147727272727e-06,
"loss": 4.9691,
"step": 1680
},
{
"epoch": 1.6203259827420902,
"grad_norm": 27.641963958740234,
"learning_rate": 5.113636363636364e-06,
"loss": 5.0004,
"step": 1690
},
{
"epoch": 1.6299137104506232,
"grad_norm": 30.945240020751953,
"learning_rate": 5.078125000000001e-06,
"loss": 5.0173,
"step": 1700
},
{
"epoch": 1.6299137104506232,
"eval_q2q_data_loss": 5.039857387542725,
"eval_q2q_data_runtime": 8.4631,
"eval_q2q_data_samples_per_second": 319.624,
"eval_q2q_data_steps_per_second": 20.087,
"step": 1700
},
{
"epoch": 1.6299137104506232,
"eval_q2p_data_loss": 5.0407586097717285,
"eval_q2p_data_runtime": 15.3308,
"eval_q2p_data_samples_per_second": 52.965,
"eval_q2p_data_steps_per_second": 3.327,
"step": 1700
},
{
"epoch": 1.6395014381591562,
"grad_norm": 38.697303771972656,
"learning_rate": 5.042613636363637e-06,
"loss": 4.9824,
"step": 1710
},
{
"epoch": 1.6490891658676894,
"grad_norm": 1.1715205907821655,
"learning_rate": 5.007102272727273e-06,
"loss": 5.0099,
"step": 1720
},
{
"epoch": 1.6586768935762224,
"grad_norm": 1.030447006225586,
"learning_rate": 4.9715909090909094e-06,
"loss": 5.003,
"step": 1730
},
{
"epoch": 1.6682646212847554,
"grad_norm": 0.6143599152565002,
"learning_rate": 4.936079545454546e-06,
"loss": 5.0039,
"step": 1740
},
{
"epoch": 1.6778523489932886,
"grad_norm": 0.31595391035079956,
"learning_rate": 4.900568181818182e-06,
"loss": 5.0031,
"step": 1750
},
{
"epoch": 1.6778523489932886,
"eval_q2q_data_loss": 5.0020527839660645,
"eval_q2q_data_runtime": 8.472,
"eval_q2q_data_samples_per_second": 319.285,
"eval_q2q_data_steps_per_second": 20.066,
"step": 1750
},
{
"epoch": 1.6778523489932886,
"eval_q2p_data_loss": 5.010634422302246,
"eval_q2p_data_runtime": 15.3164,
"eval_q2p_data_samples_per_second": 53.015,
"eval_q2p_data_steps_per_second": 3.33,
"step": 1750
},
{
"epoch": 1.6874400767018218,
"grad_norm": 0.3842555284500122,
"learning_rate": 4.865056818181818e-06,
"loss": 4.9992,
"step": 1760
},
{
"epoch": 1.6970278044103546,
"grad_norm": 0.3934996426105499,
"learning_rate": 4.829545454545455e-06,
"loss": 4.9997,
"step": 1770
},
{
"epoch": 1.7066155321188878,
"grad_norm": 0.3144057095050812,
"learning_rate": 4.794034090909092e-06,
"loss": 4.9999,
"step": 1780
},
{
"epoch": 1.716203259827421,
"grad_norm": 0.33490219712257385,
"learning_rate": 4.758522727272727e-06,
"loss": 5.0022,
"step": 1790
},
{
"epoch": 1.7257909875359538,
"grad_norm": 0.35593223571777344,
"learning_rate": 4.723011363636364e-06,
"loss": 4.9988,
"step": 1800
},
{
"epoch": 1.7257909875359538,
"eval_q2q_data_loss": 5.001664638519287,
"eval_q2q_data_runtime": 8.4874,
"eval_q2q_data_samples_per_second": 318.706,
"eval_q2q_data_steps_per_second": 20.03,
"step": 1800
},
{
"epoch": 1.7257909875359538,
"eval_q2p_data_loss": 5.009975433349609,
"eval_q2p_data_runtime": 15.3185,
"eval_q2p_data_samples_per_second": 53.008,
"eval_q2p_data_steps_per_second": 3.329,
"step": 1800
},
{
"epoch": 1.735378715244487,
"grad_norm": 0.5832622051239014,
"learning_rate": 4.6875000000000004e-06,
"loss": 4.9987,
"step": 1810
},
{
"epoch": 1.7449664429530203,
"grad_norm": 0.4001566171646118,
"learning_rate": 4.651988636363637e-06,
"loss": 5.0029,
"step": 1820
},
{
"epoch": 1.754554170661553,
"grad_norm": 1.2833226919174194,
"learning_rate": 4.616477272727273e-06,
"loss": 4.9949,
"step": 1830
},
{
"epoch": 1.7641418983700863,
"grad_norm": 0.7543688416481018,
"learning_rate": 4.580965909090909e-06,
"loss": 4.999,
"step": 1840
},
{
"epoch": 1.7737296260786195,
"grad_norm": 0.7849061489105225,
"learning_rate": 4.5454545454545455e-06,
"loss": 5.0017,
"step": 1850
},
{
"epoch": 1.7737296260786195,
"eval_q2q_data_loss": 5.003254413604736,
"eval_q2q_data_runtime": 8.5165,
"eval_q2q_data_samples_per_second": 317.618,
"eval_q2q_data_steps_per_second": 19.961,
"step": 1850
},
{
"epoch": 1.7737296260786195,
"eval_q2p_data_loss": 4.987276077270508,
"eval_q2p_data_runtime": 15.3548,
"eval_q2p_data_samples_per_second": 52.882,
"eval_q2p_data_steps_per_second": 3.321,
"step": 1850
},
{
"epoch": 1.7833173537871523,
"grad_norm": 12.080714225769043,
"learning_rate": 4.509943181818182e-06,
"loss": 4.9866,
"step": 1860
},
{
"epoch": 1.7929050814956855,
"grad_norm": 1.030135989189148,
"learning_rate": 4.474431818181819e-06,
"loss": 4.9976,
"step": 1870
},
{
"epoch": 1.8024928092042187,
"grad_norm": 2.636124610900879,
"learning_rate": 4.438920454545455e-06,
"loss": 4.9784,
"step": 1880
},
{
"epoch": 1.8120805369127517,
"grad_norm": 51.49758529663086,
"learning_rate": 4.4034090909090914e-06,
"loss": 4.9824,
"step": 1890
},
{
"epoch": 1.8216682646212847,
"grad_norm": 59.32814025878906,
"learning_rate": 4.367897727272728e-06,
"loss": 4.9945,
"step": 1900
},
{
"epoch": 1.8216682646212847,
"eval_q2q_data_loss": 5.014230251312256,
"eval_q2q_data_runtime": 8.519,
"eval_q2q_data_samples_per_second": 317.527,
"eval_q2q_data_steps_per_second": 19.955,
"step": 1900
},
{
"epoch": 1.8216682646212847,
"eval_q2p_data_loss": 5.155740737915039,
"eval_q2p_data_runtime": 15.3763,
"eval_q2p_data_samples_per_second": 52.808,
"eval_q2p_data_steps_per_second": 3.317,
"step": 1900
},
{
"epoch": 1.831255992329818,
"grad_norm": 10.061817169189453,
"learning_rate": 4.332386363636364e-06,
"loss": 4.9445,
"step": 1910
},
{
"epoch": 1.840843720038351,
"grad_norm": 1.1698871850967407,
"learning_rate": 4.296875e-06,
"loss": 4.9477,
"step": 1920
},
{
"epoch": 1.850431447746884,
"grad_norm": 0.6934572458267212,
"learning_rate": 4.2613636363636365e-06,
"loss": 5.0047,
"step": 1930
},
{
"epoch": 1.8600191754554172,
"grad_norm": 18.0229434967041,
"learning_rate": 4.225852272727274e-06,
"loss": 4.9307,
"step": 1940
},
{
"epoch": 1.8696069031639502,
"grad_norm": 8.73933219909668,
"learning_rate": 4.190340909090909e-06,
"loss": 4.9634,
"step": 1950
},
{
"epoch": 1.8696069031639502,
"eval_q2q_data_loss": 5.002269268035889,
"eval_q2q_data_runtime": 8.4962,
"eval_q2q_data_samples_per_second": 318.378,
"eval_q2q_data_steps_per_second": 20.009,
"step": 1950
},
{
"epoch": 1.8696069031639502,
"eval_q2p_data_loss": 4.8260931968688965,
"eval_q2p_data_runtime": 15.3516,
"eval_q2p_data_samples_per_second": 52.894,
"eval_q2p_data_steps_per_second": 3.322,
"step": 1950
},
{
"epoch": 1.8791946308724832,
"grad_norm": 1.5762324333190918,
"learning_rate": 4.154829545454545e-06,
"loss": 4.9791,
"step": 1960
},
{
"epoch": 1.8887823585810164,
"grad_norm": 0.3121432363986969,
"learning_rate": 4.1193181818181825e-06,
"loss": 4.9792,
"step": 1970
},
{
"epoch": 1.8983700862895494,
"grad_norm": 1.5927631855010986,
"learning_rate": 4.083806818181819e-06,
"loss": 4.9041,
"step": 1980
},
{
"epoch": 1.9079578139980824,
"grad_norm": 14.304738998413086,
"learning_rate": 4.048295454545455e-06,
"loss": 4.9349,
"step": 1990
},
{
"epoch": 1.9175455417066156,
"grad_norm": 0.2702763080596924,
"learning_rate": 4.012784090909091e-06,
"loss": 4.8942,
"step": 2000
},
{
"epoch": 1.9175455417066156,
"eval_q2q_data_loss": 5.001285076141357,
"eval_q2q_data_runtime": 8.47,
"eval_q2q_data_samples_per_second": 319.362,
"eval_q2q_data_steps_per_second": 20.071,
"step": 2000
},
{
"epoch": 1.9175455417066156,
"eval_q2p_data_loss": 4.750080585479736,
"eval_q2p_data_runtime": 15.3459,
"eval_q2p_data_samples_per_second": 52.913,
"eval_q2p_data_steps_per_second": 3.323,
"step": 2000
},
{
"epoch": 1.9271332694151486,
"grad_norm": 0.2623966634273529,
"learning_rate": 3.9772727272727275e-06,
"loss": 4.9871,
"step": 2010
},
{
"epoch": 1.9367209971236816,
"grad_norm": 0.24292069673538208,
"learning_rate": 3.941761363636364e-06,
"loss": 4.9631,
"step": 2020
},
{
"epoch": 1.9463087248322148,
"grad_norm": 0.2756921947002411,
"learning_rate": 3.90625e-06,
"loss": 4.9604,
"step": 2030
},
{
"epoch": 1.9558964525407478,
"grad_norm": 0.2825332581996918,
"learning_rate": 3.870738636363637e-06,
"loss": 4.9346,
"step": 2040
},
{
"epoch": 1.9654841802492808,
"grad_norm": 0.2173183411359787,
"learning_rate": 3.8352272727272735e-06,
"loss": 4.9398,
"step": 2050
},
{
"epoch": 1.9654841802492808,
"eval_q2q_data_loss": 5.001183032989502,
"eval_q2q_data_runtime": 8.5081,
"eval_q2q_data_samples_per_second": 317.931,
"eval_q2q_data_steps_per_second": 19.981,
"step": 2050
},
{
"epoch": 1.9654841802492808,
"eval_q2p_data_loss": 4.761696815490723,
"eval_q2p_data_runtime": 15.3478,
"eval_q2p_data_samples_per_second": 52.907,
"eval_q2p_data_steps_per_second": 3.323,
"step": 2050
},
{
"epoch": 1.975071907957814,
"grad_norm": 16.142738342285156,
"learning_rate": 3.7997159090909093e-06,
"loss": 4.9262,
"step": 2060
},
{
"epoch": 1.984659635666347,
"grad_norm": 0.2226814180612564,
"learning_rate": 3.7642045454545456e-06,
"loss": 4.9505,
"step": 2070
},
{
"epoch": 1.99424736337488,
"grad_norm": 0.22450749576091766,
"learning_rate": 3.7286931818181823e-06,
"loss": 4.9667,
"step": 2080
},
{
"epoch": 2.0038350910834133,
"grad_norm": 18.707637786865234,
"learning_rate": 3.6931818181818186e-06,
"loss": 4.8763,
"step": 2090
},
{
"epoch": 2.0134228187919465,
"grad_norm": 0.2756267189979553,
"learning_rate": 3.657670454545455e-06,
"loss": 4.9116,
"step": 2100
},
{
"epoch": 2.0134228187919465,
"eval_q2q_data_loss": 5.001041412353516,
"eval_q2q_data_runtime": 8.4882,
"eval_q2q_data_samples_per_second": 318.678,
"eval_q2q_data_steps_per_second": 20.028,
"step": 2100
},
{
"epoch": 2.0134228187919465,
"eval_q2p_data_loss": 4.771986961364746,
"eval_q2p_data_runtime": 15.3318,
"eval_q2p_data_samples_per_second": 52.962,
"eval_q2p_data_steps_per_second": 3.326,
"step": 2100
},
{
"epoch": 2.0230105465004793,
"grad_norm": 0.19571331143379211,
"learning_rate": 3.6221590909090915e-06,
"loss": 4.9367,
"step": 2110
},
{
"epoch": 2.0325982742090125,
"grad_norm": 0.21739406883716583,
"learning_rate": 3.5866477272727274e-06,
"loss": 4.9546,
"step": 2120
},
{
"epoch": 2.0421860019175457,
"grad_norm": 1.4178483486175537,
"learning_rate": 3.5511363636363636e-06,
"loss": 4.9743,
"step": 2130
},
{
"epoch": 2.0517737296260785,
"grad_norm": 0.20393171906471252,
"learning_rate": 3.5156250000000003e-06,
"loss": 4.9795,
"step": 2140
},
{
"epoch": 2.0613614573346117,
"grad_norm": 0.18679551780223846,
"learning_rate": 3.4801136363636366e-06,
"loss": 4.9647,
"step": 2150
},
{
"epoch": 2.0613614573346117,
"eval_q2q_data_loss": 5.0010271072387695,
"eval_q2q_data_runtime": 8.5086,
"eval_q2q_data_samples_per_second": 317.913,
"eval_q2q_data_steps_per_second": 19.98,
"step": 2150
},
{
"epoch": 2.0613614573346117,
"eval_q2p_data_loss": 4.773245811462402,
"eval_q2p_data_runtime": 15.3323,
"eval_q2p_data_samples_per_second": 52.96,
"eval_q2p_data_steps_per_second": 3.326,
"step": 2150
},
{
"epoch": 2.070949185043145,
"grad_norm": 10.774163246154785,
"learning_rate": 3.4446022727272733e-06,
"loss": 4.9856,
"step": 2160
},
{
"epoch": 2.0805369127516777,
"grad_norm": 0.229711651802063,
"learning_rate": 3.409090909090909e-06,
"loss": 4.9553,
"step": 2170
},
{
"epoch": 2.090124640460211,
"grad_norm": 12.86821174621582,
"learning_rate": 3.3735795454545454e-06,
"loss": 4.9479,
"step": 2180
},
{
"epoch": 2.099712368168744,
"grad_norm": 0.19190755486488342,
"learning_rate": 3.338068181818182e-06,
"loss": 4.9672,
"step": 2190
},
{
"epoch": 2.109300095877277,
"grad_norm": 6.124110698699951,
"learning_rate": 3.3025568181818184e-06,
"loss": 4.9645,
"step": 2200
},
{
"epoch": 2.109300095877277,
"eval_q2q_data_loss": 5.001131057739258,
"eval_q2q_data_runtime": 8.4876,
"eval_q2q_data_samples_per_second": 318.702,
"eval_q2q_data_steps_per_second": 20.029,
"step": 2200
},
{
"epoch": 2.109300095877277,
"eval_q2p_data_loss": 4.75758171081543,
"eval_q2p_data_runtime": 15.4135,
"eval_q2p_data_samples_per_second": 52.681,
"eval_q2p_data_steps_per_second": 3.309,
"step": 2200
},
{
"epoch": 2.11888782358581,
"grad_norm": 3.4443752765655518,
"learning_rate": 3.267045454545455e-06,
"loss": 4.9299,
"step": 2210
},
{
"epoch": 2.1284755512943434,
"grad_norm": 0.27355676889419556,
"learning_rate": 3.2315340909090913e-06,
"loss": 4.9777,
"step": 2220
},
{
"epoch": 2.138063279002876,
"grad_norm": 6.125870227813721,
"learning_rate": 3.196022727272727e-06,
"loss": 4.94,
"step": 2230
},
{
"epoch": 2.1476510067114094,
"grad_norm": 23.490581512451172,
"learning_rate": 3.160511363636364e-06,
"loss": 4.978,
"step": 2240
},
{
"epoch": 2.1572387344199426,
"grad_norm": 9.1142578125,
"learning_rate": 3.125e-06,
"loss": 4.968,
"step": 2250
},
{
"epoch": 2.1572387344199426,
"eval_q2q_data_loss": 4.999406814575195,
"eval_q2q_data_runtime": 8.4764,
"eval_q2q_data_samples_per_second": 319.121,
"eval_q2q_data_steps_per_second": 20.056,
"step": 2250
},
{
"epoch": 2.1572387344199426,
"eval_q2p_data_loss": 4.755669116973877,
"eval_q2p_data_runtime": 15.4053,
"eval_q2p_data_samples_per_second": 52.709,
"eval_q2p_data_steps_per_second": 3.311,
"step": 2250
},
{
"epoch": 2.1668264621284754,
"grad_norm": 0.5820243954658508,
"learning_rate": 3.089488636363637e-06,
"loss": 4.9512,
"step": 2260
},
{
"epoch": 2.1764141898370086,
"grad_norm": 0.20500487089157104,
"learning_rate": 3.053977272727273e-06,
"loss": 4.9539,
"step": 2270
},
{
"epoch": 2.186001917545542,
"grad_norm": 0.18161769211292267,
"learning_rate": 3.018465909090909e-06,
"loss": 4.9508,
"step": 2280
},
{
"epoch": 2.1955896452540746,
"grad_norm": 0.19371207058429718,
"learning_rate": 2.9829545454545457e-06,
"loss": 4.8871,
"step": 2290
},
{
"epoch": 2.205177372962608,
"grad_norm": 0.2863902747631073,
"learning_rate": 2.947443181818182e-06,
"loss": 4.909,
"step": 2300
},
{
"epoch": 2.205177372962608,
"eval_q2q_data_loss": 5.001042366027832,
"eval_q2q_data_runtime": 8.4998,
"eval_q2q_data_samples_per_second": 318.244,
"eval_q2q_data_steps_per_second": 20.001,
"step": 2300
},
{
"epoch": 2.205177372962608,
"eval_q2p_data_loss": 4.744427680969238,
"eval_q2p_data_runtime": 15.3338,
"eval_q2p_data_samples_per_second": 52.955,
"eval_q2p_data_steps_per_second": 3.326,
"step": 2300
},
{
"epoch": 2.214765100671141,
"grad_norm": 0.21279603242874146,
"learning_rate": 2.9119318181818186e-06,
"loss": 4.9587,
"step": 2310
},
{
"epoch": 2.224352828379674,
"grad_norm": 0.18541747331619263,
"learning_rate": 2.876420454545455e-06,
"loss": 4.8956,
"step": 2320
},
{
"epoch": 2.233940556088207,
"grad_norm": 0.22428183257579803,
"learning_rate": 2.8409090909090916e-06,
"loss": 4.9891,
"step": 2330
},
{
"epoch": 2.2435282837967403,
"grad_norm": 12.067822456359863,
"learning_rate": 2.8053977272727274e-06,
"loss": 4.8795,
"step": 2340
},
{
"epoch": 2.253116011505273,
"grad_norm": 7.028346061706543,
"learning_rate": 2.7698863636363637e-06,
"loss": 4.887,
"step": 2350
},
{
"epoch": 2.253116011505273,
"eval_q2q_data_loss": 5.001026630401611,
"eval_q2q_data_runtime": 8.487,
"eval_q2q_data_samples_per_second": 318.721,
"eval_q2q_data_steps_per_second": 20.031,
"step": 2350
},
{
"epoch": 2.253116011505273,
"eval_q2p_data_loss": 4.744780540466309,
"eval_q2p_data_runtime": 15.3798,
"eval_q2p_data_samples_per_second": 52.796,
"eval_q2p_data_steps_per_second": 3.316,
"step": 2350
},
{
"epoch": 2.2627037392138063,
"grad_norm": 0.15497416257858276,
"learning_rate": 2.7343750000000004e-06,
"loss": 4.9723,
"step": 2360
},
{
"epoch": 2.2722914669223395,
"grad_norm": 0.14897240698337555,
"learning_rate": 2.6988636363636367e-06,
"loss": 4.8967,
"step": 2370
},
{
"epoch": 2.2818791946308723,
"grad_norm": 6.019428730010986,
"learning_rate": 2.663352272727273e-06,
"loss": 4.8975,
"step": 2380
},
{
"epoch": 2.2914669223394055,
"grad_norm": 7.852274417877197,
"learning_rate": 2.627840909090909e-06,
"loss": 4.9177,
"step": 2390
},
{
"epoch": 2.3010546500479387,
"grad_norm": 128.83132934570312,
"learning_rate": 2.5923295454545455e-06,
"loss": 4.9272,
"step": 2400
},
{
"epoch": 2.3010546500479387,
"eval_q2q_data_loss": 5.000960350036621,
"eval_q2q_data_runtime": 8.4827,
"eval_q2q_data_samples_per_second": 318.882,
"eval_q2q_data_steps_per_second": 20.041,
"step": 2400
},
{
"epoch": 2.3010546500479387,
"eval_q2p_data_loss": 4.7287445068359375,
"eval_q2p_data_runtime": 15.3674,
"eval_q2p_data_samples_per_second": 52.839,
"eval_q2p_data_steps_per_second": 3.319,
"step": 2400
},
{
"epoch": 2.310642377756472,
"grad_norm": 0.1605680286884308,
"learning_rate": 2.556818181818182e-06,
"loss": 4.9283,
"step": 2410
},
{
"epoch": 2.3202301054650047,
"grad_norm": 25.14031982421875,
"learning_rate": 2.5213068181818184e-06,
"loss": 4.9061,
"step": 2420
},
{
"epoch": 2.329817833173538,
"grad_norm": 0.1336502879858017,
"learning_rate": 2.4857954545454547e-06,
"loss": 4.9279,
"step": 2430
},
{
"epoch": 2.3394055608820707,
"grad_norm": 0.5942106246948242,
"learning_rate": 2.450284090909091e-06,
"loss": 4.9856,
"step": 2440
},
{
"epoch": 2.348993288590604,
"grad_norm": 6.196929454803467,
"learning_rate": 2.4147727272727277e-06,
"loss": 4.8988,
"step": 2450
},
{
"epoch": 2.348993288590604,
"eval_q2q_data_loss": 5.000965118408203,
"eval_q2q_data_runtime": 8.4496,
"eval_q2q_data_samples_per_second": 320.134,
"eval_q2q_data_steps_per_second": 20.119,
"step": 2450
},
{
"epoch": 2.348993288590604,
"eval_q2p_data_loss": 4.726756572723389,
"eval_q2p_data_runtime": 15.3322,
"eval_q2p_data_samples_per_second": 52.96,
"eval_q2p_data_steps_per_second": 3.326,
"step": 2450
},
{
"epoch": 2.358581016299137,
"grad_norm": 0.11395616829395294,
"learning_rate": 2.3792613636363635e-06,
"loss": 4.9269,
"step": 2460
},
{
"epoch": 2.3681687440076704,
"grad_norm": 0.14515432715415955,
"learning_rate": 2.3437500000000002e-06,
"loss": 4.9318,
"step": 2470
},
{
"epoch": 2.377756471716203,
"grad_norm": 2.5160467624664307,
"learning_rate": 2.3082386363636365e-06,
"loss": 4.8814,
"step": 2480
},
{
"epoch": 2.3873441994247364,
"grad_norm": 0.1416112333536148,
"learning_rate": 2.2727272727272728e-06,
"loss": 4.9912,
"step": 2490
},
{
"epoch": 2.396931927133269,
"grad_norm": 10.503127098083496,
"learning_rate": 2.2372159090909095e-06,
"loss": 4.9226,
"step": 2500
},
{
"epoch": 2.396931927133269,
"eval_q2q_data_loss": 5.000875949859619,
"eval_q2q_data_runtime": 8.4684,
"eval_q2q_data_samples_per_second": 319.422,
"eval_q2q_data_steps_per_second": 20.075,
"step": 2500
},
{
"epoch": 2.396931927133269,
"eval_q2p_data_loss": 4.719711780548096,
"eval_q2p_data_runtime": 15.359,
"eval_q2p_data_samples_per_second": 52.868,
"eval_q2p_data_steps_per_second": 3.321,
"step": 2500
},
{
"epoch": 2.4065196548418024,
"grad_norm": 0.14310245215892792,
"learning_rate": 2.2017045454545457e-06,
"loss": 4.9437,
"step": 2510
},
{
"epoch": 2.4161073825503356,
"grad_norm": 0.12047765403985977,
"learning_rate": 2.166193181818182e-06,
"loss": 4.9553,
"step": 2520
},
{
"epoch": 2.425695110258869,
"grad_norm": 0.1301940679550171,
"learning_rate": 2.1306818181818183e-06,
"loss": 4.9355,
"step": 2530
},
{
"epoch": 2.4352828379674016,
"grad_norm": 0.42147210240364075,
"learning_rate": 2.0951704545454545e-06,
"loss": 4.9063,
"step": 2540
},
{
"epoch": 2.444870565675935,
"grad_norm": 44.65216064453125,
"learning_rate": 2.0596590909090912e-06,
"loss": 4.9095,
"step": 2550
},
{
"epoch": 2.444870565675935,
"eval_q2q_data_loss": 4.99726676940918,
"eval_q2q_data_runtime": 8.4873,
"eval_q2q_data_samples_per_second": 318.711,
"eval_q2q_data_steps_per_second": 20.03,
"step": 2550
},
{
"epoch": 2.444870565675935,
"eval_q2p_data_loss": 4.74806547164917,
"eval_q2p_data_runtime": 15.3525,
"eval_q2p_data_samples_per_second": 52.891,
"eval_q2p_data_steps_per_second": 3.322,
"step": 2550
},
{
"epoch": 2.4544582933844676,
"grad_norm": 22.98095703125,
"learning_rate": 2.0241477272727275e-06,
"loss": 4.9624,
"step": 2560
},
{
"epoch": 2.464046021093001,
"grad_norm": 0.5905591249465942,
"learning_rate": 1.9886363636363638e-06,
"loss": 4.9731,
"step": 2570
},
{
"epoch": 2.473633748801534,
"grad_norm": 24.247333526611328,
"learning_rate": 1.953125e-06,
"loss": 4.9156,
"step": 2580
},
{
"epoch": 2.4832214765100673,
"grad_norm": 32.6563720703125,
"learning_rate": 1.9176136363636367e-06,
"loss": 4.8714,
"step": 2590
},
{
"epoch": 2.4928092042186,
"grad_norm": 36.43191146850586,
"learning_rate": 1.8821022727272728e-06,
"loss": 4.9532,
"step": 2600
},
{
"epoch": 2.4928092042186,
"eval_q2q_data_loss": 5.000910758972168,
"eval_q2q_data_runtime": 8.4722,
"eval_q2q_data_samples_per_second": 319.28,
"eval_q2q_data_steps_per_second": 20.066,
"step": 2600
},
{
"epoch": 2.4928092042186,
"eval_q2p_data_loss": 4.732726573944092,
"eval_q2p_data_runtime": 15.3101,
"eval_q2p_data_samples_per_second": 53.037,
"eval_q2p_data_steps_per_second": 3.331,
"step": 2600
},
{
"epoch": 2.5023969319271333,
"grad_norm": 6.501353740692139,
"learning_rate": 1.8465909090909093e-06,
"loss": 4.9196,
"step": 2610
},
{
"epoch": 2.511984659635666,
"grad_norm": 57.751441955566406,
"learning_rate": 1.8110795454545458e-06,
"loss": 4.9477,
"step": 2620
},
{
"epoch": 2.5215723873441993,
"grad_norm": 0.12283805757761002,
"learning_rate": 1.7755681818181818e-06,
"loss": 4.9725,
"step": 2630
},
{
"epoch": 2.5311601150527325,
"grad_norm": 17.9443302154541,
"learning_rate": 1.7400568181818183e-06,
"loss": 4.9483,
"step": 2640
},
{
"epoch": 2.5407478427612658,
"grad_norm": 0.27849340438842773,
"learning_rate": 1.7045454545454546e-06,
"loss": 4.9124,
"step": 2650
},
{
"epoch": 2.5407478427612658,
"eval_q2q_data_loss": 5.000847339630127,
"eval_q2q_data_runtime": 8.4514,
"eval_q2q_data_samples_per_second": 320.064,
"eval_q2q_data_steps_per_second": 20.115,
"step": 2650
},
{
"epoch": 2.5407478427612658,
"eval_q2p_data_loss": 4.775162220001221,
"eval_q2p_data_runtime": 15.3209,
"eval_q2p_data_samples_per_second": 53.0,
"eval_q2p_data_steps_per_second": 3.329,
"step": 2650
},
{
"epoch": 2.5503355704697985,
"grad_norm": 0.1170654371380806,
"learning_rate": 1.669034090909091e-06,
"loss": 4.9056,
"step": 2660
},
{
"epoch": 2.5599232981783318,
"grad_norm": 9.846685409545898,
"learning_rate": 1.6335227272727275e-06,
"loss": 4.9396,
"step": 2670
},
{
"epoch": 2.569511025886865,
"grad_norm": 0.1312805712223053,
"learning_rate": 1.5980113636363636e-06,
"loss": 4.9472,
"step": 2680
},
{
"epoch": 2.5790987535953978,
"grad_norm": 0.16425052285194397,
"learning_rate": 1.5625e-06,
"loss": 4.9322,
"step": 2690
},
{
"epoch": 2.588686481303931,
"grad_norm": 26.310592651367188,
"learning_rate": 1.5269886363636366e-06,
"loss": 4.9147,
"step": 2700
},
{
"epoch": 2.588686481303931,
"eval_q2q_data_loss": 5.000824928283691,
"eval_q2q_data_runtime": 8.4934,
"eval_q2q_data_samples_per_second": 318.482,
"eval_q2q_data_steps_per_second": 20.016,
"step": 2700
},
{
"epoch": 2.588686481303931,
"eval_q2p_data_loss": 4.735974311828613,
"eval_q2p_data_runtime": 15.3216,
"eval_q2p_data_samples_per_second": 52.997,
"eval_q2p_data_steps_per_second": 3.329,
"step": 2700
},
{
"epoch": 2.598274209012464,
"grad_norm": 0.11873164027929306,
"learning_rate": 1.4914772727272728e-06,
"loss": 4.9511,
"step": 2710
},
{
"epoch": 2.607861936720997,
"grad_norm": 0.11559820920228958,
"learning_rate": 1.4559659090909093e-06,
"loss": 4.9229,
"step": 2720
},
{
"epoch": 2.61744966442953,
"grad_norm": 0.1333041489124298,
"learning_rate": 1.4204545454545458e-06,
"loss": 4.9207,
"step": 2730
},
{
"epoch": 2.6270373921380634,
"grad_norm": 0.16187268495559692,
"learning_rate": 1.3849431818181819e-06,
"loss": 4.9695,
"step": 2740
},
{
"epoch": 2.636625119846596,
"grad_norm": 40.309261322021484,
"learning_rate": 1.3494318181818183e-06,
"loss": 4.8886,
"step": 2750
},
{
"epoch": 2.636625119846596,
"eval_q2q_data_loss": 5.0007758140563965,
"eval_q2q_data_runtime": 8.4851,
"eval_q2q_data_samples_per_second": 318.795,
"eval_q2q_data_steps_per_second": 20.035,
"step": 2750
},
{
"epoch": 2.636625119846596,
"eval_q2p_data_loss": 4.76162052154541,
"eval_q2p_data_runtime": 15.3319,
"eval_q2p_data_samples_per_second": 52.961,
"eval_q2p_data_steps_per_second": 3.326,
"step": 2750
},
{
"epoch": 2.6462128475551294,
"grad_norm": 0.11944945156574249,
"learning_rate": 1.3139204545454546e-06,
"loss": 4.9878,
"step": 2760
},
{
"epoch": 2.6558005752636626,
"grad_norm": 0.1411992311477661,
"learning_rate": 1.278409090909091e-06,
"loss": 4.9647,
"step": 2770
},
{
"epoch": 2.665388302972196,
"grad_norm": 0.11750555783510208,
"learning_rate": 1.2428977272727274e-06,
"loss": 4.9552,
"step": 2780
},
{
"epoch": 2.6749760306807286,
"grad_norm": 12.01413631439209,
"learning_rate": 1.2073863636363638e-06,
"loss": 5.0171,
"step": 2790
},
{
"epoch": 2.684563758389262,
"grad_norm": 39.38778305053711,
"learning_rate": 1.1718750000000001e-06,
"loss": 4.9379,
"step": 2800
},
{
"epoch": 2.684563758389262,
"eval_q2q_data_loss": 5.0007734298706055,
"eval_q2q_data_runtime": 8.5072,
"eval_q2q_data_samples_per_second": 317.965,
"eval_q2q_data_steps_per_second": 19.983,
"step": 2800
},
{
"epoch": 2.684563758389262,
"eval_q2p_data_loss": 4.756326198577881,
"eval_q2p_data_runtime": 15.3794,
"eval_q2p_data_samples_per_second": 52.798,
"eval_q2p_data_steps_per_second": 3.316,
"step": 2800
},
{
"epoch": 2.6941514860977946,
"grad_norm": 0.2822560966014862,
"learning_rate": 1.1363636363636364e-06,
"loss": 4.9727,
"step": 2810
},
{
"epoch": 2.703739213806328,
"grad_norm": 0.9750680923461914,
"learning_rate": 1.1008522727272729e-06,
"loss": 4.9798,
"step": 2820
},
{
"epoch": 2.713326941514861,
"grad_norm": 12.072766304016113,
"learning_rate": 1.0653409090909091e-06,
"loss": 4.9726,
"step": 2830
},
{
"epoch": 2.7229146692233943,
"grad_norm": 24.833826065063477,
"learning_rate": 1.0298295454545456e-06,
"loss": 4.956,
"step": 2840
},
{
"epoch": 2.732502396931927,
"grad_norm": 15.921252250671387,
"learning_rate": 9.943181818181819e-07,
"loss": 4.9512,
"step": 2850
},
{
"epoch": 2.732502396931927,
"eval_q2q_data_loss": 5.000742435455322,
"eval_q2q_data_runtime": 8.4355,
"eval_q2q_data_samples_per_second": 320.669,
"eval_q2q_data_steps_per_second": 20.153,
"step": 2850
},
{
"epoch": 2.732502396931927,
"eval_q2p_data_loss": 4.766937255859375,
"eval_q2p_data_runtime": 15.3173,
"eval_q2p_data_samples_per_second": 53.012,
"eval_q2p_data_steps_per_second": 3.33,
"step": 2850
},
{
"epoch": 2.7420901246404603,
"grad_norm": 0.15265218913555145,
"learning_rate": 9.588068181818184e-07,
"loss": 4.9705,
"step": 2860
},
{
"epoch": 2.751677852348993,
"grad_norm": 15.488290786743164,
"learning_rate": 9.232954545454546e-07,
"loss": 4.8603,
"step": 2870
},
{
"epoch": 2.7612655800575263,
"grad_norm": 0.121486134827137,
"learning_rate": 8.877840909090909e-07,
"loss": 4.9764,
"step": 2880
},
{
"epoch": 2.7708533077660595,
"grad_norm": 0.1105041652917862,
"learning_rate": 8.522727272727273e-07,
"loss": 4.9187,
"step": 2890
},
{
"epoch": 2.7804410354745928,
"grad_norm": 0.10993187129497528,
"learning_rate": 8.167613636363638e-07,
"loss": 4.8941,
"step": 2900
},
{
"epoch": 2.7804410354745928,
"eval_q2q_data_loss": 5.000753402709961,
"eval_q2q_data_runtime": 8.462,
"eval_q2q_data_samples_per_second": 319.666,
"eval_q2q_data_steps_per_second": 20.09,
"step": 2900
},
{
"epoch": 2.7804410354745928,
"eval_q2p_data_loss": 4.73110818862915,
"eval_q2p_data_runtime": 15.3141,
"eval_q2p_data_samples_per_second": 53.023,
"eval_q2p_data_steps_per_second": 3.33,
"step": 2900
},
{
"epoch": 2.7900287631831255,
"grad_norm": 0.09844540059566498,
"learning_rate": 7.8125e-07,
"loss": 4.9592,
"step": 2910
},
{
"epoch": 2.7996164908916588,
"grad_norm": 21.05035400390625,
"learning_rate": 7.457386363636364e-07,
"loss": 4.9141,
"step": 2920
},
{
"epoch": 2.8092042186001915,
"grad_norm": 0.11973018944263458,
"learning_rate": 7.102272727272729e-07,
"loss": 4.9198,
"step": 2930
},
{
"epoch": 2.8187919463087248,
"grad_norm": 0.12149699777364731,
"learning_rate": 6.747159090909092e-07,
"loss": 5.0112,
"step": 2940
},
{
"epoch": 2.828379674017258,
"grad_norm": 5.942767143249512,
"learning_rate": 6.392045454545455e-07,
"loss": 4.9778,
"step": 2950
},
{
"epoch": 2.828379674017258,
"eval_q2q_data_loss": 5.00074577331543,
"eval_q2q_data_runtime": 8.4603,
"eval_q2q_data_samples_per_second": 319.73,
"eval_q2q_data_steps_per_second": 20.094,
"step": 2950
},
{
"epoch": 2.828379674017258,
"eval_q2p_data_loss": 4.73326301574707,
"eval_q2p_data_runtime": 15.3687,
"eval_q2p_data_samples_per_second": 52.835,
"eval_q2p_data_steps_per_second": 3.318,
"step": 2950
},
{
"epoch": 2.837967401725791,
"grad_norm": 5.33225679397583,
"learning_rate": 6.036931818181819e-07,
"loss": 4.8999,
"step": 2960
},
{
"epoch": 2.847555129434324,
"grad_norm": 25.030715942382812,
"learning_rate": 5.681818181818182e-07,
"loss": 4.9223,
"step": 2970
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.1237885057926178,
"learning_rate": 5.326704545454546e-07,
"loss": 4.9369,
"step": 2980
},
{
"epoch": 2.86673058485139,
"grad_norm": 0.09552864730358124,
"learning_rate": 4.971590909090909e-07,
"loss": 4.8722,
"step": 2990
},
{
"epoch": 2.876318312559923,
"grad_norm": 0.1201782152056694,
"learning_rate": 4.616477272727273e-07,
"loss": 4.9299,
"step": 3000
},
{
"epoch": 2.876318312559923,
"eval_q2q_data_loss": 5.000753402709961,
"eval_q2q_data_runtime": 8.4812,
"eval_q2q_data_samples_per_second": 318.942,
"eval_q2q_data_steps_per_second": 20.044,
"step": 3000
},
{
"epoch": 2.876318312559923,
"eval_q2p_data_loss": 4.7280192375183105,
"eval_q2p_data_runtime": 15.3569,
"eval_q2p_data_samples_per_second": 52.875,
"eval_q2p_data_steps_per_second": 3.321,
"step": 3000
},
{
"epoch": 2.8859060402684564,
"grad_norm": 18.201995849609375,
"learning_rate": 4.2613636363636364e-07,
"loss": 4.8457,
"step": 3010
},
{
"epoch": 2.8954937679769897,
"grad_norm": 0.09412606805562973,
"learning_rate": 3.90625e-07,
"loss": 4.8864,
"step": 3020
},
{
"epoch": 2.9050814956855224,
"grad_norm": 21.844467163085938,
"learning_rate": 3.5511363636363645e-07,
"loss": 4.882,
"step": 3030
},
{
"epoch": 2.9146692233940557,
"grad_norm": 0.1089194044470787,
"learning_rate": 3.1960227272727277e-07,
"loss": 4.8897,
"step": 3040
},
{
"epoch": 2.9242569511025884,
"grad_norm": 0.20910155773162842,
"learning_rate": 2.840909090909091e-07,
"loss": 4.9663,
"step": 3050
},
{
"epoch": 2.9242569511025884,
"eval_q2q_data_loss": 5.000741481781006,
"eval_q2q_data_runtime": 8.4976,
"eval_q2q_data_samples_per_second": 318.326,
"eval_q2q_data_steps_per_second": 20.006,
"step": 3050
},
{
"epoch": 2.9242569511025884,
"eval_q2p_data_loss": 4.723778247833252,
"eval_q2p_data_runtime": 15.2952,
"eval_q2p_data_samples_per_second": 53.088,
"eval_q2p_data_steps_per_second": 3.334,
"step": 3050
},
{
"epoch": 2.9338446788111217,
"grad_norm": 0.1785881370306015,
"learning_rate": 2.4857954545454547e-07,
"loss": 4.946,
"step": 3060
},
{
"epoch": 2.943432406519655,
"grad_norm": 26.99447250366211,
"learning_rate": 2.1306818181818182e-07,
"loss": 4.9555,
"step": 3070
},
{
"epoch": 2.953020134228188,
"grad_norm": 0.10196644067764282,
"learning_rate": 1.7755681818181822e-07,
"loss": 4.9005,
"step": 3080
},
{
"epoch": 2.962607861936721,
"grad_norm": 26.543190002441406,
"learning_rate": 1.4204545454545455e-07,
"loss": 4.9097,
"step": 3090
},
{
"epoch": 2.972195589645254,
"grad_norm": 0.12280410528182983,
"learning_rate": 1.0653409090909091e-07,
"loss": 4.924,
"step": 3100
},
{
"epoch": 2.972195589645254,
"eval_q2q_data_loss": 5.000741004943848,
"eval_q2q_data_runtime": 8.473,
"eval_q2q_data_samples_per_second": 319.25,
"eval_q2q_data_steps_per_second": 20.064,
"step": 3100
},
{
"epoch": 2.972195589645254,
"eval_q2p_data_loss": 4.72309684753418,
"eval_q2p_data_runtime": 15.3713,
"eval_q2p_data_samples_per_second": 52.826,
"eval_q2p_data_steps_per_second": 3.318,
"step": 3100
},
{
"epoch": 2.981783317353787,
"grad_norm": 0.0916726365685463,
"learning_rate": 7.102272727272727e-08,
"loss": 4.8929,
"step": 3110
},
{
"epoch": 2.99137104506232,
"grad_norm": 15.717903137207031,
"learning_rate": 3.551136363636364e-08,
"loss": 4.93,
"step": 3120
}
],
"logging_steps": 10,
"max_steps": 3129,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}