Mistral-7B-Instruct-SPPO-Iter1 / trainer_state.json
Williampixel's picture
Upload folder using huggingface_hub
07f118d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2471,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004046944556859571,
"grad_norm": 3260930.195416938,
"learning_rate": 2.0161290322580643e-09,
"logits/chosen": -2.216688871383667,
"logits/rejected": -2.1725575923919678,
"logps/chosen": -62.37783432006836,
"logps/rejected": -57.61228561401367,
"loss": 137728.9531,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.004046944556859571,
"grad_norm": 3951641.9256235515,
"learning_rate": 2.0161290322580644e-08,
"logits/chosen": -2.3231096267700195,
"logits/rejected": -2.3050363063812256,
"logps/chosen": -109.29280090332031,
"logps/rejected": -105.20187377929688,
"loss": 128824.3056,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": -8.181909652194008e-05,
"rewards/margins": -8.144730236381292e-05,
"rewards/rejected": -3.7179981404733553e-07,
"step": 10
},
{
"epoch": 0.008093889113719142,
"grad_norm": 3636837.085798033,
"learning_rate": 4.032258064516129e-08,
"logits/chosen": -2.3102259635925293,
"logits/rejected": -2.3181633949279785,
"logps/chosen": -102.9901351928711,
"logps/rejected": -103.0818099975586,
"loss": 128439.1625,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -3.3925286970770685e-06,
"rewards/margins": -7.394707154162461e-06,
"rewards/rejected": 4.00217959395377e-06,
"step": 20
},
{
"epoch": 0.012140833670578713,
"grad_norm": 4189016.609602417,
"learning_rate": 6.048387096774194e-08,
"logits/chosen": -2.2731196880340576,
"logits/rejected": -2.261061191558838,
"logps/chosen": -104.67350769042969,
"logps/rejected": -116.59749603271484,
"loss": 124740.475,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.00028529245173558593,
"rewards/margins": 2.6274694391759112e-05,
"rewards/rejected": -0.00031156709883362055,
"step": 30
},
{
"epoch": 0.016187778227438283,
"grad_norm": 3141568.670348898,
"learning_rate": 8.064516129032257e-08,
"logits/chosen": -2.3156943321228027,
"logits/rejected": -2.294349193572998,
"logps/chosen": -129.86062622070312,
"logps/rejected": -117.5326156616211,
"loss": 131411.2,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.00011367227125447243,
"rewards/margins": -1.9382667233003303e-05,
"rewards/rejected": -9.428960038349032e-05,
"step": 40
},
{
"epoch": 0.020234722784297856,
"grad_norm": 4300244.422627452,
"learning_rate": 1.0080645161290321e-07,
"logits/chosen": -2.271444320678711,
"logits/rejected": -2.2707998752593994,
"logps/chosen": -107.74246978759766,
"logps/rejected": -112.56591796875,
"loss": 128522.9375,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0001433270808774978,
"rewards/margins": 8.896701183402911e-05,
"rewards/rejected": -0.00023229411453939974,
"step": 50
},
{
"epoch": 0.024281667341157425,
"grad_norm": 4087404.2243083506,
"learning_rate": 1.2096774193548387e-07,
"logits/chosen": -2.2509658336639404,
"logits/rejected": -2.235924005508423,
"logps/chosen": -98.1602783203125,
"logps/rejected": -97.8387222290039,
"loss": 134684.625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -3.988773187302286e-06,
"rewards/margins": 0.0002523847797419876,
"rewards/rejected": -0.00025637357612140477,
"step": 60
},
{
"epoch": 0.028328611898016998,
"grad_norm": 3240131.8848123536,
"learning_rate": 1.4112903225806453e-07,
"logits/chosen": -2.3215599060058594,
"logits/rejected": -2.3164916038513184,
"logps/chosen": -113.9156265258789,
"logps/rejected": -114.72650146484375,
"loss": 127554.8875,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": 0.0005882935947738588,
"rewards/margins": -0.0001803641061997041,
"rewards/rejected": 0.000768657773733139,
"step": 70
},
{
"epoch": 0.03237555645487657,
"grad_norm": 4463707.543855141,
"learning_rate": 1.6129032258064515e-07,
"logits/chosen": -2.197829246520996,
"logits/rejected": -2.2096786499023438,
"logps/chosen": -99.81291198730469,
"logps/rejected": -96.83836364746094,
"loss": 129532.875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 3.9735146856401116e-05,
"rewards/margins": 0.00038970523746684194,
"rewards/rejected": -0.0003499701269902289,
"step": 80
},
{
"epoch": 0.036422501011736136,
"grad_norm": 5504977.483755038,
"learning_rate": 1.814516129032258e-07,
"logits/chosen": -2.2197558879852295,
"logits/rejected": -2.200068712234497,
"logps/chosen": -112.21453857421875,
"logps/rejected": -110.07649993896484,
"loss": 132607.275,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.00027725097606889904,
"rewards/margins": 0.00037192669697105885,
"rewards/rejected": -0.0006491777021437883,
"step": 90
},
{
"epoch": 0.04046944556859571,
"grad_norm": 3275423.408726695,
"learning_rate": 2.0161290322580642e-07,
"logits/chosen": -2.2803494930267334,
"logits/rejected": -2.277498245239258,
"logps/chosen": -118.47029876708984,
"logps/rejected": -121.81834411621094,
"loss": 129364.775,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.00027665990637615323,
"rewards/margins": 0.0005457916995510459,
"rewards/rejected": -0.00026913188048638403,
"step": 100
},
{
"epoch": 0.04451639012545528,
"grad_norm": 3537595.5413078354,
"learning_rate": 2.2177419354838707e-07,
"logits/chosen": -2.2598938941955566,
"logits/rejected": -2.243565797805786,
"logps/chosen": -123.01219177246094,
"logps/rejected": -127.5718994140625,
"loss": 128605.475,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.001143028261139989,
"rewards/margins": 0.0005904460558667779,
"rewards/rejected": -0.0017334744334220886,
"step": 110
},
{
"epoch": 0.04856333468231485,
"grad_norm": 4700408.411362441,
"learning_rate": 2.4193548387096775e-07,
"logits/chosen": -2.189763307571411,
"logits/rejected": -2.230834484100342,
"logps/chosen": -111.58349609375,
"logps/rejected": -116.3633041381836,
"loss": 132832.7375,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.0004227511235512793,
"rewards/margins": -0.00036064969026483595,
"rewards/rejected": -6.210146966623142e-05,
"step": 120
},
{
"epoch": 0.052610279239174426,
"grad_norm": 4839768.05175113,
"learning_rate": 2.6209677419354835e-07,
"logits/chosen": -2.172719717025757,
"logits/rejected": -2.154069423675537,
"logps/chosen": -131.54049682617188,
"logps/rejected": -127.31382751464844,
"loss": 126528.7375,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0012601370690390468,
"rewards/margins": 0.0014206544728949666,
"rewards/rejected": -0.0026807915419340134,
"step": 130
},
{
"epoch": 0.056657223796033995,
"grad_norm": 4462123.98520813,
"learning_rate": 2.8225806451612905e-07,
"logits/chosen": -2.2781708240509033,
"logits/rejected": -2.2529187202453613,
"logps/chosen": -109.3487319946289,
"logps/rejected": -108.7385025024414,
"loss": 128939.875,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.0024039470590651035,
"rewards/margins": 0.0019583911634981632,
"rewards/rejected": -0.004362338222563267,
"step": 140
},
{
"epoch": 0.060704168352893564,
"grad_norm": 4413918.737440498,
"learning_rate": 3.0241935483870965e-07,
"logits/chosen": -2.0262560844421387,
"logits/rejected": -2.0333077907562256,
"logps/chosen": -115.6955337524414,
"logps/rejected": -129.337890625,
"loss": 125950.125,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -3.777583970077103e-06,
"rewards/margins": -0.00044618500396609306,
"rewards/rejected": 0.0004424075596034527,
"step": 150
},
{
"epoch": 0.06475111290975313,
"grad_norm": 4956705.44191673,
"learning_rate": 3.225806451612903e-07,
"logits/chosen": -2.127880096435547,
"logits/rejected": -2.081531524658203,
"logps/chosen": -115.7586669921875,
"logps/rejected": -115.57160949707031,
"loss": 127159.3875,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.0029864097014069557,
"rewards/margins": 0.0028749522753059864,
"rewards/rejected": -0.005861361511051655,
"step": 160
},
{
"epoch": 0.0687980574666127,
"grad_norm": 5249631.129700843,
"learning_rate": 3.4274193548387095e-07,
"logits/chosen": -1.924232840538025,
"logits/rejected": -1.9467108249664307,
"logps/chosen": -130.4487762451172,
"logps/rejected": -133.85560607910156,
"loss": 125375.3875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.008038095198571682,
"rewards/margins": 0.002187486505135894,
"rewards/rejected": -0.01022558193653822,
"step": 170
},
{
"epoch": 0.07284500202347227,
"grad_norm": 4122924.0724422527,
"learning_rate": 3.629032258064516e-07,
"logits/chosen": -2.045342445373535,
"logits/rejected": -2.0415282249450684,
"logps/chosen": -118.37638854980469,
"logps/rejected": -112.38387298583984,
"loss": 126785.075,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.006844646297395229,
"rewards/margins": 0.0005232656258158386,
"rewards/rejected": -0.00736791267991066,
"step": 180
},
{
"epoch": 0.07689194658033185,
"grad_norm": 4041640.919843376,
"learning_rate": 3.8306451612903225e-07,
"logits/chosen": -2.0350680351257324,
"logits/rejected": -2.0383336544036865,
"logps/chosen": -96.37462615966797,
"logps/rejected": -109.77508544921875,
"loss": 123590.025,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.005209728144109249,
"rewards/margins": 0.0012681197840720415,
"rewards/rejected": -0.006477847695350647,
"step": 190
},
{
"epoch": 0.08093889113719142,
"grad_norm": 5009394.271837804,
"learning_rate": 4.0322580645161285e-07,
"logits/chosen": -2.0026402473449707,
"logits/rejected": -1.9795843362808228,
"logps/chosen": -111.41777038574219,
"logps/rejected": -113.9148178100586,
"loss": 126448.05,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.010595456697046757,
"rewards/margins": 0.003854970680549741,
"rewards/rejected": -0.01445042621344328,
"step": 200
},
{
"epoch": 0.08498583569405099,
"grad_norm": 5619272.0636549145,
"learning_rate": 4.2338709677419355e-07,
"logits/chosen": -2.1407713890075684,
"logits/rejected": -2.1545071601867676,
"logps/chosen": -110.00148010253906,
"logps/rejected": -112.6539077758789,
"loss": 128766.15,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.009294772520661354,
"rewards/margins": 0.0027922452427446842,
"rewards/rejected": -0.012087016366422176,
"step": 210
},
{
"epoch": 0.08903278025091056,
"grad_norm": 4415731.805933493,
"learning_rate": 4.4354838709677415e-07,
"logits/chosen": -2.3430728912353516,
"logits/rejected": -2.3087539672851562,
"logps/chosen": -131.23556518554688,
"logps/rejected": -134.5377655029297,
"loss": 132823.0125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.007352087646722794,
"rewards/margins": 0.004128883592784405,
"rewards/rejected": -0.011480971239507198,
"step": 220
},
{
"epoch": 0.09307972480777013,
"grad_norm": 5355155.642181672,
"learning_rate": 4.637096774193548e-07,
"logits/chosen": -2.242619752883911,
"logits/rejected": -2.2341275215148926,
"logps/chosen": -127.72953033447266,
"logps/rejected": -131.52122497558594,
"loss": 126450.9125,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.01162335742264986,
"rewards/margins": 0.005030062980949879,
"rewards/rejected": -0.01665342040359974,
"step": 230
},
{
"epoch": 0.0971266693646297,
"grad_norm": 4222976.278932744,
"learning_rate": 4.838709677419355e-07,
"logits/chosen": -2.2030246257781982,
"logits/rejected": -2.2015702724456787,
"logps/chosen": -108.4349365234375,
"logps/rejected": -110.720703125,
"loss": 133174.225,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.009825309738516808,
"rewards/margins": 0.003020837437361479,
"rewards/rejected": -0.012846146710216999,
"step": 240
},
{
"epoch": 0.10117361392148927,
"grad_norm": 5332420.53657513,
"learning_rate": 4.995501574448943e-07,
"logits/chosen": -2.1023497581481934,
"logits/rejected": -2.1095871925354004,
"logps/chosen": -110.6066665649414,
"logps/rejected": -117.8967056274414,
"loss": 127655.45,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.006971948780119419,
"rewards/margins": 0.0035569421015679836,
"rewards/rejected": -0.010528890416026115,
"step": 250
},
{
"epoch": 0.10522055847834885,
"grad_norm": 5679375.003121498,
"learning_rate": 4.973009446693657e-07,
"logits/chosen": -2.199481964111328,
"logits/rejected": -2.18941330909729,
"logps/chosen": -117.3616943359375,
"logps/rejected": -118.54112243652344,
"loss": 132409.6875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.013199470937252045,
"rewards/margins": 0.001887517748400569,
"rewards/rejected": -0.015086987987160683,
"step": 260
},
{
"epoch": 0.10926750303520842,
"grad_norm": 4610431.437626582,
"learning_rate": 4.950517318938372e-07,
"logits/chosen": -2.3225109577178955,
"logits/rejected": -2.3390707969665527,
"logps/chosen": -124.8027572631836,
"logps/rejected": -129.42922973632812,
"loss": 125030.1375,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.012582078576087952,
"rewards/margins": 0.0034210742451250553,
"rewards/rejected": -0.01600315235555172,
"step": 270
},
{
"epoch": 0.11331444759206799,
"grad_norm": 6556255.599818757,
"learning_rate": 4.928025191183086e-07,
"logits/chosen": -2.1859679222106934,
"logits/rejected": -2.2081589698791504,
"logps/chosen": -114.65242767333984,
"logps/rejected": -124.5443344116211,
"loss": 124704.9,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.016057247295975685,
"rewards/margins": 0.002059857128188014,
"rewards/rejected": -0.018117103725671768,
"step": 280
},
{
"epoch": 0.11736139214892756,
"grad_norm": 6805409.426599127,
"learning_rate": 4.9055330634278e-07,
"logits/chosen": -2.219308853149414,
"logits/rejected": -2.214458465576172,
"logps/chosen": -134.7147979736328,
"logps/rejected": -142.11083984375,
"loss": 127160.525,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.009862681850790977,
"rewards/margins": 0.005548264365643263,
"rewards/rejected": -0.015410944819450378,
"step": 290
},
{
"epoch": 0.12140833670578713,
"grad_norm": 6068819.6586095495,
"learning_rate": 4.883040935672515e-07,
"logits/chosen": -2.258293390274048,
"logits/rejected": -2.228738307952881,
"logps/chosen": -132.33441162109375,
"logps/rejected": -141.69737243652344,
"loss": 128142.7625,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.008503363467752934,
"rewards/margins": 0.007645074278116226,
"rewards/rejected": -0.016148436814546585,
"step": 300
},
{
"epoch": 0.1254552812626467,
"grad_norm": 5267448.920762056,
"learning_rate": 4.860548807917229e-07,
"logits/chosen": -2.2171027660369873,
"logits/rejected": -2.2104790210723877,
"logps/chosen": -125.05142974853516,
"logps/rejected": -133.34071350097656,
"loss": 125674.1,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.019455790519714355,
"rewards/margins": 0.0074443453922867775,
"rewards/rejected": -0.026900136843323708,
"step": 310
},
{
"epoch": 0.12950222581950627,
"grad_norm": 6667685.083680488,
"learning_rate": 4.838056680161944e-07,
"logits/chosen": -2.1860244274139404,
"logits/rejected": -2.2035775184631348,
"logps/chosen": -122.4665756225586,
"logps/rejected": -132.46490478515625,
"loss": 125480.4125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.017699861899018288,
"rewards/margins": 0.006000404246151447,
"rewards/rejected": -0.02370026707649231,
"step": 320
},
{
"epoch": 0.13354917037636585,
"grad_norm": 6064623.088294091,
"learning_rate": 4.815564552406658e-07,
"logits/chosen": -2.0421011447906494,
"logits/rejected": -2.057572603225708,
"logps/chosen": -134.10183715820312,
"logps/rejected": -144.3116912841797,
"loss": 124604.7875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.019064148887991905,
"rewards/margins": 0.006390860769897699,
"rewards/rejected": -0.025455012917518616,
"step": 330
},
{
"epoch": 0.1375961149332254,
"grad_norm": 12560788.46443257,
"learning_rate": 4.793072424651372e-07,
"logits/chosen": -1.9278684854507446,
"logits/rejected": -1.909166693687439,
"logps/chosen": -146.60585021972656,
"logps/rejected": -166.07937622070312,
"loss": 140379.8375,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0366508811712265,
"rewards/margins": 0.013624541461467743,
"rewards/rejected": -0.050275422632694244,
"step": 340
},
{
"epoch": 0.141643059490085,
"grad_norm": 6487628.638191885,
"learning_rate": 4.770580296896087e-07,
"logits/chosen": -2.11842679977417,
"logits/rejected": -2.1011595726013184,
"logps/chosen": -119.56195068359375,
"logps/rejected": -136.84823608398438,
"loss": 130511.3625,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0215927604585886,
"rewards/margins": 0.007697033230215311,
"rewards/rejected": -0.029289793223142624,
"step": 350
},
{
"epoch": 0.14569000404694454,
"grad_norm": 4799050.3946148325,
"learning_rate": 4.7480881691408005e-07,
"logits/chosen": -2.0867960453033447,
"logits/rejected": -2.082698345184326,
"logps/chosen": -128.99429321289062,
"logps/rejected": -130.89785766601562,
"loss": 127926.0,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.016279883682727814,
"rewards/margins": 0.0010558776557445526,
"rewards/rejected": -0.017335761338472366,
"step": 360
},
{
"epoch": 0.14973694860380413,
"grad_norm": 5249060.947314388,
"learning_rate": 4.725596041385515e-07,
"logits/chosen": -2.1251657009124756,
"logits/rejected": -2.1052744388580322,
"logps/chosen": -121.3799819946289,
"logps/rejected": -121.27701568603516,
"loss": 131676.2375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.017586207017302513,
"rewards/margins": 0.003813033225014806,
"rewards/rejected": -0.0213992390781641,
"step": 370
},
{
"epoch": 0.1537838931606637,
"grad_norm": 5293517.066249103,
"learning_rate": 4.7031039136302294e-07,
"logits/chosen": -2.15531587600708,
"logits/rejected": -2.153560161590576,
"logps/chosen": -159.96005249023438,
"logps/rejected": -153.87448120117188,
"loss": 121504.05,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.017819028347730637,
"rewards/margins": 0.008313515223562717,
"rewards/rejected": -0.02613254450261593,
"step": 380
},
{
"epoch": 0.15783083771752326,
"grad_norm": 5270002.835932803,
"learning_rate": 4.6806117858749433e-07,
"logits/chosen": -2.1870741844177246,
"logits/rejected": -2.171494483947754,
"logps/chosen": -148.86929321289062,
"logps/rejected": -164.97915649414062,
"loss": 129892.05,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.02027943730354309,
"rewards/margins": 0.0176930520683527,
"rewards/rejected": -0.03797249123454094,
"step": 390
},
{
"epoch": 0.16187778227438285,
"grad_norm": 4898003.511863262,
"learning_rate": 4.658119658119658e-07,
"logits/chosen": -2.1435184478759766,
"logits/rejected": -2.148679256439209,
"logps/chosen": -128.7902069091797,
"logps/rejected": -139.18150329589844,
"loss": 122692.925,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.018103724345564842,
"rewards/margins": 0.006611344870179892,
"rewards/rejected": -0.02471506968140602,
"step": 400
},
{
"epoch": 0.1659247268312424,
"grad_norm": 4183644.18979808,
"learning_rate": 4.635627530364372e-07,
"logits/chosen": -2.150381565093994,
"logits/rejected": -2.154317617416382,
"logps/chosen": -108.93717193603516,
"logps/rejected": -118.07032775878906,
"loss": 126758.0375,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.021253790706396103,
"rewards/margins": 0.003508577588945627,
"rewards/rejected": -0.024762369692325592,
"step": 410
},
{
"epoch": 0.16997167138810199,
"grad_norm": 5593714.467836783,
"learning_rate": 4.6131354026090867e-07,
"logits/chosen": -2.180170774459839,
"logits/rejected": -2.1523594856262207,
"logps/chosen": -126.38621520996094,
"logps/rejected": -136.35755920410156,
"loss": 121196.2,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.01847546175122261,
"rewards/margins": 0.0067595853470265865,
"rewards/rejected": -0.025235047563910484,
"step": 420
},
{
"epoch": 0.17401861594496154,
"grad_norm": 3566616.7901411816,
"learning_rate": 4.590643274853801e-07,
"logits/chosen": -2.120450258255005,
"logits/rejected": -2.150542974472046,
"logps/chosen": -137.63836669921875,
"logps/rejected": -141.17825317382812,
"loss": 132284.5875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.02999441884458065,
"rewards/margins": 0.0025890106335282326,
"rewards/rejected": -0.03258342668414116,
"step": 430
},
{
"epoch": 0.17806556050182112,
"grad_norm": 6039791.29116107,
"learning_rate": 4.568151147098515e-07,
"logits/chosen": -2.2097067832946777,
"logits/rejected": -2.1825873851776123,
"logps/chosen": -127.94209289550781,
"logps/rejected": -137.39776611328125,
"loss": 128589.475,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.023606717586517334,
"rewards/margins": 0.009609794244170189,
"rewards/rejected": -0.03321651369333267,
"step": 440
},
{
"epoch": 0.1821125050586807,
"grad_norm": 6343148.886392033,
"learning_rate": 4.54565901934323e-07,
"logits/chosen": -2.1717894077301025,
"logits/rejected": -2.2131998538970947,
"logps/chosen": -129.89688110351562,
"logps/rejected": -145.33839416503906,
"loss": 124381.275,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.020499037578701973,
"rewards/margins": 0.013838306069374084,
"rewards/rejected": -0.03433733806014061,
"step": 450
},
{
"epoch": 0.18615944961554026,
"grad_norm": 4981408.5070092585,
"learning_rate": 4.523166891587944e-07,
"logits/chosen": -2.2632086277008057,
"logits/rejected": -2.306267738342285,
"logps/chosen": -163.80706787109375,
"logps/rejected": -155.72915649414062,
"loss": 158881.6375,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.03987263888120651,
"rewards/margins": -0.008330432698130608,
"rewards/rejected": -0.03154221177101135,
"step": 460
},
{
"epoch": 0.19020639417239985,
"grad_norm": 6186406.38261752,
"learning_rate": 4.500674763832658e-07,
"logits/chosen": -2.4067013263702393,
"logits/rejected": -2.4073116779327393,
"logps/chosen": -123.8814697265625,
"logps/rejected": -133.23178100585938,
"loss": 129765.4625,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.01776200719177723,
"rewards/margins": 0.006926923058927059,
"rewards/rejected": -0.024688933044672012,
"step": 470
},
{
"epoch": 0.1942533387292594,
"grad_norm": 7924184.670127909,
"learning_rate": 4.478182636077373e-07,
"logits/chosen": -2.4064009189605713,
"logits/rejected": -2.3933303356170654,
"logps/chosen": -120.53520202636719,
"logps/rejected": -124.30986022949219,
"loss": 127188.5875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.013866530731320381,
"rewards/margins": 0.0045671057887375355,
"rewards/rejected": -0.01843363419175148,
"step": 480
},
{
"epoch": 0.19830028328611898,
"grad_norm": 6796881.168124855,
"learning_rate": 4.455690508322087e-07,
"logits/chosen": -2.35581636428833,
"logits/rejected": -2.276433229446411,
"logps/chosen": -113.40742492675781,
"logps/rejected": -126.89019775390625,
"loss": 122585.6875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.015190203674137592,
"rewards/margins": 0.010421663522720337,
"rewards/rejected": -0.025611868128180504,
"step": 490
},
{
"epoch": 0.20234722784297854,
"grad_norm": 9409785.188721178,
"learning_rate": 4.433198380566802e-07,
"logits/chosen": -2.200453519821167,
"logits/rejected": -2.2011332511901855,
"logps/chosen": -156.01809692382812,
"logps/rejected": -169.92514038085938,
"loss": 129704.3,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.025881418958306313,
"rewards/margins": 0.010276483371853828,
"rewards/rejected": -0.03615789860486984,
"step": 500
},
{
"epoch": 0.20639417239983812,
"grad_norm": 5757712.5781175345,
"learning_rate": 4.410706252811516e-07,
"logits/chosen": -2.127547025680542,
"logits/rejected": -2.1388392448425293,
"logps/chosen": -130.27249145507812,
"logps/rejected": -145.90647888183594,
"loss": 123361.8125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03281703591346741,
"rewards/margins": 0.009785661473870277,
"rewards/rejected": -0.042602695524692535,
"step": 510
},
{
"epoch": 0.2104411169566977,
"grad_norm": 5742087.523014036,
"learning_rate": 4.3882141250562297e-07,
"logits/chosen": -2.2757978439331055,
"logits/rejected": -2.2460601329803467,
"logps/chosen": -153.6471710205078,
"logps/rejected": -165.54989624023438,
"loss": 127158.9125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02743702568113804,
"rewards/margins": 0.017125947400927544,
"rewards/rejected": -0.04456297308206558,
"step": 520
},
{
"epoch": 0.21448806151355726,
"grad_norm": 6000988.402036818,
"learning_rate": 4.3657219973009447e-07,
"logits/chosen": -2.14945387840271,
"logits/rejected": -2.160613775253296,
"logps/chosen": -152.8687286376953,
"logps/rejected": -157.02215576171875,
"loss": 130855.4125,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.03712679445743561,
"rewards/margins": -0.002356339478865266,
"rewards/rejected": -0.03477045148611069,
"step": 530
},
{
"epoch": 0.21853500607041684,
"grad_norm": 7039581.88958706,
"learning_rate": 4.3432298695456586e-07,
"logits/chosen": -2.1952900886535645,
"logits/rejected": -2.125767946243286,
"logps/chosen": -121.56607818603516,
"logps/rejected": -136.8863983154297,
"loss": 124032.45,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.021455224603414536,
"rewards/margins": 0.01183997467160225,
"rewards/rejected": -0.03329520300030708,
"step": 540
},
{
"epoch": 0.2225819506272764,
"grad_norm": 6851510.087607766,
"learning_rate": 4.3207377417903736e-07,
"logits/chosen": -2.3099186420440674,
"logits/rejected": -2.2750840187072754,
"logps/chosen": -133.94058227539062,
"logps/rejected": -165.82687377929688,
"loss": 127159.35,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02089579775929451,
"rewards/margins": 0.011938202194869518,
"rewards/rejected": -0.0328340008854866,
"step": 550
},
{
"epoch": 0.22662889518413598,
"grad_norm": 7651455.0301742535,
"learning_rate": 4.2982456140350876e-07,
"logits/chosen": -2.281270980834961,
"logits/rejected": -2.291888475418091,
"logps/chosen": -139.83163452148438,
"logps/rejected": -141.5286865234375,
"loss": 130547.225,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.02193923108279705,
"rewards/margins": 0.0074460976757109165,
"rewards/rejected": -0.02938532829284668,
"step": 560
},
{
"epoch": 0.23067583974099554,
"grad_norm": 4842418.287253727,
"learning_rate": 4.2757534862798015e-07,
"logits/chosen": -2.28908634185791,
"logits/rejected": -2.2613823413848877,
"logps/chosen": -130.56756591796875,
"logps/rejected": -136.48858642578125,
"loss": 129810.7,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.018774276599287987,
"rewards/margins": 0.012544331140816212,
"rewards/rejected": -0.03131860867142677,
"step": 570
},
{
"epoch": 0.23472278429785512,
"grad_norm": 5753286.585250832,
"learning_rate": 4.2532613585245165e-07,
"logits/chosen": -2.3290882110595703,
"logits/rejected": -2.2913310527801514,
"logps/chosen": -128.60073852539062,
"logps/rejected": -144.4147491455078,
"loss": 125407.5625,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.022057032212615013,
"rewards/margins": 0.013317006640136242,
"rewards/rejected": -0.03537403792142868,
"step": 580
},
{
"epoch": 0.2387697288547147,
"grad_norm": 6854533.186683347,
"learning_rate": 4.2307692307692304e-07,
"logits/chosen": -2.1821513175964355,
"logits/rejected": -2.227368116378784,
"logps/chosen": -132.9744873046875,
"logps/rejected": -143.91380310058594,
"loss": 119907.075,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.024536501616239548,
"rewards/margins": 0.00792471133172512,
"rewards/rejected": -0.03246121481060982,
"step": 590
},
{
"epoch": 0.24281667341157426,
"grad_norm": 7000163.800918494,
"learning_rate": 4.208277103013945e-07,
"logits/chosen": -2.2966506481170654,
"logits/rejected": -2.274991989135742,
"logps/chosen": -140.1864776611328,
"logps/rejected": -142.9268798828125,
"loss": 129494.7625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.026311520487070084,
"rewards/margins": 0.005165449343621731,
"rewards/rejected": -0.03147696703672409,
"step": 600
},
{
"epoch": 0.24686361796843384,
"grad_norm": 5155538.44716785,
"learning_rate": 4.1857849752586593e-07,
"logits/chosen": -2.2126269340515137,
"logits/rejected": -2.2339818477630615,
"logps/chosen": -143.7578125,
"logps/rejected": -148.81027221679688,
"loss": 131088.325,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.022633636370301247,
"rewards/margins": 0.005649174097925425,
"rewards/rejected": -0.028282811865210533,
"step": 610
},
{
"epoch": 0.2509105625252934,
"grad_norm": 6494761.148749808,
"learning_rate": 4.1632928475033733e-07,
"logits/chosen": -2.2412619590759277,
"logits/rejected": -2.215108633041382,
"logps/chosen": -133.82061767578125,
"logps/rejected": -144.2487030029297,
"loss": 127834.35,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.023778211325407028,
"rewards/margins": 0.008780455216765404,
"rewards/rejected": -0.03255866840481758,
"step": 620
},
{
"epoch": 0.254957507082153,
"grad_norm": 6581411.527197339,
"learning_rate": 4.140800719748088e-07,
"logits/chosen": -2.3006882667541504,
"logits/rejected": -2.279165744781494,
"logps/chosen": -127.95011901855469,
"logps/rejected": -144.5232696533203,
"loss": 128899.5125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.017114771530032158,
"rewards/margins": 0.012585528194904327,
"rewards/rejected": -0.029700294137001038,
"step": 630
},
{
"epoch": 0.25900445163901253,
"grad_norm": 6993144.620436077,
"learning_rate": 4.118308591992802e-07,
"logits/chosen": -2.288159132003784,
"logits/rejected": -2.27152681350708,
"logps/chosen": -116.51515197753906,
"logps/rejected": -134.83572387695312,
"loss": 122510.6375,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.014996351674199104,
"rewards/margins": 0.018196506425738335,
"rewards/rejected": -0.03319285809993744,
"step": 640
},
{
"epoch": 0.2630513961958721,
"grad_norm": 5352708.94864355,
"learning_rate": 4.0958164642375167e-07,
"logits/chosen": -2.33659029006958,
"logits/rejected": -2.3185806274414062,
"logps/chosen": -143.27899169921875,
"logps/rejected": -154.21240234375,
"loss": 128047.15,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.022664163261651993,
"rewards/margins": 0.016272926703095436,
"rewards/rejected": -0.03893708810210228,
"step": 650
},
{
"epoch": 0.2670983407527317,
"grad_norm": 5853928.770602061,
"learning_rate": 4.073324336482231e-07,
"logits/chosen": -2.2209713459014893,
"logits/rejected": -2.197364091873169,
"logps/chosen": -154.97152709960938,
"logps/rejected": -164.9137725830078,
"loss": 126285.6125,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.02928345464169979,
"rewards/margins": 0.017678027972579002,
"rewards/rejected": -0.046961478888988495,
"step": 660
},
{
"epoch": 0.27114528530959126,
"grad_norm": 5468563.620033422,
"learning_rate": 4.0508322087269456e-07,
"logits/chosen": -2.368302822113037,
"logits/rejected": -2.359222888946533,
"logps/chosen": -138.3487091064453,
"logps/rejected": -131.19773864746094,
"loss": 135010.325,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.022097600623965263,
"rewards/margins": -0.0010355912381783128,
"rewards/rejected": -0.021062009036540985,
"step": 670
},
{
"epoch": 0.2751922298664508,
"grad_norm": 5145007.282508669,
"learning_rate": 4.02834008097166e-07,
"logits/chosen": -2.2279224395751953,
"logits/rejected": -2.227818250656128,
"logps/chosen": -151.80599975585938,
"logps/rejected": -155.39369201660156,
"loss": 124851.875,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.023444540798664093,
"rewards/margins": 0.006362411193549633,
"rewards/rejected": -0.0298069529235363,
"step": 680
},
{
"epoch": 0.2792391744233104,
"grad_norm": 5800338.778716969,
"learning_rate": 4.005847953216374e-07,
"logits/chosen": -2.3348867893218994,
"logits/rejected": -2.3266310691833496,
"logps/chosen": -125.41386413574219,
"logps/rejected": -131.49343872070312,
"loss": 127372.8125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.021091241389513016,
"rewards/margins": 0.00724734365940094,
"rewards/rejected": -0.028338585048913956,
"step": 690
},
{
"epoch": 0.28328611898017,
"grad_norm": 8105894.218684362,
"learning_rate": 3.9833558254610884e-07,
"logits/chosen": -2.309593677520752,
"logits/rejected": -2.2981934547424316,
"logps/chosen": -132.08596801757812,
"logps/rejected": -137.70201110839844,
"loss": 124781.725,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.021190345287322998,
"rewards/margins": 0.0066665285266935825,
"rewards/rejected": -0.027856875211000443,
"step": 700
},
{
"epoch": 0.28733306353702953,
"grad_norm": 5039380.169629858,
"learning_rate": 3.960863697705803e-07,
"logits/chosen": -2.315074920654297,
"logits/rejected": -2.3194656372070312,
"logps/chosen": -147.24713134765625,
"logps/rejected": -158.99636840820312,
"loss": 128105.9625,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.018308859318494797,
"rewards/margins": 0.00702436501160264,
"rewards/rejected": -0.0253332257270813,
"step": 710
},
{
"epoch": 0.2913800080938891,
"grad_norm": 6770507.385238732,
"learning_rate": 3.9383715699505173e-07,
"logits/chosen": -2.3582499027252197,
"logits/rejected": -2.307143211364746,
"logps/chosen": -141.00454711914062,
"logps/rejected": -145.4442901611328,
"loss": 128073.85,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.02170463278889656,
"rewards/margins": 0.004128533415496349,
"rewards/rejected": -0.025833168998360634,
"step": 720
},
{
"epoch": 0.2954269526507487,
"grad_norm": 6843599.0452563455,
"learning_rate": 3.9158794421952313e-07,
"logits/chosen": -2.2773690223693848,
"logits/rejected": -2.2705273628234863,
"logps/chosen": -127.78352355957031,
"logps/rejected": -128.8694305419922,
"loss": 133363.3375,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.019749607890844345,
"rewards/margins": 0.0013956364709883928,
"rewards/rejected": -0.02114524319767952,
"step": 730
},
{
"epoch": 0.29947389720760825,
"grad_norm": 6414207.014030725,
"learning_rate": 3.893387314439946e-07,
"logits/chosen": -2.2219457626342773,
"logits/rejected": -2.1614620685577393,
"logps/chosen": -138.95530700683594,
"logps/rejected": -159.24916076660156,
"loss": 125832.575,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.01657973788678646,
"rewards/margins": 0.015021143481135368,
"rewards/rejected": -0.03160088509321213,
"step": 740
},
{
"epoch": 0.3035208417644678,
"grad_norm": 6251391.785537995,
"learning_rate": 3.87089518668466e-07,
"logits/chosen": -2.216029167175293,
"logits/rejected": -2.2095859050750732,
"logps/chosen": -139.25477600097656,
"logps/rejected": -146.38613891601562,
"loss": 126431.6625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02343796379864216,
"rewards/margins": 0.012653100304305553,
"rewards/rejected": -0.03609105944633484,
"step": 750
},
{
"epoch": 0.3075677863213274,
"grad_norm": 5534957.115190962,
"learning_rate": 3.8484030589293747e-07,
"logits/chosen": -2.2073702812194824,
"logits/rejected": -2.209057569503784,
"logps/chosen": -130.53199768066406,
"logps/rejected": -137.91250610351562,
"loss": 127669.2,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.01518569327890873,
"rewards/margins": 0.007037720177322626,
"rewards/rejected": -0.02222341299057007,
"step": 760
},
{
"epoch": 0.311614730878187,
"grad_norm": 4890109.310049175,
"learning_rate": 3.825910931174089e-07,
"logits/chosen": -2.225956678390503,
"logits/rejected": -2.210540294647217,
"logps/chosen": -127.26595306396484,
"logps/rejected": -133.6049041748047,
"loss": 124534.6875,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02195551246404648,
"rewards/margins": 0.006126697175204754,
"rewards/rejected": -0.02808220684528351,
"step": 770
},
{
"epoch": 0.31566167543504653,
"grad_norm": 6427608.185533696,
"learning_rate": 3.803418803418803e-07,
"logits/chosen": -2.2634310722351074,
"logits/rejected": -2.245199203491211,
"logps/chosen": -137.40240478515625,
"logps/rejected": -143.7775115966797,
"loss": 129704.1875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.022509312257170677,
"rewards/margins": 0.003963272087275982,
"rewards/rejected": -0.026472587138414383,
"step": 780
},
{
"epoch": 0.3197086199919061,
"grad_norm": 6730619.873094239,
"learning_rate": 3.7809266756635175e-07,
"logits/chosen": -2.1104772090911865,
"logits/rejected": -2.0919671058654785,
"logps/chosen": -125.5869369506836,
"logps/rejected": -133.48800659179688,
"loss": 125677.675,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.024079788476228714,
"rewards/margins": 0.007324723992496729,
"rewards/rejected": -0.0314045175909996,
"step": 790
},
{
"epoch": 0.3237555645487657,
"grad_norm": 6156066.531026818,
"learning_rate": 3.758434547908232e-07,
"logits/chosen": -2.213543176651001,
"logits/rejected": -2.1960647106170654,
"logps/chosen": -145.46665954589844,
"logps/rejected": -159.2154541015625,
"loss": 121552.525,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.021815134212374687,
"rewards/margins": 0.014243106357753277,
"rewards/rejected": -0.03605823963880539,
"step": 800
},
{
"epoch": 0.32780250910562525,
"grad_norm": 6503545.886073305,
"learning_rate": 3.735942420152946e-07,
"logits/chosen": -2.120095729827881,
"logits/rejected": -2.0986738204956055,
"logps/chosen": -134.55508422851562,
"logps/rejected": -152.37815856933594,
"loss": 122828.6875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.025746628642082214,
"rewards/margins": 0.01750759594142437,
"rewards/rejected": -0.043254222720861435,
"step": 810
},
{
"epoch": 0.3318494536624848,
"grad_norm": 5263993.227861122,
"learning_rate": 3.713450292397661e-07,
"logits/chosen": -2.236570358276367,
"logits/rejected": -2.216663360595703,
"logps/chosen": -137.65792846679688,
"logps/rejected": -137.9815673828125,
"loss": 125940.1375,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.025968383997678757,
"rewards/margins": 0.009893245995044708,
"rewards/rejected": -0.03586163371801376,
"step": 820
},
{
"epoch": 0.3358963982193444,
"grad_norm": 5564470.498348531,
"learning_rate": 3.690958164642375e-07,
"logits/chosen": -2.2721188068389893,
"logits/rejected": -2.2634165287017822,
"logps/chosen": -146.41432189941406,
"logps/rejected": -148.6261749267578,
"loss": 130783.825,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.028409641236066818,
"rewards/margins": 0.013621616177260876,
"rewards/rejected": -0.04203125834465027,
"step": 830
},
{
"epoch": 0.33994334277620397,
"grad_norm": 4256533.420167086,
"learning_rate": 3.66846603688709e-07,
"logits/chosen": -2.355905532836914,
"logits/rejected": -2.3262717723846436,
"logps/chosen": -135.9013671875,
"logps/rejected": -144.34478759765625,
"loss": 126088.525,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.02681833505630493,
"rewards/margins": 0.009647052735090256,
"rewards/rejected": -0.03646538779139519,
"step": 840
},
{
"epoch": 0.3439902873330635,
"grad_norm": 6179484.090448809,
"learning_rate": 3.645973909131804e-07,
"logits/chosen": -2.2327308654785156,
"logits/rejected": -2.194852828979492,
"logps/chosen": -131.39376831054688,
"logps/rejected": -155.78671264648438,
"loss": 125825.075,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02463443949818611,
"rewards/margins": 0.01199124101549387,
"rewards/rejected": -0.03662567585706711,
"step": 850
},
{
"epoch": 0.3480372318899231,
"grad_norm": 5448182.802456733,
"learning_rate": 3.6234817813765177e-07,
"logits/chosen": -2.2509052753448486,
"logits/rejected": -2.2193102836608887,
"logps/chosen": -131.55270385742188,
"logps/rejected": -144.63186645507812,
"loss": 130804.4625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.019251421093940735,
"rewards/margins": 0.008828094229102135,
"rewards/rejected": -0.02807951346039772,
"step": 860
},
{
"epoch": 0.3520841764467827,
"grad_norm": 4837603.177346373,
"learning_rate": 3.6009896536212327e-07,
"logits/chosen": -2.433258056640625,
"logits/rejected": -2.404008150100708,
"logps/chosen": -135.0418243408203,
"logps/rejected": -134.2267303466797,
"loss": 122885.925,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.017599385231733322,
"rewards/margins": 0.0028229092713445425,
"rewards/rejected": -0.02042229473590851,
"step": 870
},
{
"epoch": 0.35613112100364225,
"grad_norm": 5692190.523993364,
"learning_rate": 3.5784975258659466e-07,
"logits/chosen": -2.372664213180542,
"logits/rejected": -2.4038546085357666,
"logps/chosen": -145.62425231933594,
"logps/rejected": -161.4815216064453,
"loss": 125917.475,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.021967049688100815,
"rewards/margins": 0.006153530441224575,
"rewards/rejected": -0.028120581060647964,
"step": 880
},
{
"epoch": 0.3601780655605018,
"grad_norm": 5143465.092976311,
"learning_rate": 3.5560053981106616e-07,
"logits/chosen": -2.4298062324523926,
"logits/rejected": -2.441378116607666,
"logps/chosen": -114.91922760009766,
"logps/rejected": -128.6441192626953,
"loss": 125689.425,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02063891664147377,
"rewards/margins": 0.0056837559677660465,
"rewards/rejected": -0.02632267400622368,
"step": 890
},
{
"epoch": 0.3642250101173614,
"grad_norm": 7348686.998660731,
"learning_rate": 3.5335132703553755e-07,
"logits/chosen": -2.3470609188079834,
"logits/rejected": -2.338306427001953,
"logps/chosen": -142.22169494628906,
"logps/rejected": -155.0847625732422,
"loss": 127013.675,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.021488003432750702,
"rewards/margins": 0.00833697896450758,
"rewards/rejected": -0.029824981465935707,
"step": 900
},
{
"epoch": 0.36827195467422097,
"grad_norm": 5455491.748505866,
"learning_rate": 3.5110211426000895e-07,
"logits/chosen": -2.328141689300537,
"logits/rejected": -2.300947666168213,
"logps/chosen": -144.3502655029297,
"logps/rejected": -160.5428466796875,
"loss": 132699.2,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.019702186807990074,
"rewards/margins": 0.012580236420035362,
"rewards/rejected": -0.032282426953315735,
"step": 910
},
{
"epoch": 0.3723188992310805,
"grad_norm": 5918454.321642784,
"learning_rate": 3.4885290148448044e-07,
"logits/chosen": -2.2618203163146973,
"logits/rejected": -2.273591995239258,
"logps/chosen": -140.4850616455078,
"logps/rejected": -144.3483123779297,
"loss": 126713.925,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02276991680264473,
"rewards/margins": 0.008334951475262642,
"rewards/rejected": -0.03110486827790737,
"step": 920
},
{
"epoch": 0.3763658437879401,
"grad_norm": 7045240.388394526,
"learning_rate": 3.4660368870895184e-07,
"logits/chosen": -2.3371381759643555,
"logits/rejected": -2.3148632049560547,
"logps/chosen": -141.95742797851562,
"logps/rejected": -160.81312561035156,
"loss": 124856.1375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.02805008552968502,
"rewards/margins": 0.014153921976685524,
"rewards/rejected": -0.042204007506370544,
"step": 930
},
{
"epoch": 0.3804127883447997,
"grad_norm": 5526632.5094240755,
"learning_rate": 3.443544759334233e-07,
"logits/chosen": -2.3289644718170166,
"logits/rejected": -2.3078227043151855,
"logps/chosen": -151.3744354248047,
"logps/rejected": -153.30511474609375,
"loss": 126556.475,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.02584829553961754,
"rewards/margins": 0.005777581594884396,
"rewards/rejected": -0.03162587806582451,
"step": 940
},
{
"epoch": 0.38445973290165925,
"grad_norm": 6075148.892704811,
"learning_rate": 3.4210526315789473e-07,
"logits/chosen": -2.2021899223327637,
"logits/rejected": -2.199693441390991,
"logps/chosen": -126.38993072509766,
"logps/rejected": -135.55516052246094,
"loss": 130061.4375,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.02466515824198723,
"rewards/margins": 0.0069196284748613834,
"rewards/rejected": -0.03158479183912277,
"step": 950
},
{
"epoch": 0.3885066774585188,
"grad_norm": 5994722.402682892,
"learning_rate": 3.398560503823661e-07,
"logits/chosen": -2.375749349594116,
"logits/rejected": -2.350696086883545,
"logps/chosen": -135.23800659179688,
"logps/rejected": -143.55755615234375,
"loss": 130424.2625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.020815346390008926,
"rewards/margins": 0.009448934346437454,
"rewards/rejected": -0.03026428259909153,
"step": 960
},
{
"epoch": 0.3925536220153784,
"grad_norm": 6955347.020210707,
"learning_rate": 3.376068376068376e-07,
"logits/chosen": -2.41917085647583,
"logits/rejected": -2.3552684783935547,
"logps/chosen": -133.61973571777344,
"logps/rejected": -150.07192993164062,
"loss": 126116.675,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.023608971387147903,
"rewards/margins": 0.013977563008666039,
"rewards/rejected": -0.03758653253316879,
"step": 970
},
{
"epoch": 0.39660056657223797,
"grad_norm": 7026469.492711891,
"learning_rate": 3.35357624831309e-07,
"logits/chosen": -2.472712993621826,
"logits/rejected": -2.4403810501098633,
"logps/chosen": -144.36697387695312,
"logps/rejected": -160.89016723632812,
"loss": 125593.175,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0269068144261837,
"rewards/margins": 0.013755050487816334,
"rewards/rejected": -0.04066186398267746,
"step": 980
},
{
"epoch": 0.4006475111290975,
"grad_norm": 5279874.300128242,
"learning_rate": 3.3310841205578046e-07,
"logits/chosen": -2.3706583976745605,
"logits/rejected": -2.362694263458252,
"logps/chosen": -130.1677703857422,
"logps/rejected": -150.29214477539062,
"loss": 122425.7125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02780333161354065,
"rewards/margins": 0.010212745517492294,
"rewards/rejected": -0.038016077131032944,
"step": 990
},
{
"epoch": 0.4046944556859571,
"grad_norm": 7346421.9033947745,
"learning_rate": 3.308591992802519e-07,
"logits/chosen": -2.3910489082336426,
"logits/rejected": -2.360917091369629,
"logps/chosen": -134.7192840576172,
"logps/rejected": -145.80410766601562,
"loss": 120740.6875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.02175028808414936,
"rewards/margins": 0.012434590607881546,
"rewards/rejected": -0.03418487682938576,
"step": 1000
},
{
"epoch": 0.4087414002428167,
"grad_norm": 6266220.786790677,
"learning_rate": 3.286099865047233e-07,
"logits/chosen": -2.258577585220337,
"logits/rejected": -2.278409957885742,
"logps/chosen": -134.9305877685547,
"logps/rejected": -154.0025177001953,
"loss": 127529.4875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.020327283069491386,
"rewards/margins": 0.010549711063504219,
"rewards/rejected": -0.030876994132995605,
"step": 1010
},
{
"epoch": 0.41278834479967624,
"grad_norm": 6569607.008702802,
"learning_rate": 3.263607737291948e-07,
"logits/chosen": -2.2700555324554443,
"logits/rejected": -2.2413737773895264,
"logps/chosen": -145.36404418945312,
"logps/rejected": -159.6646270751953,
"loss": 129882.5125,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.020192014053463936,
"rewards/margins": 0.008731147274374962,
"rewards/rejected": -0.0289231576025486,
"step": 1020
},
{
"epoch": 0.4168352893565358,
"grad_norm": 6604946.805112461,
"learning_rate": 3.241115609536662e-07,
"logits/chosen": -2.2907137870788574,
"logits/rejected": -2.2590463161468506,
"logps/chosen": -148.82757568359375,
"logps/rejected": -158.55458068847656,
"loss": 123561.0125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.02791331335902214,
"rewards/margins": 0.01067260093986988,
"rewards/rejected": -0.03858591616153717,
"step": 1030
},
{
"epoch": 0.4208822339133954,
"grad_norm": 7819744.1215137215,
"learning_rate": 3.2186234817813764e-07,
"logits/chosen": -2.3501906394958496,
"logits/rejected": -2.382286548614502,
"logps/chosen": -145.73556518554688,
"logps/rejected": -145.95733642578125,
"loss": 125984.175,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.024780739098787308,
"rewards/margins": 0.01095888763666153,
"rewards/rejected": -0.03573962673544884,
"step": 1040
},
{
"epoch": 0.42492917847025496,
"grad_norm": 8329276.874833419,
"learning_rate": 3.196131354026091e-07,
"logits/chosen": -2.3430895805358887,
"logits/rejected": -2.2940633296966553,
"logps/chosen": -156.47262573242188,
"logps/rejected": -172.1248321533203,
"loss": 127542.5875,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.026906628161668777,
"rewards/margins": 0.02299944870173931,
"rewards/rejected": -0.04990607872605324,
"step": 1050
},
{
"epoch": 0.4289761230271145,
"grad_norm": 5411023.24022027,
"learning_rate": 3.1736392262708053e-07,
"logits/chosen": -2.350010633468628,
"logits/rejected": -2.348132610321045,
"logps/chosen": -134.76171875,
"logps/rejected": -165.0662384033203,
"loss": 124288.825,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.026082511991262436,
"rewards/margins": 0.02408730424940586,
"rewards/rejected": -0.05016981437802315,
"step": 1060
},
{
"epoch": 0.4330230675839741,
"grad_norm": 6438956.528553201,
"learning_rate": 3.151147098515519e-07,
"logits/chosen": -2.403751850128174,
"logits/rejected": -2.391162395477295,
"logps/chosen": -133.4490966796875,
"logps/rejected": -145.66363525390625,
"loss": 122699.675,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.023318186402320862,
"rewards/margins": 0.010946491733193398,
"rewards/rejected": -0.03426467627286911,
"step": 1070
},
{
"epoch": 0.4370700121408337,
"grad_norm": 5922234.372544115,
"learning_rate": 3.1286549707602337e-07,
"logits/chosen": -2.2476916313171387,
"logits/rejected": -2.2207980155944824,
"logps/chosen": -142.42433166503906,
"logps/rejected": -152.08865356445312,
"loss": 123837.95,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.028287163004279137,
"rewards/margins": 0.015818050131201744,
"rewards/rejected": -0.04410521313548088,
"step": 1080
},
{
"epoch": 0.44111695669769324,
"grad_norm": 5883039.362660401,
"learning_rate": 3.106162843004948e-07,
"logits/chosen": -2.3883793354034424,
"logits/rejected": -2.3448328971862793,
"logps/chosen": -135.72998046875,
"logps/rejected": -153.7415008544922,
"loss": 124484.025,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.026858652010560036,
"rewards/margins": 0.02032136358320713,
"rewards/rejected": -0.04718000814318657,
"step": 1090
},
{
"epoch": 0.4451639012545528,
"grad_norm": 7158679.357876272,
"learning_rate": 3.0836707152496626e-07,
"logits/chosen": -2.3638851642608643,
"logits/rejected": -2.3225362300872803,
"logps/chosen": -145.95896911621094,
"logps/rejected": -169.71176147460938,
"loss": 130674.3625,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.025300731882452965,
"rewards/margins": 0.016643613576889038,
"rewards/rejected": -0.04194434732198715,
"step": 1100
},
{
"epoch": 0.4492108458114124,
"grad_norm": 7065276.414180166,
"learning_rate": 3.061178587494377e-07,
"logits/chosen": -2.3456435203552246,
"logits/rejected": -2.3152847290039062,
"logps/chosen": -126.73854064941406,
"logps/rejected": -143.69662475585938,
"loss": 127769.775,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.01884400099515915,
"rewards/margins": 0.015868009999394417,
"rewards/rejected": -0.03471200913190842,
"step": 1110
},
{
"epoch": 0.45325779036827196,
"grad_norm": 8872861.336008936,
"learning_rate": 3.038686459739091e-07,
"logits/chosen": -2.3893070220947266,
"logits/rejected": -2.379615068435669,
"logps/chosen": -135.2264404296875,
"logps/rejected": -147.5668487548828,
"loss": 121978.65,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.025070184841752052,
"rewards/margins": 0.01242685504257679,
"rewards/rejected": -0.037497036159038544,
"step": 1120
},
{
"epoch": 0.4573047349251315,
"grad_norm": 4362461.84620477,
"learning_rate": 3.0161943319838055e-07,
"logits/chosen": -2.3373289108276367,
"logits/rejected": -2.3283610343933105,
"logps/chosen": -113.62736511230469,
"logps/rejected": -132.5222930908203,
"loss": 122763.6,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.027866637334227562,
"rewards/margins": 0.010607337579131126,
"rewards/rejected": -0.03847397491335869,
"step": 1130
},
{
"epoch": 0.4613516794819911,
"grad_norm": 6441902.5854437305,
"learning_rate": 2.99370220422852e-07,
"logits/chosen": -2.4195449352264404,
"logits/rejected": -2.421095848083496,
"logps/chosen": -138.25782775878906,
"logps/rejected": -152.2244110107422,
"loss": 128506.9875,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.01849151961505413,
"rewards/margins": 0.007775165140628815,
"rewards/rejected": -0.026266688480973244,
"step": 1140
},
{
"epoch": 0.4653986240388507,
"grad_norm": 7047614.500405596,
"learning_rate": 2.971210076473234e-07,
"logits/chosen": -2.4957115650177,
"logits/rejected": -2.4509148597717285,
"logps/chosen": -137.063720703125,
"logps/rejected": -144.99172973632812,
"loss": 121503.125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.018852662295103073,
"rewards/margins": 0.013833269476890564,
"rewards/rejected": -0.03268593177199364,
"step": 1150
},
{
"epoch": 0.46944556859571024,
"grad_norm": 6473613.10465131,
"learning_rate": 2.948717948717949e-07,
"logits/chosen": -2.508885145187378,
"logits/rejected": -2.4511702060699463,
"logps/chosen": -144.10775756835938,
"logps/rejected": -154.82540893554688,
"loss": 129245.7125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.025161290541291237,
"rewards/margins": 0.008113402873277664,
"rewards/rejected": -0.03327469527721405,
"step": 1160
},
{
"epoch": 0.4734925131525698,
"grad_norm": 6225189.741747939,
"learning_rate": 2.926225820962663e-07,
"logits/chosen": -2.5162465572357178,
"logits/rejected": -2.526261568069458,
"logps/chosen": -134.27059936523438,
"logps/rejected": -153.3767852783203,
"loss": 129228.5,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.020399171859025955,
"rewards/margins": 0.01304579060524702,
"rewards/rejected": -0.03344495967030525,
"step": 1170
},
{
"epoch": 0.4775394577094294,
"grad_norm": 6686211.632899741,
"learning_rate": 2.903733693207377e-07,
"logits/chosen": -2.500845432281494,
"logits/rejected": -2.4776079654693604,
"logps/chosen": -139.28172302246094,
"logps/rejected": -162.630126953125,
"loss": 127296.0,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.021778758615255356,
"rewards/margins": 0.016411086544394493,
"rewards/rejected": -0.038189847022295,
"step": 1180
},
{
"epoch": 0.48158640226628896,
"grad_norm": 7327493.262833852,
"learning_rate": 2.8812415654520917e-07,
"logits/chosen": -2.5000369548797607,
"logits/rejected": -2.4850993156433105,
"logps/chosen": -133.94406127929688,
"logps/rejected": -149.57473754882812,
"loss": 130669.6,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02554786205291748,
"rewards/margins": 0.0159430094063282,
"rewards/rejected": -0.04149087145924568,
"step": 1190
},
{
"epoch": 0.4856333468231485,
"grad_norm": 8827001.924529044,
"learning_rate": 2.8587494376968056e-07,
"logits/chosen": -2.411595582962036,
"logits/rejected": -2.4169204235076904,
"logps/chosen": -130.80325317382812,
"logps/rejected": -142.50013732910156,
"loss": 121415.625,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.019139764830470085,
"rewards/margins": 0.018100781366229057,
"rewards/rejected": -0.03724054619669914,
"step": 1200
},
{
"epoch": 0.48968029138000807,
"grad_norm": 5557571.235890019,
"learning_rate": 2.8362573099415206e-07,
"logits/chosen": -2.538846015930176,
"logits/rejected": -2.502953052520752,
"logps/chosen": -134.97543334960938,
"logps/rejected": -142.17556762695312,
"loss": 118867.85,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.013044399209320545,
"rewards/margins": 0.011013238690793514,
"rewards/rejected": -0.02405763790011406,
"step": 1210
},
{
"epoch": 0.4937272359368677,
"grad_norm": 6914411.411931411,
"learning_rate": 2.8137651821862346e-07,
"logits/chosen": -2.389519691467285,
"logits/rejected": -2.3551812171936035,
"logps/chosen": -138.05276489257812,
"logps/rejected": -160.35946655273438,
"loss": 127666.15,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.025238817557692528,
"rewards/margins": 0.02308265119791031,
"rewards/rejected": -0.048321474343538284,
"step": 1220
},
{
"epoch": 0.49777418049372724,
"grad_norm": 5609340.931669485,
"learning_rate": 2.7912730544309496e-07,
"logits/chosen": -2.4456872940063477,
"logits/rejected": -2.410588026046753,
"logps/chosen": -152.31521606445312,
"logps/rejected": -166.41482543945312,
"loss": 126962.65,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.029917621985077858,
"rewards/margins": 0.013386559672653675,
"rewards/rejected": -0.04330417513847351,
"step": 1230
},
{
"epoch": 0.5018211250505868,
"grad_norm": 5540227.088123434,
"learning_rate": 2.7687809266756635e-07,
"logits/chosen": -2.3677735328674316,
"logits/rejected": -2.354952335357666,
"logps/chosen": -126.54608154296875,
"logps/rejected": -145.7926483154297,
"loss": 127253.85,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.018986444920301437,
"rewards/margins": 0.016626928001642227,
"rewards/rejected": -0.035613369196653366,
"step": 1240
},
{
"epoch": 0.5058680696074463,
"grad_norm": 8562833.744693786,
"learning_rate": 2.7462887989203774e-07,
"logits/chosen": -2.344916820526123,
"logits/rejected": -2.312051296234131,
"logps/chosen": -138.89639282226562,
"logps/rejected": -144.3848876953125,
"loss": 134452.375,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.02468470111489296,
"rewards/margins": 0.008161008358001709,
"rewards/rejected": -0.03284571319818497,
"step": 1250
},
{
"epoch": 0.509915014164306,
"grad_norm": 5502511.320429619,
"learning_rate": 2.7237966711650924e-07,
"logits/chosen": -2.283324718475342,
"logits/rejected": -2.2585034370422363,
"logps/chosen": -142.3020782470703,
"logps/rejected": -157.03909301757812,
"loss": 126080.5375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02461249753832817,
"rewards/margins": 0.02032613940536976,
"rewards/rejected": -0.04493863508105278,
"step": 1260
},
{
"epoch": 0.5139619587211656,
"grad_norm": 8682050.76660753,
"learning_rate": 2.7013045434098063e-07,
"logits/chosen": -2.275059223175049,
"logits/rejected": -2.2415084838867188,
"logps/chosen": -140.72640991210938,
"logps/rejected": -159.70445251464844,
"loss": 128182.575,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.033078595995903015,
"rewards/margins": 0.01954609341919422,
"rewards/rejected": -0.052624695003032684,
"step": 1270
},
{
"epoch": 0.5180089032780251,
"grad_norm": 9710402.98065158,
"learning_rate": 2.678812415654521e-07,
"logits/chosen": -2.2827441692352295,
"logits/rejected": -2.2365641593933105,
"logps/chosen": -159.8201446533203,
"logps/rejected": -167.04348754882812,
"loss": 126161.6,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0369146391749382,
"rewards/margins": 0.009988631121814251,
"rewards/rejected": -0.04690327122807503,
"step": 1280
},
{
"epoch": 0.5220558478348847,
"grad_norm": 5687965.997013683,
"learning_rate": 2.656320287899235e-07,
"logits/chosen": -2.440713405609131,
"logits/rejected": -2.420994281768799,
"logps/chosen": -141.1729278564453,
"logps/rejected": -146.4890899658203,
"loss": 120775.6875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.02869614027440548,
"rewards/margins": 0.01065666601061821,
"rewards/rejected": -0.03935280814766884,
"step": 1290
},
{
"epoch": 0.5261027923917442,
"grad_norm": 6948542.641971354,
"learning_rate": 2.633828160143949e-07,
"logits/chosen": -2.462017774581909,
"logits/rejected": -2.479309558868408,
"logps/chosen": -148.1350860595703,
"logps/rejected": -158.99916076660156,
"loss": 127668.2,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.020126869902014732,
"rewards/margins": 0.011567593552172184,
"rewards/rejected": -0.03169446438550949,
"step": 1300
},
{
"epoch": 0.5301497369486038,
"grad_norm": 8950839.210890554,
"learning_rate": 2.611336032388664e-07,
"logits/chosen": -2.379216432571411,
"logits/rejected": -2.349857807159424,
"logps/chosen": -158.7649688720703,
"logps/rejected": -158.4715118408203,
"loss": 133855.0375,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.030759122222661972,
"rewards/margins": 0.004436601884663105,
"rewards/rejected": -0.0351957306265831,
"step": 1310
},
{
"epoch": 0.5341966815054634,
"grad_norm": 5080965.438845941,
"learning_rate": 2.588843904633378e-07,
"logits/chosen": -2.4408226013183594,
"logits/rejected": -2.4230995178222656,
"logps/chosen": -122.5213394165039,
"logps/rejected": -136.46041870117188,
"loss": 125633.5875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.022939473390579224,
"rewards/margins": 0.01324677187949419,
"rewards/rejected": -0.03618624433875084,
"step": 1320
},
{
"epoch": 0.5382436260623229,
"grad_norm": 5182504.597846507,
"learning_rate": 2.5663517768780926e-07,
"logits/chosen": -2.505174160003662,
"logits/rejected": -2.483182191848755,
"logps/chosen": -142.41513061523438,
"logps/rejected": -153.15907287597656,
"loss": 123496.5125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027721602469682693,
"rewards/margins": 0.015740955248475075,
"rewards/rejected": -0.04346255585551262,
"step": 1330
},
{
"epoch": 0.5422905706191825,
"grad_norm": 7128845.279886402,
"learning_rate": 2.543859649122807e-07,
"logits/chosen": -2.478231430053711,
"logits/rejected": -2.448133945465088,
"logps/chosen": -135.66380310058594,
"logps/rejected": -153.63766479492188,
"loss": 126797.3625,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.02139298990368843,
"rewards/margins": 0.010669348761439323,
"rewards/rejected": -0.0320623405277729,
"step": 1340
},
{
"epoch": 0.5463375151760421,
"grad_norm": 6676875.1940184785,
"learning_rate": 2.521367521367521e-07,
"logits/chosen": -2.4742424488067627,
"logits/rejected": -2.4604861736297607,
"logps/chosen": -117.7722396850586,
"logps/rejected": -130.4296875,
"loss": 125562.3,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.01471949927508831,
"rewards/margins": 0.010869570076465607,
"rewards/rejected": -0.025589067488908768,
"step": 1350
},
{
"epoch": 0.5503844597329016,
"grad_norm": 5238344.574016525,
"learning_rate": 2.4988753936122354e-07,
"logits/chosen": -2.404810905456543,
"logits/rejected": -2.386918544769287,
"logps/chosen": -129.4068603515625,
"logps/rejected": -142.20303344726562,
"loss": 121443.45,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.014744667336344719,
"rewards/margins": 0.01344493217766285,
"rewards/rejected": -0.02818959951400757,
"step": 1360
},
{
"epoch": 0.5544314042897612,
"grad_norm": 6640943.9300566595,
"learning_rate": 2.47638326585695e-07,
"logits/chosen": -2.3393630981445312,
"logits/rejected": -2.3323869705200195,
"logps/chosen": -132.86630249023438,
"logps/rejected": -144.26113891601562,
"loss": 127953.15,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.026765987277030945,
"rewards/margins": 0.01323648076504469,
"rewards/rejected": -0.04000247269868851,
"step": 1370
},
{
"epoch": 0.5584783488466208,
"grad_norm": 11500911.034452418,
"learning_rate": 2.4538911381016643e-07,
"logits/chosen": -2.2898106575012207,
"logits/rejected": -2.346161127090454,
"logps/chosen": -147.8282928466797,
"logps/rejected": -163.23367309570312,
"loss": 116464.0875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.018574833869934082,
"rewards/margins": 0.024278491735458374,
"rewards/rejected": -0.042853325605392456,
"step": 1380
},
{
"epoch": 0.5625252934034803,
"grad_norm": 5617383.1061038,
"learning_rate": 2.431399010346379e-07,
"logits/chosen": -2.4383928775787354,
"logits/rejected": -2.447169780731201,
"logps/chosen": -125.36312103271484,
"logps/rejected": -135.82142639160156,
"loss": 127916.95,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.018465066328644753,
"rewards/margins": 0.007541149854660034,
"rewards/rejected": -0.026006218045949936,
"step": 1390
},
{
"epoch": 0.56657223796034,
"grad_norm": 7455055.133065385,
"learning_rate": 2.408906882591093e-07,
"logits/chosen": -2.351076126098633,
"logits/rejected": -2.386265993118286,
"logps/chosen": -138.0536346435547,
"logps/rejected": -153.21102905273438,
"loss": 126554.1,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02320500835776329,
"rewards/margins": 0.008944050408899784,
"rewards/rejected": -0.0321490578353405,
"step": 1400
},
{
"epoch": 0.5706191825171996,
"grad_norm": 5233092.811472115,
"learning_rate": 2.386414754835807e-07,
"logits/chosen": -2.378037929534912,
"logits/rejected": -2.3643617630004883,
"logps/chosen": -160.79556274414062,
"logps/rejected": -167.969970703125,
"loss": 121087.4625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.021083252504467964,
"rewards/margins": 0.008589145727455616,
"rewards/rejected": -0.029672399163246155,
"step": 1410
},
{
"epoch": 0.5746661270740591,
"grad_norm": 8648367.034730982,
"learning_rate": 2.363922627080522e-07,
"logits/chosen": -2.4605114459991455,
"logits/rejected": -2.432900905609131,
"logps/chosen": -145.24966430664062,
"logps/rejected": -169.27865600585938,
"loss": 127293.625,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.028744569048285484,
"rewards/margins": 0.01984976790845394,
"rewards/rejected": -0.04859434440732002,
"step": 1420
},
{
"epoch": 0.5787130716309187,
"grad_norm": 6917627.189571037,
"learning_rate": 2.3414304993252359e-07,
"logits/chosen": -2.415008783340454,
"logits/rejected": -2.390291213989258,
"logps/chosen": -118.60847473144531,
"logps/rejected": -137.45223999023438,
"loss": 126428.6625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.021594971418380737,
"rewards/margins": 0.010223200544714928,
"rewards/rejected": -0.031818170100450516,
"step": 1430
},
{
"epoch": 0.5827600161877782,
"grad_norm": 7112073.222735109,
"learning_rate": 2.3189383715699503e-07,
"logits/chosen": -2.361323595046997,
"logits/rejected": -2.359731674194336,
"logps/chosen": -136.58473205566406,
"logps/rejected": -162.69363403320312,
"loss": 126602.9125,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.02461722306907177,
"rewards/margins": 0.010918731801211834,
"rewards/rejected": -0.03553595766425133,
"step": 1440
},
{
"epoch": 0.5868069607446378,
"grad_norm": 5694863.468784067,
"learning_rate": 2.2964462438146648e-07,
"logits/chosen": -2.4715747833251953,
"logits/rejected": -2.4460928440093994,
"logps/chosen": -139.74227905273438,
"logps/rejected": -143.89366149902344,
"loss": 124850.7625,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.019559044390916824,
"rewards/margins": 0.007027704268693924,
"rewards/rejected": -0.0265867467969656,
"step": 1450
},
{
"epoch": 0.5908539053014974,
"grad_norm": 6596951.588588771,
"learning_rate": 2.2739541160593792e-07,
"logits/chosen": -2.405214786529541,
"logits/rejected": -2.376192569732666,
"logps/chosen": -132.67037963867188,
"logps/rejected": -152.36544799804688,
"loss": 129629.9375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.019801389425992966,
"rewards/margins": 0.02020254358649254,
"rewards/rejected": -0.040003933012485504,
"step": 1460
},
{
"epoch": 0.5949008498583569,
"grad_norm": 8604502.436566744,
"learning_rate": 2.2514619883040934e-07,
"logits/chosen": -2.4290502071380615,
"logits/rejected": -2.4105029106140137,
"logps/chosen": -138.2572784423828,
"logps/rejected": -157.66226196289062,
"loss": 126705.45,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.023596590384840965,
"rewards/margins": 0.01850738190114498,
"rewards/rejected": -0.042103976011276245,
"step": 1470
},
{
"epoch": 0.5989477944152165,
"grad_norm": 6476200.112160567,
"learning_rate": 2.2289698605488076e-07,
"logits/chosen": -2.405041217803955,
"logits/rejected": -2.343621253967285,
"logps/chosen": -135.63980102539062,
"logps/rejected": -157.7008056640625,
"loss": 124231.4625,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.025133201852440834,
"rewards/margins": 0.02330349013209343,
"rewards/rejected": -0.048436690121889114,
"step": 1480
},
{
"epoch": 0.6029947389720761,
"grad_norm": 6131917.113665815,
"learning_rate": 2.206477732793522e-07,
"logits/chosen": -2.413145065307617,
"logits/rejected": -2.4113070964813232,
"logps/chosen": -131.548583984375,
"logps/rejected": -143.9263153076172,
"loss": 123647.9125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.024967512115836143,
"rewards/margins": 0.014087630435824394,
"rewards/rejected": -0.03905514255166054,
"step": 1490
},
{
"epoch": 0.6070416835289356,
"grad_norm": 6175564.2334703235,
"learning_rate": 2.1839856050382366e-07,
"logits/chosen": -2.4009850025177,
"logits/rejected": -2.3908042907714844,
"logps/chosen": -138.0840606689453,
"logps/rejected": -150.7329864501953,
"loss": 128591.4875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.02667851746082306,
"rewards/margins": 0.0038915693294256926,
"rewards/rejected": -0.03057008981704712,
"step": 1500
},
{
"epoch": 0.6110886280857952,
"grad_norm": 7212353.914117165,
"learning_rate": 2.161493477282951e-07,
"logits/chosen": -2.405449628829956,
"logits/rejected": -2.3872292041778564,
"logps/chosen": -123.17295837402344,
"logps/rejected": -143.22288513183594,
"loss": 130148.7,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.01992269791662693,
"rewards/margins": 0.015165319666266441,
"rewards/rejected": -0.03508801758289337,
"step": 1510
},
{
"epoch": 0.6151355726426548,
"grad_norm": 9046114.113444956,
"learning_rate": 2.1390013495276652e-07,
"logits/chosen": -2.405348777770996,
"logits/rejected": -2.4298527240753174,
"logps/chosen": -147.79513549804688,
"logps/rejected": -168.75872802734375,
"loss": 127527.1,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.026391273364424706,
"rewards/margins": 0.009268445894122124,
"rewards/rejected": -0.03565971553325653,
"step": 1520
},
{
"epoch": 0.6191825171995143,
"grad_norm": 6130066.161443972,
"learning_rate": 2.1165092217723797e-07,
"logits/chosen": -2.3494181632995605,
"logits/rejected": -2.3200573921203613,
"logps/chosen": -131.47329711914062,
"logps/rejected": -151.9276885986328,
"loss": 121959.575,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02297963574528694,
"rewards/margins": 0.016111956909298897,
"rewards/rejected": -0.03909159451723099,
"step": 1530
},
{
"epoch": 0.623229461756374,
"grad_norm": 6650864.453901279,
"learning_rate": 2.0940170940170939e-07,
"logits/chosen": -2.3809990882873535,
"logits/rejected": -2.3723671436309814,
"logps/chosen": -156.38284301757812,
"logps/rejected": -171.9077606201172,
"loss": 122674.425,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02390839345753193,
"rewards/margins": 0.019179565832018852,
"rewards/rejected": -0.04308795928955078,
"step": 1540
},
{
"epoch": 0.6272764063132336,
"grad_norm": 5183894.402422156,
"learning_rate": 2.0715249662618083e-07,
"logits/chosen": -2.4737048149108887,
"logits/rejected": -2.446381092071533,
"logps/chosen": -145.76119995117188,
"logps/rejected": -165.5288848876953,
"loss": 125087.2875,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.02032746747136116,
"rewards/margins": 0.009251989424228668,
"rewards/rejected": -0.029579460620880127,
"step": 1550
},
{
"epoch": 0.6313233508700931,
"grad_norm": 6969624.578927646,
"learning_rate": 2.0490328385065225e-07,
"logits/chosen": -2.405435800552368,
"logits/rejected": -2.4140048027038574,
"logps/chosen": -119.48077392578125,
"logps/rejected": -130.77532958984375,
"loss": 125878.4625,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.018460571765899658,
"rewards/margins": 0.010110612958669662,
"rewards/rejected": -0.02857118286192417,
"step": 1560
},
{
"epoch": 0.6353702954269527,
"grad_norm": 5800628.746003852,
"learning_rate": 2.026540710751237e-07,
"logits/chosen": -2.366516351699829,
"logits/rejected": -2.3698983192443848,
"logps/chosen": -147.12881469726562,
"logps/rejected": -146.3048553466797,
"loss": 129275.3375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.01768730953335762,
"rewards/margins": 0.011509931646287441,
"rewards/rejected": -0.029197242110967636,
"step": 1570
},
{
"epoch": 0.6394172399838122,
"grad_norm": 14918875.434130527,
"learning_rate": 2.0040485829959514e-07,
"logits/chosen": -2.4734253883361816,
"logits/rejected": -2.4595344066619873,
"logps/chosen": -125.6633529663086,
"logps/rejected": -142.96157836914062,
"loss": 123910.425,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.018165288493037224,
"rewards/margins": 0.015782013535499573,
"rewards/rejected": -0.033947303891181946,
"step": 1580
},
{
"epoch": 0.6434641845406718,
"grad_norm": 5025901.152056148,
"learning_rate": 1.981556455240666e-07,
"logits/chosen": -2.4370574951171875,
"logits/rejected": -2.4227161407470703,
"logps/chosen": -141.5856475830078,
"logps/rejected": -164.05941772460938,
"loss": 128733.9125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.021845245733857155,
"rewards/margins": 0.011800579726696014,
"rewards/rejected": -0.03364582732319832,
"step": 1590
},
{
"epoch": 0.6475111290975314,
"grad_norm": 5965330.851620259,
"learning_rate": 1.9590643274853798e-07,
"logits/chosen": -2.398038387298584,
"logits/rejected": -2.37715482711792,
"logps/chosen": -118.133544921875,
"logps/rejected": -130.4759063720703,
"loss": 123004.3375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.01905783638358116,
"rewards/margins": 0.011883154511451721,
"rewards/rejected": -0.030940990895032883,
"step": 1600
},
{
"epoch": 0.6515580736543909,
"grad_norm": 6299921.629356138,
"learning_rate": 1.9365721997300943e-07,
"logits/chosen": -2.3423843383789062,
"logits/rejected": -2.2989087104797363,
"logps/chosen": -115.65093994140625,
"logps/rejected": -138.21340942382812,
"loss": 127277.8375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02753433585166931,
"rewards/margins": 0.014595555141568184,
"rewards/rejected": -0.042129892855882645,
"step": 1610
},
{
"epoch": 0.6556050182112505,
"grad_norm": 7576070.098325265,
"learning_rate": 1.9140800719748088e-07,
"logits/chosen": -2.320422649383545,
"logits/rejected": -2.290821075439453,
"logps/chosen": -117.01118469238281,
"logps/rejected": -125.79508972167969,
"loss": 124367.8875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.01931951195001602,
"rewards/margins": 0.006707245949655771,
"rewards/rejected": -0.026026759296655655,
"step": 1620
},
{
"epoch": 0.6596519627681101,
"grad_norm": 6162429.013600917,
"learning_rate": 1.8915879442195232e-07,
"logits/chosen": -2.334224224090576,
"logits/rejected": -2.3541088104248047,
"logps/chosen": -136.18832397460938,
"logps/rejected": -151.46710205078125,
"loss": 122585.7625,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.019429903477430344,
"rewards/margins": 0.011553862132132053,
"rewards/rejected": -0.03098376654088497,
"step": 1630
},
{
"epoch": 0.6636989073249696,
"grad_norm": 4988734.781634246,
"learning_rate": 1.8690958164642374e-07,
"logits/chosen": -2.4520297050476074,
"logits/rejected": -2.42329478263855,
"logps/chosen": -144.2743682861328,
"logps/rejected": -155.20095825195312,
"loss": 124995.575,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.016223471611738205,
"rewards/margins": 0.015287751331925392,
"rewards/rejected": -0.031511224806308746,
"step": 1640
},
{
"epoch": 0.6677458518818292,
"grad_norm": 6629567.008457434,
"learning_rate": 1.8466036887089516e-07,
"logits/chosen": -2.345116376876831,
"logits/rejected": -2.348301887512207,
"logps/chosen": -129.6711883544922,
"logps/rejected": -151.17837524414062,
"loss": 122800.6625,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.020190779119729996,
"rewards/margins": 0.0185395535081625,
"rewards/rejected": -0.03873033449053764,
"step": 1650
},
{
"epoch": 0.6717927964386888,
"grad_norm": 5601907.507778279,
"learning_rate": 1.824111560953666e-07,
"logits/chosen": -2.2657582759857178,
"logits/rejected": -2.260693311691284,
"logps/chosen": -128.01144409179688,
"logps/rejected": -155.50888061523438,
"loss": 124625.45,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.027610983699560165,
"rewards/margins": 0.020689968019723892,
"rewards/rejected": -0.04830095171928406,
"step": 1660
},
{
"epoch": 0.6758397409955483,
"grad_norm": 5845172.102872888,
"learning_rate": 1.8016194331983805e-07,
"logits/chosen": -2.289998769760132,
"logits/rejected": -2.2984931468963623,
"logps/chosen": -120.5185775756836,
"logps/rejected": -140.23655700683594,
"loss": 125251.7625,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.029199788346886635,
"rewards/margins": 0.014481378719210625,
"rewards/rejected": -0.04368116706609726,
"step": 1670
},
{
"epoch": 0.6798866855524079,
"grad_norm": 6195370.996741281,
"learning_rate": 1.779127305443095e-07,
"logits/chosen": -2.3538708686828613,
"logits/rejected": -2.334139347076416,
"logps/chosen": -136.63980102539062,
"logps/rejected": -144.39056396484375,
"loss": 129559.5375,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.030268553644418716,
"rewards/margins": 0.0073168775998055935,
"rewards/rejected": -0.03758542984724045,
"step": 1680
},
{
"epoch": 0.6839336301092676,
"grad_norm": 8790702.611604873,
"learning_rate": 1.7566351776878092e-07,
"logits/chosen": -2.3637337684631348,
"logits/rejected": -2.3632633686065674,
"logps/chosen": -129.40554809570312,
"logps/rejected": -147.02755737304688,
"loss": 128803.0875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.023326028138399124,
"rewards/margins": 0.011800029315054417,
"rewards/rejected": -0.03512606397271156,
"step": 1690
},
{
"epoch": 0.687980574666127,
"grad_norm": 8047500.436527203,
"learning_rate": 1.7341430499325237e-07,
"logits/chosen": -2.2449162006378174,
"logits/rejected": -2.254812717437744,
"logps/chosen": -128.66867065429688,
"logps/rejected": -134.86846923828125,
"loss": 130052.75,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.021912669762969017,
"rewards/margins": 0.007869280874729156,
"rewards/rejected": -0.029781952500343323,
"step": 1700
},
{
"epoch": 0.6920275192229867,
"grad_norm": 7961693.976507484,
"learning_rate": 1.7116509221772378e-07,
"logits/chosen": -2.308650255203247,
"logits/rejected": -2.325552463531494,
"logps/chosen": -118.10685729980469,
"logps/rejected": -132.20530700683594,
"loss": 125613.825,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.022883836179971695,
"rewards/margins": 0.00968220829963684,
"rewards/rejected": -0.032566044479608536,
"step": 1710
},
{
"epoch": 0.6960744637798462,
"grad_norm": 8983241.448290937,
"learning_rate": 1.6891587944219523e-07,
"logits/chosen": -2.3228538036346436,
"logits/rejected": -2.295989990234375,
"logps/chosen": -136.067138671875,
"logps/rejected": -149.92922973632812,
"loss": 124795.0875,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.01805921457707882,
"rewards/margins": 0.018675491213798523,
"rewards/rejected": -0.03673470392823219,
"step": 1720
},
{
"epoch": 0.7001214083367058,
"grad_norm": 6598233.046729883,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -2.321946144104004,
"logits/rejected": -2.2634482383728027,
"logps/chosen": -156.81881713867188,
"logps/rejected": -175.48133850097656,
"loss": 124281.6625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.023901356384158134,
"rewards/margins": 0.020154178142547607,
"rewards/rejected": -0.04405553638935089,
"step": 1730
},
{
"epoch": 0.7041683528935654,
"grad_norm": 7393573.408673971,
"learning_rate": 1.644174538911381e-07,
"logits/chosen": -2.157721996307373,
"logits/rejected": -2.1384575366973877,
"logps/chosen": -156.61431884765625,
"logps/rejected": -169.68267822265625,
"loss": 123967.125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.033402346074581146,
"rewards/margins": 0.017039528116583824,
"rewards/rejected": -0.05044187977910042,
"step": 1740
},
{
"epoch": 0.7082152974504249,
"grad_norm": 8650281.232154809,
"learning_rate": 1.6216824111560954e-07,
"logits/chosen": -2.3099396228790283,
"logits/rejected": -2.314627170562744,
"logps/chosen": -140.70175170898438,
"logps/rejected": -165.3170928955078,
"loss": 125535.2875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.02742253616452217,
"rewards/margins": 0.015254299156367779,
"rewards/rejected": -0.04267684370279312,
"step": 1750
},
{
"epoch": 0.7122622420072845,
"grad_norm": 6208948.317347317,
"learning_rate": 1.5991902834008096e-07,
"logits/chosen": -2.3783583641052246,
"logits/rejected": -2.362631320953369,
"logps/chosen": -148.7383270263672,
"logps/rejected": -162.62327575683594,
"loss": 121080.075,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.014549913816154003,
"rewards/margins": 0.013842826709151268,
"rewards/rejected": -0.028392743319272995,
"step": 1760
},
{
"epoch": 0.7163091865641441,
"grad_norm": 7739071.113911002,
"learning_rate": 1.5766981556455238e-07,
"logits/chosen": -2.299868583679199,
"logits/rejected": -2.2598750591278076,
"logps/chosen": -162.82052612304688,
"logps/rejected": -184.3058319091797,
"loss": 122385.7125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.030940508469939232,
"rewards/margins": 0.023769445717334747,
"rewards/rejected": -0.05470995977520943,
"step": 1770
},
{
"epoch": 0.7203561311210036,
"grad_norm": 6707665.892655141,
"learning_rate": 1.5542060278902383e-07,
"logits/chosen": -2.3239502906799316,
"logits/rejected": -2.3085806369781494,
"logps/chosen": -139.06484985351562,
"logps/rejected": -157.50460815429688,
"loss": 115194.475,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03321670740842819,
"rewards/margins": 0.021816464141011238,
"rewards/rejected": -0.05503316968679428,
"step": 1780
},
{
"epoch": 0.7244030756778632,
"grad_norm": 7475588.891135754,
"learning_rate": 1.5317139001349527e-07,
"logits/chosen": -2.380169630050659,
"logits/rejected": -2.3587822914123535,
"logps/chosen": -134.81069946289062,
"logps/rejected": -149.78839111328125,
"loss": 135028.0125,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.03387707471847534,
"rewards/margins": 0.011253075674176216,
"rewards/rejected": -0.04513014853000641,
"step": 1790
},
{
"epoch": 0.7284500202347228,
"grad_norm": 6224311.633435066,
"learning_rate": 1.5092217723796672e-07,
"logits/chosen": -2.4899191856384277,
"logits/rejected": -2.461540460586548,
"logps/chosen": -139.72994995117188,
"logps/rejected": -154.91757202148438,
"loss": 127101.55,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.022551989182829857,
"rewards/margins": 0.019368382170796394,
"rewards/rejected": -0.04192037135362625,
"step": 1800
},
{
"epoch": 0.7324969647915823,
"grad_norm": 6407363.569135414,
"learning_rate": 1.4867296446243814e-07,
"logits/chosen": -2.457529067993164,
"logits/rejected": -2.4312427043914795,
"logps/chosen": -171.8442840576172,
"logps/rejected": -170.40664672851562,
"loss": 126581.2375,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.02567433752119541,
"rewards/margins": 0.008936228230595589,
"rewards/rejected": -0.0346105620265007,
"step": 1810
},
{
"epoch": 0.7365439093484419,
"grad_norm": 5335773.687286384,
"learning_rate": 1.4642375168690956e-07,
"logits/chosen": -2.442826986312866,
"logits/rejected": -2.424445867538452,
"logps/chosen": -130.82366943359375,
"logps/rejected": -150.00717163085938,
"loss": 121689.35,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.020857717841863632,
"rewards/margins": 0.011433606036007404,
"rewards/rejected": -0.03229131922125816,
"step": 1820
},
{
"epoch": 0.7405908539053015,
"grad_norm": 5919606.114162943,
"learning_rate": 1.44174538911381e-07,
"logits/chosen": -2.4367711544036865,
"logits/rejected": -2.4152512550354004,
"logps/chosen": -116.6092758178711,
"logps/rejected": -137.42446899414062,
"loss": 124829.175,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.018279392272233963,
"rewards/margins": 0.017404617741703987,
"rewards/rejected": -0.0356840081512928,
"step": 1830
},
{
"epoch": 0.744637798462161,
"grad_norm": 4526671.180016859,
"learning_rate": 1.4192532613585245e-07,
"logits/chosen": -2.3979544639587402,
"logits/rejected": -2.3597800731658936,
"logps/chosen": -135.9434814453125,
"logps/rejected": -138.03778076171875,
"loss": 129111.95,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.024272512644529343,
"rewards/margins": 0.012156413868069649,
"rewards/rejected": -0.03642892464995384,
"step": 1840
},
{
"epoch": 0.7486847430190207,
"grad_norm": 7139221.010538934,
"learning_rate": 1.396761133603239e-07,
"logits/chosen": -2.4428467750549316,
"logits/rejected": -2.428190231323242,
"logps/chosen": -123.2089614868164,
"logps/rejected": -138.09390258789062,
"loss": 128958.6625,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.018383827060461044,
"rewards/margins": 0.010701683349907398,
"rewards/rejected": -0.029085511341691017,
"step": 1850
},
{
"epoch": 0.7527316875758802,
"grad_norm": 7046675.547216455,
"learning_rate": 1.3742690058479532e-07,
"logits/chosen": -2.4591715335845947,
"logits/rejected": -2.426462411880493,
"logps/chosen": -133.52520751953125,
"logps/rejected": -138.1920623779297,
"loss": 130433.475,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.014568162150681019,
"rewards/margins": 0.013440297916531563,
"rewards/rejected": -0.028008460998535156,
"step": 1860
},
{
"epoch": 0.7567786321327398,
"grad_norm": 6183615.802176011,
"learning_rate": 1.3517768780926674e-07,
"logits/chosen": -2.4390716552734375,
"logits/rejected": -2.3869736194610596,
"logps/chosen": -127.33221435546875,
"logps/rejected": -149.60716247558594,
"loss": 126095.0375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.020692896097898483,
"rewards/margins": 0.015832407400012016,
"rewards/rejected": -0.03652530163526535,
"step": 1870
},
{
"epoch": 0.7608255766895994,
"grad_norm": 5139359.676607012,
"learning_rate": 1.3292847503373818e-07,
"logits/chosen": -2.4137744903564453,
"logits/rejected": -2.4111902713775635,
"logps/chosen": -138.07791137695312,
"logps/rejected": -150.23367309570312,
"loss": 122845.4375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.015661675482988358,
"rewards/margins": 0.011621621437370777,
"rewards/rejected": -0.02728329598903656,
"step": 1880
},
{
"epoch": 0.7648725212464589,
"grad_norm": 5436765.081995142,
"learning_rate": 1.3067926225820963e-07,
"logits/chosen": -2.2953848838806152,
"logits/rejected": -2.261265754699707,
"logps/chosen": -131.72573852539062,
"logps/rejected": -158.7117156982422,
"loss": 120437.1625,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.018344033509492874,
"rewards/margins": 0.025271952152252197,
"rewards/rejected": -0.04361598566174507,
"step": 1890
},
{
"epoch": 0.7689194658033185,
"grad_norm": 7773151.683082246,
"learning_rate": 1.2843004948268105e-07,
"logits/chosen": -2.229933023452759,
"logits/rejected": -2.166466474533081,
"logps/chosen": -147.3013153076172,
"logps/rejected": -160.14205932617188,
"loss": 130466.3875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.024005400016903877,
"rewards/margins": 0.014198745600879192,
"rewards/rejected": -0.038204144686460495,
"step": 1900
},
{
"epoch": 0.7729664103601781,
"grad_norm": 6242141.939931843,
"learning_rate": 1.261808367071525e-07,
"logits/chosen": -2.2624001502990723,
"logits/rejected": -2.229830503463745,
"logps/chosen": -138.10633850097656,
"logps/rejected": -152.7989044189453,
"loss": 127404.4375,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02552894689142704,
"rewards/margins": 0.008202909491956234,
"rewards/rejected": -0.033731859177351,
"step": 1910
},
{
"epoch": 0.7770133549170376,
"grad_norm": 6920375.30724732,
"learning_rate": 1.2393162393162394e-07,
"logits/chosen": -2.350060224533081,
"logits/rejected": -2.3308169841766357,
"logps/chosen": -132.56320190429688,
"logps/rejected": -153.30557250976562,
"loss": 126830.1,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.029025813564658165,
"rewards/margins": 0.019299551844596863,
"rewards/rejected": -0.04832536727190018,
"step": 1920
},
{
"epoch": 0.7810602994738972,
"grad_norm": 6671009.085790114,
"learning_rate": 1.2168241115609536e-07,
"logits/chosen": -2.2904415130615234,
"logits/rejected": -2.329463481903076,
"logps/chosen": -141.00816345214844,
"logps/rejected": -143.0104217529297,
"loss": 129713.6875,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.03294295817613602,
"rewards/margins": 0.0017857927596196532,
"rewards/rejected": -0.034728746861219406,
"step": 1930
},
{
"epoch": 0.7851072440307568,
"grad_norm": 6655872.215382386,
"learning_rate": 1.194331983805668e-07,
"logits/chosen": -2.3119730949401855,
"logits/rejected": -2.2890148162841797,
"logps/chosen": -131.12327575683594,
"logps/rejected": -148.35281372070312,
"loss": 126911.35,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.024319607764482498,
"rewards/margins": 0.012184834107756615,
"rewards/rejected": -0.036504440009593964,
"step": 1940
},
{
"epoch": 0.7891541885876163,
"grad_norm": 7036617.58409423,
"learning_rate": 1.1718398560503823e-07,
"logits/chosen": -2.3747105598449707,
"logits/rejected": -2.3655359745025635,
"logps/chosen": -127.3541259765625,
"logps/rejected": -142.77593994140625,
"loss": 125537.975,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.017514357343316078,
"rewards/margins": 0.013664362952113152,
"rewards/rejected": -0.03117872215807438,
"step": 1950
},
{
"epoch": 0.7932011331444759,
"grad_norm": 6573765.081555526,
"learning_rate": 1.1493477282950967e-07,
"logits/chosen": -2.412942409515381,
"logits/rejected": -2.390746593475342,
"logps/chosen": -134.2810821533203,
"logps/rejected": -158.460693359375,
"loss": 123726.75,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.021586496382951736,
"rewards/margins": 0.017149869352579117,
"rewards/rejected": -0.03873636573553085,
"step": 1960
},
{
"epoch": 0.7972480777013355,
"grad_norm": 7038673.672056439,
"learning_rate": 1.1268556005398109e-07,
"logits/chosen": -2.370753765106201,
"logits/rejected": -2.3683507442474365,
"logps/chosen": -124.46659851074219,
"logps/rejected": -131.4204559326172,
"loss": 126820.75,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.024847570806741714,
"rewards/margins": 0.011085819453001022,
"rewards/rejected": -0.03593338653445244,
"step": 1970
},
{
"epoch": 0.801295022258195,
"grad_norm": 6451758.008444387,
"learning_rate": 1.1043634727845254e-07,
"logits/chosen": -2.325690984725952,
"logits/rejected": -2.336920976638794,
"logps/chosen": -122.97123718261719,
"logps/rejected": -147.48297119140625,
"loss": 123985.45,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.016694985330104828,
"rewards/margins": 0.016988877207040787,
"rewards/rejected": -0.033683862537145615,
"step": 1980
},
{
"epoch": 0.8053419668150547,
"grad_norm": 9482913.906109469,
"learning_rate": 1.0818713450292397e-07,
"logits/chosen": -2.2602345943450928,
"logits/rejected": -2.243213653564453,
"logps/chosen": -122.68096923828125,
"logps/rejected": -138.34278869628906,
"loss": 124861.1125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.028803208842873573,
"rewards/margins": 0.015811622142791748,
"rewards/rejected": -0.04461482912302017,
"step": 1990
},
{
"epoch": 0.8093889113719142,
"grad_norm": 6927553.6188276345,
"learning_rate": 1.059379217273954e-07,
"logits/chosen": -2.3502438068389893,
"logits/rejected": -2.337284564971924,
"logps/chosen": -132.24378967285156,
"logps/rejected": -149.02110290527344,
"loss": 125569.4625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.018480569124221802,
"rewards/margins": 0.016136765480041504,
"rewards/rejected": -0.034617334604263306,
"step": 2000
},
{
"epoch": 0.8134358559287738,
"grad_norm": 6611093.761936569,
"learning_rate": 1.0368870895186684e-07,
"logits/chosen": -2.3684864044189453,
"logits/rejected": -2.324704170227051,
"logps/chosen": -135.81961059570312,
"logps/rejected": -160.5324249267578,
"loss": 121162.075,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.012911828234791756,
"rewards/margins": 0.030595939606428146,
"rewards/rejected": -0.04350776970386505,
"step": 2010
},
{
"epoch": 0.8174828004856334,
"grad_norm": 6723882.38264995,
"learning_rate": 1.0143949617633828e-07,
"logits/chosen": -2.2761006355285645,
"logits/rejected": -2.2606966495513916,
"logps/chosen": -119.88040924072266,
"logps/rejected": -145.41346740722656,
"loss": 129622.7125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.025995198637247086,
"rewards/margins": 0.019534587860107422,
"rewards/rejected": -0.045529790222644806,
"step": 2020
},
{
"epoch": 0.8215297450424929,
"grad_norm": 8229250.060941711,
"learning_rate": 9.919028340080972e-08,
"logits/chosen": -2.3351616859436035,
"logits/rejected": -2.280089855194092,
"logps/chosen": -138.04751586914062,
"logps/rejected": -154.4039306640625,
"loss": 121636.6375,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.02659204974770546,
"rewards/margins": 0.022092049941420555,
"rewards/rejected": -0.048684097826480865,
"step": 2030
},
{
"epoch": 0.8255766895993525,
"grad_norm": 8360918.973606626,
"learning_rate": 9.694107062528115e-08,
"logits/chosen": -2.302302837371826,
"logits/rejected": -2.3009400367736816,
"logps/chosen": -133.8302459716797,
"logps/rejected": -153.5994110107422,
"loss": 124760.0625,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.020709946751594543,
"rewards/margins": 0.014815042726695538,
"rewards/rejected": -0.03552498668432236,
"step": 2040
},
{
"epoch": 0.8296236341562121,
"grad_norm": 8136804.681969547,
"learning_rate": 9.46918578497526e-08,
"logits/chosen": -2.325496196746826,
"logits/rejected": -2.3160691261291504,
"logps/chosen": -133.07785034179688,
"logps/rejected": -157.1102294921875,
"loss": 122905.6875,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.022938497364521027,
"rewards/margins": 0.02051146700978279,
"rewards/rejected": -0.04344996064901352,
"step": 2050
},
{
"epoch": 0.8336705787130716,
"grad_norm": 5880924.756183454,
"learning_rate": 9.244264507422401e-08,
"logits/chosen": -2.247741460800171,
"logits/rejected": -2.2601161003112793,
"logps/chosen": -138.5823974609375,
"logps/rejected": -150.9891357421875,
"loss": 122247.55,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.021218325942754745,
"rewards/margins": 0.013738051056861877,
"rewards/rejected": -0.034956373274326324,
"step": 2060
},
{
"epoch": 0.8377175232699312,
"grad_norm": 6479318.093365777,
"learning_rate": 9.019343229869546e-08,
"logits/chosen": -2.287973403930664,
"logits/rejected": -2.27508282661438,
"logps/chosen": -148.0543975830078,
"logps/rejected": -174.75563049316406,
"loss": 122681.5375,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.028557494282722473,
"rewards/margins": 0.02148330584168434,
"rewards/rejected": -0.050040800124406815,
"step": 2070
},
{
"epoch": 0.8417644678267908,
"grad_norm": 7530654.549282354,
"learning_rate": 8.794421952316688e-08,
"logits/chosen": -2.3192243576049805,
"logits/rejected": -2.301488161087036,
"logps/chosen": -140.3570556640625,
"logps/rejected": -144.66439819335938,
"loss": 127493.5,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.027281736955046654,
"rewards/margins": 0.011399330571293831,
"rewards/rejected": -0.03868107125163078,
"step": 2080
},
{
"epoch": 0.8458114123836503,
"grad_norm": 5545588.639352677,
"learning_rate": 8.569500674763833e-08,
"logits/chosen": -2.3623504638671875,
"logits/rejected": -2.327298641204834,
"logps/chosen": -125.07554626464844,
"logps/rejected": -162.51771545410156,
"loss": 122307.35,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.023772019892930984,
"rewards/margins": 0.024702411144971848,
"rewards/rejected": -0.048474427312612534,
"step": 2090
},
{
"epoch": 0.8498583569405099,
"grad_norm": 7677881.561277626,
"learning_rate": 8.344579397210976e-08,
"logits/chosen": -2.400023937225342,
"logits/rejected": -2.398374557495117,
"logps/chosen": -143.50096130371094,
"logps/rejected": -154.2194061279297,
"loss": 126753.125,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03331100195646286,
"rewards/margins": 0.007773646619170904,
"rewards/rejected": -0.0410846471786499,
"step": 2100
},
{
"epoch": 0.8539053014973695,
"grad_norm": 9639697.081950434,
"learning_rate": 8.119658119658119e-08,
"logits/chosen": -2.2588629722595215,
"logits/rejected": -2.2181789875030518,
"logps/chosen": -136.6796417236328,
"logps/rejected": -171.16500854492188,
"loss": 127603.3375,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.03126269578933716,
"rewards/margins": 0.025771383196115494,
"rewards/rejected": -0.057034075260162354,
"step": 2110
},
{
"epoch": 0.857952246054229,
"grad_norm": 5314294.806910229,
"learning_rate": 7.894736842105262e-08,
"logits/chosen": -2.486797332763672,
"logits/rejected": -2.4730780124664307,
"logps/chosen": -146.88571166992188,
"logps/rejected": -158.010986328125,
"loss": 125283.2125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.020792517811059952,
"rewards/margins": 0.017872992902994156,
"rewards/rejected": -0.03866551071405411,
"step": 2120
},
{
"epoch": 0.8619991906110887,
"grad_norm": 8035986.499722224,
"learning_rate": 7.669815564552407e-08,
"logits/chosen": -2.421731472015381,
"logits/rejected": -2.425063371658325,
"logps/chosen": -116.5892562866211,
"logps/rejected": -132.42050170898438,
"loss": 125860.925,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.021887045353651047,
"rewards/margins": 0.014476152136921883,
"rewards/rejected": -0.03636319935321808,
"step": 2130
},
{
"epoch": 0.8660461351679482,
"grad_norm": 6251020.741893531,
"learning_rate": 7.444894286999549e-08,
"logits/chosen": -2.364879608154297,
"logits/rejected": -2.311974048614502,
"logps/chosen": -120.81207275390625,
"logps/rejected": -145.0894317626953,
"loss": 119764.95,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.02619818225502968,
"rewards/margins": 0.023076878860592842,
"rewards/rejected": -0.04927505925297737,
"step": 2140
},
{
"epoch": 0.8700930797248078,
"grad_norm": 6301369.537300293,
"learning_rate": 7.219973009446694e-08,
"logits/chosen": -2.379647970199585,
"logits/rejected": -2.3476970195770264,
"logps/chosen": -134.33572387695312,
"logps/rejected": -155.9193115234375,
"loss": 118915.75,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.019399352371692657,
"rewards/margins": 0.016515102237462997,
"rewards/rejected": -0.03591445833444595,
"step": 2150
},
{
"epoch": 0.8741400242816674,
"grad_norm": 6200780.785181898,
"learning_rate": 6.995051731893837e-08,
"logits/chosen": -2.4005587100982666,
"logits/rejected": -2.381075382232666,
"logps/chosen": -134.69631958007812,
"logps/rejected": -142.3704071044922,
"loss": 122057.2,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.024357806891202927,
"rewards/margins": 0.01147081982344389,
"rewards/rejected": -0.03582862392067909,
"step": 2160
},
{
"epoch": 0.8781869688385269,
"grad_norm": 7707200.943905766,
"learning_rate": 6.77013045434098e-08,
"logits/chosen": -2.1691622734069824,
"logits/rejected": -2.1518099308013916,
"logps/chosen": -133.11085510253906,
"logps/rejected": -150.32522583007812,
"loss": 124932.0875,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02330431155860424,
"rewards/margins": 0.0101194242015481,
"rewards/rejected": -0.033423732966184616,
"step": 2170
},
{
"epoch": 0.8822339133953865,
"grad_norm": 6068434.174209909,
"learning_rate": 6.545209176788123e-08,
"logits/chosen": -2.2563586235046387,
"logits/rejected": -2.249168872833252,
"logps/chosen": -126.6658935546875,
"logps/rejected": -150.95640563964844,
"loss": 125160.825,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.015440529212355614,
"rewards/margins": 0.015524588525295258,
"rewards/rejected": -0.030965115875005722,
"step": 2180
},
{
"epoch": 0.8862808579522461,
"grad_norm": 7537201.72606691,
"learning_rate": 6.320287899235267e-08,
"logits/chosen": -2.364108085632324,
"logits/rejected": -2.3574013710021973,
"logps/chosen": -127.92137145996094,
"logps/rejected": -143.64276123046875,
"loss": 128988.85,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.015308101661503315,
"rewards/margins": 0.012061825022101402,
"rewards/rejected": -0.027369925752282143,
"step": 2190
},
{
"epoch": 0.8903278025091056,
"grad_norm": 20144650.289658338,
"learning_rate": 6.095366621682411e-08,
"logits/chosen": -2.3388938903808594,
"logits/rejected": -2.309027910232544,
"logps/chosen": -131.75338745117188,
"logps/rejected": -147.02613830566406,
"loss": 131861.55,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.033888086676597595,
"rewards/margins": 0.011972433887422085,
"rewards/rejected": -0.045860521495342255,
"step": 2200
},
{
"epoch": 0.8943747470659652,
"grad_norm": 7275705.0028550355,
"learning_rate": 5.8704453441295546e-08,
"logits/chosen": -2.4196584224700928,
"logits/rejected": -2.4083645343780518,
"logps/chosen": -137.82431030273438,
"logps/rejected": -152.37692260742188,
"loss": 125663.2125,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.01876234821975231,
"rewards/margins": 0.0189601369202137,
"rewards/rejected": -0.03772248700261116,
"step": 2210
},
{
"epoch": 0.8984216916228248,
"grad_norm": 7203116.155779993,
"learning_rate": 5.645524066576698e-08,
"logits/chosen": -2.4381861686706543,
"logits/rejected": -2.403198003768921,
"logps/chosen": -131.7117156982422,
"logps/rejected": -142.67185974121094,
"loss": 123239.9375,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.01855655573308468,
"rewards/margins": 0.015223322436213493,
"rewards/rejected": -0.03377988189458847,
"step": 2220
},
{
"epoch": 0.9024686361796843,
"grad_norm": 5199237.236235915,
"learning_rate": 5.420602789023841e-08,
"logits/chosen": -2.3613171577453613,
"logits/rejected": -2.2851357460021973,
"logps/chosen": -153.357177734375,
"logps/rejected": -159.00067138671875,
"loss": 123367.6125,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.024445852264761925,
"rewards/margins": 0.012765263207256794,
"rewards/rejected": -0.037211112678050995,
"step": 2230
},
{
"epoch": 0.9065155807365439,
"grad_norm": 8322118.880155748,
"learning_rate": 5.1956815114709844e-08,
"logits/chosen": -2.4463276863098145,
"logits/rejected": -2.4448184967041016,
"logps/chosen": -166.32937622070312,
"logps/rejected": -170.3524932861328,
"loss": 127037.6,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.034686215221881866,
"rewards/margins": 0.006833164487034082,
"rewards/rejected": -0.04151938110589981,
"step": 2240
},
{
"epoch": 0.9105625252934035,
"grad_norm": 7843111.5615214445,
"learning_rate": 4.9707602339181284e-08,
"logits/chosen": -2.406442165374756,
"logits/rejected": -2.3739068508148193,
"logps/chosen": -135.46450805664062,
"logps/rejected": -151.98318481445312,
"loss": 119829.0,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.027578959241509438,
"rewards/margins": 0.019721323624253273,
"rewards/rejected": -0.04730028659105301,
"step": 2250
},
{
"epoch": 0.914609469850263,
"grad_norm": 6508134.384511007,
"learning_rate": 4.745838956365272e-08,
"logits/chosen": -2.374009609222412,
"logits/rejected": -2.330867290496826,
"logps/chosen": -147.81002807617188,
"logps/rejected": -150.0068817138672,
"loss": 123565.525,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.030267197638750076,
"rewards/margins": 0.008453629910945892,
"rewards/rejected": -0.03872082382440567,
"step": 2260
},
{
"epoch": 0.9186564144071226,
"grad_norm": 4876699.168643324,
"learning_rate": 4.5209176788124156e-08,
"logits/chosen": -2.4444994926452637,
"logits/rejected": -2.370756149291992,
"logps/chosen": -148.59286499023438,
"logps/rejected": -159.95957946777344,
"loss": 121402.275,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.027829691767692566,
"rewards/margins": 0.01744781993329525,
"rewards/rejected": -0.045277513563632965,
"step": 2270
},
{
"epoch": 0.9227033589639821,
"grad_norm": 7528661.562948466,
"learning_rate": 4.2959964012595596e-08,
"logits/chosen": -2.421567678451538,
"logits/rejected": -2.4027442932128906,
"logps/chosen": -138.11740112304688,
"logps/rejected": -148.74745178222656,
"loss": 125506.075,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.025351068004965782,
"rewards/margins": 0.01246053259819746,
"rewards/rejected": -0.037811603397130966,
"step": 2280
},
{
"epoch": 0.9267503035208418,
"grad_norm": 5869282.00575302,
"learning_rate": 4.071075123706703e-08,
"logits/chosen": -2.344552993774414,
"logits/rejected": -2.309044122695923,
"logps/chosen": -135.99095153808594,
"logps/rejected": -162.00845336914062,
"loss": 119300.3375,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.01940598525106907,
"rewards/margins": 0.02190936915576458,
"rewards/rejected": -0.04131535068154335,
"step": 2290
},
{
"epoch": 0.9307972480777014,
"grad_norm": 6508908.496606901,
"learning_rate": 3.846153846153846e-08,
"logits/chosen": -2.2711312770843506,
"logits/rejected": -2.235738754272461,
"logps/chosen": -156.75537109375,
"logps/rejected": -162.16207885742188,
"loss": 121451.2625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0278861615806818,
"rewards/margins": 0.010201343335211277,
"rewards/rejected": -0.038087502121925354,
"step": 2300
},
{
"epoch": 0.9348441926345609,
"grad_norm": 8865730.632557675,
"learning_rate": 3.6212325686009894e-08,
"logits/chosen": -2.324096202850342,
"logits/rejected": -2.2920069694519043,
"logps/chosen": -117.9946060180664,
"logps/rejected": -133.47108459472656,
"loss": 122291.0625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.022024232894182205,
"rewards/margins": 0.014470313675701618,
"rewards/rejected": -0.03649454563856125,
"step": 2310
},
{
"epoch": 0.9388911371914205,
"grad_norm": 9971453.387611723,
"learning_rate": 3.3963112910481334e-08,
"logits/chosen": -2.409850597381592,
"logits/rejected": -2.3329081535339355,
"logps/chosen": -146.290283203125,
"logps/rejected": -182.57904052734375,
"loss": 120819.0,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.024361763149499893,
"rewards/margins": 0.03630813583731651,
"rewards/rejected": -0.060669898986816406,
"step": 2320
},
{
"epoch": 0.9429380817482801,
"grad_norm": 6070350.046980263,
"learning_rate": 3.1713900134952766e-08,
"logits/chosen": -2.34000301361084,
"logits/rejected": -2.3207154273986816,
"logps/chosen": -135.71852111816406,
"logps/rejected": -158.15370178222656,
"loss": 127869.825,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.026078131049871445,
"rewards/margins": 0.018510058522224426,
"rewards/rejected": -0.04458818584680557,
"step": 2330
},
{
"epoch": 0.9469850263051396,
"grad_norm": 8157133.764333476,
"learning_rate": 2.94646873594242e-08,
"logits/chosen": -2.4080350399017334,
"logits/rejected": -2.3741536140441895,
"logps/chosen": -139.84112548828125,
"logps/rejected": -175.55917358398438,
"loss": 126005.05,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.029435906559228897,
"rewards/margins": 0.02673642337322235,
"rewards/rejected": -0.05617233365774155,
"step": 2340
},
{
"epoch": 0.9510319708619992,
"grad_norm": 8440293.351897202,
"learning_rate": 2.7215474583895635e-08,
"logits/chosen": -2.4163994789123535,
"logits/rejected": -2.384222984313965,
"logps/chosen": -158.38975524902344,
"logps/rejected": -164.0137176513672,
"loss": 123603.3875,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03101927414536476,
"rewards/margins": 0.011726012453436852,
"rewards/rejected": -0.042745284736156464,
"step": 2350
},
{
"epoch": 0.9550789154188588,
"grad_norm": 6375292.788715957,
"learning_rate": 2.496626180836707e-08,
"logits/chosen": -2.2860450744628906,
"logits/rejected": -2.28193998336792,
"logps/chosen": -137.55361938476562,
"logps/rejected": -162.74652099609375,
"loss": 125693.3125,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.024988356977701187,
"rewards/margins": 0.023718636482954025,
"rewards/rejected": -0.04870699718594551,
"step": 2360
},
{
"epoch": 0.9591258599757183,
"grad_norm": 9472423.8957588,
"learning_rate": 2.2717049032838504e-08,
"logits/chosen": -2.359046459197998,
"logits/rejected": -2.3519136905670166,
"logps/chosen": -135.88697814941406,
"logps/rejected": -158.06321716308594,
"loss": 127380.25,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.027975231409072876,
"rewards/margins": 0.01904093287885189,
"rewards/rejected": -0.04701615869998932,
"step": 2370
},
{
"epoch": 0.9631728045325779,
"grad_norm": 6950351.310431428,
"learning_rate": 2.046783625730994e-08,
"logits/chosen": -2.278303623199463,
"logits/rejected": -2.271866798400879,
"logps/chosen": -147.02255249023438,
"logps/rejected": -162.41444396972656,
"loss": 131236.1875,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03206818923354149,
"rewards/margins": 0.013242989778518677,
"rewards/rejected": -0.045311179012060165,
"step": 2380
},
{
"epoch": 0.9672197490894374,
"grad_norm": 6190616.100642323,
"learning_rate": 1.8218623481781373e-08,
"logits/chosen": -2.3274073600769043,
"logits/rejected": -2.2292959690093994,
"logps/chosen": -152.0672149658203,
"logps/rejected": -174.8033905029297,
"loss": 124131.975,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027064388617873192,
"rewards/margins": 0.015537412837147713,
"rewards/rejected": -0.042601801455020905,
"step": 2390
},
{
"epoch": 0.971266693646297,
"grad_norm": 8140978.954292232,
"learning_rate": 1.5969410706252813e-08,
"logits/chosen": -2.3674769401550293,
"logits/rejected": -2.3571083545684814,
"logps/chosen": -144.0354461669922,
"logps/rejected": -160.3311004638672,
"loss": 125102.2375,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02302565984427929,
"rewards/margins": 0.0166311115026474,
"rewards/rejected": -0.03965677320957184,
"step": 2400
},
{
"epoch": 0.9753136382031566,
"grad_norm": 8862305.552745355,
"learning_rate": 1.3720197930724246e-08,
"logits/chosen": -2.178356647491455,
"logits/rejected": -2.179384708404541,
"logps/chosen": -143.9452362060547,
"logps/rejected": -151.89974975585938,
"loss": 123180.975,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.035528309643268585,
"rewards/margins": 0.007553645875304937,
"rewards/rejected": -0.04308196157217026,
"step": 2410
},
{
"epoch": 0.9793605827600161,
"grad_norm": 4848306.613269352,
"learning_rate": 1.1470985155195682e-08,
"logits/chosen": -2.402296781539917,
"logits/rejected": -2.3765056133270264,
"logps/chosen": -125.8743896484375,
"logps/rejected": -145.01162719726562,
"loss": 122925.2125,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.017237985506653786,
"rewards/margins": 0.019860463216900826,
"rewards/rejected": -0.03709845244884491,
"step": 2420
},
{
"epoch": 0.9834075273168758,
"grad_norm": 5809904.408249709,
"learning_rate": 9.221772379667116e-09,
"logits/chosen": -2.4065396785736084,
"logits/rejected": -2.3716368675231934,
"logps/chosen": -143.05075073242188,
"logps/rejected": -167.95664978027344,
"loss": 124604.825,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029827838763594627,
"rewards/margins": 0.02450350485742092,
"rewards/rejected": -0.05433133989572525,
"step": 2430
},
{
"epoch": 0.9874544718737354,
"grad_norm": 7148593.283376818,
"learning_rate": 6.972559604138551e-09,
"logits/chosen": -2.3499531745910645,
"logits/rejected": -2.3520779609680176,
"logps/chosen": -130.91500854492188,
"logps/rejected": -159.89820861816406,
"loss": 119113.2625,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.026112830266356468,
"rewards/margins": 0.027585214003920555,
"rewards/rejected": -0.05369805172085762,
"step": 2440
},
{
"epoch": 0.9915014164305949,
"grad_norm": 7332505.899956737,
"learning_rate": 4.723346828609986e-09,
"logits/chosen": -2.3807873725891113,
"logits/rejected": -2.3282299041748047,
"logps/chosen": -138.15525817871094,
"logps/rejected": -150.75531005859375,
"loss": 124786.675,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02242584154009819,
"rewards/margins": 0.013798736035823822,
"rewards/rejected": -0.03622458130121231,
"step": 2450
},
{
"epoch": 0.9955483609874545,
"grad_norm": 5408793.194556523,
"learning_rate": 2.474134053081421e-09,
"logits/chosen": -2.305051803588867,
"logits/rejected": -2.2709367275238037,
"logps/chosen": -127.0162124633789,
"logps/rejected": -154.36273193359375,
"loss": 125528.575,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02522462233901024,
"rewards/margins": 0.017697211354970932,
"rewards/rejected": -0.04292182996869087,
"step": 2460
},
{
"epoch": 0.9995953055443141,
"grad_norm": 6547393.597919743,
"learning_rate": 2.249212775528565e-10,
"logits/chosen": -2.3911020755767822,
"logits/rejected": -2.3864622116088867,
"logps/chosen": -147.30072021484375,
"logps/rejected": -168.179443359375,
"loss": 121667.85,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.027930116280913353,
"rewards/margins": 0.010017314925789833,
"rewards/rejected": -0.037947431206703186,
"step": 2470
}
],
"logging_steps": 10,
"max_steps": 2471,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}