{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 100,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004,
"grad_norm": 7034011.299815918,
"learning_rate": 2e-09,
"logits/chosen": -2.3609464168548584,
"logits/rejected": -2.4021644592285156,
"logps/chosen": -72.32479858398438,
"logps/rejected": -106.78115844726562,
"loss": 138817.4219,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 5234909.800991719,
"learning_rate": 2e-08,
"logits/chosen": -2.3249125480651855,
"logits/rejected": -2.3054325580596924,
"logps/chosen": -70.72610473632812,
"logps/rejected": -68.99564361572266,
"loss": 125594.3333,
"rewards/accuracies": 0.375,
"rewards/chosen": 6.445489361794898e-06,
"rewards/margins": -2.8922620913363062e-05,
"rewards/rejected": 3.536810982041061e-05,
"step": 10
},
{
"epoch": 0.008,
"grad_norm": 6739339.394495674,
"learning_rate": 4e-08,
"logits/chosen": -2.3423686027526855,
"logits/rejected": -2.3319311141967773,
"logps/chosen": -72.6821060180664,
"logps/rejected": -76.68476867675781,
"loss": 128657.6,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 7.994120824150741e-05,
"rewards/margins": 0.00015745378914289176,
"rewards/rejected": -7.751256634946913e-05,
"step": 20
},
{
"epoch": 0.012,
"grad_norm": 6270796.093737289,
"learning_rate": 6e-08,
"logits/chosen": -2.3667407035827637,
"logits/rejected": -2.366872549057007,
"logps/chosen": -86.75081634521484,
"logps/rejected": -96.1201171875,
"loss": 129234.2,
"rewards/accuracies": 0.5,
"rewards/chosen": -9.695839253254235e-05,
"rewards/margins": -2.3627610062249005e-05,
"rewards/rejected": -7.333078247029334e-05,
"step": 30
},
{
"epoch": 0.016,
"grad_norm": 6487964.338687279,
"learning_rate": 8e-08,
"logits/chosen": -2.330949068069458,
"logits/rejected": -2.304487466812134,
"logps/chosen": -70.66746520996094,
"logps/rejected": -76.26786804199219,
"loss": 132677.1375,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.00011891069880221039,
"rewards/margins": 2.9055625418550335e-05,
"rewards/rejected": -0.00014796630421187729,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 5808382.473785103,
"learning_rate": 1e-07,
"logits/chosen": -2.3761391639709473,
"logits/rejected": -2.4001965522766113,
"logps/chosen": -64.84712219238281,
"logps/rejected": -85.47789001464844,
"loss": 131065.9,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.00034615895128808916,
"rewards/margins": 0.00022984863608144224,
"rewards/rejected": -0.0005760076455771923,
"step": 50
},
{
"epoch": 0.024,
"grad_norm": 4282982.05143524,
"learning_rate": 1.2e-07,
"logits/chosen": -2.3628551959991455,
"logits/rejected": -2.325425386428833,
"logps/chosen": -76.96721649169922,
"logps/rejected": -81.25682067871094,
"loss": 126675.9375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0004624463617801666,
"rewards/margins": 0.0005581938894465566,
"rewards/rejected": -0.001020640367642045,
"step": 60
},
{
"epoch": 0.028,
"grad_norm": 7692791.228759594,
"learning_rate": 1.4e-07,
"logits/chosen": -2.3956987857818604,
"logits/rejected": -2.4127683639526367,
"logps/chosen": -71.30229187011719,
"logps/rejected": -74.56432342529297,
"loss": 134539.0625,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0008852133760228753,
"rewards/margins": 9.680164657766e-05,
"rewards/rejected": -0.0009820150444284081,
"step": 70
},
{
"epoch": 0.032,
"grad_norm": 5583562.2282863185,
"learning_rate": 1.6e-07,
"logits/chosen": -2.3956503868103027,
"logits/rejected": -2.350037097930908,
"logps/chosen": -75.2908706665039,
"logps/rejected": -77.54324340820312,
"loss": 125353.275,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.0011667849030345678,
"rewards/margins": 0.00017681324970908463,
"rewards/rejected": -0.0013435978908091784,
"step": 80
},
{
"epoch": 0.036,
"grad_norm": 7411307.114758692,
"learning_rate": 1.8e-07,
"logits/chosen": -2.3411900997161865,
"logits/rejected": -2.32747220993042,
"logps/chosen": -76.68790435791016,
"logps/rejected": -77.229736328125,
"loss": 134888.05,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.0010598390363156796,
"rewards/margins": 0.00033332061138935387,
"rewards/rejected": -0.001393159618601203,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 6105565.361340655,
"learning_rate": 2e-07,
"logits/chosen": -2.2321219444274902,
"logits/rejected": -2.25978684425354,
"logps/chosen": -69.29805755615234,
"logps/rejected": -70.91548156738281,
"loss": 128186.8375,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0023287434596568346,
"rewards/margins": 0.0005028151208534837,
"rewards/rejected": -0.0028315584640949965,
"step": 100
},
{
"epoch": 0.044,
"grad_norm": 7344396.598489226,
"learning_rate": 2.1999999999999998e-07,
"logits/chosen": -2.1552886962890625,
"logits/rejected": -2.167569637298584,
"logps/chosen": -70.13446044921875,
"logps/rejected": -86.2125015258789,
"loss": 129394.525,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.005608140490949154,
"rewards/margins": 0.004356575198471546,
"rewards/rejected": -0.009964716620743275,
"step": 110
},
{
"epoch": 0.048,
"grad_norm": 9496198.88633965,
"learning_rate": 2.4e-07,
"logits/chosen": -1.9701220989227295,
"logits/rejected": -1.9230693578720093,
"logps/chosen": -110.21456146240234,
"logps/rejected": -117.58609771728516,
"loss": 129791.4125,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.01813032478094101,
"rewards/margins": 0.003227741690352559,
"rewards/rejected": -0.02135806903243065,
"step": 120
},
{
"epoch": 0.052,
"grad_norm": 10004391.66976001,
"learning_rate": 2.6e-07,
"logits/chosen": -2.015622854232788,
"logits/rejected": -2.026458740234375,
"logps/chosen": -100.22117614746094,
"logps/rejected": -107.8635482788086,
"loss": 129000.775,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.015937697142362595,
"rewards/margins": 0.008421100676059723,
"rewards/rejected": -0.024358797818422318,
"step": 130
},
{
"epoch": 0.056,
"grad_norm": 10735198.371540312,
"learning_rate": 2.8e-07,
"logits/chosen": -1.9458515644073486,
"logits/rejected": -1.964914321899414,
"logps/chosen": -100.13922882080078,
"logps/rejected": -123.00533294677734,
"loss": 132137.4875,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.024147292599081993,
"rewards/margins": 0.011009057983756065,
"rewards/rejected": -0.03515635430812836,
"step": 140
},
{
"epoch": 0.06,
"grad_norm": 7516435.594198061,
"learning_rate": 3e-07,
"logits/chosen": -1.9916763305664062,
"logits/rejected": -1.9970576763153076,
"logps/chosen": -89.16957092285156,
"logps/rejected": -104.74922180175781,
"loss": 125561.5375,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.020500652492046356,
"rewards/margins": 0.011713030748069286,
"rewards/rejected": -0.03221368417143822,
"step": 150
},
{
"epoch": 0.064,
"grad_norm": 7027700.618512748,
"learning_rate": 3.2e-07,
"logits/chosen": -2.159398078918457,
"logits/rejected": -2.1420938968658447,
"logps/chosen": -76.12110900878906,
"logps/rejected": -94.0234603881836,
"loss": 124492.5625,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.014212280511856079,
"rewards/margins": 0.01188136450946331,
"rewards/rejected": -0.02609364315867424,
"step": 160
},
{
"epoch": 0.068,
"grad_norm": 8112303.913406089,
"learning_rate": 3.4000000000000003e-07,
"logits/chosen": -2.103625774383545,
"logits/rejected": -2.0622053146362305,
"logps/chosen": -101.08997344970703,
"logps/rejected": -129.8938446044922,
"loss": 120661.3375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.013411534950137138,
"rewards/margins": 0.01583944633603096,
"rewards/rejected": -0.029250985011458397,
"step": 170
},
{
"epoch": 0.072,
"grad_norm": 8237392.210524994,
"learning_rate": 3.6e-07,
"logits/chosen": -2.1253855228424072,
"logits/rejected": -2.123330593109131,
"logps/chosen": -80.26612854003906,
"logps/rejected": -116.00606536865234,
"loss": 125813.875,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.014310337603092194,
"rewards/margins": 0.027810264378786087,
"rewards/rejected": -0.04212059825658798,
"step": 180
},
{
"epoch": 0.076,
"grad_norm": 8482656.343559477,
"learning_rate": 3.7999999999999996e-07,
"logits/chosen": -2.122274875640869,
"logits/rejected": -2.0924127101898193,
"logps/chosen": -70.27191162109375,
"logps/rejected": -88.20128631591797,
"loss": 122274.4125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.011920216493308544,
"rewards/margins": 0.012561318464577198,
"rewards/rejected": -0.02448153682053089,
"step": 190
},
{
"epoch": 0.08,
"grad_norm": 15995751.957306068,
"learning_rate": 4e-07,
"logits/chosen": -2.16402006149292,
"logits/rejected": -2.1568922996520996,
"logps/chosen": -84.34500885009766,
"logps/rejected": -106.30509185791016,
"loss": 124034.3625,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.019195228815078735,
"rewards/margins": 0.019908469170331955,
"rewards/rejected": -0.03910370171070099,
"step": 200
},
{
"epoch": 0.084,
"grad_norm": 9976473.779353945,
"learning_rate": 4.1999999999999995e-07,
"logits/chosen": -2.1947314739227295,
"logits/rejected": -2.155924081802368,
"logps/chosen": -85.31925964355469,
"logps/rejected": -116.8820571899414,
"loss": 133085.9375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.016604367643594742,
"rewards/margins": 0.010993210598826408,
"rewards/rejected": -0.027597576379776,
"step": 210
},
{
"epoch": 0.088,
"grad_norm": 7143746.706174395,
"learning_rate": 4.3999999999999997e-07,
"logits/chosen": -2.181243419647217,
"logits/rejected": -2.1664962768554688,
"logps/chosen": -74.75950622558594,
"logps/rejected": -87.78418731689453,
"loss": 127414.575,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.014525257050991058,
"rewards/margins": 0.003328158985823393,
"rewards/rejected": -0.01785341463983059,
"step": 220
},
{
"epoch": 0.092,
"grad_norm": 9204902.414337158,
"learning_rate": 4.6e-07,
"logits/chosen": -2.108741044998169,
"logits/rejected": -2.048841953277588,
"logps/chosen": -78.65644073486328,
"logps/rejected": -95.38871765136719,
"loss": 127270.9375,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.013477807864546776,
"rewards/margins": 0.008071732707321644,
"rewards/rejected": -0.021549541503190994,
"step": 230
},
{
"epoch": 0.096,
"grad_norm": 6495004.829819743,
"learning_rate": 4.8e-07,
"logits/chosen": -2.111128330230713,
"logits/rejected": -2.0940356254577637,
"logps/chosen": -92.46067810058594,
"logps/rejected": -117.76658630371094,
"loss": 122517.0,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.015996446833014488,
"rewards/margins": 0.015014531090855598,
"rewards/rejected": -0.031010976061224937,
"step": 240
},
{
"epoch": 0.1,
"grad_norm": 3914167.3327231077,
"learning_rate": 5e-07,
"logits/chosen": -2.129955768585205,
"logits/rejected": -2.1201798915863037,
"logps/chosen": -91.17083740234375,
"logps/rejected": -122.591064453125,
"loss": 126768.9,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.017495278269052505,
"rewards/margins": 0.019053593277931213,
"rewards/rejected": -0.03654887527227402,
"step": 250
},
{
"epoch": 0.104,
"grad_norm": 9274304.06015198,
"learning_rate": 4.977777777777777e-07,
"logits/chosen": -2.1294829845428467,
"logits/rejected": -2.1332812309265137,
"logps/chosen": -86.34416198730469,
"logps/rejected": -110.06596374511719,
"loss": 123661.6875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.019683022052049637,
"rewards/margins": 0.017954688519239426,
"rewards/rejected": -0.03763771429657936,
"step": 260
},
{
"epoch": 0.108,
"grad_norm": 5830092.205559107,
"learning_rate": 4.955555555555556e-07,
"logits/chosen": -2.1960532665252686,
"logits/rejected": -2.2218282222747803,
"logps/chosen": -94.33865356445312,
"logps/rejected": -113.83499908447266,
"loss": 125464.4375,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.018877552822232246,
"rewards/margins": 0.010512979701161385,
"rewards/rejected": -0.02939053252339363,
"step": 270
},
{
"epoch": 0.112,
"grad_norm": 8438742.306721646,
"learning_rate": 4.933333333333333e-07,
"logits/chosen": -2.257341146469116,
"logits/rejected": -2.3109583854675293,
"logps/chosen": -94.39613342285156,
"logps/rejected": -118.54805755615234,
"loss": 126082.3375,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.021040180698037148,
"rewards/margins": 0.01762349344789982,
"rewards/rejected": -0.038663674145936966,
"step": 280
},
{
"epoch": 0.116,
"grad_norm": 7914235.455845386,
"learning_rate": 4.91111111111111e-07,
"logits/chosen": -2.327070713043213,
"logits/rejected": -2.3570895195007324,
"logps/chosen": -87.95598602294922,
"logps/rejected": -108.05839538574219,
"loss": 129599.7125,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0221787728369236,
"rewards/margins": 0.01672218181192875,
"rewards/rejected": -0.0389009527862072,
"step": 290
},
{
"epoch": 0.12,
"grad_norm": 6457742.625324568,
"learning_rate": 4.888888888888889e-07,
"logits/chosen": -2.4012951850891113,
"logits/rejected": -2.4345765113830566,
"logps/chosen": -74.74217224121094,
"logps/rejected": -100.60356140136719,
"loss": 124479.875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.018593108281493187,
"rewards/margins": 0.021032758057117462,
"rewards/rejected": -0.0396258682012558,
"step": 300
},
{
"epoch": 0.124,
"grad_norm": 6578431.206476413,
"learning_rate": 4.866666666666666e-07,
"logits/chosen": -2.452791452407837,
"logits/rejected": -2.4812235832214355,
"logps/chosen": -95.68658447265625,
"logps/rejected": -111.42750549316406,
"loss": 126451.425,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.018242117017507553,
"rewards/margins": 0.01176674384623766,
"rewards/rejected": -0.030008861795067787,
"step": 310
},
{
"epoch": 0.128,
"grad_norm": 10851035.518032862,
"learning_rate": 4.844444444444445e-07,
"logits/chosen": -2.4537229537963867,
"logits/rejected": -2.4691717624664307,
"logps/chosen": -82.9326171875,
"logps/rejected": -116.93620300292969,
"loss": 123506.3125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02308080717921257,
"rewards/margins": 0.024886813014745712,
"rewards/rejected": -0.04796762019395828,
"step": 320
},
{
"epoch": 0.132,
"grad_norm": 9223772.443364851,
"learning_rate": 4.822222222222222e-07,
"logits/chosen": -2.391624927520752,
"logits/rejected": -2.407311201095581,
"logps/chosen": -91.67464447021484,
"logps/rejected": -117.0147705078125,
"loss": 121261.9125,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.023308029398322105,
"rewards/margins": 0.017484817653894424,
"rewards/rejected": -0.04079284518957138,
"step": 330
},
{
"epoch": 0.136,
"grad_norm": 8085358.939583512,
"learning_rate": 4.8e-07,
"logits/chosen": -2.48149037361145,
"logits/rejected": -2.4932546615600586,
"logps/chosen": -96.05111694335938,
"logps/rejected": -131.0735626220703,
"loss": 126914.2875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02739790640771389,
"rewards/margins": 0.029247354716062546,
"rewards/rejected": -0.056645262986421585,
"step": 340
},
{
"epoch": 0.14,
"grad_norm": 7944883.3667990295,
"learning_rate": 4.777777777777778e-07,
"logits/chosen": -2.45344877243042,
"logits/rejected": -2.5137851238250732,
"logps/chosen": -89.93304443359375,
"logps/rejected": -108.8600845336914,
"loss": 122914.55,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.0250026136636734,
"rewards/margins": 0.014374235644936562,
"rewards/rejected": -0.03937685117125511,
"step": 350
},
{
"epoch": 0.144,
"grad_norm": 11202153.92151104,
"learning_rate": 4.7555555555555554e-07,
"logits/chosen": -2.569916248321533,
"logits/rejected": -2.5970470905303955,
"logps/chosen": -95.05995178222656,
"logps/rejected": -127.2323226928711,
"loss": 124243.4125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.025854643434286118,
"rewards/margins": 0.023228293284773827,
"rewards/rejected": -0.049082934856414795,
"step": 360
},
{
"epoch": 0.148,
"grad_norm": 6901221.964419149,
"learning_rate": 4.733333333333333e-07,
"logits/chosen": -2.4675538539886475,
"logits/rejected": -2.4503865242004395,
"logps/chosen": -85.31706237792969,
"logps/rejected": -102.17588806152344,
"loss": 127540.3375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.024571221321821213,
"rewards/margins": 0.011107890866696835,
"rewards/rejected": -0.03567911311984062,
"step": 370
},
{
"epoch": 0.152,
"grad_norm": 6993857.423860367,
"learning_rate": 4.711111111111111e-07,
"logits/chosen": -2.459782123565674,
"logits/rejected": -2.48356032371521,
"logps/chosen": -110.59651184082031,
"logps/rejected": -130.7666778564453,
"loss": 127438.7,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.031783945858478546,
"rewards/margins": 0.014480188488960266,
"rewards/rejected": -0.046264130622148514,
"step": 380
},
{
"epoch": 0.156,
"grad_norm": 6436648.717203954,
"learning_rate": 4.6888888888888887e-07,
"logits/chosen": -2.4548838138580322,
"logits/rejected": -2.456662654876709,
"logps/chosen": -117.49080657958984,
"logps/rejected": -128.62191772460938,
"loss": 126004.45,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.027950212359428406,
"rewards/margins": 0.008381237275898457,
"rewards/rejected": -0.036331452429294586,
"step": 390
},
{
"epoch": 0.16,
"grad_norm": 7569273.392881057,
"learning_rate": 4.6666666666666666e-07,
"logits/chosen": -2.4525585174560547,
"logits/rejected": -2.4519400596618652,
"logps/chosen": -104.88145446777344,
"logps/rejected": -128.08416748046875,
"loss": 126857.475,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.022735530510544777,
"rewards/margins": 0.01596837118268013,
"rewards/rejected": -0.038703907281160355,
"step": 400
},
{
"epoch": 0.164,
"grad_norm": 6861745.545448723,
"learning_rate": 4.644444444444444e-07,
"logits/chosen": -2.5066254138946533,
"logits/rejected": -2.5164337158203125,
"logps/chosen": -86.57884216308594,
"logps/rejected": -119.33331298828125,
"loss": 124486.5125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.02158975414931774,
"rewards/margins": 0.030435938388109207,
"rewards/rejected": -0.0520256944000721,
"step": 410
},
{
"epoch": 0.168,
"grad_norm": 6923216.083132582,
"learning_rate": 4.622222222222222e-07,
"logits/chosen": -2.4752566814422607,
"logits/rejected": -2.463294506072998,
"logps/chosen": -85.61151885986328,
"logps/rejected": -102.90364837646484,
"loss": 124946.475,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.020689889788627625,
"rewards/margins": 0.011657947674393654,
"rewards/rejected": -0.03234783932566643,
"step": 420
},
{
"epoch": 0.172,
"grad_norm": 7450190.250939408,
"learning_rate": 4.6e-07,
"logits/chosen": -2.547010660171509,
"logits/rejected": -2.531663179397583,
"logps/chosen": -97.37740325927734,
"logps/rejected": -135.26629638671875,
"loss": 131081.35,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.025491004809737206,
"rewards/margins": 0.02565266191959381,
"rewards/rejected": -0.051143668591976166,
"step": 430
},
{
"epoch": 0.176,
"grad_norm": 6469045.345880665,
"learning_rate": 4.577777777777778e-07,
"logits/chosen": -2.6610159873962402,
"logits/rejected": -2.659968852996826,
"logps/chosen": -99.9223861694336,
"logps/rejected": -124.8481216430664,
"loss": 117640.1,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.027902353554964066,
"rewards/margins": 0.019689548760652542,
"rewards/rejected": -0.04759190231561661,
"step": 440
},
{
"epoch": 0.18,
"grad_norm": 7434403.705215201,
"learning_rate": 4.555555555555555e-07,
"logits/chosen": -2.700005054473877,
"logits/rejected": -2.6503853797912598,
"logps/chosen": -96.16465759277344,
"logps/rejected": -124.81199645996094,
"loss": 123096.9,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.026438185945153236,
"rewards/margins": 0.018792394548654556,
"rewards/rejected": -0.04523057863116264,
"step": 450
},
{
"epoch": 0.184,
"grad_norm": 7874629.531526446,
"learning_rate": 4.5333333333333326e-07,
"logits/chosen": -2.724388837814331,
"logits/rejected": -2.7652335166931152,
"logps/chosen": -101.94267272949219,
"logps/rejected": -110.65840148925781,
"loss": 128125.7,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.024334359914064407,
"rewards/margins": 0.005520271137356758,
"rewards/rejected": -0.029854634776711464,
"step": 460
},
{
"epoch": 0.188,
"grad_norm": 7560431.171192426,
"learning_rate": 4.511111111111111e-07,
"logits/chosen": -2.6493871212005615,
"logits/rejected": -2.6714179515838623,
"logps/chosen": -105.4653549194336,
"logps/rejected": -150.9534149169922,
"loss": 122614.7375,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02969837561249733,
"rewards/margins": 0.027200985699892044,
"rewards/rejected": -0.056899357587099075,
"step": 470
},
{
"epoch": 0.192,
"grad_norm": 8394101.213808972,
"learning_rate": 4.4888888888888885e-07,
"logits/chosen": -2.495974063873291,
"logits/rejected": -2.4936139583587646,
"logps/chosen": -105.5544662475586,
"logps/rejected": -139.51068115234375,
"loss": 127604.6125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.026578962802886963,
"rewards/margins": 0.027212362736463547,
"rewards/rejected": -0.05379132181406021,
"step": 480
},
{
"epoch": 0.196,
"grad_norm": 8149957.80282127,
"learning_rate": 4.4666666666666664e-07,
"logits/chosen": -2.44439959526062,
"logits/rejected": -2.4615180492401123,
"logps/chosen": -115.50093078613281,
"logps/rejected": -153.07492065429688,
"loss": 120568.475,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02712417207658291,
"rewards/margins": 0.03037952445447445,
"rewards/rejected": -0.05750369280576706,
"step": 490
},
{
"epoch": 0.2,
"grad_norm": 9689223.456248827,
"learning_rate": 4.444444444444444e-07,
"logits/chosen": -2.4836983680725098,
"logits/rejected": -2.4954071044921875,
"logps/chosen": -108.3786392211914,
"logps/rejected": -131.80975341796875,
"loss": 130260.4375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.03161335736513138,
"rewards/margins": 0.019194485619664192,
"rewards/rejected": -0.05080784484744072,
"step": 500
},
{
"epoch": 0.204,
"grad_norm": 9433316.334462296,
"learning_rate": 4.4222222222222223e-07,
"logits/chosen": -2.637115955352783,
"logits/rejected": -2.6541285514831543,
"logps/chosen": -106.7952880859375,
"logps/rejected": -119.38945007324219,
"loss": 125224.65,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.02900713123381138,
"rewards/margins": 0.011991321109235287,
"rewards/rejected": -0.04099845141172409,
"step": 510
},
{
"epoch": 0.208,
"grad_norm": 6547291.880532919,
"learning_rate": 4.3999999999999997e-07,
"logits/chosen": -2.6971993446350098,
"logits/rejected": -2.6676185131073,
"logps/chosen": -89.63105773925781,
"logps/rejected": -119.62776947021484,
"loss": 121176.4,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.025825385004281998,
"rewards/margins": 0.021441804245114326,
"rewards/rejected": -0.04726719111204147,
"step": 520
},
{
"epoch": 0.212,
"grad_norm": 6024322.818937677,
"learning_rate": 4.3777777777777776e-07,
"logits/chosen": -2.732637405395508,
"logits/rejected": -2.718721866607666,
"logps/chosen": -84.20713806152344,
"logps/rejected": -120.58128356933594,
"loss": 118993.7,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.023359743878245354,
"rewards/margins": 0.02675134316086769,
"rewards/rejected": -0.0501110777258873,
"step": 530
},
{
"epoch": 0.216,
"grad_norm": 6460660.035353449,
"learning_rate": 4.355555555555555e-07,
"logits/chosen": -2.488724708557129,
"logits/rejected": -2.504575490951538,
"logps/chosen": -98.4452133178711,
"logps/rejected": -128.78163146972656,
"loss": 123458.275,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.026253730058670044,
"rewards/margins": 0.024806631729006767,
"rewards/rejected": -0.05106035992503166,
"step": 540
},
{
"epoch": 0.22,
"grad_norm": 8461699.658062043,
"learning_rate": 4.3333333333333335e-07,
"logits/chosen": -2.5291218757629395,
"logits/rejected": -2.536240339279175,
"logps/chosen": -100.16661071777344,
"logps/rejected": -149.42355346679688,
"loss": 124909.075,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.027327323332428932,
"rewards/margins": 0.036261945962905884,
"rewards/rejected": -0.06358926743268967,
"step": 550
},
{
"epoch": 0.224,
"grad_norm": 6921956.012662945,
"learning_rate": 4.311111111111111e-07,
"logits/chosen": -2.628513813018799,
"logits/rejected": -2.6058189868927,
"logps/chosen": -107.0963363647461,
"logps/rejected": -113.91001892089844,
"loss": 127428.7375,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.03390585258603096,
"rewards/margins": 0.005185864400118589,
"rewards/rejected": -0.039091721177101135,
"step": 560
},
{
"epoch": 0.228,
"grad_norm": 8197834.099947786,
"learning_rate": 4.2888888888888883e-07,
"logits/chosen": -2.561366558074951,
"logits/rejected": -2.561455249786377,
"logps/chosen": -105.6274185180664,
"logps/rejected": -136.18792724609375,
"loss": 126906.175,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.031804267317056656,
"rewards/margins": 0.02525196596980095,
"rewards/rejected": -0.057056229561567307,
"step": 570
},
{
"epoch": 0.232,
"grad_norm": 8846278.644102238,
"learning_rate": 4.266666666666667e-07,
"logits/chosen": -2.6525886058807373,
"logits/rejected": -2.588754653930664,
"logps/chosen": -136.8567657470703,
"logps/rejected": -178.04689025878906,
"loss": 132691.7875,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.04016602411866188,
"rewards/margins": 0.027137309312820435,
"rewards/rejected": -0.06730332970619202,
"step": 580
},
{
"epoch": 0.236,
"grad_norm": 6333324.696405062,
"learning_rate": 4.244444444444444e-07,
"logits/chosen": -2.4680473804473877,
"logits/rejected": -2.465623378753662,
"logps/chosen": -123.52412414550781,
"logps/rejected": -135.17831420898438,
"loss": 129527.5,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03627150505781174,
"rewards/margins": 0.012457914650440216,
"rewards/rejected": -0.048729415982961655,
"step": 590
},
{
"epoch": 0.24,
"grad_norm": 7036898.682503791,
"learning_rate": 4.222222222222222e-07,
"logits/chosen": -2.584538459777832,
"logits/rejected": -2.579874038696289,
"logps/chosen": -99.02510070800781,
"logps/rejected": -107.3072280883789,
"loss": 128720.725,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03315219283103943,
"rewards/margins": 0.003997699357569218,
"rewards/rejected": -0.03714989498257637,
"step": 600
},
{
"epoch": 0.244,
"grad_norm": 7159293.986125982,
"learning_rate": 4.1999999999999995e-07,
"logits/chosen": -2.649663209915161,
"logits/rejected": -2.6600253582000732,
"logps/chosen": -101.28582000732422,
"logps/rejected": -119.93243408203125,
"loss": 129047.95,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.031149577349424362,
"rewards/margins": 0.011909973807632923,
"rewards/rejected": -0.04305955022573471,
"step": 610
},
{
"epoch": 0.248,
"grad_norm": 7467292.937718221,
"learning_rate": 4.177777777777778e-07,
"logits/chosen": -2.572115898132324,
"logits/rejected": -2.525055408477783,
"logps/chosen": -105.48246765136719,
"logps/rejected": -131.28749084472656,
"loss": 122677.825,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03393036499619484,
"rewards/margins": 0.017631059512495995,
"rewards/rejected": -0.051561422646045685,
"step": 620
},
{
"epoch": 0.252,
"grad_norm": 6649301.452495339,
"learning_rate": 4.1555555555555554e-07,
"logits/chosen": -2.5688421726226807,
"logits/rejected": -2.5850729942321777,
"logps/chosen": -106.14555358886719,
"logps/rejected": -141.26199340820312,
"loss": 124169.6,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027926957234740257,
"rewards/margins": 0.031329791992902756,
"rewards/rejected": -0.05925675109028816,
"step": 630
},
{
"epoch": 0.256,
"grad_norm": 5701852.577919224,
"learning_rate": 4.1333333333333333e-07,
"logits/chosen": -2.5793604850769043,
"logits/rejected": -2.6216492652893066,
"logps/chosen": -94.40669250488281,
"logps/rejected": -141.09725952148438,
"loss": 119443.9125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.029792586341500282,
"rewards/margins": 0.0306295957416296,
"rewards/rejected": -0.06042218208312988,
"step": 640
},
{
"epoch": 0.26,
"grad_norm": 7828661.867350275,
"learning_rate": 4.1111111111111107e-07,
"logits/chosen": -2.5071213245391846,
"logits/rejected": -2.5167384147644043,
"logps/chosen": -100.64595794677734,
"logps/rejected": -135.44271850585938,
"loss": 127055.0875,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03445083647966385,
"rewards/margins": 0.021719755604863167,
"rewards/rejected": -0.056170590221881866,
"step": 650
},
{
"epoch": 0.264,
"grad_norm": 7230836.816701007,
"learning_rate": 4.088888888888889e-07,
"logits/chosen": -2.49631404876709,
"logits/rejected": -2.536076068878174,
"logps/chosen": -101.35478210449219,
"logps/rejected": -124.0557632446289,
"loss": 128004.0625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03691136837005615,
"rewards/margins": 0.013063013553619385,
"rewards/rejected": -0.04997437819838524,
"step": 660
},
{
"epoch": 0.268,
"grad_norm": 9181742.684525523,
"learning_rate": 4.0666666666666666e-07,
"logits/chosen": -2.5028629302978516,
"logits/rejected": -2.5143425464630127,
"logps/chosen": -114.0814437866211,
"logps/rejected": -130.69984436035156,
"loss": 132355.1,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03546031937003136,
"rewards/margins": 0.012831469066441059,
"rewards/rejected": -0.04829178377985954,
"step": 670
},
{
"epoch": 0.272,
"grad_norm": 5953835.069109496,
"learning_rate": 4.044444444444444e-07,
"logits/chosen": -2.3693368434906006,
"logits/rejected": -2.3933675289154053,
"logps/chosen": -96.07215881347656,
"logps/rejected": -133.98353576660156,
"loss": 122900.6875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.029592838138341904,
"rewards/margins": 0.028876733034849167,
"rewards/rejected": -0.05846957489848137,
"step": 680
},
{
"epoch": 0.276,
"grad_norm": 9742020.165182771,
"learning_rate": 4.022222222222222e-07,
"logits/chosen": -2.362238645553589,
"logits/rejected": -2.3378891944885254,
"logps/chosen": -128.8614501953125,
"logps/rejected": -144.07827758789062,
"loss": 125425.0375,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.03773171454668045,
"rewards/margins": 0.013301841914653778,
"rewards/rejected": -0.05103355646133423,
"step": 690
},
{
"epoch": 0.28,
"grad_norm": 5698717.703929527,
"learning_rate": 4e-07,
"logits/chosen": -2.4838829040527344,
"logits/rejected": -2.50342059135437,
"logps/chosen": -111.4194107055664,
"logps/rejected": -129.75094604492188,
"loss": 125053.125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03320156782865524,
"rewards/margins": 0.017379306256771088,
"rewards/rejected": -0.05058088153600693,
"step": 700
},
{
"epoch": 0.284,
"grad_norm": 8985877.177134423,
"learning_rate": 3.977777777777778e-07,
"logits/chosen": -2.495482921600342,
"logits/rejected": -2.4964957237243652,
"logps/chosen": -122.45703125,
"logps/rejected": -142.83428955078125,
"loss": 128369.4125,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03306376561522484,
"rewards/margins": 0.016845058649778366,
"rewards/rejected": -0.049908824265003204,
"step": 710
},
{
"epoch": 0.288,
"grad_norm": 6293531.48238979,
"learning_rate": 3.955555555555555e-07,
"logits/chosen": -2.391200065612793,
"logits/rejected": -2.4489917755126953,
"logps/chosen": -98.37910461425781,
"logps/rejected": -126.7882080078125,
"loss": 127457.0,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029601294547319412,
"rewards/margins": 0.01803305558860302,
"rewards/rejected": -0.04763435199856758,
"step": 720
},
{
"epoch": 0.292,
"grad_norm": 7897192.107065841,
"learning_rate": 3.933333333333333e-07,
"logits/chosen": -2.4582555294036865,
"logits/rejected": -2.4759137630462646,
"logps/chosen": -98.81364440917969,
"logps/rejected": -144.63548278808594,
"loss": 123344.0875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029745137318968773,
"rewards/margins": 0.03918560594320297,
"rewards/rejected": -0.06893075257539749,
"step": 730
},
{
"epoch": 0.296,
"grad_norm": 6327010.4235527,
"learning_rate": 3.911111111111111e-07,
"logits/chosen": -2.578629732131958,
"logits/rejected": -2.5296969413757324,
"logps/chosen": -93.8338623046875,
"logps/rejected": -119.47264099121094,
"loss": 130155.3875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.031289439648389816,
"rewards/margins": 0.02071293443441391,
"rewards/rejected": -0.052002377808094025,
"step": 740
},
{
"epoch": 0.3,
"grad_norm": 6618521.62330563,
"learning_rate": 3.888888888888889e-07,
"logits/chosen": -2.5220892429351807,
"logits/rejected": -2.533546209335327,
"logps/chosen": -115.8681869506836,
"logps/rejected": -141.22885131835938,
"loss": 119614.875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027656156569719315,
"rewards/margins": 0.018271705135703087,
"rewards/rejected": -0.04592785984277725,
"step": 750
},
{
"epoch": 0.304,
"grad_norm": 10187163.188769344,
"learning_rate": 3.8666666666666664e-07,
"logits/chosen": -2.5118823051452637,
"logits/rejected": -2.504910945892334,
"logps/chosen": -101.41685485839844,
"logps/rejected": -148.4906005859375,
"loss": 126592.4875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.033656515181064606,
"rewards/margins": 0.03055490553379059,
"rewards/rejected": -0.06421142816543579,
"step": 760
},
{
"epoch": 0.308,
"grad_norm": 5303031.219251705,
"learning_rate": 3.8444444444444443e-07,
"logits/chosen": -2.430718421936035,
"logits/rejected": -2.42429518699646,
"logps/chosen": -104.3238754272461,
"logps/rejected": -152.8984832763672,
"loss": 120932.45,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02648145519196987,
"rewards/margins": 0.034856077283620834,
"rewards/rejected": -0.06133753061294556,
"step": 770
},
{
"epoch": 0.312,
"grad_norm": 7788606.775046531,
"learning_rate": 3.822222222222222e-07,
"logits/chosen": -2.440924882888794,
"logits/rejected": -2.420545816421509,
"logps/chosen": -115.5602035522461,
"logps/rejected": -155.5218048095703,
"loss": 125781.1125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.033296506851911545,
"rewards/margins": 0.026110276579856873,
"rewards/rejected": -0.059406787157058716,
"step": 780
},
{
"epoch": 0.316,
"grad_norm": 8342190.296045118,
"learning_rate": 3.7999999999999996e-07,
"logits/chosen": -2.476647138595581,
"logits/rejected": -2.469700813293457,
"logps/chosen": -128.41778564453125,
"logps/rejected": -171.7101287841797,
"loss": 123186.7,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.036413662135601044,
"rewards/margins": 0.02540646493434906,
"rewards/rejected": -0.0618201307952404,
"step": 790
},
{
"epoch": 0.32,
"grad_norm": 6557257.336959155,
"learning_rate": 3.7777777777777775e-07,
"logits/chosen": -2.5064010620117188,
"logits/rejected": -2.4810726642608643,
"logps/chosen": -92.31603240966797,
"logps/rejected": -137.16452026367188,
"loss": 122089.9625,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.026451414451003075,
"rewards/margins": 0.033910416066646576,
"rewards/rejected": -0.0603618249297142,
"step": 800
},
{
"epoch": 0.324,
"grad_norm": 6904823.359277868,
"learning_rate": 3.755555555555555e-07,
"logits/chosen": -2.4048755168914795,
"logits/rejected": -2.4163169860839844,
"logps/chosen": -103.93983459472656,
"logps/rejected": -146.17251586914062,
"loss": 125580.95,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.034494899213314056,
"rewards/margins": 0.028069961816072464,
"rewards/rejected": -0.06256486475467682,
"step": 810
},
{
"epoch": 0.328,
"grad_norm": 7723585.3043322805,
"learning_rate": 3.7333333333333334e-07,
"logits/chosen": -2.3999173641204834,
"logits/rejected": -2.3777313232421875,
"logps/chosen": -98.98230743408203,
"logps/rejected": -113.75994873046875,
"loss": 128410.575,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.0317092090845108,
"rewards/margins": 0.005884192418307066,
"rewards/rejected": -0.037593401968479156,
"step": 820
},
{
"epoch": 0.332,
"grad_norm": 5742576.756144718,
"learning_rate": 3.711111111111111e-07,
"logits/chosen": -2.3831002712249756,
"logits/rejected": -2.3882925510406494,
"logps/chosen": -94.30198669433594,
"logps/rejected": -129.58078002929688,
"loss": 126154.8625,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.02777601219713688,
"rewards/margins": 0.01992596499621868,
"rewards/rejected": -0.04770197719335556,
"step": 830
},
{
"epoch": 0.336,
"grad_norm": 7609192.056262804,
"learning_rate": 3.688888888888889e-07,
"logits/chosen": -2.434124708175659,
"logits/rejected": -2.4821083545684814,
"logps/chosen": -96.55790710449219,
"logps/rejected": -122.23677062988281,
"loss": 123363.6125,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.026610519737005234,
"rewards/margins": 0.02544989623129368,
"rewards/rejected": -0.05206041410565376,
"step": 840
},
{
"epoch": 0.34,
"grad_norm": 8034675.762755762,
"learning_rate": 3.666666666666666e-07,
"logits/chosen": -2.3713624477386475,
"logits/rejected": -2.366699457168579,
"logps/chosen": -115.35933685302734,
"logps/rejected": -129.19752502441406,
"loss": 126533.75,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0336347371339798,
"rewards/margins": 0.0172466691583395,
"rewards/rejected": -0.05088140815496445,
"step": 850
},
{
"epoch": 0.344,
"grad_norm": 5975456.469702007,
"learning_rate": 3.6444444444444446e-07,
"logits/chosen": -2.527848482131958,
"logits/rejected": -2.5321898460388184,
"logps/chosen": -101.052734375,
"logps/rejected": -146.85986328125,
"loss": 117886.0,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.026471996679902077,
"rewards/margins": 0.038311395794153214,
"rewards/rejected": -0.06478338688611984,
"step": 860
},
{
"epoch": 0.348,
"grad_norm": 6906670.4436170915,
"learning_rate": 3.622222222222222e-07,
"logits/chosen": -2.5193216800689697,
"logits/rejected": -2.55930495262146,
"logps/chosen": -103.41465759277344,
"logps/rejected": -136.6109161376953,
"loss": 129100.275,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.029144983738660812,
"rewards/margins": 0.02550877258181572,
"rewards/rejected": -0.05465375632047653,
"step": 870
},
{
"epoch": 0.352,
"grad_norm": 7471407.013777157,
"learning_rate": 3.6e-07,
"logits/chosen": -2.451826572418213,
"logits/rejected": -2.4506518840789795,
"logps/chosen": -98.89453125,
"logps/rejected": -141.08523559570312,
"loss": 122662.65,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.025945227593183517,
"rewards/margins": 0.03901258111000061,
"rewards/rejected": -0.06495781242847443,
"step": 880
},
{
"epoch": 0.356,
"grad_norm": 6728467.134306638,
"learning_rate": 3.5777777777777773e-07,
"logits/chosen": -2.5389392375946045,
"logits/rejected": -2.5246338844299316,
"logps/chosen": -120.97161865234375,
"logps/rejected": -168.5087432861328,
"loss": 122320.4375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.03245999664068222,
"rewards/margins": 0.04135856777429581,
"rewards/rejected": -0.07381855696439743,
"step": 890
},
{
"epoch": 0.36,
"grad_norm": 7104916.695741348,
"learning_rate": 3.5555555555555553e-07,
"logits/chosen": -2.5385003089904785,
"logits/rejected": -2.5232315063476562,
"logps/chosen": -87.90635681152344,
"logps/rejected": -116.77327728271484,
"loss": 126440.375,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.032553546130657196,
"rewards/margins": 0.01972118578851223,
"rewards/rejected": -0.052274733781814575,
"step": 900
},
{
"epoch": 0.364,
"grad_norm": 5970259.1415026905,
"learning_rate": 3.533333333333333e-07,
"logits/chosen": -2.508732557296753,
"logits/rejected": -2.5209858417510986,
"logps/chosen": -101.33818054199219,
"logps/rejected": -125.9253921508789,
"loss": 121322.675,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03439612686634064,
"rewards/margins": 0.02015717700123787,
"rewards/rejected": -0.054553307592868805,
"step": 910
},
{
"epoch": 0.368,
"grad_norm": 8365175.917846098,
"learning_rate": 3.5111111111111106e-07,
"logits/chosen": -2.4121992588043213,
"logits/rejected": -2.403446674346924,
"logps/chosen": -110.45259857177734,
"logps/rejected": -146.91270446777344,
"loss": 124905.35,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.03642472252249718,
"rewards/margins": 0.029706323519349098,
"rewards/rejected": -0.06613104045391083,
"step": 920
},
{
"epoch": 0.372,
"grad_norm": 7152589.843566105,
"learning_rate": 3.488888888888889e-07,
"logits/chosen": -2.4605088233947754,
"logits/rejected": -2.5225741863250732,
"logps/chosen": -104.7183609008789,
"logps/rejected": -129.02984619140625,
"loss": 125537.525,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.030679216608405113,
"rewards/margins": 0.023534944280982018,
"rewards/rejected": -0.05421415716409683,
"step": 930
},
{
"epoch": 0.376,
"grad_norm": 7902918.244011351,
"learning_rate": 3.4666666666666665e-07,
"logits/chosen": -2.4962337017059326,
"logits/rejected": -2.560391664505005,
"logps/chosen": -88.20844268798828,
"logps/rejected": -139.12539672851562,
"loss": 120117.675,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.023946184664964676,
"rewards/margins": 0.035228628665208817,
"rewards/rejected": -0.05917481333017349,
"step": 940
},
{
"epoch": 0.38,
"grad_norm": 7058088.493943834,
"learning_rate": 3.4444444444444444e-07,
"logits/chosen": -2.4186971187591553,
"logits/rejected": -2.479079484939575,
"logps/chosen": -109.65245056152344,
"logps/rejected": -145.39144897460938,
"loss": 131717.475,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.035849470645189285,
"rewards/margins": 0.030028488487005234,
"rewards/rejected": -0.06587796658277512,
"step": 950
},
{
"epoch": 0.384,
"grad_norm": 9519920.624854606,
"learning_rate": 3.422222222222222e-07,
"logits/chosen": -2.388206958770752,
"logits/rejected": -2.4197933673858643,
"logps/chosen": -130.57398986816406,
"logps/rejected": -206.50595092773438,
"loss": 121014.6125,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03395534306764603,
"rewards/margins": 0.04584265127778053,
"rewards/rejected": -0.07979799807071686,
"step": 960
},
{
"epoch": 0.388,
"grad_norm": 6508580.208947205,
"learning_rate": 3.4000000000000003e-07,
"logits/chosen": -2.4630963802337646,
"logits/rejected": -2.4617621898651123,
"logps/chosen": -97.78787231445312,
"logps/rejected": -137.83377075195312,
"loss": 129298.375,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.03728308528661728,
"rewards/margins": 0.032224711030721664,
"rewards/rejected": -0.06950780749320984,
"step": 970
},
{
"epoch": 0.392,
"grad_norm": 8870008.273394352,
"learning_rate": 3.3777777777777777e-07,
"logits/chosen": -2.346057176589966,
"logits/rejected": -2.388782262802124,
"logps/chosen": -99.29377746582031,
"logps/rejected": -174.5662841796875,
"loss": 123724.2875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.025686006993055344,
"rewards/margins": 0.048049140721559525,
"rewards/rejected": -0.07373513281345367,
"step": 980
},
{
"epoch": 0.396,
"grad_norm": 8011988.902105641,
"learning_rate": 3.3555555555555556e-07,
"logits/chosen": -2.392421007156372,
"logits/rejected": -2.373751163482666,
"logps/chosen": -98.69950866699219,
"logps/rejected": -133.2752227783203,
"loss": 122756.525,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.029735039919614792,
"rewards/margins": 0.027242526412010193,
"rewards/rejected": -0.05697755888104439,
"step": 990
},
{
"epoch": 0.4,
"grad_norm": 9489098.770004237,
"learning_rate": 3.333333333333333e-07,
"logits/chosen": -2.326141595840454,
"logits/rejected": -2.350912094116211,
"logps/chosen": -113.98426818847656,
"logps/rejected": -161.37271118164062,
"loss": 120187.8125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03596794605255127,
"rewards/margins": 0.03252139315009117,
"rewards/rejected": -0.06848934292793274,
"step": 1000
},
{
"epoch": 0.404,
"grad_norm": 7831638.304742374,
"learning_rate": 3.311111111111111e-07,
"logits/chosen": -2.4753966331481934,
"logits/rejected": -2.4720969200134277,
"logps/chosen": -93.01488494873047,
"logps/rejected": -129.34759521484375,
"loss": 123952.0625,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.029807209968566895,
"rewards/margins": 0.028074974194169044,
"rewards/rejected": -0.05788217857480049,
"step": 1010
},
{
"epoch": 0.408,
"grad_norm": 9432041.545640234,
"learning_rate": 3.288888888888889e-07,
"logits/chosen": -2.4366466999053955,
"logits/rejected": -2.4423012733459473,
"logps/chosen": -104.15936279296875,
"logps/rejected": -127.2192611694336,
"loss": 126318.675,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.03481433913111687,
"rewards/margins": 0.019000452011823654,
"rewards/rejected": -0.05381479114294052,
"step": 1020
},
{
"epoch": 0.412,
"grad_norm": 8015912.365619365,
"learning_rate": 3.2666666666666663e-07,
"logits/chosen": -2.55189847946167,
"logits/rejected": -2.544987440109253,
"logps/chosen": -107.0470199584961,
"logps/rejected": -162.27200317382812,
"loss": 121624.775,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.034415554255247116,
"rewards/margins": 0.04376577213406563,
"rewards/rejected": -0.07818132638931274,
"step": 1030
},
{
"epoch": 0.416,
"grad_norm": 6289031.406175271,
"learning_rate": 3.244444444444444e-07,
"logits/chosen": -2.46891450881958,
"logits/rejected": -2.4879581928253174,
"logps/chosen": -95.2767562866211,
"logps/rejected": -143.16920471191406,
"loss": 121056.125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.031673580408096313,
"rewards/margins": 0.03125213831663132,
"rewards/rejected": -0.06292571872472763,
"step": 1040
},
{
"epoch": 0.42,
"grad_norm": 8474897.39551202,
"learning_rate": 3.222222222222222e-07,
"logits/chosen": -2.5424282550811768,
"logits/rejected": -2.5662999153137207,
"logps/chosen": -111.43955993652344,
"logps/rejected": -153.2001190185547,
"loss": 122305.1625,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.034298766404390335,
"rewards/margins": 0.029853323474526405,
"rewards/rejected": -0.06415208429098129,
"step": 1050
},
{
"epoch": 0.424,
"grad_norm": 7118829.871963521,
"learning_rate": 3.2e-07,
"logits/chosen": -2.5913608074188232,
"logits/rejected": -2.607485294342041,
"logps/chosen": -110.69046783447266,
"logps/rejected": -146.65023803710938,
"loss": 120745.8125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02970978617668152,
"rewards/margins": 0.02858895994722843,
"rewards/rejected": -0.0582987479865551,
"step": 1060
},
{
"epoch": 0.428,
"grad_norm": 6184465.945134909,
"learning_rate": 3.1777777777777775e-07,
"logits/chosen": -2.61472225189209,
"logits/rejected": -2.6045632362365723,
"logps/chosen": -118.44596862792969,
"logps/rejected": -141.05259704589844,
"loss": 119233.675,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.030257636681199074,
"rewards/margins": 0.027423173189163208,
"rewards/rejected": -0.05768080800771713,
"step": 1070
},
{
"epoch": 0.432,
"grad_norm": 8384350.327301131,
"learning_rate": 3.1555555555555554e-07,
"logits/chosen": -2.5862181186676025,
"logits/rejected": -2.5921201705932617,
"logps/chosen": -107.20783996582031,
"logps/rejected": -146.17771911621094,
"loss": 123723.375,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.028097758069634438,
"rewards/margins": 0.03525155037641525,
"rewards/rejected": -0.06334930658340454,
"step": 1080
},
{
"epoch": 0.436,
"grad_norm": 6672967.979703066,
"learning_rate": 3.1333333333333333e-07,
"logits/chosen": -2.5249099731445312,
"logits/rejected": -2.5252327919006348,
"logps/chosen": -82.09342956542969,
"logps/rejected": -124.22785949707031,
"loss": 125011.25,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.02518371120095253,
"rewards/margins": 0.0352654904127121,
"rewards/rejected": -0.06044920161366463,
"step": 1090
},
{
"epoch": 0.44,
"grad_norm": 8650903.887605142,
"learning_rate": 3.111111111111111e-07,
"logits/chosen": -2.4037208557128906,
"logits/rejected": -2.4414310455322266,
"logps/chosen": -97.84339141845703,
"logps/rejected": -142.56967163085938,
"loss": 119550.375,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.029644513502717018,
"rewards/margins": 0.03777293115854263,
"rewards/rejected": -0.0674174427986145,
"step": 1100
},
{
"epoch": 0.444,
"grad_norm": 6832446.375717195,
"learning_rate": 3.0888888888888887e-07,
"logits/chosen": -2.4431042671203613,
"logits/rejected": -2.456604480743408,
"logps/chosen": -107.29023742675781,
"logps/rejected": -135.47000122070312,
"loss": 121918.525,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.03453664109110832,
"rewards/margins": 0.019270362332463264,
"rewards/rejected": -0.05380700156092644,
"step": 1110
},
{
"epoch": 0.448,
"grad_norm": 6428580.094510342,
"learning_rate": 3.066666666666666e-07,
"logits/chosen": -2.515045642852783,
"logits/rejected": -2.51838755607605,
"logps/chosen": -117.26094818115234,
"logps/rejected": -153.82720947265625,
"loss": 127075.725,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.031917281448841095,
"rewards/margins": 0.024132903665304184,
"rewards/rejected": -0.05605018883943558,
"step": 1120
},
{
"epoch": 0.452,
"grad_norm": 7059143.842956961,
"learning_rate": 3.0444444444444445e-07,
"logits/chosen": -2.4530346393585205,
"logits/rejected": -2.4681754112243652,
"logps/chosen": -106.003662109375,
"logps/rejected": -166.18124389648438,
"loss": 125480.3875,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.03530178219079971,
"rewards/margins": 0.03853614255785942,
"rewards/rejected": -0.07383792102336884,
"step": 1130
},
{
"epoch": 0.456,
"grad_norm": 7003392.120968072,
"learning_rate": 3.022222222222222e-07,
"logits/chosen": -2.526331901550293,
"logits/rejected": -2.5595362186431885,
"logps/chosen": -103.13214111328125,
"logps/rejected": -151.08694458007812,
"loss": 122330.225,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.0366961732506752,
"rewards/margins": 0.045485951006412506,
"rewards/rejected": -0.08218212425708771,
"step": 1140
},
{
"epoch": 0.46,
"grad_norm": 8693683.344797961,
"learning_rate": 3e-07,
"logits/chosen": -2.570127248764038,
"logits/rejected": -2.5569756031036377,
"logps/chosen": -118.91337585449219,
"logps/rejected": -157.19509887695312,
"loss": 121470.875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.030104130506515503,
"rewards/margins": 0.03460243344306946,
"rewards/rejected": -0.06470657885074615,
"step": 1150
},
{
"epoch": 0.464,
"grad_norm": 9534213.097492808,
"learning_rate": 2.9777777777777773e-07,
"logits/chosen": -2.5106987953186035,
"logits/rejected": -2.4634547233581543,
"logps/chosen": -89.20586395263672,
"logps/rejected": -149.52505493164062,
"loss": 116528.675,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02851666882634163,
"rewards/margins": 0.04616966471076012,
"rewards/rejected": -0.07468634098768234,
"step": 1160
},
{
"epoch": 0.468,
"grad_norm": 8777276.722714778,
"learning_rate": 2.9555555555555557e-07,
"logits/chosen": -2.488731861114502,
"logits/rejected": -2.4706058502197266,
"logps/chosen": -112.614501953125,
"logps/rejected": -158.08847045898438,
"loss": 123641.3375,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.037982989102602005,
"rewards/margins": 0.035983096808195114,
"rewards/rejected": -0.07396609336137772,
"step": 1170
},
{
"epoch": 0.472,
"grad_norm": 7805149.222381511,
"learning_rate": 2.933333333333333e-07,
"logits/chosen": -2.4993319511413574,
"logits/rejected": -2.4856998920440674,
"logps/chosen": -103.1233901977539,
"logps/rejected": -121.76432037353516,
"loss": 122802.05,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03225432708859444,
"rewards/margins": 0.015282504260540009,
"rewards/rejected": -0.04753682762384415,
"step": 1180
},
{
"epoch": 0.476,
"grad_norm": 9127383.125403812,
"learning_rate": 2.911111111111111e-07,
"logits/chosen": -2.3260111808776855,
"logits/rejected": -2.384988784790039,
"logps/chosen": -111.52685546875,
"logps/rejected": -160.5020751953125,
"loss": 125717.575,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.03155955299735069,
"rewards/margins": 0.043639086186885834,
"rewards/rejected": -0.07519863545894623,
"step": 1190
},
{
"epoch": 0.48,
"grad_norm": 8277710.5533771645,
"learning_rate": 2.8888888888888885e-07,
"logits/chosen": -2.4511587619781494,
"logits/rejected": -2.473177433013916,
"logps/chosen": -101.6271743774414,
"logps/rejected": -154.0011749267578,
"loss": 119860.575,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03446241840720177,
"rewards/margins": 0.0441209152340889,
"rewards/rejected": -0.07858333736658096,
"step": 1200
},
{
"epoch": 0.484,
"grad_norm": 9000426.122325586,
"learning_rate": 2.866666666666667e-07,
"logits/chosen": -2.352407693862915,
"logits/rejected": -2.3879640102386475,
"logps/chosen": -111.2037353515625,
"logps/rejected": -155.64559936523438,
"loss": 125368.55,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.033860720694065094,
"rewards/margins": 0.028398964554071426,
"rewards/rejected": -0.06225968152284622,
"step": 1210
},
{
"epoch": 0.488,
"grad_norm": 7439506.210055379,
"learning_rate": 2.8444444444444443e-07,
"logits/chosen": -2.4001262187957764,
"logits/rejected": -2.406364917755127,
"logps/chosen": -96.29491424560547,
"logps/rejected": -133.1979522705078,
"loss": 124972.3125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.031061682850122452,
"rewards/margins": 0.028077807277441025,
"rewards/rejected": -0.05913949012756348,
"step": 1220
},
{
"epoch": 0.492,
"grad_norm": 8522710.668188507,
"learning_rate": 2.8222222222222217e-07,
"logits/chosen": -2.455540418624878,
"logits/rejected": -2.5077738761901855,
"logps/chosen": -118.55711364746094,
"logps/rejected": -183.99917602539062,
"loss": 123136.65,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.03442969545722008,
"rewards/margins": 0.04292518272995949,
"rewards/rejected": -0.07735487818717957,
"step": 1230
},
{
"epoch": 0.496,
"grad_norm": 7800702.638161312,
"learning_rate": 2.8e-07,
"logits/chosen": -2.4330172538757324,
"logits/rejected": -2.4518215656280518,
"logps/chosen": -106.44327545166016,
"logps/rejected": -148.51351928710938,
"loss": 123850.675,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.032119907438755035,
"rewards/margins": 0.03575160354375839,
"rewards/rejected": -0.06787151843309402,
"step": 1240
},
{
"epoch": 0.5,
"grad_norm": 6293849.548335739,
"learning_rate": 2.7777777777777776e-07,
"logits/chosen": -2.5701098442077637,
"logits/rejected": -2.563598394393921,
"logps/chosen": -112.28812408447266,
"logps/rejected": -145.1765594482422,
"loss": 122208.9125,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.034340750426054,
"rewards/margins": 0.029792601242661476,
"rewards/rejected": -0.06413334608078003,
"step": 1250
},
{
"epoch": 0.504,
"grad_norm": 8164963.35867732,
"learning_rate": 2.7555555555555555e-07,
"logits/chosen": -2.5201942920684814,
"logits/rejected": -2.491058111190796,
"logps/chosen": -116.97322845458984,
"logps/rejected": -161.62765502929688,
"loss": 123380.825,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.04085034877061844,
"rewards/margins": 0.03332878276705742,
"rewards/rejected": -0.07417913526296616,
"step": 1260
},
{
"epoch": 0.508,
"grad_norm": 7878494.751400375,
"learning_rate": 2.733333333333333e-07,
"logits/chosen": -2.5432791709899902,
"logits/rejected": -2.5216073989868164,
"logps/chosen": -104.03746032714844,
"logps/rejected": -157.35208129882812,
"loss": 115568.125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02601642534136772,
"rewards/margins": 0.04680890962481499,
"rewards/rejected": -0.07282533496618271,
"step": 1270
},
{
"epoch": 0.512,
"grad_norm": 8078756.180573847,
"learning_rate": 2.7111111111111114e-07,
"logits/chosen": -2.4107511043548584,
"logits/rejected": -2.4441466331481934,
"logps/chosen": -119.5540771484375,
"logps/rejected": -161.68992614746094,
"loss": 125619.95,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0344010666012764,
"rewards/margins": 0.039026811718940735,
"rewards/rejected": -0.07342787086963654,
"step": 1280
},
{
"epoch": 0.516,
"grad_norm": 8353367.375300777,
"learning_rate": 2.688888888888889e-07,
"logits/chosen": -2.553907632827759,
"logits/rejected": -2.5770821571350098,
"logps/chosen": -119.15167236328125,
"logps/rejected": -144.7032012939453,
"loss": 121825.9875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03453459218144417,
"rewards/margins": 0.01783282496035099,
"rewards/rejected": -0.05236741900444031,
"step": 1290
},
{
"epoch": 0.52,
"grad_norm": 7928099.234468471,
"learning_rate": 2.6666666666666667e-07,
"logits/chosen": -2.4872889518737793,
"logits/rejected": -2.4608452320098877,
"logps/chosen": -97.89281463623047,
"logps/rejected": -121.05744934082031,
"loss": 121883.65,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.028054479509592056,
"rewards/margins": 0.019761094823479652,
"rewards/rejected": -0.04781556874513626,
"step": 1300
},
{
"epoch": 0.524,
"grad_norm": 7247015.687306507,
"learning_rate": 2.644444444444444e-07,
"logits/chosen": -2.516070604324341,
"logits/rejected": -2.552224636077881,
"logps/chosen": -106.81401062011719,
"logps/rejected": -151.16502380371094,
"loss": 115673.95,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.025754611939191818,
"rewards/margins": 0.03385675325989723,
"rewards/rejected": -0.05961136147379875,
"step": 1310
},
{
"epoch": 0.528,
"grad_norm": 7422418.597697799,
"learning_rate": 2.6222222222222226e-07,
"logits/chosen": -2.524921417236328,
"logits/rejected": -2.5199413299560547,
"logps/chosen": -91.73576354980469,
"logps/rejected": -131.1287384033203,
"loss": 122971.8125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.031464457511901855,
"rewards/margins": 0.029294759035110474,
"rewards/rejected": -0.06075920909643173,
"step": 1320
},
{
"epoch": 0.532,
"grad_norm": 6692184.86494848,
"learning_rate": 2.6e-07,
"logits/chosen": -2.4487650394439697,
"logits/rejected": -2.42417573928833,
"logps/chosen": -105.08377838134766,
"logps/rejected": -152.42251586914062,
"loss": 124211.9125,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03637874126434326,
"rewards/margins": 0.04163909703493118,
"rewards/rejected": -0.07801783829927444,
"step": 1330
},
{
"epoch": 0.536,
"grad_norm": 8776394.815220755,
"learning_rate": 2.5777777777777774e-07,
"logits/chosen": -2.426945209503174,
"logits/rejected": -2.4211270809173584,
"logps/chosen": -100.32334899902344,
"logps/rejected": -158.4632568359375,
"loss": 119074.8125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02933502197265625,
"rewards/margins": 0.04685003310441971,
"rewards/rejected": -0.07618506252765656,
"step": 1340
},
{
"epoch": 0.54,
"grad_norm": 7921315.937162929,
"learning_rate": 2.5555555555555553e-07,
"logits/chosen": -2.4435505867004395,
"logits/rejected": -2.4641623497009277,
"logps/chosen": -97.31932067871094,
"logps/rejected": -139.09976196289062,
"loss": 119875.825,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03033040091395378,
"rewards/margins": 0.03286002576351166,
"rewards/rejected": -0.06319043040275574,
"step": 1350
},
{
"epoch": 0.544,
"grad_norm": 7064909.318018072,
"learning_rate": 2.533333333333333e-07,
"logits/chosen": -2.4914050102233887,
"logits/rejected": -2.4965710639953613,
"logps/chosen": -117.77690124511719,
"logps/rejected": -150.23826599121094,
"loss": 119877.625,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.0384187288582325,
"rewards/margins": 0.02663787081837654,
"rewards/rejected": -0.06505659967660904,
"step": 1360
},
{
"epoch": 0.548,
"grad_norm": 7218217.506885473,
"learning_rate": 2.511111111111111e-07,
"logits/chosen": -2.4381630420684814,
"logits/rejected": -2.4487814903259277,
"logps/chosen": -102.14595794677734,
"logps/rejected": -154.1560516357422,
"loss": 112836.2875,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.029405618086457253,
"rewards/margins": 0.04147377982735634,
"rewards/rejected": -0.07087938487529755,
"step": 1370
},
{
"epoch": 0.552,
"grad_norm": 8751055.476830697,
"learning_rate": 2.4888888888888886e-07,
"logits/chosen": -2.3979814052581787,
"logits/rejected": -2.4224982261657715,
"logps/chosen": -112.16400146484375,
"logps/rejected": -158.16567993164062,
"loss": 123894.4,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.031505607068538666,
"rewards/margins": 0.03382585197687149,
"rewards/rejected": -0.06533145159482956,
"step": 1380
},
{
"epoch": 0.556,
"grad_norm": 10628471.178826654,
"learning_rate": 2.4666666666666665e-07,
"logits/chosen": -2.42820143699646,
"logits/rejected": -2.439427375793457,
"logps/chosen": -86.3087387084961,
"logps/rejected": -118.8100814819336,
"loss": 122684.5,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.028887376189231873,
"rewards/margins": 0.024274542927742004,
"rewards/rejected": -0.05316191911697388,
"step": 1390
},
{
"epoch": 0.56,
"grad_norm": 7340403.161227933,
"learning_rate": 2.4444444444444445e-07,
"logits/chosen": -2.3746609687805176,
"logits/rejected": -2.39011812210083,
"logps/chosen": -112.76751708984375,
"logps/rejected": -155.74038696289062,
"loss": 121104.9875,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.032277911901474,
"rewards/margins": 0.032715652137994766,
"rewards/rejected": -0.06499356776475906,
"step": 1400
},
{
"epoch": 0.564,
"grad_norm": 7097101.153173888,
"learning_rate": 2.4222222222222224e-07,
"logits/chosen": -2.440396547317505,
"logits/rejected": -2.4504265785217285,
"logps/chosen": -104.66170501708984,
"logps/rejected": -160.94100952148438,
"loss": 117720.675,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.019881997257471085,
"rewards/margins": 0.04168248176574707,
"rewards/rejected": -0.06156448274850845,
"step": 1410
},
{
"epoch": 0.568,
"grad_norm": 5809898.3420226695,
"learning_rate": 2.4e-07,
"logits/chosen": -2.445601463317871,
"logits/rejected": -2.4264917373657227,
"logps/chosen": -95.17555236816406,
"logps/rejected": -143.88890075683594,
"loss": 120431.9875,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.03070194646716118,
"rewards/margins": 0.04166869446635246,
"rewards/rejected": -0.07237063348293304,
"step": 1420
},
{
"epoch": 0.572,
"grad_norm": 4988644.975711128,
"learning_rate": 2.3777777777777777e-07,
"logits/chosen": -2.4375877380371094,
"logits/rejected": -2.441622257232666,
"logps/chosen": -90.29289245605469,
"logps/rejected": -128.98793029785156,
"loss": 119927.5,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.027714330703020096,
"rewards/margins": 0.029140587896108627,
"rewards/rejected": -0.056854914873838425,
"step": 1430
},
{
"epoch": 0.576,
"grad_norm": 7822455.89349568,
"learning_rate": 2.3555555555555554e-07,
"logits/chosen": -2.458700656890869,
"logits/rejected": -2.4986705780029297,
"logps/chosen": -117.4685287475586,
"logps/rejected": -151.8163299560547,
"loss": 123864.1,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03203599527478218,
"rewards/margins": 0.031274113804101944,
"rewards/rejected": -0.06331010907888412,
"step": 1440
},
{
"epoch": 0.58,
"grad_norm": 14175243.183000157,
"learning_rate": 2.3333333333333333e-07,
"logits/chosen": -2.4828193187713623,
"logits/rejected": -2.479646682739258,
"logps/chosen": -103.30030822753906,
"logps/rejected": -158.0785369873047,
"loss": 124849.6125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.032672982662916183,
"rewards/margins": 0.04446934536099434,
"rewards/rejected": -0.07714232802391052,
"step": 1450
},
{
"epoch": 0.584,
"grad_norm": 6091294.506455895,
"learning_rate": 2.311111111111111e-07,
"logits/chosen": -2.39980411529541,
"logits/rejected": -2.382380723953247,
"logps/chosen": -95.15815734863281,
"logps/rejected": -165.42840576171875,
"loss": 123090.975,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027900245040655136,
"rewards/margins": 0.05805187299847603,
"rewards/rejected": -0.08595212548971176,
"step": 1460
},
{
"epoch": 0.588,
"grad_norm": 6711707.485572769,
"learning_rate": 2.288888888888889e-07,
"logits/chosen": -2.4506986141204834,
"logits/rejected": -2.460022449493408,
"logps/chosen": -99.18559265136719,
"logps/rejected": -133.17782592773438,
"loss": 120549.275,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.02819480374455452,
"rewards/margins": 0.026910748332738876,
"rewards/rejected": -0.0551055483520031,
"step": 1470
},
{
"epoch": 0.592,
"grad_norm": 7790254.418823996,
"learning_rate": 2.2666666666666663e-07,
"logits/chosen": -2.512367010116577,
"logits/rejected": -2.5243959426879883,
"logps/chosen": -124.3129653930664,
"logps/rejected": -171.23019409179688,
"loss": 119269.2875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.03208141773939133,
"rewards/margins": 0.038219161331653595,
"rewards/rejected": -0.07030057162046432,
"step": 1480
},
{
"epoch": 0.596,
"grad_norm": 7436386.208544868,
"learning_rate": 2.2444444444444442e-07,
"logits/chosen": -2.45894193649292,
"logits/rejected": -2.438983678817749,
"logps/chosen": -81.72193908691406,
"logps/rejected": -120.81268310546875,
"loss": 119383.7875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.023425322026014328,
"rewards/margins": 0.031169379130005836,
"rewards/rejected": -0.054594703018665314,
"step": 1490
},
{
"epoch": 0.6,
"grad_norm": 6098566.562808592,
"learning_rate": 2.222222222222222e-07,
"logits/chosen": -2.4491584300994873,
"logits/rejected": -2.4278082847595215,
"logps/chosen": -109.7543716430664,
"logps/rejected": -137.11195373535156,
"loss": 122590.0875,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03018758073449135,
"rewards/margins": 0.019970091059803963,
"rewards/rejected": -0.05015767365694046,
"step": 1500
},
{
"epoch": 0.604,
"grad_norm": 5676696.337246529,
"learning_rate": 2.1999999999999998e-07,
"logits/chosen": -2.3899359703063965,
"logits/rejected": -2.3801169395446777,
"logps/chosen": -125.1462173461914,
"logps/rejected": -157.1474151611328,
"loss": 123626.575,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.035767387598752975,
"rewards/margins": 0.029546618461608887,
"rewards/rejected": -0.06531400978565216,
"step": 1510
},
{
"epoch": 0.608,
"grad_norm": 6408066.768940639,
"learning_rate": 2.1777777777777775e-07,
"logits/chosen": -2.511399984359741,
"logits/rejected": -2.556304693222046,
"logps/chosen": -102.42405700683594,
"logps/rejected": -140.045654296875,
"loss": 124030.1,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.026485705748200417,
"rewards/margins": 0.027697976678609848,
"rewards/rejected": -0.054183680564165115,
"step": 1520
},
{
"epoch": 0.612,
"grad_norm": 5703068.150369112,
"learning_rate": 2.1555555555555554e-07,
"logits/chosen": -2.328718900680542,
"logits/rejected": -2.3354427814483643,
"logps/chosen": -104.07130432128906,
"logps/rejected": -145.46546936035156,
"loss": 119267.475,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.035999588668346405,
"rewards/margins": 0.03417082130908966,
"rewards/rejected": -0.07017041742801666,
"step": 1530
},
{
"epoch": 0.616,
"grad_norm": 6771552.786870031,
"learning_rate": 2.1333333333333334e-07,
"logits/chosen": -2.3548483848571777,
"logits/rejected": -2.3827786445617676,
"logps/chosen": -101.12074279785156,
"logps/rejected": -159.588623046875,
"loss": 122909.8,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.030131155624985695,
"rewards/margins": 0.04275660961866379,
"rewards/rejected": -0.07288776338100433,
"step": 1540
},
{
"epoch": 0.62,
"grad_norm": 6949724.758162161,
"learning_rate": 2.111111111111111e-07,
"logits/chosen": -2.3255906105041504,
"logits/rejected": -2.3439719676971436,
"logps/chosen": -108.85466003417969,
"logps/rejected": -158.19845581054688,
"loss": 123216.3625,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.02804810181260109,
"rewards/margins": 0.046050988137722015,
"rewards/rejected": -0.0740990936756134,
"step": 1550
},
{
"epoch": 0.624,
"grad_norm": 9265006.397727864,
"learning_rate": 2.088888888888889e-07,
"logits/chosen": -2.3858678340911865,
"logits/rejected": -2.3714261054992676,
"logps/chosen": -107.79356384277344,
"logps/rejected": -142.68203735351562,
"loss": 128108.675,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0350666344165802,
"rewards/margins": 0.025052737444639206,
"rewards/rejected": -0.0601193793118,
"step": 1560
},
{
"epoch": 0.628,
"grad_norm": 8448160.384168701,
"learning_rate": 2.0666666666666666e-07,
"logits/chosen": -2.302703857421875,
"logits/rejected": -2.317382335662842,
"logps/chosen": -106.1356201171875,
"logps/rejected": -165.3655242919922,
"loss": 119009.5625,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02817341312766075,
"rewards/margins": 0.043338801711797714,
"rewards/rejected": -0.07151221483945847,
"step": 1570
},
{
"epoch": 0.632,
"grad_norm": 6797346.176978063,
"learning_rate": 2.0444444444444446e-07,
"logits/chosen": -2.3419301509857178,
"logits/rejected": -2.360405683517456,
"logps/chosen": -100.89833068847656,
"logps/rejected": -136.70143127441406,
"loss": 121410.0125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.026873702183365822,
"rewards/margins": 0.03099043294787407,
"rewards/rejected": -0.05786413699388504,
"step": 1580
},
{
"epoch": 0.636,
"grad_norm": 7957900.602760819,
"learning_rate": 2.022222222222222e-07,
"logits/chosen": -2.325685739517212,
"logits/rejected": -2.3536834716796875,
"logps/chosen": -86.800048828125,
"logps/rejected": -118.13228607177734,
"loss": 121560.0125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.027103954926133156,
"rewards/margins": 0.023416386917233467,
"rewards/rejected": -0.050520338118076324,
"step": 1590
},
{
"epoch": 0.64,
"grad_norm": 7524893.989731965,
"learning_rate": 2e-07,
"logits/chosen": -2.379504919052124,
"logits/rejected": -2.364494562149048,
"logps/chosen": -92.89854431152344,
"logps/rejected": -144.94566345214844,
"loss": 118599.6,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.027095776051282883,
"rewards/margins": 0.04017645865678787,
"rewards/rejected": -0.06727223098278046,
"step": 1600
},
{
"epoch": 0.644,
"grad_norm": 7808682.74577819,
"learning_rate": 1.9777777777777776e-07,
"logits/chosen": -2.462635040283203,
"logits/rejected": -2.3920907974243164,
"logps/chosen": -123.39002990722656,
"logps/rejected": -168.66348266601562,
"loss": 123616.1375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02962644398212433,
"rewards/margins": 0.026749838143587112,
"rewards/rejected": -0.05637627840042114,
"step": 1610
},
{
"epoch": 0.648,
"grad_norm": 7508118.164730088,
"learning_rate": 1.9555555555555555e-07,
"logits/chosen": -2.3979110717773438,
"logits/rejected": -2.394200563430786,
"logps/chosen": -113.19117736816406,
"logps/rejected": -152.71365356445312,
"loss": 122617.65,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.0326806977391243,
"rewards/margins": 0.03160088136792183,
"rewards/rejected": -0.06428157538175583,
"step": 1620
},
{
"epoch": 0.652,
"grad_norm": 9886937.518719045,
"learning_rate": 1.9333333333333332e-07,
"logits/chosen": -2.4126977920532227,
"logits/rejected": -2.4325013160705566,
"logps/chosen": -91.72865295410156,
"logps/rejected": -134.23829650878906,
"loss": 125535.1625,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.029735634103417397,
"rewards/margins": 0.03641930967569351,
"rewards/rejected": -0.06615494191646576,
"step": 1630
},
{
"epoch": 0.656,
"grad_norm": 5382013.583341201,
"learning_rate": 1.911111111111111e-07,
"logits/chosen": -2.3764185905456543,
"logits/rejected": -2.382178544998169,
"logps/chosen": -91.31561279296875,
"logps/rejected": -137.80345153808594,
"loss": 111719.4375,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.01971900835633278,
"rewards/margins": 0.04298390820622444,
"rewards/rejected": -0.06270290911197662,
"step": 1640
},
{
"epoch": 0.66,
"grad_norm": 6409699.176115095,
"learning_rate": 1.8888888888888888e-07,
"logits/chosen": -2.330606698989868,
"logits/rejected": -2.312016010284424,
"logps/chosen": -114.25804138183594,
"logps/rejected": -165.2264404296875,
"loss": 122533.2125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.026577278971672058,
"rewards/margins": 0.0414334274828434,
"rewards/rejected": -0.06801070272922516,
"step": 1650
},
{
"epoch": 0.664,
"grad_norm": 6977532.54737261,
"learning_rate": 1.8666666666666667e-07,
"logits/chosen": -2.320591449737549,
"logits/rejected": -2.3048043251037598,
"logps/chosen": -117.83941650390625,
"logps/rejected": -176.08311462402344,
"loss": 118259.3375,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.030085504055023193,
"rewards/margins": 0.03497748449444771,
"rewards/rejected": -0.0650629848241806,
"step": 1660
},
{
"epoch": 0.668,
"grad_norm": 7151487.887768503,
"learning_rate": 1.8444444444444444e-07,
"logits/chosen": -2.325629949569702,
"logits/rejected": -2.37601637840271,
"logps/chosen": -95.17213439941406,
"logps/rejected": -149.96546936035156,
"loss": 117774.0875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.028405601158738136,
"rewards/margins": 0.04639150947332382,
"rewards/rejected": -0.07479710876941681,
"step": 1670
},
{
"epoch": 0.672,
"grad_norm": 5729479.50083699,
"learning_rate": 1.8222222222222223e-07,
"logits/chosen": -2.4593453407287598,
"logits/rejected": -2.5003867149353027,
"logps/chosen": -96.59799194335938,
"logps/rejected": -145.85215759277344,
"loss": 123032.9,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03039277158677578,
"rewards/margins": 0.034516870975494385,
"rewards/rejected": -0.06490965187549591,
"step": 1680
},
{
"epoch": 0.676,
"grad_norm": 6685189.148553702,
"learning_rate": 1.8e-07,
"logits/chosen": -2.420285224914551,
"logits/rejected": -2.405721426010132,
"logps/chosen": -101.75447082519531,
"logps/rejected": -151.68434143066406,
"loss": 121099.5375,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.033398691564798355,
"rewards/margins": 0.04428454115986824,
"rewards/rejected": -0.0776832327246666,
"step": 1690
},
{
"epoch": 0.68,
"grad_norm": 6727764.202542203,
"learning_rate": 1.7777777777777776e-07,
"logits/chosen": -2.5021843910217285,
"logits/rejected": -2.475396156311035,
"logps/chosen": -103.27125549316406,
"logps/rejected": -146.76480102539062,
"loss": 124458.15,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.027325114235281944,
"rewards/margins": 0.04307966306805611,
"rewards/rejected": -0.0704047828912735,
"step": 1700
},
{
"epoch": 0.684,
"grad_norm": 8951493.717829395,
"learning_rate": 1.7555555555555553e-07,
"logits/chosen": -2.359651565551758,
"logits/rejected": -2.3753628730773926,
"logps/chosen": -92.47693634033203,
"logps/rejected": -143.73085021972656,
"loss": 122129.25,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.021263476461172104,
"rewards/margins": 0.047465912997722626,
"rewards/rejected": -0.06872939318418503,
"step": 1710
},
{
"epoch": 0.688,
"grad_norm": 6117231.523934995,
"learning_rate": 1.7333333333333332e-07,
"logits/chosen": -2.3933348655700684,
"logits/rejected": -2.424694538116455,
"logps/chosen": -101.28050231933594,
"logps/rejected": -161.67837524414062,
"loss": 114482.8625,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.025342673063278198,
"rewards/margins": 0.051556408405303955,
"rewards/rejected": -0.07689908146858215,
"step": 1720
},
{
"epoch": 0.692,
"grad_norm": 8463223.381087825,
"learning_rate": 1.711111111111111e-07,
"logits/chosen": -2.4435369968414307,
"logits/rejected": -2.421454668045044,
"logps/chosen": -94.87931823730469,
"logps/rejected": -127.29423522949219,
"loss": 117968.2625,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.030688803642988205,
"rewards/margins": 0.025830263271927834,
"rewards/rejected": -0.05651906877756119,
"step": 1730
},
{
"epoch": 0.696,
"grad_norm": 8481439.735231701,
"learning_rate": 1.6888888888888888e-07,
"logits/chosen": -2.422682285308838,
"logits/rejected": -2.4244956970214844,
"logps/chosen": -103.77542877197266,
"logps/rejected": -137.70925903320312,
"loss": 120156.825,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.028543483465909958,
"rewards/margins": 0.021968597546219826,
"rewards/rejected": -0.05051208287477493,
"step": 1740
},
{
"epoch": 0.7,
"grad_norm": 9216619.77120461,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -2.505079507827759,
"logits/rejected": -2.477112054824829,
"logps/chosen": -105.45587158203125,
"logps/rejected": -140.0381317138672,
"loss": 113155.975,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02943289838731289,
"rewards/margins": 0.03403625637292862,
"rewards/rejected": -0.06346915662288666,
"step": 1750
},
{
"epoch": 0.704,
"grad_norm": 7248683.781183183,
"learning_rate": 1.6444444444444444e-07,
"logits/chosen": -2.480299949645996,
"logits/rejected": -2.470823287963867,
"logps/chosen": -122.30104064941406,
"logps/rejected": -164.48951721191406,
"loss": 117051.6,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.029943257570266724,
"rewards/margins": 0.036385588347911835,
"rewards/rejected": -0.06632884591817856,
"step": 1760
},
{
"epoch": 0.708,
"grad_norm": 8402038.296433628,
"learning_rate": 1.622222222222222e-07,
"logits/chosen": -2.472791910171509,
"logits/rejected": -2.4726855754852295,
"logps/chosen": -114.65007019042969,
"logps/rejected": -153.65573120117188,
"loss": 120727.275,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03354992717504501,
"rewards/margins": 0.03721586614847183,
"rewards/rejected": -0.07076579332351685,
"step": 1770
},
{
"epoch": 0.712,
"grad_norm": 9719574.93632757,
"learning_rate": 1.6e-07,
"logits/chosen": -2.4326717853546143,
"logits/rejected": -2.421159267425537,
"logps/chosen": -115.16218566894531,
"logps/rejected": -164.61166381835938,
"loss": 122159.4125,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.031878646463155746,
"rewards/margins": 0.04045126959681511,
"rewards/rejected": -0.07232991605997086,
"step": 1780
},
{
"epoch": 0.716,
"grad_norm": 7274636.012655509,
"learning_rate": 1.5777777777777777e-07,
"logits/chosen": -2.3492178916931152,
"logits/rejected": -2.3669705390930176,
"logps/chosen": -110.37937927246094,
"logps/rejected": -148.16090393066406,
"loss": 122808.1875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.033610500395298004,
"rewards/margins": 0.031820036470890045,
"rewards/rejected": -0.06543054431676865,
"step": 1790
},
{
"epoch": 0.72,
"grad_norm": 7960487.891716203,
"learning_rate": 1.5555555555555556e-07,
"logits/chosen": -2.5343594551086426,
"logits/rejected": -2.509129762649536,
"logps/chosen": -112.82719421386719,
"logps/rejected": -139.89376831054688,
"loss": 124301.7875,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.031808070838451385,
"rewards/margins": 0.015915410593152046,
"rewards/rejected": -0.04772348329424858,
"step": 1800
},
{
"epoch": 0.724,
"grad_norm": 7184987.914163716,
"learning_rate": 1.533333333333333e-07,
"logits/chosen": -2.5088343620300293,
"logits/rejected": -2.5166666507720947,
"logps/chosen": -104.326904296875,
"logps/rejected": -150.2886505126953,
"loss": 128243.85,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.034587159752845764,
"rewards/margins": 0.03759396821260452,
"rewards/rejected": -0.07218112796545029,
"step": 1810
},
{
"epoch": 0.728,
"grad_norm": 10039229.460908188,
"learning_rate": 1.511111111111111e-07,
"logits/chosen": -2.45367693901062,
"logits/rejected": -2.4529166221618652,
"logps/chosen": -107.16294860839844,
"logps/rejected": -140.8982696533203,
"loss": 124612.4625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.03276178613305092,
"rewards/margins": 0.03512474521994591,
"rewards/rejected": -0.06788653880357742,
"step": 1820
},
{
"epoch": 0.732,
"grad_norm": 7124197.534522493,
"learning_rate": 1.4888888888888886e-07,
"logits/chosen": -2.4414725303649902,
"logits/rejected": -2.422440767288208,
"logps/chosen": -84.65110778808594,
"logps/rejected": -130.88641357421875,
"loss": 125120.0625,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.018780285492539406,
"rewards/margins": 0.03842353820800781,
"rewards/rejected": -0.05720382183790207,
"step": 1830
},
{
"epoch": 0.736,
"grad_norm": 9253360.097651359,
"learning_rate": 1.4666666666666666e-07,
"logits/chosen": -2.3342652320861816,
"logits/rejected": -2.3714077472686768,
"logps/chosen": -101.37190246582031,
"logps/rejected": -128.89321899414062,
"loss": 120892.2875,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.02679322101175785,
"rewards/margins": 0.027122925966978073,
"rewards/rejected": -0.053916145116090775,
"step": 1840
},
{
"epoch": 0.74,
"grad_norm": 7423694.38438217,
"learning_rate": 1.4444444444444442e-07,
"logits/chosen": -2.4630966186523438,
"logits/rejected": -2.413529634475708,
"logps/chosen": -103.9755630493164,
"logps/rejected": -133.18716430664062,
"loss": 123775.5125,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.03415101021528244,
"rewards/margins": 0.02861020900309086,
"rewards/rejected": -0.06276122480630875,
"step": 1850
},
{
"epoch": 0.744,
"grad_norm": 5647373.604720096,
"learning_rate": 1.4222222222222222e-07,
"logits/chosen": -2.464050769805908,
"logits/rejected": -2.501260757446289,
"logps/chosen": -125.53248596191406,
"logps/rejected": -175.7611083984375,
"loss": 117425.8375,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.02989697828888893,
"rewards/margins": 0.040645621716976166,
"rewards/rejected": -0.0705425962805748,
"step": 1860
},
{
"epoch": 0.748,
"grad_norm": 8230581.203213356,
"learning_rate": 1.4e-07,
"logits/chosen": -2.5151472091674805,
"logits/rejected": -2.509429454803467,
"logps/chosen": -92.0722885131836,
"logps/rejected": -167.84060668945312,
"loss": 119465.4875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02767838165163994,
"rewards/margins": 0.05135294049978256,
"rewards/rejected": -0.0790313258767128,
"step": 1870
},
{
"epoch": 0.752,
"grad_norm": 12754476.984279104,
"learning_rate": 1.3777777777777778e-07,
"logits/chosen": -2.5021023750305176,
"logits/rejected": -2.484841823577881,
"logps/chosen": -94.76568603515625,
"logps/rejected": -139.48953247070312,
"loss": 118982.3875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02872481569647789,
"rewards/margins": 0.03676723688840866,
"rewards/rejected": -0.06549205631017685,
"step": 1880
},
{
"epoch": 0.756,
"grad_norm": 6983911.433265186,
"learning_rate": 1.3555555555555557e-07,
"logits/chosen": -2.5138821601867676,
"logits/rejected": -2.5218331813812256,
"logps/chosen": -89.62753295898438,
"logps/rejected": -128.04603576660156,
"loss": 120175.3625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.025366436690092087,
"rewards/margins": 0.028226271271705627,
"rewards/rejected": -0.053592704236507416,
"step": 1890
},
{
"epoch": 0.76,
"grad_norm": 7315179.12228925,
"learning_rate": 1.3333333333333334e-07,
"logits/chosen": -2.3722000122070312,
"logits/rejected": -2.361008644104004,
"logps/chosen": -106.6434326171875,
"logps/rejected": -154.347412109375,
"loss": 118594.25,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.028675338253378868,
"rewards/margins": 0.03925652056932449,
"rewards/rejected": -0.06793185323476791,
"step": 1900
},
{
"epoch": 0.764,
"grad_norm": 6191576.733772953,
"learning_rate": 1.3111111111111113e-07,
"logits/chosen": -2.361506938934326,
"logits/rejected": -2.416877269744873,
"logps/chosen": -121.11296081542969,
"logps/rejected": -182.23646545410156,
"loss": 120340.05,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.027058254927396774,
"rewards/margins": 0.04295941814780235,
"rewards/rejected": -0.07001767307519913,
"step": 1910
},
{
"epoch": 0.768,
"grad_norm": 7960326.332116376,
"learning_rate": 1.2888888888888887e-07,
"logits/chosen": -2.3773555755615234,
"logits/rejected": -2.393650531768799,
"logps/chosen": -86.46308135986328,
"logps/rejected": -132.29776000976562,
"loss": 122114.95,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.02390897274017334,
"rewards/margins": 0.037305351346731186,
"rewards/rejected": -0.061214327812194824,
"step": 1920
},
{
"epoch": 0.772,
"grad_norm": 8545237.759950696,
"learning_rate": 1.2666666666666666e-07,
"logits/chosen": -2.3730955123901367,
"logits/rejected": -2.346529960632324,
"logps/chosen": -98.66458129882812,
"logps/rejected": -133.67532348632812,
"loss": 121957.9,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.030175382271409035,
"rewards/margins": 0.031160462647676468,
"rewards/rejected": -0.06133584305644035,
"step": 1930
},
{
"epoch": 0.776,
"grad_norm": 7541759.4377164915,
"learning_rate": 1.2444444444444443e-07,
"logits/chosen": -2.4905495643615723,
"logits/rejected": -2.467737913131714,
"logps/chosen": -97.59669494628906,
"logps/rejected": -144.62152099609375,
"loss": 120647.3875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.029411058872938156,
"rewards/margins": 0.03424420207738876,
"rewards/rejected": -0.06365526467561722,
"step": 1940
},
{
"epoch": 0.78,
"grad_norm": 9641484.377151929,
"learning_rate": 1.2222222222222222e-07,
"logits/chosen": -2.49107027053833,
"logits/rejected": -2.490995407104492,
"logps/chosen": -101.06126403808594,
"logps/rejected": -136.541748046875,
"loss": 121525.775,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.026407599449157715,
"rewards/margins": 0.03157456964254379,
"rewards/rejected": -0.05798216909170151,
"step": 1950
},
{
"epoch": 0.784,
"grad_norm": 5545873.184552746,
"learning_rate": 1.2e-07,
"logits/chosen": -2.5349438190460205,
"logits/rejected": -2.5639233589172363,
"logps/chosen": -100.9637222290039,
"logps/rejected": -172.61062622070312,
"loss": 119077.0375,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.019367219880223274,
"rewards/margins": 0.057732999324798584,
"rewards/rejected": -0.0771002247929573,
"step": 1960
},
{
"epoch": 0.788,
"grad_norm": 9492873.423361877,
"learning_rate": 1.1777777777777777e-07,
"logits/chosen": -2.358701229095459,
"logits/rejected": -2.379284381866455,
"logps/chosen": -92.12281036376953,
"logps/rejected": -166.2657012939453,
"loss": 114722.3,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.02654215320944786,
"rewards/margins": 0.06289757788181305,
"rewards/rejected": -0.08943972736597061,
"step": 1970
},
{
"epoch": 0.792,
"grad_norm": 8128359.015235812,
"learning_rate": 1.1555555555555555e-07,
"logits/chosen": -2.445798397064209,
"logits/rejected": -2.463740110397339,
"logps/chosen": -111.72492980957031,
"logps/rejected": -154.72195434570312,
"loss": 123011.1125,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.029620587825775146,
"rewards/margins": 0.03493895009160042,
"rewards/rejected": -0.06455953419208527,
"step": 1980
},
{
"epoch": 0.796,
"grad_norm": 9306300.401598651,
"learning_rate": 1.1333333333333332e-07,
"logits/chosen": -2.417834520339966,
"logits/rejected": -2.4220337867736816,
"logps/chosen": -91.2249526977539,
"logps/rejected": -162.97373962402344,
"loss": 118218.9,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.023723283782601357,
"rewards/margins": 0.057645224034786224,
"rewards/rejected": -0.08136852085590363,
"step": 1990
},
{
"epoch": 0.8,
"grad_norm": 8765512.962268472,
"learning_rate": 1.111111111111111e-07,
"logits/chosen": -2.40264630317688,
"logits/rejected": -2.427499294281006,
"logps/chosen": -96.80880737304688,
"logps/rejected": -145.74375915527344,
"loss": 120947.8375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02879420481622219,
"rewards/margins": 0.04018041118979454,
"rewards/rejected": -0.06897461414337158,
"step": 2000
},
{
"epoch": 0.804,
"grad_norm": 7358207.507871911,
"learning_rate": 1.0888888888888888e-07,
"logits/chosen": -2.491158962249756,
"logits/rejected": -2.4770848751068115,
"logps/chosen": -109.10551452636719,
"logps/rejected": -167.1204071044922,
"loss": 116044.225,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02867368422448635,
"rewards/margins": 0.04405529797077179,
"rewards/rejected": -0.07272897660732269,
"step": 2010
},
{
"epoch": 0.808,
"grad_norm": 6847868.235200222,
"learning_rate": 1.0666666666666667e-07,
"logits/chosen": -2.4977283477783203,
"logits/rejected": -2.4795002937316895,
"logps/chosen": -92.58442687988281,
"logps/rejected": -137.3270721435547,
"loss": 120623.725,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.02764459326863289,
"rewards/margins": 0.029529035091400146,
"rewards/rejected": -0.057173628360033035,
"step": 2020
},
{
"epoch": 0.812,
"grad_norm": 7345224.131775115,
"learning_rate": 1.0444444444444445e-07,
"logits/chosen": -2.581921100616455,
"logits/rejected": -2.5818488597869873,
"logps/chosen": -94.2405014038086,
"logps/rejected": -151.82931518554688,
"loss": 118029.625,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.026779672130942345,
"rewards/margins": 0.04871240258216858,
"rewards/rejected": -0.07549206912517548,
"step": 2030
},
{
"epoch": 0.816,
"grad_norm": 8179129.121592561,
"learning_rate": 1.0222222222222223e-07,
"logits/chosen": -2.510425090789795,
"logits/rejected": -2.5082054138183594,
"logps/chosen": -98.96800231933594,
"logps/rejected": -132.51150512695312,
"loss": 119617.05,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02764180861413479,
"rewards/margins": 0.024158382788300514,
"rewards/rejected": -0.0518001914024353,
"step": 2040
},
{
"epoch": 0.82,
"grad_norm": 8464195.380143736,
"learning_rate": 1e-07,
"logits/chosen": -2.4244985580444336,
"logits/rejected": -2.43390154838562,
"logps/chosen": -106.4210205078125,
"logps/rejected": -164.44268798828125,
"loss": 120873.525,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.029137443751096725,
"rewards/margins": 0.049099259078502655,
"rewards/rejected": -0.07823669910430908,
"step": 2050
},
{
"epoch": 0.824,
"grad_norm": 7635743.090822039,
"learning_rate": 9.777777777777778e-08,
"logits/chosen": -2.45817494392395,
"logits/rejected": -2.472324848175049,
"logps/chosen": -102.89268493652344,
"logps/rejected": -155.74337768554688,
"loss": 120353.5125,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.0331621877849102,
"rewards/margins": 0.045612066984176636,
"rewards/rejected": -0.07877425849437714,
"step": 2060
},
{
"epoch": 0.828,
"grad_norm": 6226582.326517582,
"learning_rate": 9.555555555555556e-08,
"logits/chosen": -2.4659340381622314,
"logits/rejected": -2.475663661956787,
"logps/chosen": -94.48152923583984,
"logps/rejected": -128.58985900878906,
"loss": 123484.35,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.02351956069469452,
"rewards/margins": 0.03220217674970627,
"rewards/rejected": -0.05572172999382019,
"step": 2070
},
{
"epoch": 0.832,
"grad_norm": 9949795.202652398,
"learning_rate": 9.333333333333334e-08,
"logits/chosen": -2.4222323894500732,
"logits/rejected": -2.42124342918396,
"logps/chosen": -103.56013488769531,
"logps/rejected": -154.19448852539062,
"loss": 122220.075,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.032791610807180405,
"rewards/margins": 0.03772992268204689,
"rewards/rejected": -0.0705215334892273,
"step": 2080
},
{
"epoch": 0.836,
"grad_norm": 7095174.456895947,
"learning_rate": 9.111111111111112e-08,
"logits/chosen": -2.5458078384399414,
"logits/rejected": -2.5440893173217773,
"logps/chosen": -104.78141784667969,
"logps/rejected": -135.82858276367188,
"loss": 121374.0375,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.026525821536779404,
"rewards/margins": 0.02192816510796547,
"rewards/rejected": -0.04845398664474487,
"step": 2090
},
{
"epoch": 0.84,
"grad_norm": 10736924.474295698,
"learning_rate": 8.888888888888888e-08,
"logits/chosen": -2.4108989238739014,
"logits/rejected": -2.442476749420166,
"logps/chosen": -109.81797790527344,
"logps/rejected": -154.10580444335938,
"loss": 118612.975,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.030571172013878822,
"rewards/margins": 0.04257971793413162,
"rewards/rejected": -0.0731508880853653,
"step": 2100
},
{
"epoch": 0.844,
"grad_norm": 7793006.873257276,
"learning_rate": 8.666666666666666e-08,
"logits/chosen": -2.408449411392212,
"logits/rejected": -2.401078939437866,
"logps/chosen": -98.3482894897461,
"logps/rejected": -144.44593811035156,
"loss": 119906.3125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.025788147002458572,
"rewards/margins": 0.0393107533454895,
"rewards/rejected": -0.06509890407323837,
"step": 2110
},
{
"epoch": 0.848,
"grad_norm": 8262242.910041632,
"learning_rate": 8.444444444444444e-08,
"logits/chosen": -2.328664779663086,
"logits/rejected": -2.3485660552978516,
"logps/chosen": -115.07906341552734,
"logps/rejected": -163.2090301513672,
"loss": 118916.3625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.028421631082892418,
"rewards/margins": 0.04112589359283447,
"rewards/rejected": -0.06954751908779144,
"step": 2120
},
{
"epoch": 0.852,
"grad_norm": 7788645.367897111,
"learning_rate": 8.222222222222222e-08,
"logits/chosen": -2.381272554397583,
"logits/rejected": -2.3988587856292725,
"logps/chosen": -98.18142700195312,
"logps/rejected": -158.4501190185547,
"loss": 114372.2,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.027454352006316185,
"rewards/margins": 0.04615364223718643,
"rewards/rejected": -0.07360798865556717,
"step": 2130
},
{
"epoch": 0.856,
"grad_norm": 7241736.462505716,
"learning_rate": 8e-08,
"logits/chosen": -2.384153127670288,
"logits/rejected": -2.4060769081115723,
"logps/chosen": -112.68023681640625,
"logps/rejected": -133.3109588623047,
"loss": 122776.875,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.035701293498277664,
"rewards/margins": 0.014023616909980774,
"rewards/rejected": -0.04972491040825844,
"step": 2140
},
{
"epoch": 0.86,
"grad_norm": 7653248.620162212,
"learning_rate": 7.777777777777778e-08,
"logits/chosen": -2.382563591003418,
"logits/rejected": -2.4016215801239014,
"logps/chosen": -91.17201232910156,
"logps/rejected": -136.58473205566406,
"loss": 117265.6125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.025211047381162643,
"rewards/margins": 0.03577146679162979,
"rewards/rejected": -0.060982514172792435,
"step": 2150
},
{
"epoch": 0.864,
"grad_norm": 7731438.19804926,
"learning_rate": 7.555555555555555e-08,
"logits/chosen": -2.3781442642211914,
"logits/rejected": -2.3403193950653076,
"logps/chosen": -121.6617431640625,
"logps/rejected": -155.835205078125,
"loss": 120368.2625,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.027185499668121338,
"rewards/margins": 0.031700123101472855,
"rewards/rejected": -0.05888562276959419,
"step": 2160
},
{
"epoch": 0.868,
"grad_norm": 8267606.777008629,
"learning_rate": 7.333333333333333e-08,
"logits/chosen": -2.3575565814971924,
"logits/rejected": -2.386939287185669,
"logps/chosen": -101.64804077148438,
"logps/rejected": -148.3137664794922,
"loss": 115195.7375,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.023973077535629272,
"rewards/margins": 0.044489845633506775,
"rewards/rejected": -0.06846292316913605,
"step": 2170
},
{
"epoch": 0.872,
"grad_norm": 9005598.684897516,
"learning_rate": 7.111111111111111e-08,
"logits/chosen": -2.3442575931549072,
"logits/rejected": -2.366135597229004,
"logps/chosen": -103.1920166015625,
"logps/rejected": -157.7998504638672,
"loss": 118398.575,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.03024686500430107,
"rewards/margins": 0.04585784301161766,
"rewards/rejected": -0.07610471546649933,
"step": 2180
},
{
"epoch": 0.876,
"grad_norm": 9533026.874445997,
"learning_rate": 6.888888888888889e-08,
"logits/chosen": -2.380309581756592,
"logits/rejected": -2.356041431427002,
"logps/chosen": -110.62892150878906,
"logps/rejected": -141.9584197998047,
"loss": 121318.95,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03162779659032822,
"rewards/margins": 0.02817652001976967,
"rewards/rejected": -0.059804320335388184,
"step": 2190
},
{
"epoch": 0.88,
"grad_norm": 10673576.986433594,
"learning_rate": 6.666666666666667e-08,
"logits/chosen": -2.3506739139556885,
"logits/rejected": -2.372131586074829,
"logps/chosen": -100.54646301269531,
"logps/rejected": -135.2465362548828,
"loss": 126468.65,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03161459043622017,
"rewards/margins": 0.02985607460141182,
"rewards/rejected": -0.06147066876292229,
"step": 2200
},
{
"epoch": 0.884,
"grad_norm": 8461999.794241536,
"learning_rate": 6.444444444444443e-08,
"logits/chosen": -2.362769842147827,
"logits/rejected": -2.406325578689575,
"logps/chosen": -97.97603607177734,
"logps/rejected": -142.4342041015625,
"loss": 127463.35,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03243636339902878,
"rewards/margins": 0.029870545491576195,
"rewards/rejected": -0.062306903302669525,
"step": 2210
},
{
"epoch": 0.888,
"grad_norm": 7368698.764464808,
"learning_rate": 6.222222222222221e-08,
"logits/chosen": -2.3991339206695557,
"logits/rejected": -2.4037344455718994,
"logps/chosen": -93.59549713134766,
"logps/rejected": -162.4844970703125,
"loss": 120745.575,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.027704676613211632,
"rewards/margins": 0.05211573839187622,
"rewards/rejected": -0.0798204094171524,
"step": 2220
},
{
"epoch": 0.892,
"grad_norm": 8665468.38774931,
"learning_rate": 6e-08,
"logits/chosen": -2.4542791843414307,
"logits/rejected": -2.4168543815612793,
"logps/chosen": -93.68408966064453,
"logps/rejected": -148.40516662597656,
"loss": 118598.825,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.03159097954630852,
"rewards/margins": 0.04323247820138931,
"rewards/rejected": -0.07482346147298813,
"step": 2230
},
{
"epoch": 0.896,
"grad_norm": 7933695.302057784,
"learning_rate": 5.7777777777777775e-08,
"logits/chosen": -2.3784477710723877,
"logits/rejected": -2.3589086532592773,
"logps/chosen": -92.63359832763672,
"logps/rejected": -134.8274383544922,
"loss": 120496.3875,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.028888177126646042,
"rewards/margins": 0.03402668610215187,
"rewards/rejected": -0.06291486322879791,
"step": 2240
},
{
"epoch": 0.9,
"grad_norm": 9969155.359289682,
"learning_rate": 5.555555555555555e-08,
"logits/chosen": -2.2903988361358643,
"logits/rejected": -2.3016340732574463,
"logps/chosen": -85.76798248291016,
"logps/rejected": -127.5031967163086,
"loss": 120240.075,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02621074579656124,
"rewards/margins": 0.03313954919576645,
"rewards/rejected": -0.05935030058026314,
"step": 2250
},
{
"epoch": 0.904,
"grad_norm": 9157992.023374882,
"learning_rate": 5.3333333333333334e-08,
"logits/chosen": -2.336127996444702,
"logits/rejected": -2.2945499420166016,
"logps/chosen": -91.5769271850586,
"logps/rejected": -135.12843322753906,
"loss": 120103.1125,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.027380824089050293,
"rewards/margins": 0.03264584392309189,
"rewards/rejected": -0.06002666801214218,
"step": 2260
},
{
"epoch": 0.908,
"grad_norm": 8951108.084423833,
"learning_rate": 5.1111111111111114e-08,
"logits/chosen": -2.347435712814331,
"logits/rejected": -2.3315415382385254,
"logps/chosen": -113.215576171875,
"logps/rejected": -128.22955322265625,
"loss": 124138.05,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.032336726784706116,
"rewards/margins": 0.017136305570602417,
"rewards/rejected": -0.04947303608059883,
"step": 2270
},
{
"epoch": 0.912,
"grad_norm": 8242314.92846825,
"learning_rate": 4.888888888888889e-08,
"logits/chosen": -2.4864022731781006,
"logits/rejected": -2.4819133281707764,
"logps/chosen": -105.97566223144531,
"logps/rejected": -151.5330047607422,
"loss": 120462.5125,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.026608863845467567,
"rewards/margins": 0.0363970547914505,
"rewards/rejected": -0.06300591677427292,
"step": 2280
},
{
"epoch": 0.916,
"grad_norm": 9094101.689046768,
"learning_rate": 4.666666666666667e-08,
"logits/chosen": -2.4374148845672607,
"logits/rejected": -2.4441134929656982,
"logps/chosen": -103.56266784667969,
"logps/rejected": -172.01528930664062,
"loss": 121326.7125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.02742699161171913,
"rewards/margins": 0.06065355986356735,
"rewards/rejected": -0.08808055520057678,
"step": 2290
},
{
"epoch": 0.92,
"grad_norm": 10641032.764093434,
"learning_rate": 4.444444444444444e-08,
"logits/chosen": -2.4069132804870605,
"logits/rejected": -2.4145545959472656,
"logps/chosen": -82.15616607666016,
"logps/rejected": -143.5443115234375,
"loss": 115663.2125,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.02152777649462223,
"rewards/margins": 0.0520428791642189,
"rewards/rejected": -0.07357065379619598,
"step": 2300
},
{
"epoch": 0.924,
"grad_norm": 12548436.959082082,
"learning_rate": 4.222222222222222e-08,
"logits/chosen": -2.448031187057495,
"logits/rejected": -2.4767231941223145,
"logps/chosen": -115.41275787353516,
"logps/rejected": -162.3332977294922,
"loss": 123697.6375,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.024739082902669907,
"rewards/margins": 0.045056119561195374,
"rewards/rejected": -0.06979519873857498,
"step": 2310
},
{
"epoch": 0.928,
"grad_norm": 8095941.315550662,
"learning_rate": 4e-08,
"logits/chosen": -2.3472633361816406,
"logits/rejected": -2.34629225730896,
"logps/chosen": -96.70509338378906,
"logps/rejected": -156.36058044433594,
"loss": 123115.8875,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02763884700834751,
"rewards/margins": 0.04638766124844551,
"rewards/rejected": -0.07402651011943817,
"step": 2320
},
{
"epoch": 0.932,
"grad_norm": 9261661.294557055,
"learning_rate": 3.7777777777777774e-08,
"logits/chosen": -2.379624128341675,
"logits/rejected": -2.39247727394104,
"logps/chosen": -84.12887573242188,
"logps/rejected": -139.15478515625,
"loss": 116878.95,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.0225827656686306,
"rewards/margins": 0.04350755736231804,
"rewards/rejected": -0.06609033048152924,
"step": 2330
},
{
"epoch": 0.936,
"grad_norm": 11782492.59177351,
"learning_rate": 3.5555555555555554e-08,
"logits/chosen": -2.453583240509033,
"logits/rejected": -2.4496898651123047,
"logps/chosen": -101.22218322753906,
"logps/rejected": -146.49490356445312,
"loss": 116143.6875,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0275820791721344,
"rewards/margins": 0.036730751395225525,
"rewards/rejected": -0.06431283056735992,
"step": 2340
},
{
"epoch": 0.94,
"grad_norm": 7788195.554054044,
"learning_rate": 3.3333333333333334e-08,
"logits/chosen": -2.3583855628967285,
"logits/rejected": -2.325178623199463,
"logps/chosen": -86.75674438476562,
"logps/rejected": -152.60946655273438,
"loss": 122277.5875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.02565554343163967,
"rewards/margins": 0.05285739153623581,
"rewards/rejected": -0.07851293683052063,
"step": 2350
},
{
"epoch": 0.944,
"grad_norm": 9203769.822900785,
"learning_rate": 3.111111111111111e-08,
"logits/chosen": -2.3866257667541504,
"logits/rejected": -2.3973793983459473,
"logps/chosen": -103.6191177368164,
"logps/rejected": -177.63768005371094,
"loss": 119365.9125,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.03137093037366867,
"rewards/margins": 0.06034323573112488,
"rewards/rejected": -0.09171417355537415,
"step": 2360
},
{
"epoch": 0.948,
"grad_norm": 9423001.082938906,
"learning_rate": 2.8888888888888887e-08,
"logits/chosen": -2.467428684234619,
"logits/rejected": -2.4925427436828613,
"logps/chosen": -104.86148834228516,
"logps/rejected": -129.6764373779297,
"loss": 123624.5625,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.030190538614988327,
"rewards/margins": 0.025643909350037575,
"rewards/rejected": -0.05583444982767105,
"step": 2370
},
{
"epoch": 0.952,
"grad_norm": 8004704.995606078,
"learning_rate": 2.6666666666666667e-08,
"logits/chosen": -2.478123188018799,
"logits/rejected": -2.4850118160247803,
"logps/chosen": -78.22926330566406,
"logps/rejected": -149.72119140625,
"loss": 111276.7625,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.020721960812807083,
"rewards/margins": 0.06527476012706757,
"rewards/rejected": -0.08599671721458435,
"step": 2380
},
{
"epoch": 0.956,
"grad_norm": 9750139.945790045,
"learning_rate": 2.4444444444444444e-08,
"logits/chosen": -2.4585745334625244,
"logits/rejected": -2.4466397762298584,
"logps/chosen": -103.5519790649414,
"logps/rejected": -164.13674926757812,
"loss": 121310.4375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0302131325006485,
"rewards/margins": 0.04971124976873398,
"rewards/rejected": -0.07992438226938248,
"step": 2390
},
{
"epoch": 0.96,
"grad_norm": 7278490.232350556,
"learning_rate": 2.222222222222222e-08,
"logits/chosen": -2.43925404548645,
"logits/rejected": -2.438615322113037,
"logps/chosen": -105.12055969238281,
"logps/rejected": -164.0720977783203,
"loss": 118679.0375,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.02877245843410492,
"rewards/margins": 0.052051056176424026,
"rewards/rejected": -0.08082351088523865,
"step": 2400
},
{
"epoch": 0.964,
"grad_norm": 10483716.8937703,
"learning_rate": 2e-08,
"logits/chosen": -2.4821510314941406,
"logits/rejected": -2.46364426612854,
"logps/chosen": -101.2098159790039,
"logps/rejected": -143.97183227539062,
"loss": 117960.6,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03032396174967289,
"rewards/margins": 0.03740059584379196,
"rewards/rejected": -0.0677245557308197,
"step": 2410
},
{
"epoch": 0.968,
"grad_norm": 12173721.82019396,
"learning_rate": 1.7777777777777777e-08,
"logits/chosen": -2.379589557647705,
"logits/rejected": -2.4067189693450928,
"logps/chosen": -122.52884674072266,
"logps/rejected": -172.4658966064453,
"loss": 124411.0625,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.03309926018118858,
"rewards/margins": 0.042371779680252075,
"rewards/rejected": -0.07547104358673096,
"step": 2420
},
{
"epoch": 0.972,
"grad_norm": 9706266.63311398,
"learning_rate": 1.5555555555555554e-08,
"logits/chosen": -2.436565637588501,
"logits/rejected": -2.4684276580810547,
"logps/chosen": -109.63997650146484,
"logps/rejected": -152.08335876464844,
"loss": 122077.7,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03295578807592392,
"rewards/margins": 0.03596381098031998,
"rewards/rejected": -0.0689195990562439,
"step": 2430
},
{
"epoch": 0.976,
"grad_norm": 5892646.249318525,
"learning_rate": 1.3333333333333334e-08,
"logits/chosen": -2.531766891479492,
"logits/rejected": -2.5235583782196045,
"logps/chosen": -125.18925476074219,
"logps/rejected": -160.97665405273438,
"loss": 123783.1,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.026068557053804398,
"rewards/margins": 0.030096372589468956,
"rewards/rejected": -0.0561649315059185,
"step": 2440
},
{
"epoch": 0.98,
"grad_norm": 10534775.155367365,
"learning_rate": 1.111111111111111e-08,
"logits/chosen": -2.369849443435669,
"logits/rejected": -2.381124258041382,
"logps/chosen": -103.98789978027344,
"logps/rejected": -156.1312255859375,
"loss": 119931.5125,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.02474259026348591,
"rewards/margins": 0.03790457919239998,
"rewards/rejected": -0.06264716386795044,
"step": 2450
},
{
"epoch": 0.984,
"grad_norm": 9946607.778478429,
"learning_rate": 8.888888888888889e-09,
"logits/chosen": -2.487614393234253,
"logits/rejected": -2.465937614440918,
"logps/chosen": -111.37520599365234,
"logps/rejected": -137.24546813964844,
"loss": 124467.0125,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.03445830196142197,
"rewards/margins": 0.02246803045272827,
"rewards/rejected": -0.05692633241415024,
"step": 2460
},
{
"epoch": 0.988,
"grad_norm": 12838026.989788342,
"learning_rate": 6.666666666666667e-09,
"logits/chosen": -2.3559987545013428,
"logits/rejected": -2.3515543937683105,
"logps/chosen": -103.37791442871094,
"logps/rejected": -170.76101684570312,
"loss": 123802.275,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.028120238333940506,
"rewards/margins": 0.047586239874362946,
"rewards/rejected": -0.07570647448301315,
"step": 2470
},
{
"epoch": 0.992,
"grad_norm": 8735124.614081156,
"learning_rate": 4.444444444444444e-09,
"logits/chosen": -2.4489405155181885,
"logits/rejected": -2.4833984375,
"logps/chosen": -95.29798889160156,
"logps/rejected": -143.02792358398438,
"loss": 118030.2125,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.025047576054930687,
"rewards/margins": 0.04086794704198837,
"rewards/rejected": -0.06591552495956421,
"step": 2480
},
{
"epoch": 0.996,
"grad_norm": 9991241.9099312,
"learning_rate": 2.222222222222222e-09,
"logits/chosen": -2.351677417755127,
"logits/rejected": -2.272353410720825,
"logps/chosen": -93.49928283691406,
"logps/rejected": -141.19610595703125,
"loss": 118406.9625,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.02804369106888771,
"rewards/margins": 0.03656899183988571,
"rewards/rejected": -0.06461267918348312,
"step": 2490
},
{
"epoch": 1.0,
"grad_norm": 11977840.992411703,
"learning_rate": 0.0,
"logits/chosen": -2.5079355239868164,
"logits/rejected": -2.510873556137085,
"logps/chosen": -100.9855728149414,
"logps/rejected": -157.40550231933594,
"loss": 117766.25,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.029782477766275406,
"rewards/margins": 0.0452786386013031,
"rewards/rejected": -0.07506111264228821,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 2500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}