mistral-7b-dpo / trainer_state.json
Jenbenarye's picture
Model save
5d16a7c verified
{
"best_global_step": 2800,
"best_metric": 0.4922027885913849,
"best_model_checkpoint": "/workspace/adversarial-rlhf/runs/dpo-mistral-7b-sft-20251109-1358/checkpoints/checkpoint-2800",
"epoch": 1.0,
"eval_steps": 400,
"global_step": 3821,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0013085579691180318,
"grad_norm": 8.714019775390625,
"learning_rate": 1.9979063072494113e-05,
"logits/chosen": -3.054150104522705,
"logits/rejected": -2.987114429473877,
"logps/chosen": -285.14349365234375,
"logps/rejected": -257.39349365234375,
"loss": 0.6783,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.7019270062446594,
"rewards/margins": 0.2313508689403534,
"rewards/rejected": -0.9332779049873352,
"step": 5
},
{
"epoch": 0.0026171159382360636,
"grad_norm": 9.763724327087402,
"learning_rate": 1.995289191311175e-05,
"logits/chosen": -3.0378708839416504,
"logits/rejected": -3.088167667388916,
"logps/chosen": -266.4122619628906,
"logps/rejected": -260.312255859375,
"loss": 0.6071,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.1473495066165924,
"rewards/margins": 0.6945368647575378,
"rewards/rejected": -0.8418864011764526,
"step": 10
},
{
"epoch": 0.003925673907354096,
"grad_norm": 7.875804901123047,
"learning_rate": 1.9926720753729393e-05,
"logits/chosen": -2.932835102081299,
"logits/rejected": -2.925532341003418,
"logps/chosen": -251.9230499267578,
"logps/rejected": -243.9307861328125,
"loss": 0.5651,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1294022798538208,
"rewards/margins": 0.73015958070755,
"rewards/rejected": -0.8595618009567261,
"step": 15
},
{
"epoch": 0.005234231876472127,
"grad_norm": 12.238419532775879,
"learning_rate": 1.990054959434703e-05,
"logits/chosen": -2.963040828704834,
"logits/rejected": -2.9602060317993164,
"logps/chosen": -282.7713623046875,
"logps/rejected": -284.95208740234375,
"loss": 0.7029,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.3746412992477417,
"rewards/margins": 0.4599940776824951,
"rewards/rejected": -0.8346353769302368,
"step": 20
},
{
"epoch": 0.00654278984559016,
"grad_norm": 22.53215217590332,
"learning_rate": 1.987437843496467e-05,
"logits/chosen": -2.866917848587036,
"logits/rejected": -3.069469928741455,
"logps/chosen": -254.5303192138672,
"logps/rejected": -276.4834289550781,
"loss": 0.7576,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.47834309935569763,
"rewards/margins": 0.296063095331192,
"rewards/rejected": -0.7744062542915344,
"step": 25
},
{
"epoch": 0.007851347814708191,
"grad_norm": 12.339269638061523,
"learning_rate": 1.9848207275582308e-05,
"logits/chosen": -2.9572787284851074,
"logits/rejected": -2.9429330825805664,
"logps/chosen": -357.49896240234375,
"logps/rejected": -307.7606201171875,
"loss": 0.6489,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.26222458481788635,
"rewards/margins": 0.576259970664978,
"rewards/rejected": -0.31403541564941406,
"step": 30
},
{
"epoch": 0.009159905783826224,
"grad_norm": 14.245732307434082,
"learning_rate": 1.982203611619995e-05,
"logits/chosen": -2.901461362838745,
"logits/rejected": -2.9073104858398438,
"logps/chosen": -296.9358215332031,
"logps/rejected": -288.05694580078125,
"loss": 0.5916,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.48122453689575195,
"rewards/margins": 0.6745239496231079,
"rewards/rejected": -0.19329944252967834,
"step": 35
},
{
"epoch": 0.010468463752944255,
"grad_norm": 9.731245040893555,
"learning_rate": 1.979586495681759e-05,
"logits/chosen": -3.002631425857544,
"logits/rejected": -2.911168336868286,
"logps/chosen": -294.17767333984375,
"logps/rejected": -266.30242919921875,
"loss": 0.6211,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.7887242436408997,
"rewards/margins": 0.6125372052192688,
"rewards/rejected": 0.17618700861930847,
"step": 40
},
{
"epoch": 0.011777021722062287,
"grad_norm": 10.440970420837402,
"learning_rate": 1.9769693797435227e-05,
"logits/chosen": -2.9317564964294434,
"logits/rejected": -2.990628719329834,
"logps/chosen": -270.26019287109375,
"logps/rejected": -251.9040985107422,
"loss": 0.687,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.7482277154922485,
"rewards/margins": 0.3524876832962036,
"rewards/rejected": 0.39574000239372253,
"step": 45
},
{
"epoch": 0.01308557969118032,
"grad_norm": 31.515274047851562,
"learning_rate": 1.974352263805287e-05,
"logits/chosen": -3.0300424098968506,
"logits/rejected": -2.9904026985168457,
"logps/chosen": -274.59783935546875,
"logps/rejected": -304.90802001953125,
"loss": 0.6521,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.6415464878082275,
"rewards/margins": 0.4436315596103668,
"rewards/rejected": 0.19791492819786072,
"step": 50
},
{
"epoch": 0.014394137660298352,
"grad_norm": 11.78197956085205,
"learning_rate": 1.9717351478670507e-05,
"logits/chosen": -2.8813371658325195,
"logits/rejected": -2.969799757003784,
"logps/chosen": -311.2370300292969,
"logps/rejected": -267.30950927734375,
"loss": 0.7051,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.6925315856933594,
"rewards/margins": 0.44889575242996216,
"rewards/rejected": 0.243635892868042,
"step": 55
},
{
"epoch": 0.015702695629416383,
"grad_norm": 10.35225772857666,
"learning_rate": 1.9691180319288145e-05,
"logits/chosen": -3.0078067779541016,
"logits/rejected": -2.9517364501953125,
"logps/chosen": -283.88946533203125,
"logps/rejected": -305.6519775390625,
"loss": 0.6004,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.9307526350021362,
"rewards/margins": 0.6657913327217102,
"rewards/rejected": 0.2649613320827484,
"step": 60
},
{
"epoch": 0.017011253598534413,
"grad_norm": 8.242342948913574,
"learning_rate": 1.9665009159905787e-05,
"logits/chosen": -3.02073335647583,
"logits/rejected": -3.0664925575256348,
"logps/chosen": -242.3530731201172,
"logps/rejected": -191.35472106933594,
"loss": 0.5493,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.9005786180496216,
"rewards/margins": 0.8057304620742798,
"rewards/rejected": 0.09484807401895523,
"step": 65
},
{
"epoch": 0.018319811567652448,
"grad_norm": 9.840733528137207,
"learning_rate": 1.9638838000523425e-05,
"logits/chosen": -2.8799917697906494,
"logits/rejected": -3.000739812850952,
"logps/chosen": -234.18637084960938,
"logps/rejected": -286.62249755859375,
"loss": 0.4875,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.5168167352676392,
"rewards/margins": 1.134171724319458,
"rewards/rejected": -0.6173551678657532,
"step": 70
},
{
"epoch": 0.01962836953677048,
"grad_norm": 10.049373626708984,
"learning_rate": 1.9612666841141064e-05,
"logits/chosen": -3.011906862258911,
"logits/rejected": -3.059084892272949,
"logps/chosen": -290.17071533203125,
"logps/rejected": -269.0260009765625,
"loss": 0.6103,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.2669380307197571,
"rewards/margins": 0.9674070477485657,
"rewards/rejected": -0.700468897819519,
"step": 75
},
{
"epoch": 0.02093692750588851,
"grad_norm": 11.49927806854248,
"learning_rate": 1.9586495681758702e-05,
"logits/chosen": -3.0820119380950928,
"logits/rejected": -3.016047954559326,
"logps/chosen": -276.62841796875,
"logps/rejected": -263.7878723144531,
"loss": 0.6079,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.20246413350105286,
"rewards/margins": 1.208956003189087,
"rewards/rejected": -1.4114201068878174,
"step": 80
},
{
"epoch": 0.022245485475006543,
"grad_norm": 9.35627269744873,
"learning_rate": 1.9560324522376344e-05,
"logits/chosen": -2.9503092765808105,
"logits/rejected": -2.965207576751709,
"logps/chosen": -276.2810974121094,
"logps/rejected": -255.14111328125,
"loss": 0.5773,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.09604328870773315,
"rewards/margins": 1.1082497835159302,
"rewards/rejected": -1.0122063159942627,
"step": 85
},
{
"epoch": 0.023554043444124574,
"grad_norm": 8.25130844116211,
"learning_rate": 1.9534153362993982e-05,
"logits/chosen": -3.0043704509735107,
"logits/rejected": -3.0347070693969727,
"logps/chosen": -252.2435302734375,
"logps/rejected": -264.5854797363281,
"loss": 0.5873,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": 0.10906165838241577,
"rewards/margins": 0.7212487459182739,
"rewards/rejected": -0.6121870279312134,
"step": 90
},
{
"epoch": 0.02486260141324261,
"grad_norm": 16.03078842163086,
"learning_rate": 1.950798220361162e-05,
"logits/chosen": -3.1061108112335205,
"logits/rejected": -3.0602822303771973,
"logps/chosen": -257.32159423828125,
"logps/rejected": -257.50579833984375,
"loss": 0.5985,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.06869806349277496,
"rewards/margins": 0.6951006650924683,
"rewards/rejected": -0.6264026165008545,
"step": 95
},
{
"epoch": 0.02617115938236064,
"grad_norm": 14.365320205688477,
"learning_rate": 1.9481811044229262e-05,
"logits/chosen": -2.9251151084899902,
"logits/rejected": -2.903024196624756,
"logps/chosen": -272.07733154296875,
"logps/rejected": -242.03759765625,
"loss": 0.6797,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.20394983887672424,
"rewards/margins": 0.6385009288787842,
"rewards/rejected": -0.43455109000205994,
"step": 100
},
{
"epoch": 0.02747971735147867,
"grad_norm": 7.471345901489258,
"learning_rate": 1.94556398848469e-05,
"logits/chosen": -2.9216761589050293,
"logits/rejected": -2.9945147037506104,
"logps/chosen": -263.98724365234375,
"logps/rejected": -267.64422607421875,
"loss": 0.5756,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.037400662899017334,
"rewards/margins": 0.7236355543136597,
"rewards/rejected": -0.7610361576080322,
"step": 105
},
{
"epoch": 0.028788275320596704,
"grad_norm": 11.088753700256348,
"learning_rate": 1.942946872546454e-05,
"logits/chosen": -2.968885898590088,
"logits/rejected": -3.0910255908966064,
"logps/chosen": -284.25360107421875,
"logps/rejected": -248.49545288085938,
"loss": 0.6703,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.018188726156949997,
"rewards/margins": 0.46626266837120056,
"rewards/rejected": -0.48445138335227966,
"step": 110
},
{
"epoch": 0.030096833289714735,
"grad_norm": 8.927261352539062,
"learning_rate": 1.940329756608218e-05,
"logits/chosen": -2.9310824871063232,
"logits/rejected": -3.0046865940093994,
"logps/chosen": -266.04168701171875,
"logps/rejected": -239.96011352539062,
"loss": 0.4949,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.1395626664161682,
"rewards/margins": 1.0587072372436523,
"rewards/rejected": -1.1982699632644653,
"step": 115
},
{
"epoch": 0.031405391258832765,
"grad_norm": 9.871482849121094,
"learning_rate": 1.937712640669982e-05,
"logits/chosen": -3.0028929710388184,
"logits/rejected": -2.9868929386138916,
"logps/chosen": -302.8767395019531,
"logps/rejected": -311.38995361328125,
"loss": 0.6321,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.15047518908977509,
"rewards/margins": 0.8277810215950012,
"rewards/rejected": -0.9782562255859375,
"step": 120
},
{
"epoch": 0.032713949227950796,
"grad_norm": 9.17164421081543,
"learning_rate": 1.9350955247317458e-05,
"logits/chosen": -2.9586594104766846,
"logits/rejected": -2.9300732612609863,
"logps/chosen": -258.01080322265625,
"logps/rejected": -308.2096862792969,
"loss": 0.5535,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.07981396466493607,
"rewards/margins": 1.1827664375305176,
"rewards/rejected": -1.2625802755355835,
"step": 125
},
{
"epoch": 0.03402250719706883,
"grad_norm": 10.289148330688477,
"learning_rate": 1.93247840879351e-05,
"logits/chosen": -2.9960761070251465,
"logits/rejected": -3.059537172317505,
"logps/chosen": -254.6339874267578,
"logps/rejected": -250.0092315673828,
"loss": 0.6809,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0245519932359457,
"rewards/margins": 0.7872114181518555,
"rewards/rejected": -0.8117634654045105,
"step": 130
},
{
"epoch": 0.035331065166186865,
"grad_norm": 7.545676231384277,
"learning_rate": 1.9298612928552734e-05,
"logits/chosen": -2.88844633102417,
"logits/rejected": -2.9349312782287598,
"logps/chosen": -257.82183837890625,
"logps/rejected": -272.8002014160156,
"loss": 0.5251,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.11282087862491608,
"rewards/margins": 1.0913177728652954,
"rewards/rejected": -0.9784967303276062,
"step": 135
},
{
"epoch": 0.036639623135304895,
"grad_norm": 9.456621170043945,
"learning_rate": 1.9272441769170376e-05,
"logits/chosen": -2.9461770057678223,
"logits/rejected": -3.001709461212158,
"logps/chosen": -276.93646240234375,
"logps/rejected": -288.7925109863281,
"loss": 0.5606,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.20001547038555145,
"rewards/margins": 1.2281715869903564,
"rewards/rejected": -1.0281562805175781,
"step": 140
},
{
"epoch": 0.037948181104422926,
"grad_norm": 8.707345962524414,
"learning_rate": 1.9246270609788015e-05,
"logits/chosen": -3.082422971725464,
"logits/rejected": -3.10573673248291,
"logps/chosen": -270.7345275878906,
"logps/rejected": -260.58013916015625,
"loss": 0.6272,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.25665849447250366,
"rewards/margins": 1.013261079788208,
"rewards/rejected": -1.2699196338653564,
"step": 145
},
{
"epoch": 0.03925673907354096,
"grad_norm": 7.2488203048706055,
"learning_rate": 1.9220099450405653e-05,
"logits/chosen": -2.9218993186950684,
"logits/rejected": -2.962679386138916,
"logps/chosen": -242.43215942382812,
"logps/rejected": -234.44113159179688,
"loss": 0.5128,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.46268120408058167,
"rewards/margins": 1.0108039379119873,
"rewards/rejected": -1.4734852313995361,
"step": 150
},
{
"epoch": 0.04056529704265899,
"grad_norm": 6.86571741104126,
"learning_rate": 1.9193928291023295e-05,
"logits/chosen": -2.927309513092041,
"logits/rejected": -3.0099613666534424,
"logps/chosen": -274.22613525390625,
"logps/rejected": -243.80191040039062,
"loss": 0.4807,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6335469484329224,
"rewards/margins": 1.2637865543365479,
"rewards/rejected": -1.8973333835601807,
"step": 155
},
{
"epoch": 0.04187385501177702,
"grad_norm": 6.156006336212158,
"learning_rate": 1.9167757131640933e-05,
"logits/chosen": -2.88226580619812,
"logits/rejected": -2.913170099258423,
"logps/chosen": -277.84765625,
"logps/rejected": -283.1264343261719,
"loss": 0.549,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8374231457710266,
"rewards/margins": 1.1124131679534912,
"rewards/rejected": -1.9498363733291626,
"step": 160
},
{
"epoch": 0.043182412980895056,
"grad_norm": 13.543861389160156,
"learning_rate": 1.914158597225857e-05,
"logits/chosen": -3.1270055770874023,
"logits/rejected": -3.066056728363037,
"logps/chosen": -258.69879150390625,
"logps/rejected": -248.0587921142578,
"loss": 0.5938,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8121991157531738,
"rewards/margins": 0.9060171246528625,
"rewards/rejected": -1.7182163000106812,
"step": 165
},
{
"epoch": 0.04449097095001309,
"grad_norm": 11.699880599975586,
"learning_rate": 1.9115414812876213e-05,
"logits/chosen": -2.890881299972534,
"logits/rejected": -2.9870123863220215,
"logps/chosen": -280.4810485839844,
"logps/rejected": -268.2397155761719,
"loss": 0.5342,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5369232892990112,
"rewards/margins": 1.2705774307250977,
"rewards/rejected": -1.8075008392333984,
"step": 170
},
{
"epoch": 0.04579952891913112,
"grad_norm": 6.853356838226318,
"learning_rate": 1.908924365349385e-05,
"logits/chosen": -2.9231629371643066,
"logits/rejected": -2.9792449474334717,
"logps/chosen": -299.10687255859375,
"logps/rejected": -262.38604736328125,
"loss": 0.385,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.3845639228820801,
"rewards/margins": 1.6383775472640991,
"rewards/rejected": -2.0229413509368896,
"step": 175
},
{
"epoch": 0.04710808688824915,
"grad_norm": 13.25985050201416,
"learning_rate": 1.906307249411149e-05,
"logits/chosen": -3.018812894821167,
"logits/rejected": -3.0761470794677734,
"logps/chosen": -262.8817443847656,
"logps/rejected": -241.5967559814453,
"loss": 0.7499,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7485982775688171,
"rewards/margins": 0.9674726724624634,
"rewards/rejected": -1.7160708904266357,
"step": 180
},
{
"epoch": 0.04841664485736718,
"grad_norm": 8.176429748535156,
"learning_rate": 1.9036901334729128e-05,
"logits/chosen": -2.7412192821502686,
"logits/rejected": -2.875842571258545,
"logps/chosen": -287.76531982421875,
"logps/rejected": -286.1810302734375,
"loss": 0.5304,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9107913970947266,
"rewards/margins": 1.261719822883606,
"rewards/rejected": -2.172511339187622,
"step": 185
},
{
"epoch": 0.04972520282648522,
"grad_norm": 6.8885416984558105,
"learning_rate": 1.901073017534677e-05,
"logits/chosen": -2.882361888885498,
"logits/rejected": -2.919163227081299,
"logps/chosen": -283.5059509277344,
"logps/rejected": -279.5335998535156,
"loss": 0.539,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9788363575935364,
"rewards/margins": 1.1429237127304077,
"rewards/rejected": -2.1217598915100098,
"step": 190
},
{
"epoch": 0.05103376079560325,
"grad_norm": 7.295929908752441,
"learning_rate": 1.898455901596441e-05,
"logits/chosen": -2.9931766986846924,
"logits/rejected": -3.0010311603546143,
"logps/chosen": -278.93463134765625,
"logps/rejected": -269.3666076660156,
"loss": 0.4351,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.090458631515503,
"rewards/margins": 1.1227349042892456,
"rewards/rejected": -2.213193893432617,
"step": 195
},
{
"epoch": 0.05234231876472128,
"grad_norm": 7.319633483886719,
"learning_rate": 1.8958387856582047e-05,
"logits/chosen": -2.991227626800537,
"logits/rejected": -2.9963135719299316,
"logps/chosen": -261.43463134765625,
"logps/rejected": -284.7808837890625,
"loss": 0.479,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3308995962142944,
"rewards/margins": 1.1909126043319702,
"rewards/rejected": -2.5218122005462646,
"step": 200
},
{
"epoch": 0.05365087673383931,
"grad_norm": 8.167856216430664,
"learning_rate": 1.893221669719969e-05,
"logits/chosen": -3.0124361515045166,
"logits/rejected": -3.044581890106201,
"logps/chosen": -263.484619140625,
"logps/rejected": -291.4689636230469,
"loss": 0.6073,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.635348916053772,
"rewards/margins": 0.6897547245025635,
"rewards/rejected": -2.325103521347046,
"step": 205
},
{
"epoch": 0.05495943470295734,
"grad_norm": 10.466708183288574,
"learning_rate": 1.8906045537817327e-05,
"logits/chosen": -2.9249165058135986,
"logits/rejected": -3.049833059310913,
"logps/chosen": -294.9049072265625,
"logps/rejected": -312.395263671875,
"loss": 0.5377,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1728683710098267,
"rewards/margins": 1.2662220001220703,
"rewards/rejected": -2.4390902519226074,
"step": 210
},
{
"epoch": 0.05626799267207537,
"grad_norm": 11.2478666305542,
"learning_rate": 1.8879874378434965e-05,
"logits/chosen": -2.9483745098114014,
"logits/rejected": -2.929673194885254,
"logps/chosen": -274.67230224609375,
"logps/rejected": -265.0277404785156,
"loss": 0.748,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.8842582702636719,
"rewards/margins": 0.5387625694274902,
"rewards/rejected": -2.423020839691162,
"step": 215
},
{
"epoch": 0.05757655064119341,
"grad_norm": 9.180577278137207,
"learning_rate": 1.8853703219052607e-05,
"logits/chosen": -2.966609477996826,
"logits/rejected": -3.0011610984802246,
"logps/chosen": -294.72418212890625,
"logps/rejected": -295.13134765625,
"loss": 0.4851,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5100880861282349,
"rewards/margins": 1.0742136240005493,
"rewards/rejected": -2.5843019485473633,
"step": 220
},
{
"epoch": 0.05888510861031144,
"grad_norm": 9.102991104125977,
"learning_rate": 1.8827532059670245e-05,
"logits/chosen": -3.008378028869629,
"logits/rejected": -3.0497703552246094,
"logps/chosen": -292.7835388183594,
"logps/rejected": -260.39215087890625,
"loss": 0.526,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0618221759796143,
"rewards/margins": 1.2599637508392334,
"rewards/rejected": -2.3217859268188477,
"step": 225
},
{
"epoch": 0.06019366657942947,
"grad_norm": 7.269233226776123,
"learning_rate": 1.8801360900287884e-05,
"logits/chosen": -2.9787087440490723,
"logits/rejected": -3.0631184577941895,
"logps/chosen": -322.45257568359375,
"logps/rejected": -326.454345703125,
"loss": 0.5484,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6545368432998657,
"rewards/margins": 1.1309837102890015,
"rewards/rejected": -1.7855205535888672,
"step": 230
},
{
"epoch": 0.0615022245485475,
"grad_norm": 10.517141342163086,
"learning_rate": 1.8775189740905526e-05,
"logits/chosen": -2.922563076019287,
"logits/rejected": -2.997480869293213,
"logps/chosen": -262.8779602050781,
"logps/rejected": -304.7298278808594,
"loss": 0.5244,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6560412645339966,
"rewards/margins": 1.1671555042266846,
"rewards/rejected": -1.8231967687606812,
"step": 235
},
{
"epoch": 0.06281078251766553,
"grad_norm": 6.655497074127197,
"learning_rate": 1.8749018581523164e-05,
"logits/chosen": -2.994204521179199,
"logits/rejected": -3.054076671600342,
"logps/chosen": -240.95181274414062,
"logps/rejected": -258.10052490234375,
"loss": 0.5248,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.3981846868991852,
"rewards/margins": 1.1931384801864624,
"rewards/rejected": -1.5913231372833252,
"step": 240
},
{
"epoch": 0.06411934048678357,
"grad_norm": 13.732154846191406,
"learning_rate": 1.8722847422140802e-05,
"logits/chosen": -2.8550612926483154,
"logits/rejected": -3.0134494304656982,
"logps/chosen": -261.8082580566406,
"logps/rejected": -251.3233184814453,
"loss": 0.576,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.26595190167427063,
"rewards/margins": 1.036379337310791,
"rewards/rejected": -1.3023312091827393,
"step": 245
},
{
"epoch": 0.06542789845590159,
"grad_norm": 6.477427005767822,
"learning_rate": 1.869667626275844e-05,
"logits/chosen": -2.8698863983154297,
"logits/rejected": -3.006420850753784,
"logps/chosen": -277.37860107421875,
"logps/rejected": -293.40899658203125,
"loss": 0.4292,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.00966604333370924,
"rewards/margins": 1.26931631565094,
"rewards/rejected": -1.278982400894165,
"step": 250
},
{
"epoch": 0.06673645642501963,
"grad_norm": 4.57761812210083,
"learning_rate": 1.8670505103376082e-05,
"logits/chosen": -3.044351100921631,
"logits/rejected": -3.0704281330108643,
"logps/chosen": -306.3280029296875,
"logps/rejected": -274.84344482421875,
"loss": 0.5014,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.05158301070332527,
"rewards/margins": 1.2034839391708374,
"rewards/rejected": -1.1519008874893188,
"step": 255
},
{
"epoch": 0.06804501439413765,
"grad_norm": 10.730956077575684,
"learning_rate": 1.864433394399372e-05,
"logits/chosen": -2.7571253776550293,
"logits/rejected": -2.827652931213379,
"logps/chosen": -326.50445556640625,
"logps/rejected": -288.9650573730469,
"loss": 0.4991,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.46068209409713745,
"rewards/margins": 1.4844446182250977,
"rewards/rejected": -1.9451268911361694,
"step": 260
},
{
"epoch": 0.06935357236325569,
"grad_norm": 7.003467082977295,
"learning_rate": 1.861816278461136e-05,
"logits/chosen": -2.933865547180176,
"logits/rejected": -3.005654811859131,
"logps/chosen": -247.2929229736328,
"logps/rejected": -237.02456665039062,
"loss": 0.5385,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.32561081647872925,
"rewards/margins": 1.441275954246521,
"rewards/rejected": -1.7668869495391846,
"step": 265
},
{
"epoch": 0.07066213033237373,
"grad_norm": 9.393101692199707,
"learning_rate": 1.8591991625229e-05,
"logits/chosen": -2.863271713256836,
"logits/rejected": -2.964902877807617,
"logps/chosen": -292.7139892578125,
"logps/rejected": -270.59466552734375,
"loss": 0.5143,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8659523129463196,
"rewards/margins": 1.305040717124939,
"rewards/rejected": -2.1709930896759033,
"step": 270
},
{
"epoch": 0.07197068830149175,
"grad_norm": 7.726457595825195,
"learning_rate": 1.856582046584664e-05,
"logits/chosen": -2.9840664863586426,
"logits/rejected": -3.0086734294891357,
"logps/chosen": -323.1566467285156,
"logps/rejected": -292.984375,
"loss": 0.488,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8014192581176758,
"rewards/margins": 1.3756850957870483,
"rewards/rejected": -2.1771044731140137,
"step": 275
},
{
"epoch": 0.07327924627060979,
"grad_norm": 11.760858535766602,
"learning_rate": 1.8539649306464278e-05,
"logits/chosen": -3.0070672035217285,
"logits/rejected": -3.066682815551758,
"logps/chosen": -269.5534362792969,
"logps/rejected": -247.11062622070312,
"loss": 0.5206,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8253080248832703,
"rewards/margins": 1.3371561765670776,
"rewards/rejected": -2.1624643802642822,
"step": 280
},
{
"epoch": 0.07458780423972781,
"grad_norm": 8.769774436950684,
"learning_rate": 1.851347814708192e-05,
"logits/chosen": -2.873136281967163,
"logits/rejected": -2.9881954193115234,
"logps/chosen": -270.26043701171875,
"logps/rejected": -258.38629150390625,
"loss": 0.5568,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9168025851249695,
"rewards/margins": 1.2144734859466553,
"rewards/rejected": -2.1312763690948486,
"step": 285
},
{
"epoch": 0.07589636220884585,
"grad_norm": 9.61667251586914,
"learning_rate": 1.8487306987699554e-05,
"logits/chosen": -2.9167819023132324,
"logits/rejected": -2.8649868965148926,
"logps/chosen": -285.44305419921875,
"logps/rejected": -276.2369384765625,
"loss": 0.5117,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.127011775970459,
"rewards/margins": 1.2268073558807373,
"rewards/rejected": -2.3538193702697754,
"step": 290
},
{
"epoch": 0.07720492017796389,
"grad_norm": 9.920550346374512,
"learning_rate": 1.8461135828317196e-05,
"logits/chosen": -2.9163198471069336,
"logits/rejected": -2.803847074508667,
"logps/chosen": -303.9400939941406,
"logps/rejected": -318.45843505859375,
"loss": 0.6123,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.005907654762268,
"rewards/margins": 0.9369108080863953,
"rewards/rejected": -1.9428186416625977,
"step": 295
},
{
"epoch": 0.07851347814708191,
"grad_norm": 8.180704116821289,
"learning_rate": 1.8434964668934835e-05,
"logits/chosen": -2.8901426792144775,
"logits/rejected": -2.998375415802002,
"logps/chosen": -291.26788330078125,
"logps/rejected": -256.09100341796875,
"loss": 0.6608,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0462286472320557,
"rewards/margins": 0.6313639879226685,
"rewards/rejected": -1.6775926351547241,
"step": 300
},
{
"epoch": 0.07982203611619995,
"grad_norm": 8.73984146118164,
"learning_rate": 1.8408793509552473e-05,
"logits/chosen": -2.893742322921753,
"logits/rejected": -2.846432685852051,
"logps/chosen": -326.13775634765625,
"logps/rejected": -305.88873291015625,
"loss": 0.5684,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.74317467212677,
"rewards/margins": 0.9149740934371948,
"rewards/rejected": -1.658149003982544,
"step": 305
},
{
"epoch": 0.08113059408531798,
"grad_norm": 10.25295639038086,
"learning_rate": 1.8382622350170115e-05,
"logits/chosen": -2.9782931804656982,
"logits/rejected": -3.011655330657959,
"logps/chosen": -317.3922119140625,
"logps/rejected": -312.5651550292969,
"loss": 0.6222,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.720477283000946,
"rewards/margins": 1.068196415901184,
"rewards/rejected": -1.7886736392974854,
"step": 310
},
{
"epoch": 0.08243915205443601,
"grad_norm": 5.97982120513916,
"learning_rate": 1.8356451190787753e-05,
"logits/chosen": -2.924208879470825,
"logits/rejected": -2.9455435276031494,
"logps/chosen": -293.01611328125,
"logps/rejected": -308.312255859375,
"loss": 0.442,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.49260687828063965,
"rewards/margins": 1.3362938165664673,
"rewards/rejected": -1.8289005756378174,
"step": 315
},
{
"epoch": 0.08374771002355404,
"grad_norm": 10.534534454345703,
"learning_rate": 1.833028003140539e-05,
"logits/chosen": -2.9009976387023926,
"logits/rejected": -2.8593392372131348,
"logps/chosen": -241.1309356689453,
"logps/rejected": -300.59051513671875,
"loss": 0.5321,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9334520101547241,
"rewards/margins": 1.1237605810165405,
"rewards/rejected": -2.0572123527526855,
"step": 320
},
{
"epoch": 0.08505626799267207,
"grad_norm": 5.939411163330078,
"learning_rate": 1.8304108872023033e-05,
"logits/chosen": -2.9997639656066895,
"logits/rejected": -3.0202760696411133,
"logps/chosen": -269.6910400390625,
"logps/rejected": -314.3893127441406,
"loss": 0.5945,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7280455827713013,
"rewards/margins": 1.2291756868362427,
"rewards/rejected": -1.957221269607544,
"step": 325
},
{
"epoch": 0.08636482596179011,
"grad_norm": 11.953298568725586,
"learning_rate": 1.827793771264067e-05,
"logits/chosen": -2.9474053382873535,
"logits/rejected": -2.8666484355926514,
"logps/chosen": -305.43157958984375,
"logps/rejected": -280.44232177734375,
"loss": 0.6738,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7594338059425354,
"rewards/margins": 1.0998165607452393,
"rewards/rejected": -1.8592504262924194,
"step": 330
},
{
"epoch": 0.08767338393090814,
"grad_norm": 5.373962879180908,
"learning_rate": 1.825176655325831e-05,
"logits/chosen": -2.947441816329956,
"logits/rejected": -3.092377185821533,
"logps/chosen": -309.4542236328125,
"logps/rejected": -287.61309814453125,
"loss": 0.5345,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5140593647956848,
"rewards/margins": 1.1911535263061523,
"rewards/rejected": -1.705212950706482,
"step": 335
},
{
"epoch": 0.08898194190002617,
"grad_norm": 7.115630149841309,
"learning_rate": 1.822559539387595e-05,
"logits/chosen": -2.958962917327881,
"logits/rejected": -3.1099159717559814,
"logps/chosen": -279.24053955078125,
"logps/rejected": -241.4726104736328,
"loss": 0.6099,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.852770984172821,
"rewards/margins": 0.7456468343734741,
"rewards/rejected": -1.59841787815094,
"step": 340
},
{
"epoch": 0.0902904998691442,
"grad_norm": 10.483302116394043,
"learning_rate": 1.819942423449359e-05,
"logits/chosen": -2.8903632164001465,
"logits/rejected": -2.9951415061950684,
"logps/chosen": -306.63531494140625,
"logps/rejected": -260.45281982421875,
"loss": 0.4238,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.48541754484176636,
"rewards/margins": 1.6238024234771729,
"rewards/rejected": -2.109220027923584,
"step": 345
},
{
"epoch": 0.09159905783826224,
"grad_norm": 5.090464115142822,
"learning_rate": 1.817325307511123e-05,
"logits/chosen": -2.9965295791625977,
"logits/rejected": -3.0568604469299316,
"logps/chosen": -282.74853515625,
"logps/rejected": -271.244384765625,
"loss": 0.4535,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5810127258300781,
"rewards/margins": 1.4132329225540161,
"rewards/rejected": -1.9942457675933838,
"step": 350
},
{
"epoch": 0.09290761580738027,
"grad_norm": 9.859427452087402,
"learning_rate": 1.8147081915728867e-05,
"logits/chosen": -2.8435680866241455,
"logits/rejected": -2.930812120437622,
"logps/chosen": -268.23638916015625,
"logps/rejected": -276.4360046386719,
"loss": 0.5494,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8862309455871582,
"rewards/margins": 1.2033555507659912,
"rewards/rejected": -2.0895867347717285,
"step": 355
},
{
"epoch": 0.0942161737764983,
"grad_norm": 6.459297180175781,
"learning_rate": 1.812091075634651e-05,
"logits/chosen": -2.76271915435791,
"logits/rejected": -2.815265417098999,
"logps/chosen": -275.8890380859375,
"logps/rejected": -268.1244201660156,
"loss": 0.4645,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.765187680721283,
"rewards/margins": 1.524695634841919,
"rewards/rejected": -2.289883613586426,
"step": 360
},
{
"epoch": 0.09552473174561633,
"grad_norm": 5.793435096740723,
"learning_rate": 1.8094739596964147e-05,
"logits/chosen": -2.9098358154296875,
"logits/rejected": -2.8405723571777344,
"logps/chosen": -273.9772033691406,
"logps/rejected": -330.71588134765625,
"loss": 0.5838,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3435543179512024,
"rewards/margins": 1.3868069648742676,
"rewards/rejected": -1.7303612232208252,
"step": 365
},
{
"epoch": 0.09683328971473436,
"grad_norm": 7.4615159034729,
"learning_rate": 1.8068568437581785e-05,
"logits/chosen": -3.007171392440796,
"logits/rejected": -2.9444351196289062,
"logps/chosen": -317.48297119140625,
"logps/rejected": -327.6475830078125,
"loss": 0.5003,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.00199909508228302,
"rewards/margins": 1.4891326427459717,
"rewards/rejected": -1.4871336221694946,
"step": 370
},
{
"epoch": 0.0981418476838524,
"grad_norm": 12.441634178161621,
"learning_rate": 1.8042397278199427e-05,
"logits/chosen": -2.8079023361206055,
"logits/rejected": -2.825000286102295,
"logps/chosen": -310.08551025390625,
"logps/rejected": -294.376220703125,
"loss": 0.657,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.27615275979042053,
"rewards/margins": 1.0713586807250977,
"rewards/rejected": -1.3475112915039062,
"step": 375
},
{
"epoch": 0.09945040565297043,
"grad_norm": 7.3414764404296875,
"learning_rate": 1.8016226118817065e-05,
"logits/chosen": -2.9461495876312256,
"logits/rejected": -3.022108554840088,
"logps/chosen": -261.1568908691406,
"logps/rejected": -277.7593994140625,
"loss": 0.5084,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.24008110165596008,
"rewards/margins": 1.1185321807861328,
"rewards/rejected": -0.8784511685371399,
"step": 380
},
{
"epoch": 0.10075896362208846,
"grad_norm": 6.003385066986084,
"learning_rate": 1.7990054959434704e-05,
"logits/chosen": -2.936765193939209,
"logits/rejected": -2.9935214519500732,
"logps/chosen": -303.5323791503906,
"logps/rejected": -254.99526977539062,
"loss": 0.4317,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.10325083881616592,
"rewards/margins": 1.691622018814087,
"rewards/rejected": -1.7948728799819946,
"step": 385
},
{
"epoch": 0.1020675215912065,
"grad_norm": 11.351863861083984,
"learning_rate": 1.7963883800052346e-05,
"logits/chosen": -2.944807529449463,
"logits/rejected": -2.961432933807373,
"logps/chosen": -303.90130615234375,
"logps/rejected": -256.1380310058594,
"loss": 0.5825,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.37615787982940674,
"rewards/margins": 1.2104028463363647,
"rewards/rejected": -1.586560606956482,
"step": 390
},
{
"epoch": 0.10337607956032452,
"grad_norm": 5.774983882904053,
"learning_rate": 1.7937712640669984e-05,
"logits/chosen": -2.9214158058166504,
"logits/rejected": -2.9814064502716064,
"logps/chosen": -239.17587280273438,
"logps/rejected": -241.3479766845703,
"loss": 0.4779,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.5130228400230408,
"rewards/margins": 1.39810311794281,
"rewards/rejected": -1.911125898361206,
"step": 395
},
{
"epoch": 0.10468463752944256,
"grad_norm": 7.241608142852783,
"learning_rate": 1.7911541481287622e-05,
"logits/chosen": -2.7809369564056396,
"logits/rejected": -2.818983554840088,
"logps/chosen": -297.4482116699219,
"logps/rejected": -268.3238220214844,
"loss": 0.426,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6951895356178284,
"rewards/margins": 1.5675487518310547,
"rewards/rejected": -2.2627382278442383,
"step": 400
},
{
"epoch": 0.10468463752944256,
"eval_logits/chosen": -2.9021153450012207,
"eval_logits/rejected": -2.9161083698272705,
"eval_logps/chosen": -294.0876159667969,
"eval_logps/rejected": -286.1116027832031,
"eval_loss": 0.5800156593322754,
"eval_rewards/accuracies": 0.7139999866485596,
"eval_rewards/chosen": -1.085422396659851,
"eval_rewards/margins": 1.1322746276855469,
"eval_rewards/rejected": -2.2176966667175293,
"eval_runtime": 763.418,
"eval_samples_per_second": 2.62,
"eval_steps_per_second": 0.327,
"step": 400
},
{
"epoch": 0.10599319549856058,
"grad_norm": 7.2183356285095215,
"learning_rate": 1.788537032190526e-05,
"logits/chosen": -2.7751684188842773,
"logits/rejected": -2.7491390705108643,
"logps/chosen": -269.92083740234375,
"logps/rejected": -294.2773742675781,
"loss": 0.5384,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0581508874893188,
"rewards/margins": 1.3911430835723877,
"rewards/rejected": -2.449293851852417,
"step": 405
},
{
"epoch": 0.10730175346767862,
"grad_norm": 6.803300857543945,
"learning_rate": 1.7859199162522902e-05,
"logits/chosen": -2.8796913623809814,
"logits/rejected": -2.933354616165161,
"logps/chosen": -330.7289123535156,
"logps/rejected": -279.0171813964844,
"loss": 0.5261,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6245895624160767,
"rewards/margins": 1.040086030960083,
"rewards/rejected": -2.664675235748291,
"step": 410
},
{
"epoch": 0.10861031143679666,
"grad_norm": 10.539965629577637,
"learning_rate": 1.783302800314054e-05,
"logits/chosen": -2.959507942199707,
"logits/rejected": -3.0030369758605957,
"logps/chosen": -310.83880615234375,
"logps/rejected": -307.89508056640625,
"loss": 0.5226,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2687180042266846,
"rewards/margins": 1.3614869117736816,
"rewards/rejected": -2.630204916000366,
"step": 415
},
{
"epoch": 0.10991886940591468,
"grad_norm": 4.8235626220703125,
"learning_rate": 1.780685684375818e-05,
"logits/chosen": -3.0115878582000732,
"logits/rejected": -3.0866780281066895,
"logps/chosen": -313.1065673828125,
"logps/rejected": -276.90252685546875,
"loss": 0.3901,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.8931556940078735,
"rewards/margins": 1.7108110189437866,
"rewards/rejected": -3.6039669513702393,
"step": 420
},
{
"epoch": 0.11122742737503272,
"grad_norm": 7.7981390953063965,
"learning_rate": 1.778068568437582e-05,
"logits/chosen": -2.925339937210083,
"logits/rejected": -2.993244171142578,
"logps/chosen": -278.63165283203125,
"logps/rejected": -298.1899108886719,
"loss": 0.5171,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.9866344928741455,
"rewards/margins": 1.4827523231506348,
"rewards/rejected": -3.469386577606201,
"step": 425
},
{
"epoch": 0.11253598534415074,
"grad_norm": 8.36640739440918,
"learning_rate": 1.775451452499346e-05,
"logits/chosen": -2.83201265335083,
"logits/rejected": -2.883605480194092,
"logps/chosen": -292.6142883300781,
"logps/rejected": -290.2992248535156,
"loss": 0.6229,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.152479410171509,
"rewards/margins": 1.368148922920227,
"rewards/rejected": -3.5206284523010254,
"step": 430
},
{
"epoch": 0.11384454331326878,
"grad_norm": 8.783513069152832,
"learning_rate": 1.7728343365611098e-05,
"logits/chosen": -2.845114231109619,
"logits/rejected": -2.8904240131378174,
"logps/chosen": -291.27093505859375,
"logps/rejected": -297.92047119140625,
"loss": 0.5382,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.8989918231964111,
"rewards/margins": 1.32305908203125,
"rewards/rejected": -3.222050905227661,
"step": 435
},
{
"epoch": 0.11515310128238682,
"grad_norm": 3.8981902599334717,
"learning_rate": 1.770217220622874e-05,
"logits/chosen": -2.915076494216919,
"logits/rejected": -2.996788263320923,
"logps/chosen": -285.4013977050781,
"logps/rejected": -287.89410400390625,
"loss": 0.4423,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5970932245254517,
"rewards/margins": 1.7360804080963135,
"rewards/rejected": -3.3331737518310547,
"step": 440
},
{
"epoch": 0.11646165925150484,
"grad_norm": 8.515145301818848,
"learning_rate": 1.7676001046846374e-05,
"logits/chosen": -2.9851067066192627,
"logits/rejected": -2.9686992168426514,
"logps/chosen": -326.7468566894531,
"logps/rejected": -294.19927978515625,
"loss": 0.5876,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.047184467315674,
"rewards/margins": 1.3992159366607666,
"rewards/rejected": -3.4464004039764404,
"step": 445
},
{
"epoch": 0.11777021722062288,
"grad_norm": 10.733110427856445,
"learning_rate": 1.7649829887464016e-05,
"logits/chosen": -2.961892604827881,
"logits/rejected": -3.0297560691833496,
"logps/chosen": -320.75885009765625,
"logps/rejected": -314.3056640625,
"loss": 0.7165,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.082216501235962,
"rewards/margins": 0.8804648518562317,
"rewards/rejected": -2.962681293487549,
"step": 450
},
{
"epoch": 0.1190787751897409,
"grad_norm": 8.966882705688477,
"learning_rate": 1.7623658728081658e-05,
"logits/chosen": -2.9676575660705566,
"logits/rejected": -3.0381922721862793,
"logps/chosen": -338.7352600097656,
"logps/rejected": -317.4922790527344,
"loss": 0.6433,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.862184762954712,
"rewards/margins": 1.0298666954040527,
"rewards/rejected": -2.8920512199401855,
"step": 455
},
{
"epoch": 0.12038733315885894,
"grad_norm": 5.6694865226745605,
"learning_rate": 1.7597487568699293e-05,
"logits/chosen": -2.8035426139831543,
"logits/rejected": -2.9555842876434326,
"logps/chosen": -264.0455322265625,
"logps/rejected": -252.0572967529297,
"loss": 0.5003,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.9079868793487549,
"rewards/margins": 1.1119083166122437,
"rewards/rejected": -3.019895076751709,
"step": 460
},
{
"epoch": 0.12169589112797696,
"grad_norm": 7.887356758117676,
"learning_rate": 1.7571316409316935e-05,
"logits/chosen": -3.0163044929504395,
"logits/rejected": -3.06129789352417,
"logps/chosen": -325.7791748046875,
"logps/rejected": -265.0768127441406,
"loss": 0.4852,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4090251922607422,
"rewards/margins": 1.3811366558074951,
"rewards/rejected": -2.7901618480682373,
"step": 465
},
{
"epoch": 0.123004449097095,
"grad_norm": 7.931512832641602,
"learning_rate": 1.7545145249934573e-05,
"logits/chosen": -2.864602565765381,
"logits/rejected": -2.876664161682129,
"logps/chosen": -297.8940734863281,
"logps/rejected": -293.0927429199219,
"loss": 0.3987,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.5729058980941772,
"rewards/margins": 1.4379829168319702,
"rewards/rejected": -3.0108885765075684,
"step": 470
},
{
"epoch": 0.12431300706621304,
"grad_norm": 5.773271083831787,
"learning_rate": 1.751897409055221e-05,
"logits/chosen": -2.9812722206115723,
"logits/rejected": -2.986359119415283,
"logps/chosen": -307.86053466796875,
"logps/rejected": -298.8404541015625,
"loss": 0.5465,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1548845767974854,
"rewards/margins": 1.2083876132965088,
"rewards/rejected": -2.363272190093994,
"step": 475
},
{
"epoch": 0.12562156503533106,
"grad_norm": 9.784249305725098,
"learning_rate": 1.7492802931169853e-05,
"logits/chosen": -2.872807025909424,
"logits/rejected": -2.967390298843384,
"logps/chosen": -277.0250244140625,
"logps/rejected": -278.3902893066406,
"loss": 0.6129,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5591986179351807,
"rewards/margins": 1.1454228162765503,
"rewards/rejected": -2.7046213150024414,
"step": 480
},
{
"epoch": 0.12693012300444909,
"grad_norm": 8.737171173095703,
"learning_rate": 1.746663177178749e-05,
"logits/chosen": -2.9234728813171387,
"logits/rejected": -2.9303388595581055,
"logps/chosen": -334.78704833984375,
"logps/rejected": -279.1730651855469,
"loss": 0.5064,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4824053049087524,
"rewards/margins": 1.2012741565704346,
"rewards/rejected": -2.6836795806884766,
"step": 485
},
{
"epoch": 0.12823868097356714,
"grad_norm": 6.572619915008545,
"learning_rate": 1.744046061240513e-05,
"logits/chosen": -2.853210926055908,
"logits/rejected": -2.9061577320098877,
"logps/chosen": -311.03680419921875,
"logps/rejected": -303.3858947753906,
"loss": 0.4983,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1631245613098145,
"rewards/margins": 1.1259291172027588,
"rewards/rejected": -2.289053440093994,
"step": 490
},
{
"epoch": 0.12954723894268516,
"grad_norm": 9.202119827270508,
"learning_rate": 1.741428945302277e-05,
"logits/chosen": -2.9796509742736816,
"logits/rejected": -3.069392204284668,
"logps/chosen": -335.2552185058594,
"logps/rejected": -280.06353759765625,
"loss": 0.6068,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4664899110794067,
"rewards/margins": 0.9758504629135132,
"rewards/rejected": -2.442340135574341,
"step": 495
},
{
"epoch": 0.13085579691180318,
"grad_norm": 12.070145606994629,
"learning_rate": 1.738811829364041e-05,
"logits/chosen": -2.8466262817382812,
"logits/rejected": -2.9542205333709717,
"logps/chosen": -297.8977966308594,
"logps/rejected": -246.33316040039062,
"loss": 0.5466,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8024070262908936,
"rewards/margins": 1.1843513250350952,
"rewards/rejected": -2.9867584705352783,
"step": 500
},
{
"epoch": 0.13216435488092124,
"grad_norm": 5.3597331047058105,
"learning_rate": 1.736194713425805e-05,
"logits/chosen": -2.9541573524475098,
"logits/rejected": -2.9777073860168457,
"logps/chosen": -330.4364013671875,
"logps/rejected": -300.50970458984375,
"loss": 0.4779,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.542964220046997,
"rewards/margins": 1.2471535205841064,
"rewards/rejected": -2.7901177406311035,
"step": 505
},
{
"epoch": 0.13347291285003926,
"grad_norm": 6.6305060386657715,
"learning_rate": 1.7335775974875687e-05,
"logits/chosen": -2.8647067546844482,
"logits/rejected": -2.9916813373565674,
"logps/chosen": -320.9283752441406,
"logps/rejected": -306.7779541015625,
"loss": 0.4591,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.785896897315979,
"rewards/margins": 1.2584455013275146,
"rewards/rejected": -3.044342517852783,
"step": 510
},
{
"epoch": 0.13478147081915728,
"grad_norm": 9.145403861999512,
"learning_rate": 1.730960481549333e-05,
"logits/chosen": -2.910454750061035,
"logits/rejected": -2.956942081451416,
"logps/chosen": -306.08074951171875,
"logps/rejected": -320.6488952636719,
"loss": 0.6325,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.1406688690185547,
"rewards/margins": 0.9226516485214233,
"rewards/rejected": -3.0633203983306885,
"step": 515
},
{
"epoch": 0.1360900287882753,
"grad_norm": 6.541450023651123,
"learning_rate": 1.7283433656110967e-05,
"logits/chosen": -2.9004507064819336,
"logits/rejected": -2.8952460289001465,
"logps/chosen": -241.9619140625,
"logps/rejected": -298.10394287109375,
"loss": 0.5313,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0367915630340576,
"rewards/margins": 1.4055403470993042,
"rewards/rejected": -3.4423320293426514,
"step": 520
},
{
"epoch": 0.13739858675739336,
"grad_norm": 7.66124963760376,
"learning_rate": 1.7257262496728605e-05,
"logits/chosen": -3.003178596496582,
"logits/rejected": -3.0736184120178223,
"logps/chosen": -271.2985534667969,
"logps/rejected": -264.1616516113281,
"loss": 0.6178,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.4967647790908813,
"rewards/margins": 0.7704972624778748,
"rewards/rejected": -2.2672622203826904,
"step": 525
},
{
"epoch": 0.13870714472651138,
"grad_norm": 6.042169570922852,
"learning_rate": 1.7231091337346247e-05,
"logits/chosen": -2.7864420413970947,
"logits/rejected": -2.8688900470733643,
"logps/chosen": -328.5660095214844,
"logps/rejected": -304.4143981933594,
"loss": 0.475,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1899480819702148,
"rewards/margins": 1.6948280334472656,
"rewards/rejected": -2.8847761154174805,
"step": 530
},
{
"epoch": 0.1400157026956294,
"grad_norm": 7.055963039398193,
"learning_rate": 1.7204920177963885e-05,
"logits/chosen": -2.9281864166259766,
"logits/rejected": -3.05684232711792,
"logps/chosen": -321.26922607421875,
"logps/rejected": -301.43731689453125,
"loss": 0.4043,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.954908549785614,
"rewards/margins": 1.4210506677627563,
"rewards/rejected": -2.375959634780884,
"step": 535
},
{
"epoch": 0.14132426066474746,
"grad_norm": 10.064764976501465,
"learning_rate": 1.7178749018581524e-05,
"logits/chosen": -2.8885293006896973,
"logits/rejected": -2.951124668121338,
"logps/chosen": -265.4410400390625,
"logps/rejected": -301.2450866699219,
"loss": 0.5462,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6822071075439453,
"rewards/margins": 1.1935334205627441,
"rewards/rejected": -2.8757405281066895,
"step": 540
},
{
"epoch": 0.14263281863386548,
"grad_norm": 9.2780122756958,
"learning_rate": 1.7152577859199166e-05,
"logits/chosen": -3.063042640686035,
"logits/rejected": -3.0384624004364014,
"logps/chosen": -240.93661499023438,
"logps/rejected": -255.59017944335938,
"loss": 0.5752,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7355148792266846,
"rewards/margins": 1.3015474081039429,
"rewards/rejected": -3.037062406539917,
"step": 545
},
{
"epoch": 0.1439413766029835,
"grad_norm": 9.463279724121094,
"learning_rate": 1.7126406699816804e-05,
"logits/chosen": -2.814814567565918,
"logits/rejected": -2.95234751701355,
"logps/chosen": -288.8094177246094,
"logps/rejected": -302.408447265625,
"loss": 0.6,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7943483591079712,
"rewards/margins": 1.192277193069458,
"rewards/rejected": -2.9866251945495605,
"step": 550
},
{
"epoch": 0.14524993457210156,
"grad_norm": 9.526763916015625,
"learning_rate": 1.7100235540434442e-05,
"logits/chosen": -3.0983662605285645,
"logits/rejected": -3.1188201904296875,
"logps/chosen": -273.67181396484375,
"logps/rejected": -264.828369140625,
"loss": 0.5915,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6144587993621826,
"rewards/margins": 1.0379388332366943,
"rewards/rejected": -2.652397632598877,
"step": 555
},
{
"epoch": 0.14655849254121958,
"grad_norm": 5.127192497253418,
"learning_rate": 1.707406438105208e-05,
"logits/chosen": -2.8516077995300293,
"logits/rejected": -2.908602476119995,
"logps/chosen": -267.1623229980469,
"logps/rejected": -243.82681274414062,
"loss": 0.4718,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.203351616859436,
"rewards/margins": 1.4361515045166016,
"rewards/rejected": -2.639503002166748,
"step": 560
},
{
"epoch": 0.1478670505103376,
"grad_norm": 13.090658187866211,
"learning_rate": 1.7047893221669722e-05,
"logits/chosen": -2.9258511066436768,
"logits/rejected": -2.9709384441375732,
"logps/chosen": -276.4967346191406,
"logps/rejected": -312.01348876953125,
"loss": 0.6656,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.651240587234497,
"rewards/margins": 1.1099951267242432,
"rewards/rejected": -2.7612357139587402,
"step": 565
},
{
"epoch": 0.14917560847945563,
"grad_norm": 7.876342296600342,
"learning_rate": 1.702172206228736e-05,
"logits/chosen": -2.9246954917907715,
"logits/rejected": -2.9212677478790283,
"logps/chosen": -297.02996826171875,
"logps/rejected": -288.9739074707031,
"loss": 0.4067,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2050755023956299,
"rewards/margins": 1.5973553657531738,
"rewards/rejected": -2.8024308681488037,
"step": 570
},
{
"epoch": 0.15048416644857368,
"grad_norm": 7.590128421783447,
"learning_rate": 1.6995550902905e-05,
"logits/chosen": -2.969716787338257,
"logits/rejected": -2.8748910427093506,
"logps/chosen": -319.4613342285156,
"logps/rejected": -333.3774719238281,
"loss": 0.5323,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.351219892501831,
"rewards/margins": 1.1748894453048706,
"rewards/rejected": -2.526109218597412,
"step": 575
},
{
"epoch": 0.1517927244176917,
"grad_norm": 10.58719539642334,
"learning_rate": 1.696937974352264e-05,
"logits/chosen": -3.0049643516540527,
"logits/rejected": -3.038872241973877,
"logps/chosen": -276.7691650390625,
"logps/rejected": -242.3045654296875,
"loss": 0.4836,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9573993682861328,
"rewards/margins": 1.4744302034378052,
"rewards/rejected": -2.4318299293518066,
"step": 580
},
{
"epoch": 0.15310128238680973,
"grad_norm": 5.224278926849365,
"learning_rate": 1.694320858414028e-05,
"logits/chosen": -3.024735689163208,
"logits/rejected": -2.978595733642578,
"logps/chosen": -285.96759033203125,
"logps/rejected": -268.8092041015625,
"loss": 0.5066,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8840298652648926,
"rewards/margins": 1.2104597091674805,
"rewards/rejected": -2.094489574432373,
"step": 585
},
{
"epoch": 0.15440984035592778,
"grad_norm": 5.459282398223877,
"learning_rate": 1.6917037424757918e-05,
"logits/chosen": -2.83048415184021,
"logits/rejected": -2.8999111652374268,
"logps/chosen": -262.3257751464844,
"logps/rejected": -246.4630584716797,
"loss": 0.5232,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9185107946395874,
"rewards/margins": 1.1749193668365479,
"rewards/rejected": -2.0934300422668457,
"step": 590
},
{
"epoch": 0.1557183983250458,
"grad_norm": 10.723037719726562,
"learning_rate": 1.689086626537556e-05,
"logits/chosen": -2.9317269325256348,
"logits/rejected": -2.9059762954711914,
"logps/chosen": -254.2577667236328,
"logps/rejected": -269.9380798339844,
"loss": 0.4659,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6864464282989502,
"rewards/margins": 1.4080874919891357,
"rewards/rejected": -2.094534158706665,
"step": 595
},
{
"epoch": 0.15702695629416383,
"grad_norm": 4.004018306732178,
"learning_rate": 1.6864695105993198e-05,
"logits/chosen": -2.922083616256714,
"logits/rejected": -3.023949146270752,
"logps/chosen": -296.1883544921875,
"logps/rejected": -307.1870422363281,
"loss": 0.544,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.974421501159668,
"rewards/margins": 1.086411714553833,
"rewards/rejected": -2.06083345413208,
"step": 600
},
{
"epoch": 0.15833551426328185,
"grad_norm": 6.0055623054504395,
"learning_rate": 1.6838523946610836e-05,
"logits/chosen": -2.6909618377685547,
"logits/rejected": -2.8049652576446533,
"logps/chosen": -254.7784881591797,
"logps/rejected": -252.23828125,
"loss": 0.6389,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2264916896820068,
"rewards/margins": 1.0717966556549072,
"rewards/rejected": -2.298288106918335,
"step": 605
},
{
"epoch": 0.1596440722323999,
"grad_norm": 5.655783176422119,
"learning_rate": 1.6812352787228478e-05,
"logits/chosen": -2.911292552947998,
"logits/rejected": -2.9565579891204834,
"logps/chosen": -319.2688293457031,
"logps/rejected": -331.54315185546875,
"loss": 0.6664,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3095886707305908,
"rewards/margins": 1.2178889513015747,
"rewards/rejected": -2.527477502822876,
"step": 610
},
{
"epoch": 0.16095263020151793,
"grad_norm": 9.346418380737305,
"learning_rate": 1.6786181627846113e-05,
"logits/chosen": -2.83856463432312,
"logits/rejected": -2.893974781036377,
"logps/chosen": -323.4009704589844,
"logps/rejected": -296.84783935546875,
"loss": 0.468,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.903913140296936,
"rewards/margins": 1.8124818801879883,
"rewards/rejected": -2.7163949012756348,
"step": 615
},
{
"epoch": 0.16226118817063595,
"grad_norm": 7.140948295593262,
"learning_rate": 1.6760010468463755e-05,
"logits/chosen": -3.0348329544067383,
"logits/rejected": -2.941676378250122,
"logps/chosen": -276.62664794921875,
"logps/rejected": -271.2499084472656,
"loss": 0.4796,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1876803636550903,
"rewards/margins": 1.2585867643356323,
"rewards/rejected": -2.4462671279907227,
"step": 620
},
{
"epoch": 0.163569746139754,
"grad_norm": 5.661865711212158,
"learning_rate": 1.6733839309081393e-05,
"logits/chosen": -2.9504222869873047,
"logits/rejected": -3.029871940612793,
"logps/chosen": -366.68597412109375,
"logps/rejected": -306.84674072265625,
"loss": 0.4189,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0696208477020264,
"rewards/margins": 1.6653257608413696,
"rewards/rejected": -2.7349467277526855,
"step": 625
},
{
"epoch": 0.16487830410887203,
"grad_norm": 11.919709205627441,
"learning_rate": 1.670766814969903e-05,
"logits/chosen": -2.9389662742614746,
"logits/rejected": -2.9783272743225098,
"logps/chosen": -321.91021728515625,
"logps/rejected": -358.46343994140625,
"loss": 0.4938,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0782363414764404,
"rewards/margins": 1.602421522140503,
"rewards/rejected": -2.6806578636169434,
"step": 630
},
{
"epoch": 0.16618686207799005,
"grad_norm": 6.807126998901367,
"learning_rate": 1.6681496990316673e-05,
"logits/chosen": -2.7277305126190186,
"logits/rejected": -2.8635544776916504,
"logps/chosen": -276.2630615234375,
"logps/rejected": -287.2210388183594,
"loss": 0.5649,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.179365634918213,
"rewards/margins": 1.416500449180603,
"rewards/rejected": -2.5958662033081055,
"step": 635
},
{
"epoch": 0.16749542004710807,
"grad_norm": 5.412283897399902,
"learning_rate": 1.665532583093431e-05,
"logits/chosen": -2.9597675800323486,
"logits/rejected": -2.948847532272339,
"logps/chosen": -269.4257507324219,
"logps/rejected": -257.947021484375,
"loss": 0.4222,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7173232436180115,
"rewards/margins": 1.5117555856704712,
"rewards/rejected": -2.229078769683838,
"step": 640
},
{
"epoch": 0.16880397801622612,
"grad_norm": 8.06993293762207,
"learning_rate": 1.662915467155195e-05,
"logits/chosen": -2.844407558441162,
"logits/rejected": -2.8795688152313232,
"logps/chosen": -237.12722778320312,
"logps/rejected": -251.5007781982422,
"loss": 0.4479,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.5313388705253601,
"rewards/margins": 1.5623525381088257,
"rewards/rejected": -2.09369158744812,
"step": 645
},
{
"epoch": 0.17011253598534415,
"grad_norm": 8.971646308898926,
"learning_rate": 1.660298351216959e-05,
"logits/chosen": -2.9815077781677246,
"logits/rejected": -2.9581987857818604,
"logps/chosen": -240.3534698486328,
"logps/rejected": -255.795166015625,
"loss": 0.6226,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.010803407058119774,
"rewards/margins": 1.1583298444747925,
"rewards/rejected": -1.147526502609253,
"step": 650
},
{
"epoch": 0.17142109395446217,
"grad_norm": 8.448005676269531,
"learning_rate": 1.657681235278723e-05,
"logits/chosen": -2.9446420669555664,
"logits/rejected": -2.990194797515869,
"logps/chosen": -240.1005859375,
"logps/rejected": -285.579345703125,
"loss": 0.501,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.36709222197532654,
"rewards/margins": 1.4954938888549805,
"rewards/rejected": -1.1284016370773315,
"step": 655
},
{
"epoch": 0.17272965192358022,
"grad_norm": 7.867677688598633,
"learning_rate": 1.655064119340487e-05,
"logits/chosen": -2.9015254974365234,
"logits/rejected": -2.964930772781372,
"logps/chosen": -318.57244873046875,
"logps/rejected": -296.36236572265625,
"loss": 0.6188,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.12054802477359772,
"rewards/margins": 1.1496269702911377,
"rewards/rejected": -1.2701750993728638,
"step": 660
},
{
"epoch": 0.17403820989269825,
"grad_norm": 6.955239772796631,
"learning_rate": 1.6524470034022507e-05,
"logits/chosen": -2.933335781097412,
"logits/rejected": -2.9919040203094482,
"logps/chosen": -257.498779296875,
"logps/rejected": -255.42098999023438,
"loss": 0.4597,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10350354015827179,
"rewards/margins": 1.4865562915802002,
"rewards/rejected": -1.5900598764419556,
"step": 665
},
{
"epoch": 0.17534676786181627,
"grad_norm": 4.296855449676514,
"learning_rate": 1.649829887464015e-05,
"logits/chosen": -2.9155821800231934,
"logits/rejected": -2.9966659545898438,
"logps/chosen": -281.1880187988281,
"logps/rejected": -312.45404052734375,
"loss": 0.5136,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.11840929836034775,
"rewards/margins": 1.6070137023925781,
"rewards/rejected": -1.7254230976104736,
"step": 670
},
{
"epoch": 0.17665532583093432,
"grad_norm": 9.262114524841309,
"learning_rate": 1.6472127715257787e-05,
"logits/chosen": -2.967158317565918,
"logits/rejected": -2.952332019805908,
"logps/chosen": -253.4345245361328,
"logps/rejected": -320.2381591796875,
"loss": 0.5256,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6660485863685608,
"rewards/margins": 1.4134299755096436,
"rewards/rejected": -2.0794787406921387,
"step": 675
},
{
"epoch": 0.17796388380005235,
"grad_norm": 10.10685920715332,
"learning_rate": 1.6445956555875425e-05,
"logits/chosen": -2.8754544258117676,
"logits/rejected": -2.988997459411621,
"logps/chosen": -271.6341552734375,
"logps/rejected": -260.13323974609375,
"loss": 0.5957,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0736002922058105,
"rewards/margins": 1.3498318195343018,
"rewards/rejected": -2.4234321117401123,
"step": 680
},
{
"epoch": 0.17927244176917037,
"grad_norm": 8.677872657775879,
"learning_rate": 1.6419785396493067e-05,
"logits/chosen": -2.8993847370147705,
"logits/rejected": -2.964925527572632,
"logps/chosen": -289.35986328125,
"logps/rejected": -288.6343078613281,
"loss": 0.6984,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4944241046905518,
"rewards/margins": 1.0044472217559814,
"rewards/rejected": -2.498871326446533,
"step": 685
},
{
"epoch": 0.1805809997382884,
"grad_norm": 8.92746353149414,
"learning_rate": 1.6393614237110705e-05,
"logits/chosen": -2.9617457389831543,
"logits/rejected": -2.9055941104888916,
"logps/chosen": -218.28567504882812,
"logps/rejected": -230.8361053466797,
"loss": 0.6033,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.201545000076294,
"rewards/margins": 1.0764273405075073,
"rewards/rejected": -2.277972459793091,
"step": 690
},
{
"epoch": 0.18188955770740645,
"grad_norm": 8.533876419067383,
"learning_rate": 1.6367443077728344e-05,
"logits/chosen": -2.8661141395568848,
"logits/rejected": -3.0062363147735596,
"logps/chosen": -310.6184997558594,
"logps/rejected": -263.51409912109375,
"loss": 0.525,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0192677974700928,
"rewards/margins": 1.2579354047775269,
"rewards/rejected": -2.277203321456909,
"step": 695
},
{
"epoch": 0.18319811567652447,
"grad_norm": 8.52597713470459,
"learning_rate": 1.6341271918345986e-05,
"logits/chosen": -2.9457011222839355,
"logits/rejected": -2.949756145477295,
"logps/chosen": -312.6112060546875,
"logps/rejected": -284.3760986328125,
"loss": 0.6767,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9324936866760254,
"rewards/margins": 0.9121745824813843,
"rewards/rejected": -1.8446681499481201,
"step": 700
},
{
"epoch": 0.1845066736456425,
"grad_norm": 10.356490135192871,
"learning_rate": 1.6315100758963624e-05,
"logits/chosen": -2.836061477661133,
"logits/rejected": -2.839916229248047,
"logps/chosen": -278.6529235839844,
"logps/rejected": -308.8656921386719,
"loss": 0.553,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8688641786575317,
"rewards/margins": 1.2454078197479248,
"rewards/rejected": -2.114271879196167,
"step": 705
},
{
"epoch": 0.18581523161476055,
"grad_norm": 7.835634231567383,
"learning_rate": 1.6288929599581262e-05,
"logits/chosen": -2.892359495162964,
"logits/rejected": -2.923079490661621,
"logps/chosen": -360.86151123046875,
"logps/rejected": -314.2693786621094,
"loss": 0.5471,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9443033337593079,
"rewards/margins": 1.0037552118301392,
"rewards/rejected": -1.9480584859848022,
"step": 710
},
{
"epoch": 0.18712378958387857,
"grad_norm": 12.043888092041016,
"learning_rate": 1.6262758440198904e-05,
"logits/chosen": -2.9467732906341553,
"logits/rejected": -2.9871439933776855,
"logps/chosen": -285.0309753417969,
"logps/rejected": -277.4990539550781,
"loss": 0.6576,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7473507523536682,
"rewards/margins": 0.9092914462089539,
"rewards/rejected": -1.6566423177719116,
"step": 715
},
{
"epoch": 0.1884323475529966,
"grad_norm": 7.492431163787842,
"learning_rate": 1.6236587280816542e-05,
"logits/chosen": -2.9607863426208496,
"logits/rejected": -2.986611843109131,
"logps/chosen": -304.36065673828125,
"logps/rejected": -293.6368103027344,
"loss": 0.524,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8549752235412598,
"rewards/margins": 1.1655179262161255,
"rewards/rejected": -2.0204930305480957,
"step": 720
},
{
"epoch": 0.18974090552211462,
"grad_norm": 7.2737274169921875,
"learning_rate": 1.621041612143418e-05,
"logits/chosen": -2.8971705436706543,
"logits/rejected": -2.9472594261169434,
"logps/chosen": -296.6607971191406,
"logps/rejected": -282.5213928222656,
"loss": 0.4271,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.9705104827880859,
"rewards/margins": 1.591907262802124,
"rewards/rejected": -2.56241774559021,
"step": 725
},
{
"epoch": 0.19104946349123267,
"grad_norm": 8.259176254272461,
"learning_rate": 1.618424496205182e-05,
"logits/chosen": -3.0036911964416504,
"logits/rejected": -3.027312755584717,
"logps/chosen": -242.5350799560547,
"logps/rejected": -247.4747772216797,
"loss": 0.4891,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1831446886062622,
"rewards/margins": 1.270116925239563,
"rewards/rejected": -2.453261613845825,
"step": 730
},
{
"epoch": 0.1923580214603507,
"grad_norm": 6.371599197387695,
"learning_rate": 1.615807380266946e-05,
"logits/chosen": -3.0131289958953857,
"logits/rejected": -3.0459389686584473,
"logps/chosen": -289.00909423828125,
"logps/rejected": -269.96038818359375,
"loss": 0.6098,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.491188883781433,
"rewards/margins": 1.4899593591690063,
"rewards/rejected": -2.9811482429504395,
"step": 735
},
{
"epoch": 0.19366657942946872,
"grad_norm": 8.84294605255127,
"learning_rate": 1.61319026432871e-05,
"logits/chosen": -2.978916645050049,
"logits/rejected": -3.041909694671631,
"logps/chosen": -317.71484375,
"logps/rejected": -323.35638427734375,
"loss": 0.5665,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6817235946655273,
"rewards/margins": 1.4670751094818115,
"rewards/rejected": -3.148798942565918,
"step": 740
},
{
"epoch": 0.19497513739858677,
"grad_norm": 8.727971076965332,
"learning_rate": 1.6105731483904738e-05,
"logits/chosen": -2.951904773712158,
"logits/rejected": -2.9564967155456543,
"logps/chosen": -327.0289611816406,
"logps/rejected": -340.25408935546875,
"loss": 0.4344,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.040027141571045,
"rewards/margins": 1.7067238092422485,
"rewards/rejected": -3.746751070022583,
"step": 745
},
{
"epoch": 0.1962836953677048,
"grad_norm": 7.898123264312744,
"learning_rate": 1.607956032452238e-05,
"logits/chosen": -2.6445517539978027,
"logits/rejected": -2.82399845123291,
"logps/chosen": -311.4280090332031,
"logps/rejected": -291.5944519042969,
"loss": 0.5745,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.266785144805908,
"rewards/margins": 1.2938086986541748,
"rewards/rejected": -3.560593843460083,
"step": 750
},
{
"epoch": 0.19759225333682282,
"grad_norm": 10.65861701965332,
"learning_rate": 1.6053389165140018e-05,
"logits/chosen": -2.853435516357422,
"logits/rejected": -2.9262051582336426,
"logps/chosen": -313.5797119140625,
"logps/rejected": -305.50750732421875,
"loss": 0.6388,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.798832893371582,
"rewards/margins": 1.235661268234253,
"rewards/rejected": -3.034493923187256,
"step": 755
},
{
"epoch": 0.19890081130594087,
"grad_norm": 6.924118518829346,
"learning_rate": 1.6027218005757656e-05,
"logits/chosen": -2.8775856494903564,
"logits/rejected": -2.9751439094543457,
"logps/chosen": -257.1426086425781,
"logps/rejected": -287.93536376953125,
"loss": 0.6444,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.176374912261963,
"rewards/margins": 1.34031081199646,
"rewards/rejected": -3.516685962677002,
"step": 760
},
{
"epoch": 0.2002093692750589,
"grad_norm": 7.09359073638916,
"learning_rate": 1.6001046846375298e-05,
"logits/chosen": -2.904832363128662,
"logits/rejected": -2.8512656688690186,
"logps/chosen": -313.9578552246094,
"logps/rejected": -327.7096862792969,
"loss": 0.5336,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0932674407958984,
"rewards/margins": 1.5766878128051758,
"rewards/rejected": -3.669955015182495,
"step": 765
},
{
"epoch": 0.20151792724417691,
"grad_norm": 10.785176277160645,
"learning_rate": 1.5974875686992933e-05,
"logits/chosen": -2.8301949501037598,
"logits/rejected": -2.922959566116333,
"logps/chosen": -298.0000915527344,
"logps/rejected": -271.7386474609375,
"loss": 0.6569,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.130873441696167,
"rewards/margins": 1.5558631420135498,
"rewards/rejected": -3.686736583709717,
"step": 770
},
{
"epoch": 0.20282648521329494,
"grad_norm": 6.668851375579834,
"learning_rate": 1.5948704527610575e-05,
"logits/chosen": -2.9211020469665527,
"logits/rejected": -3.0021309852600098,
"logps/chosen": -292.0265197753906,
"logps/rejected": -315.9195251464844,
"loss": 0.4928,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.8320987224578857,
"rewards/margins": 1.6694583892822266,
"rewards/rejected": -3.5015571117401123,
"step": 775
},
{
"epoch": 0.204135043182413,
"grad_norm": 5.2693586349487305,
"learning_rate": 1.5922533368228213e-05,
"logits/chosen": -2.7756457328796387,
"logits/rejected": -2.890991687774658,
"logps/chosen": -311.8628845214844,
"logps/rejected": -321.1475524902344,
"loss": 0.4051,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8370088338851929,
"rewards/margins": 1.7155002355575562,
"rewards/rejected": -3.552509307861328,
"step": 780
},
{
"epoch": 0.205443601151531,
"grad_norm": 3.826359987258911,
"learning_rate": 1.589636220884585e-05,
"logits/chosen": -2.9222168922424316,
"logits/rejected": -3.0205399990081787,
"logps/chosen": -267.26727294921875,
"logps/rejected": -281.6890869140625,
"loss": 0.511,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8738367557525635,
"rewards/margins": 1.5201383829116821,
"rewards/rejected": -3.393975019454956,
"step": 785
},
{
"epoch": 0.20675215912064904,
"grad_norm": 5.202337741851807,
"learning_rate": 1.5870191049463493e-05,
"logits/chosen": -2.9837005138397217,
"logits/rejected": -2.9869465827941895,
"logps/chosen": -303.33209228515625,
"logps/rejected": -274.7121276855469,
"loss": 0.485,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9972702264785767,
"rewards/margins": 1.458465337753296,
"rewards/rejected": -3.455735683441162,
"step": 790
},
{
"epoch": 0.2080607170897671,
"grad_norm": 7.0012006759643555,
"learning_rate": 1.584401989008113e-05,
"logits/chosen": -2.9645447731018066,
"logits/rejected": -3.017319917678833,
"logps/chosen": -259.61328125,
"logps/rejected": -239.63583374023438,
"loss": 0.6769,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -2.050851345062256,
"rewards/margins": 0.9884527325630188,
"rewards/rejected": -3.039304256439209,
"step": 795
},
{
"epoch": 0.2093692750588851,
"grad_norm": 5.7104997634887695,
"learning_rate": 1.581784873069877e-05,
"logits/chosen": -2.81542706489563,
"logits/rejected": -2.957751750946045,
"logps/chosen": -299.0074462890625,
"logps/rejected": -303.9309997558594,
"loss": 0.4352,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7871536016464233,
"rewards/margins": 1.4007420539855957,
"rewards/rejected": -3.1878955364227295,
"step": 800
},
{
"epoch": 0.2093692750588851,
"eval_logits/chosen": -2.9802186489105225,
"eval_logits/rejected": -2.9955716133117676,
"eval_logps/chosen": -301.6160888671875,
"eval_logps/rejected": -295.63427734375,
"eval_loss": 0.5342944860458374,
"eval_rewards/accuracies": 0.7335000038146973,
"eval_rewards/chosen": -1.8382680416107178,
"eval_rewards/margins": 1.3316991329193115,
"eval_rewards/rejected": -3.16996693611145,
"eval_runtime": 762.4379,
"eval_samples_per_second": 2.623,
"eval_steps_per_second": 0.328,
"step": 800
},
{
"epoch": 0.21067783302800314,
"grad_norm": 11.304238319396973,
"learning_rate": 1.579167757131641e-05,
"logits/chosen": -2.933737277984619,
"logits/rejected": -3.042451858520508,
"logps/chosen": -332.7558898925781,
"logps/rejected": -288.9327087402344,
"loss": 0.5466,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.800733208656311,
"rewards/margins": 1.2069900035858154,
"rewards/rejected": -3.007722854614258,
"step": 805
},
{
"epoch": 0.21198639099712116,
"grad_norm": 9.210540771484375,
"learning_rate": 1.576550641193405e-05,
"logits/chosen": -2.868955135345459,
"logits/rejected": -2.874704122543335,
"logps/chosen": -280.4917907714844,
"logps/rejected": -300.5773010253906,
"loss": 0.4237,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1213161945343018,
"rewards/margins": 1.7197173833847046,
"rewards/rejected": -3.841033458709717,
"step": 810
},
{
"epoch": 0.2132949489662392,
"grad_norm": 8.686461448669434,
"learning_rate": 1.573933525255169e-05,
"logits/chosen": -2.9677047729492188,
"logits/rejected": -2.894029378890991,
"logps/chosen": -287.06121826171875,
"logps/rejected": -304.01068115234375,
"loss": 0.4436,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7622085809707642,
"rewards/margins": 1.928234338760376,
"rewards/rejected": -3.6904425621032715,
"step": 815
},
{
"epoch": 0.21460350693535724,
"grad_norm": 14.101777076721191,
"learning_rate": 1.571316409316933e-05,
"logits/chosen": -2.800630807876587,
"logits/rejected": -2.8711049556732178,
"logps/chosen": -312.0852966308594,
"logps/rejected": -299.11114501953125,
"loss": 0.5871,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8398411273956299,
"rewards/margins": 1.5045777559280396,
"rewards/rejected": -3.34441876411438,
"step": 820
},
{
"epoch": 0.21591206490447526,
"grad_norm": 7.489319801330566,
"learning_rate": 1.568699293378697e-05,
"logits/chosen": -3.0238049030303955,
"logits/rejected": -3.048868417739868,
"logps/chosen": -312.80792236328125,
"logps/rejected": -281.77056884765625,
"loss": 0.5911,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.506508469581604,
"rewards/margins": 1.415809988975525,
"rewards/rejected": -2.922318458557129,
"step": 825
},
{
"epoch": 0.2172206228735933,
"grad_norm": 6.227260112762451,
"learning_rate": 1.5660821774404607e-05,
"logits/chosen": -3.0102522373199463,
"logits/rejected": -3.1055586338043213,
"logps/chosen": -323.2419128417969,
"logps/rejected": -277.99188232421875,
"loss": 0.5915,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2429955005645752,
"rewards/margins": 1.4162633419036865,
"rewards/rejected": -2.6592588424682617,
"step": 830
},
{
"epoch": 0.21852918084271133,
"grad_norm": 7.631740093231201,
"learning_rate": 1.5634650615022245e-05,
"logits/chosen": -2.921370029449463,
"logits/rejected": -2.9625651836395264,
"logps/chosen": -267.8898010253906,
"logps/rejected": -277.86322021484375,
"loss": 0.5891,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3811520338058472,
"rewards/margins": 1.4056289196014404,
"rewards/rejected": -2.786780834197998,
"step": 835
},
{
"epoch": 0.21983773881182936,
"grad_norm": 8.195059776306152,
"learning_rate": 1.5608479455639887e-05,
"logits/chosen": -2.9599719047546387,
"logits/rejected": -2.9905598163604736,
"logps/chosen": -273.840576171875,
"logps/rejected": -254.96310424804688,
"loss": 0.5733,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.299445390701294,
"rewards/margins": 1.0443214178085327,
"rewards/rejected": -2.343766927719116,
"step": 840
},
{
"epoch": 0.22114629678094738,
"grad_norm": 10.264986038208008,
"learning_rate": 1.5582308296257525e-05,
"logits/chosen": -2.9382224082946777,
"logits/rejected": -2.994401454925537,
"logps/chosen": -305.3935546875,
"logps/rejected": -325.06201171875,
"loss": 0.6201,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1238162517547607,
"rewards/margins": 0.8402196764945984,
"rewards/rejected": -1.964035987854004,
"step": 845
},
{
"epoch": 0.22245485475006543,
"grad_norm": 4.49017858505249,
"learning_rate": 1.5556137136875164e-05,
"logits/chosen": -2.9530601501464844,
"logits/rejected": -2.9725661277770996,
"logps/chosen": -247.1348114013672,
"logps/rejected": -260.9407653808594,
"loss": 0.5108,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8431808352470398,
"rewards/margins": 1.3323025703430176,
"rewards/rejected": -2.1754837036132812,
"step": 850
},
{
"epoch": 0.22376341271918346,
"grad_norm": 8.495135307312012,
"learning_rate": 1.5529965977492806e-05,
"logits/chosen": -3.0015053749084473,
"logits/rejected": -2.9371116161346436,
"logps/chosen": -293.3446350097656,
"logps/rejected": -295.124755859375,
"loss": 0.5258,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7353643774986267,
"rewards/margins": 0.9804586172103882,
"rewards/rejected": -1.7158231735229492,
"step": 855
},
{
"epoch": 0.22507197068830148,
"grad_norm": 6.842737674713135,
"learning_rate": 1.5503794818110444e-05,
"logits/chosen": -2.9119091033935547,
"logits/rejected": -2.943918228149414,
"logps/chosen": -296.51153564453125,
"logps/rejected": -366.8010559082031,
"loss": 0.5271,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.7795664668083191,
"rewards/margins": 1.3591724634170532,
"rewards/rejected": -2.1387391090393066,
"step": 860
},
{
"epoch": 0.22638052865741953,
"grad_norm": 7.191964149475098,
"learning_rate": 1.5477623658728082e-05,
"logits/chosen": -2.9336752891540527,
"logits/rejected": -3.028198719024658,
"logps/chosen": -345.4011535644531,
"logps/rejected": -314.87322998046875,
"loss": 0.4365,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0388554334640503,
"rewards/margins": 1.4215975999832153,
"rewards/rejected": -2.4604530334472656,
"step": 865
},
{
"epoch": 0.22768908662653756,
"grad_norm": 6.653241157531738,
"learning_rate": 1.5451452499345724e-05,
"logits/chosen": -2.873534917831421,
"logits/rejected": -2.9371185302734375,
"logps/chosen": -257.2475891113281,
"logps/rejected": -265.35906982421875,
"loss": 0.5493,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2732057571411133,
"rewards/margins": 1.3971422910690308,
"rewards/rejected": -2.6703481674194336,
"step": 870
},
{
"epoch": 0.22899764459565558,
"grad_norm": 7.598719120025635,
"learning_rate": 1.5425281339963362e-05,
"logits/chosen": -2.9009196758270264,
"logits/rejected": -3.0005578994750977,
"logps/chosen": -316.24188232421875,
"logps/rejected": -311.5219421386719,
"loss": 0.4582,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5340157747268677,
"rewards/margins": 1.5491135120391846,
"rewards/rejected": -3.083129405975342,
"step": 875
},
{
"epoch": 0.23030620256477363,
"grad_norm": 5.385990142822266,
"learning_rate": 1.5399110180581e-05,
"logits/chosen": -2.87817120552063,
"logits/rejected": -2.9192519187927246,
"logps/chosen": -280.2408752441406,
"logps/rejected": -259.8953857421875,
"loss": 0.2971,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.4804073572158813,
"rewards/margins": 2.1618080139160156,
"rewards/rejected": -3.6422152519226074,
"step": 880
},
{
"epoch": 0.23161476053389166,
"grad_norm": 9.287992477416992,
"learning_rate": 1.537293902119864e-05,
"logits/chosen": -3.0223453044891357,
"logits/rejected": -3.060854434967041,
"logps/chosen": -325.62457275390625,
"logps/rejected": -286.83599853515625,
"loss": 0.4514,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.2279030084609985,
"rewards/margins": 1.9207531213760376,
"rewards/rejected": -3.1486563682556152,
"step": 885
},
{
"epoch": 0.23292331850300968,
"grad_norm": 6.760774612426758,
"learning_rate": 1.534676786181628e-05,
"logits/chosen": -2.885204553604126,
"logits/rejected": -2.9592041969299316,
"logps/chosen": -306.09747314453125,
"logps/rejected": -291.3219299316406,
"loss": 0.5089,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0230804681777954,
"rewards/margins": 1.92485773563385,
"rewards/rejected": -2.9479384422302246,
"step": 890
},
{
"epoch": 0.2342318764721277,
"grad_norm": 6.3509521484375,
"learning_rate": 1.532059670243392e-05,
"logits/chosen": -3.0189361572265625,
"logits/rejected": -3.0262978076934814,
"logps/chosen": -272.2768859863281,
"logps/rejected": -317.0359802246094,
"loss": 0.5246,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8292263746261597,
"rewards/margins": 1.4462980031967163,
"rewards/rejected": -2.275524377822876,
"step": 895
},
{
"epoch": 0.23554043444124576,
"grad_norm": 7.06498908996582,
"learning_rate": 1.5294425543051558e-05,
"logits/chosen": -2.9776549339294434,
"logits/rejected": -3.023850202560425,
"logps/chosen": -232.8544158935547,
"logps/rejected": -277.17779541015625,
"loss": 0.3988,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6818987727165222,
"rewards/margins": 2.0731732845306396,
"rewards/rejected": -2.7550721168518066,
"step": 900
},
{
"epoch": 0.23684899241036378,
"grad_norm": 10.507072448730469,
"learning_rate": 1.52682543836692e-05,
"logits/chosen": -2.891435384750366,
"logits/rejected": -2.8695714473724365,
"logps/chosen": -305.70037841796875,
"logps/rejected": -349.24957275390625,
"loss": 0.4944,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0776326656341553,
"rewards/margins": 1.8689689636230469,
"rewards/rejected": -2.9466018676757812,
"step": 905
},
{
"epoch": 0.2381575503794818,
"grad_norm": 6.330070495605469,
"learning_rate": 1.5242083224286836e-05,
"logits/chosen": -3.0263266563415527,
"logits/rejected": -3.0248868465423584,
"logps/chosen": -348.4727478027344,
"logps/rejected": -341.72454833984375,
"loss": 0.5856,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3683652877807617,
"rewards/margins": 1.5177398920059204,
"rewards/rejected": -2.8861050605773926,
"step": 910
},
{
"epoch": 0.23946610834859985,
"grad_norm": 10.05215072631836,
"learning_rate": 1.5215912064904476e-05,
"logits/chosen": -3.062643051147461,
"logits/rejected": -3.0395474433898926,
"logps/chosen": -288.37176513671875,
"logps/rejected": -341.9781188964844,
"loss": 0.5938,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2461017370224,
"rewards/margins": 1.4891635179519653,
"rewards/rejected": -2.7352652549743652,
"step": 915
},
{
"epoch": 0.24077466631771788,
"grad_norm": 5.560635566711426,
"learning_rate": 1.5189740905522116e-05,
"logits/chosen": -2.876089096069336,
"logits/rejected": -2.888617753982544,
"logps/chosen": -309.76898193359375,
"logps/rejected": -306.857421875,
"loss": 0.5375,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8361619114875793,
"rewards/margins": 1.5215009450912476,
"rewards/rejected": -2.3576629161834717,
"step": 920
},
{
"epoch": 0.2420832242868359,
"grad_norm": 6.002974510192871,
"learning_rate": 1.5163569746139755e-05,
"logits/chosen": -2.9480457305908203,
"logits/rejected": -2.96376895904541,
"logps/chosen": -240.8140106201172,
"logps/rejected": -266.04876708984375,
"loss": 0.4586,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0198220014572144,
"rewards/margins": 1.3288692235946655,
"rewards/rejected": -2.348691463470459,
"step": 925
},
{
"epoch": 0.24339178225595393,
"grad_norm": 9.024993896484375,
"learning_rate": 1.5137398586757395e-05,
"logits/chosen": -2.9898533821105957,
"logits/rejected": -3.0677971839904785,
"logps/chosen": -277.70867919921875,
"logps/rejected": -248.4989776611328,
"loss": 0.5183,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9063308835029602,
"rewards/margins": 1.4718434810638428,
"rewards/rejected": -2.3781745433807373,
"step": 930
},
{
"epoch": 0.24470034022507198,
"grad_norm": 7.851377010345459,
"learning_rate": 1.5111227427375035e-05,
"logits/chosen": -3.0159552097320557,
"logits/rejected": -3.011725902557373,
"logps/chosen": -317.4552001953125,
"logps/rejected": -295.1143798828125,
"loss": 0.5266,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4428759813308716,
"rewards/margins": 0.9541980028152466,
"rewards/rejected": -2.397073984146118,
"step": 935
},
{
"epoch": 0.24600889819419,
"grad_norm": 8.614380836486816,
"learning_rate": 1.5085056267992673e-05,
"logits/chosen": -3.000847578048706,
"logits/rejected": -2.930838108062744,
"logps/chosen": -258.88580322265625,
"logps/rejected": -253.4350128173828,
"loss": 0.4261,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.52020263671875,
"rewards/margins": 1.513375997543335,
"rewards/rejected": -3.033578634262085,
"step": 940
},
{
"epoch": 0.24731745616330802,
"grad_norm": 6.095282554626465,
"learning_rate": 1.5058885108610313e-05,
"logits/chosen": -2.8448147773742676,
"logits/rejected": -3.0329713821411133,
"logps/chosen": -328.8890075683594,
"logps/rejected": -273.93780517578125,
"loss": 0.6222,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6622378826141357,
"rewards/margins": 1.036049723625183,
"rewards/rejected": -2.6982877254486084,
"step": 945
},
{
"epoch": 0.24862601413242608,
"grad_norm": 7.413900852203369,
"learning_rate": 1.5032713949227953e-05,
"logits/chosen": -3.015958547592163,
"logits/rejected": -3.03420090675354,
"logps/chosen": -283.6328430175781,
"logps/rejected": -314.6015625,
"loss": 0.5066,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7141430377960205,
"rewards/margins": 1.4117655754089355,
"rewards/rejected": -3.125908613204956,
"step": 950
},
{
"epoch": 0.2499345721015441,
"grad_norm": 9.373759269714355,
"learning_rate": 1.500654278984559e-05,
"logits/chosen": -3.109154224395752,
"logits/rejected": -3.100782871246338,
"logps/chosen": -352.9435729980469,
"logps/rejected": -289.6947326660156,
"loss": 0.662,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0580523014068604,
"rewards/margins": 1.144476294517517,
"rewards/rejected": -3.202528715133667,
"step": 955
},
{
"epoch": 0.2512431300706621,
"grad_norm": 7.236301898956299,
"learning_rate": 1.498037163046323e-05,
"logits/chosen": -3.0416085720062256,
"logits/rejected": -2.942519426345825,
"logps/chosen": -284.47186279296875,
"logps/rejected": -271.6186218261719,
"loss": 0.5027,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4203197956085205,
"rewards/margins": 1.4996702671051025,
"rewards/rejected": -3.919990062713623,
"step": 960
},
{
"epoch": 0.2525516880397802,
"grad_norm": 8.457993507385254,
"learning_rate": 1.495420047108087e-05,
"logits/chosen": -3.023918628692627,
"logits/rejected": -3.0495004653930664,
"logps/chosen": -327.3899841308594,
"logps/rejected": -347.85992431640625,
"loss": 0.4475,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.4540486335754395,
"rewards/margins": 1.992296814918518,
"rewards/rejected": -4.446345329284668,
"step": 965
},
{
"epoch": 0.25386024600889817,
"grad_norm": 4.348357677459717,
"learning_rate": 1.4928029311698508e-05,
"logits/chosen": -3.017470359802246,
"logits/rejected": -3.073237895965576,
"logps/chosen": -384.67791748046875,
"logps/rejected": -363.98028564453125,
"loss": 0.4786,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.603308916091919,
"rewards/margins": 1.511370301246643,
"rewards/rejected": -4.114678859710693,
"step": 970
},
{
"epoch": 0.2551688039780162,
"grad_norm": 8.246343612670898,
"learning_rate": 1.4901858152316149e-05,
"logits/chosen": -3.0291495323181152,
"logits/rejected": -3.1045758724212646,
"logps/chosen": -331.7242736816406,
"logps/rejected": -312.28240966796875,
"loss": 0.3814,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -2.5157318115234375,
"rewards/margins": 1.9870193004608154,
"rewards/rejected": -4.502751350402832,
"step": 975
},
{
"epoch": 0.2564773619471343,
"grad_norm": 5.465985298156738,
"learning_rate": 1.4875686992933789e-05,
"logits/chosen": -2.9358432292938232,
"logits/rejected": -2.9799187183380127,
"logps/chosen": -305.95867919921875,
"logps/rejected": -268.3275146484375,
"loss": 0.5552,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.049027681350708,
"rewards/margins": 1.7989801168441772,
"rewards/rejected": -3.848007917404175,
"step": 980
},
{
"epoch": 0.25778591991625227,
"grad_norm": 10.069610595703125,
"learning_rate": 1.4849515833551427e-05,
"logits/chosen": -2.8917248249053955,
"logits/rejected": -3.050568103790283,
"logps/chosen": -314.1599426269531,
"logps/rejected": -358.89093017578125,
"loss": 0.4803,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7643229961395264,
"rewards/margins": 1.844347596168518,
"rewards/rejected": -3.608670711517334,
"step": 985
},
{
"epoch": 0.2590944778853703,
"grad_norm": 7.225062370300293,
"learning_rate": 1.4823344674169067e-05,
"logits/chosen": -3.099228620529175,
"logits/rejected": -3.1308112144470215,
"logps/chosen": -294.34185791015625,
"logps/rejected": -274.89483642578125,
"loss": 0.687,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4007724523544312,
"rewards/margins": 0.9705973863601685,
"rewards/rejected": -2.3713696002960205,
"step": 990
},
{
"epoch": 0.2604030358544884,
"grad_norm": 4.582951068878174,
"learning_rate": 1.4797173514786707e-05,
"logits/chosen": -2.9205706119537354,
"logits/rejected": -2.8099303245544434,
"logps/chosen": -232.9236297607422,
"logps/rejected": -277.83404541015625,
"loss": 0.5797,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.636836051940918,
"rewards/margins": 0.9393714666366577,
"rewards/rejected": -2.5762076377868652,
"step": 995
},
{
"epoch": 0.26171159382360637,
"grad_norm": 7.905261516571045,
"learning_rate": 1.4771002355404345e-05,
"logits/chosen": -3.0076744556427,
"logits/rejected": -3.063244581222534,
"logps/chosen": -247.02780151367188,
"logps/rejected": -313.2117919921875,
"loss": 0.4223,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.349962830543518,
"rewards/margins": 1.5588710308074951,
"rewards/rejected": -2.9088339805603027,
"step": 1000
},
{
"epoch": 0.2630201517927244,
"grad_norm": 6.2216644287109375,
"learning_rate": 1.4744831196021986e-05,
"logits/chosen": -2.932487964630127,
"logits/rejected": -3.0266571044921875,
"logps/chosen": -322.1878356933594,
"logps/rejected": -319.5711975097656,
"loss": 0.5979,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4996315240859985,
"rewards/margins": 1.2307758331298828,
"rewards/rejected": -2.730407238006592,
"step": 1005
},
{
"epoch": 0.2643287097618425,
"grad_norm": 9.049347877502441,
"learning_rate": 1.4718660036639626e-05,
"logits/chosen": -3.0737674236297607,
"logits/rejected": -3.073376178741455,
"logps/chosen": -263.8631896972656,
"logps/rejected": -273.49334716796875,
"loss": 0.6037,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.218382477760315,
"rewards/margins": 1.3037409782409668,
"rewards/rejected": -2.5221235752105713,
"step": 1010
},
{
"epoch": 0.26563726773096047,
"grad_norm": 8.119050025939941,
"learning_rate": 1.4692488877257262e-05,
"logits/chosen": -3.072826623916626,
"logits/rejected": -3.0864005088806152,
"logps/chosen": -239.6600341796875,
"logps/rejected": -260.7843933105469,
"loss": 0.4576,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3692705631256104,
"rewards/margins": 1.317340612411499,
"rewards/rejected": -2.6866109371185303,
"step": 1015
},
{
"epoch": 0.2669458257000785,
"grad_norm": 6.507713317871094,
"learning_rate": 1.4666317717874902e-05,
"logits/chosen": -3.034135341644287,
"logits/rejected": -3.121675729751587,
"logps/chosen": -296.17181396484375,
"logps/rejected": -308.36956787109375,
"loss": 0.5734,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3819348812103271,
"rewards/margins": 0.8500627279281616,
"rewards/rejected": -2.2319977283477783,
"step": 1020
},
{
"epoch": 0.26825438366919657,
"grad_norm": 7.960349082946777,
"learning_rate": 1.4640146558492542e-05,
"logits/chosen": -2.9665277004241943,
"logits/rejected": -3.0263547897338867,
"logps/chosen": -286.6892395019531,
"logps/rejected": -301.81182861328125,
"loss": 0.519,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1299912929534912,
"rewards/margins": 1.5969384908676147,
"rewards/rejected": -2.7269299030303955,
"step": 1025
},
{
"epoch": 0.26956294163831457,
"grad_norm": 8.107889175415039,
"learning_rate": 1.461397539911018e-05,
"logits/chosen": -2.971383571624756,
"logits/rejected": -3.0405492782592773,
"logps/chosen": -300.4283752441406,
"logps/rejected": -304.0221252441406,
"loss": 0.485,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.752074122428894,
"rewards/margins": 1.4214041233062744,
"rewards/rejected": -2.173478364944458,
"step": 1030
},
{
"epoch": 0.2708714996074326,
"grad_norm": 4.233426094055176,
"learning_rate": 1.458780423972782e-05,
"logits/chosen": -2.9646596908569336,
"logits/rejected": -2.9252395629882812,
"logps/chosen": -302.5656433105469,
"logps/rejected": -310.8838195800781,
"loss": 0.4718,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.9153076410293579,
"rewards/margins": 1.480229139328003,
"rewards/rejected": -2.3955366611480713,
"step": 1035
},
{
"epoch": 0.2721800575765506,
"grad_norm": 7.866308689117432,
"learning_rate": 1.4561633080345461e-05,
"logits/chosen": -3.0362677574157715,
"logits/rejected": -3.028848886489868,
"logps/chosen": -285.7720947265625,
"logps/rejected": -300.82073974609375,
"loss": 0.5479,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9976667165756226,
"rewards/margins": 1.399741530418396,
"rewards/rejected": -2.3974080085754395,
"step": 1040
},
{
"epoch": 0.27348861554566867,
"grad_norm": 6.74353551864624,
"learning_rate": 1.4535461920963101e-05,
"logits/chosen": -3.0779507160186768,
"logits/rejected": -3.0220227241516113,
"logps/chosen": -300.1296691894531,
"logps/rejected": -300.6394348144531,
"loss": 0.6002,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1509724855422974,
"rewards/margins": 1.479297161102295,
"rewards/rejected": -2.6302695274353027,
"step": 1045
},
{
"epoch": 0.2747971735147867,
"grad_norm": 5.20580530166626,
"learning_rate": 1.450929076158074e-05,
"logits/chosen": -3.0654895305633545,
"logits/rejected": -3.1023030281066895,
"logps/chosen": -315.8076171875,
"logps/rejected": -290.9166564941406,
"loss": 0.4704,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1931313276290894,
"rewards/margins": 1.4464247226715088,
"rewards/rejected": -2.6395561695098877,
"step": 1050
},
{
"epoch": 0.2761057314839047,
"grad_norm": 10.180657386779785,
"learning_rate": 1.448311960219838e-05,
"logits/chosen": -3.0087831020355225,
"logits/rejected": -3.116145372390747,
"logps/chosen": -279.8875732421875,
"logps/rejected": -296.86968994140625,
"loss": 0.6297,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.051395297050476,
"rewards/margins": 1.0588595867156982,
"rewards/rejected": -2.110255002975464,
"step": 1055
},
{
"epoch": 0.27741428945302277,
"grad_norm": 8.839851379394531,
"learning_rate": 1.445694844281602e-05,
"logits/chosen": -3.0243630409240723,
"logits/rejected": -3.0204248428344727,
"logps/chosen": -283.5610046386719,
"logps/rejected": -266.9760437011719,
"loss": 0.5281,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0122487545013428,
"rewards/margins": 1.085057020187378,
"rewards/rejected": -2.0973057746887207,
"step": 1060
},
{
"epoch": 0.2787228474221408,
"grad_norm": 11.794388771057129,
"learning_rate": 1.4430777283433656e-05,
"logits/chosen": -3.078611373901367,
"logits/rejected": -3.032027006149292,
"logps/chosen": -295.5868225097656,
"logps/rejected": -251.0193634033203,
"loss": 0.6133,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.292945384979248,
"rewards/margins": 0.9534968137741089,
"rewards/rejected": -2.2464423179626465,
"step": 1065
},
{
"epoch": 0.2800314053912588,
"grad_norm": 7.8189778327941895,
"learning_rate": 1.4404606124051296e-05,
"logits/chosen": -3.0526416301727295,
"logits/rejected": -2.993549346923828,
"logps/chosen": -308.3221740722656,
"logps/rejected": -313.1771240234375,
"loss": 0.4449,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.091350793838501,
"rewards/margins": 1.4200166463851929,
"rewards/rejected": -2.5113673210144043,
"step": 1070
},
{
"epoch": 0.28133996336037687,
"grad_norm": 5.642181396484375,
"learning_rate": 1.4378434964668936e-05,
"logits/chosen": -2.9637482166290283,
"logits/rejected": -2.9531664848327637,
"logps/chosen": -280.0079040527344,
"logps/rejected": -269.2755432128906,
"loss": 0.5285,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3164184093475342,
"rewards/margins": 1.267393708229065,
"rewards/rejected": -2.5838122367858887,
"step": 1075
},
{
"epoch": 0.2826485213294949,
"grad_norm": 7.877563953399658,
"learning_rate": 1.4352263805286575e-05,
"logits/chosen": -2.9859046936035156,
"logits/rejected": -3.0330402851104736,
"logps/chosen": -278.54620361328125,
"logps/rejected": -315.25311279296875,
"loss": 0.5361,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5659092664718628,
"rewards/margins": 1.165024995803833,
"rewards/rejected": -2.7309341430664062,
"step": 1080
},
{
"epoch": 0.2839570792986129,
"grad_norm": 8.220552444458008,
"learning_rate": 1.4326092645904215e-05,
"logits/chosen": -3.093210458755493,
"logits/rejected": -3.0550553798675537,
"logps/chosen": -265.4081726074219,
"logps/rejected": -305.39404296875,
"loss": 0.5215,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.9443448781967163,
"rewards/margins": 1.2935254573822021,
"rewards/rejected": -3.237870454788208,
"step": 1085
},
{
"epoch": 0.28526563726773096,
"grad_norm": 5.695536136627197,
"learning_rate": 1.4299921486521855e-05,
"logits/chosen": -2.920870065689087,
"logits/rejected": -3.0504040718078613,
"logps/chosen": -290.29327392578125,
"logps/rejected": -273.7704162597656,
"loss": 0.4375,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6533771753311157,
"rewards/margins": 1.6679798364639282,
"rewards/rejected": -3.3213565349578857,
"step": 1090
},
{
"epoch": 0.286574195236849,
"grad_norm": 6.138227939605713,
"learning_rate": 1.4273750327139493e-05,
"logits/chosen": -3.021458148956299,
"logits/rejected": -3.085855484008789,
"logps/chosen": -271.1431884765625,
"logps/rejected": -241.7474365234375,
"loss": 0.4496,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.505995512008667,
"rewards/margins": 1.7411110401153564,
"rewards/rejected": -3.2471065521240234,
"step": 1095
},
{
"epoch": 0.287882753205967,
"grad_norm": 6.326923847198486,
"learning_rate": 1.4247579167757133e-05,
"logits/chosen": -2.997809886932373,
"logits/rejected": -3.044062376022339,
"logps/chosen": -252.74203491210938,
"logps/rejected": -284.060546875,
"loss": 0.4742,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7201992273330688,
"rewards/margins": 1.2570292949676514,
"rewards/rejected": -2.9772286415100098,
"step": 1100
},
{
"epoch": 0.28919131117508506,
"grad_norm": 9.969533920288086,
"learning_rate": 1.4221408008374773e-05,
"logits/chosen": -2.8809401988983154,
"logits/rejected": -3.0844621658325195,
"logps/chosen": -296.0122375488281,
"logps/rejected": -286.38775634765625,
"loss": 0.606,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.8422445058822632,
"rewards/margins": 1.1577017307281494,
"rewards/rejected": -2.999946117401123,
"step": 1105
},
{
"epoch": 0.2904998691442031,
"grad_norm": 6.7198591232299805,
"learning_rate": 1.4195236848992412e-05,
"logits/chosen": -3.018711566925049,
"logits/rejected": -3.08083438873291,
"logps/chosen": -279.4359436035156,
"logps/rejected": -273.54571533203125,
"loss": 0.3774,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.371663212776184,
"rewards/margins": 2.2079691886901855,
"rewards/rejected": -3.579632520675659,
"step": 1110
},
{
"epoch": 0.2918084271133211,
"grad_norm": 12.09412956237793,
"learning_rate": 1.4169065689610052e-05,
"logits/chosen": -3.0689175128936768,
"logits/rejected": -3.059790849685669,
"logps/chosen": -310.78375244140625,
"logps/rejected": -268.98004150390625,
"loss": 0.5401,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4807078838348389,
"rewards/margins": 1.5228500366210938,
"rewards/rejected": -3.0035579204559326,
"step": 1115
},
{
"epoch": 0.29311698508243916,
"grad_norm": 6.893748760223389,
"learning_rate": 1.4142894530227692e-05,
"logits/chosen": -3.032919406890869,
"logits/rejected": -3.0950205326080322,
"logps/chosen": -242.1670379638672,
"logps/rejected": -239.2816619873047,
"loss": 0.5792,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5399396419525146,
"rewards/margins": 1.3476312160491943,
"rewards/rejected": -2.887570858001709,
"step": 1120
},
{
"epoch": 0.29442554305155716,
"grad_norm": 7.757107734680176,
"learning_rate": 1.4116723370845328e-05,
"logits/chosen": -2.9414639472961426,
"logits/rejected": -3.0114121437072754,
"logps/chosen": -257.2752380371094,
"logps/rejected": -283.72088623046875,
"loss": 0.5001,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2555142641067505,
"rewards/margins": 1.4964978694915771,
"rewards/rejected": -2.752012252807617,
"step": 1125
},
{
"epoch": 0.2957341010206752,
"grad_norm": 10.208292961120605,
"learning_rate": 1.4090552211462969e-05,
"logits/chosen": -2.973788261413574,
"logits/rejected": -3.0658910274505615,
"logps/chosen": -373.2972717285156,
"logps/rejected": -342.21929931640625,
"loss": 0.5265,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2128429412841797,
"rewards/margins": 1.3734642267227173,
"rewards/rejected": -2.5863070487976074,
"step": 1130
},
{
"epoch": 0.29704265898979326,
"grad_norm": 9.619853973388672,
"learning_rate": 1.4064381052080609e-05,
"logits/chosen": -2.830082893371582,
"logits/rejected": -2.9827170372009277,
"logps/chosen": -301.67999267578125,
"logps/rejected": -302.10540771484375,
"loss": 0.6362,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3880640268325806,
"rewards/margins": 1.111306071281433,
"rewards/rejected": -2.4993700981140137,
"step": 1135
},
{
"epoch": 0.29835121695891126,
"grad_norm": 7.463226795196533,
"learning_rate": 1.4038209892698247e-05,
"logits/chosen": -3.006840229034424,
"logits/rejected": -3.0112249851226807,
"logps/chosen": -324.7442932128906,
"logps/rejected": -299.2283630371094,
"loss": 0.4186,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5023479461669922,
"rewards/margins": 1.465083360671997,
"rewards/rejected": -2.9674313068389893,
"step": 1140
},
{
"epoch": 0.2996597749280293,
"grad_norm": 8.654081344604492,
"learning_rate": 1.4012038733315887e-05,
"logits/chosen": -2.949709415435791,
"logits/rejected": -2.9855611324310303,
"logps/chosen": -293.55523681640625,
"logps/rejected": -316.7439880371094,
"loss": 0.4152,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4069695472717285,
"rewards/margins": 1.7172313928604126,
"rewards/rejected": -3.1242008209228516,
"step": 1145
},
{
"epoch": 0.30096833289714736,
"grad_norm": 5.447267532348633,
"learning_rate": 1.3985867573933527e-05,
"logits/chosen": -2.9884190559387207,
"logits/rejected": -3.067103862762451,
"logps/chosen": -300.8186950683594,
"logps/rejected": -308.9688415527344,
"loss": 0.3237,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4933931827545166,
"rewards/margins": 2.1875128746032715,
"rewards/rejected": -3.680905818939209,
"step": 1150
},
{
"epoch": 0.30227689086626536,
"grad_norm": 7.315623760223389,
"learning_rate": 1.3959696414551165e-05,
"logits/chosen": -2.877784013748169,
"logits/rejected": -2.967470645904541,
"logps/chosen": -245.8937530517578,
"logps/rejected": -253.87759399414062,
"loss": 0.5557,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9919893741607666,
"rewards/margins": 1.786003828048706,
"rewards/rejected": -3.7779934406280518,
"step": 1155
},
{
"epoch": 0.3035854488353834,
"grad_norm": 9.264678955078125,
"learning_rate": 1.3933525255168806e-05,
"logits/chosen": -2.937941074371338,
"logits/rejected": -3.082321882247925,
"logps/chosen": -267.06365966796875,
"logps/rejected": -274.55108642578125,
"loss": 0.5468,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7004693746566772,
"rewards/margins": 2.156771183013916,
"rewards/rejected": -3.8572402000427246,
"step": 1160
},
{
"epoch": 0.30489400680450146,
"grad_norm": 5.902349472045898,
"learning_rate": 1.3907354095786446e-05,
"logits/chosen": -3.035977602005005,
"logits/rejected": -3.0683465003967285,
"logps/chosen": -292.7044982910156,
"logps/rejected": -259.0909118652344,
"loss": 0.4338,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.780005693435669,
"rewards/margins": 2.0152504444122314,
"rewards/rejected": -3.7952563762664795,
"step": 1165
},
{
"epoch": 0.30620256477361946,
"grad_norm": 4.958015441894531,
"learning_rate": 1.3881182936404082e-05,
"logits/chosen": -2.9945685863494873,
"logits/rejected": -3.0434579849243164,
"logps/chosen": -329.266357421875,
"logps/rejected": -357.74798583984375,
"loss": 0.4398,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4882758855819702,
"rewards/margins": 1.6685161590576172,
"rewards/rejected": -3.156792163848877,
"step": 1170
},
{
"epoch": 0.3075111227427375,
"grad_norm": 7.654452323913574,
"learning_rate": 1.3855011777021722e-05,
"logits/chosen": -3.058605670928955,
"logits/rejected": -3.0306811332702637,
"logps/chosen": -263.643310546875,
"logps/rejected": -307.02520751953125,
"loss": 0.6095,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.639232873916626,
"rewards/margins": 1.4032961130142212,
"rewards/rejected": -3.0425288677215576,
"step": 1175
},
{
"epoch": 0.30881968071185556,
"grad_norm": 5.414381504058838,
"learning_rate": 1.3828840617639362e-05,
"logits/chosen": -3.0163514614105225,
"logits/rejected": -3.0375030040740967,
"logps/chosen": -300.1225891113281,
"logps/rejected": -287.8301696777344,
"loss": 0.5153,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1381218433380127,
"rewards/margins": 1.5844132900238037,
"rewards/rejected": -2.7225348949432373,
"step": 1180
},
{
"epoch": 0.31012823868097356,
"grad_norm": 7.217351913452148,
"learning_rate": 1.3802669458257e-05,
"logits/chosen": -2.8652729988098145,
"logits/rejected": -2.965222120285034,
"logps/chosen": -316.91094970703125,
"logps/rejected": -331.66265869140625,
"loss": 0.5718,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3781057596206665,
"rewards/margins": 1.089707612991333,
"rewards/rejected": -2.467813014984131,
"step": 1185
},
{
"epoch": 0.3114367966500916,
"grad_norm": 8.356551170349121,
"learning_rate": 1.377649829887464e-05,
"logits/chosen": -3.049598217010498,
"logits/rejected": -3.1053690910339355,
"logps/chosen": -315.70916748046875,
"logps/rejected": -300.895751953125,
"loss": 0.4796,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1529004573822021,
"rewards/margins": 1.447994589805603,
"rewards/rejected": -2.6008951663970947,
"step": 1190
},
{
"epoch": 0.3127453546192096,
"grad_norm": 10.391530990600586,
"learning_rate": 1.3750327139492281e-05,
"logits/chosen": -3.0718863010406494,
"logits/rejected": -3.013375759124756,
"logps/chosen": -283.4698181152344,
"logps/rejected": -284.2854309082031,
"loss": 0.6442,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.585439920425415,
"rewards/margins": 1.2667933702468872,
"rewards/rejected": -2.8522331714630127,
"step": 1195
},
{
"epoch": 0.31405391258832765,
"grad_norm": 10.304366111755371,
"learning_rate": 1.372415598010992e-05,
"logits/chosen": -2.875305414199829,
"logits/rejected": -2.9016032218933105,
"logps/chosen": -338.66583251953125,
"logps/rejected": -296.04669189453125,
"loss": 0.5551,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2871284484863281,
"rewards/margins": 1.2437200546264648,
"rewards/rejected": -2.530848264694214,
"step": 1200
},
{
"epoch": 0.31405391258832765,
"eval_logits/chosen": -3.0260488986968994,
"eval_logits/rejected": -3.0437211990356445,
"eval_logps/chosen": -299.2873840332031,
"eval_logps/rejected": -293.77947998046875,
"eval_loss": 0.5235101580619812,
"eval_rewards/accuracies": 0.7335000038146973,
"eval_rewards/chosen": -1.6054028272628784,
"eval_rewards/margins": 1.3790825605392456,
"eval_rewards/rejected": -2.984485387802124,
"eval_runtime": 763.0817,
"eval_samples_per_second": 2.621,
"eval_steps_per_second": 0.328,
"step": 1200
},
{
"epoch": 0.3153624705574457,
"grad_norm": 7.599167346954346,
"learning_rate": 1.369798482072756e-05,
"logits/chosen": -2.9609599113464355,
"logits/rejected": -3.05143666267395,
"logps/chosen": -327.25323486328125,
"logps/rejected": -286.7802429199219,
"loss": 0.4555,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5968466997146606,
"rewards/margins": 1.700269103050232,
"rewards/rejected": -3.2971160411834717,
"step": 1205
},
{
"epoch": 0.3166710285265637,
"grad_norm": 6.793883323669434,
"learning_rate": 1.36718136613452e-05,
"logits/chosen": -3.022416591644287,
"logits/rejected": -3.0568621158599854,
"logps/chosen": -291.0765075683594,
"logps/rejected": -259.074951171875,
"loss": 0.5201,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7141826152801514,
"rewards/margins": 1.5414519309997559,
"rewards/rejected": -3.255634307861328,
"step": 1210
},
{
"epoch": 0.31797958649568175,
"grad_norm": 3.9788708686828613,
"learning_rate": 1.364564250196284e-05,
"logits/chosen": -2.8990914821624756,
"logits/rejected": -2.990156888961792,
"logps/chosen": -347.00177001953125,
"logps/rejected": -328.04791259765625,
"loss": 0.3949,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.5140724182128906,
"rewards/margins": 2.129075527191162,
"rewards/rejected": -3.6431479454040527,
"step": 1215
},
{
"epoch": 0.3192881444647998,
"grad_norm": 7.489226818084717,
"learning_rate": 1.3619471342580476e-05,
"logits/chosen": -3.049175977706909,
"logits/rejected": -3.0629820823669434,
"logps/chosen": -338.41302490234375,
"logps/rejected": -340.4408264160156,
"loss": 0.4952,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.8001915216445923,
"rewards/margins": 1.876043677330017,
"rewards/rejected": -3.6762351989746094,
"step": 1220
},
{
"epoch": 0.3205967024339178,
"grad_norm": 4.567310333251953,
"learning_rate": 1.3593300183198118e-05,
"logits/chosen": -2.9615187644958496,
"logits/rejected": -3.008460521697998,
"logps/chosen": -295.421142578125,
"logps/rejected": -277.5303039550781,
"loss": 0.4912,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8743877410888672,
"rewards/margins": 1.695482850074768,
"rewards/rejected": -3.5698704719543457,
"step": 1225
},
{
"epoch": 0.32190526040303585,
"grad_norm": 8.410323143005371,
"learning_rate": 1.3567129023815758e-05,
"logits/chosen": -3.09279465675354,
"logits/rejected": -3.0178253650665283,
"logps/chosen": -343.17034912109375,
"logps/rejected": -322.3753356933594,
"loss": 0.715,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4411779642105103,
"rewards/margins": 1.4588241577148438,
"rewards/rejected": -2.9000020027160645,
"step": 1230
},
{
"epoch": 0.3232138183721539,
"grad_norm": 8.890939712524414,
"learning_rate": 1.3540957864433395e-05,
"logits/chosen": -3.0198092460632324,
"logits/rejected": -3.1039481163024902,
"logps/chosen": -303.96820068359375,
"logps/rejected": -279.45831298828125,
"loss": 0.6816,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2015838623046875,
"rewards/margins": 1.0614570379257202,
"rewards/rejected": -2.2630410194396973,
"step": 1235
},
{
"epoch": 0.3245223763412719,
"grad_norm": 8.1312894821167,
"learning_rate": 1.3514786705051035e-05,
"logits/chosen": -3.032458782196045,
"logits/rejected": -3.077017068862915,
"logps/chosen": -297.26678466796875,
"logps/rejected": -258.3316650390625,
"loss": 0.5992,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8163844347000122,
"rewards/margins": 1.0739367008209229,
"rewards/rejected": -1.8903210163116455,
"step": 1240
},
{
"epoch": 0.32583093431038995,
"grad_norm": 9.6901273727417,
"learning_rate": 1.3488615545668675e-05,
"logits/chosen": -3.017458200454712,
"logits/rejected": -3.1017868518829346,
"logps/chosen": -301.77484130859375,
"logps/rejected": -290.51373291015625,
"loss": 0.4924,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7117307782173157,
"rewards/margins": 1.4712412357330322,
"rewards/rejected": -2.182971954345703,
"step": 1245
},
{
"epoch": 0.327139492279508,
"grad_norm": 10.449470520019531,
"learning_rate": 1.3462444386286313e-05,
"logits/chosen": -3.0146572589874268,
"logits/rejected": -3.053591012954712,
"logps/chosen": -311.7223815917969,
"logps/rejected": -352.6505432128906,
"loss": 0.5767,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2211498022079468,
"rewards/margins": 1.2701174020767212,
"rewards/rejected": -2.491266965866089,
"step": 1250
},
{
"epoch": 0.328448050248626,
"grad_norm": 7.611923694610596,
"learning_rate": 1.3436273226903953e-05,
"logits/chosen": -3.033237934112549,
"logits/rejected": -3.0723347663879395,
"logps/chosen": -262.8450012207031,
"logps/rejected": -245.06201171875,
"loss": 0.4277,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.213969111442566,
"rewards/margins": 1.6178230047225952,
"rewards/rejected": -2.8317923545837402,
"step": 1255
},
{
"epoch": 0.32975660821774405,
"grad_norm": 10.214200973510742,
"learning_rate": 1.3410102067521593e-05,
"logits/chosen": -3.102858066558838,
"logits/rejected": -3.180795192718506,
"logps/chosen": -277.609130859375,
"logps/rejected": -312.61041259765625,
"loss": 0.5196,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1733005046844482,
"rewards/margins": 1.6840053796768188,
"rewards/rejected": -2.8573060035705566,
"step": 1260
},
{
"epoch": 0.3310651661868621,
"grad_norm": 6.558178901672363,
"learning_rate": 1.3383930908139232e-05,
"logits/chosen": -3.1322267055511475,
"logits/rejected": -3.180755138397217,
"logps/chosen": -310.9773864746094,
"logps/rejected": -311.936767578125,
"loss": 0.5532,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5742419958114624,
"rewards/margins": 1.6253976821899414,
"rewards/rejected": -3.1996397972106934,
"step": 1265
},
{
"epoch": 0.3323737241559801,
"grad_norm": 6.47090482711792,
"learning_rate": 1.3357759748756872e-05,
"logits/chosen": -3.085824966430664,
"logits/rejected": -3.1216368675231934,
"logps/chosen": -326.1288146972656,
"logps/rejected": -289.3221435546875,
"loss": 0.4169,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6582863330841064,
"rewards/margins": 1.6689634323120117,
"rewards/rejected": -3.3272500038146973,
"step": 1270
},
{
"epoch": 0.33368228212509815,
"grad_norm": 5.296478748321533,
"learning_rate": 1.3331588589374512e-05,
"logits/chosen": -3.14270281791687,
"logits/rejected": -3.174440383911133,
"logps/chosen": -262.1026611328125,
"logps/rejected": -269.72503662109375,
"loss": 0.4051,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4073164463043213,
"rewards/margins": 1.6645927429199219,
"rewards/rejected": -3.0719094276428223,
"step": 1275
},
{
"epoch": 0.33499084009421615,
"grad_norm": 5.415592670440674,
"learning_rate": 1.3305417429992148e-05,
"logits/chosen": -3.004148006439209,
"logits/rejected": -3.151362657546997,
"logps/chosen": -307.1809997558594,
"logps/rejected": -300.0960998535156,
"loss": 0.4607,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.156396746635437,
"rewards/margins": 1.7337665557861328,
"rewards/rejected": -2.8901631832122803,
"step": 1280
},
{
"epoch": 0.3362993980633342,
"grad_norm": 8.379344940185547,
"learning_rate": 1.3279246270609789e-05,
"logits/chosen": -3.0923662185668945,
"logits/rejected": -3.1500816345214844,
"logps/chosen": -295.65936279296875,
"logps/rejected": -311.3245849609375,
"loss": 0.3996,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.008171558380127,
"rewards/margins": 1.8367351293563843,
"rewards/rejected": -2.84490704536438,
"step": 1285
},
{
"epoch": 0.33760795603245225,
"grad_norm": 9.645184516906738,
"learning_rate": 1.3253075111227429e-05,
"logits/chosen": -2.793687343597412,
"logits/rejected": -3.0188405513763428,
"logps/chosen": -317.2360534667969,
"logps/rejected": -257.33026123046875,
"loss": 0.5077,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5827356576919556,
"rewards/margins": 1.593719482421875,
"rewards/rejected": -3.176455020904541,
"step": 1290
},
{
"epoch": 0.33891651400157025,
"grad_norm": 6.084274768829346,
"learning_rate": 1.3226903951845067e-05,
"logits/chosen": -3.1023552417755127,
"logits/rejected": -3.086031436920166,
"logps/chosen": -280.6290588378906,
"logps/rejected": -294.7873229980469,
"loss": 0.4734,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.627698540687561,
"rewards/margins": 1.7080399990081787,
"rewards/rejected": -3.33573842048645,
"step": 1295
},
{
"epoch": 0.3402250719706883,
"grad_norm": 5.004266262054443,
"learning_rate": 1.3200732792462707e-05,
"logits/chosen": -3.0339324474334717,
"logits/rejected": -2.9874372482299805,
"logps/chosen": -282.74798583984375,
"logps/rejected": -281.6959228515625,
"loss": 0.6804,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.755840539932251,
"rewards/margins": 1.4509661197662354,
"rewards/rejected": -3.2068066596984863,
"step": 1300
},
{
"epoch": 0.34153362993980635,
"grad_norm": 4.275951385498047,
"learning_rate": 1.3174561633080347e-05,
"logits/chosen": -3.0189881324768066,
"logits/rejected": -3.025322437286377,
"logps/chosen": -319.19903564453125,
"logps/rejected": -289.3116455078125,
"loss": 0.4103,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3581633567810059,
"rewards/margins": 1.6812947988510132,
"rewards/rejected": -3.0394580364227295,
"step": 1305
},
{
"epoch": 0.34284218790892435,
"grad_norm": 7.874205112457275,
"learning_rate": 1.3148390473697985e-05,
"logits/chosen": -3.078430652618408,
"logits/rejected": -3.1446545124053955,
"logps/chosen": -335.8284606933594,
"logps/rejected": -299.3724060058594,
"loss": 0.3688,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0086791515350342,
"rewards/margins": 1.6358997821807861,
"rewards/rejected": -2.644578695297241,
"step": 1310
},
{
"epoch": 0.3441507458780424,
"grad_norm": 7.652405738830566,
"learning_rate": 1.3122219314315626e-05,
"logits/chosen": -3.027405261993408,
"logits/rejected": -2.970729112625122,
"logps/chosen": -301.2220764160156,
"logps/rejected": -305.47210693359375,
"loss": 0.5549,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.775042176246643,
"rewards/margins": 1.3307900428771973,
"rewards/rejected": -3.105832099914551,
"step": 1315
},
{
"epoch": 0.34545930384716045,
"grad_norm": 6.982247829437256,
"learning_rate": 1.3096048154933266e-05,
"logits/chosen": -2.9840004444122314,
"logits/rejected": -3.0685667991638184,
"logps/chosen": -269.3052673339844,
"logps/rejected": -255.6454315185547,
"loss": 0.5786,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.399322509765625,
"rewards/margins": 1.243403434753418,
"rewards/rejected": -2.642725944519043,
"step": 1320
},
{
"epoch": 0.34676786181627844,
"grad_norm": 6.584117412567139,
"learning_rate": 1.3069876995550902e-05,
"logits/chosen": -2.9669992923736572,
"logits/rejected": -3.0108094215393066,
"logps/chosen": -289.9788818359375,
"logps/rejected": -285.2430419921875,
"loss": 0.5317,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6822948455810547,
"rewards/margins": 1.5722761154174805,
"rewards/rejected": -3.2545711994171143,
"step": 1325
},
{
"epoch": 0.3480764197853965,
"grad_norm": 7.847695827484131,
"learning_rate": 1.3043705836168542e-05,
"logits/chosen": -2.9359116554260254,
"logits/rejected": -2.916821241378784,
"logps/chosen": -279.84820556640625,
"logps/rejected": -288.5198669433594,
"loss": 0.5384,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1366277933120728,
"rewards/margins": 1.640825629234314,
"rewards/rejected": -2.7774531841278076,
"step": 1330
},
{
"epoch": 0.34938497775451455,
"grad_norm": 9.220524787902832,
"learning_rate": 1.3017534676786182e-05,
"logits/chosen": -3.050198793411255,
"logits/rejected": -3.1035842895507812,
"logps/chosen": -271.7259826660156,
"logps/rejected": -224.1414794921875,
"loss": 0.5167,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0262686014175415,
"rewards/margins": 1.3584415912628174,
"rewards/rejected": -2.3847105503082275,
"step": 1335
},
{
"epoch": 0.35069353572363254,
"grad_norm": 8.469073295593262,
"learning_rate": 1.299136351740382e-05,
"logits/chosen": -3.018068313598633,
"logits/rejected": -3.0691208839416504,
"logps/chosen": -305.9311218261719,
"logps/rejected": -271.46624755859375,
"loss": 0.5274,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2072837352752686,
"rewards/margins": 1.4922826290130615,
"rewards/rejected": -2.69956636428833,
"step": 1340
},
{
"epoch": 0.3520020936927506,
"grad_norm": 8.611955642700195,
"learning_rate": 1.296519235802146e-05,
"logits/chosen": -2.9702281951904297,
"logits/rejected": -3.02331280708313,
"logps/chosen": -271.5640869140625,
"logps/rejected": -277.6800231933594,
"loss": 0.4417,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.3191261291503906,
"rewards/margins": 1.439081072807312,
"rewards/rejected": -2.758207082748413,
"step": 1345
},
{
"epoch": 0.35331065166186865,
"grad_norm": 9.181700706481934,
"learning_rate": 1.2939021198639101e-05,
"logits/chosen": -3.0127110481262207,
"logits/rejected": -3.1290624141693115,
"logps/chosen": -271.6927185058594,
"logps/rejected": -245.85733032226562,
"loss": 0.4842,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.475193738937378,
"rewards/margins": 1.6307731866836548,
"rewards/rejected": -3.1059670448303223,
"step": 1350
},
{
"epoch": 0.35461920963098664,
"grad_norm": 4.142209529876709,
"learning_rate": 1.291285003925674e-05,
"logits/chosen": -2.8960788249969482,
"logits/rejected": -2.8379464149475098,
"logps/chosen": -291.3749084472656,
"logps/rejected": -328.5154724121094,
"loss": 0.3166,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1906328201293945,
"rewards/margins": 2.0907857418060303,
"rewards/rejected": -3.281418561935425,
"step": 1355
},
{
"epoch": 0.3559277676001047,
"grad_norm": 7.561165809631348,
"learning_rate": 1.288667887987438e-05,
"logits/chosen": -2.9457709789276123,
"logits/rejected": -3.032975673675537,
"logps/chosen": -313.8270568847656,
"logps/rejected": -306.0509338378906,
"loss": 0.4867,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.628435492515564,
"rewards/margins": 1.5076749324798584,
"rewards/rejected": -3.136110782623291,
"step": 1360
},
{
"epoch": 0.3572363255692227,
"grad_norm": 4.377691745758057,
"learning_rate": 1.286050772049202e-05,
"logits/chosen": -2.9811158180236816,
"logits/rejected": -3.0263185501098633,
"logps/chosen": -264.35443115234375,
"logps/rejected": -254.58731079101562,
"loss": 0.3929,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4240038394927979,
"rewards/margins": 1.7809474468231201,
"rewards/rejected": -3.204951524734497,
"step": 1365
},
{
"epoch": 0.35854488353834074,
"grad_norm": 10.971863746643066,
"learning_rate": 1.283433656110966e-05,
"logits/chosen": -2.9616293907165527,
"logits/rejected": -3.0293776988983154,
"logps/chosen": -290.515625,
"logps/rejected": -293.56170654296875,
"loss": 0.7177,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.722092628479004,
"rewards/margins": 1.0726094245910645,
"rewards/rejected": -2.7947020530700684,
"step": 1370
},
{
"epoch": 0.3598534415074588,
"grad_norm": 6.668981075286865,
"learning_rate": 1.2808165401727298e-05,
"logits/chosen": -3.0443549156188965,
"logits/rejected": -3.052109479904175,
"logps/chosen": -285.5496826171875,
"logps/rejected": -281.496337890625,
"loss": 0.3757,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2626408338546753,
"rewards/margins": 1.825859785079956,
"rewards/rejected": -3.0885009765625,
"step": 1375
},
{
"epoch": 0.3611619994765768,
"grad_norm": 8.7103910446167,
"learning_rate": 1.2781994242344938e-05,
"logits/chosen": -2.9803757667541504,
"logits/rejected": -3.045557737350464,
"logps/chosen": -335.71868896484375,
"logps/rejected": -303.4695739746094,
"loss": 0.4753,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.295375108718872,
"rewards/margins": 1.8782527446746826,
"rewards/rejected": -3.1736278533935547,
"step": 1380
},
{
"epoch": 0.36247055744569484,
"grad_norm": 6.965200424194336,
"learning_rate": 1.2755823082962578e-05,
"logits/chosen": -3.007631540298462,
"logits/rejected": -3.0434601306915283,
"logps/chosen": -306.3366394042969,
"logps/rejected": -323.5538330078125,
"loss": 0.5288,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3271119594573975,
"rewards/margins": 1.4354439973831177,
"rewards/rejected": -2.7625560760498047,
"step": 1385
},
{
"epoch": 0.3637791154148129,
"grad_norm": 4.985629081726074,
"learning_rate": 1.2729651923580215e-05,
"logits/chosen": -2.9433679580688477,
"logits/rejected": -3.051299810409546,
"logps/chosen": -275.38519287109375,
"logps/rejected": -241.19888305664062,
"loss": 0.5702,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9085711240768433,
"rewards/margins": 1.2991278171539307,
"rewards/rejected": -2.2076992988586426,
"step": 1390
},
{
"epoch": 0.3650876733839309,
"grad_norm": 7.742268085479736,
"learning_rate": 1.2703480764197855e-05,
"logits/chosen": -2.9358208179473877,
"logits/rejected": -2.947000026702881,
"logps/chosen": -286.78033447265625,
"logps/rejected": -308.0422058105469,
"loss": 0.6321,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.871289074420929,
"rewards/margins": 1.0567841529846191,
"rewards/rejected": -1.9280732870101929,
"step": 1395
},
{
"epoch": 0.36639623135304894,
"grad_norm": 7.653586387634277,
"learning_rate": 1.2677309604815495e-05,
"logits/chosen": -2.9908015727996826,
"logits/rejected": -2.9822075366973877,
"logps/chosen": -286.308837890625,
"logps/rejected": -272.92156982421875,
"loss": 0.5557,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.609847366809845,
"rewards/margins": 1.0760449171066284,
"rewards/rejected": -1.6858923435211182,
"step": 1400
},
{
"epoch": 0.367704789322167,
"grad_norm": 7.872668266296387,
"learning_rate": 1.2651138445433133e-05,
"logits/chosen": -3.0261340141296387,
"logits/rejected": -3.0703518390655518,
"logps/chosen": -290.16070556640625,
"logps/rejected": -282.38336181640625,
"loss": 0.6173,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7330259084701538,
"rewards/margins": 1.229697585105896,
"rewards/rejected": -1.962723731994629,
"step": 1405
},
{
"epoch": 0.369013347291285,
"grad_norm": 11.86557388305664,
"learning_rate": 1.2624967286050773e-05,
"logits/chosen": -3.050842761993408,
"logits/rejected": -3.036444902420044,
"logps/chosen": -289.84393310546875,
"logps/rejected": -261.6834411621094,
"loss": 0.6361,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7127243280410767,
"rewards/margins": 0.9847332835197449,
"rewards/rejected": -1.6974576711654663,
"step": 1410
},
{
"epoch": 0.37032190526040304,
"grad_norm": 12.028075218200684,
"learning_rate": 1.2598796126668413e-05,
"logits/chosen": -2.887280225753784,
"logits/rejected": -2.9627726078033447,
"logps/chosen": -277.523681640625,
"logps/rejected": -255.8815460205078,
"loss": 0.472,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7555484771728516,
"rewards/margins": 1.6585193872451782,
"rewards/rejected": -2.4140677452087402,
"step": 1415
},
{
"epoch": 0.3716304632295211,
"grad_norm": 6.440135955810547,
"learning_rate": 1.2572624967286052e-05,
"logits/chosen": -2.875539779663086,
"logits/rejected": -3.007462978363037,
"logps/chosen": -356.36224365234375,
"logps/rejected": -322.9365539550781,
"loss": 0.3654,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6236860156059265,
"rewards/margins": 2.0594842433929443,
"rewards/rejected": -2.6831705570220947,
"step": 1420
},
{
"epoch": 0.3729390211986391,
"grad_norm": 7.960480690002441,
"learning_rate": 1.2546453807903692e-05,
"logits/chosen": -2.993530035018921,
"logits/rejected": -3.0551657676696777,
"logps/chosen": -304.01422119140625,
"logps/rejected": -298.3731384277344,
"loss": 0.6268,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3483511209487915,
"rewards/margins": 1.3386003971099854,
"rewards/rejected": -2.6869513988494873,
"step": 1425
},
{
"epoch": 0.37424757916775714,
"grad_norm": 6.786019802093506,
"learning_rate": 1.2520282648521332e-05,
"logits/chosen": -3.001875400543213,
"logits/rejected": -3.0228638648986816,
"logps/chosen": -248.81704711914062,
"logps/rejected": -263.12738037109375,
"loss": 0.581,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7031877040863037,
"rewards/margins": 1.1887766122817993,
"rewards/rejected": -2.8919644355773926,
"step": 1430
},
{
"epoch": 0.3755561371368752,
"grad_norm": 8.263105392456055,
"learning_rate": 1.2494111489138968e-05,
"logits/chosen": -3.038954973220825,
"logits/rejected": -3.062197685241699,
"logps/chosen": -313.73321533203125,
"logps/rejected": -283.0350646972656,
"loss": 0.619,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.5002611875534058,
"rewards/margins": 1.2015249729156494,
"rewards/rejected": -2.7017860412597656,
"step": 1435
},
{
"epoch": 0.3768646951059932,
"grad_norm": 6.546252727508545,
"learning_rate": 1.2467940329756609e-05,
"logits/chosen": -2.9777824878692627,
"logits/rejected": -3.0632386207580566,
"logps/chosen": -308.88482666015625,
"logps/rejected": -304.6534729003906,
"loss": 0.4268,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.5263839960098267,
"rewards/margins": 1.5267943143844604,
"rewards/rejected": -3.053178310394287,
"step": 1440
},
{
"epoch": 0.37817325307511124,
"grad_norm": 10.60797119140625,
"learning_rate": 1.2441769170374249e-05,
"logits/chosen": -2.8980348110198975,
"logits/rejected": -2.9805784225463867,
"logps/chosen": -300.85955810546875,
"logps/rejected": -279.0732116699219,
"loss": 0.5448,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6329902410507202,
"rewards/margins": 1.6282199621200562,
"rewards/rejected": -3.2612102031707764,
"step": 1445
},
{
"epoch": 0.37948181104422923,
"grad_norm": 6.1115217208862305,
"learning_rate": 1.2415598010991887e-05,
"logits/chosen": -2.7547950744628906,
"logits/rejected": -2.8454792499542236,
"logps/chosen": -292.1466064453125,
"logps/rejected": -284.0655517578125,
"loss": 0.5144,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6082839965820312,
"rewards/margins": 1.9727370738983154,
"rewards/rejected": -3.5810210704803467,
"step": 1450
},
{
"epoch": 0.3807903690133473,
"grad_norm": 10.30879020690918,
"learning_rate": 1.2389426851609527e-05,
"logits/chosen": -2.8726394176483154,
"logits/rejected": -2.970806360244751,
"logps/chosen": -343.87860107421875,
"logps/rejected": -320.80615234375,
"loss": 0.4237,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.5264251232147217,
"rewards/margins": 2.020369052886963,
"rewards/rejected": -3.5467944145202637,
"step": 1455
},
{
"epoch": 0.38209892698246534,
"grad_norm": 6.011262893676758,
"learning_rate": 1.2363255692227167e-05,
"logits/chosen": -2.860525131225586,
"logits/rejected": -2.7346343994140625,
"logps/chosen": -310.897216796875,
"logps/rejected": -351.1201171875,
"loss": 0.5297,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8876367807388306,
"rewards/margins": 1.8191211223602295,
"rewards/rejected": -3.7067580223083496,
"step": 1460
},
{
"epoch": 0.38340748495158333,
"grad_norm": 6.5547566413879395,
"learning_rate": 1.2337084532844805e-05,
"logits/chosen": -2.7850310802459717,
"logits/rejected": -2.8178579807281494,
"logps/chosen": -280.5059814453125,
"logps/rejected": -279.82525634765625,
"loss": 0.4661,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.7109603881835938,
"rewards/margins": 1.4891334772109985,
"rewards/rejected": -3.2000937461853027,
"step": 1465
},
{
"epoch": 0.3847160429207014,
"grad_norm": 5.547062873840332,
"learning_rate": 1.2310913373462446e-05,
"logits/chosen": -2.890692710876465,
"logits/rejected": -2.9257652759552,
"logps/chosen": -237.19577026367188,
"logps/rejected": -270.4554748535156,
"loss": 0.4804,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.332466721534729,
"rewards/margins": 1.3955414295196533,
"rewards/rejected": -2.7280080318450928,
"step": 1470
},
{
"epoch": 0.38602460088981944,
"grad_norm": 7.388108253479004,
"learning_rate": 1.2284742214080086e-05,
"logits/chosen": -2.9597058296203613,
"logits/rejected": -2.9998531341552734,
"logps/chosen": -262.3876037597656,
"logps/rejected": -249.7728729248047,
"loss": 0.5758,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.322337031364441,
"rewards/margins": 1.3156559467315674,
"rewards/rejected": -2.6379926204681396,
"step": 1475
},
{
"epoch": 0.38733315885893743,
"grad_norm": 6.45510196685791,
"learning_rate": 1.2258571054697724e-05,
"logits/chosen": -3.0368587970733643,
"logits/rejected": -3.017752170562744,
"logps/chosen": -272.9659729003906,
"logps/rejected": -285.7885437011719,
"loss": 0.5161,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1351183652877808,
"rewards/margins": 1.5519227981567383,
"rewards/rejected": -2.6870410442352295,
"step": 1480
},
{
"epoch": 0.3886417168280555,
"grad_norm": 4.70168399810791,
"learning_rate": 1.2232399895315364e-05,
"logits/chosen": -2.9754042625427246,
"logits/rejected": -3.0704946517944336,
"logps/chosen": -308.5724792480469,
"logps/rejected": -273.3597412109375,
"loss": 0.4308,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7960319519042969,
"rewards/margins": 1.450454592704773,
"rewards/rejected": -2.2464864253997803,
"step": 1485
},
{
"epoch": 0.38995027479717354,
"grad_norm": 7.209184169769287,
"learning_rate": 1.2206228735933004e-05,
"logits/chosen": -3.016166925430298,
"logits/rejected": -3.0774433612823486,
"logps/chosen": -293.2237243652344,
"logps/rejected": -322.7275390625,
"loss": 0.4859,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1815975904464722,
"rewards/margins": 1.4720970392227173,
"rewards/rejected": -2.6536946296691895,
"step": 1490
},
{
"epoch": 0.39125883276629153,
"grad_norm": 6.350897789001465,
"learning_rate": 1.218005757655064e-05,
"logits/chosen": -2.953706741333008,
"logits/rejected": -3.063767671585083,
"logps/chosen": -305.0714416503906,
"logps/rejected": -276.573486328125,
"loss": 0.4575,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3502719402313232,
"rewards/margins": 1.5017601251602173,
"rewards/rejected": -2.852031946182251,
"step": 1495
},
{
"epoch": 0.3925673907354096,
"grad_norm": 8.479778289794922,
"learning_rate": 1.2153886417168281e-05,
"logits/chosen": -2.9702401161193848,
"logits/rejected": -2.8454244136810303,
"logps/chosen": -281.9237365722656,
"logps/rejected": -288.2297668457031,
"loss": 0.4503,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5473536252975464,
"rewards/margins": 1.5539802312850952,
"rewards/rejected": -3.1013338565826416,
"step": 1500
},
{
"epoch": 0.39387594870452763,
"grad_norm": 6.467402935028076,
"learning_rate": 1.2127715257785921e-05,
"logits/chosen": -3.0142505168914795,
"logits/rejected": -3.0140280723571777,
"logps/chosen": -242.34933471679688,
"logps/rejected": -276.01068115234375,
"loss": 0.552,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.787334680557251,
"rewards/margins": 1.3136866092681885,
"rewards/rejected": -3.1010212898254395,
"step": 1505
},
{
"epoch": 0.39518450667364563,
"grad_norm": 4.064186096191406,
"learning_rate": 1.210154409840356e-05,
"logits/chosen": -3.0537848472595215,
"logits/rejected": -2.9990789890289307,
"logps/chosen": -273.3089904785156,
"logps/rejected": -318.97015380859375,
"loss": 0.6266,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3775596618652344,
"rewards/margins": 1.1523305177688599,
"rewards/rejected": -2.529890298843384,
"step": 1510
},
{
"epoch": 0.3964930646427637,
"grad_norm": 10.651188850402832,
"learning_rate": 1.20753729390212e-05,
"logits/chosen": -3.0355381965637207,
"logits/rejected": -3.023265838623047,
"logps/chosen": -302.71124267578125,
"logps/rejected": -282.6175537109375,
"loss": 0.5721,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.462561011314392,
"rewards/margins": 1.2974321842193604,
"rewards/rejected": -2.759993076324463,
"step": 1515
},
{
"epoch": 0.39780162261188173,
"grad_norm": 6.771826267242432,
"learning_rate": 1.204920177963884e-05,
"logits/chosen": -3.036020040512085,
"logits/rejected": -3.0711162090301514,
"logps/chosen": -354.17230224609375,
"logps/rejected": -338.1542053222656,
"loss": 0.4863,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.067155361175537,
"rewards/margins": 1.3505350351333618,
"rewards/rejected": -2.4176905155181885,
"step": 1520
},
{
"epoch": 0.39911018058099973,
"grad_norm": 6.441925048828125,
"learning_rate": 1.2023030620256478e-05,
"logits/chosen": -3.0521793365478516,
"logits/rejected": -3.059692859649658,
"logps/chosen": -253.529296875,
"logps/rejected": -257.99017333984375,
"loss": 0.4195,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8811559677124023,
"rewards/margins": 1.760744333267212,
"rewards/rejected": -2.6419005393981934,
"step": 1525
},
{
"epoch": 0.4004187385501178,
"grad_norm": 10.745322227478027,
"learning_rate": 1.1996859460874118e-05,
"logits/chosen": -3.0835869312286377,
"logits/rejected": -3.089754581451416,
"logps/chosen": -272.4412536621094,
"logps/rejected": -269.2596740722656,
"loss": 0.6076,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4502677917480469,
"rewards/margins": 0.9978004693984985,
"rewards/rejected": -2.448068857192993,
"step": 1530
},
{
"epoch": 0.4017272965192358,
"grad_norm": 7.261354446411133,
"learning_rate": 1.1970688301491758e-05,
"logits/chosen": -2.9888062477111816,
"logits/rejected": -2.9264347553253174,
"logps/chosen": -276.4285888671875,
"logps/rejected": -300.2745056152344,
"loss": 0.5551,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3649812936782837,
"rewards/margins": 1.3235492706298828,
"rewards/rejected": -2.688530445098877,
"step": 1535
},
{
"epoch": 0.40303585448835383,
"grad_norm": 10.943098068237305,
"learning_rate": 1.1944517142109398e-05,
"logits/chosen": -2.9183526039123535,
"logits/rejected": -2.9452197551727295,
"logps/chosen": -241.52294921875,
"logps/rejected": -289.0009765625,
"loss": 0.5724,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5533595085144043,
"rewards/margins": 1.3406226634979248,
"rewards/rejected": -2.893982172012329,
"step": 1540
},
{
"epoch": 0.4043444124574719,
"grad_norm": 4.496222972869873,
"learning_rate": 1.1918345982727035e-05,
"logits/chosen": -2.9785215854644775,
"logits/rejected": -2.9904141426086426,
"logps/chosen": -242.4897918701172,
"logps/rejected": -264.7366027832031,
"loss": 0.5285,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.193482518196106,
"rewards/margins": 1.3687217235565186,
"rewards/rejected": -2.562204122543335,
"step": 1545
},
{
"epoch": 0.4056529704265899,
"grad_norm": 6.670289516448975,
"learning_rate": 1.1892174823344675e-05,
"logits/chosen": -2.9499192237854004,
"logits/rejected": -2.9757332801818848,
"logps/chosen": -264.1338806152344,
"logps/rejected": -285.5947265625,
"loss": 0.397,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.3419907093048096,
"rewards/margins": 1.739337682723999,
"rewards/rejected": -3.0813281536102295,
"step": 1550
},
{
"epoch": 0.4069615283957079,
"grad_norm": 8.708571434020996,
"learning_rate": 1.1866003663962315e-05,
"logits/chosen": -2.891721248626709,
"logits/rejected": -2.9555718898773193,
"logps/chosen": -292.3914489746094,
"logps/rejected": -285.41632080078125,
"loss": 0.5604,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3459782600402832,
"rewards/margins": 1.5892525911331177,
"rewards/rejected": -2.9352307319641113,
"step": 1555
},
{
"epoch": 0.408270086364826,
"grad_norm": 7.487111568450928,
"learning_rate": 1.1839832504579953e-05,
"logits/chosen": -2.8305745124816895,
"logits/rejected": -2.858198642730713,
"logps/chosen": -298.46734619140625,
"logps/rejected": -303.1842346191406,
"loss": 0.5079,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.168373703956604,
"rewards/margins": 1.9308793544769287,
"rewards/rejected": -3.0992531776428223,
"step": 1560
},
{
"epoch": 0.409578644333944,
"grad_norm": 6.141557216644287,
"learning_rate": 1.1813661345197593e-05,
"logits/chosen": -2.9441263675689697,
"logits/rejected": -2.9874396324157715,
"logps/chosen": -328.82275390625,
"logps/rejected": -282.8536376953125,
"loss": 0.3869,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.0537731647491455,
"rewards/margins": 2.476039409637451,
"rewards/rejected": -3.5298125743865967,
"step": 1565
},
{
"epoch": 0.410887202303062,
"grad_norm": 7.910996913909912,
"learning_rate": 1.1787490185815233e-05,
"logits/chosen": -3.0016915798187256,
"logits/rejected": -2.9591012001037598,
"logps/chosen": -251.32180786132812,
"logps/rejected": -303.64031982421875,
"loss": 0.5086,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.980921983718872,
"rewards/margins": 1.729113221168518,
"rewards/rejected": -3.7100348472595215,
"step": 1570
},
{
"epoch": 0.4121957602721801,
"grad_norm": 6.531032562255859,
"learning_rate": 1.1761319026432872e-05,
"logits/chosen": -2.9189350605010986,
"logits/rejected": -2.996328353881836,
"logps/chosen": -271.4986572265625,
"logps/rejected": -244.9109344482422,
"loss": 0.4335,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.749506950378418,
"rewards/margins": 2.123915672302246,
"rewards/rejected": -3.8734230995178223,
"step": 1575
},
{
"epoch": 0.4135043182412981,
"grad_norm": 8.456581115722656,
"learning_rate": 1.1735147867050512e-05,
"logits/chosen": -3.066283941268921,
"logits/rejected": -3.0604310035705566,
"logps/chosen": -283.52801513671875,
"logps/rejected": -289.96990966796875,
"loss": 0.5287,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.6674515008926392,
"rewards/margins": 1.9076945781707764,
"rewards/rejected": -3.575146198272705,
"step": 1580
},
{
"epoch": 0.4148128762104161,
"grad_norm": 9.457367897033691,
"learning_rate": 1.1708976707668152e-05,
"logits/chosen": -2.846237897872925,
"logits/rejected": -2.9507431983947754,
"logps/chosen": -326.3301086425781,
"logps/rejected": -293.23577880859375,
"loss": 0.3963,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5260112285614014,
"rewards/margins": 1.8521969318389893,
"rewards/rejected": -3.3782081604003906,
"step": 1585
},
{
"epoch": 0.4161214341795342,
"grad_norm": 5.837276458740234,
"learning_rate": 1.168280554828579e-05,
"logits/chosen": -2.9573163986206055,
"logits/rejected": -3.001347064971924,
"logps/chosen": -325.12139892578125,
"logps/rejected": -266.9141540527344,
"loss": 0.4964,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1669349670410156,
"rewards/margins": 1.709183931350708,
"rewards/rejected": -2.8761186599731445,
"step": 1590
},
{
"epoch": 0.4174299921486522,
"grad_norm": 7.193180561065674,
"learning_rate": 1.165663438890343e-05,
"logits/chosen": -2.921961784362793,
"logits/rejected": -2.9729673862457275,
"logps/chosen": -324.3099060058594,
"logps/rejected": -299.9588317871094,
"loss": 0.4785,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1953010559082031,
"rewards/margins": 1.7333354949951172,
"rewards/rejected": -2.9286367893218994,
"step": 1595
},
{
"epoch": 0.4187385501177702,
"grad_norm": 9.289549827575684,
"learning_rate": 1.163046322952107e-05,
"logits/chosen": -3.059566020965576,
"logits/rejected": -3.0449271202087402,
"logps/chosen": -311.70147705078125,
"logps/rejected": -314.4573669433594,
"loss": 0.4369,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9875648617744446,
"rewards/margins": 1.5768333673477173,
"rewards/rejected": -2.5643982887268066,
"step": 1600
},
{
"epoch": 0.4187385501177702,
"eval_logits/chosen": -3.0115966796875,
"eval_logits/rejected": -3.030435085296631,
"eval_logps/chosen": -295.0363464355469,
"eval_logps/rejected": -289.9421691894531,
"eval_loss": 0.5138216614723206,
"eval_rewards/accuracies": 0.7390000224113464,
"eval_rewards/chosen": -1.1802964210510254,
"eval_rewards/margins": 1.4204589128494263,
"eval_rewards/rejected": -2.600755453109741,
"eval_runtime": 763.512,
"eval_samples_per_second": 2.619,
"eval_steps_per_second": 0.327,
"step": 1600
},
{
"epoch": 0.4200471080868883,
"grad_norm": 5.8733625411987305,
"learning_rate": 1.1604292070138707e-05,
"logits/chosen": -2.9307186603546143,
"logits/rejected": -2.9098129272460938,
"logps/chosen": -273.51361083984375,
"logps/rejected": -329.70135498046875,
"loss": 0.4702,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1704580783843994,
"rewards/margins": 1.3907597064971924,
"rewards/rejected": -2.561217784881592,
"step": 1605
},
{
"epoch": 0.4213556660560063,
"grad_norm": 10.016422271728516,
"learning_rate": 1.1578120910756347e-05,
"logits/chosen": -3.039794921875,
"logits/rejected": -3.1136136054992676,
"logps/chosen": -261.3140869140625,
"logps/rejected": -261.41033935546875,
"loss": 0.5391,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0977901220321655,
"rewards/margins": 1.2529916763305664,
"rewards/rejected": -2.3507819175720215,
"step": 1610
},
{
"epoch": 0.4226642240251243,
"grad_norm": 7.613191604614258,
"learning_rate": 1.1551949751373987e-05,
"logits/chosen": -2.9766361713409424,
"logits/rejected": -3.0103912353515625,
"logps/chosen": -278.36700439453125,
"logps/rejected": -298.30010986328125,
"loss": 0.5107,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2617706060409546,
"rewards/margins": 1.5523849725723267,
"rewards/rejected": -2.8141555786132812,
"step": 1615
},
{
"epoch": 0.4239727819942423,
"grad_norm": 6.346404552459717,
"learning_rate": 1.1525778591991625e-05,
"logits/chosen": -2.930680751800537,
"logits/rejected": -3.0358388423919678,
"logps/chosen": -280.64837646484375,
"logps/rejected": -283.0661315917969,
"loss": 0.3852,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1420165300369263,
"rewards/margins": 1.7675631046295166,
"rewards/rejected": -2.9095797538757324,
"step": 1620
},
{
"epoch": 0.42528133996336037,
"grad_norm": 5.357637882232666,
"learning_rate": 1.1499607432609266e-05,
"logits/chosen": -2.919867515563965,
"logits/rejected": -2.999633312225342,
"logps/chosen": -260.5415344238281,
"logps/rejected": -286.1955261230469,
"loss": 0.5426,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.837245762348175,
"rewards/margins": 1.5137102603912354,
"rewards/rejected": -2.3509559631347656,
"step": 1625
},
{
"epoch": 0.4265898979324784,
"grad_norm": 7.890727519989014,
"learning_rate": 1.1473436273226906e-05,
"logits/chosen": -2.9257941246032715,
"logits/rejected": -2.988372802734375,
"logps/chosen": -318.0213623046875,
"logps/rejected": -305.0336608886719,
"loss": 0.493,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.134577989578247,
"rewards/margins": 1.7011257410049438,
"rewards/rejected": -2.8357038497924805,
"step": 1630
},
{
"epoch": 0.4278984559015964,
"grad_norm": 10.864569664001465,
"learning_rate": 1.1447265113844544e-05,
"logits/chosen": -2.940944194793701,
"logits/rejected": -2.9746077060699463,
"logps/chosen": -291.7694091796875,
"logps/rejected": -308.31719970703125,
"loss": 0.6127,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0913641452789307,
"rewards/margins": 1.432383418083191,
"rewards/rejected": -2.523747682571411,
"step": 1635
},
{
"epoch": 0.42920701387071447,
"grad_norm": 9.867708206176758,
"learning_rate": 1.1421093954462184e-05,
"logits/chosen": -2.9872641563415527,
"logits/rejected": -3.0981907844543457,
"logps/chosen": -314.36566162109375,
"logps/rejected": -270.13153076171875,
"loss": 0.5011,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.106109380722046,
"rewards/margins": 1.637105941772461,
"rewards/rejected": -2.743215322494507,
"step": 1640
},
{
"epoch": 0.4305155718398325,
"grad_norm": 8.497987747192383,
"learning_rate": 1.1394922795079824e-05,
"logits/chosen": -3.008741617202759,
"logits/rejected": -3.0486505031585693,
"logps/chosen": -318.6070861816406,
"logps/rejected": -341.03662109375,
"loss": 0.6013,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4147374629974365,
"rewards/margins": 1.432152271270752,
"rewards/rejected": -2.8468894958496094,
"step": 1645
},
{
"epoch": 0.4318241298089505,
"grad_norm": 7.426600456237793,
"learning_rate": 1.136875163569746e-05,
"logits/chosen": -2.854031562805176,
"logits/rejected": -2.9757132530212402,
"logps/chosen": -286.28009033203125,
"logps/rejected": -309.74456787109375,
"loss": 0.4337,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4343945980072021,
"rewards/margins": 1.5377739667892456,
"rewards/rejected": -2.9721689224243164,
"step": 1650
},
{
"epoch": 0.43313268777806857,
"grad_norm": 10.064818382263184,
"learning_rate": 1.1342580476315101e-05,
"logits/chosen": -2.9875996112823486,
"logits/rejected": -3.022408962249756,
"logps/chosen": -319.95794677734375,
"logps/rejected": -289.5148620605469,
"loss": 0.5412,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.296550989151001,
"rewards/margins": 1.3865230083465576,
"rewards/rejected": -2.6830739974975586,
"step": 1655
},
{
"epoch": 0.4344412457471866,
"grad_norm": 8.107555389404297,
"learning_rate": 1.1316409316932741e-05,
"logits/chosen": -2.915742874145508,
"logits/rejected": -2.943075656890869,
"logps/chosen": -273.85931396484375,
"logps/rejected": -286.5356140136719,
"loss": 0.4829,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.364490270614624,
"rewards/margins": 1.4877455234527588,
"rewards/rejected": -2.852235794067383,
"step": 1660
},
{
"epoch": 0.4357498037163046,
"grad_norm": 6.44484281539917,
"learning_rate": 1.129023815755038e-05,
"logits/chosen": -2.88740611076355,
"logits/rejected": -3.0205512046813965,
"logps/chosen": -264.7518615722656,
"logps/rejected": -284.6231384277344,
"loss": 0.466,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3103464841842651,
"rewards/margins": 1.4976861476898193,
"rewards/rejected": -2.808032751083374,
"step": 1665
},
{
"epoch": 0.43705836168542267,
"grad_norm": 4.719677925109863,
"learning_rate": 1.126406699816802e-05,
"logits/chosen": -3.025489330291748,
"logits/rejected": -3.0286028385162354,
"logps/chosen": -285.00445556640625,
"logps/rejected": -307.9769592285156,
"loss": 0.5365,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5843397378921509,
"rewards/margins": 1.2534945011138916,
"rewards/rejected": -2.837834358215332,
"step": 1670
},
{
"epoch": 0.4383669196545407,
"grad_norm": 4.686459541320801,
"learning_rate": 1.123789583878566e-05,
"logits/chosen": -2.8781819343566895,
"logits/rejected": -2.9757750034332275,
"logps/chosen": -272.54534912109375,
"logps/rejected": -261.4615478515625,
"loss": 0.3518,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2964471578598022,
"rewards/margins": 2.0138936042785645,
"rewards/rejected": -3.3103408813476562,
"step": 1675
},
{
"epoch": 0.4396754776236587,
"grad_norm": 9.84245777130127,
"learning_rate": 1.1211724679403298e-05,
"logits/chosen": -3.0288212299346924,
"logits/rejected": -3.009911060333252,
"logps/chosen": -314.16424560546875,
"logps/rejected": -322.6404724121094,
"loss": 0.6753,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9346834421157837,
"rewards/margins": 1.1214497089385986,
"rewards/rejected": -3.05613374710083,
"step": 1680
},
{
"epoch": 0.44098403559277677,
"grad_norm": 10.62396240234375,
"learning_rate": 1.1185553520020938e-05,
"logits/chosen": -3.026965618133545,
"logits/rejected": -2.9794087409973145,
"logps/chosen": -311.1338195800781,
"logps/rejected": -302.8864440917969,
"loss": 0.5711,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.7278903722763062,
"rewards/margins": 1.2248623371124268,
"rewards/rejected": -2.9527530670166016,
"step": 1685
},
{
"epoch": 0.44229259356189476,
"grad_norm": 11.11765193939209,
"learning_rate": 1.1159382360638578e-05,
"logits/chosen": -2.982098340988159,
"logits/rejected": -3.0129332542419434,
"logps/chosen": -263.9950256347656,
"logps/rejected": -278.71240234375,
"loss": 0.4873,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5713714361190796,
"rewards/margins": 1.3477413654327393,
"rewards/rejected": -2.9191126823425293,
"step": 1690
},
{
"epoch": 0.4436011515310128,
"grad_norm": 6.4845194816589355,
"learning_rate": 1.1133211201256216e-05,
"logits/chosen": -2.9548566341400146,
"logits/rejected": -3.0447587966918945,
"logps/chosen": -256.5769958496094,
"logps/rejected": -324.57464599609375,
"loss": 0.4231,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5655686855316162,
"rewards/margins": 1.5172860622406006,
"rewards/rejected": -3.082854747772217,
"step": 1695
},
{
"epoch": 0.44490970950013087,
"grad_norm": 8.451342582702637,
"learning_rate": 1.1107040041873856e-05,
"logits/chosen": -2.9164369106292725,
"logits/rejected": -2.9938321113586426,
"logps/chosen": -264.7562561035156,
"logps/rejected": -290.6343688964844,
"loss": 0.6177,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.135646343231201,
"rewards/margins": 1.4988205432891846,
"rewards/rejected": -3.6344668865203857,
"step": 1700
},
{
"epoch": 0.44621826746924886,
"grad_norm": 7.753484725952148,
"learning_rate": 1.1080868882491496e-05,
"logits/chosen": -2.8294832706451416,
"logits/rejected": -2.9608490467071533,
"logps/chosen": -277.0535888671875,
"logps/rejected": -280.58575439453125,
"loss": 0.5377,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.4995322227478027,
"rewards/margins": 1.5786781311035156,
"rewards/rejected": -4.078210353851318,
"step": 1705
},
{
"epoch": 0.4475268254383669,
"grad_norm": 8.849166870117188,
"learning_rate": 1.1054697723109137e-05,
"logits/chosen": -3.094038248062134,
"logits/rejected": -3.051264524459839,
"logps/chosen": -291.5003356933594,
"logps/rejected": -312.36102294921875,
"loss": 0.5622,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.875213384628296,
"rewards/margins": 1.4177398681640625,
"rewards/rejected": -4.292952537536621,
"step": 1710
},
{
"epoch": 0.44883538340748497,
"grad_norm": 7.06804084777832,
"learning_rate": 1.1028526563726773e-05,
"logits/chosen": -3.0508930683135986,
"logits/rejected": -3.085616111755371,
"logps/chosen": -298.27947998046875,
"logps/rejected": -285.8116760253906,
"loss": 0.4942,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.573955535888672,
"rewards/margins": 1.687349557876587,
"rewards/rejected": -4.261305332183838,
"step": 1715
},
{
"epoch": 0.45014394137660296,
"grad_norm": 9.525605201721191,
"learning_rate": 1.1002355404344413e-05,
"logits/chosen": -3.011475086212158,
"logits/rejected": -2.99470591545105,
"logps/chosen": -324.5032958984375,
"logps/rejected": -346.570068359375,
"loss": 0.4977,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.779179096221924,
"rewards/margins": 1.9469417333602905,
"rewards/rejected": -4.726120948791504,
"step": 1720
},
{
"epoch": 0.451452499345721,
"grad_norm": 7.508541107177734,
"learning_rate": 1.0976184244962053e-05,
"logits/chosen": -3.010701894760132,
"logits/rejected": -3.016294479370117,
"logps/chosen": -337.5274963378906,
"logps/rejected": -315.5035705566406,
"loss": 0.5408,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.488309621810913,
"rewards/margins": 1.4098851680755615,
"rewards/rejected": -3.8981945514678955,
"step": 1725
},
{
"epoch": 0.45276105731483907,
"grad_norm": 8.462430953979492,
"learning_rate": 1.0950013085579692e-05,
"logits/chosen": -2.9911746978759766,
"logits/rejected": -3.018477201461792,
"logps/chosen": -363.7921447753906,
"logps/rejected": -355.11370849609375,
"loss": 0.378,
"rewards/accuracies": 0.875,
"rewards/chosen": -2.2156341075897217,
"rewards/margins": 2.0937671661376953,
"rewards/rejected": -4.309401512145996,
"step": 1730
},
{
"epoch": 0.45406961528395706,
"grad_norm": 10.691130638122559,
"learning_rate": 1.0923841926197332e-05,
"logits/chosen": -2.9475789070129395,
"logits/rejected": -3.0337929725646973,
"logps/chosen": -320.70965576171875,
"logps/rejected": -280.87640380859375,
"loss": 0.4101,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.593120574951172,
"rewards/margins": 2.064436197280884,
"rewards/rejected": -4.657556533813477,
"step": 1735
},
{
"epoch": 0.4553781732530751,
"grad_norm": 11.62152099609375,
"learning_rate": 1.0897670766814972e-05,
"logits/chosen": -2.9737391471862793,
"logits/rejected": -2.951845645904541,
"logps/chosen": -318.7091369628906,
"logps/rejected": -347.60369873046875,
"loss": 0.4556,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -3.3738818168640137,
"rewards/margins": 1.9359108209609985,
"rewards/rejected": -5.309792995452881,
"step": 1740
},
{
"epoch": 0.45668673122219317,
"grad_norm": 8.028305053710938,
"learning_rate": 1.087149960743261e-05,
"logits/chosen": -2.9262282848358154,
"logits/rejected": -3.0200741291046143,
"logps/chosen": -309.8990783691406,
"logps/rejected": -284.8616027832031,
"loss": 0.6053,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -3.4491634368896484,
"rewards/margins": 1.6944007873535156,
"rewards/rejected": -5.143564224243164,
"step": 1745
},
{
"epoch": 0.45799528919131116,
"grad_norm": 8.402384757995605,
"learning_rate": 1.084532844805025e-05,
"logits/chosen": -2.9701414108276367,
"logits/rejected": -3.0136189460754395,
"logps/chosen": -313.6144104003906,
"logps/rejected": -314.5592346191406,
"loss": 0.5168,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -3.197741746902466,
"rewards/margins": 1.6113052368164062,
"rewards/rejected": -4.809047222137451,
"step": 1750
},
{
"epoch": 0.4593038471604292,
"grad_norm": 6.7802300453186035,
"learning_rate": 1.081915728866789e-05,
"logits/chosen": -3.0216054916381836,
"logits/rejected": -2.97468638420105,
"logps/chosen": -314.1781311035156,
"logps/rejected": -302.1847229003906,
"loss": 0.4397,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.9468579292297363,
"rewards/margins": 1.776281714439392,
"rewards/rejected": -4.723139762878418,
"step": 1755
},
{
"epoch": 0.46061240512954726,
"grad_norm": 8.857020378112793,
"learning_rate": 1.0792986129285527e-05,
"logits/chosen": -2.968179225921631,
"logits/rejected": -3.0504894256591797,
"logps/chosen": -324.0634460449219,
"logps/rejected": -351.1040344238281,
"loss": 0.4999,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.852264881134033,
"rewards/margins": 1.665531873703003,
"rewards/rejected": -4.517796993255615,
"step": 1760
},
{
"epoch": 0.46192096309866526,
"grad_norm": 10.597983360290527,
"learning_rate": 1.0766814969903167e-05,
"logits/chosen": -2.9628982543945312,
"logits/rejected": -2.9831249713897705,
"logps/chosen": -315.58612060546875,
"logps/rejected": -304.35260009765625,
"loss": 0.5103,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -3.043048143386841,
"rewards/margins": 1.534234881401062,
"rewards/rejected": -4.577282905578613,
"step": 1765
},
{
"epoch": 0.4632295210677833,
"grad_norm": 6.825730323791504,
"learning_rate": 1.0740643810520807e-05,
"logits/chosen": -2.9639222621917725,
"logits/rejected": -2.9877357482910156,
"logps/chosen": -349.4540100097656,
"logps/rejected": -332.49505615234375,
"loss": 0.5114,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -2.425964832305908,
"rewards/margins": 1.4884039163589478,
"rewards/rejected": -3.9143683910369873,
"step": 1770
},
{
"epoch": 0.4645380790369013,
"grad_norm": 7.913449764251709,
"learning_rate": 1.0714472651138445e-05,
"logits/chosen": -3.0387954711914062,
"logits/rejected": -2.9809329509735107,
"logps/chosen": -348.6420593261719,
"logps/rejected": -361.21490478515625,
"loss": 0.4134,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.1633763313293457,
"rewards/margins": 1.880319595336914,
"rewards/rejected": -4.043695449829102,
"step": 1775
},
{
"epoch": 0.46584663700601936,
"grad_norm": 6.947761535644531,
"learning_rate": 1.0688301491756086e-05,
"logits/chosen": -3.0049796104431152,
"logits/rejected": -3.0774781703948975,
"logps/chosen": -317.28387451171875,
"logps/rejected": -278.49609375,
"loss": 0.4753,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.159966468811035,
"rewards/margins": 1.6736412048339844,
"rewards/rejected": -3.8336079120635986,
"step": 1780
},
{
"epoch": 0.4671551949751374,
"grad_norm": 7.269952774047852,
"learning_rate": 1.0662130332373726e-05,
"logits/chosen": -3.0013515949249268,
"logits/rejected": -3.048266649246216,
"logps/chosen": -316.55950927734375,
"logps/rejected": -275.30780029296875,
"loss": 0.4839,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8857390880584717,
"rewards/margins": 1.6397733688354492,
"rewards/rejected": -3.5255126953125,
"step": 1785
},
{
"epoch": 0.4684637529442554,
"grad_norm": 7.783169269561768,
"learning_rate": 1.0635959172991364e-05,
"logits/chosen": -2.9019923210144043,
"logits/rejected": -2.9270970821380615,
"logps/chosen": -232.925048828125,
"logps/rejected": -278.81976318359375,
"loss": 0.5087,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.096328020095825,
"rewards/margins": 1.481418251991272,
"rewards/rejected": -3.5777459144592285,
"step": 1790
},
{
"epoch": 0.46977231091337346,
"grad_norm": 5.545890808105469,
"learning_rate": 1.0609788013609004e-05,
"logits/chosen": -2.8715972900390625,
"logits/rejected": -3.0421102046966553,
"logps/chosen": -276.5980224609375,
"logps/rejected": -261.9891662597656,
"loss": 0.5702,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9896501302719116,
"rewards/margins": 1.5085725784301758,
"rewards/rejected": -3.498222827911377,
"step": 1795
},
{
"epoch": 0.4710808688824915,
"grad_norm": 7.721518039703369,
"learning_rate": 1.0583616854226644e-05,
"logits/chosen": -2.9710397720336914,
"logits/rejected": -3.0190649032592773,
"logps/chosen": -342.3359069824219,
"logps/rejected": -313.8819274902344,
"loss": 0.5255,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6568866968154907,
"rewards/margins": 1.3916410207748413,
"rewards/rejected": -3.048527479171753,
"step": 1800
},
{
"epoch": 0.4723894268516095,
"grad_norm": 6.621272563934326,
"learning_rate": 1.055744569484428e-05,
"logits/chosen": -2.8590407371520996,
"logits/rejected": -2.9601387977600098,
"logps/chosen": -274.2496643066406,
"logps/rejected": -268.46722412109375,
"loss": 0.4795,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -2.092193126678467,
"rewards/margins": 1.7326141595840454,
"rewards/rejected": -3.824807643890381,
"step": 1805
},
{
"epoch": 0.47369798482072756,
"grad_norm": 6.7753801345825195,
"learning_rate": 1.0531274535461921e-05,
"logits/chosen": -2.928391695022583,
"logits/rejected": -3.0080180168151855,
"logps/chosen": -361.22479248046875,
"logps/rejected": -321.481201171875,
"loss": 0.546,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5347732305526733,
"rewards/margins": 1.3956406116485596,
"rewards/rejected": -2.9304137229919434,
"step": 1810
},
{
"epoch": 0.4750065427898456,
"grad_norm": 8.390039443969727,
"learning_rate": 1.0505103376079561e-05,
"logits/chosen": -2.939857244491577,
"logits/rejected": -3.047630786895752,
"logps/chosen": -316.05804443359375,
"logps/rejected": -300.89349365234375,
"loss": 0.3695,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.7353222370147705,
"rewards/margins": 1.6302111148834229,
"rewards/rejected": -3.3655333518981934,
"step": 1815
},
{
"epoch": 0.4763151007589636,
"grad_norm": 5.884995937347412,
"learning_rate": 1.04789322166972e-05,
"logits/chosen": -3.018889904022217,
"logits/rejected": -3.0105044841766357,
"logps/chosen": -300.02117919921875,
"logps/rejected": -284.6014709472656,
"loss": 0.4955,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9147802591323853,
"rewards/margins": 1.5021367073059082,
"rewards/rejected": -3.416916608810425,
"step": 1820
},
{
"epoch": 0.47762365872808166,
"grad_norm": 7.659745216369629,
"learning_rate": 1.045276105731484e-05,
"logits/chosen": -2.9221673011779785,
"logits/rejected": -3.043600082397461,
"logps/chosen": -303.6266784667969,
"logps/rejected": -328.57269287109375,
"loss": 0.6357,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.8578948974609375,
"rewards/margins": 1.369075059890747,
"rewards/rejected": -3.2269699573516846,
"step": 1825
},
{
"epoch": 0.4789322166971997,
"grad_norm": 8.604077339172363,
"learning_rate": 1.042658989793248e-05,
"logits/chosen": -2.9077818393707275,
"logits/rejected": -2.9542300701141357,
"logps/chosen": -319.1806335449219,
"logps/rejected": -304.31854248046875,
"loss": 0.5592,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -2.1609792709350586,
"rewards/margins": 1.1750867366790771,
"rewards/rejected": -3.3360657691955566,
"step": 1830
},
{
"epoch": 0.4802407746663177,
"grad_norm": 7.944966793060303,
"learning_rate": 1.0400418738550118e-05,
"logits/chosen": -2.9522438049316406,
"logits/rejected": -3.004556179046631,
"logps/chosen": -299.261474609375,
"logps/rejected": -300.3248596191406,
"loss": 0.4544,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.2106218338012695,
"rewards/margins": 1.8094288110733032,
"rewards/rejected": -3.020050287246704,
"step": 1835
},
{
"epoch": 0.48154933263543576,
"grad_norm": 7.704443454742432,
"learning_rate": 1.0374247579167758e-05,
"logits/chosen": -2.9210996627807617,
"logits/rejected": -2.998807430267334,
"logps/chosen": -325.70635986328125,
"logps/rejected": -287.83367919921875,
"loss": 0.5732,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7213985919952393,
"rewards/margins": 1.4890426397323608,
"rewards/rejected": -3.2104415893554688,
"step": 1840
},
{
"epoch": 0.4828578906045538,
"grad_norm": 8.773541450500488,
"learning_rate": 1.0348076419785398e-05,
"logits/chosen": -2.9155502319335938,
"logits/rejected": -2.884643793106079,
"logps/chosen": -307.8321228027344,
"logps/rejected": -301.53973388671875,
"loss": 0.5069,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5945721864700317,
"rewards/margins": 1.4130241870880127,
"rewards/rejected": -3.007596492767334,
"step": 1845
},
{
"epoch": 0.4841664485736718,
"grad_norm": 9.956704139709473,
"learning_rate": 1.0321905260403036e-05,
"logits/chosen": -2.9357645511627197,
"logits/rejected": -2.912456512451172,
"logps/chosen": -341.252197265625,
"logps/rejected": -370.76727294921875,
"loss": 0.6932,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.7379004955291748,
"rewards/margins": 1.0990798473358154,
"rewards/rejected": -2.8369803428649902,
"step": 1850
},
{
"epoch": 0.48547500654278986,
"grad_norm": 5.837508201599121,
"learning_rate": 1.0295734101020676e-05,
"logits/chosen": -2.8405072689056396,
"logits/rejected": -2.8357603549957275,
"logps/chosen": -264.3638916015625,
"logps/rejected": -256.2645263671875,
"loss": 0.4158,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5331943035125732,
"rewards/margins": 1.753406286239624,
"rewards/rejected": -3.2866005897521973,
"step": 1855
},
{
"epoch": 0.48678356451190785,
"grad_norm": 5.697160243988037,
"learning_rate": 1.0269562941638316e-05,
"logits/chosen": -2.8203680515289307,
"logits/rejected": -2.922636032104492,
"logps/chosen": -266.6560974121094,
"logps/rejected": -267.4014892578125,
"loss": 0.4632,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.468200445175171,
"rewards/margins": 1.5373655557632446,
"rewards/rejected": -3.005565881729126,
"step": 1860
},
{
"epoch": 0.4880921224810259,
"grad_norm": 6.479247570037842,
"learning_rate": 1.0243391782255957e-05,
"logits/chosen": -2.966212511062622,
"logits/rejected": -2.9255197048187256,
"logps/chosen": -267.66082763671875,
"logps/rejected": -291.5536193847656,
"loss": 0.4129,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4823805093765259,
"rewards/margins": 1.5329614877700806,
"rewards/rejected": -3.0153422355651855,
"step": 1865
},
{
"epoch": 0.48940068045014395,
"grad_norm": 8.965081214904785,
"learning_rate": 1.0217220622873593e-05,
"logits/chosen": -3.036112070083618,
"logits/rejected": -3.028759241104126,
"logps/chosen": -281.88330078125,
"logps/rejected": -290.57879638671875,
"loss": 0.4945,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8533744812011719,
"rewards/margins": 1.3441312313079834,
"rewards/rejected": -3.1975057125091553,
"step": 1870
},
{
"epoch": 0.49070923841926195,
"grad_norm": 9.065515518188477,
"learning_rate": 1.0191049463491233e-05,
"logits/chosen": -2.9614920616149902,
"logits/rejected": -2.9071099758148193,
"logps/chosen": -296.06732177734375,
"logps/rejected": -283.09521484375,
"loss": 0.5388,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -2.123257875442505,
"rewards/margins": 1.5720679759979248,
"rewards/rejected": -3.6953258514404297,
"step": 1875
},
{
"epoch": 0.49201779638838,
"grad_norm": 7.550040245056152,
"learning_rate": 1.0164878304108873e-05,
"logits/chosen": -2.826888084411621,
"logits/rejected": -2.8919272422790527,
"logps/chosen": -321.03424072265625,
"logps/rejected": -347.545654296875,
"loss": 0.4234,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.8993819952011108,
"rewards/margins": 1.9633245468139648,
"rewards/rejected": -3.8627066612243652,
"step": 1880
},
{
"epoch": 0.49332635435749805,
"grad_norm": 9.235989570617676,
"learning_rate": 1.0138707144726512e-05,
"logits/chosen": -2.784430980682373,
"logits/rejected": -2.9601588249206543,
"logps/chosen": -355.8182067871094,
"logps/rejected": -314.7313232421875,
"loss": 0.4828,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.221585273742676,
"rewards/margins": 1.3879907131195068,
"rewards/rejected": -3.609576463699341,
"step": 1885
},
{
"epoch": 0.49463491232661605,
"grad_norm": 8.161865234375,
"learning_rate": 1.0112535985344152e-05,
"logits/chosen": -2.9995341300964355,
"logits/rejected": -3.0830676555633545,
"logps/chosen": -284.3017883300781,
"logps/rejected": -287.4847106933594,
"loss": 0.6595,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8426592350006104,
"rewards/margins": 1.0771812200546265,
"rewards/rejected": -2.9198403358459473,
"step": 1890
},
{
"epoch": 0.4959434702957341,
"grad_norm": 9.801424026489258,
"learning_rate": 1.0086364825961792e-05,
"logits/chosen": -2.9873173236846924,
"logits/rejected": -2.9882421493530273,
"logps/chosen": -266.23260498046875,
"logps/rejected": -304.121337890625,
"loss": 0.506,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6780809164047241,
"rewards/margins": 1.3383406400680542,
"rewards/rejected": -3.0164215564727783,
"step": 1895
},
{
"epoch": 0.49725202826485215,
"grad_norm": 8.835881233215332,
"learning_rate": 1.006019366657943e-05,
"logits/chosen": -2.8944599628448486,
"logits/rejected": -2.927996873855591,
"logps/chosen": -292.03179931640625,
"logps/rejected": -257.7762756347656,
"loss": 0.4871,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8646316528320312,
"rewards/margins": 1.4865410327911377,
"rewards/rejected": -3.351172685623169,
"step": 1900
},
{
"epoch": 0.49856058623397015,
"grad_norm": 6.250274181365967,
"learning_rate": 1.003402250719707e-05,
"logits/chosen": -3.011723518371582,
"logits/rejected": -2.9369235038757324,
"logps/chosen": -291.50860595703125,
"logps/rejected": -304.639404296875,
"loss": 0.6066,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6494977474212646,
"rewards/margins": 1.0280935764312744,
"rewards/rejected": -2.6775918006896973,
"step": 1905
},
{
"epoch": 0.4998691442030882,
"grad_norm": 8.099715232849121,
"learning_rate": 1.000785134781471e-05,
"logits/chosen": -2.9518039226531982,
"logits/rejected": -2.928983688354492,
"logps/chosen": -266.6195983886719,
"logps/rejected": -278.40496826171875,
"loss": 0.5232,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.6815688610076904,
"rewards/margins": 1.2674450874328613,
"rewards/rejected": -2.9490139484405518,
"step": 1910
},
{
"epoch": 0.5011777021722063,
"grad_norm": 7.579314231872559,
"learning_rate": 9.981680188432349e-06,
"logits/chosen": -2.948557138442993,
"logits/rejected": -2.9623727798461914,
"logps/chosen": -265.8631896972656,
"logps/rejected": -255.78439331054688,
"loss": 0.5151,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4069931507110596,
"rewards/margins": 1.1944947242736816,
"rewards/rejected": -2.601487636566162,
"step": 1915
},
{
"epoch": 0.5024862601413242,
"grad_norm": 6.461823463439941,
"learning_rate": 9.955509029049987e-06,
"logits/chosen": -2.9259510040283203,
"logits/rejected": -2.986363649368286,
"logps/chosen": -258.81256103515625,
"logps/rejected": -254.9714813232422,
"loss": 0.5218,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7223705053329468,
"rewards/margins": 1.1844781637191772,
"rewards/rejected": -2.906848669052124,
"step": 1920
},
{
"epoch": 0.5037948181104422,
"grad_norm": 8.694483757019043,
"learning_rate": 9.929337869667627e-06,
"logits/chosen": -2.9782071113586426,
"logits/rejected": -3.065417766571045,
"logps/chosen": -320.1847229003906,
"logps/rejected": -307.56927490234375,
"loss": 0.5354,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4156098365783691,
"rewards/margins": 1.2600579261779785,
"rewards/rejected": -2.6756675243377686,
"step": 1925
},
{
"epoch": 0.5051033760795604,
"grad_norm": 8.963370323181152,
"learning_rate": 9.903166710285267e-06,
"logits/chosen": -2.980201244354248,
"logits/rejected": -3.0007660388946533,
"logps/chosen": -314.35528564453125,
"logps/rejected": -313.6076354980469,
"loss": 0.4637,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.692043662071228,
"rewards/margins": 1.345879077911377,
"rewards/rejected": -3.0379226207733154,
"step": 1930
},
{
"epoch": 0.5064119340486783,
"grad_norm": 4.796964645385742,
"learning_rate": 9.876995550902906e-06,
"logits/chosen": -3.02583646774292,
"logits/rejected": -2.948720932006836,
"logps/chosen": -288.92779541015625,
"logps/rejected": -283.83978271484375,
"loss": 0.742,
"rewards/accuracies": 0.625,
"rewards/chosen": -2.043752431869507,
"rewards/margins": 0.7483810186386108,
"rewards/rejected": -2.7921335697174072,
"step": 1935
},
{
"epoch": 0.5077204920177963,
"grad_norm": 8.783916473388672,
"learning_rate": 9.850824391520546e-06,
"logits/chosen": -2.9932680130004883,
"logits/rejected": -2.9787936210632324,
"logps/chosen": -312.96728515625,
"logps/rejected": -295.27716064453125,
"loss": 0.6287,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8972053527832031,
"rewards/margins": 1.0939700603485107,
"rewards/rejected": -2.991175651550293,
"step": 1940
},
{
"epoch": 0.5090290499869145,
"grad_norm": 9.5989351272583,
"learning_rate": 9.824653232138186e-06,
"logits/chosen": -2.9638044834136963,
"logits/rejected": -3.0555965900421143,
"logps/chosen": -278.03289794921875,
"logps/rejected": -268.63946533203125,
"loss": 0.5472,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7958492040634155,
"rewards/margins": 1.3007774353027344,
"rewards/rejected": -3.0966267585754395,
"step": 1945
},
{
"epoch": 0.5103376079560324,
"grad_norm": 8.681360244750977,
"learning_rate": 9.798482072755824e-06,
"logits/chosen": -2.989262342453003,
"logits/rejected": -2.974743127822876,
"logps/chosen": -254.3424072265625,
"logps/rejected": -264.46270751953125,
"loss": 0.4737,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.655994176864624,
"rewards/margins": 1.5236682891845703,
"rewards/rejected": -3.1796624660491943,
"step": 1950
},
{
"epoch": 0.5116461659251504,
"grad_norm": 5.015665531158447,
"learning_rate": 9.772310913373462e-06,
"logits/chosen": -2.7786598205566406,
"logits/rejected": -2.894463062286377,
"logps/chosen": -297.5106201171875,
"logps/rejected": -282.90631103515625,
"loss": 0.4604,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.898776650428772,
"rewards/margins": 1.5486475229263306,
"rewards/rejected": -3.4474239349365234,
"step": 1955
},
{
"epoch": 0.5129547238942685,
"grad_norm": 6.68208122253418,
"learning_rate": 9.746139753991103e-06,
"logits/chosen": -2.8744349479675293,
"logits/rejected": -2.9163289070129395,
"logps/chosen": -219.7449951171875,
"logps/rejected": -280.440673828125,
"loss": 0.4416,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8795044422149658,
"rewards/margins": 1.2291165590286255,
"rewards/rejected": -3.1086208820343018,
"step": 1960
},
{
"epoch": 0.5142632818633865,
"grad_norm": 6.96512508392334,
"learning_rate": 9.719968594608743e-06,
"logits/chosen": -3.036520004272461,
"logits/rejected": -3.0686442852020264,
"logps/chosen": -270.5203857421875,
"logps/rejected": -249.11648559570312,
"loss": 0.4188,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7024023532867432,
"rewards/margins": 1.3636255264282227,
"rewards/rejected": -3.066027879714966,
"step": 1965
},
{
"epoch": 0.5155718398325045,
"grad_norm": 7.726841926574707,
"learning_rate": 9.693797435226381e-06,
"logits/chosen": -2.8714888095855713,
"logits/rejected": -2.9929287433624268,
"logps/chosen": -347.3719482421875,
"logps/rejected": -307.1537170410156,
"loss": 0.5226,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9428255558013916,
"rewards/margins": 1.3433595895767212,
"rewards/rejected": -3.2861855030059814,
"step": 1970
},
{
"epoch": 0.5168803978016226,
"grad_norm": 9.329137802124023,
"learning_rate": 9.667626275844021e-06,
"logits/chosen": -2.8178086280822754,
"logits/rejected": -2.9708805084228516,
"logps/chosen": -261.303955078125,
"logps/rejected": -262.2368469238281,
"loss": 0.5773,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8946533203125,
"rewards/margins": 1.4651789665222168,
"rewards/rejected": -3.3598320484161377,
"step": 1975
},
{
"epoch": 0.5181889557707406,
"grad_norm": 8.616991996765137,
"learning_rate": 9.64145511646166e-06,
"logits/chosen": -2.7979893684387207,
"logits/rejected": -2.8203110694885254,
"logps/chosen": -295.17608642578125,
"logps/rejected": -279.5721740722656,
"loss": 0.5596,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -2.0426454544067383,
"rewards/margins": 1.3784496784210205,
"rewards/rejected": -3.4210948944091797,
"step": 1980
},
{
"epoch": 0.5194975137398586,
"grad_norm": 12.504921913146973,
"learning_rate": 9.6152839570793e-06,
"logits/chosen": -2.8116393089294434,
"logits/rejected": -2.9229259490966797,
"logps/chosen": -360.49200439453125,
"logps/rejected": -346.6287841796875,
"loss": 0.6584,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7081302404403687,
"rewards/margins": 1.1589677333831787,
"rewards/rejected": -2.867098093032837,
"step": 1985
},
{
"epoch": 0.5208060717089767,
"grad_norm": 9.76529312133789,
"learning_rate": 9.58911279769694e-06,
"logits/chosen": -2.9271957874298096,
"logits/rejected": -2.9286372661590576,
"logps/chosen": -304.6205749511719,
"logps/rejected": -270.85345458984375,
"loss": 0.5885,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.750661849975586,
"rewards/margins": 1.2978880405426025,
"rewards/rejected": -3.0485496520996094,
"step": 1990
},
{
"epoch": 0.5221146296780947,
"grad_norm": 6.337282180786133,
"learning_rate": 9.562941638314578e-06,
"logits/chosen": -2.8846378326416016,
"logits/rejected": -3.0143656730651855,
"logps/chosen": -344.0283203125,
"logps/rejected": -331.4059753417969,
"loss": 0.5132,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4416577816009521,
"rewards/margins": 1.4556671380996704,
"rewards/rejected": -2.897324800491333,
"step": 1995
},
{
"epoch": 0.5234231876472127,
"grad_norm": 6.361517429351807,
"learning_rate": 9.536770478932218e-06,
"logits/chosen": -2.839524507522583,
"logits/rejected": -2.855935573577881,
"logps/chosen": -287.5265808105469,
"logps/rejected": -308.75701904296875,
"loss": 0.5585,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3233356475830078,
"rewards/margins": 1.5039829015731812,
"rewards/rejected": -2.8273186683654785,
"step": 2000
},
{
"epoch": 0.5234231876472127,
"eval_logits/chosen": -2.9509613513946533,
"eval_logits/rejected": -2.9671716690063477,
"eval_logps/chosen": -295.8841552734375,
"eval_logps/rejected": -290.96044921875,
"eval_loss": 0.5118626952171326,
"eval_rewards/accuracies": 0.7434999942779541,
"eval_rewards/chosen": -1.2650753259658813,
"eval_rewards/margins": 1.4375065565109253,
"eval_rewards/rejected": -2.7025818824768066,
"eval_runtime": 764.2545,
"eval_samples_per_second": 2.617,
"eval_steps_per_second": 0.327,
"step": 2000
},
{
"epoch": 0.5247317456163308,
"grad_norm": 8.742295265197754,
"learning_rate": 9.510599319549856e-06,
"logits/chosen": -2.8847405910491943,
"logits/rejected": -2.829425811767578,
"logps/chosen": -284.7441711425781,
"logps/rejected": -327.94415283203125,
"loss": 0.5754,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0865317583084106,
"rewards/margins": 1.353413701057434,
"rewards/rejected": -2.439945697784424,
"step": 2005
},
{
"epoch": 0.5260403035854488,
"grad_norm": 9.733824729919434,
"learning_rate": 9.484428160167496e-06,
"logits/chosen": -2.9705443382263184,
"logits/rejected": -2.9440743923187256,
"logps/chosen": -305.75921630859375,
"logps/rejected": -309.7073669433594,
"loss": 0.5781,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1363924741744995,
"rewards/margins": 1.0744402408599854,
"rewards/rejected": -2.2108325958251953,
"step": 2010
},
{
"epoch": 0.5273488615545668,
"grad_norm": 5.931386470794678,
"learning_rate": 9.458257000785136e-06,
"logits/chosen": -2.9103846549987793,
"logits/rejected": -2.9428811073303223,
"logps/chosen": -325.4049987792969,
"logps/rejected": -326.3647155761719,
"loss": 0.4821,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.080718994140625,
"rewards/margins": 1.3079969882965088,
"rewards/rejected": -2.388716220855713,
"step": 2015
},
{
"epoch": 0.528657419523685,
"grad_norm": 8.640279769897461,
"learning_rate": 9.432085841402775e-06,
"logits/chosen": -2.983135938644409,
"logits/rejected": -3.0264086723327637,
"logps/chosen": -285.4974060058594,
"logps/rejected": -277.7830505371094,
"loss": 0.5414,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.9102684259414673,
"rewards/margins": 1.2370755672454834,
"rewards/rejected": -2.147343873977661,
"step": 2020
},
{
"epoch": 0.5299659774928029,
"grad_norm": 6.43532133102417,
"learning_rate": 9.405914682020413e-06,
"logits/chosen": -2.9441254138946533,
"logits/rejected": -2.9812066555023193,
"logps/chosen": -336.6988525390625,
"logps/rejected": -311.3836364746094,
"loss": 0.627,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.9982717633247375,
"rewards/margins": 1.2580807209014893,
"rewards/rejected": -2.256352663040161,
"step": 2025
},
{
"epoch": 0.5312745354619209,
"grad_norm": 7.047455787658691,
"learning_rate": 9.379743522638053e-06,
"logits/chosen": -2.9485602378845215,
"logits/rejected": -2.8787522315979004,
"logps/chosen": -304.49786376953125,
"logps/rejected": -285.79425048828125,
"loss": 0.6052,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8985692858695984,
"rewards/margins": 1.0381746292114258,
"rewards/rejected": -1.936743974685669,
"step": 2030
},
{
"epoch": 0.532583093431039,
"grad_norm": 8.520474433898926,
"learning_rate": 9.353572363255693e-06,
"logits/chosen": -2.8559398651123047,
"logits/rejected": -2.9105234146118164,
"logps/chosen": -301.0296936035156,
"logps/rejected": -269.9303894042969,
"loss": 0.5853,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1256568431854248,
"rewards/margins": 1.10567307472229,
"rewards/rejected": -2.231329917907715,
"step": 2035
},
{
"epoch": 0.533891651400157,
"grad_norm": 7.910058498382568,
"learning_rate": 9.327401203873332e-06,
"logits/chosen": -2.9811348915100098,
"logits/rejected": -3.01061749458313,
"logps/chosen": -310.69586181640625,
"logps/rejected": -335.1217956542969,
"loss": 0.6182,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.7057936191558838,
"rewards/margins": 0.8416263461112976,
"rewards/rejected": -1.547419786453247,
"step": 2040
},
{
"epoch": 0.535200209369275,
"grad_norm": 6.437031269073486,
"learning_rate": 9.301230044490972e-06,
"logits/chosen": -2.955460786819458,
"logits/rejected": -3.0287487506866455,
"logps/chosen": -302.23388671875,
"logps/rejected": -284.850830078125,
"loss": 0.4264,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.5403744578361511,
"rewards/margins": 1.4383584260940552,
"rewards/rejected": -1.9787328243255615,
"step": 2045
},
{
"epoch": 0.5365087673383931,
"grad_norm": 6.636228084564209,
"learning_rate": 9.27505888510861e-06,
"logits/chosen": -2.859696388244629,
"logits/rejected": -2.9273557662963867,
"logps/chosen": -282.9736022949219,
"logps/rejected": -305.96087646484375,
"loss": 0.4716,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.6416420340538025,
"rewards/margins": 1.3998219966888428,
"rewards/rejected": -2.041464328765869,
"step": 2050
},
{
"epoch": 0.5378173253075111,
"grad_norm": 7.346982955932617,
"learning_rate": 9.24888772572625e-06,
"logits/chosen": -2.8341944217681885,
"logits/rejected": -2.946746349334717,
"logps/chosen": -308.4425354003906,
"logps/rejected": -269.16448974609375,
"loss": 0.5097,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9091157913208008,
"rewards/margins": 1.4609925746917725,
"rewards/rejected": -2.3701086044311523,
"step": 2055
},
{
"epoch": 0.5391258832766291,
"grad_norm": 5.3710432052612305,
"learning_rate": 9.22271656634389e-06,
"logits/chosen": -2.867131471633911,
"logits/rejected": -2.7901980876922607,
"logps/chosen": -294.47900390625,
"logps/rejected": -304.6952819824219,
"loss": 0.4905,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1196249723434448,
"rewards/margins": 1.551390290260315,
"rewards/rejected": -2.6710152626037598,
"step": 2060
},
{
"epoch": 0.5404344412457471,
"grad_norm": 8.228592872619629,
"learning_rate": 9.196545406961529e-06,
"logits/chosen": -2.9531893730163574,
"logits/rejected": -2.912006378173828,
"logps/chosen": -335.7592468261719,
"logps/rejected": -351.94818115234375,
"loss": 0.3817,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1705877780914307,
"rewards/margins": 2.073190689086914,
"rewards/rejected": -3.2437782287597656,
"step": 2065
},
{
"epoch": 0.5417429992148652,
"grad_norm": 6.4846954345703125,
"learning_rate": 9.170374247579169e-06,
"logits/chosen": -3.026458263397217,
"logits/rejected": -3.085371971130371,
"logps/chosen": -276.64141845703125,
"logps/rejected": -233.27804565429688,
"loss": 0.4266,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.0601603984832764,
"rewards/margins": 1.4462592601776123,
"rewards/rejected": -2.5064198970794678,
"step": 2070
},
{
"epoch": 0.5430515571839832,
"grad_norm": 9.074657440185547,
"learning_rate": 9.144203088196809e-06,
"logits/chosen": -2.9421777725219727,
"logits/rejected": -2.927706480026245,
"logps/chosen": -303.47467041015625,
"logps/rejected": -296.0120544433594,
"loss": 0.363,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3702189922332764,
"rewards/margins": 2.219372272491455,
"rewards/rejected": -3.5895907878875732,
"step": 2075
},
{
"epoch": 0.5443601151531012,
"grad_norm": 11.485058784484863,
"learning_rate": 9.118031928814447e-06,
"logits/chosen": -3.0031659603118896,
"logits/rejected": -3.0807993412017822,
"logps/chosen": -277.64984130859375,
"logps/rejected": -287.6283264160156,
"loss": 0.5127,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5950052738189697,
"rewards/margins": 1.7552427053451538,
"rewards/rejected": -3.350247859954834,
"step": 2080
},
{
"epoch": 0.5456686731222193,
"grad_norm": 7.12365198135376,
"learning_rate": 9.091860769432087e-06,
"logits/chosen": -2.9222354888916016,
"logits/rejected": -3.0730066299438477,
"logps/chosen": -268.171142578125,
"logps/rejected": -258.3194885253906,
"loss": 0.4631,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5083897113800049,
"rewards/margins": 1.9156631231307983,
"rewards/rejected": -3.4240524768829346,
"step": 2085
},
{
"epoch": 0.5469772310913373,
"grad_norm": 7.024919033050537,
"learning_rate": 9.065689610049726e-06,
"logits/chosen": -2.9282279014587402,
"logits/rejected": -2.9903883934020996,
"logps/chosen": -245.131103515625,
"logps/rejected": -287.2030334472656,
"loss": 0.5131,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8197906017303467,
"rewards/margins": 1.6753877401351929,
"rewards/rejected": -3.49517822265625,
"step": 2090
},
{
"epoch": 0.5482857890604553,
"grad_norm": 5.535000324249268,
"learning_rate": 9.039518450667366e-06,
"logits/chosen": -2.9842965602874756,
"logits/rejected": -2.9172840118408203,
"logps/chosen": -354.969970703125,
"logps/rejected": -327.3636779785156,
"loss": 0.4529,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4617191553115845,
"rewards/margins": 1.7047052383422852,
"rewards/rejected": -3.166424512863159,
"step": 2095
},
{
"epoch": 0.5495943470295734,
"grad_norm": 7.004070281982422,
"learning_rate": 9.013347291285006e-06,
"logits/chosen": -2.9624948501586914,
"logits/rejected": -2.85925030708313,
"logps/chosen": -310.21282958984375,
"logps/rejected": -320.2611389160156,
"loss": 0.5602,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.095109462738037,
"rewards/margins": 1.4026702642440796,
"rewards/rejected": -3.4977798461914062,
"step": 2100
},
{
"epoch": 0.5509029049986914,
"grad_norm": 13.739896774291992,
"learning_rate": 8.987176131902644e-06,
"logits/chosen": -2.917407751083374,
"logits/rejected": -2.9701895713806152,
"logps/chosen": -318.69793701171875,
"logps/rejected": -318.5054626464844,
"loss": 0.5382,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4192984104156494,
"rewards/margins": 1.9310028553009033,
"rewards/rejected": -3.3503010272979736,
"step": 2105
},
{
"epoch": 0.5522114629678094,
"grad_norm": 8.459038734436035,
"learning_rate": 8.961004972520282e-06,
"logits/chosen": -2.9396519660949707,
"logits/rejected": -3.022418975830078,
"logps/chosen": -284.6721496582031,
"logps/rejected": -299.3384704589844,
"loss": 0.5301,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4773406982421875,
"rewards/margins": 1.503132939338684,
"rewards/rejected": -2.980473756790161,
"step": 2110
},
{
"epoch": 0.5535200209369275,
"grad_norm": 6.923474311828613,
"learning_rate": 8.934833813137923e-06,
"logits/chosen": -2.9355552196502686,
"logits/rejected": -2.9257876873016357,
"logps/chosen": -308.79180908203125,
"logps/rejected": -338.4895324707031,
"loss": 0.5048,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2020846605300903,
"rewards/margins": 1.280178427696228,
"rewards/rejected": -2.4822630882263184,
"step": 2115
},
{
"epoch": 0.5548285789060455,
"grad_norm": 10.126736640930176,
"learning_rate": 8.908662653755563e-06,
"logits/chosen": -2.876392126083374,
"logits/rejected": -3.0241641998291016,
"logps/chosen": -279.66033935546875,
"logps/rejected": -269.89288330078125,
"loss": 0.5188,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1738862991333008,
"rewards/margins": 1.4325240850448608,
"rewards/rejected": -2.606410264968872,
"step": 2120
},
{
"epoch": 0.5561371368751635,
"grad_norm": 5.744837760925293,
"learning_rate": 8.882491494373201e-06,
"logits/chosen": -2.7355411052703857,
"logits/rejected": -2.8887104988098145,
"logps/chosen": -256.62213134765625,
"logps/rejected": -286.48992919921875,
"loss": 0.4672,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1047096252441406,
"rewards/margins": 1.5396578311920166,
"rewards/rejected": -2.6443674564361572,
"step": 2125
},
{
"epoch": 0.5574456948442816,
"grad_norm": 6.99560022354126,
"learning_rate": 8.856320334990841e-06,
"logits/chosen": -3.060870409011841,
"logits/rejected": -3.0697546005249023,
"logps/chosen": -310.7414245605469,
"logps/rejected": -238.05508422851562,
"loss": 0.454,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8422433137893677,
"rewards/margins": 1.337949514389038,
"rewards/rejected": -2.180192708969116,
"step": 2130
},
{
"epoch": 0.5587542528133996,
"grad_norm": 7.061825275421143,
"learning_rate": 8.83014917560848e-06,
"logits/chosen": -2.8841323852539062,
"logits/rejected": -2.943366527557373,
"logps/chosen": -227.40780639648438,
"logps/rejected": -212.0872039794922,
"loss": 0.5566,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1873077154159546,
"rewards/margins": 1.2954754829406738,
"rewards/rejected": -2.482783317565918,
"step": 2135
},
{
"epoch": 0.5600628107825176,
"grad_norm": 3.9094793796539307,
"learning_rate": 8.80397801622612e-06,
"logits/chosen": -2.954484224319458,
"logits/rejected": -3.0178635120391846,
"logps/chosen": -259.7063903808594,
"logps/rejected": -256.8832702636719,
"loss": 0.3804,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.817324161529541,
"rewards/margins": 1.6765632629394531,
"rewards/rejected": -2.493887424468994,
"step": 2140
},
{
"epoch": 0.5613713687516357,
"grad_norm": 7.536386966705322,
"learning_rate": 8.77780685684376e-06,
"logits/chosen": -2.761610507965088,
"logits/rejected": -2.877763271331787,
"logps/chosen": -257.71246337890625,
"logps/rejected": -227.89743041992188,
"loss": 0.4166,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9659379124641418,
"rewards/margins": 1.762495994567871,
"rewards/rejected": -2.728433847427368,
"step": 2145
},
{
"epoch": 0.5626799267207537,
"grad_norm": 7.488414764404297,
"learning_rate": 8.751635697461398e-06,
"logits/chosen": -2.770033836364746,
"logits/rejected": -3.0158371925354004,
"logps/chosen": -282.5020446777344,
"logps/rejected": -273.234619140625,
"loss": 0.5068,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9604713320732117,
"rewards/margins": 1.5011874437332153,
"rewards/rejected": -2.4616587162017822,
"step": 2150
},
{
"epoch": 0.5639884846898717,
"grad_norm": 7.021855354309082,
"learning_rate": 8.725464538079038e-06,
"logits/chosen": -2.9612388610839844,
"logits/rejected": -2.963198184967041,
"logps/chosen": -299.5247802734375,
"logps/rejected": -272.65948486328125,
"loss": 0.5239,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0747485160827637,
"rewards/margins": 1.7154130935668945,
"rewards/rejected": -2.790161609649658,
"step": 2155
},
{
"epoch": 0.5652970426589898,
"grad_norm": 5.9548797607421875,
"learning_rate": 8.699293378696676e-06,
"logits/chosen": -3.007736921310425,
"logits/rejected": -3.0112733840942383,
"logps/chosen": -292.77142333984375,
"logps/rejected": -297.82061767578125,
"loss": 0.4134,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.9671595692634583,
"rewards/margins": 2.1301705837249756,
"rewards/rejected": -3.0973305702209473,
"step": 2160
},
{
"epoch": 0.5666056006281078,
"grad_norm": 7.590676307678223,
"learning_rate": 8.673122219314316e-06,
"logits/chosen": -2.9755873680114746,
"logits/rejected": -3.037572145462036,
"logps/chosen": -314.57171630859375,
"logps/rejected": -303.40411376953125,
"loss": 0.6617,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9973914623260498,
"rewards/margins": 1.1834059953689575,
"rewards/rejected": -3.180797815322876,
"step": 2165
},
{
"epoch": 0.5679141585972258,
"grad_norm": 8.90278148651123,
"learning_rate": 8.646951059931956e-06,
"logits/chosen": -3.0032241344451904,
"logits/rejected": -3.049996852874756,
"logps/chosen": -278.5682067871094,
"logps/rejected": -271.05926513671875,
"loss": 0.5459,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4959290027618408,
"rewards/margins": 1.4511077404022217,
"rewards/rejected": -2.9470365047454834,
"step": 2170
},
{
"epoch": 0.5692227165663439,
"grad_norm": 7.02294397354126,
"learning_rate": 8.620779900549595e-06,
"logits/chosen": -2.896298885345459,
"logits/rejected": -2.9629626274108887,
"logps/chosen": -305.6729431152344,
"logps/rejected": -343.16595458984375,
"loss": 0.5035,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7510077953338623,
"rewards/margins": 1.6487032175064087,
"rewards/rejected": -3.3997111320495605,
"step": 2175
},
{
"epoch": 0.5705312745354619,
"grad_norm": 7.937122821807861,
"learning_rate": 8.594608741167235e-06,
"logits/chosen": -2.9339687824249268,
"logits/rejected": -3.004333972930908,
"logps/chosen": -328.10791015625,
"logps/rejected": -312.27032470703125,
"loss": 0.3867,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7036577463150024,
"rewards/margins": 1.9725004434585571,
"rewards/rejected": -3.6761581897735596,
"step": 2180
},
{
"epoch": 0.5718398325045799,
"grad_norm": 7.029597759246826,
"learning_rate": 8.568437581784875e-06,
"logits/chosen": -2.831653118133545,
"logits/rejected": -2.8481671810150146,
"logps/chosen": -266.60882568359375,
"logps/rejected": -274.7930603027344,
"loss": 0.5076,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6596145629882812,
"rewards/margins": 1.5107403993606567,
"rewards/rejected": -3.1703550815582275,
"step": 2185
},
{
"epoch": 0.573148390473698,
"grad_norm": 6.836891174316406,
"learning_rate": 8.542266422402513e-06,
"logits/chosen": -2.917339563369751,
"logits/rejected": -3.0334200859069824,
"logps/chosen": -238.88229370117188,
"logps/rejected": -249.5050048828125,
"loss": 0.5092,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2485872507095337,
"rewards/margins": 1.4144742488861084,
"rewards/rejected": -2.6630613803863525,
"step": 2190
},
{
"epoch": 0.574456948442816,
"grad_norm": 7.6597981452941895,
"learning_rate": 8.516095263020152e-06,
"logits/chosen": -2.942396402359009,
"logits/rejected": -3.004131555557251,
"logps/chosen": -263.2633361816406,
"logps/rejected": -259.5253601074219,
"loss": 0.5657,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4041615724563599,
"rewards/margins": 1.3960957527160645,
"rewards/rejected": -2.8002572059631348,
"step": 2195
},
{
"epoch": 0.575765506411934,
"grad_norm": 10.346620559692383,
"learning_rate": 8.489924103637792e-06,
"logits/chosen": -2.958832025527954,
"logits/rejected": -2.9972152709960938,
"logps/chosen": -311.97686767578125,
"logps/rejected": -293.7490234375,
"loss": 0.4452,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.709890365600586,
"rewards/margins": 1.7548198699951172,
"rewards/rejected": -3.4647107124328613,
"step": 2200
},
{
"epoch": 0.5770740643810521,
"grad_norm": 6.80871057510376,
"learning_rate": 8.463752944255432e-06,
"logits/chosen": -2.995701313018799,
"logits/rejected": -2.9955978393554688,
"logps/chosen": -238.9085235595703,
"logps/rejected": -260.15234375,
"loss": 0.5445,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4946893453598022,
"rewards/margins": 1.4255342483520508,
"rewards/rejected": -2.9202239513397217,
"step": 2205
},
{
"epoch": 0.5783826223501701,
"grad_norm": 6.468989849090576,
"learning_rate": 8.43758178487307e-06,
"logits/chosen": -3.0018184185028076,
"logits/rejected": -3.012239694595337,
"logps/chosen": -313.91973876953125,
"logps/rejected": -300.5749206542969,
"loss": 0.4718,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.8711507320404053,
"rewards/margins": 1.6339337825775146,
"rewards/rejected": -3.505084276199341,
"step": 2210
},
{
"epoch": 0.5796911803192881,
"grad_norm": 7.442930221557617,
"learning_rate": 8.41141062549071e-06,
"logits/chosen": -2.836812734603882,
"logits/rejected": -2.8636796474456787,
"logps/chosen": -264.2229309082031,
"logps/rejected": -278.0809631347656,
"loss": 0.5215,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6197808980941772,
"rewards/margins": 1.3861788511276245,
"rewards/rejected": -3.005959987640381,
"step": 2215
},
{
"epoch": 0.5809997382884062,
"grad_norm": 9.090330123901367,
"learning_rate": 8.385239466108349e-06,
"logits/chosen": -2.896070957183838,
"logits/rejected": -2.8910112380981445,
"logps/chosen": -294.02801513671875,
"logps/rejected": -274.5065002441406,
"loss": 0.5265,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9419463872909546,
"rewards/margins": 1.3138511180877686,
"rewards/rejected": -3.2557971477508545,
"step": 2220
},
{
"epoch": 0.5823082962575242,
"grad_norm": 8.379128456115723,
"learning_rate": 8.359068306725989e-06,
"logits/chosen": -2.844043016433716,
"logits/rejected": -2.9461586475372314,
"logps/chosen": -375.4545593261719,
"logps/rejected": -330.26068115234375,
"loss": 0.5607,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5715759992599487,
"rewards/margins": 1.545506477355957,
"rewards/rejected": -3.1170825958251953,
"step": 2225
},
{
"epoch": 0.5836168542266422,
"grad_norm": 5.378319263458252,
"learning_rate": 8.332897147343629e-06,
"logits/chosen": -2.9515392780303955,
"logits/rejected": -3.003692150115967,
"logps/chosen": -303.4684753417969,
"logps/rejected": -290.7479248046875,
"loss": 0.456,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7767654657363892,
"rewards/margins": 1.4634473323822021,
"rewards/rejected": -3.2402126789093018,
"step": 2230
},
{
"epoch": 0.5849254121957602,
"grad_norm": 10.692007064819336,
"learning_rate": 8.306725987961267e-06,
"logits/chosen": -2.96927809715271,
"logits/rejected": -2.9832751750946045,
"logps/chosen": -301.08135986328125,
"logps/rejected": -289.53436279296875,
"loss": 0.5926,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.165919065475464,
"rewards/margins": 1.5472595691680908,
"rewards/rejected": -3.7131786346435547,
"step": 2235
},
{
"epoch": 0.5862339701648783,
"grad_norm": 6.6370744705200195,
"learning_rate": 8.280554828578907e-06,
"logits/chosen": -2.9266231060028076,
"logits/rejected": -2.936668872833252,
"logps/chosen": -312.54803466796875,
"logps/rejected": -291.4410400390625,
"loss": 0.5544,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.9401369094848633,
"rewards/margins": 1.5303126573562622,
"rewards/rejected": -3.470449924468994,
"step": 2240
},
{
"epoch": 0.5875425281339963,
"grad_norm": 4.578351020812988,
"learning_rate": 8.254383669196546e-06,
"logits/chosen": -2.8608431816101074,
"logits/rejected": -2.8830883502960205,
"logps/chosen": -331.26593017578125,
"logps/rejected": -305.5675048828125,
"loss": 0.3923,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8561537265777588,
"rewards/margins": 1.607722520828247,
"rewards/rejected": -3.463876724243164,
"step": 2245
},
{
"epoch": 0.5888510861031143,
"grad_norm": 6.522406578063965,
"learning_rate": 8.228212509814186e-06,
"logits/chosen": -2.880575656890869,
"logits/rejected": -2.9558682441711426,
"logps/chosen": -341.096923828125,
"logps/rejected": -299.71624755859375,
"loss": 0.567,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.9508142471313477,
"rewards/margins": 1.496654748916626,
"rewards/rejected": -3.4474685192108154,
"step": 2250
},
{
"epoch": 0.5901596440722324,
"grad_norm": 6.557366371154785,
"learning_rate": 8.202041350431826e-06,
"logits/chosen": -2.893763542175293,
"logits/rejected": -3.021374225616455,
"logps/chosen": -321.5784606933594,
"logps/rejected": -259.36566162109375,
"loss": 0.4864,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7807306051254272,
"rewards/margins": 1.5349364280700684,
"rewards/rejected": -3.315666913986206,
"step": 2255
},
{
"epoch": 0.5914682020413504,
"grad_norm": 10.000592231750488,
"learning_rate": 8.175870191049464e-06,
"logits/chosen": -2.7134666442871094,
"logits/rejected": -2.776644229888916,
"logps/chosen": -329.2943420410156,
"logps/rejected": -301.40338134765625,
"loss": 0.525,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6413131952285767,
"rewards/margins": 1.7001183032989502,
"rewards/rejected": -3.3414313793182373,
"step": 2260
},
{
"epoch": 0.5927767600104684,
"grad_norm": 7.016106128692627,
"learning_rate": 8.149699031667102e-06,
"logits/chosen": -2.9477972984313965,
"logits/rejected": -2.94490909576416,
"logps/chosen": -298.5036926269531,
"logps/rejected": -280.6138610839844,
"loss": 0.4627,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7625675201416016,
"rewards/margins": 1.7733052968978882,
"rewards/rejected": -3.5358726978302,
"step": 2265
},
{
"epoch": 0.5940853179795865,
"grad_norm": 7.888564586639404,
"learning_rate": 8.123527872284743e-06,
"logits/chosen": -2.8331191539764404,
"logits/rejected": -2.9611730575561523,
"logps/chosen": -284.2607727050781,
"logps/rejected": -262.9834289550781,
"loss": 0.4227,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5213582515716553,
"rewards/margins": 1.6578248739242554,
"rewards/rejected": -3.179183006286621,
"step": 2270
},
{
"epoch": 0.5953938759487045,
"grad_norm": 6.094775199890137,
"learning_rate": 8.097356712902383e-06,
"logits/chosen": -2.918649196624756,
"logits/rejected": -2.9431166648864746,
"logps/chosen": -273.8592834472656,
"logps/rejected": -277.9981689453125,
"loss": 0.577,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7764129638671875,
"rewards/margins": 1.1202478408813477,
"rewards/rejected": -2.896660566329956,
"step": 2275
},
{
"epoch": 0.5967024339178225,
"grad_norm": 7.925708293914795,
"learning_rate": 8.071185553520021e-06,
"logits/chosen": -2.9345788955688477,
"logits/rejected": -3.008537769317627,
"logps/chosen": -307.4665832519531,
"logps/rejected": -274.0155029296875,
"loss": 0.5001,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6642906665802002,
"rewards/margins": 1.5600162744522095,
"rewards/rejected": -3.2243072986602783,
"step": 2280
},
{
"epoch": 0.5980109918869406,
"grad_norm": 9.510141372680664,
"learning_rate": 8.045014394137661e-06,
"logits/chosen": -2.7968087196350098,
"logits/rejected": -2.7900938987731934,
"logps/chosen": -273.7165832519531,
"logps/rejected": -263.81915283203125,
"loss": 0.4475,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.9605255126953125,
"rewards/margins": 1.7081210613250732,
"rewards/rejected": -3.6686465740203857,
"step": 2285
},
{
"epoch": 0.5993195498560586,
"grad_norm": 7.476105213165283,
"learning_rate": 8.0188432347553e-06,
"logits/chosen": -2.966008424758911,
"logits/rejected": -3.010863780975342,
"logps/chosen": -301.1141052246094,
"logps/rejected": -314.04248046875,
"loss": 0.4834,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9317734241485596,
"rewards/margins": 1.6113536357879639,
"rewards/rejected": -3.5431270599365234,
"step": 2290
},
{
"epoch": 0.6006281078251766,
"grad_norm": 5.335666656494141,
"learning_rate": 7.99267207537294e-06,
"logits/chosen": -2.929396152496338,
"logits/rejected": -2.97169828414917,
"logps/chosen": -287.72955322265625,
"logps/rejected": -318.4091796875,
"loss": 0.5622,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.0816383361816406,
"rewards/margins": 1.6077334880828857,
"rewards/rejected": -3.6893723011016846,
"step": 2295
},
{
"epoch": 0.6019366657942947,
"grad_norm": 8.660011291503906,
"learning_rate": 7.96650091599058e-06,
"logits/chosen": -2.955652952194214,
"logits/rejected": -2.9135398864746094,
"logps/chosen": -282.47552490234375,
"logps/rejected": -285.1706237792969,
"loss": 0.5139,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -2.121570587158203,
"rewards/margins": 1.3121795654296875,
"rewards/rejected": -3.433750629425049,
"step": 2300
},
{
"epoch": 0.6032452237634127,
"grad_norm": 7.009401321411133,
"learning_rate": 7.940329756608218e-06,
"logits/chosen": -2.8541007041931152,
"logits/rejected": -2.9473915100097656,
"logps/chosen": -335.2101745605469,
"logps/rejected": -323.11773681640625,
"loss": 0.4064,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4906498193740845,
"rewards/margins": 1.9026981592178345,
"rewards/rejected": -3.393347978591919,
"step": 2305
},
{
"epoch": 0.6045537817325307,
"grad_norm": 5.929949760437012,
"learning_rate": 7.914158597225858e-06,
"logits/chosen": -2.790144205093384,
"logits/rejected": -2.7649049758911133,
"logps/chosen": -233.0009002685547,
"logps/rejected": -255.8800506591797,
"loss": 0.502,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5350371599197388,
"rewards/margins": 1.4691892862319946,
"rewards/rejected": -3.0042262077331543,
"step": 2310
},
{
"epoch": 0.6058623397016488,
"grad_norm": 5.333659648895264,
"learning_rate": 7.887987437843498e-06,
"logits/chosen": -2.83431339263916,
"logits/rejected": -2.869320869445801,
"logps/chosen": -291.4210510253906,
"logps/rejected": -289.72772216796875,
"loss": 0.4205,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.24900221824646,
"rewards/margins": 1.8472312688827515,
"rewards/rejected": -3.096233606338501,
"step": 2315
},
{
"epoch": 0.6071708976707668,
"grad_norm": 7.005737781524658,
"learning_rate": 7.861816278461136e-06,
"logits/chosen": -2.858786106109619,
"logits/rejected": -2.824657917022705,
"logps/chosen": -307.7120056152344,
"logps/rejected": -309.0006103515625,
"loss": 0.5332,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.736547827720642,
"rewards/margins": 1.4633195400238037,
"rewards/rejected": -3.1998674869537354,
"step": 2320
},
{
"epoch": 0.6084794556398848,
"grad_norm": 7.5584330558776855,
"learning_rate": 7.835645119078776e-06,
"logits/chosen": -2.8849313259124756,
"logits/rejected": -2.8943004608154297,
"logps/chosen": -256.06292724609375,
"logps/rejected": -318.1280212402344,
"loss": 0.4454,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5195752382278442,
"rewards/margins": 1.7365375757217407,
"rewards/rejected": -3.256112575531006,
"step": 2325
},
{
"epoch": 0.6097880136090029,
"grad_norm": 9.819621086120605,
"learning_rate": 7.809473959696415e-06,
"logits/chosen": -2.734415292739868,
"logits/rejected": -2.8705554008483887,
"logps/chosen": -326.63543701171875,
"logps/rejected": -277.5613708496094,
"loss": 0.6395,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3960602283477783,
"rewards/margins": 1.3911415338516235,
"rewards/rejected": -2.7872016429901123,
"step": 2330
},
{
"epoch": 0.6110965715781209,
"grad_norm": 10.025915145874023,
"learning_rate": 7.783302800314055e-06,
"logits/chosen": -2.8270926475524902,
"logits/rejected": -2.8613481521606445,
"logps/chosen": -306.28033447265625,
"logps/rejected": -287.93597412109375,
"loss": 0.5738,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4758658409118652,
"rewards/margins": 1.4804503917694092,
"rewards/rejected": -2.9563159942626953,
"step": 2335
},
{
"epoch": 0.6124051295472389,
"grad_norm": 6.9185261726379395,
"learning_rate": 7.757131640931695e-06,
"logits/chosen": -2.9443321228027344,
"logits/rejected": -2.948289155960083,
"logps/chosen": -375.3352355957031,
"logps/rejected": -333.23834228515625,
"loss": 0.4571,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5819398164749146,
"rewards/margins": 1.5913922786712646,
"rewards/rejected": -3.1733319759368896,
"step": 2340
},
{
"epoch": 0.613713687516357,
"grad_norm": 6.314789772033691,
"learning_rate": 7.730960481549333e-06,
"logits/chosen": -2.9200820922851562,
"logits/rejected": -2.8788719177246094,
"logps/chosen": -298.35296630859375,
"logps/rejected": -275.4773864746094,
"loss": 0.7795,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.692670464515686,
"rewards/margins": 0.9236106872558594,
"rewards/rejected": -2.616281270980835,
"step": 2345
},
{
"epoch": 0.615022245485475,
"grad_norm": 6.88268518447876,
"learning_rate": 7.704789322166972e-06,
"logits/chosen": -2.885225296020508,
"logits/rejected": -2.945056676864624,
"logps/chosen": -291.22589111328125,
"logps/rejected": -264.28485107421875,
"loss": 0.621,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.813675880432129,
"rewards/margins": 1.1586147546768188,
"rewards/rejected": -2.9722909927368164,
"step": 2350
},
{
"epoch": 0.616330803454593,
"grad_norm": 7.270668983459473,
"learning_rate": 7.678618162784612e-06,
"logits/chosen": -2.767096996307373,
"logits/rejected": -2.9178149700164795,
"logps/chosen": -273.98443603515625,
"logps/rejected": -270.39849853515625,
"loss": 0.4296,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.385209083557129,
"rewards/margins": 1.4976346492767334,
"rewards/rejected": -2.882843494415283,
"step": 2355
},
{
"epoch": 0.6176393614237111,
"grad_norm": 6.517707824707031,
"learning_rate": 7.652447003402252e-06,
"logits/chosen": -2.9354186058044434,
"logits/rejected": -2.8951194286346436,
"logps/chosen": -293.964599609375,
"logps/rejected": -319.67596435546875,
"loss": 0.5387,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7208137512207031,
"rewards/margins": 1.1261407136917114,
"rewards/rejected": -2.846954822540283,
"step": 2360
},
{
"epoch": 0.6189479193928291,
"grad_norm": 3.2250170707702637,
"learning_rate": 7.62627584401989e-06,
"logits/chosen": -2.8758339881896973,
"logits/rejected": -2.9541773796081543,
"logps/chosen": -283.46185302734375,
"logps/rejected": -290.2981872558594,
"loss": 0.4126,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4215718507766724,
"rewards/margins": 1.676491379737854,
"rewards/rejected": -3.0980632305145264,
"step": 2365
},
{
"epoch": 0.6202564773619471,
"grad_norm": 7.372755527496338,
"learning_rate": 7.60010468463753e-06,
"logits/chosen": -2.9308109283447266,
"logits/rejected": -2.946016788482666,
"logps/chosen": -314.4420471191406,
"logps/rejected": -285.1692810058594,
"loss": 0.339,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.5270148515701294,
"rewards/margins": 1.9710546731948853,
"rewards/rejected": -3.4980697631835938,
"step": 2370
},
{
"epoch": 0.6215650353310652,
"grad_norm": 7.630323886871338,
"learning_rate": 7.5739335252551695e-06,
"logits/chosen": -2.897289752960205,
"logits/rejected": -2.9930167198181152,
"logps/chosen": -347.9987487792969,
"logps/rejected": -320.02008056640625,
"loss": 0.5088,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.688640832901001,
"rewards/margins": 1.6700690984725952,
"rewards/rejected": -3.3587098121643066,
"step": 2375
},
{
"epoch": 0.6228735933001832,
"grad_norm": 8.262691497802734,
"learning_rate": 7.547762365872809e-06,
"logits/chosen": -2.941046953201294,
"logits/rejected": -2.9043381214141846,
"logps/chosen": -265.45703125,
"logps/rejected": -254.2328338623047,
"loss": 0.5536,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.5332114696502686,
"rewards/margins": 1.3792575597763062,
"rewards/rejected": -2.912468910217285,
"step": 2380
},
{
"epoch": 0.6241821512693012,
"grad_norm": 8.945664405822754,
"learning_rate": 7.521591206490449e-06,
"logits/chosen": -2.7841739654541016,
"logits/rejected": -2.9014525413513184,
"logps/chosen": -250.78091430664062,
"logps/rejected": -283.4075927734375,
"loss": 0.5543,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4417940378189087,
"rewards/margins": 1.4987192153930664,
"rewards/rejected": -2.9405131340026855,
"step": 2385
},
{
"epoch": 0.6254907092384192,
"grad_norm": 5.672665596008301,
"learning_rate": 7.495420047108087e-06,
"logits/chosen": -2.9190821647644043,
"logits/rejected": -2.9655425548553467,
"logps/chosen": -308.78057861328125,
"logps/rejected": -286.87030029296875,
"loss": 0.5464,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.820185661315918,
"rewards/margins": 1.1439754962921143,
"rewards/rejected": -2.9641611576080322,
"step": 2390
},
{
"epoch": 0.6267992672075373,
"grad_norm": 6.946528911590576,
"learning_rate": 7.469248887725726e-06,
"logits/chosen": -2.8645572662353516,
"logits/rejected": -2.8304049968719482,
"logps/chosen": -278.38128662109375,
"logps/rejected": -312.0920104980469,
"loss": 0.4588,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3060362339019775,
"rewards/margins": 1.906618356704712,
"rewards/rejected": -3.2126548290252686,
"step": 2395
},
{
"epoch": 0.6281078251766553,
"grad_norm": 6.621123313903809,
"learning_rate": 7.4430777283433664e-06,
"logits/chosen": -2.8144335746765137,
"logits/rejected": -2.8863790035247803,
"logps/chosen": -327.2550354003906,
"logps/rejected": -266.57574462890625,
"loss": 0.5097,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4507298469543457,
"rewards/margins": 1.4654420614242554,
"rewards/rejected": -2.9161720275878906,
"step": 2400
},
{
"epoch": 0.6281078251766553,
"eval_logits/chosen": -2.921830177307129,
"eval_logits/rejected": -2.93990159034729,
"eval_logps/chosen": -297.5841064453125,
"eval_logps/rejected": -293.12017822265625,
"eval_loss": 0.5002692937850952,
"eval_rewards/accuracies": 0.75,
"eval_rewards/chosen": -1.435073971748352,
"eval_rewards/margins": 1.4834831953048706,
"eval_rewards/rejected": -2.9185571670532227,
"eval_runtime": 764.0029,
"eval_samples_per_second": 2.618,
"eval_steps_per_second": 0.327,
"step": 2400
},
{
"epoch": 0.6294163831457733,
"grad_norm": 7.230893135070801,
"learning_rate": 7.416906568961006e-06,
"logits/chosen": -2.885646104812622,
"logits/rejected": -2.8738009929656982,
"logps/chosen": -277.877685546875,
"logps/rejected": -286.29608154296875,
"loss": 0.6041,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5320603847503662,
"rewards/margins": 1.3425400257110596,
"rewards/rejected": -2.8746001720428467,
"step": 2405
},
{
"epoch": 0.6307249411148914,
"grad_norm": 6.221070289611816,
"learning_rate": 7.390735409578646e-06,
"logits/chosen": -2.8773863315582275,
"logits/rejected": -2.948465347290039,
"logps/chosen": -354.95001220703125,
"logps/rejected": -330.3343505859375,
"loss": 0.7765,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.9595705270767212,
"rewards/margins": 0.9573482275009155,
"rewards/rejected": -2.9169187545776367,
"step": 2410
},
{
"epoch": 0.6320334990840094,
"grad_norm": 4.76958703994751,
"learning_rate": 7.364564250196284e-06,
"logits/chosen": -2.9388175010681152,
"logits/rejected": -2.9442152976989746,
"logps/chosen": -263.76995849609375,
"logps/rejected": -264.36749267578125,
"loss": 0.469,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.533185362815857,
"rewards/margins": 1.467279076576233,
"rewards/rejected": -3.000464677810669,
"step": 2415
},
{
"epoch": 0.6333420570531274,
"grad_norm": 8.054102897644043,
"learning_rate": 7.338393090813923e-06,
"logits/chosen": -2.8760969638824463,
"logits/rejected": -2.887807607650757,
"logps/chosen": -289.54315185546875,
"logps/rejected": -323.89935302734375,
"loss": 0.4297,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.7106348276138306,
"rewards/margins": 1.607092261314392,
"rewards/rejected": -3.3177268505096436,
"step": 2420
},
{
"epoch": 0.6346506150222455,
"grad_norm": 6.590506076812744,
"learning_rate": 7.312221931431563e-06,
"logits/chosen": -2.808767318725586,
"logits/rejected": -2.9431216716766357,
"logps/chosen": -298.54888916015625,
"logps/rejected": -359.0498962402344,
"loss": 0.379,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.2185065746307373,
"rewards/margins": 1.578076720237732,
"rewards/rejected": -2.796583414077759,
"step": 2425
},
{
"epoch": 0.6359591729913635,
"grad_norm": 7.335874557495117,
"learning_rate": 7.286050772049203e-06,
"logits/chosen": -2.8771159648895264,
"logits/rejected": -2.7269673347473145,
"logps/chosen": -253.0265655517578,
"logps/rejected": -315.6542663574219,
"loss": 0.459,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6146316528320312,
"rewards/margins": 1.7264766693115234,
"rewards/rejected": -3.3411078453063965,
"step": 2430
},
{
"epoch": 0.6372677309604815,
"grad_norm": 7.070531368255615,
"learning_rate": 7.259879612666841e-06,
"logits/chosen": -2.8449649810791016,
"logits/rejected": -2.9761483669281006,
"logps/chosen": -274.96630859375,
"logps/rejected": -268.19732666015625,
"loss": 0.5524,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5227558612823486,
"rewards/margins": 1.515989899635315,
"rewards/rejected": -3.0387461185455322,
"step": 2435
},
{
"epoch": 0.6385762889295996,
"grad_norm": 5.620815277099609,
"learning_rate": 7.233708453284481e-06,
"logits/chosen": -2.837573528289795,
"logits/rejected": -2.907860040664673,
"logps/chosen": -312.64727783203125,
"logps/rejected": -317.07342529296875,
"loss": 0.5055,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5986802577972412,
"rewards/margins": 1.4222062826156616,
"rewards/rejected": -3.020886182785034,
"step": 2440
},
{
"epoch": 0.6398848468987176,
"grad_norm": 8.898514747619629,
"learning_rate": 7.20753729390212e-06,
"logits/chosen": -2.9366869926452637,
"logits/rejected": -3.005056381225586,
"logps/chosen": -362.00030517578125,
"logps/rejected": -289.4993896484375,
"loss": 0.4732,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4922668933868408,
"rewards/margins": 1.557114601135254,
"rewards/rejected": -3.049381732940674,
"step": 2445
},
{
"epoch": 0.6411934048678356,
"grad_norm": 5.531732082366943,
"learning_rate": 7.1813661345197595e-06,
"logits/chosen": -2.8961660861968994,
"logits/rejected": -2.9190566539764404,
"logps/chosen": -337.93487548828125,
"logps/rejected": -288.6690979003906,
"loss": 0.4368,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5329985618591309,
"rewards/margins": 1.7288877964019775,
"rewards/rejected": -3.2618861198425293,
"step": 2450
},
{
"epoch": 0.6425019628369537,
"grad_norm": 6.2724103927612305,
"learning_rate": 7.1551949751373995e-06,
"logits/chosen": -2.958855152130127,
"logits/rejected": -3.0128567218780518,
"logps/chosen": -313.1339111328125,
"logps/rejected": -276.3824768066406,
"loss": 0.5257,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.249989628791809,
"rewards/margins": 1.6770312786102295,
"rewards/rejected": -2.927020788192749,
"step": 2455
},
{
"epoch": 0.6438105208060717,
"grad_norm": 7.424379825592041,
"learning_rate": 7.129023815755039e-06,
"logits/chosen": -2.9298288822174072,
"logits/rejected": -2.917689085006714,
"logps/chosen": -288.6741638183594,
"logps/rejected": -274.7695007324219,
"loss": 0.49,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7392343282699585,
"rewards/margins": 1.7145700454711914,
"rewards/rejected": -3.4538040161132812,
"step": 2460
},
{
"epoch": 0.6451190787751897,
"grad_norm": 5.782430171966553,
"learning_rate": 7.102852656372677e-06,
"logits/chosen": -2.796736478805542,
"logits/rejected": -2.917767286300659,
"logps/chosen": -316.8404541015625,
"logps/rejected": -304.5469970703125,
"loss": 0.3817,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.597495675086975,
"rewards/margins": 2.14239501953125,
"rewards/rejected": -3.7398905754089355,
"step": 2465
},
{
"epoch": 0.6464276367443078,
"grad_norm": 7.320804595947266,
"learning_rate": 7.076681496990317e-06,
"logits/chosen": -2.8980050086975098,
"logits/rejected": -2.9889309406280518,
"logps/chosen": -309.5592346191406,
"logps/rejected": -293.1075134277344,
"loss": 0.4587,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.9160000085830688,
"rewards/margins": 1.8903604745864868,
"rewards/rejected": -3.8063607215881348,
"step": 2470
},
{
"epoch": 0.6477361947134258,
"grad_norm": 7.583347797393799,
"learning_rate": 7.050510337607956e-06,
"logits/chosen": -2.924474000930786,
"logits/rejected": -2.9702935218811035,
"logps/chosen": -315.4239807128906,
"logps/rejected": -279.01507568359375,
"loss": 0.4572,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.9099395275115967,
"rewards/margins": 1.9472777843475342,
"rewards/rejected": -3.857217311859131,
"step": 2475
},
{
"epoch": 0.6490447526825438,
"grad_norm": 3.8960468769073486,
"learning_rate": 7.0243391782255965e-06,
"logits/chosen": -2.9497616291046143,
"logits/rejected": -3.0569794178009033,
"logps/chosen": -294.25347900390625,
"logps/rejected": -303.3873596191406,
"loss": 0.5478,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.9578750133514404,
"rewards/margins": 1.8239530324935913,
"rewards/rejected": -3.7818284034729004,
"step": 2480
},
{
"epoch": 0.6503533106516619,
"grad_norm": 7.425008296966553,
"learning_rate": 6.998168018843236e-06,
"logits/chosen": -2.807569980621338,
"logits/rejected": -2.941020965576172,
"logps/chosen": -360.5503845214844,
"logps/rejected": -316.94677734375,
"loss": 0.3726,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.6083217859268188,
"rewards/margins": 2.2246527671813965,
"rewards/rejected": -3.8329741954803467,
"step": 2485
},
{
"epoch": 0.6516618686207799,
"grad_norm": 8.695091247558594,
"learning_rate": 6.971996859460874e-06,
"logits/chosen": -2.9309356212615967,
"logits/rejected": -2.965848922729492,
"logps/chosen": -295.5977783203125,
"logps/rejected": -287.81689453125,
"loss": 0.4912,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.896004319190979,
"rewards/margins": 2.422163724899292,
"rewards/rejected": -4.318167686462402,
"step": 2490
},
{
"epoch": 0.6529704265898979,
"grad_norm": 9.302384376525879,
"learning_rate": 6.945825700078514e-06,
"logits/chosen": -2.886300563812256,
"logits/rejected": -2.930643320083618,
"logps/chosen": -296.8878173828125,
"logps/rejected": -288.6578063964844,
"loss": 0.487,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.846461534500122,
"rewards/margins": 1.7389333248138428,
"rewards/rejected": -3.585395097732544,
"step": 2495
},
{
"epoch": 0.654278984559016,
"grad_norm": 9.560452461242676,
"learning_rate": 6.919654540696153e-06,
"logits/chosen": -2.8469934463500977,
"logits/rejected": -2.935584306716919,
"logps/chosen": -260.8996887207031,
"logps/rejected": -299.7215270996094,
"loss": 0.5645,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.634810447692871,
"rewards/margins": 1.784527063369751,
"rewards/rejected": -3.419337511062622,
"step": 2500
},
{
"epoch": 0.655587542528134,
"grad_norm": 7.051764011383057,
"learning_rate": 6.8934833813137926e-06,
"logits/chosen": -2.8771421909332275,
"logits/rejected": -2.951645612716675,
"logps/chosen": -305.71038818359375,
"logps/rejected": -284.0111389160156,
"loss": 0.4771,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5477997064590454,
"rewards/margins": 1.621638298034668,
"rewards/rejected": -3.169437885284424,
"step": 2505
},
{
"epoch": 0.656896100497252,
"grad_norm": 10.278838157653809,
"learning_rate": 6.867312221931433e-06,
"logits/chosen": -2.9116673469543457,
"logits/rejected": -2.973714828491211,
"logps/chosen": -270.4404296875,
"logps/rejected": -254.27865600585938,
"loss": 0.5929,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5017471313476562,
"rewards/margins": 1.1838232278823853,
"rewards/rejected": -2.685570240020752,
"step": 2510
},
{
"epoch": 0.6582046584663701,
"grad_norm": 8.013856887817383,
"learning_rate": 6.841141062549072e-06,
"logits/chosen": -2.8651554584503174,
"logits/rejected": -2.9246227741241455,
"logps/chosen": -227.64547729492188,
"logps/rejected": -250.6405487060547,
"loss": 0.6189,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.8577998876571655,
"rewards/margins": 0.9316193461418152,
"rewards/rejected": -2.789419174194336,
"step": 2515
},
{
"epoch": 0.6595132164354881,
"grad_norm": 6.389255523681641,
"learning_rate": 6.81496990316671e-06,
"logits/chosen": -2.7882442474365234,
"logits/rejected": -2.774643898010254,
"logps/chosen": -261.3345642089844,
"logps/rejected": -283.9215087890625,
"loss": 0.49,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8135063648223877,
"rewards/margins": 1.7753257751464844,
"rewards/rejected": -3.588832139968872,
"step": 2520
},
{
"epoch": 0.6608217744046061,
"grad_norm": 8.19912338256836,
"learning_rate": 6.78879874378435e-06,
"logits/chosen": -2.8803114891052246,
"logits/rejected": -2.9215333461761475,
"logps/chosen": -280.89788818359375,
"logps/rejected": -320.07012939453125,
"loss": 0.5114,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8473488092422485,
"rewards/margins": 1.4395393133163452,
"rewards/rejected": -3.2868876457214355,
"step": 2525
},
{
"epoch": 0.6621303323737242,
"grad_norm": 12.39254379272461,
"learning_rate": 6.7626275844019895e-06,
"logits/chosen": -2.866048812866211,
"logits/rejected": -2.9401180744171143,
"logps/chosen": -286.77490234375,
"logps/rejected": -260.3507995605469,
"loss": 0.4503,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.7392785549163818,
"rewards/margins": 1.815791130065918,
"rewards/rejected": -3.5550696849823,
"step": 2530
},
{
"epoch": 0.6634388903428422,
"grad_norm": 7.732004165649414,
"learning_rate": 6.736456425019629e-06,
"logits/chosen": -2.8534507751464844,
"logits/rejected": -2.924248218536377,
"logps/chosen": -319.3000793457031,
"logps/rejected": -301.00396728515625,
"loss": 0.5023,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.768906593322754,
"rewards/margins": 1.7894645929336548,
"rewards/rejected": -3.5583713054656982,
"step": 2535
},
{
"epoch": 0.6647474483119602,
"grad_norm": 7.866215705871582,
"learning_rate": 6.710285265637269e-06,
"logits/chosen": -3.003950595855713,
"logits/rejected": -2.9520037174224854,
"logps/chosen": -321.5911560058594,
"logps/rejected": -313.03955078125,
"loss": 0.5501,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6573905944824219,
"rewards/margins": 1.614404320716858,
"rewards/rejected": -3.2717947959899902,
"step": 2540
},
{
"epoch": 0.6660560062810783,
"grad_norm": 8.200257301330566,
"learning_rate": 6.684114106254907e-06,
"logits/chosen": -2.8988094329833984,
"logits/rejected": -2.9182348251342773,
"logps/chosen": -269.07464599609375,
"logps/rejected": -266.5125427246094,
"loss": 0.4299,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3188421726226807,
"rewards/margins": 2.008021354675293,
"rewards/rejected": -3.3268637657165527,
"step": 2545
},
{
"epoch": 0.6673645642501963,
"grad_norm": 8.644819259643555,
"learning_rate": 6.657942946872546e-06,
"logits/chosen": -2.9794552326202393,
"logits/rejected": -2.959059238433838,
"logps/chosen": -286.81549072265625,
"logps/rejected": -325.7430114746094,
"loss": 0.4827,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3967323303222656,
"rewards/margins": 1.7587554454803467,
"rewards/rejected": -3.155487537384033,
"step": 2550
},
{
"epoch": 0.6686731222193143,
"grad_norm": 6.768131256103516,
"learning_rate": 6.6317717874901865e-06,
"logits/chosen": -2.858863592147827,
"logits/rejected": -2.9572906494140625,
"logps/chosen": -252.885498046875,
"logps/rejected": -306.57196044921875,
"loss": 0.4259,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3627374172210693,
"rewards/margins": 2.1756911277770996,
"rewards/rejected": -3.538428783416748,
"step": 2555
},
{
"epoch": 0.6699816801884323,
"grad_norm": 5.3841552734375,
"learning_rate": 6.605600628107826e-06,
"logits/chosen": -2.863058090209961,
"logits/rejected": -2.9404101371765137,
"logps/chosen": -292.02935791015625,
"logps/rejected": -301.8074645996094,
"loss": 0.4912,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5280544757843018,
"rewards/margins": 1.760963797569275,
"rewards/rejected": -3.289018154144287,
"step": 2560
},
{
"epoch": 0.6712902381575504,
"grad_norm": 4.600503921508789,
"learning_rate": 6.579429468725466e-06,
"logits/chosen": -2.857856273651123,
"logits/rejected": -2.9453234672546387,
"logps/chosen": -288.824951171875,
"logps/rejected": -269.5613708496094,
"loss": 0.519,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3591705560684204,
"rewards/margins": 1.5891754627227783,
"rewards/rejected": -2.948345899581909,
"step": 2565
},
{
"epoch": 0.6725987961266684,
"grad_norm": 7.518556118011475,
"learning_rate": 6.553258309343105e-06,
"logits/chosen": -2.8256914615631104,
"logits/rejected": -2.893322467803955,
"logps/chosen": -361.60540771484375,
"logps/rejected": -349.1207275390625,
"loss": 0.4789,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.478563904762268,
"rewards/margins": 1.865069031715393,
"rewards/rejected": -3.3436331748962402,
"step": 2570
},
{
"epoch": 0.6739073540957864,
"grad_norm": 4.997129440307617,
"learning_rate": 6.527087149960743e-06,
"logits/chosen": -2.830671787261963,
"logits/rejected": -2.8592820167541504,
"logps/chosen": -265.6669921875,
"logps/rejected": -297.15960693359375,
"loss": 0.3297,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.161731481552124,
"rewards/margins": 2.2399954795837402,
"rewards/rejected": -3.4017269611358643,
"step": 2575
},
{
"epoch": 0.6752159120649045,
"grad_norm": 6.114622592926025,
"learning_rate": 6.500915990578383e-06,
"logits/chosen": -2.8342204093933105,
"logits/rejected": -2.8946008682250977,
"logps/chosen": -295.8099670410156,
"logps/rejected": -326.4892578125,
"loss": 0.4826,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4532636404037476,
"rewards/margins": 1.522308111190796,
"rewards/rejected": -2.975571870803833,
"step": 2580
},
{
"epoch": 0.6765244700340225,
"grad_norm": 8.778839111328125,
"learning_rate": 6.474744831196023e-06,
"logits/chosen": -2.8909201622009277,
"logits/rejected": -2.9403858184814453,
"logps/chosen": -303.0553283691406,
"logps/rejected": -298.9825439453125,
"loss": 0.5317,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.4531861543655396,
"rewards/margins": 1.1554243564605713,
"rewards/rejected": -2.6086106300354004,
"step": 2585
},
{
"epoch": 0.6778330280031405,
"grad_norm": 7.011162757873535,
"learning_rate": 6.448573671813662e-06,
"logits/chosen": -2.8256630897521973,
"logits/rejected": -2.8981237411499023,
"logps/chosen": -290.13531494140625,
"logps/rejected": -302.042724609375,
"loss": 0.4532,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2221115827560425,
"rewards/margins": 1.5408296585083008,
"rewards/rejected": -2.762941360473633,
"step": 2590
},
{
"epoch": 0.6791415859722586,
"grad_norm": 7.359402179718018,
"learning_rate": 6.422402512431302e-06,
"logits/chosen": -2.989375591278076,
"logits/rejected": -2.956820487976074,
"logps/chosen": -244.23779296875,
"logps/rejected": -300.09613037109375,
"loss": 0.5183,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2675294876098633,
"rewards/margins": 1.5874407291412354,
"rewards/rejected": -2.8549702167510986,
"step": 2595
},
{
"epoch": 0.6804501439413766,
"grad_norm": 9.219598770141602,
"learning_rate": 6.39623135304894e-06,
"logits/chosen": -2.8202285766601562,
"logits/rejected": -2.9479031562805176,
"logps/chosen": -292.47711181640625,
"logps/rejected": -301.48541259765625,
"loss": 0.4481,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2682809829711914,
"rewards/margins": 1.6929352283477783,
"rewards/rejected": -2.9612162113189697,
"step": 2600
},
{
"epoch": 0.6817587019104946,
"grad_norm": 10.345588684082031,
"learning_rate": 6.3700601936665795e-06,
"logits/chosen": -2.9142959117889404,
"logits/rejected": -3.0136189460754395,
"logps/chosen": -275.33282470703125,
"logps/rejected": -271.1238708496094,
"loss": 0.6053,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4676587581634521,
"rewards/margins": 1.2222703695297241,
"rewards/rejected": -2.689929485321045,
"step": 2605
},
{
"epoch": 0.6830672598796127,
"grad_norm": 8.903675079345703,
"learning_rate": 6.3438890342842196e-06,
"logits/chosen": -2.9312522411346436,
"logits/rejected": -2.9643070697784424,
"logps/chosen": -268.38250732421875,
"logps/rejected": -276.27752685546875,
"loss": 0.5743,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5821937322616577,
"rewards/margins": 1.11379075050354,
"rewards/rejected": -2.695984363555908,
"step": 2610
},
{
"epoch": 0.6843758178487307,
"grad_norm": 4.592360973358154,
"learning_rate": 6.317717874901859e-06,
"logits/chosen": -2.953341007232666,
"logits/rejected": -2.873232364654541,
"logps/chosen": -290.78289794921875,
"logps/rejected": -318.7877502441406,
"loss": 0.3272,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.0863611698150635,
"rewards/margins": 1.956146240234375,
"rewards/rejected": -3.0425074100494385,
"step": 2615
},
{
"epoch": 0.6856843758178487,
"grad_norm": 8.31113338470459,
"learning_rate": 6.291546715519498e-06,
"logits/chosen": -2.826333999633789,
"logits/rejected": -2.865309953689575,
"logps/chosen": -328.55352783203125,
"logps/rejected": -317.73321533203125,
"loss": 0.4531,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1655009984970093,
"rewards/margins": 1.6823265552520752,
"rewards/rejected": -2.847827434539795,
"step": 2620
},
{
"epoch": 0.6869929337869668,
"grad_norm": 5.8730292320251465,
"learning_rate": 6.265375556137138e-06,
"logits/chosen": -2.9004733562469482,
"logits/rejected": -2.986215114593506,
"logps/chosen": -304.14996337890625,
"logps/rejected": -325.6918029785156,
"loss": 0.4951,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4205595254898071,
"rewards/margins": 1.4959120750427246,
"rewards/rejected": -2.9164717197418213,
"step": 2625
},
{
"epoch": 0.6883014917560848,
"grad_norm": 6.657158851623535,
"learning_rate": 6.2392043967547764e-06,
"logits/chosen": -2.896409034729004,
"logits/rejected": -2.907116413116455,
"logps/chosen": -279.0505676269531,
"logps/rejected": -285.6734619140625,
"loss": 0.4821,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2515028715133667,
"rewards/margins": 1.3621675968170166,
"rewards/rejected": -2.613670587539673,
"step": 2630
},
{
"epoch": 0.6896100497252028,
"grad_norm": 6.316776275634766,
"learning_rate": 6.213033237372416e-06,
"logits/chosen": -2.944918155670166,
"logits/rejected": -3.0123748779296875,
"logps/chosen": -355.7431945800781,
"logps/rejected": -290.6505126953125,
"loss": 0.4345,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2796906232833862,
"rewards/margins": 1.6674339771270752,
"rewards/rejected": -2.947124719619751,
"step": 2635
},
{
"epoch": 0.6909186076943209,
"grad_norm": 6.7956037521362305,
"learning_rate": 6.186862077990056e-06,
"logits/chosen": -2.8235573768615723,
"logits/rejected": -2.815488815307617,
"logps/chosen": -261.06329345703125,
"logps/rejected": -269.0946350097656,
"loss": 0.3785,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.296617031097412,
"rewards/margins": 1.810760736465454,
"rewards/rejected": -3.1073780059814453,
"step": 2640
},
{
"epoch": 0.6922271656634389,
"grad_norm": 9.322579383850098,
"learning_rate": 6.160690918607695e-06,
"logits/chosen": -2.9193060398101807,
"logits/rejected": -3.0121703147888184,
"logps/chosen": -290.3641052246094,
"logps/rejected": -252.94198608398438,
"loss": 0.376,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0163379907608032,
"rewards/margins": 1.8828426599502563,
"rewards/rejected": -2.8991806507110596,
"step": 2645
},
{
"epoch": 0.6935357236325569,
"grad_norm": 8.177329063415527,
"learning_rate": 6.134519759225335e-06,
"logits/chosen": -2.927708387374878,
"logits/rejected": -3.0037269592285156,
"logps/chosen": -308.3323059082031,
"logps/rejected": -318.09002685546875,
"loss": 0.5581,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1352941989898682,
"rewards/margins": 1.3868353366851807,
"rewards/rejected": -2.522129535675049,
"step": 2650
},
{
"epoch": 0.694844281601675,
"grad_norm": 10.346382141113281,
"learning_rate": 6.108348599842973e-06,
"logits/chosen": -2.8416390419006348,
"logits/rejected": -2.8420791625976562,
"logps/chosen": -267.317626953125,
"logps/rejected": -282.65570068359375,
"loss": 0.6123,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2157745361328125,
"rewards/margins": 1.3261229991912842,
"rewards/rejected": -2.5418972969055176,
"step": 2655
},
{
"epoch": 0.696152839570793,
"grad_norm": 7.9833455085754395,
"learning_rate": 6.082177440460613e-06,
"logits/chosen": -3.010012626647949,
"logits/rejected": -3.0121254920959473,
"logps/chosen": -314.1331787109375,
"logps/rejected": -295.4376525878906,
"loss": 0.5446,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2808277606964111,
"rewards/margins": 1.409279227256775,
"rewards/rejected": -2.6901068687438965,
"step": 2660
},
{
"epoch": 0.697461397539911,
"grad_norm": 6.993275165557861,
"learning_rate": 6.056006281078253e-06,
"logits/chosen": -3.0581610202789307,
"logits/rejected": -3.0452892780303955,
"logps/chosen": -276.1371154785156,
"logps/rejected": -248.65353393554688,
"loss": 0.4886,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2057183980941772,
"rewards/margins": 1.5118528604507446,
"rewards/rejected": -2.717571258544922,
"step": 2665
},
{
"epoch": 0.6987699555090291,
"grad_norm": 6.968574047088623,
"learning_rate": 6.029835121695892e-06,
"logits/chosen": -2.9653797149658203,
"logits/rejected": -3.0148282051086426,
"logps/chosen": -324.75762939453125,
"logps/rejected": -289.0790100097656,
"loss": 0.5447,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.534473180770874,
"rewards/margins": 1.1837376356124878,
"rewards/rejected": -2.7182106971740723,
"step": 2670
},
{
"epoch": 0.7000785134781471,
"grad_norm": 7.007041931152344,
"learning_rate": 6.00366396231353e-06,
"logits/chosen": -2.9610750675201416,
"logits/rejected": -2.9877383708953857,
"logps/chosen": -269.05517578125,
"logps/rejected": -309.22509765625,
"loss": 0.4013,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.3385134935379028,
"rewards/margins": 1.7016382217407227,
"rewards/rejected": -3.040152072906494,
"step": 2675
},
{
"epoch": 0.7013870714472651,
"grad_norm": 8.326202392578125,
"learning_rate": 5.977492802931171e-06,
"logits/chosen": -2.8807883262634277,
"logits/rejected": -2.998851776123047,
"logps/chosen": -299.6264953613281,
"logps/rejected": -335.3392639160156,
"loss": 0.6032,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4939355850219727,
"rewards/margins": 1.3633782863616943,
"rewards/rejected": -2.857314109802246,
"step": 2680
},
{
"epoch": 0.7026956294163832,
"grad_norm": 7.929780006408691,
"learning_rate": 5.9513216435488095e-06,
"logits/chosen": -2.8801910877227783,
"logits/rejected": -2.940908432006836,
"logps/chosen": -303.807861328125,
"logps/rejected": -265.91217041015625,
"loss": 0.5652,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0966110229492188,
"rewards/margins": 1.154617428779602,
"rewards/rejected": -2.2512283325195312,
"step": 2685
},
{
"epoch": 0.7040041873855012,
"grad_norm": 6.519760608673096,
"learning_rate": 5.925150484166449e-06,
"logits/chosen": -2.935680389404297,
"logits/rejected": -3.0190281867980957,
"logps/chosen": -292.3984069824219,
"logps/rejected": -277.78302001953125,
"loss": 0.6154,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2248005867004395,
"rewards/margins": 1.3764398097991943,
"rewards/rejected": -2.601240634918213,
"step": 2690
},
{
"epoch": 0.7053127453546192,
"grad_norm": 5.480634689331055,
"learning_rate": 5.898979324784089e-06,
"logits/chosen": -2.9160866737365723,
"logits/rejected": -3.0109434127807617,
"logps/chosen": -268.1304626464844,
"logps/rejected": -307.4697570800781,
"loss": 0.4778,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.837879478931427,
"rewards/margins": 1.6470983028411865,
"rewards/rejected": -2.484978199005127,
"step": 2695
},
{
"epoch": 0.7066213033237373,
"grad_norm": 6.198331832885742,
"learning_rate": 5.872808165401728e-06,
"logits/chosen": -2.925713300704956,
"logits/rejected": -3.0466697216033936,
"logps/chosen": -287.07537841796875,
"logps/rejected": -267.8861083984375,
"loss": 0.4928,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.7808945178985596,
"rewards/margins": 1.2690925598144531,
"rewards/rejected": -2.049987316131592,
"step": 2700
},
{
"epoch": 0.7079298612928553,
"grad_norm": 6.397192001342773,
"learning_rate": 5.846637006019366e-06,
"logits/chosen": -2.9498023986816406,
"logits/rejected": -3.021524667739868,
"logps/chosen": -356.5502014160156,
"logps/rejected": -305.47344970703125,
"loss": 0.4493,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1231266260147095,
"rewards/margins": 1.5973341464996338,
"rewards/rejected": -2.7204606533050537,
"step": 2705
},
{
"epoch": 0.7092384192619733,
"grad_norm": 4.5419697761535645,
"learning_rate": 5.8204658466370065e-06,
"logits/chosen": -2.9890902042388916,
"logits/rejected": -2.981123208999634,
"logps/chosen": -280.586669921875,
"logps/rejected": -268.80108642578125,
"loss": 0.5424,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0319064855575562,
"rewards/margins": 1.4627954959869385,
"rewards/rejected": -2.494702100753784,
"step": 2710
},
{
"epoch": 0.7105469772310914,
"grad_norm": 5.94234561920166,
"learning_rate": 5.794294687254646e-06,
"logits/chosen": -3.0644314289093018,
"logits/rejected": -3.11076021194458,
"logps/chosen": -307.77947998046875,
"logps/rejected": -304.94781494140625,
"loss": 0.4971,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1088581085205078,
"rewards/margins": 1.72271728515625,
"rewards/rejected": -2.831575632095337,
"step": 2715
},
{
"epoch": 0.7118555352002094,
"grad_norm": 4.344791412353516,
"learning_rate": 5.768123527872285e-06,
"logits/chosen": -3.0225837230682373,
"logits/rejected": -3.0797817707061768,
"logps/chosen": -314.328857421875,
"logps/rejected": -300.76959228515625,
"loss": 0.5329,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.093664526939392,
"rewards/margins": 1.374602198600769,
"rewards/rejected": -2.4682669639587402,
"step": 2720
},
{
"epoch": 0.7131640931693274,
"grad_norm": 6.447783470153809,
"learning_rate": 5.741952368489925e-06,
"logits/chosen": -2.824953079223633,
"logits/rejected": -2.915835380554199,
"logps/chosen": -251.0038604736328,
"logps/rejected": -229.1015167236328,
"loss": 0.4933,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0042675733566284,
"rewards/margins": 1.4493547677993774,
"rewards/rejected": -2.453622579574585,
"step": 2725
},
{
"epoch": 0.7144726511384454,
"grad_norm": 4.415769100189209,
"learning_rate": 5.715781209107563e-06,
"logits/chosen": -2.971993923187256,
"logits/rejected": -3.010190963745117,
"logps/chosen": -345.302734375,
"logps/rejected": -356.4703063964844,
"loss": 0.3961,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.6722342371940613,
"rewards/margins": 1.6876239776611328,
"rewards/rejected": -2.359858274459839,
"step": 2730
},
{
"epoch": 0.7157812091075635,
"grad_norm": 7.8874640464782715,
"learning_rate": 5.689610049725203e-06,
"logits/chosen": -3.04805850982666,
"logits/rejected": -3.023395538330078,
"logps/chosen": -252.2591094970703,
"logps/rejected": -256.0127868652344,
"loss": 0.4838,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.035502552986145,
"rewards/margins": 1.598411202430725,
"rewards/rejected": -2.63391375541687,
"step": 2735
},
{
"epoch": 0.7170897670766815,
"grad_norm": 7.325488567352295,
"learning_rate": 5.663438890342843e-06,
"logits/chosen": -3.0302627086639404,
"logits/rejected": -2.9467923641204834,
"logps/chosen": -280.1380615234375,
"logps/rejected": -290.256591796875,
"loss": 0.5807,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.152597188949585,
"rewards/margins": 1.088136911392212,
"rewards/rejected": -2.240734338760376,
"step": 2740
},
{
"epoch": 0.7183983250457995,
"grad_norm": 7.371074676513672,
"learning_rate": 5.637267730960482e-06,
"logits/chosen": -2.880004644393921,
"logits/rejected": -2.8563218116760254,
"logps/chosen": -313.92279052734375,
"logps/rejected": -299.59124755859375,
"loss": 0.5275,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3305268287658691,
"rewards/margins": 1.441772222518921,
"rewards/rejected": -2.772299289703369,
"step": 2745
},
{
"epoch": 0.7197068830149176,
"grad_norm": 4.628333568572998,
"learning_rate": 5.611096571578122e-06,
"logits/chosen": -2.926056385040283,
"logits/rejected": -2.9997127056121826,
"logps/chosen": -326.965087890625,
"logps/rejected": -287.66619873046875,
"loss": 0.4962,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.244328260421753,
"rewards/margins": 1.6143608093261719,
"rewards/rejected": -2.8586888313293457,
"step": 2750
},
{
"epoch": 0.7210154409840356,
"grad_norm": 8.735886573791504,
"learning_rate": 5.584925412195761e-06,
"logits/chosen": -2.9470670223236084,
"logits/rejected": -2.914656162261963,
"logps/chosen": -291.71343994140625,
"logps/rejected": -282.67962646484375,
"loss": 0.4442,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.071761131286621,
"rewards/margins": 1.5430763959884644,
"rewards/rejected": -2.614838123321533,
"step": 2755
},
{
"epoch": 0.7223239989531536,
"grad_norm": 6.217535495758057,
"learning_rate": 5.5587542528133995e-06,
"logits/chosen": -3.065964698791504,
"logits/rejected": -2.982069730758667,
"logps/chosen": -328.1929626464844,
"logps/rejected": -315.532470703125,
"loss": 0.4297,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.2176027297973633,
"rewards/margins": 1.6291157007217407,
"rewards/rejected": -2.8467183113098145,
"step": 2760
},
{
"epoch": 0.7236325569222717,
"grad_norm": 10.293171882629395,
"learning_rate": 5.5325830934310396e-06,
"logits/chosen": -2.8855443000793457,
"logits/rejected": -2.993187665939331,
"logps/chosen": -273.87005615234375,
"logps/rejected": -276.5475158691406,
"loss": 0.6475,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.329725980758667,
"rewards/margins": 1.0645885467529297,
"rewards/rejected": -2.3943145275115967,
"step": 2765
},
{
"epoch": 0.7249411148913897,
"grad_norm": 6.53238582611084,
"learning_rate": 5.506411934048679e-06,
"logits/chosen": -2.9521586894989014,
"logits/rejected": -2.991973638534546,
"logps/chosen": -240.7144775390625,
"logps/rejected": -256.20306396484375,
"loss": 0.433,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.3881947994232178,
"rewards/margins": 1.3882848024368286,
"rewards/rejected": -2.776479721069336,
"step": 2770
},
{
"epoch": 0.7262496728605077,
"grad_norm": 5.987064361572266,
"learning_rate": 5.480240774666318e-06,
"logits/chosen": -3.003497838973999,
"logits/rejected": -3.095515727996826,
"logps/chosen": -291.5846252441406,
"logps/rejected": -314.75897216796875,
"loss": 0.4325,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2352880239486694,
"rewards/margins": 1.6097803115844727,
"rewards/rejected": -2.8450684547424316,
"step": 2775
},
{
"epoch": 0.7275582308296258,
"grad_norm": 8.166574478149414,
"learning_rate": 5.454069615283958e-06,
"logits/chosen": -2.9763851165771484,
"logits/rejected": -2.9842090606689453,
"logps/chosen": -245.7153778076172,
"logps/rejected": -240.0733184814453,
"loss": 0.5427,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0601531267166138,
"rewards/margins": 1.1864640712738037,
"rewards/rejected": -2.246617078781128,
"step": 2780
},
{
"epoch": 0.7288667887987438,
"grad_norm": 5.318563938140869,
"learning_rate": 5.4278984559015964e-06,
"logits/chosen": -2.8072509765625,
"logits/rejected": -2.9027323722839355,
"logps/chosen": -281.28741455078125,
"logps/rejected": -281.07037353515625,
"loss": 0.4658,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4859296083450317,
"rewards/margins": 1.3587102890014648,
"rewards/rejected": -2.844639778137207,
"step": 2785
},
{
"epoch": 0.7301753467678618,
"grad_norm": 7.786154270172119,
"learning_rate": 5.401727296519236e-06,
"logits/chosen": -3.0480589866638184,
"logits/rejected": -3.023777723312378,
"logps/chosen": -291.3687744140625,
"logps/rejected": -273.6734619140625,
"loss": 0.4742,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2371892929077148,
"rewards/margins": 1.3749767541885376,
"rewards/rejected": -2.612166166305542,
"step": 2790
},
{
"epoch": 0.7314839047369799,
"grad_norm": 6.338807582855225,
"learning_rate": 5.375556137136876e-06,
"logits/chosen": -2.8649275302886963,
"logits/rejected": -2.9403674602508545,
"logps/chosen": -297.8705749511719,
"logps/rejected": -286.6891174316406,
"loss": 0.5284,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5126415491104126,
"rewards/margins": 1.4750219583511353,
"rewards/rejected": -2.987663507461548,
"step": 2795
},
{
"epoch": 0.7327924627060979,
"grad_norm": 7.393446445465088,
"learning_rate": 5.349384977754515e-06,
"logits/chosen": -2.7669756412506104,
"logits/rejected": -2.9230990409851074,
"logps/chosen": -273.23651123046875,
"logps/rejected": -301.51702880859375,
"loss": 0.5417,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8848727941513062,
"rewards/margins": 1.4472146034240723,
"rewards/rejected": -3.3320870399475098,
"step": 2800
},
{
"epoch": 0.7327924627060979,
"eval_logits/chosen": -3.0151562690734863,
"eval_logits/rejected": -3.032125473022461,
"eval_logps/chosen": -299.7012939453125,
"eval_logps/rejected": -294.1008605957031,
"eval_loss": 0.4922027885913849,
"eval_rewards/accuracies": 0.7544999718666077,
"eval_rewards/chosen": -1.6467875242233276,
"eval_rewards/margins": 1.3698387145996094,
"eval_rewards/rejected": -3.0166258811950684,
"eval_runtime": 764.128,
"eval_samples_per_second": 2.617,
"eval_steps_per_second": 0.327,
"step": 2800
},
{
"epoch": 0.7341010206752159,
"grad_norm": 4.838715076446533,
"learning_rate": 5.323213818372154e-06,
"logits/chosen": -2.983696460723877,
"logits/rejected": -2.989692211151123,
"logps/chosen": -326.8425598144531,
"logps/rejected": -332.06634521484375,
"loss": 0.6553,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.7837555408477783,
"rewards/margins": 1.0040950775146484,
"rewards/rejected": -2.7878506183624268,
"step": 2805
},
{
"epoch": 0.735409578644334,
"grad_norm": 7.242071151733398,
"learning_rate": 5.297042658989794e-06,
"logits/chosen": -2.930501937866211,
"logits/rejected": -3.0442214012145996,
"logps/chosen": -284.0743408203125,
"logps/rejected": -317.67840576171875,
"loss": 0.626,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8534746170043945,
"rewards/margins": 0.8570615649223328,
"rewards/rejected": -2.710536241531372,
"step": 2810
},
{
"epoch": 0.736718136613452,
"grad_norm": 6.814687728881836,
"learning_rate": 5.270871499607433e-06,
"logits/chosen": -2.9587209224700928,
"logits/rejected": -3.0509235858917236,
"logps/chosen": -265.3656921386719,
"logps/rejected": -268.7741394042969,
"loss": 0.4371,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.7126185894012451,
"rewards/margins": 1.4011996984481812,
"rewards/rejected": -3.113818645477295,
"step": 2815
},
{
"epoch": 0.73802669458257,
"grad_norm": 6.233587265014648,
"learning_rate": 5.244700340225073e-06,
"logits/chosen": -3.000408172607422,
"logits/rejected": -3.0090386867523193,
"logps/chosen": -268.31610107421875,
"logps/rejected": -252.54257202148438,
"loss": 0.5113,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8398408889770508,
"rewards/margins": 1.0091921091079712,
"rewards/rejected": -2.8490328788757324,
"step": 2820
},
{
"epoch": 0.7393352525516881,
"grad_norm": 5.571577072143555,
"learning_rate": 5.218529180842712e-06,
"logits/chosen": -2.983218193054199,
"logits/rejected": -2.993669271469116,
"logps/chosen": -253.2226104736328,
"logps/rejected": -260.74432373046875,
"loss": 0.4537,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7749837636947632,
"rewards/margins": 1.258331537246704,
"rewards/rejected": -3.033315420150757,
"step": 2825
},
{
"epoch": 0.7406438105208061,
"grad_norm": 8.487015724182129,
"learning_rate": 5.192358021460351e-06,
"logits/chosen": -3.1049036979675293,
"logits/rejected": -3.024142026901245,
"logps/chosen": -331.94085693359375,
"logps/rejected": -290.12615966796875,
"loss": 0.4844,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.843961477279663,
"rewards/margins": 1.2949938774108887,
"rewards/rejected": -3.1389553546905518,
"step": 2830
},
{
"epoch": 0.7419523684899241,
"grad_norm": 9.09654426574707,
"learning_rate": 5.166186862077991e-06,
"logits/chosen": -3.0611274242401123,
"logits/rejected": -3.0860462188720703,
"logps/chosen": -314.9799499511719,
"logps/rejected": -305.2943115234375,
"loss": 0.549,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8103431463241577,
"rewards/margins": 1.2532975673675537,
"rewards/rejected": -3.063640832901001,
"step": 2835
},
{
"epoch": 0.7432609264590422,
"grad_norm": 7.024341583251953,
"learning_rate": 5.1400157026956295e-06,
"logits/chosen": -2.8816845417022705,
"logits/rejected": -2.858503818511963,
"logps/chosen": -238.06704711914062,
"logps/rejected": -253.7016143798828,
"loss": 0.5255,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.8397810459136963,
"rewards/margins": 1.2725389003753662,
"rewards/rejected": -3.1123194694519043,
"step": 2840
},
{
"epoch": 0.7445694844281602,
"grad_norm": 7.907282829284668,
"learning_rate": 5.113844543313269e-06,
"logits/chosen": -3.0064635276794434,
"logits/rejected": -3.0659427642822266,
"logps/chosen": -268.85504150390625,
"logps/rejected": -243.37680053710938,
"loss": 0.5628,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.0020573139190674,
"rewards/margins": 1.343377947807312,
"rewards/rejected": -3.345435380935669,
"step": 2845
},
{
"epoch": 0.7458780423972782,
"grad_norm": 7.9956464767456055,
"learning_rate": 5.087673383930909e-06,
"logits/chosen": -2.9852802753448486,
"logits/rejected": -3.0461061000823975,
"logps/chosen": -336.5865478515625,
"logps/rejected": -308.699951171875,
"loss": 0.5046,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -2.004922389984131,
"rewards/margins": 1.480072021484375,
"rewards/rejected": -3.484994411468506,
"step": 2850
},
{
"epoch": 0.7471866003663963,
"grad_norm": 5.574730396270752,
"learning_rate": 5.061502224548548e-06,
"logits/chosen": -2.90897798538208,
"logits/rejected": -3.0144150257110596,
"logps/chosen": -294.5137634277344,
"logps/rejected": -311.13787841796875,
"loss": 0.4537,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.66525137424469,
"rewards/margins": 1.5599861145019531,
"rewards/rejected": -3.2252373695373535,
"step": 2855
},
{
"epoch": 0.7484951583355143,
"grad_norm": 8.235709190368652,
"learning_rate": 5.035331065166187e-06,
"logits/chosen": -2.959197521209717,
"logits/rejected": -2.9602441787719727,
"logps/chosen": -266.502685546875,
"logps/rejected": -283.14190673828125,
"loss": 0.483,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.8751081228256226,
"rewards/margins": 1.5132043361663818,
"rewards/rejected": -3.388312578201294,
"step": 2860
},
{
"epoch": 0.7498037163046323,
"grad_norm": 10.213626861572266,
"learning_rate": 5.009159905783827e-06,
"logits/chosen": -2.9322094917297363,
"logits/rejected": -2.9696342945098877,
"logps/chosen": -317.585205078125,
"logps/rejected": -321.09234619140625,
"loss": 0.3887,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.844107985496521,
"rewards/margins": 1.7453218698501587,
"rewards/rejected": -3.5894293785095215,
"step": 2865
},
{
"epoch": 0.7511122742737504,
"grad_norm": 4.826941013336182,
"learning_rate": 4.982988746401466e-06,
"logits/chosen": -3.0065250396728516,
"logits/rejected": -3.007230281829834,
"logps/chosen": -262.45745849609375,
"logps/rejected": -316.62176513671875,
"loss": 0.5233,
"rewards/accuracies": 0.75,
"rewards/chosen": -2.0057075023651123,
"rewards/margins": 1.4744579792022705,
"rewards/rejected": -3.4801650047302246,
"step": 2870
},
{
"epoch": 0.7524208322428684,
"grad_norm": 7.547994613647461,
"learning_rate": 4.956817587019106e-06,
"logits/chosen": -2.992795467376709,
"logits/rejected": -3.04087495803833,
"logps/chosen": -288.2919616699219,
"logps/rejected": -297.0101623535156,
"loss": 0.5241,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.7158348560333252,
"rewards/margins": 1.439750075340271,
"rewards/rejected": -3.1555845737457275,
"step": 2875
},
{
"epoch": 0.7537293902119864,
"grad_norm": 8.768786430358887,
"learning_rate": 4.930646427636745e-06,
"logits/chosen": -3.0172226428985596,
"logits/rejected": -3.03350567817688,
"logps/chosen": -284.9944152832031,
"logps/rejected": -287.624267578125,
"loss": 0.678,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.690021276473999,
"rewards/margins": 1.1501357555389404,
"rewards/rejected": -2.8401570320129395,
"step": 2880
},
{
"epoch": 0.7550379481811044,
"grad_norm": 7.017695426940918,
"learning_rate": 4.904475268254384e-06,
"logits/chosen": -3.009779214859009,
"logits/rejected": -2.9742047786712646,
"logps/chosen": -298.6566467285156,
"logps/rejected": -313.1660461425781,
"loss": 0.4167,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.845075011253357,
"rewards/margins": 1.5888986587524414,
"rewards/rejected": -3.433973789215088,
"step": 2885
},
{
"epoch": 0.7563465061502225,
"grad_norm": 7.003612518310547,
"learning_rate": 4.878304108872023e-06,
"logits/chosen": -3.0446248054504395,
"logits/rejected": -3.0289571285247803,
"logps/chosen": -262.9287109375,
"logps/rejected": -265.87652587890625,
"loss": 0.4688,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.8515796661376953,
"rewards/margins": 1.2062747478485107,
"rewards/rejected": -3.057854413986206,
"step": 2890
},
{
"epoch": 0.7576550641193405,
"grad_norm": 9.433390617370605,
"learning_rate": 4.852132949489663e-06,
"logits/chosen": -3.0275864601135254,
"logits/rejected": -3.0345730781555176,
"logps/chosen": -275.7232971191406,
"logps/rejected": -304.7025451660156,
"loss": 0.5553,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.858757734298706,
"rewards/margins": 1.2982639074325562,
"rewards/rejected": -3.1570212841033936,
"step": 2895
},
{
"epoch": 0.7589636220884585,
"grad_norm": 8.930874824523926,
"learning_rate": 4.825961790107302e-06,
"logits/chosen": -3.054530620574951,
"logits/rejected": -3.0698142051696777,
"logps/chosen": -266.8838195800781,
"logps/rejected": -265.035400390625,
"loss": 0.6378,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.7914512157440186,
"rewards/margins": 0.9923449754714966,
"rewards/rejected": -2.7837963104248047,
"step": 2900
},
{
"epoch": 0.7602721800575766,
"grad_norm": 6.339506149291992,
"learning_rate": 4.799790630724941e-06,
"logits/chosen": -2.9580090045928955,
"logits/rejected": -3.0641238689422607,
"logps/chosen": -343.8387756347656,
"logps/rejected": -306.41131591796875,
"loss": 0.5801,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.693976640701294,
"rewards/margins": 1.276618480682373,
"rewards/rejected": -2.970594882965088,
"step": 2905
},
{
"epoch": 0.7615807380266946,
"grad_norm": 7.329442977905273,
"learning_rate": 4.773619471342581e-06,
"logits/chosen": -2.9426028728485107,
"logits/rejected": -3.050346851348877,
"logps/chosen": -329.4857177734375,
"logps/rejected": -312.78326416015625,
"loss": 0.4338,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4311538934707642,
"rewards/margins": 1.5746768712997437,
"rewards/rejected": -3.005831003189087,
"step": 2910
},
{
"epoch": 0.7628892959958126,
"grad_norm": 9.978039741516113,
"learning_rate": 4.74744831196022e-06,
"logits/chosen": -2.9623208045959473,
"logits/rejected": -3.00553035736084,
"logps/chosen": -272.9161682128906,
"logps/rejected": -275.42938232421875,
"loss": 0.5062,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.415977954864502,
"rewards/margins": 1.6318323612213135,
"rewards/rejected": -3.0478103160858154,
"step": 2915
},
{
"epoch": 0.7641978539649307,
"grad_norm": 9.443857192993164,
"learning_rate": 4.7212771525778596e-06,
"logits/chosen": -3.0159335136413574,
"logits/rejected": -3.0996041297912598,
"logps/chosen": -368.2574768066406,
"logps/rejected": -347.77099609375,
"loss": 0.5326,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.184187650680542,
"rewards/margins": 1.3835378885269165,
"rewards/rejected": -2.56772518157959,
"step": 2920
},
{
"epoch": 0.7655064119340487,
"grad_norm": 6.5323004722595215,
"learning_rate": 4.695105993195499e-06,
"logits/chosen": -2.9812138080596924,
"logits/rejected": -3.046619415283203,
"logps/chosen": -291.3216247558594,
"logps/rejected": -335.71905517578125,
"loss": 0.4556,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1183115243911743,
"rewards/margins": 1.6687357425689697,
"rewards/rejected": -2.7870471477508545,
"step": 2925
},
{
"epoch": 0.7668149699031667,
"grad_norm": 7.712170124053955,
"learning_rate": 4.668934833813139e-06,
"logits/chosen": -2.9688682556152344,
"logits/rejected": -2.981501817703247,
"logps/chosen": -286.4724426269531,
"logps/rejected": -257.21624755859375,
"loss": 0.6616,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.215268850326538,
"rewards/margins": 1.1600792407989502,
"rewards/rejected": -2.3753480911254883,
"step": 2930
},
{
"epoch": 0.7681235278722848,
"grad_norm": 6.923589706420898,
"learning_rate": 4.642763674430777e-06,
"logits/chosen": -2.9614179134368896,
"logits/rejected": -2.9387574195861816,
"logps/chosen": -311.4593811035156,
"logps/rejected": -321.67041015625,
"loss": 0.5725,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0969353914260864,
"rewards/margins": 1.274997591972351,
"rewards/rejected": -2.3719329833984375,
"step": 2935
},
{
"epoch": 0.7694320858414028,
"grad_norm": 5.904993534088135,
"learning_rate": 4.616592515048417e-06,
"logits/chosen": -3.0357108116149902,
"logits/rejected": -2.909348487854004,
"logps/chosen": -281.0157775878906,
"logps/rejected": -285.02655029296875,
"loss": 0.5521,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.1831865310668945,
"rewards/margins": 1.486000895500183,
"rewards/rejected": -2.669187307357788,
"step": 2940
},
{
"epoch": 0.7707406438105208,
"grad_norm": 6.901908874511719,
"learning_rate": 4.5904213556660565e-06,
"logits/chosen": -2.8211007118225098,
"logits/rejected": -2.88071870803833,
"logps/chosen": -271.14752197265625,
"logps/rejected": -254.5480194091797,
"loss": 0.5307,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3990057706832886,
"rewards/margins": 1.23850417137146,
"rewards/rejected": -2.637510061264038,
"step": 2945
},
{
"epoch": 0.7720492017796389,
"grad_norm": 6.1756978034973145,
"learning_rate": 4.564250196283696e-06,
"logits/chosen": -2.891523838043213,
"logits/rejected": -2.9697718620300293,
"logps/chosen": -291.3207702636719,
"logps/rejected": -290.206787109375,
"loss": 0.5143,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3040390014648438,
"rewards/margins": 1.3442087173461914,
"rewards/rejected": -2.6482479572296143,
"step": 2950
},
{
"epoch": 0.7733577597487569,
"grad_norm": 8.50953483581543,
"learning_rate": 4.538079036901335e-06,
"logits/chosen": -2.8105571269989014,
"logits/rejected": -2.8145947456359863,
"logps/chosen": -294.50286865234375,
"logps/rejected": -265.7152404785156,
"loss": 0.5944,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2462295293807983,
"rewards/margins": 1.3464641571044922,
"rewards/rejected": -2.59269380569458,
"step": 2955
},
{
"epoch": 0.7746663177178749,
"grad_norm": 8.569794654846191,
"learning_rate": 4.511907877518974e-06,
"logits/chosen": -2.9901108741760254,
"logits/rejected": -2.969892978668213,
"logps/chosen": -309.3415832519531,
"logps/rejected": -323.17120361328125,
"loss": 0.5608,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.099224328994751,
"rewards/margins": 1.1640383005142212,
"rewards/rejected": -2.2632622718811035,
"step": 2960
},
{
"epoch": 0.775974875686993,
"grad_norm": 6.8540143966674805,
"learning_rate": 4.485736718136614e-06,
"logits/chosen": -2.9594240188598633,
"logits/rejected": -3.0211386680603027,
"logps/chosen": -293.110107421875,
"logps/rejected": -330.95574951171875,
"loss": 0.5497,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.0265569686889648,
"rewards/margins": 1.503934621810913,
"rewards/rejected": -2.530491352081299,
"step": 2965
},
{
"epoch": 0.777283433656111,
"grad_norm": 6.731704235076904,
"learning_rate": 4.459565558754253e-06,
"logits/chosen": -2.9934017658233643,
"logits/rejected": -3.0968306064605713,
"logps/chosen": -261.086181640625,
"logps/rejected": -260.58489990234375,
"loss": 0.5805,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1074072122573853,
"rewards/margins": 1.2867536544799805,
"rewards/rejected": -2.3941612243652344,
"step": 2970
},
{
"epoch": 0.778591991625229,
"grad_norm": 6.898943901062012,
"learning_rate": 4.433394399371893e-06,
"logits/chosen": -2.861293315887451,
"logits/rejected": -2.998875141143799,
"logps/chosen": -283.81689453125,
"logps/rejected": -248.768798828125,
"loss": 0.4341,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1969306468963623,
"rewards/margins": 1.3554353713989258,
"rewards/rejected": -2.552365779876709,
"step": 2975
},
{
"epoch": 0.7799005495943471,
"grad_norm": 7.095617294311523,
"learning_rate": 4.407223239989532e-06,
"logits/chosen": -2.9728245735168457,
"logits/rejected": -3.051290273666382,
"logps/chosen": -310.69158935546875,
"logps/rejected": -277.09735107421875,
"loss": 0.5192,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1353857517242432,
"rewards/margins": 1.3678357601165771,
"rewards/rejected": -2.5032215118408203,
"step": 2980
},
{
"epoch": 0.7812091075634651,
"grad_norm": 8.380833625793457,
"learning_rate": 4.381052080607171e-06,
"logits/chosen": -2.9203391075134277,
"logits/rejected": -2.90177321434021,
"logps/chosen": -270.91522216796875,
"logps/rejected": -295.3896179199219,
"loss": 0.5453,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3055708408355713,
"rewards/margins": 1.305996298789978,
"rewards/rejected": -2.6115670204162598,
"step": 2985
},
{
"epoch": 0.7825176655325831,
"grad_norm": 5.626356601715088,
"learning_rate": 4.35488092122481e-06,
"logits/chosen": -3.0040221214294434,
"logits/rejected": -2.9230525493621826,
"logps/chosen": -313.6124267578125,
"logps/rejected": -320.3941955566406,
"loss": 0.5426,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2137765884399414,
"rewards/margins": 1.1921061277389526,
"rewards/rejected": -2.4058830738067627,
"step": 2990
},
{
"epoch": 0.7838262235017012,
"grad_norm": 7.655642986297607,
"learning_rate": 4.32870976184245e-06,
"logits/chosen": -2.935147762298584,
"logits/rejected": -2.9671669006347656,
"logps/chosen": -319.5419006347656,
"logps/rejected": -267.1727600097656,
"loss": 0.4952,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.858169436454773,
"rewards/margins": 1.29342520236969,
"rewards/rejected": -2.151594400405884,
"step": 2995
},
{
"epoch": 0.7851347814708192,
"grad_norm": 3.3605234622955322,
"learning_rate": 4.30253860246009e-06,
"logits/chosen": -2.91528058052063,
"logits/rejected": -2.91469407081604,
"logps/chosen": -303.7840270996094,
"logps/rejected": -274.5299377441406,
"loss": 0.4449,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7587825655937195,
"rewards/margins": 1.5326234102249146,
"rewards/rejected": -2.2914059162139893,
"step": 3000
},
{
"epoch": 0.7864433394399372,
"grad_norm": 7.770859718322754,
"learning_rate": 4.276367443077729e-06,
"logits/chosen": -2.953711748123169,
"logits/rejected": -2.9724724292755127,
"logps/chosen": -318.14874267578125,
"logps/rejected": -262.6700744628906,
"loss": 0.6077,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.4992079734802246,
"rewards/margins": 0.9534958004951477,
"rewards/rejected": -2.4527037143707275,
"step": 3005
},
{
"epoch": 0.7877518974090553,
"grad_norm": 6.161098003387451,
"learning_rate": 4.250196283695368e-06,
"logits/chosen": -2.969233751296997,
"logits/rejected": -3.020934581756592,
"logps/chosen": -288.4843444824219,
"logps/rejected": -319.7886657714844,
"loss": 0.5435,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.9259554743766785,
"rewards/margins": 1.0652966499328613,
"rewards/rejected": -1.9912521839141846,
"step": 3010
},
{
"epoch": 0.7890604553781733,
"grad_norm": 7.810305118560791,
"learning_rate": 4.224025124313007e-06,
"logits/chosen": -2.9788691997528076,
"logits/rejected": -3.0794777870178223,
"logps/chosen": -277.36798095703125,
"logps/rejected": -287.5350646972656,
"loss": 0.5337,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8270560503005981,
"rewards/margins": 1.2853838205337524,
"rewards/rejected": -2.1124396324157715,
"step": 3015
},
{
"epoch": 0.7903690133472913,
"grad_norm": 5.958588600158691,
"learning_rate": 4.1978539649306465e-06,
"logits/chosen": -2.976804494857788,
"logits/rejected": -3.009307384490967,
"logps/chosen": -244.9545440673828,
"logps/rejected": -222.89016723632812,
"loss": 0.5216,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0923906564712524,
"rewards/margins": 1.161705732345581,
"rewards/rejected": -2.254096508026123,
"step": 3020
},
{
"epoch": 0.7916775713164094,
"grad_norm": 5.89898157119751,
"learning_rate": 4.171682805548286e-06,
"logits/chosen": -2.9162070751190186,
"logits/rejected": -2.9571709632873535,
"logps/chosen": -259.6465759277344,
"logps/rejected": -298.78155517578125,
"loss": 0.4204,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9322217702865601,
"rewards/margins": 1.6251089572906494,
"rewards/rejected": -2.55733060836792,
"step": 3025
},
{
"epoch": 0.7929861292855274,
"grad_norm": 6.151041030883789,
"learning_rate": 4.145511646165926e-06,
"logits/chosen": -2.9598276615142822,
"logits/rejected": -3.0385665893554688,
"logps/chosen": -338.153076171875,
"logps/rejected": -307.7692565917969,
"loss": 0.5783,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3869907855987549,
"rewards/margins": 1.015866994857788,
"rewards/rejected": -2.402858018875122,
"step": 3030
},
{
"epoch": 0.7942946872546454,
"grad_norm": 5.3807291984558105,
"learning_rate": 4.119340486783565e-06,
"logits/chosen": -3.0343217849731445,
"logits/rejected": -3.071495771408081,
"logps/chosen": -235.5870361328125,
"logps/rejected": -239.8998565673828,
"loss": 0.5386,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5046651363372803,
"rewards/margins": 1.3207638263702393,
"rewards/rejected": -2.8254292011260986,
"step": 3035
},
{
"epoch": 0.7956032452237635,
"grad_norm": 7.987537860870361,
"learning_rate": 4.093169327401204e-06,
"logits/chosen": -2.9960923194885254,
"logits/rejected": -3.0310943126678467,
"logps/chosen": -300.4662780761719,
"logps/rejected": -280.687744140625,
"loss": 0.555,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.4423376321792603,
"rewards/margins": 1.2718671560287476,
"rewards/rejected": -2.714204788208008,
"step": 3040
},
{
"epoch": 0.7969118031928815,
"grad_norm": 7.680141448974609,
"learning_rate": 4.066998168018843e-06,
"logits/chosen": -2.984675645828247,
"logits/rejected": -2.976343870162964,
"logps/chosen": -283.5673828125,
"logps/rejected": -267.16009521484375,
"loss": 0.5549,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6328814029693604,
"rewards/margins": 1.3648440837860107,
"rewards/rejected": -2.997725486755371,
"step": 3045
},
{
"epoch": 0.7982203611619995,
"grad_norm": 6.773283004760742,
"learning_rate": 4.0408270086364835e-06,
"logits/chosen": -2.978335380554199,
"logits/rejected": -3.0025360584259033,
"logps/chosen": -329.9278564453125,
"logps/rejected": -262.8837890625,
"loss": 0.5166,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4951245784759521,
"rewards/margins": 1.3376452922821045,
"rewards/rejected": -2.8327701091766357,
"step": 3050
},
{
"epoch": 0.7995289191311175,
"grad_norm": 8.977509498596191,
"learning_rate": 4.014655849254122e-06,
"logits/chosen": -3.0407679080963135,
"logits/rejected": -3.0819222927093506,
"logps/chosen": -307.4114990234375,
"logps/rejected": -280.94561767578125,
"loss": 0.5877,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.498591423034668,
"rewards/margins": 1.1345003843307495,
"rewards/rejected": -2.633091926574707,
"step": 3055
},
{
"epoch": 0.8008374771002356,
"grad_norm": 8.170519828796387,
"learning_rate": 3.988484689871762e-06,
"logits/chosen": -3.032698631286621,
"logits/rejected": -3.0359036922454834,
"logps/chosen": -236.0676727294922,
"logps/rejected": -247.2089080810547,
"loss": 0.6471,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.6221116781234741,
"rewards/margins": 1.1387770175933838,
"rewards/rejected": -2.7608890533447266,
"step": 3060
},
{
"epoch": 0.8021460350693536,
"grad_norm": 9.307779312133789,
"learning_rate": 3.962313530489401e-06,
"logits/chosen": -2.9368393421173096,
"logits/rejected": -2.983513355255127,
"logps/chosen": -348.62835693359375,
"logps/rejected": -318.1614685058594,
"loss": 0.4591,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.301815390586853,
"rewards/margins": 1.657279372215271,
"rewards/rejected": -2.959094762802124,
"step": 3065
},
{
"epoch": 0.8034545930384716,
"grad_norm": 7.106703758239746,
"learning_rate": 3.93614237110704e-06,
"logits/chosen": -2.9417312145233154,
"logits/rejected": -2.972074031829834,
"logps/chosen": -292.5469970703125,
"logps/rejected": -245.0569305419922,
"loss": 0.5217,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4775664806365967,
"rewards/margins": 1.3939807415008545,
"rewards/rejected": -2.871546983718872,
"step": 3070
},
{
"epoch": 0.8047631510075897,
"grad_norm": 10.409375190734863,
"learning_rate": 3.90997121172468e-06,
"logits/chosen": -2.9813740253448486,
"logits/rejected": -2.9496383666992188,
"logps/chosen": -306.1288146972656,
"logps/rejected": -290.25262451171875,
"loss": 0.5501,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.700331449508667,
"rewards/margins": 1.1744781732559204,
"rewards/rejected": -2.874809980392456,
"step": 3075
},
{
"epoch": 0.8060717089767077,
"grad_norm": 5.405389308929443,
"learning_rate": 3.883800052342319e-06,
"logits/chosen": -2.9348511695861816,
"logits/rejected": -3.074619770050049,
"logps/chosen": -305.00555419921875,
"logps/rejected": -289.0831604003906,
"loss": 0.4904,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4995722770690918,
"rewards/margins": 1.304827332496643,
"rewards/rejected": -2.8043994903564453,
"step": 3080
},
{
"epoch": 0.8073802669458257,
"grad_norm": 7.511078834533691,
"learning_rate": 3.857628892959959e-06,
"logits/chosen": -2.7204461097717285,
"logits/rejected": -2.8985595703125,
"logps/chosen": -276.2601318359375,
"logps/rejected": -236.35366821289062,
"loss": 0.4225,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4592502117156982,
"rewards/margins": 1.428612470626831,
"rewards/rejected": -2.8878626823425293,
"step": 3085
},
{
"epoch": 0.8086888249149438,
"grad_norm": 7.300355434417725,
"learning_rate": 3.831457733577597e-06,
"logits/chosen": -2.975726366043091,
"logits/rejected": -2.9710006713867188,
"logps/chosen": -293.2878112792969,
"logps/rejected": -317.8365783691406,
"loss": 0.469,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6435035467147827,
"rewards/margins": 1.1404824256896973,
"rewards/rejected": -2.7839858531951904,
"step": 3090
},
{
"epoch": 0.8099973828840618,
"grad_norm": 6.010566711425781,
"learning_rate": 3.8052865741952373e-06,
"logits/chosen": -3.0353527069091797,
"logits/rejected": -3.0600810050964355,
"logps/chosen": -293.95343017578125,
"logps/rejected": -270.61004638671875,
"loss": 0.5388,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5871026515960693,
"rewards/margins": 1.1377742290496826,
"rewards/rejected": -2.724876880645752,
"step": 3095
},
{
"epoch": 0.8113059408531798,
"grad_norm": 6.270742893218994,
"learning_rate": 3.7791154148128765e-06,
"logits/chosen": -2.8717355728149414,
"logits/rejected": -2.875217914581299,
"logps/chosen": -311.8862609863281,
"logps/rejected": -329.5162048339844,
"loss": 0.4579,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.5746370553970337,
"rewards/margins": 1.3537368774414062,
"rewards/rejected": -2.9283738136291504,
"step": 3100
},
{
"epoch": 0.8126144988222979,
"grad_norm": 9.443546295166016,
"learning_rate": 3.7529442554305157e-06,
"logits/chosen": -2.9529592990875244,
"logits/rejected": -2.9881930351257324,
"logps/chosen": -282.13140869140625,
"logps/rejected": -282.1882019042969,
"loss": 0.5176,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5689818859100342,
"rewards/margins": 1.4024773836135864,
"rewards/rejected": -2.971459150314331,
"step": 3105
},
{
"epoch": 0.8139230567914159,
"grad_norm": 5.381568908691406,
"learning_rate": 3.7267730960481554e-06,
"logits/chosen": -2.9820408821105957,
"logits/rejected": -2.9640519618988037,
"logps/chosen": -291.12969970703125,
"logps/rejected": -277.7159118652344,
"loss": 0.3889,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.7315528392791748,
"rewards/margins": 1.7429358959197998,
"rewards/rejected": -3.4744884967803955,
"step": 3110
},
{
"epoch": 0.8152316147605339,
"grad_norm": 5.273469924926758,
"learning_rate": 3.7006019366657946e-06,
"logits/chosen": -2.9469833374023438,
"logits/rejected": -3.015949010848999,
"logps/chosen": -285.083740234375,
"logps/rejected": -265.0752868652344,
"loss": 0.4623,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6700470447540283,
"rewards/margins": 1.4441113471984863,
"rewards/rejected": -3.1141583919525146,
"step": 3115
},
{
"epoch": 0.816540172729652,
"grad_norm": 6.050240993499756,
"learning_rate": 3.6744307772834342e-06,
"logits/chosen": -2.932485580444336,
"logits/rejected": -2.9992053508758545,
"logps/chosen": -276.6758117675781,
"logps/rejected": -296.73333740234375,
"loss": 0.4808,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.8731861114501953,
"rewards/margins": 1.7045581340789795,
"rewards/rejected": -3.577744722366333,
"step": 3120
},
{
"epoch": 0.81784873069877,
"grad_norm": 7.137781620025635,
"learning_rate": 3.648259617901073e-06,
"logits/chosen": -2.9615964889526367,
"logits/rejected": -2.880619525909424,
"logps/chosen": -268.4992370605469,
"logps/rejected": -295.4488830566406,
"loss": 0.5318,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.7875770330429077,
"rewards/margins": 1.2315229177474976,
"rewards/rejected": -3.019099712371826,
"step": 3125
},
{
"epoch": 0.819157288667888,
"grad_norm": 7.0195770263671875,
"learning_rate": 3.6220884585187127e-06,
"logits/chosen": -3.0774314403533936,
"logits/rejected": -3.045903205871582,
"logps/chosen": -302.8963928222656,
"logps/rejected": -289.8187561035156,
"loss": 0.4234,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.661341667175293,
"rewards/margins": 1.1674296855926514,
"rewards/rejected": -2.8287715911865234,
"step": 3130
},
{
"epoch": 0.8204658466370061,
"grad_norm": 7.271994590759277,
"learning_rate": 3.5959172991363523e-06,
"logits/chosen": -2.9475741386413574,
"logits/rejected": -3.019735813140869,
"logps/chosen": -316.7098693847656,
"logps/rejected": -328.3517761230469,
"loss": 0.5028,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.7327712774276733,
"rewards/margins": 1.505361795425415,
"rewards/rejected": -3.238132953643799,
"step": 3135
},
{
"epoch": 0.821774404606124,
"grad_norm": 6.570908546447754,
"learning_rate": 3.569746139753991e-06,
"logits/chosen": -2.9615020751953125,
"logits/rejected": -3.003160238265991,
"logps/chosen": -299.74273681640625,
"logps/rejected": -291.9687805175781,
"loss": 0.3802,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4846771955490112,
"rewards/margins": 2.014193058013916,
"rewards/rejected": -3.498870372772217,
"step": 3140
},
{
"epoch": 0.823082962575242,
"grad_norm": 3.8049066066741943,
"learning_rate": 3.5435749803716308e-06,
"logits/chosen": -3.0279877185821533,
"logits/rejected": -3.0599331855773926,
"logps/chosen": -309.2520446777344,
"logps/rejected": -310.35345458984375,
"loss": 0.5532,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.453507661819458,
"rewards/margins": 1.6446282863616943,
"rewards/rejected": -3.0981359481811523,
"step": 3145
},
{
"epoch": 0.8243915205443602,
"grad_norm": 9.395318031311035,
"learning_rate": 3.5174038209892704e-06,
"logits/chosen": -2.9431824684143066,
"logits/rejected": -3.0078811645507812,
"logps/chosen": -323.8676452636719,
"logps/rejected": -280.33453369140625,
"loss": 0.7375,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7898139953613281,
"rewards/margins": 0.9050365686416626,
"rewards/rejected": -2.694850444793701,
"step": 3150
},
{
"epoch": 0.8257000785134782,
"grad_norm": 6.055134296417236,
"learning_rate": 3.491232661606909e-06,
"logits/chosen": -2.9507486820220947,
"logits/rejected": -3.0724995136260986,
"logps/chosen": -315.60272216796875,
"logps/rejected": -324.2132263183594,
"loss": 0.5575,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.733232855796814,
"rewards/margins": 1.3413363695144653,
"rewards/rejected": -3.0745689868927,
"step": 3155
},
{
"epoch": 0.8270086364825961,
"grad_norm": 6.183289051055908,
"learning_rate": 3.465061502224549e-06,
"logits/chosen": -2.8759653568267822,
"logits/rejected": -2.8962912559509277,
"logps/chosen": -282.1387023925781,
"logps/rejected": -255.8695526123047,
"loss": 0.5389,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.533860445022583,
"rewards/margins": 1.3426233530044556,
"rewards/rejected": -2.876483678817749,
"step": 3160
},
{
"epoch": 0.8283171944517143,
"grad_norm": 5.198134422302246,
"learning_rate": 3.438890342842188e-06,
"logits/chosen": -2.9086239337921143,
"logits/rejected": -2.9607226848602295,
"logps/chosen": -277.3299560546875,
"logps/rejected": -286.2669982910156,
"loss": 0.4774,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.849469780921936,
"rewards/margins": 1.3763679265975952,
"rewards/rejected": -3.2258377075195312,
"step": 3165
},
{
"epoch": 0.8296257524208323,
"grad_norm": 9.19830322265625,
"learning_rate": 3.4127191834598277e-06,
"logits/chosen": -3.077191114425659,
"logits/rejected": -2.9784042835235596,
"logps/chosen": -299.14813232421875,
"logps/rejected": -330.7813415527344,
"loss": 0.46,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.447545051574707,
"rewards/margins": 1.3955647945404053,
"rewards/rejected": -2.8431098461151123,
"step": 3170
},
{
"epoch": 0.8309343103899502,
"grad_norm": 6.804299354553223,
"learning_rate": 3.386548024077467e-06,
"logits/chosen": -2.9858601093292236,
"logits/rejected": -2.996666431427002,
"logps/chosen": -314.24005126953125,
"logps/rejected": -302.2778015136719,
"loss": 0.5011,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.7465810775756836,
"rewards/margins": 1.3649499416351318,
"rewards/rejected": -3.1115307807922363,
"step": 3175
},
{
"epoch": 0.8322428683590684,
"grad_norm": 5.180750846862793,
"learning_rate": 3.360376864695106e-06,
"logits/chosen": -2.973330020904541,
"logits/rejected": -3.0549609661102295,
"logps/chosen": -384.7432556152344,
"logps/rejected": -316.04833984375,
"loss": 0.5156,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.293901801109314,
"rewards/margins": 1.534147024154663,
"rewards/rejected": -2.8280491828918457,
"step": 3180
},
{
"epoch": 0.8335514263281864,
"grad_norm": 7.629887104034424,
"learning_rate": 3.3342057053127458e-06,
"logits/chosen": -3.01334285736084,
"logits/rejected": -2.983562469482422,
"logps/chosen": -322.8381652832031,
"logps/rejected": -297.83392333984375,
"loss": 0.527,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.3352491855621338,
"rewards/margins": 1.2987505197525024,
"rewards/rejected": -2.6339995861053467,
"step": 3185
},
{
"epoch": 0.8348599842973043,
"grad_norm": 7.089648246765137,
"learning_rate": 3.3080345459303846e-06,
"logits/chosen": -2.8630924224853516,
"logits/rejected": -2.8851840496063232,
"logps/chosen": -304.0502014160156,
"logps/rejected": -321.80255126953125,
"loss": 0.586,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.5677211284637451,
"rewards/margins": 1.1580266952514648,
"rewards/rejected": -2.72574782371521,
"step": 3190
},
{
"epoch": 0.8361685422664225,
"grad_norm": 6.6564860343933105,
"learning_rate": 3.281863386548024e-06,
"logits/chosen": -2.8017802238464355,
"logits/rejected": -2.8708529472351074,
"logps/chosen": -257.3683166503906,
"logps/rejected": -240.1475830078125,
"loss": 0.5368,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4609788656234741,
"rewards/margins": 1.4196208715438843,
"rewards/rejected": -2.8805994987487793,
"step": 3195
},
{
"epoch": 0.8374771002355405,
"grad_norm": 7.388543605804443,
"learning_rate": 3.255692227165664e-06,
"logits/chosen": -2.9424808025360107,
"logits/rejected": -3.038038492202759,
"logps/chosen": -301.71630859375,
"logps/rejected": -311.08245849609375,
"loss": 0.4928,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0317409038543701,
"rewards/margins": 1.6584066152572632,
"rewards/rejected": -2.6901473999023438,
"step": 3200
},
{
"epoch": 0.8374771002355405,
"eval_logits/chosen": -3.004171848297119,
"eval_logits/rejected": -3.0209107398986816,
"eval_logps/chosen": -295.4100036621094,
"eval_logps/rejected": -290.0252990722656,
"eval_loss": 0.49639275670051575,
"eval_rewards/accuracies": 0.7595000267028809,
"eval_rewards/chosen": -1.2176601886749268,
"eval_rewards/margins": 1.3914103507995605,
"eval_rewards/rejected": -2.6090707778930664,
"eval_runtime": 763.7592,
"eval_samples_per_second": 2.619,
"eval_steps_per_second": 0.327,
"step": 3200
},
{
"epoch": 0.8387856582046584,
"grad_norm": 7.057781219482422,
"learning_rate": 3.2295210677833035e-06,
"logits/chosen": -2.9613780975341797,
"logits/rejected": -2.9954867362976074,
"logps/chosen": -296.17083740234375,
"logps/rejected": -265.18072509765625,
"loss": 0.3919,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.425917387008667,
"rewards/margins": 1.6352697610855103,
"rewards/rejected": -3.0611870288848877,
"step": 3205
},
{
"epoch": 0.8400942161737766,
"grad_norm": 7.055730819702148,
"learning_rate": 3.2033499084009423e-06,
"logits/chosen": -2.848184585571289,
"logits/rejected": -2.9945926666259766,
"logps/chosen": -307.47662353515625,
"logps/rejected": -315.60101318359375,
"loss": 0.3921,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.9309557676315308,
"rewards/margins": 1.7479255199432373,
"rewards/rejected": -2.6788811683654785,
"step": 3210
},
{
"epoch": 0.8414027741428945,
"grad_norm": 9.12265682220459,
"learning_rate": 3.177178749018582e-06,
"logits/chosen": -3.0333364009857178,
"logits/rejected": -3.006922483444214,
"logps/chosen": -251.0235595703125,
"logps/rejected": -257.051025390625,
"loss": 0.6059,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.6401920318603516,
"rewards/margins": 1.1688909530639648,
"rewards/rejected": -2.8090832233428955,
"step": 3215
},
{
"epoch": 0.8427113321120125,
"grad_norm": 6.198644638061523,
"learning_rate": 3.151007589636221e-06,
"logits/chosen": -2.97039532661438,
"logits/rejected": -3.030413866043091,
"logps/chosen": -256.3705749511719,
"logps/rejected": -229.65072631835938,
"loss": 0.463,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.372048258781433,
"rewards/margins": 1.6272108554840088,
"rewards/rejected": -2.9992592334747314,
"step": 3220
},
{
"epoch": 0.8440198900811305,
"grad_norm": 6.126826763153076,
"learning_rate": 3.1248364302538604e-06,
"logits/chosen": -2.9617550373077393,
"logits/rejected": -3.0138838291168213,
"logps/chosen": -273.10321044921875,
"logps/rejected": -262.1094055175781,
"loss": 0.4266,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1408138275146484,
"rewards/margins": 1.4347550868988037,
"rewards/rejected": -2.575568675994873,
"step": 3225
},
{
"epoch": 0.8453284480502486,
"grad_norm": 9.463789939880371,
"learning_rate": 3.0986652708715e-06,
"logits/chosen": -2.980067014694214,
"logits/rejected": -3.017759084701538,
"logps/chosen": -259.8030700683594,
"logps/rejected": -261.61419677734375,
"loss": 0.5268,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.146183967590332,
"rewards/margins": 1.2039825916290283,
"rewards/rejected": -2.3501665592193604,
"step": 3230
},
{
"epoch": 0.8466370060193666,
"grad_norm": 6.915153503417969,
"learning_rate": 3.0724941114891392e-06,
"logits/chosen": -3.0340006351470947,
"logits/rejected": -2.977094888687134,
"logps/chosen": -311.4469909667969,
"logps/rejected": -297.03460693359375,
"loss": 0.5994,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4913580417633057,
"rewards/margins": 1.2569396495819092,
"rewards/rejected": -2.7482974529266357,
"step": 3235
},
{
"epoch": 0.8479455639884846,
"grad_norm": 7.158982276916504,
"learning_rate": 3.0463229521067784e-06,
"logits/chosen": -2.955960273742676,
"logits/rejected": -3.0300660133361816,
"logps/chosen": -270.5257873535156,
"logps/rejected": -304.80340576171875,
"loss": 0.4053,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.099576711654663,
"rewards/margins": 1.5984654426574707,
"rewards/rejected": -2.698042392730713,
"step": 3240
},
{
"epoch": 0.8492541219576027,
"grad_norm": 5.484156131744385,
"learning_rate": 3.0201517927244177e-06,
"logits/chosen": -2.967376232147217,
"logits/rejected": -3.0268256664276123,
"logps/chosen": -282.62249755859375,
"logps/rejected": -254.0297088623047,
"loss": 0.4376,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.060046672821045,
"rewards/margins": 1.762258529663086,
"rewards/rejected": -2.822305202484131,
"step": 3245
},
{
"epoch": 0.8505626799267207,
"grad_norm": 7.077023506164551,
"learning_rate": 2.9939806333420573e-06,
"logits/chosen": -2.9605937004089355,
"logits/rejected": -3.015320301055908,
"logps/chosen": -313.13873291015625,
"logps/rejected": -316.87750244140625,
"loss": 0.5465,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3930941820144653,
"rewards/margins": 1.102479100227356,
"rewards/rejected": -2.4955732822418213,
"step": 3250
},
{
"epoch": 0.8518712378958387,
"grad_norm": 7.836671829223633,
"learning_rate": 2.967809473959697e-06,
"logits/chosen": -3.0352444648742676,
"logits/rejected": -3.102151393890381,
"logps/chosen": -307.4310302734375,
"logps/rejected": -253.9599609375,
"loss": 0.5673,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2398912906646729,
"rewards/margins": 1.4965870380401611,
"rewards/rejected": -2.736478328704834,
"step": 3255
},
{
"epoch": 0.8531797958649568,
"grad_norm": 7.435060977935791,
"learning_rate": 2.9416383145773357e-06,
"logits/chosen": -2.9178872108459473,
"logits/rejected": -3.0403456687927246,
"logps/chosen": -319.54052734375,
"logps/rejected": -270.2267761230469,
"loss": 0.533,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2776553630828857,
"rewards/margins": 1.4546865224838257,
"rewards/rejected": -2.732342004776001,
"step": 3260
},
{
"epoch": 0.8544883538340748,
"grad_norm": 6.412667274475098,
"learning_rate": 2.9154671551949754e-06,
"logits/chosen": -3.0104992389678955,
"logits/rejected": -3.0792577266693115,
"logps/chosen": -278.3431091308594,
"logps/rejected": -322.3707580566406,
"loss": 0.4199,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0318635702133179,
"rewards/margins": 1.6050819158554077,
"rewards/rejected": -2.6369454860687256,
"step": 3265
},
{
"epoch": 0.8557969118031928,
"grad_norm": 8.721508979797363,
"learning_rate": 2.889295995812615e-06,
"logits/chosen": -3.0468242168426514,
"logits/rejected": -3.033629894256592,
"logps/chosen": -371.76409912109375,
"logps/rejected": -343.81085205078125,
"loss": 0.4677,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.381502628326416,
"rewards/margins": 1.6215152740478516,
"rewards/rejected": -3.0030181407928467,
"step": 3270
},
{
"epoch": 0.857105469772311,
"grad_norm": 2.7983927726745605,
"learning_rate": 2.863124836430254e-06,
"logits/chosen": -2.9721121788024902,
"logits/rejected": -3.047161102294922,
"logps/chosen": -325.7302551269531,
"logps/rejected": -356.19097900390625,
"loss": 0.3731,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.456455945968628,
"rewards/margins": 1.8465648889541626,
"rewards/rejected": -3.303021192550659,
"step": 3275
},
{
"epoch": 0.8584140277414289,
"grad_norm": 4.696521282196045,
"learning_rate": 2.8369536770478935e-06,
"logits/chosen": -2.99096417427063,
"logits/rejected": -3.0428411960601807,
"logps/chosen": -293.07366943359375,
"logps/rejected": -274.5947570800781,
"loss": 0.3984,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1890863180160522,
"rewards/margins": 2.1313068866729736,
"rewards/rejected": -3.3203930854797363,
"step": 3280
},
{
"epoch": 0.8597225857105469,
"grad_norm": 7.623385429382324,
"learning_rate": 2.8107825176655327e-06,
"logits/chosen": -2.9804584980010986,
"logits/rejected": -2.954515218734741,
"logps/chosen": -273.37713623046875,
"logps/rejected": -262.2710266113281,
"loss": 0.4606,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.275095820426941,
"rewards/margins": 1.7871692180633545,
"rewards/rejected": -3.062264919281006,
"step": 3285
},
{
"epoch": 0.861031143679665,
"grad_norm": 8.984639167785645,
"learning_rate": 2.7846113582831723e-06,
"logits/chosen": -2.820075511932373,
"logits/rejected": -2.9113526344299316,
"logps/chosen": -308.37994384765625,
"logps/rejected": -295.8663024902344,
"loss": 0.4429,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.3035976886749268,
"rewards/margins": 1.941608190536499,
"rewards/rejected": -3.2452056407928467,
"step": 3290
},
{
"epoch": 0.862339701648783,
"grad_norm": 6.510889530181885,
"learning_rate": 2.7584401989008115e-06,
"logits/chosen": -3.090820074081421,
"logits/rejected": -3.129952907562256,
"logps/chosen": -280.90997314453125,
"logps/rejected": -277.00030517578125,
"loss": 0.4266,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.054640293121338,
"rewards/margins": 1.758040428161621,
"rewards/rejected": -2.812680721282959,
"step": 3295
},
{
"epoch": 0.863648259617901,
"grad_norm": 9.088248252868652,
"learning_rate": 2.7322690395184508e-06,
"logits/chosen": -2.9733877182006836,
"logits/rejected": -3.027345895767212,
"logps/chosen": -287.5892639160156,
"logps/rejected": -267.43878173828125,
"loss": 0.4333,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.3994263410568237,
"rewards/margins": 2.2078773975372314,
"rewards/rejected": -3.607304096221924,
"step": 3300
},
{
"epoch": 0.8649568175870191,
"grad_norm": 5.8171491622924805,
"learning_rate": 2.7060978801360904e-06,
"logits/chosen": -2.925607204437256,
"logits/rejected": -3.0442306995391846,
"logps/chosen": -332.95458984375,
"logps/rejected": -302.6817321777344,
"loss": 0.4657,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.267094373703003,
"rewards/margins": 1.7528711557388306,
"rewards/rejected": -3.019965648651123,
"step": 3305
},
{
"epoch": 0.8662653755561371,
"grad_norm": 5.004279136657715,
"learning_rate": 2.679926720753729e-06,
"logits/chosen": -2.962287425994873,
"logits/rejected": -2.9575023651123047,
"logps/chosen": -266.0353088378906,
"logps/rejected": -269.9578552246094,
"loss": 0.5247,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3235856294631958,
"rewards/margins": 1.6374728679656982,
"rewards/rejected": -2.9610581398010254,
"step": 3310
},
{
"epoch": 0.8675739335252551,
"grad_norm": 7.791489601135254,
"learning_rate": 2.653755561371369e-06,
"logits/chosen": -2.993173122406006,
"logits/rejected": -3.0355095863342285,
"logps/chosen": -230.2362060546875,
"logps/rejected": -252.8092803955078,
"loss": 0.4766,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.343328833580017,
"rewards/margins": 1.6592735052108765,
"rewards/rejected": -3.0026021003723145,
"step": 3315
},
{
"epoch": 0.8688824914943732,
"grad_norm": 6.9868998527526855,
"learning_rate": 2.6275844019890085e-06,
"logits/chosen": -2.90317964553833,
"logits/rejected": -3.0146877765655518,
"logps/chosen": -232.83566284179688,
"logps/rejected": -263.18365478515625,
"loss": 0.4633,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3262803554534912,
"rewards/margins": 1.7742512226104736,
"rewards/rejected": -3.100531578063965,
"step": 3320
},
{
"epoch": 0.8701910494634912,
"grad_norm": 10.964067459106445,
"learning_rate": 2.601413242606648e-06,
"logits/chosen": -2.9223103523254395,
"logits/rejected": -2.892368793487549,
"logps/chosen": -276.1986389160156,
"logps/rejected": -297.419921875,
"loss": 0.8137,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -2.0564141273498535,
"rewards/margins": 0.9367812871932983,
"rewards/rejected": -2.9931955337524414,
"step": 3325
},
{
"epoch": 0.8714996074326092,
"grad_norm": 9.714390754699707,
"learning_rate": 2.575242083224287e-06,
"logits/chosen": -2.947138547897339,
"logits/rejected": -3.092214822769165,
"logps/chosen": -357.3052062988281,
"logps/rejected": -319.5323791503906,
"loss": 0.4528,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2855061292648315,
"rewards/margins": 2.0207602977752686,
"rewards/rejected": -3.3062667846679688,
"step": 3330
},
{
"epoch": 0.8728081654017273,
"grad_norm": 7.657586574554443,
"learning_rate": 2.5490709238419266e-06,
"logits/chosen": -3.013709545135498,
"logits/rejected": -3.0418596267700195,
"logps/chosen": -328.94537353515625,
"logps/rejected": -283.6571044921875,
"loss": 0.525,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1225342750549316,
"rewards/margins": 1.795938491821289,
"rewards/rejected": -2.9184727668762207,
"step": 3335
},
{
"epoch": 0.8741167233708453,
"grad_norm": 6.528357028961182,
"learning_rate": 2.5228997644595658e-06,
"logits/chosen": -2.9861557483673096,
"logits/rejected": -3.135845184326172,
"logps/chosen": -243.2984619140625,
"logps/rejected": -237.9906005859375,
"loss": 0.6573,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.59354829788208,
"rewards/margins": 1.3717825412750244,
"rewards/rejected": -2.9653310775756836,
"step": 3340
},
{
"epoch": 0.8754252813399633,
"grad_norm": 8.699584007263184,
"learning_rate": 2.4967286050772054e-06,
"logits/chosen": -2.866548538208008,
"logits/rejected": -3.0045650005340576,
"logps/chosen": -319.767822265625,
"logps/rejected": -277.4715881347656,
"loss": 0.506,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2535655498504639,
"rewards/margins": 1.7059853076934814,
"rewards/rejected": -2.9595508575439453,
"step": 3345
},
{
"epoch": 0.8767338393090814,
"grad_norm": 6.741850852966309,
"learning_rate": 2.4705574456948446e-06,
"logits/chosen": -3.012995719909668,
"logits/rejected": -3.0462710857391357,
"logps/chosen": -335.1493225097656,
"logps/rejected": -292.0329895019531,
"loss": 0.4305,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2484524250030518,
"rewards/margins": 1.6712596416473389,
"rewards/rejected": -2.9197120666503906,
"step": 3350
},
{
"epoch": 0.8780423972781994,
"grad_norm": 6.849013805389404,
"learning_rate": 2.444386286312484e-06,
"logits/chosen": -3.019461154937744,
"logits/rejected": -3.0690269470214844,
"logps/chosen": -328.6025695800781,
"logps/rejected": -283.9448547363281,
"loss": 0.4625,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1433143615722656,
"rewards/margins": 1.3490070104599,
"rewards/rejected": -2.492321491241455,
"step": 3355
},
{
"epoch": 0.8793509552473174,
"grad_norm": 3.412297248840332,
"learning_rate": 2.418215126930123e-06,
"logits/chosen": -2.9395012855529785,
"logits/rejected": -2.933227300643921,
"logps/chosen": -276.8416748046875,
"logps/rejected": -305.0869445800781,
"loss": 0.3263,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8936668634414673,
"rewards/margins": 2.017547845840454,
"rewards/rejected": -2.911214590072632,
"step": 3360
},
{
"epoch": 0.8806595132164355,
"grad_norm": 7.709452152252197,
"learning_rate": 2.3920439675477623e-06,
"logits/chosen": -3.047055721282959,
"logits/rejected": -3.083347797393799,
"logps/chosen": -260.4131164550781,
"logps/rejected": -314.905517578125,
"loss": 0.5143,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.1395976543426514,
"rewards/margins": 1.6013320684432983,
"rewards/rejected": -2.740929365158081,
"step": 3365
},
{
"epoch": 0.8819680711855535,
"grad_norm": 7.616606712341309,
"learning_rate": 2.365872808165402e-06,
"logits/chosen": -3.0480358600616455,
"logits/rejected": -3.052595853805542,
"logps/chosen": -265.3002624511719,
"logps/rejected": -294.23712158203125,
"loss": 0.5369,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.218409538269043,
"rewards/margins": 1.6266998052597046,
"rewards/rejected": -2.845109224319458,
"step": 3370
},
{
"epoch": 0.8832766291546715,
"grad_norm": 5.287708759307861,
"learning_rate": 2.339701648783041e-06,
"logits/chosen": -2.958054304122925,
"logits/rejected": -3.068449020385742,
"logps/chosen": -268.5967712402344,
"logps/rejected": -294.0408630371094,
"loss": 0.5142,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2594164609909058,
"rewards/margins": 1.3391399383544922,
"rewards/rejected": -2.5985562801361084,
"step": 3375
},
{
"epoch": 0.8845851871237895,
"grad_norm": 9.27445125579834,
"learning_rate": 2.313530489400681e-06,
"logits/chosen": -3.025479793548584,
"logits/rejected": -3.016753673553467,
"logps/chosen": -314.6586608886719,
"logps/rejected": -325.9570007324219,
"loss": 0.6003,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6857277154922485,
"rewards/margins": 1.2919389009475708,
"rewards/rejected": -2.9776668548583984,
"step": 3380
},
{
"epoch": 0.8858937450929076,
"grad_norm": 7.8645243644714355,
"learning_rate": 2.28735933001832e-06,
"logits/chosen": -2.9674994945526123,
"logits/rejected": -3.07468843460083,
"logps/chosen": -277.78887939453125,
"logps/rejected": -275.15899658203125,
"loss": 0.5845,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2671358585357666,
"rewards/margins": 1.3928353786468506,
"rewards/rejected": -2.659970998764038,
"step": 3385
},
{
"epoch": 0.8872023030620256,
"grad_norm": 7.450486660003662,
"learning_rate": 2.2611881706359592e-06,
"logits/chosen": -2.8782074451446533,
"logits/rejected": -2.9619007110595703,
"logps/chosen": -273.5116271972656,
"logps/rejected": -306.6257629394531,
"loss": 0.5905,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.3433325290679932,
"rewards/margins": 1.4469295740127563,
"rewards/rejected": -2.790262222290039,
"step": 3390
},
{
"epoch": 0.8885108610311436,
"grad_norm": 5.97955322265625,
"learning_rate": 2.235017011253599e-06,
"logits/chosen": -2.9929678440093994,
"logits/rejected": -3.1036667823791504,
"logps/chosen": -338.9819030761719,
"logps/rejected": -304.00146484375,
"loss": 0.4292,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3203136920928955,
"rewards/margins": 1.7166574001312256,
"rewards/rejected": -3.0369715690612793,
"step": 3395
},
{
"epoch": 0.8898194190002617,
"grad_norm": 6.3199310302734375,
"learning_rate": 2.208845851871238e-06,
"logits/chosen": -2.9735336303710938,
"logits/rejected": -2.9744515419006348,
"logps/chosen": -296.5519104003906,
"logps/rejected": -322.54473876953125,
"loss": 0.4602,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.639952301979065,
"rewards/margins": 1.7713512182235718,
"rewards/rejected": -3.4113032817840576,
"step": 3400
},
{
"epoch": 0.8911279769693797,
"grad_norm": 7.517760276794434,
"learning_rate": 2.1826746924888777e-06,
"logits/chosen": -3.0101208686828613,
"logits/rejected": -3.0078091621398926,
"logps/chosen": -309.48797607421875,
"logps/rejected": -322.8365783691406,
"loss": 0.4754,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2661359310150146,
"rewards/margins": 1.4626022577285767,
"rewards/rejected": -2.7287380695343018,
"step": 3405
},
{
"epoch": 0.8924365349384977,
"grad_norm": 7.3932037353515625,
"learning_rate": 2.156503533106517e-06,
"logits/chosen": -2.8534436225891113,
"logits/rejected": -2.9222302436828613,
"logps/chosen": -314.8663330078125,
"logps/rejected": -323.77252197265625,
"loss": 0.6032,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5054900646209717,
"rewards/margins": 1.3880724906921387,
"rewards/rejected": -2.8935627937316895,
"step": 3410
},
{
"epoch": 0.8937450929076158,
"grad_norm": 5.375305652618408,
"learning_rate": 2.130332373724156e-06,
"logits/chosen": -3.040001630783081,
"logits/rejected": -3.1081435680389404,
"logps/chosen": -280.5118408203125,
"logps/rejected": -292.08270263671875,
"loss": 0.3811,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2469137907028198,
"rewards/margins": 1.6295486688613892,
"rewards/rejected": -2.87646222114563,
"step": 3415
},
{
"epoch": 0.8950536508767338,
"grad_norm": 9.578230857849121,
"learning_rate": 2.1041612143417954e-06,
"logits/chosen": -2.9527251720428467,
"logits/rejected": -3.0252342224121094,
"logps/chosen": -309.64801025390625,
"logps/rejected": -319.7813415527344,
"loss": 0.5587,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5965030193328857,
"rewards/margins": 1.1897649765014648,
"rewards/rejected": -2.7862679958343506,
"step": 3420
},
{
"epoch": 0.8963622088458518,
"grad_norm": 5.5111002922058105,
"learning_rate": 2.0779900549594346e-06,
"logits/chosen": -2.8854727745056152,
"logits/rejected": -2.8697562217712402,
"logps/chosen": -303.0765380859375,
"logps/rejected": -296.5349426269531,
"loss": 0.4015,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1718518733978271,
"rewards/margins": 1.609633207321167,
"rewards/rejected": -2.781485080718994,
"step": 3425
},
{
"epoch": 0.8976707668149699,
"grad_norm": 8.389058113098145,
"learning_rate": 2.0518188955770743e-06,
"logits/chosen": -3.0345101356506348,
"logits/rejected": -3.033433198928833,
"logps/chosen": -293.57318115234375,
"logps/rejected": -266.28497314453125,
"loss": 0.4547,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.349166750907898,
"rewards/margins": 1.5450422763824463,
"rewards/rejected": -2.894209146499634,
"step": 3430
},
{
"epoch": 0.8989793247840879,
"grad_norm": 7.349452495574951,
"learning_rate": 2.0256477361947135e-06,
"logits/chosen": -3.0003952980041504,
"logits/rejected": -3.0549144744873047,
"logps/chosen": -286.839111328125,
"logps/rejected": -297.920166015625,
"loss": 0.345,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3413785696029663,
"rewards/margins": 2.113399028778076,
"rewards/rejected": -3.454777479171753,
"step": 3435
},
{
"epoch": 0.9002878827532059,
"grad_norm": 8.613375663757324,
"learning_rate": 1.999476576812353e-06,
"logits/chosen": -2.9884345531463623,
"logits/rejected": -3.0320065021514893,
"logps/chosen": -296.2138366699219,
"logps/rejected": -302.5164489746094,
"loss": 0.5699,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6587432622909546,
"rewards/margins": 1.523429274559021,
"rewards/rejected": -3.1821722984313965,
"step": 3440
},
{
"epoch": 0.901596440722324,
"grad_norm": 5.533935070037842,
"learning_rate": 1.9733054174299923e-06,
"logits/chosen": -2.9528141021728516,
"logits/rejected": -2.9987263679504395,
"logps/chosen": -291.6798095703125,
"logps/rejected": -290.3052062988281,
"loss": 0.4147,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.4009933471679688,
"rewards/margins": 1.6354844570159912,
"rewards/rejected": -3.03647780418396,
"step": 3445
},
{
"epoch": 0.902904998691442,
"grad_norm": 9.2689847946167,
"learning_rate": 1.9471342580476316e-06,
"logits/chosen": -2.980245351791382,
"logits/rejected": -3.004823684692383,
"logps/chosen": -322.0609436035156,
"logps/rejected": -259.2235107421875,
"loss": 0.6756,
"rewards/accuracies": 0.6875,
"rewards/chosen": -2.1433141231536865,
"rewards/margins": 1.234440803527832,
"rewards/rejected": -3.3777554035186768,
"step": 3450
},
{
"epoch": 0.90421355666056,
"grad_norm": 10.556782722473145,
"learning_rate": 1.920963098665271e-06,
"logits/chosen": -3.0391366481781006,
"logits/rejected": -3.0794732570648193,
"logps/chosen": -274.0779724121094,
"logps/rejected": -266.954345703125,
"loss": 0.6314,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.3744312524795532,
"rewards/margins": 1.1151368618011475,
"rewards/rejected": -2.4895682334899902,
"step": 3455
},
{
"epoch": 0.9055221146296781,
"grad_norm": 9.761650085449219,
"learning_rate": 1.8947919392829104e-06,
"logits/chosen": -2.862259864807129,
"logits/rejected": -2.933992624282837,
"logps/chosen": -306.671142578125,
"logps/rejected": -282.5141296386719,
"loss": 0.624,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.6207879781723022,
"rewards/margins": 1.4092611074447632,
"rewards/rejected": -3.0300488471984863,
"step": 3460
},
{
"epoch": 0.9068306725987961,
"grad_norm": 5.749721527099609,
"learning_rate": 1.8686207799005498e-06,
"logits/chosen": -2.98447847366333,
"logits/rejected": -3.064558744430542,
"logps/chosen": -324.86114501953125,
"logps/rejected": -278.88970947265625,
"loss": 0.3623,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3682321310043335,
"rewards/margins": 1.78044855594635,
"rewards/rejected": -3.1486804485321045,
"step": 3465
},
{
"epoch": 0.9081392305679141,
"grad_norm": 8.051587104797363,
"learning_rate": 1.842449620518189e-06,
"logits/chosen": -2.9344372749328613,
"logits/rejected": -3.005045175552368,
"logps/chosen": -310.6846618652344,
"logps/rejected": -305.3709411621094,
"loss": 0.6313,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.7746692895889282,
"rewards/margins": 1.2048325538635254,
"rewards/rejected": -2.979501962661743,
"step": 3470
},
{
"epoch": 0.9094477885370322,
"grad_norm": 5.901284217834473,
"learning_rate": 1.8162784611358283e-06,
"logits/chosen": -2.8661179542541504,
"logits/rejected": -2.9399428367614746,
"logps/chosen": -287.2570495605469,
"logps/rejected": -279.65753173828125,
"loss": 0.4383,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3972651958465576,
"rewards/margins": 1.6384613513946533,
"rewards/rejected": -3.035726547241211,
"step": 3475
},
{
"epoch": 0.9107563465061502,
"grad_norm": 11.674195289611816,
"learning_rate": 1.790107301753468e-06,
"logits/chosen": -2.847024440765381,
"logits/rejected": -2.9919915199279785,
"logps/chosen": -363.022216796875,
"logps/rejected": -360.4650573730469,
"loss": 0.5059,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.6431465148925781,
"rewards/margins": 1.526166558265686,
"rewards/rejected": -3.1693129539489746,
"step": 3480
},
{
"epoch": 0.9120649044752682,
"grad_norm": 6.637493133544922,
"learning_rate": 1.7639361423711071e-06,
"logits/chosen": -2.994047164916992,
"logits/rejected": -3.043743371963501,
"logps/chosen": -257.53936767578125,
"logps/rejected": -255.7579345703125,
"loss": 0.5947,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4301214218139648,
"rewards/margins": 1.0894838571548462,
"rewards/rejected": -2.5196051597595215,
"step": 3485
},
{
"epoch": 0.9133734624443863,
"grad_norm": 7.341172695159912,
"learning_rate": 1.7377649829887466e-06,
"logits/chosen": -2.9541261196136475,
"logits/rejected": -3.027951240539551,
"logps/chosen": -282.1636657714844,
"logps/rejected": -250.5120849609375,
"loss": 0.4473,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2336229085922241,
"rewards/margins": 1.685388207435608,
"rewards/rejected": -2.919011354446411,
"step": 3490
},
{
"epoch": 0.9146820204135043,
"grad_norm": 5.005151271820068,
"learning_rate": 1.7115938236063858e-06,
"logits/chosen": -3.0645532608032227,
"logits/rejected": -3.0290141105651855,
"logps/chosen": -294.0321960449219,
"logps/rejected": -294.2281799316406,
"loss": 0.4609,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3800275325775146,
"rewards/margins": 1.3488258123397827,
"rewards/rejected": -2.728853225708008,
"step": 3495
},
{
"epoch": 0.9159905783826223,
"grad_norm": 8.08800983428955,
"learning_rate": 1.6854226642240254e-06,
"logits/chosen": -3.062406063079834,
"logits/rejected": -3.1058766841888428,
"logps/chosen": -282.1437683105469,
"logps/rejected": -263.7963562011719,
"loss": 0.4244,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.356877088546753,
"rewards/margins": 1.5647704601287842,
"rewards/rejected": -2.921647548675537,
"step": 3500
},
{
"epoch": 0.9172991363517404,
"grad_norm": 7.3398518562316895,
"learning_rate": 1.6592515048416647e-06,
"logits/chosen": -3.0162220001220703,
"logits/rejected": -3.0233070850372314,
"logps/chosen": -289.8272705078125,
"logps/rejected": -308.98638916015625,
"loss": 0.4882,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.6926946640014648,
"rewards/margins": 1.4996764659881592,
"rewards/rejected": -3.192371129989624,
"step": 3505
},
{
"epoch": 0.9186076943208584,
"grad_norm": 6.586010456085205,
"learning_rate": 1.6330803454593039e-06,
"logits/chosen": -2.993813991546631,
"logits/rejected": -3.0858685970306396,
"logps/chosen": -304.7975769042969,
"logps/rejected": -281.93597412109375,
"loss": 0.4022,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3266664743423462,
"rewards/margins": 1.8491216897964478,
"rewards/rejected": -3.175788164138794,
"step": 3510
},
{
"epoch": 0.9199162522899764,
"grad_norm": 5.34058141708374,
"learning_rate": 1.6069091860769433e-06,
"logits/chosen": -2.8806405067443848,
"logits/rejected": -3.0084080696105957,
"logps/chosen": -341.41949462890625,
"logps/rejected": -265.82244873046875,
"loss": 0.4117,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.258302927017212,
"rewards/margins": 1.7906951904296875,
"rewards/rejected": -3.0489978790283203,
"step": 3515
},
{
"epoch": 0.9212248102590945,
"grad_norm": 9.21333122253418,
"learning_rate": 1.5807380266945827e-06,
"logits/chosen": -3.036750078201294,
"logits/rejected": -3.039182186126709,
"logps/chosen": -262.3890686035156,
"logps/rejected": -282.87860107421875,
"loss": 0.6021,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4617328643798828,
"rewards/margins": 1.3960540294647217,
"rewards/rejected": -2.8577866554260254,
"step": 3520
},
{
"epoch": 0.9225333682282125,
"grad_norm": 7.1087212562561035,
"learning_rate": 1.5545668673122222e-06,
"logits/chosen": -2.8940954208374023,
"logits/rejected": -2.9105896949768066,
"logps/chosen": -310.8441162109375,
"logps/rejected": -275.7667541503906,
"loss": 0.3913,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1332801580429077,
"rewards/margins": 1.8010799884796143,
"rewards/rejected": -2.9343602657318115,
"step": 3525
},
{
"epoch": 0.9238419261973305,
"grad_norm": 8.39911937713623,
"learning_rate": 1.5283957079298614e-06,
"logits/chosen": -2.99474835395813,
"logits/rejected": -3.022717237472534,
"logps/chosen": -317.1966857910156,
"logps/rejected": -348.03912353515625,
"loss": 0.5075,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.2938311100006104,
"rewards/margins": 1.4564930200576782,
"rewards/rejected": -2.75032377243042,
"step": 3530
},
{
"epoch": 0.9251504841664486,
"grad_norm": 5.257741928100586,
"learning_rate": 1.5022245485475006e-06,
"logits/chosen": -3.0030980110168457,
"logits/rejected": -3.0215423107147217,
"logps/chosen": -259.13018798828125,
"logps/rejected": -269.13836669921875,
"loss": 0.6352,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4522452354431152,
"rewards/margins": 1.424629807472229,
"rewards/rejected": -2.8768749237060547,
"step": 3535
},
{
"epoch": 0.9264590421355666,
"grad_norm": 7.180831432342529,
"learning_rate": 1.4760533891651402e-06,
"logits/chosen": -2.985456705093384,
"logits/rejected": -3.057675838470459,
"logps/chosen": -301.18206787109375,
"logps/rejected": -271.5340881347656,
"loss": 0.4618,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2516207695007324,
"rewards/margins": 1.704184889793396,
"rewards/rejected": -2.955806016921997,
"step": 3540
},
{
"epoch": 0.9277676001046846,
"grad_norm": 5.434940814971924,
"learning_rate": 1.4498822297827795e-06,
"logits/chosen": -3.045609951019287,
"logits/rejected": -3.0566811561584473,
"logps/chosen": -275.1131896972656,
"logps/rejected": -282.7756042480469,
"loss": 0.4518,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5488427877426147,
"rewards/margins": 1.5551543235778809,
"rewards/rejected": -3.103996992111206,
"step": 3545
},
{
"epoch": 0.9290761580738026,
"grad_norm": 4.331151008605957,
"learning_rate": 1.4237110704004189e-06,
"logits/chosen": -2.9415290355682373,
"logits/rejected": -3.057833671569824,
"logps/chosen": -267.46356201171875,
"logps/rejected": -269.8301696777344,
"loss": 0.4672,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.1876513957977295,
"rewards/margins": 1.8198295831680298,
"rewards/rejected": -3.0074806213378906,
"step": 3550
},
{
"epoch": 0.9303847160429207,
"grad_norm": 5.2976813316345215,
"learning_rate": 1.3975399110180581e-06,
"logits/chosen": -2.95489764213562,
"logits/rejected": -2.976945400238037,
"logps/chosen": -272.0807189941406,
"logps/rejected": -283.7767333984375,
"loss": 0.5264,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.4996970891952515,
"rewards/margins": 1.2823091745376587,
"rewards/rejected": -2.7820065021514893,
"step": 3555
},
{
"epoch": 0.9316932740120387,
"grad_norm": 7.1556196212768555,
"learning_rate": 1.3713687516356975e-06,
"logits/chosen": -3.040180206298828,
"logits/rejected": -3.0060112476348877,
"logps/chosen": -238.9356689453125,
"logps/rejected": -238.90194702148438,
"loss": 0.4935,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2994797229766846,
"rewards/margins": 1.4473230838775635,
"rewards/rejected": -2.746802568435669,
"step": 3560
},
{
"epoch": 0.9330018319811567,
"grad_norm": 5.718963623046875,
"learning_rate": 1.345197592253337e-06,
"logits/chosen": -2.975170135498047,
"logits/rejected": -2.9809365272521973,
"logps/chosen": -262.6956787109375,
"logps/rejected": -278.5767822265625,
"loss": 0.3928,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.0227237939834595,
"rewards/margins": 1.6377525329589844,
"rewards/rejected": -2.6604764461517334,
"step": 3565
},
{
"epoch": 0.9343103899502748,
"grad_norm": 8.762356758117676,
"learning_rate": 1.3190264328709762e-06,
"logits/chosen": -2.9716594219207764,
"logits/rejected": -3.007230520248413,
"logps/chosen": -306.72589111328125,
"logps/rejected": -303.2646179199219,
"loss": 0.5313,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.334179401397705,
"rewards/margins": 1.589410662651062,
"rewards/rejected": -2.9235899448394775,
"step": 3570
},
{
"epoch": 0.9356189479193928,
"grad_norm": 9.781453132629395,
"learning_rate": 1.2928552734886158e-06,
"logits/chosen": -2.9384396076202393,
"logits/rejected": -3.0885565280914307,
"logps/chosen": -293.48382568359375,
"logps/rejected": -283.4349670410156,
"loss": 0.6182,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.8570163249969482,
"rewards/margins": 1.306370496749878,
"rewards/rejected": -3.163386821746826,
"step": 3575
},
{
"epoch": 0.9369275058885108,
"grad_norm": 7.885117053985596,
"learning_rate": 1.266684114106255e-06,
"logits/chosen": -2.9314887523651123,
"logits/rejected": -3.0194966793060303,
"logps/chosen": -291.6297912597656,
"logps/rejected": -270.6680603027344,
"loss": 0.5607,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.429483413696289,
"rewards/margins": 1.4721983671188354,
"rewards/rejected": -2.901681661605835,
"step": 3580
},
{
"epoch": 0.9382360638576289,
"grad_norm": 9.249140739440918,
"learning_rate": 1.2405129547238943e-06,
"logits/chosen": -2.9748375415802,
"logits/rejected": -3.042620897293091,
"logps/chosen": -338.7610778808594,
"logps/rejected": -306.6709899902344,
"loss": 0.5191,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.091373324394226,
"rewards/margins": 1.554322600364685,
"rewards/rejected": -2.645695924758911,
"step": 3585
},
{
"epoch": 0.9395446218267469,
"grad_norm": 7.393261432647705,
"learning_rate": 1.2143417953415337e-06,
"logits/chosen": -3.0142085552215576,
"logits/rejected": -3.020599603652954,
"logps/chosen": -287.16290283203125,
"logps/rejected": -286.26214599609375,
"loss": 0.4996,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.6127214431762695,
"rewards/margins": 1.7448869943618774,
"rewards/rejected": -3.3576083183288574,
"step": 3590
},
{
"epoch": 0.9408531797958649,
"grad_norm": 10.740133285522461,
"learning_rate": 1.1881706359591731e-06,
"logits/chosen": -3.0725698471069336,
"logits/rejected": -3.0806450843811035,
"logps/chosen": -308.4686279296875,
"logps/rejected": -282.9092712402344,
"loss": 0.5655,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3958985805511475,
"rewards/margins": 1.4924328327178955,
"rewards/rejected": -2.888331651687622,
"step": 3595
},
{
"epoch": 0.942161737764983,
"grad_norm": 5.223201274871826,
"learning_rate": 1.1619994765768126e-06,
"logits/chosen": -2.972827672958374,
"logits/rejected": -2.9871978759765625,
"logps/chosen": -298.4678955078125,
"logps/rejected": -260.34326171875,
"loss": 0.511,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.3953911066055298,
"rewards/margins": 1.678969144821167,
"rewards/rejected": -3.0743606090545654,
"step": 3600
},
{
"epoch": 0.942161737764983,
"eval_logits/chosen": -3.021827459335327,
"eval_logits/rejected": -3.038386583328247,
"eval_logps/chosen": -297.16925048828125,
"eval_logps/rejected": -292.7310485839844,
"eval_loss": 0.4937221109867096,
"eval_rewards/accuracies": 0.7570000290870667,
"eval_rewards/chosen": -1.393584966659546,
"eval_rewards/margins": 1.4860624074935913,
"eval_rewards/rejected": -2.8796472549438477,
"eval_runtime": 762.7775,
"eval_samples_per_second": 2.622,
"eval_steps_per_second": 0.328,
"step": 3600
},
{
"epoch": 0.943470295734101,
"grad_norm": 9.47269058227539,
"learning_rate": 1.1358283171944518e-06,
"logits/chosen": -3.0384669303894043,
"logits/rejected": -3.0766215324401855,
"logps/chosen": -334.4903869628906,
"logps/rejected": -284.4979553222656,
"loss": 0.3992,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.2259769439697266,
"rewards/margins": 1.6997013092041016,
"rewards/rejected": -2.925678253173828,
"step": 3605
},
{
"epoch": 0.944778853703219,
"grad_norm": 5.090957164764404,
"learning_rate": 1.1096571578120912e-06,
"logits/chosen": -3.0102407932281494,
"logits/rejected": -3.027517318725586,
"logps/chosen": -324.6131286621094,
"logps/rejected": -330.85943603515625,
"loss": 0.4659,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.461601972579956,
"rewards/margins": 1.6823663711547852,
"rewards/rejected": -3.1439685821533203,
"step": 3610
},
{
"epoch": 0.9460874116723371,
"grad_norm": 6.4353485107421875,
"learning_rate": 1.0834859984297304e-06,
"logits/chosen": -3.0820727348327637,
"logits/rejected": -3.122159481048584,
"logps/chosen": -327.16680908203125,
"logps/rejected": -324.119140625,
"loss": 0.5123,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.2964187860488892,
"rewards/margins": 1.2985150814056396,
"rewards/rejected": -2.5949337482452393,
"step": 3615
},
{
"epoch": 0.9473959696414551,
"grad_norm": 10.36352252960205,
"learning_rate": 1.0573148390473699e-06,
"logits/chosen": -2.820996046066284,
"logits/rejected": -2.93684720993042,
"logps/chosen": -257.29962158203125,
"logps/rejected": -299.9443664550781,
"loss": 0.469,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3600294589996338,
"rewards/margins": 1.814413070678711,
"rewards/rejected": -3.1744422912597656,
"step": 3620
},
{
"epoch": 0.9487045276105731,
"grad_norm": 7.558009147644043,
"learning_rate": 1.0311436796650093e-06,
"logits/chosen": -3.0475263595581055,
"logits/rejected": -3.0534262657165527,
"logps/chosen": -290.08026123046875,
"logps/rejected": -308.2751159667969,
"loss": 0.3977,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.28037428855896,
"rewards/margins": 1.833155870437622,
"rewards/rejected": -3.1135306358337402,
"step": 3625
},
{
"epoch": 0.9500130855796912,
"grad_norm": 4.711981773376465,
"learning_rate": 1.0049725202826487e-06,
"logits/chosen": -2.9161949157714844,
"logits/rejected": -3.079786777496338,
"logps/chosen": -344.1520080566406,
"logps/rejected": -316.8307189941406,
"loss": 0.494,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3298174142837524,
"rewards/margins": 1.6952533721923828,
"rewards/rejected": -3.0250706672668457,
"step": 3630
},
{
"epoch": 0.9513216435488092,
"grad_norm": 11.685251235961914,
"learning_rate": 9.78801360900288e-07,
"logits/chosen": -3.006196975708008,
"logits/rejected": -3.021721601486206,
"logps/chosen": -306.54205322265625,
"logps/rejected": -291.6183776855469,
"loss": 0.5325,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6336702108383179,
"rewards/margins": 1.6822528839111328,
"rewards/rejected": -3.3159232139587402,
"step": 3635
},
{
"epoch": 0.9526302015179272,
"grad_norm": 7.035577297210693,
"learning_rate": 9.526302015179273e-07,
"logits/chosen": -3.0137877464294434,
"logits/rejected": -3.0155646800994873,
"logps/chosen": -280.07354736328125,
"logps/rejected": -279.3880310058594,
"loss": 0.4263,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.2827895879745483,
"rewards/margins": 1.7637962102890015,
"rewards/rejected": -3.04658579826355,
"step": 3640
},
{
"epoch": 0.9539387594870453,
"grad_norm": 6.1892852783203125,
"learning_rate": 9.264590421355667e-07,
"logits/chosen": -3.0102293491363525,
"logits/rejected": -3.043808937072754,
"logps/chosen": -282.82232666015625,
"logps/rejected": -295.6250915527344,
"loss": 0.5537,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4279065132141113,
"rewards/margins": 1.4096248149871826,
"rewards/rejected": -2.837531566619873,
"step": 3645
},
{
"epoch": 0.9552473174561633,
"grad_norm": 7.0120344161987305,
"learning_rate": 9.00287882753206e-07,
"logits/chosen": -2.8158748149871826,
"logits/rejected": -3.0110549926757812,
"logps/chosen": -322.323974609375,
"logps/rejected": -296.1746520996094,
"loss": 0.5607,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5179932117462158,
"rewards/margins": 1.3574055433273315,
"rewards/rejected": -2.875398635864258,
"step": 3650
},
{
"epoch": 0.9565558754252813,
"grad_norm": 6.899664878845215,
"learning_rate": 8.741167233708454e-07,
"logits/chosen": -2.86753249168396,
"logits/rejected": -3.003220558166504,
"logps/chosen": -288.17242431640625,
"logps/rejected": -287.6828918457031,
"loss": 0.5421,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4167354106903076,
"rewards/margins": 1.3222873210906982,
"rewards/rejected": -2.739022731781006,
"step": 3655
},
{
"epoch": 0.9578644333943994,
"grad_norm": 7.400771617889404,
"learning_rate": 8.479455639884849e-07,
"logits/chosen": -2.8408029079437256,
"logits/rejected": -3.037991762161255,
"logps/chosen": -289.79815673828125,
"logps/rejected": -266.6900939941406,
"loss": 0.4928,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5893640518188477,
"rewards/margins": 1.4343297481536865,
"rewards/rejected": -3.023693799972534,
"step": 3660
},
{
"epoch": 0.9591729913635174,
"grad_norm": 7.997771263122559,
"learning_rate": 8.217744046061241e-07,
"logits/chosen": -3.0197484493255615,
"logits/rejected": -2.9930665493011475,
"logps/chosen": -272.28826904296875,
"logps/rejected": -285.2413024902344,
"loss": 0.5595,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3076242208480835,
"rewards/margins": 1.3322126865386963,
"rewards/rejected": -2.6398367881774902,
"step": 3665
},
{
"epoch": 0.9604815493326354,
"grad_norm": 8.226212501525879,
"learning_rate": 7.956032452237634e-07,
"logits/chosen": -3.0422444343566895,
"logits/rejected": -3.020461320877075,
"logps/chosen": -278.33685302734375,
"logps/rejected": -284.96014404296875,
"loss": 0.4872,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0055291652679443,
"rewards/margins": 1.4944764375686646,
"rewards/rejected": -2.5000054836273193,
"step": 3670
},
{
"epoch": 0.9617901073017535,
"grad_norm": 4.699522018432617,
"learning_rate": 7.694320858414028e-07,
"logits/chosen": -3.019260883331299,
"logits/rejected": -3.022517442703247,
"logps/chosen": -271.10174560546875,
"logps/rejected": -268.4229431152344,
"loss": 0.4087,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1063932180404663,
"rewards/margins": 1.805824875831604,
"rewards/rejected": -2.9122183322906494,
"step": 3675
},
{
"epoch": 0.9630986652708715,
"grad_norm": 7.629035949707031,
"learning_rate": 7.432609264590422e-07,
"logits/chosen": -3.0566253662109375,
"logits/rejected": -3.069202423095703,
"logps/chosen": -271.2048034667969,
"logps/rejected": -276.36383056640625,
"loss": 0.4807,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0480868816375732,
"rewards/margins": 1.5648603439331055,
"rewards/rejected": -2.6129469871520996,
"step": 3680
},
{
"epoch": 0.9644072232399895,
"grad_norm": 6.998068809509277,
"learning_rate": 7.170897670766816e-07,
"logits/chosen": -2.845982789993286,
"logits/rejected": -2.9252867698669434,
"logps/chosen": -268.194091796875,
"logps/rejected": -262.69482421875,
"loss": 0.6263,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -2.0061893463134766,
"rewards/margins": 0.8048659563064575,
"rewards/rejected": -2.8110554218292236,
"step": 3685
},
{
"epoch": 0.9657157812091076,
"grad_norm": 4.9705586433410645,
"learning_rate": 6.90918607694321e-07,
"logits/chosen": -2.9738311767578125,
"logits/rejected": -2.96089768409729,
"logps/chosen": -269.18121337890625,
"logps/rejected": -312.06085205078125,
"loss": 0.4902,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.5259528160095215,
"rewards/margins": 1.5478928089141846,
"rewards/rejected": -3.073845624923706,
"step": 3690
},
{
"epoch": 0.9670243391782256,
"grad_norm": 7.587747573852539,
"learning_rate": 6.647474483119602e-07,
"logits/chosen": -2.985318660736084,
"logits/rejected": -3.0679590702056885,
"logps/chosen": -281.947265625,
"logps/rejected": -309.543701171875,
"loss": 0.451,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.10109543800354,
"rewards/margins": 1.621584177017212,
"rewards/rejected": -2.722679615020752,
"step": 3695
},
{
"epoch": 0.9683328971473436,
"grad_norm": 7.26895809173584,
"learning_rate": 6.385762889295996e-07,
"logits/chosen": -2.9335684776306152,
"logits/rejected": -2.8907690048217773,
"logps/chosen": -280.83489990234375,
"logps/rejected": -266.060546875,
"loss": 0.4105,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.2659553289413452,
"rewards/margins": 1.7439613342285156,
"rewards/rejected": -3.009916305541992,
"step": 3700
},
{
"epoch": 0.9696414551164617,
"grad_norm": 7.7010111808776855,
"learning_rate": 6.12405129547239e-07,
"logits/chosen": -2.914029836654663,
"logits/rejected": -2.911940097808838,
"logps/chosen": -285.9615173339844,
"logps/rejected": -265.48681640625,
"loss": 0.5042,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1594058275222778,
"rewards/margins": 1.309971570968628,
"rewards/rejected": -2.4693775177001953,
"step": 3705
},
{
"epoch": 0.9709500130855797,
"grad_norm": 9.604898452758789,
"learning_rate": 5.862339701648783e-07,
"logits/chosen": -2.9038796424865723,
"logits/rejected": -2.8771822452545166,
"logps/chosen": -322.8350830078125,
"logps/rejected": -357.41046142578125,
"loss": 0.5632,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2673771381378174,
"rewards/margins": 1.460440993309021,
"rewards/rejected": -2.727818012237549,
"step": 3710
},
{
"epoch": 0.9722585710546977,
"grad_norm": 4.567657947540283,
"learning_rate": 5.600628107825177e-07,
"logits/chosen": -2.9830799102783203,
"logits/rejected": -3.0490164756774902,
"logps/chosen": -347.1837463378906,
"logps/rejected": -324.29754638671875,
"loss": 0.4124,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.351874589920044,
"rewards/margins": 1.7234786748886108,
"rewards/rejected": -3.0753531455993652,
"step": 3715
},
{
"epoch": 0.9735671290238157,
"grad_norm": 7.263519763946533,
"learning_rate": 5.338916514001571e-07,
"logits/chosen": -3.0393004417419434,
"logits/rejected": -3.0236408710479736,
"logps/chosen": -294.21710205078125,
"logps/rejected": -302.565673828125,
"loss": 0.4879,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2430657148361206,
"rewards/margins": 1.4778881072998047,
"rewards/rejected": -2.7209537029266357,
"step": 3720
},
{
"epoch": 0.9748756869929338,
"grad_norm": 10.831700325012207,
"learning_rate": 5.077204920177964e-07,
"logits/chosen": -3.0219333171844482,
"logits/rejected": -3.1004836559295654,
"logps/chosen": -319.2060852050781,
"logps/rejected": -278.51043701171875,
"loss": 0.4765,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4795074462890625,
"rewards/margins": 1.5765758752822876,
"rewards/rejected": -3.0560834407806396,
"step": 3725
},
{
"epoch": 0.9761842449620518,
"grad_norm": 8.930159568786621,
"learning_rate": 4.815493326354357e-07,
"logits/chosen": -2.993366003036499,
"logits/rejected": -3.02050518989563,
"logps/chosen": -330.6523132324219,
"logps/rejected": -347.84375,
"loss": 0.4885,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2553311586380005,
"rewards/margins": 1.5938208103179932,
"rewards/rejected": -2.849151849746704,
"step": 3730
},
{
"epoch": 0.9774928029311698,
"grad_norm": 8.077557563781738,
"learning_rate": 4.5537817325307516e-07,
"logits/chosen": -3.0191256999969482,
"logits/rejected": -2.950045347213745,
"logps/chosen": -328.70172119140625,
"logps/rejected": -297.58648681640625,
"loss": 0.4838,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5267088413238525,
"rewards/margins": 1.4226138591766357,
"rewards/rejected": -2.9493229389190674,
"step": 3735
},
{
"epoch": 0.9788013609002879,
"grad_norm": 11.086000442504883,
"learning_rate": 4.2920701387071454e-07,
"logits/chosen": -2.9949474334716797,
"logits/rejected": -3.0554397106170654,
"logps/chosen": -310.36053466796875,
"logps/rejected": -330.1317138671875,
"loss": 0.6288,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3402494192123413,
"rewards/margins": 1.4754968881607056,
"rewards/rejected": -2.815746307373047,
"step": 3740
},
{
"epoch": 0.9801099188694059,
"grad_norm": 9.768365859985352,
"learning_rate": 4.0303585448835386e-07,
"logits/chosen": -2.7967236042022705,
"logits/rejected": -2.9284253120422363,
"logps/chosen": -260.5437927246094,
"logps/rejected": -275.2959899902344,
"loss": 0.5966,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4888842105865479,
"rewards/margins": 1.163434386253357,
"rewards/rejected": -2.6523184776306152,
"step": 3745
},
{
"epoch": 0.9814184768385239,
"grad_norm": 5.782496452331543,
"learning_rate": 3.7686469510599324e-07,
"logits/chosen": -3.020731210708618,
"logits/rejected": -3.06559157371521,
"logps/chosen": -329.1947937011719,
"logps/rejected": -303.79425048828125,
"loss": 0.4177,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.4123915433883667,
"rewards/margins": 1.5942955017089844,
"rewards/rejected": -3.0066869258880615,
"step": 3750
},
{
"epoch": 0.982727034807642,
"grad_norm": 6.72975492477417,
"learning_rate": 3.506935357236326e-07,
"logits/chosen": -3.057302951812744,
"logits/rejected": -3.0887694358825684,
"logps/chosen": -249.98825073242188,
"logps/rejected": -255.726806640625,
"loss": 0.48,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3327161073684692,
"rewards/margins": 1.2151663303375244,
"rewards/rejected": -2.547882318496704,
"step": 3755
},
{
"epoch": 0.98403559277676,
"grad_norm": 5.480231285095215,
"learning_rate": 3.2452237634127194e-07,
"logits/chosen": -2.945413112640381,
"logits/rejected": -3.0293281078338623,
"logps/chosen": -278.617431640625,
"logps/rejected": -259.78509521484375,
"loss": 0.3754,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1209717988967896,
"rewards/margins": 1.8064987659454346,
"rewards/rejected": -2.9274706840515137,
"step": 3760
},
{
"epoch": 0.985344150745878,
"grad_norm": 8.33041763305664,
"learning_rate": 2.983512169589113e-07,
"logits/chosen": -2.937929630279541,
"logits/rejected": -2.863677501678467,
"logps/chosen": -267.8946838378906,
"logps/rejected": -290.6978454589844,
"loss": 0.4701,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3163795471191406,
"rewards/margins": 1.3320225477218628,
"rewards/rejected": -2.648401975631714,
"step": 3765
},
{
"epoch": 0.9866527087149961,
"grad_norm": 7.798052787780762,
"learning_rate": 2.7218005757655065e-07,
"logits/chosen": -2.946786642074585,
"logits/rejected": -3.0757522583007812,
"logps/chosen": -303.80364990234375,
"logps/rejected": -338.39898681640625,
"loss": 0.6613,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.5532114505767822,
"rewards/margins": 1.0035349130630493,
"rewards/rejected": -2.556746482849121,
"step": 3770
},
{
"epoch": 0.9879612666841141,
"grad_norm": 8.479823112487793,
"learning_rate": 2.4600889819419e-07,
"logits/chosen": -2.9792187213897705,
"logits/rejected": -3.042782783508301,
"logps/chosen": -345.6416015625,
"logps/rejected": -288.8009033203125,
"loss": 0.5072,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4748677015304565,
"rewards/margins": 1.32082998752594,
"rewards/rejected": -2.7956976890563965,
"step": 3775
},
{
"epoch": 0.9892698246532321,
"grad_norm": 7.717909812927246,
"learning_rate": 2.198377388118294e-07,
"logits/chosen": -2.9219813346862793,
"logits/rejected": -3.069363832473755,
"logps/chosen": -334.748046875,
"logps/rejected": -300.7204895019531,
"loss": 0.3629,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.2521204948425293,
"rewards/margins": 1.8430713415145874,
"rewards/rejected": -3.095191717147827,
"step": 3780
},
{
"epoch": 0.9905783826223502,
"grad_norm": 8.238800048828125,
"learning_rate": 1.9366657942946875e-07,
"logits/chosen": -3.0556554794311523,
"logits/rejected": -2.9894509315490723,
"logps/chosen": -259.3734130859375,
"logps/rejected": -261.5130310058594,
"loss": 0.5617,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3929826021194458,
"rewards/margins": 1.1740365028381348,
"rewards/rejected": -2.567018985748291,
"step": 3785
},
{
"epoch": 0.9918869405914682,
"grad_norm": 6.61944580078125,
"learning_rate": 1.6749542004710808e-07,
"logits/chosen": -2.9529948234558105,
"logits/rejected": -3.041637897491455,
"logps/chosen": -303.34112548828125,
"logps/rejected": -273.41619873046875,
"loss": 0.4298,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2014284133911133,
"rewards/margins": 1.7370707988739014,
"rewards/rejected": -2.9384992122650146,
"step": 3790
},
{
"epoch": 0.9931954985605862,
"grad_norm": 8.906229972839355,
"learning_rate": 1.4132426066474745e-07,
"logits/chosen": -2.8641409873962402,
"logits/rejected": -2.9114432334899902,
"logps/chosen": -270.04541015625,
"logps/rejected": -307.6247863769531,
"loss": 0.553,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.198313593864441,
"rewards/margins": 1.2590378522872925,
"rewards/rejected": -2.4573516845703125,
"step": 3795
},
{
"epoch": 0.9945040565297043,
"grad_norm": 8.40211009979248,
"learning_rate": 1.1515310128238682e-07,
"logits/chosen": -2.9926180839538574,
"logits/rejected": -3.0532259941101074,
"logps/chosen": -248.5250244140625,
"logps/rejected": -262.3029479980469,
"loss": 0.5102,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -1.4260847568511963,
"rewards/margins": 1.5013940334320068,
"rewards/rejected": -2.927478790283203,
"step": 3800
},
{
"epoch": 0.9958126144988223,
"grad_norm": 8.588583946228027,
"learning_rate": 8.898194190002618e-08,
"logits/chosen": -3.000002384185791,
"logits/rejected": -3.0484166145324707,
"logps/chosen": -324.4134826660156,
"logps/rejected": -311.4660339355469,
"loss": 0.5217,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3233641386032104,
"rewards/margins": 1.193496584892273,
"rewards/rejected": -2.5168607234954834,
"step": 3805
},
{
"epoch": 0.9971211724679403,
"grad_norm": 7.888390064239502,
"learning_rate": 6.281078251766554e-08,
"logits/chosen": -3.020127773284912,
"logits/rejected": -3.0420382022857666,
"logps/chosen": -278.3679504394531,
"logps/rejected": -265.987060546875,
"loss": 0.5211,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2625151872634888,
"rewards/margins": 1.2696560621261597,
"rewards/rejected": -2.5321712493896484,
"step": 3810
},
{
"epoch": 0.9984297304370584,
"grad_norm": 7.750138759613037,
"learning_rate": 3.6639623135304896e-08,
"logits/chosen": -2.9767794609069824,
"logits/rejected": -3.0281155109405518,
"logps/chosen": -273.447509765625,
"logps/rejected": -290.362060546875,
"loss": 0.5369,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.5177167654037476,
"rewards/margins": 1.1918412446975708,
"rewards/rejected": -2.7095580101013184,
"step": 3815
},
{
"epoch": 0.9997382884061764,
"grad_norm": 8.182830810546875,
"learning_rate": 1.0468463752944255e-08,
"logits/chosen": -2.9969050884246826,
"logits/rejected": -3.121110200881958,
"logps/chosen": -325.16021728515625,
"logps/rejected": -278.646728515625,
"loss": 0.5479,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2306115627288818,
"rewards/margins": 1.541174054145813,
"rewards/rejected": -2.7717857360839844,
"step": 3820
},
{
"epoch": 1.0,
"step": 3821,
"total_flos": 0.0,
"train_loss": 0.5167291775107914,
"train_runtime": 38075.0654,
"train_samples_per_second": 1.606,
"train_steps_per_second": 0.1
}
],
"logging_steps": 5,
"max_steps": 3821,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}