dpo_lr_finder_callback / trainer_state.json
bimabk's picture
Upload task output 281ac9fc-853b-477d-8e84-145f9030d732
b89ffb1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9871794871794872,
"eval_steps": 500,
"global_step": 699,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021367521367521368,
"grad_norm": 16.125,
"learning_rate": 1.7556090538745385e-06,
"logits/chosen": -3.5722389221191406,
"logits/rejected": -3.5034377574920654,
"logps/chosen": -41.095054626464844,
"logps/rejected": -79.83882141113281,
"loss": 0.6922,
"rewards/accuracies": 0.3500000238418579,
"rewards/chosen": 0.0014523781137540936,
"rewards/margins": 0.001936142100021243,
"rewards/rejected": -0.0004837641608901322,
"step": 5
},
{
"epoch": 0.042735042735042736,
"grad_norm": 16.75,
"learning_rate": 3.950120371217711e-06,
"logits/chosen": -3.586623430252075,
"logits/rejected": -3.506187915802002,
"logps/chosen": -40.016441345214844,
"logps/rejected": -78.24286651611328,
"loss": 0.6796,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.002693606074899435,
"rewards/margins": 0.027483653277158737,
"rewards/rejected": -0.03017725981771946,
"step": 10
},
{
"epoch": 0.0641025641025641,
"grad_norm": 14.75,
"learning_rate": 6.144631688560886e-06,
"logits/chosen": -3.5991673469543457,
"logits/rejected": -3.50789213180542,
"logps/chosen": -39.836097717285156,
"logps/rejected": -81.42815399169922,
"loss": 0.6306,
"rewards/accuracies": 0.9916666746139526,
"rewards/chosen": -0.006502463016659021,
"rewards/margins": 0.1313387155532837,
"rewards/rejected": -0.13784119486808777,
"step": 15
},
{
"epoch": 0.08547008547008547,
"grad_norm": 13.625,
"learning_rate": 8.339143005904057e-06,
"logits/chosen": -3.5530943870544434,
"logits/rejected": -3.492051601409912,
"logps/chosen": -39.75938034057617,
"logps/rejected": -80.72886657714844,
"loss": 0.5434,
"rewards/accuracies": 0.9791666865348816,
"rewards/chosen": -0.015247734263539314,
"rewards/margins": 0.33922019600868225,
"rewards/rejected": -0.3544679284095764,
"step": 20
},
{
"epoch": 0.10683760683760683,
"grad_norm": 11.1875,
"learning_rate": 1.0533654323247232e-05,
"logits/chosen": -3.512582302093506,
"logits/rejected": -3.4719951152801514,
"logps/chosen": -39.996891021728516,
"logps/rejected": -85.54742431640625,
"loss": 0.3961,
"rewards/accuracies": 0.98333340883255,
"rewards/chosen": -0.04132762551307678,
"rewards/margins": 0.7773466110229492,
"rewards/rejected": -0.8186742067337036,
"step": 25
},
{
"epoch": 0.1282051282051282,
"grad_norm": 7.5625,
"learning_rate": 1.2728165640590407e-05,
"logits/chosen": -3.4401345252990723,
"logits/rejected": -3.4451375007629395,
"logps/chosen": -42.438926696777344,
"logps/rejected": -93.06343078613281,
"loss": 0.2584,
"rewards/accuracies": 0.9958332777023315,
"rewards/chosen": -0.13156814873218536,
"rewards/margins": 1.4302090406417847,
"rewards/rejected": -1.561777114868164,
"step": 30
},
{
"epoch": 0.14957264957264957,
"grad_norm": 4.28125,
"learning_rate": 1.4922676957933578e-05,
"logits/chosen": -3.220163345336914,
"logits/rejected": -3.2657477855682373,
"logps/chosen": -44.191307067871094,
"logps/rejected": -108.76808166503906,
"loss": 0.1195,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4397502541542053,
"rewards/margins": 2.7580618858337402,
"rewards/rejected": -3.19781231880188,
"step": 35
},
{
"epoch": 0.17094017094017094,
"grad_norm": 2.0625,
"learning_rate": 1.5360556888469565e-05,
"logits/chosen": -2.7749853134155273,
"logits/rejected": -2.807795286178589,
"logps/chosen": -52.53513717651367,
"logps/rejected": -143.27357482910156,
"loss": 0.0426,
"rewards/accuracies": 0.9916666746139526,
"rewards/chosen": -1.2650867700576782,
"rewards/margins": 5.236789226531982,
"rewards/rejected": -6.501875877380371,
"step": 40
},
{
"epoch": 0.19230769230769232,
"grad_norm": 1.71875,
"learning_rate": 1.535640428282884e-05,
"logits/chosen": -2.3302321434020996,
"logits/rejected": -2.289696216583252,
"logps/chosen": -51.40728759765625,
"logps/rejected": -162.12364196777344,
"loss": 0.0168,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.279262900352478,
"rewards/margins": 7.274726867675781,
"rewards/rejected": -8.553990364074707,
"step": 45
},
{
"epoch": 0.21367521367521367,
"grad_norm": 6.5,
"learning_rate": 1.5349059809872097e-05,
"logits/chosen": -1.9786951541900635,
"logits/rejected": -1.9170547723770142,
"logps/chosen": -57.05157470703125,
"logps/rejected": -180.14271545410156,
"loss": 0.0191,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.664361596107483,
"rewards/margins": 8.5733003616333,
"rewards/rejected": -10.237661361694336,
"step": 50
},
{
"epoch": 0.23504273504273504,
"grad_norm": 0.67578125,
"learning_rate": 1.5338527542732884e-05,
"logits/chosen": -1.976489782333374,
"logits/rejected": -1.909419059753418,
"logps/chosen": -58.60235595703125,
"logps/rejected": -196.4000701904297,
"loss": 0.0097,
"rewards/accuracies": 0.9958332777023315,
"rewards/chosen": -1.8706943988800049,
"rewards/margins": 9.873316764831543,
"rewards/rejected": -11.744011878967285,
"step": 55
},
{
"epoch": 0.2564102564102564,
"grad_norm": 3.234375,
"learning_rate": 1.532481332244717e-05,
"logits/chosen": -1.9295637607574463,
"logits/rejected": -1.817983627319336,
"logps/chosen": -57.39198684692383,
"logps/rejected": -199.2140350341797,
"loss": 0.0047,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8252334594726562,
"rewards/margins": 10.293539047241211,
"rewards/rejected": -12.11877155303955,
"step": 60
},
{
"epoch": 0.2777777777777778,
"grad_norm": 9.125,
"learning_rate": 1.5307924754713968e-05,
"logits/chosen": -1.9402471780776978,
"logits/rejected": -1.8663572072982788,
"logps/chosen": -59.858306884765625,
"logps/rejected": -196.60189819335938,
"loss": 0.0108,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.9207541942596436,
"rewards/margins": 9.978109359741211,
"rewards/rejected": -11.89886474609375,
"step": 65
},
{
"epoch": 0.29914529914529914,
"grad_norm": 0.298828125,
"learning_rate": 1.528787120567736e-05,
"logits/chosen": -2.046313762664795,
"logits/rejected": -1.9556039571762085,
"logps/chosen": -54.13606643676758,
"logps/rejected": -192.9593963623047,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4830687046051025,
"rewards/margins": 10.119759559631348,
"rewards/rejected": -11.602827072143555,
"step": 70
},
{
"epoch": 0.32051282051282054,
"grad_norm": 0.400390625,
"learning_rate": 1.526466379673215e-05,
"logits/chosen": -2.0463297367095947,
"logits/rejected": -1.9465014934539795,
"logps/chosen": -53.60888671875,
"logps/rejected": -200.31202697753906,
"loss": 0.0074,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.4517700672149658,
"rewards/margins": 10.894501686096191,
"rewards/rejected": -12.346272468566895,
"step": 75
},
{
"epoch": 0.3418803418803419,
"grad_norm": 14.0,
"learning_rate": 1.5238315398356126e-05,
"logits/chosen": -1.9893850088119507,
"logits/rejected": -1.8696527481079102,
"logps/chosen": -55.793907165527344,
"logps/rejected": -205.0140838623047,
"loss": 0.0099,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.6301085948944092,
"rewards/margins": 11.256315231323242,
"rewards/rejected": -12.88642406463623,
"step": 80
},
{
"epoch": 0.36324786324786323,
"grad_norm": 0.040283203125,
"learning_rate": 1.5208840622972272e-05,
"logits/chosen": -1.9942238330841064,
"logits/rejected": -1.8389371633529663,
"logps/chosen": -60.305450439453125,
"logps/rejected": -209.89016723632812,
"loss": 0.0228,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.8958022594451904,
"rewards/margins": 11.26887321472168,
"rewards/rejected": -13.164674758911133,
"step": 85
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.68359375,
"learning_rate": 1.5176255816844948e-05,
"logits/chosen": -1.958398461341858,
"logits/rejected": -1.7746648788452148,
"logps/chosen": -53.41706466674805,
"logps/rejected": -208.37869262695312,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.439720869064331,
"rewards/margins": 11.646267890930176,
"rewards/rejected": -13.08598804473877,
"step": 90
},
{
"epoch": 0.405982905982906,
"grad_norm": 0.67578125,
"learning_rate": 1.5140579051014502e-05,
"logits/chosen": -1.9176127910614014,
"logits/rejected": -1.6993322372436523,
"logps/chosen": -57.2091178894043,
"logps/rejected": -224.92782592773438,
"loss": 0.0026,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.716392159461975,
"rewards/margins": 12.696627616882324,
"rewards/rejected": -14.413020133972168,
"step": 95
},
{
"epoch": 0.42735042735042733,
"grad_norm": 0.3203125,
"learning_rate": 1.5101830111275334e-05,
"logits/chosen": -1.874871015548706,
"logits/rejected": -1.682807207107544,
"logps/chosen": -63.0499267578125,
"logps/rejected": -215.8572235107422,
"loss": 0.0118,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.3144705295562744,
"rewards/margins": 11.472696304321289,
"rewards/rejected": -13.787165641784668,
"step": 100
},
{
"epoch": 0.44871794871794873,
"grad_norm": 4.21875,
"learning_rate": 1.5060030487203004e-05,
"logits/chosen": -1.8294957876205444,
"logits/rejected": -1.583496332168579,
"logps/chosen": -67.67689514160156,
"logps/rejected": -231.562744140625,
"loss": 0.006,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.7576889991760254,
"rewards/margins": 12.489922523498535,
"rewards/rejected": -15.247611999511719,
"step": 105
},
{
"epoch": 0.4700854700854701,
"grad_norm": 0.1015625,
"learning_rate": 1.501520336023643e-05,
"logits/chosen": -1.7654807567596436,
"logits/rejected": -1.4742016792297363,
"logps/chosen": -66.52511596679688,
"logps/rejected": -227.6389923095703,
"loss": 0.0016,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.638422966003418,
"rewards/margins": 12.420553207397461,
"rewards/rejected": -15.058975219726562,
"step": 110
},
{
"epoch": 0.49145299145299143,
"grad_norm": 1.7421875,
"learning_rate": 1.4967373590821828e-05,
"logits/chosen": -1.7195736169815063,
"logits/rejected": -1.4602675437927246,
"logps/chosen": -63.84660720825195,
"logps/rejected": -229.90017700195312,
"loss": 0.0106,
"rewards/accuracies": 0.9916666746139526,
"rewards/chosen": -2.3248353004455566,
"rewards/margins": 12.83845329284668,
"rewards/rejected": -15.163289070129395,
"step": 115
},
{
"epoch": 0.5128205128205128,
"grad_norm": 0.134765625,
"learning_rate": 1.491656770462546e-05,
"logits/chosen": -1.617491364479065,
"logits/rejected": -1.2875694036483765,
"logps/chosen": -59.133209228515625,
"logps/rejected": -232.18191528320312,
"loss": 0.0038,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9275062084197998,
"rewards/margins": 13.423171997070312,
"rewards/rejected": -15.350679397583008,
"step": 120
},
{
"epoch": 0.5341880341880342,
"grad_norm": 11.0,
"learning_rate": 1.4862813877822923e-05,
"logits/chosen": -1.6853389739990234,
"logits/rejected": -1.3519870042800903,
"logps/chosen": -58.672515869140625,
"logps/rejected": -232.4235382080078,
"loss": 0.0091,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.8344595432281494,
"rewards/margins": 13.702409744262695,
"rewards/rejected": -15.53686809539795,
"step": 125
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.18359375,
"learning_rate": 1.4806141921473063e-05,
"logits/chosen": -1.711216926574707,
"logits/rejected": -1.331209421157837,
"logps/chosen": -58.75749969482422,
"logps/rejected": -241.5647430419922,
"loss": 0.0058,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8602139949798584,
"rewards/margins": 14.427963256835938,
"rewards/rejected": -16.288179397583008,
"step": 130
},
{
"epoch": 0.5769230769230769,
"grad_norm": 0.0157470703125,
"learning_rate": 1.4746583264985202e-05,
"logits/chosen": -1.73836350440979,
"logits/rejected": -1.358798623085022,
"logps/chosen": -57.656578063964844,
"logps/rejected": -245.814697265625,
"loss": 0.0054,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.727595329284668,
"rewards/margins": 14.943672180175781,
"rewards/rejected": -16.671268463134766,
"step": 135
},
{
"epoch": 0.5982905982905983,
"grad_norm": 0.017578125,
"learning_rate": 1.468417093868888e-05,
"logits/chosen": -1.7839100360870361,
"logits/rejected": -1.4424632787704468,
"logps/chosen": -55.03651809692383,
"logps/rejected": -240.14572143554688,
"loss": 0.0047,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.5889627933502197,
"rewards/margins": 14.578252792358398,
"rewards/rejected": -16.16721534729004,
"step": 140
},
{
"epoch": 0.6196581196581197,
"grad_norm": 0.06884765625,
"learning_rate": 1.4618939555515721e-05,
"logits/chosen": -1.7428079843521118,
"logits/rejected": -1.4061057567596436,
"logps/chosen": -59.45949172973633,
"logps/rejected": -239.75048828125,
"loss": 0.0028,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9328705072402954,
"rewards/margins": 14.28913402557373,
"rewards/rejected": -16.222003936767578,
"step": 145
},
{
"epoch": 0.6410256410256411,
"grad_norm": 0.00933837890625,
"learning_rate": 1.455092529180363e-05,
"logits/chosen": -1.7827781438827515,
"logits/rejected": -1.4337228536605835,
"logps/chosen": -57.58295440673828,
"logps/rejected": -243.2423553466797,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.8888952732086182,
"rewards/margins": 14.729533195495605,
"rewards/rejected": -16.618427276611328,
"step": 150
},
{
"epoch": 0.6623931623931624,
"grad_norm": 1.1171875,
"learning_rate": 1.4480165867233946e-05,
"logits/chosen": -1.7574710845947266,
"logits/rejected": -1.4313023090362549,
"logps/chosen": -61.97917938232422,
"logps/rejected": -249.78890991210938,
"loss": 0.0062,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.226684093475342,
"rewards/margins": 14.859643936157227,
"rewards/rejected": -17.086326599121094,
"step": 155
},
{
"epoch": 0.6837606837606838,
"grad_norm": 2.59375,
"learning_rate": 1.440670052391267e-05,
"logits/chosen": -1.776049017906189,
"logits/rejected": -1.4136337041854858,
"logps/chosen": -58.725502014160156,
"logps/rejected": -239.2755126953125,
"loss": 0.0046,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0192863941192627,
"rewards/margins": 14.30653190612793,
"rewards/rejected": -16.32581901550293,
"step": 160
},
{
"epoch": 0.7051282051282052,
"grad_norm": 1.7734375,
"learning_rate": 1.4330570004607398e-05,
"logits/chosen": -1.8287827968597412,
"logits/rejected": -1.4543850421905518,
"logps/chosen": -57.99534225463867,
"logps/rejected": -249.4701690673828,
"loss": 0.0035,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.8924716711044312,
"rewards/margins": 15.258901596069336,
"rewards/rejected": -17.15137481689453,
"step": 165
},
{
"epoch": 0.7264957264957265,
"grad_norm": 1.359375,
"learning_rate": 1.4251816530151986e-05,
"logits/chosen": -1.7740707397460938,
"logits/rejected": -1.4325586557388306,
"logps/chosen": -60.86432647705078,
"logps/rejected": -249.397216796875,
"loss": 0.0032,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.059633255004883,
"rewards/margins": 15.068583488464355,
"rewards/rejected": -17.128215789794922,
"step": 170
},
{
"epoch": 0.7478632478632479,
"grad_norm": 0.060546875,
"learning_rate": 1.4170483776031526e-05,
"logits/chosen": -1.7101682424545288,
"logits/rejected": -1.3118056058883667,
"logps/chosen": -60.89410400390625,
"logps/rejected": -252.2559814453125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1261119842529297,
"rewards/margins": 15.250958442687988,
"rewards/rejected": -17.377071380615234,
"step": 175
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.031494140625,
"learning_rate": 1.4086616848160574e-05,
"logits/chosen": -1.6894880533218384,
"logits/rejected": -1.2855875492095947,
"logps/chosen": -66.87565612792969,
"logps/rejected": -245.99557495117188,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7102391719818115,
"rewards/margins": 14.112167358398438,
"rewards/rejected": -16.822406768798828,
"step": 180
},
{
"epoch": 0.7905982905982906,
"grad_norm": 0.09912109375,
"learning_rate": 1.4000262257868096e-05,
"logits/chosen": -1.6548315286636353,
"logits/rejected": -1.192333459854126,
"logps/chosen": -64.77526092529297,
"logps/rejected": -249.92196655273438,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.546926975250244,
"rewards/margins": 14.788670539855957,
"rewards/rejected": -17.33559799194336,
"step": 185
},
{
"epoch": 0.811965811965812,
"grad_norm": 1.1953125,
"learning_rate": 1.3911467896102994e-05,
"logits/chosen": -1.6022329330444336,
"logits/rejected": -1.1850754022598267,
"logps/chosen": -60.396514892578125,
"logps/rejected": -247.5742645263672,
"loss": 0.0054,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.1046481132507324,
"rewards/margins": 15.000350952148438,
"rewards/rejected": -17.104999542236328,
"step": 190
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.10400390625,
"learning_rate": 1.3820283006874503e-05,
"logits/chosen": -1.563820719718933,
"logits/rejected": -1.137289047241211,
"logps/chosen": -65.57173156738281,
"logps/rejected": -252.38955688476562,
"loss": 0.0085,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.3488571643829346,
"rewards/margins": 15.110745429992676,
"rewards/rejected": -17.45960235595703,
"step": 195
},
{
"epoch": 0.8547008547008547,
"grad_norm": 0.1279296875,
"learning_rate": 1.372675815994221e-05,
"logits/chosen": -1.4945417642593384,
"logits/rejected": -1.038334846496582,
"logps/chosen": -54.2162971496582,
"logps/rejected": -254.0222625732422,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.6657600402832031,
"rewards/margins": 16.079242706298828,
"rewards/rejected": -17.745004653930664,
"step": 200
},
{
"epoch": 0.8760683760683761,
"grad_norm": 14.625,
"learning_rate": 1.3630945222770829e-05,
"logits/chosen": -1.5167819261550903,
"logits/rejected": -1.0430529117584229,
"logps/chosen": -60.5411262512207,
"logps/rejected": -259.63555908203125,
"loss": 0.0192,
"rewards/accuracies": 0.9916666746139526,
"rewards/chosen": -2.0362796783447266,
"rewards/margins": 16.097753524780273,
"rewards/rejected": -18.134033203125,
"step": 205
},
{
"epoch": 0.8974358974358975,
"grad_norm": 0.2119140625,
"learning_rate": 1.3532897331765301e-05,
"logits/chosen": -1.5572597980499268,
"logits/rejected": -1.053264856338501,
"logps/chosen": -59.34284591674805,
"logps/rejected": -260.30206298828125,
"loss": 0.0151,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -1.9360477924346924,
"rewards/margins": 16.303897857666016,
"rewards/rejected": -18.239948272705078,
"step": 210
},
{
"epoch": 0.9188034188034188,
"grad_norm": 0.0849609375,
"learning_rate": 1.3432668862802134e-05,
"logits/chosen": -1.4950945377349854,
"logits/rejected": -1.014696478843689,
"logps/chosen": -57.9535026550293,
"logps/rejected": -256.272705078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.7511317729949951,
"rewards/margins": 15.973749160766602,
"rewards/rejected": -17.724878311157227,
"step": 215
},
{
"epoch": 0.9401709401709402,
"grad_norm": 0.08642578125,
"learning_rate": 1.3330315401073371e-05,
"logits/chosen": -1.5073899030685425,
"logits/rejected": -1.0442817211151123,
"logps/chosen": -59.66225051879883,
"logps/rejected": -249.38534545898438,
"loss": 0.002,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.872859001159668,
"rewards/margins": 15.302103042602539,
"rewards/rejected": -17.17496109008789,
"step": 220
},
{
"epoch": 0.9615384615384616,
"grad_norm": 2.625,
"learning_rate": 1.3225893710259887e-05,
"logits/chosen": -1.3290693759918213,
"logits/rejected": -0.8046108484268188,
"logps/chosen": -60.98704147338867,
"logps/rejected": -252.48495483398438,
"loss": 0.0065,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.179378032684326,
"rewards/margins": 15.35717487335205,
"rewards/rejected": -17.53655433654785,
"step": 225
},
{
"epoch": 0.9829059829059829,
"grad_norm": 1.359375,
"learning_rate": 1.3119461701051105e-05,
"logits/chosen": -1.4031749963760376,
"logits/rejected": -0.8651553988456726,
"logps/chosen": -57.99671173095703,
"logps/rejected": -251.2525634765625,
"loss": 0.0006,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.0349326133728027,
"rewards/margins": 15.621841430664062,
"rewards/rejected": -17.656774520874023,
"step": 230
},
{
"epoch": 0.9957264957264957,
"eval_logits/chosen": -1.4502625465393066,
"eval_logits/rejected": -0.9043333530426025,
"eval_logps/chosen": -61.05683517456055,
"eval_logps/rejected": -259.54730224609375,
"eval_loss": 0.0003871396475005895,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -2.1613101959228516,
"eval_rewards/margins": 15.991730690002441,
"eval_rewards/rejected": -18.15304183959961,
"eval_runtime": 9.8475,
"eval_samples_per_second": 20.31,
"eval_steps_per_second": 20.31,
"step": 233
},
{
"epoch": 1.0042735042735043,
"grad_norm": 0.0341796875,
"learning_rate": 1.3011078399028605e-05,
"logits/chosen": -1.430901288986206,
"logits/rejected": -0.9248638153076172,
"logps/chosen": -60.4477424621582,
"logps/rejected": -265.5071716308594,
"loss": 0.0036,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.0871260166168213,
"rewards/margins": 16.610048294067383,
"rewards/rejected": -18.697174072265625,
"step": 235
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.058349609375,
"learning_rate": 1.2900803911931431e-05,
"logits/chosen": -1.4457504749298096,
"logits/rejected": -0.9034906625747681,
"logps/chosen": -59.68601608276367,
"logps/rejected": -263.2565612792969,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.9931907653808594,
"rewards/margins": 16.479150772094727,
"rewards/rejected": -18.472341537475586,
"step": 240
},
{
"epoch": 1.047008547008547,
"grad_norm": 0.037841796875,
"learning_rate": 1.2788699396321252e-05,
"logits/chosen": -1.39047110080719,
"logits/rejected": -0.8637332916259766,
"logps/chosen": -57.831581115722656,
"logps/rejected": -255.6790771484375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -1.925370454788208,
"rewards/margins": 16.076345443725586,
"rewards/rejected": -18.00171661376953,
"step": 245
},
{
"epoch": 1.0683760683760684,
"grad_norm": 0.0002689361572265625,
"learning_rate": 1.2674827023665853e-05,
"logits/chosen": -1.4293615818023682,
"logits/rejected": -0.9234841465950012,
"logps/chosen": -61.868263244628906,
"logps/rejected": -270.99920654296875,
"loss": 0.0008,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.230020046234131,
"rewards/margins": 17.106487274169922,
"rewards/rejected": -19.336505889892578,
"step": 250
},
{
"epoch": 1.0897435897435896,
"grad_norm": 0.08154296875,
"learning_rate": 1.255924994585978e-05,
"logits/chosen": -1.4079844951629639,
"logits/rejected": -0.8683494329452515,
"logps/chosen": -63.99333953857422,
"logps/rejected": -266.74462890625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.376952886581421,
"rewards/margins": 16.51467514038086,
"rewards/rejected": -18.89162826538086,
"step": 255
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.00147247314453125,
"learning_rate": 1.2442032260201255e-05,
"logits/chosen": -1.400887131690979,
"logits/rejected": -0.890865683555603,
"logps/chosen": -62.627723693847656,
"logps/rejected": -263.9581604003906,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.308192729949951,
"rewards/margins": 16.26297950744629,
"rewards/rejected": -18.571170806884766,
"step": 260
},
{
"epoch": 1.1324786324786325,
"grad_norm": 0.1982421875,
"learning_rate": 1.2323238973844796e-05,
"logits/chosen": -1.438955545425415,
"logits/rejected": -0.9066799283027649,
"logps/chosen": -63.32421875,
"logps/rejected": -273.82049560546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.295706272125244,
"rewards/margins": 17.04709243774414,
"rewards/rejected": -19.342798233032227,
"step": 265
},
{
"epoch": 1.1538461538461537,
"grad_norm": 0.011962890625,
"learning_rate": 1.2202935967749212e-05,
"logits/chosen": -1.3738555908203125,
"logits/rejected": -0.8493305444717407,
"logps/chosen": -67.02043151855469,
"logps/rejected": -268.3296813964844,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.584259510040283,
"rewards/margins": 16.276723861694336,
"rewards/rejected": -18.86098289489746,
"step": 270
},
{
"epoch": 1.1752136752136753,
"grad_norm": 1.765625,
"learning_rate": 1.2081189960141038e-05,
"logits/chosen": -1.424109697341919,
"logits/rejected": -0.8860370516777039,
"logps/chosen": -61.854644775390625,
"logps/rejected": -267.197021484375,
"loss": 0.0013,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1506810188293457,
"rewards/margins": 16.72945213317871,
"rewards/rejected": -18.880136489868164,
"step": 275
},
{
"epoch": 1.1965811965811965,
"grad_norm": 0.185546875,
"learning_rate": 1.1958068469513604e-05,
"logits/chosen": -1.4285290241241455,
"logits/rejected": -0.9055356979370117,
"logps/chosen": -65.09664916992188,
"logps/rejected": -277.3038635253906,
"loss": 0.0015,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.459376096725464,
"rewards/margins": 17.357646942138672,
"rewards/rejected": -19.817026138305664,
"step": 280
},
{
"epoch": 1.217948717948718,
"grad_norm": 0.0059814453125,
"learning_rate": 1.1833639777182316e-05,
"logits/chosen": -1.306983232498169,
"logits/rejected": -0.7688174247741699,
"logps/chosen": -59.734840393066406,
"logps/rejected": -272.2923889160156,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.1238536834716797,
"rewards/margins": 17.397279739379883,
"rewards/rejected": -19.52113151550293,
"step": 285
},
{
"epoch": 1.2393162393162394,
"grad_norm": 0.004547119140625,
"learning_rate": 1.170797288941685e-05,
"logits/chosen": -1.2420094013214111,
"logits/rejected": -0.7252348065376282,
"logps/chosen": -66.8678207397461,
"logps/rejected": -281.2005920410156,
"loss": 0.0007,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.6819348335266113,
"rewards/margins": 17.403133392333984,
"rewards/rejected": -20.085067749023438,
"step": 290
},
{
"epoch": 1.2606837606837606,
"grad_norm": 0.8828125,
"learning_rate": 1.1581137499171342e-05,
"logits/chosen": -1.2711966037750244,
"logits/rejected": -0.7393882870674133,
"logps/chosen": -67.3682861328125,
"logps/rejected": -272.0533142089844,
"loss": 0.0012,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7543585300445557,
"rewards/margins": 16.66292953491211,
"rewards/rejected": -19.41728973388672,
"step": 295
},
{
"epoch": 1.282051282051282,
"grad_norm": 0.01080322265625,
"learning_rate": 1.145320394743371e-05,
"logits/chosen": -1.2599390745162964,
"logits/rejected": -0.7466105222702026,
"logps/chosen": -65.01288604736328,
"logps/rejected": -266.981689453125,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.5523085594177246,
"rewards/margins": 16.560453414916992,
"rewards/rejected": -19.112764358520508,
"step": 300
},
{
"epoch": 1.3034188034188035,
"grad_norm": 0.005706787109375,
"learning_rate": 1.1324243184215622e-05,
"logits/chosen": -1.268808364868164,
"logits/rejected": -0.7630107998847961,
"logps/chosen": -67.40531921386719,
"logps/rejected": -282.67205810546875,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.892889976501465,
"rewards/margins": 17.570592880249023,
"rewards/rejected": -20.463483810424805,
"step": 305
},
{
"epoch": 1.3247863247863247,
"grad_norm": 0.00238037109375,
"learning_rate": 1.1194326729204686e-05,
"logits/chosen": -1.2974001169204712,
"logits/rejected": -0.7796735167503357,
"logps/chosen": -67.35159301757812,
"logps/rejected": -279.50341796875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.8483831882476807,
"rewards/margins": 17.4374942779541,
"rewards/rejected": -20.285879135131836,
"step": 310
},
{
"epoch": 1.3461538461538463,
"grad_norm": 0.021484375,
"learning_rate": 1.1063526632100717e-05,
"logits/chosen": -1.2823902368545532,
"logits/rejected": -0.8006687164306641,
"logps/chosen": -71.19620513916016,
"logps/rejected": -273.39569091796875,
"loss": 0.0031,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.098231077194214,
"rewards/margins": 16.550390243530273,
"rewards/rejected": -19.648624420166016,
"step": 315
},
{
"epoch": 1.3675213675213675,
"grad_norm": 0.022216796875,
"learning_rate": 1.0931915432658055e-05,
"logits/chosen": -1.2740647792816162,
"logits/rejected": -0.7717633843421936,
"logps/chosen": -68.88563537597656,
"logps/rejected": -277.2671813964844,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.923701763153076,
"rewards/margins": 17.007097244262695,
"rewards/rejected": -19.930797576904297,
"step": 320
},
{
"epoch": 1.3888888888888888,
"grad_norm": 0.048095703125,
"learning_rate": 1.0799566120456133e-05,
"logits/chosen": -1.2586907148361206,
"logits/rejected": -0.7510126829147339,
"logps/chosen": -71.87751770019531,
"logps/rejected": -275.9109191894531,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1944289207458496,
"rewards/margins": 16.62671661376953,
"rewards/rejected": -19.82114601135254,
"step": 325
},
{
"epoch": 1.4102564102564101,
"grad_norm": 0.0052490234375,
"learning_rate": 1.066655209442054e-05,
"logits/chosen": -1.280989646911621,
"logits/rejected": -0.772638201713562,
"logps/chosen": -71.73796081542969,
"logps/rejected": -279.8167419433594,
"loss": 0.0021,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.172919511795044,
"rewards/margins": 16.89200210571289,
"rewards/rejected": -20.064922332763672,
"step": 330
},
{
"epoch": 1.4316239316239316,
"grad_norm": 0.007720947265625,
"learning_rate": 1.0532947122117101e-05,
"logits/chosen": -1.27340567111969,
"logits/rejected": -0.7604951858520508,
"logps/chosen": -69.45366668701172,
"logps/rejected": -278.88409423828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.9620535373687744,
"rewards/margins": 17.169063568115234,
"rewards/rejected": -20.13111686706543,
"step": 335
},
{
"epoch": 1.452991452991453,
"grad_norm": 0.046630859375,
"learning_rate": 1.0398825298841499e-05,
"logits/chosen": -1.2809860706329346,
"logits/rejected": -0.838448166847229,
"logps/chosen": -72.61773681640625,
"logps/rejected": -281.2303466796875,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1478874683380127,
"rewards/margins": 17.092044830322266,
"rewards/rejected": -20.239933013916016,
"step": 340
},
{
"epoch": 1.4743589743589745,
"grad_norm": 0.01025390625,
"learning_rate": 1.0264261006527144e-05,
"logits/chosen": -1.301695704460144,
"logits/rejected": -0.8668543100357056,
"logps/chosen": -69.15229797363281,
"logps/rejected": -270.77130126953125,
"loss": 0.0035,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -2.9070522785186768,
"rewards/margins": 16.549983978271484,
"rewards/rejected": -19.457035064697266,
"step": 345
},
{
"epoch": 1.4957264957264957,
"grad_norm": 0.5625,
"learning_rate": 1.0129328872494075e-05,
"logits/chosen": -1.385507345199585,
"logits/rejected": -0.9304911494255066,
"logps/chosen": -70.08064270019531,
"logps/rejected": -279.90045166015625,
"loss": 0.003,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.0408215522766113,
"rewards/margins": 17.223588943481445,
"rewards/rejected": -20.2644100189209,
"step": 350
},
{
"epoch": 1.517094017094017,
"grad_norm": 0.12890625,
"learning_rate": 9.994103728061786e-06,
"logits/chosen": -1.3539297580718994,
"logits/rejected": -0.8995206952095032,
"logps/chosen": -71.8431625366211,
"logps/rejected": -278.0020446777344,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1649577617645264,
"rewards/margins": 16.83130645751953,
"rewards/rejected": -19.99626350402832,
"step": 355
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.006134033203125,
"learning_rate": 9.858660567048902e-06,
"logits/chosen": -1.3628873825073242,
"logits/rejected": -0.8617550730705261,
"logps/chosen": -73.44374084472656,
"logps/rejected": -291.49237060546875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.3345115184783936,
"rewards/margins": 17.942880630493164,
"rewards/rejected": -21.277393341064453,
"step": 360
},
{
"epoch": 1.5598290598290598,
"grad_norm": 0.036376953125,
"learning_rate": 9.72307450418274e-06,
"logits/chosen": -1.3687984943389893,
"logits/rejected": -0.8952552676200867,
"logps/chosen": -68.8270492553711,
"logps/rejected": -276.6328125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.877126932144165,
"rewards/margins": 17.08051872253418,
"rewards/rejected": -19.957645416259766,
"step": 365
},
{
"epoch": 1.5811965811965814,
"grad_norm": 0.400390625,
"learning_rate": 9.587420733441835e-06,
"logits/chosen": -1.3641754388809204,
"logits/rejected": -0.9082571864128113,
"logps/chosen": -67.42304992675781,
"logps/rejected": -281.8648986816406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7612674236297607,
"rewards/margins": 17.61087989807129,
"rewards/rejected": -20.372146606445312,
"step": 370
},
{
"epoch": 1.6025641025641026,
"grad_norm": 0.000423431396484375,
"learning_rate": 9.45177448635447e-06,
"logits/chosen": -1.3942601680755615,
"logits/rejected": -0.8580430746078491,
"logps/chosen": -67.60791015625,
"logps/rejected": -284.3832092285156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.805607795715332,
"rewards/margins": 18.005619049072266,
"rewards/rejected": -20.811227798461914,
"step": 375
},
{
"epoch": 1.623931623931624,
"grad_norm": 0.0255126953125,
"learning_rate": 9.316210990276434e-06,
"logits/chosen": -1.3189040422439575,
"logits/rejected": -0.8662185668945312,
"logps/chosen": -67.1756362915039,
"logps/rejected": -272.69500732421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7223994731903076,
"rewards/margins": 16.855377197265625,
"rewards/rejected": -19.577777862548828,
"step": 380
},
{
"epoch": 1.6452991452991452,
"grad_norm": 0.04638671875,
"learning_rate": 9.18080542667105e-06,
"logits/chosen": -1.3573819398880005,
"logits/rejected": -0.8372514843940735,
"logps/chosen": -68.33662414550781,
"logps/rejected": -291.3061218261719,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.794102191925049,
"rewards/margins": 18.328411102294922,
"rewards/rejected": -21.122512817382812,
"step": 385
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.000885009765625,
"learning_rate": 9.045632889414686e-06,
"logits/chosen": -1.345085859298706,
"logits/rejected": -0.8467508554458618,
"logps/chosen": -66.54302978515625,
"logps/rejected": -283.5255126953125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.698514461517334,
"rewards/margins": 17.84862518310547,
"rewards/rejected": -20.54714012145996,
"step": 390
},
{
"epoch": 1.688034188034188,
"grad_norm": 0.02734375,
"learning_rate": 8.910768343150828e-06,
"logits/chosen": -1.3259168863296509,
"logits/rejected": -0.855597198009491,
"logps/chosen": -69.09947204589844,
"logps/rejected": -284.14801025390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.0127596855163574,
"rewards/margins": 17.691822052001953,
"rewards/rejected": -20.704580307006836,
"step": 395
},
{
"epoch": 1.7094017094017095,
"grad_norm": 0.0032806396484375,
"learning_rate": 8.77628658171581e-06,
"logits/chosen": -1.3521082401275635,
"logits/rejected": -0.896456241607666,
"logps/chosen": -67.49749755859375,
"logps/rejected": -278.388916015625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.764543056488037,
"rewards/margins": 17.227275848388672,
"rewards/rejected": -19.991817474365234,
"step": 400
},
{
"epoch": 1.7307692307692308,
"grad_norm": 0.0830078125,
"learning_rate": 8.642262186659298e-06,
"logits/chosen": -1.311095952987671,
"logits/rejected": -0.8420788049697876,
"logps/chosen": -68.78193664550781,
"logps/rejected": -280.7834777832031,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.862732172012329,
"rewards/margins": 17.331756591796875,
"rewards/rejected": -20.194486618041992,
"step": 405
},
{
"epoch": 1.7521367521367521,
"grad_norm": 0.00640869140625,
"learning_rate": 8.508769485882487e-06,
"logits/chosen": -1.3232362270355225,
"logits/rejected": -0.8540644645690918,
"logps/chosen": -74.96504974365234,
"logps/rejected": -283.9762268066406,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.3426380157470703,
"rewards/margins": 17.237430572509766,
"rewards/rejected": -20.580068588256836,
"step": 410
},
{
"epoch": 1.7735042735042734,
"grad_norm": 0.91796875,
"learning_rate": 8.375882512416969e-06,
"logits/chosen": -1.2893245220184326,
"logits/rejected": -0.7882084846496582,
"logps/chosen": -72.9918212890625,
"logps/rejected": -286.07623291015625,
"loss": 0.0005,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.248950481414795,
"rewards/margins": 17.547632217407227,
"rewards/rejected": -20.796581268310547,
"step": 415
},
{
"epoch": 1.7948717948717947,
"grad_norm": 0.019287109375,
"learning_rate": 8.243674963367137e-06,
"logits/chosen": -1.3166277408599854,
"logits/rejected": -0.8026930093765259,
"logps/chosen": -76.18501281738281,
"logps/rejected": -281.90447998046875,
"loss": 0.0032,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.600315570831299,
"rewards/margins": 16.638935089111328,
"rewards/rejected": -20.239248275756836,
"step": 420
},
{
"epoch": 1.8162393162393162,
"grad_norm": 0.0012359619140625,
"learning_rate": 8.11222015903888e-06,
"logits/chosen": -1.3423035144805908,
"logits/rejected": -0.8135835528373718,
"logps/chosen": -78.0848159790039,
"logps/rejected": -287.83135986328125,
"loss": 0.0047,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.862813949584961,
"rewards/margins": 17.16622543334961,
"rewards/rejected": -21.029037475585938,
"step": 425
},
{
"epoch": 1.8376068376068377,
"grad_norm": 0.037353515625,
"learning_rate": 7.981591002277265e-06,
"logits/chosen": -1.3140472173690796,
"logits/rejected": -0.8106688261032104,
"logps/chosen": -77.36860656738281,
"logps/rejected": -281.99664306640625,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.733330488204956,
"rewards/margins": 16.830041885375977,
"rewards/rejected": -20.563371658325195,
"step": 430
},
{
"epoch": 1.858974358974359,
"grad_norm": 0.03515625,
"learning_rate": 7.851859938035712e-06,
"logits/chosen": -1.304713487625122,
"logits/rejected": -0.7914744019508362,
"logps/chosen": -78.41984558105469,
"logps/rejected": -291.4969177246094,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7893600463867188,
"rewards/margins": 17.505069732666016,
"rewards/rejected": -21.294429779052734,
"step": 435
},
{
"epoch": 1.8803418803418803,
"grad_norm": 0.03271484375,
"learning_rate": 7.723098913199118e-06,
"logits/chosen": -1.3396222591400146,
"logits/rejected": -0.834884524345398,
"logps/chosen": -75.61878967285156,
"logps/rejected": -278.4013366699219,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.660414218902588,
"rewards/margins": 16.659198760986328,
"rewards/rejected": -20.31961441040039,
"step": 440
},
{
"epoch": 1.9017094017094016,
"grad_norm": 0.015625,
"learning_rate": 7.595379336683204e-06,
"logits/chosen": -1.3091070652008057,
"logits/rejected": -0.7569972276687622,
"logps/chosen": -72.38371276855469,
"logps/rejected": -287.745849609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.400171995162964,
"rewards/margins": 17.77309226989746,
"rewards/rejected": -21.17326545715332,
"step": 445
},
{
"epoch": 1.9230769230769231,
"grad_norm": 0.004638671875,
"learning_rate": 7.468772039832218e-06,
"logits/chosen": -1.2781813144683838,
"logits/rejected": -0.7406023740768433,
"logps/chosen": -68.62843322753906,
"logps/rejected": -279.81103515625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.0305323600769043,
"rewards/margins": 17.346649169921875,
"rewards/rejected": -20.37718391418457,
"step": 450
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.0026092529296875,
"learning_rate": 7.3433472371369404e-06,
"logits/chosen": -1.349867820739746,
"logits/rejected": -0.8363698720932007,
"logps/chosen": -74.56883239746094,
"logps/rejected": -285.27227783203125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.506099224090576,
"rewards/margins": 17.23251724243164,
"rewards/rejected": -20.738616943359375,
"step": 455
},
{
"epoch": 1.965811965811966,
"grad_norm": 0.00165557861328125,
"learning_rate": 7.219174487294784e-06,
"logits/chosen": -1.3465303182601929,
"logits/rejected": -0.8472278714179993,
"logps/chosen": -71.95396423339844,
"logps/rejected": -288.6927795410156,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.223040819168091,
"rewards/margins": 17.83417510986328,
"rewards/rejected": -21.05721664428711,
"step": 460
},
{
"epoch": 1.9871794871794872,
"grad_norm": 0.003509521484375,
"learning_rate": 7.0963226546336e-06,
"logits/chosen": -1.3585379123687744,
"logits/rejected": -0.8536975979804993,
"logps/chosen": -72.73930358886719,
"logps/rejected": -283.37579345703125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.333233594894409,
"rewards/margins": 17.287670135498047,
"rewards/rejected": -20.62090492248535,
"step": 465
},
{
"epoch": 1.9914529914529915,
"eval_logits/chosen": -1.3740124702453613,
"eval_logits/rejected": -0.8455994129180908,
"eval_logps/chosen": -73.04570007324219,
"eval_logps/rejected": -284.90460205078125,
"eval_loss": 0.00021937819838058203,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -3.360196590423584,
"eval_rewards/margins": 17.328575134277344,
"eval_rewards/rejected": -20.688772201538086,
"eval_runtime": 9.5669,
"eval_samples_per_second": 20.905,
"eval_steps_per_second": 20.905,
"step": 466
},
{
"epoch": 2.0085470085470085,
"grad_norm": 0.038818359375,
"learning_rate": 6.974859870920561e-06,
"logits/chosen": -1.2795295715332031,
"logits/rejected": -0.8111523389816284,
"logps/chosen": -75.71898651123047,
"logps/rejected": -279.89599609375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6148743629455566,
"rewards/margins": 16.733488082885742,
"rewards/rejected": -20.348360061645508,
"step": 470
},
{
"epoch": 2.02991452991453,
"grad_norm": 0.006195068359375,
"learning_rate": 6.8548534975773135e-06,
"logits/chosen": -1.3317922353744507,
"logits/rejected": -0.8281890153884888,
"logps/chosen": -75.41677856445312,
"logps/rejected": -286.36761474609375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4711709022521973,
"rewards/margins": 17.307971954345703,
"rewards/rejected": -20.779144287109375,
"step": 475
},
{
"epoch": 2.051282051282051,
"grad_norm": 0.341796875,
"learning_rate": 6.736370088322359e-06,
"logits/chosen": -1.3174855709075928,
"logits/rejected": -0.7978845238685608,
"logps/chosen": -74.10897064208984,
"logps/rejected": -283.9443664550781,
"loss": 0.0004,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4749629497528076,
"rewards/margins": 17.323604583740234,
"rewards/rejected": -20.798566818237305,
"step": 480
},
{
"epoch": 2.072649572649573,
"grad_norm": 0.007293701171875,
"learning_rate": 6.619475352261356e-06,
"logits/chosen": -1.3000952005386353,
"logits/rejected": -0.8089855909347534,
"logps/chosen": -78.87946319580078,
"logps/rejected": -287.0957336425781,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6867222785949707,
"rewards/margins": 17.182767868041992,
"rewards/rejected": -20.869489669799805,
"step": 485
},
{
"epoch": 2.094017094017094,
"grad_norm": 0.002105712890625,
"learning_rate": 6.504234117445857e-06,
"logits/chosen": -1.3139859437942505,
"logits/rejected": -0.8154487609863281,
"logps/chosen": -74.31788635253906,
"logps/rejected": -284.235107421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5273959636688232,
"rewards/margins": 17.246944427490234,
"rewards/rejected": -20.774341583251953,
"step": 490
},
{
"epoch": 2.1153846153846154,
"grad_norm": 0.0240478515625,
"learning_rate": 6.39071029492065e-06,
"logits/chosen": -1.2831732034683228,
"logits/rejected": -0.7532753348350525,
"logps/chosen": -73.73322296142578,
"logps/rejected": -282.9715270996094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.508289337158203,
"rewards/margins": 17.217966079711914,
"rewards/rejected": -20.726253509521484,
"step": 495
},
{
"epoch": 2.1367521367521367,
"grad_norm": 0.162109375,
"learning_rate": 6.2789668432796535e-06,
"logits/chosen": -1.2966052293777466,
"logits/rejected": -0.8182178735733032,
"logps/chosen": -75.21055603027344,
"logps/rejected": -284.99566650390625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6024773120880127,
"rewards/margins": 17.332172393798828,
"rewards/rejected": -20.934650421142578,
"step": 500
},
{
"epoch": 2.1367521367521367,
"eval_logits/chosen": -1.3704440593719482,
"eval_logits/rejected": -0.8410933017730713,
"eval_logps/chosen": -73.29509735107422,
"eval_logps/rejected": -285.1749572753906,
"eval_loss": 0.00022948597325012088,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -3.385136365890503,
"eval_rewards/margins": 17.330673217773438,
"eval_rewards/rejected": -20.715810775756836,
"eval_runtime": 9.5382,
"eval_samples_per_second": 20.968,
"eval_steps_per_second": 20.968,
"step": 500
},
{
"epoch": 2.158119658119658,
"grad_norm": 0.009033203125,
"learning_rate": 6.16906573375004e-06,
"logits/chosen": -1.3252205848693848,
"logits/rejected": -0.8390571475028992,
"logps/chosen": -74.536376953125,
"logps/rejected": -282.44195556640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4960930347442627,
"rewards/margins": 17.042308807373047,
"rewards/rejected": -20.538402557373047,
"step": 505
},
{
"epoch": 2.1794871794871793,
"grad_norm": 0.00933837890625,
"learning_rate": 6.061067915823923e-06,
"logits/chosen": -1.2685729265213013,
"logits/rejected": -0.7679704427719116,
"logps/chosen": -72.36498260498047,
"logps/rejected": -282.66351318359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.318829298019409,
"rewards/margins": 17.183773040771484,
"rewards/rejected": -20.50260353088379,
"step": 510
},
{
"epoch": 2.200854700854701,
"grad_norm": 0.0196533203125,
"learning_rate": 5.955033283456711e-06,
"logits/chosen": -1.2974504232406616,
"logits/rejected": -0.7774112820625305,
"logps/chosen": -78.15269470214844,
"logps/rejected": -293.7340393066406,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7473702430725098,
"rewards/margins": 17.710281372070312,
"rewards/rejected": -21.457651138305664,
"step": 515
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.0023040771484375,
"learning_rate": 5.8510206418507914e-06,
"logits/chosen": -1.3559068441390991,
"logits/rejected": -0.8591842651367188,
"logps/chosen": -77.19640350341797,
"logps/rejected": -300.4443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6723015308380127,
"rewards/margins": 18.35979461669922,
"rewards/rejected": -22.03209686279297,
"step": 520
},
{
"epoch": 2.2435897435897436,
"grad_norm": 0.01385498046875,
"learning_rate": 5.749087674843095e-06,
"logits/chosen": -1.2999016046524048,
"logits/rejected": -0.8241308927536011,
"logps/chosen": -70.76306915283203,
"logps/rejected": -283.5411376953125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.232194423675537,
"rewards/margins": 17.38874626159668,
"rewards/rejected": -20.620941162109375,
"step": 525
},
{
"epoch": 2.264957264957265,
"grad_norm": 0.0264892578125,
"learning_rate": 5.649290912914482e-06,
"logits/chosen": -1.3043696880340576,
"logits/rejected": -0.8295344114303589,
"logps/chosen": -79.15299224853516,
"logps/rejected": -298.5386962890625,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7782886028289795,
"rewards/margins": 17.993247985839844,
"rewards/rejected": -21.771535873413086,
"step": 530
},
{
"epoch": 2.286324786324786,
"grad_norm": 0.04931640625,
"learning_rate": 5.5516857018388144e-06,
"logits/chosen": -1.355273962020874,
"logits/rejected": -0.8746377229690552,
"logps/chosen": -74.15048217773438,
"logps/rejected": -281.2100524902344,
"loss": 0.0002,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4310333728790283,
"rewards/margins": 16.96903419494629,
"rewards/rejected": -20.400066375732422,
"step": 535
},
{
"epoch": 2.3076923076923075,
"grad_norm": 0.01397705078125,
"learning_rate": 5.456326171989005e-06,
"logits/chosen": -1.3123310804367065,
"logits/rejected": -0.840388298034668,
"logps/chosen": -71.68992614746094,
"logps/rejected": -300.37091064453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.202878475189209,
"rewards/margins": 18.644420623779297,
"rewards/rejected": -21.84729766845703,
"step": 540
},
{
"epoch": 2.3290598290598292,
"grad_norm": 0.0027618408203125,
"learning_rate": 5.363265208317156e-06,
"logits/chosen": -1.2788275480270386,
"logits/rejected": -0.8199743032455444,
"logps/chosen": -73.48957824707031,
"logps/rejected": -281.1923828125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.415186643600464,
"rewards/margins": 17.106266021728516,
"rewards/rejected": -20.52145004272461,
"step": 545
},
{
"epoch": 2.3504273504273505,
"grad_norm": 0.01055908203125,
"learning_rate": 5.272554421025347e-06,
"logits/chosen": -1.3188756704330444,
"logits/rejected": -0.8151782751083374,
"logps/chosen": -74.64764404296875,
"logps/rejected": -291.92108154296875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.5020947456359863,
"rewards/margins": 17.861783981323242,
"rewards/rejected": -21.36387825012207,
"step": 550
},
{
"epoch": 2.371794871794872,
"grad_norm": 0.05078125,
"learning_rate": 5.184244116943411e-06,
"logits/chosen": -1.3126680850982666,
"logits/rejected": -0.8074380159378052,
"logps/chosen": -73.89201354980469,
"logps/rejected": -285.258056640625,
"loss": 0.003,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.3727481365203857,
"rewards/margins": 17.37813377380371,
"rewards/rejected": -20.75088119506836,
"step": 555
},
{
"epoch": 2.393162393162393,
"grad_norm": 0.00604248046875,
"learning_rate": 5.098383271629512e-06,
"logits/chosen": -1.3314543962478638,
"logits/rejected": -0.8163145184516907,
"logps/chosen": -73.50102233886719,
"logps/rejected": -279.52532958984375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.414262294769287,
"rewards/margins": 16.943660736083984,
"rewards/rejected": -20.357921600341797,
"step": 560
},
{
"epoch": 2.4145299145299144,
"grad_norm": 0.00311279296875,
"learning_rate": 5.015019502209056e-06,
"logits/chosen": -1.3196806907653809,
"logits/rejected": -0.8105131387710571,
"logps/chosen": -72.18685913085938,
"logps/rejected": -275.95123291015625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.361246109008789,
"rewards/margins": 16.643529891967773,
"rewards/rejected": -20.004776000976562,
"step": 565
},
{
"epoch": 2.435897435897436,
"grad_norm": 0.000823974609375,
"learning_rate": 4.934199040966955e-06,
"logits/chosen": -1.3401740789413452,
"logits/rejected": -0.8449984788894653,
"logps/chosen": -73.99894714355469,
"logps/rejected": -279.31915283203125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.482757568359375,
"rewards/margins": 16.8636531829834,
"rewards/rejected": -20.346412658691406,
"step": 570
},
{
"epoch": 2.4572649572649574,
"grad_norm": 0.03271484375,
"learning_rate": 4.855966709707881e-06,
"logits/chosen": -1.308977484703064,
"logits/rejected": -0.8370776176452637,
"logps/chosen": -77.53469848632812,
"logps/rejected": -285.58367919921875,
"loss": 0.0029,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.7107481956481934,
"rewards/margins": 17.121841430664062,
"rewards/rejected": -20.832592010498047,
"step": 575
},
{
"epoch": 2.4786324786324787,
"grad_norm": 0.0299072265625,
"learning_rate": 4.780365894898799e-06,
"logits/chosen": -1.3271667957305908,
"logits/rejected": -0.8259018063545227,
"logps/chosen": -74.68269348144531,
"logps/rejected": -287.12078857421875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4662697315216064,
"rewards/margins": 17.42196273803711,
"rewards/rejected": -20.888233184814453,
"step": 580
},
{
"epoch": 2.5,
"grad_norm": 0.00921630859375,
"learning_rate": 4.7074385236074684e-06,
"logits/chosen": -1.3541457653045654,
"logits/rejected": -0.8319869041442871,
"logps/chosen": -78.34286499023438,
"logps/rejected": -294.329833984375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.6850643157958984,
"rewards/margins": 17.795787811279297,
"rewards/rejected": -21.480854034423828,
"step": 585
},
{
"epoch": 2.5213675213675213,
"grad_norm": 0.052001953125,
"learning_rate": 4.63722504025034e-06,
"logits/chosen": -1.3320066928863525,
"logits/rejected": -0.8415569067001343,
"logps/chosen": -71.98558044433594,
"logps/rejected": -284.99346923828125,
"loss": 0.0011,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.249178647994995,
"rewards/margins": 17.486406326293945,
"rewards/rejected": -20.735586166381836,
"step": 590
},
{
"epoch": 2.5427350427350426,
"grad_norm": 0.01025390625,
"learning_rate": 4.569764384162676e-06,
"logits/chosen": -1.3463108539581299,
"logits/rejected": -0.8353781700134277,
"logps/chosen": -68.05410766601562,
"logps/rejected": -284.91461181640625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.9681944847106934,
"rewards/margins": 17.779176712036133,
"rewards/rejected": -20.74736976623535,
"step": 595
},
{
"epoch": 2.564102564102564,
"grad_norm": 0.01422119140625,
"learning_rate": 4.50509396800341e-06,
"logits/chosen": -1.2894313335418701,
"logits/rejected": -0.7797183990478516,
"logps/chosen": -72.90419006347656,
"logps/rejected": -285.7866516113281,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.3224644660949707,
"rewards/margins": 17.589801788330078,
"rewards/rejected": -20.91226577758789,
"step": 600
},
{
"epoch": 2.5854700854700856,
"grad_norm": 0.0247802734375,
"learning_rate": 4.443249657006627e-06,
"logits/chosen": -1.2982523441314697,
"logits/rejected": -0.7844825983047485,
"logps/chosen": -70.4168472290039,
"logps/rejected": -290.3528747558594,
"loss": 0.0029,
"rewards/accuracies": 0.9958333969116211,
"rewards/chosen": -3.1497464179992676,
"rewards/margins": 18.14370346069336,
"rewards/rejected": -21.293447494506836,
"step": 605
},
{
"epoch": 2.606837606837607,
"grad_norm": 0.0164794921875,
"learning_rate": 4.384265749091266e-06,
"logits/chosen": -1.2762781381607056,
"logits/rejected": -0.7862453460693359,
"logps/chosen": -78.71661376953125,
"logps/rejected": -288.5256042480469,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7849228382110596,
"rewards/margins": 17.296167373657227,
"rewards/rejected": -21.081090927124023,
"step": 610
},
{
"epoch": 2.628205128205128,
"grad_norm": 0.00482177734375,
"learning_rate": 4.328174955840002e-06,
"logits/chosen": -1.2989494800567627,
"logits/rejected": -0.791740357875824,
"logps/chosen": -68.19273376464844,
"logps/rejected": -283.67010498046875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.0757412910461426,
"rewards/margins": 17.688796997070312,
"rewards/rejected": -20.764535903930664,
"step": 615
},
{
"epoch": 2.6495726495726495,
"grad_norm": 0.0026092529296875,
"learning_rate": 4.275008384357902e-06,
"logits/chosen": -1.3389320373535156,
"logits/rejected": -0.8387205004692078,
"logps/chosen": -72.53665924072266,
"logps/rejected": -283.5371398925781,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.2912750244140625,
"rewards/margins": 17.413835525512695,
"rewards/rejected": -20.705108642578125,
"step": 620
},
{
"epoch": 2.6709401709401708,
"grad_norm": 0.0028533935546875,
"learning_rate": 4.224795520020898e-06,
"logits/chosen": -1.2840917110443115,
"logits/rejected": -0.7634187936782837,
"logps/chosen": -75.63284301757812,
"logps/rejected": -282.2013244628906,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4395980834960938,
"rewards/margins": 16.99862289428711,
"rewards/rejected": -20.438220977783203,
"step": 625
},
{
"epoch": 2.6923076923076925,
"grad_norm": 0.01446533203125,
"learning_rate": 4.177564210123634e-06,
"logits/chosen": -1.32615327835083,
"logits/rejected": -0.8317953944206238,
"logps/chosen": -72.77333068847656,
"logps/rejected": -291.9281921386719,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.310988664627075,
"rewards/margins": 17.982282638549805,
"rewards/rejected": -21.293270111083984,
"step": 630
},
{
"epoch": 2.713675213675214,
"grad_norm": 0.00921630859375,
"learning_rate": 4.133340648435789e-06,
"logits/chosen": -1.3119795322418213,
"logits/rejected": -0.7846705913543701,
"logps/chosen": -74.24435424804688,
"logps/rejected": -289.03887939453125,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4849464893341064,
"rewards/margins": 17.662567138671875,
"rewards/rejected": -21.147512435913086,
"step": 635
},
{
"epoch": 2.735042735042735,
"grad_norm": 0.005828857421875,
"learning_rate": 4.092149360675402e-06,
"logits/chosen": -1.2881155014038086,
"logits/rejected": -0.8033782243728638,
"logps/chosen": -79.22930908203125,
"logps/rejected": -294.1476745605469,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7118358612060547,
"rewards/margins": 17.629985809326172,
"rewards/rejected": -21.341821670532227,
"step": 640
},
{
"epoch": 2.7564102564102564,
"grad_norm": 0.019775390625,
"learning_rate": 4.054013190907282e-06,
"logits/chosen": -1.2686903476715088,
"logits/rejected": -0.7805891633033752,
"logps/chosen": -70.22049713134766,
"logps/rejected": -284.9203796386719,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.1786301136016846,
"rewards/margins": 17.73773956298828,
"rewards/rejected": -20.916370391845703,
"step": 645
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.004241943359375,
"learning_rate": 4.018953288874035e-06,
"logits/chosen": -1.3032779693603516,
"logits/rejected": -0.8233755230903625,
"logps/chosen": -74.59503936767578,
"logps/rejected": -288.961669921875,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.4693355560302734,
"rewards/margins": 17.675161361694336,
"rewards/rejected": -21.14449691772461,
"step": 650
},
{
"epoch": 2.799145299145299,
"grad_norm": 0.0040283203125,
"learning_rate": 3.9869890982667385e-06,
"logits/chosen": -1.310773491859436,
"logits/rejected": -0.7524069547653198,
"logps/chosen": -72.80381774902344,
"logps/rejected": -290.0320739746094,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.345860004425049,
"rewards/margins": 17.688016891479492,
"rewards/rejected": -21.033876419067383,
"step": 655
},
{
"epoch": 2.8205128205128203,
"grad_norm": 0.0189208984375,
"learning_rate": 3.9581383459417625e-06,
"logits/chosen": -1.291512370109558,
"logits/rejected": -0.790591299533844,
"logps/chosen": -78.11724853515625,
"logps/rejected": -297.84283447265625,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.7806007862091064,
"rewards/margins": 18.05144500732422,
"rewards/rejected": -21.832046508789062,
"step": 660
},
{
"epoch": 2.841880341880342,
"grad_norm": 0.00494384765625,
"learning_rate": 3.932417032089722e-06,
"logits/chosen": -1.3292133808135986,
"logits/rejected": -0.8189595937728882,
"logps/chosen": -76.23522186279297,
"logps/rejected": -293.2325744628906,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.59516978263855,
"rewards/margins": 17.823274612426758,
"rewards/rejected": -21.418445587158203,
"step": 665
},
{
"epoch": 2.8632478632478633,
"grad_norm": 0.0029754638671875,
"learning_rate": 3.909839421362017e-06,
"logits/chosen": -1.2779964208602905,
"logits/rejected": -0.7794166803359985,
"logps/chosen": -74.2072982788086,
"logps/rejected": -289.36956787109375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.46783185005188,
"rewards/margins": 17.554330825805664,
"rewards/rejected": -21.02216148376465,
"step": 670
},
{
"epoch": 2.8846153846153846,
"grad_norm": 0.038818359375,
"learning_rate": 3.890418034959871e-06,
"logits/chosen": -1.2737759351730347,
"logits/rejected": -0.7384223937988281,
"logps/chosen": -73.76658630371094,
"logps/rejected": -286.06195068359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.443812847137451,
"rewards/margins": 17.49923324584961,
"rewards/rejected": -20.943044662475586,
"step": 675
},
{
"epoch": 2.905982905982906,
"grad_norm": 0.06884765625,
"learning_rate": 3.874163643690263e-06,
"logits/chosen": -1.255707025527954,
"logits/rejected": -0.7339369654655457,
"logps/chosen": -80.28028869628906,
"logps/rejected": -291.90179443359375,
"loss": 0.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.9126086235046387,
"rewards/margins": 17.456329345703125,
"rewards/rejected": -21.368938446044922,
"step": 680
},
{
"epoch": 2.9273504273504276,
"grad_norm": 0.04296875,
"learning_rate": 3.861085261992599e-06,
"logits/chosen": -1.306028127670288,
"logits/rejected": -0.8490394353866577,
"logps/chosen": -78.53582763671875,
"logps/rejected": -289.38330078125,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.830167770385742,
"rewards/margins": 17.217695236206055,
"rewards/rejected": -21.047863006591797,
"step": 685
},
{
"epoch": 2.948717948717949,
"grad_norm": 0.12255859375,
"learning_rate": 3.851190142939442e-06,
"logits/chosen": -1.2999136447906494,
"logits/rejected": -0.8062965273857117,
"logps/chosen": -72.80134582519531,
"logps/rejected": -288.02532958984375,
"loss": 0.0001,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.2750275135040283,
"rewards/margins": 17.672157287597656,
"rewards/rejected": -20.94718360900879,
"step": 690
},
{
"epoch": 2.97008547008547,
"grad_norm": 0.001434326171875,
"learning_rate": 3.844483774214069e-06,
"logits/chosen": -1.2856634855270386,
"logits/rejected": -0.7375695705413818,
"logps/chosen": -72.40345764160156,
"logps/rejected": -288.3551940917969,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.333986759185791,
"rewards/margins": 17.785888671875,
"rewards/rejected": -21.119874954223633,
"step": 695
},
{
"epoch": 2.9871794871794872,
"eval_logits/chosen": -1.3524901866912842,
"eval_logits/rejected": -0.8189607262611389,
"eval_logps/chosen": -73.21478271484375,
"eval_logps/rejected": -285.69122314453125,
"eval_loss": 0.00020370040147099644,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -3.377105474472046,
"eval_rewards/margins": 17.390329360961914,
"eval_rewards/rejected": -20.767436981201172,
"eval_runtime": 9.5425,
"eval_samples_per_second": 20.959,
"eval_steps_per_second": 20.959,
"step": 699
}
],
"logging_steps": 5,
"max_steps": 702,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}