DeepSeek_MATH_Self_Explore / trainer_state.json
hbin0701's picture
add model
b170434
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1235,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 14.532158655563514,
"learning_rate": 2.6954177897574124e-09,
"logits/chosen": 34.72175216674805,
"logits/rejected": 20.32191276550293,
"logps/chosen": -43.472816467285156,
"logps/rejected": -15.394071578979492,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 13.33639437555034,
"learning_rate": 2.6954177897574124e-08,
"logits/chosen": 24.2269344329834,
"logits/rejected": 11.215648651123047,
"logps/chosen": -28.242902755737305,
"logps/rejected": -20.951187133789062,
"loss": 0.6929,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 0.0030395330395549536,
"rewards/margins": 0.001963509013876319,
"rewards/rejected": 0.0010760227451100945,
"step": 10
},
{
"epoch": 0.02,
"grad_norm": 14.205098357697945,
"learning_rate": 5.390835579514825e-08,
"logits/chosen": 18.444320678710938,
"logits/rejected": 9.983879089355469,
"logps/chosen": -31.849700927734375,
"logps/rejected": -23.311790466308594,
"loss": 0.6933,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.0012197095202282071,
"rewards/margins": 0.0035085126291960478,
"rewards/rejected": -0.0022888043895363808,
"step": 20
},
{
"epoch": 0.02,
"grad_norm": 16.673011790453604,
"learning_rate": 8.086253369272237e-08,
"logits/chosen": 17.74435043334961,
"logits/rejected": 8.345118522644043,
"logps/chosen": -30.406158447265625,
"logps/rejected": -21.020471572875977,
"loss": 0.6895,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.010070301592350006,
"rewards/margins": 0.020921695977449417,
"rewards/rejected": -0.010851392522454262,
"step": 30
},
{
"epoch": 0.03,
"grad_norm": 14.573591360793975,
"learning_rate": 1.078167115902965e-07,
"logits/chosen": 18.73330307006836,
"logits/rejected": 9.43653678894043,
"logps/chosen": -31.389293670654297,
"logps/rejected": -22.603178024291992,
"loss": 0.6886,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.016900639981031418,
"rewards/margins": 0.02598029002547264,
"rewards/rejected": -0.009079648181796074,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 15.064895702850436,
"learning_rate": 1.347708894878706e-07,
"logits/chosen": 21.941606521606445,
"logits/rejected": 11.377029418945312,
"logps/chosen": -26.427230834960938,
"logps/rejected": -22.94554901123047,
"loss": 0.682,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.004804253112524748,
"rewards/margins": 0.022189300507307053,
"rewards/rejected": -0.026993554085493088,
"step": 50
},
{
"epoch": 0.05,
"grad_norm": 11.51348272649836,
"learning_rate": 1.6172506738544473e-07,
"logits/chosen": 18.3980770111084,
"logits/rejected": 8.642730712890625,
"logps/chosen": -31.209949493408203,
"logps/rejected": -22.193492889404297,
"loss": 0.6801,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.004430429544299841,
"rewards/margins": 0.025433117523789406,
"rewards/rejected": -0.029863541945815086,
"step": 60
},
{
"epoch": 0.06,
"grad_norm": 12.59495036610267,
"learning_rate": 1.8867924528301886e-07,
"logits/chosen": 22.12859535217285,
"logits/rejected": 11.408747673034668,
"logps/chosen": -33.64688491821289,
"logps/rejected": -25.069316864013672,
"loss": 0.6677,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": 0.007852818816900253,
"rewards/margins": 0.07031778991222382,
"rewards/rejected": -0.06246497482061386,
"step": 70
},
{
"epoch": 0.06,
"grad_norm": 13.631521920184003,
"learning_rate": 2.15633423180593e-07,
"logits/chosen": 19.470523834228516,
"logits/rejected": 9.611539840698242,
"logps/chosen": -34.38616180419922,
"logps/rejected": -24.599369049072266,
"loss": 0.6537,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0013453528517857194,
"rewards/margins": 0.07501634210348129,
"rewards/rejected": -0.07636170089244843,
"step": 80
},
{
"epoch": 0.07,
"grad_norm": 15.155810829127399,
"learning_rate": 2.425876010781671e-07,
"logits/chosen": 18.752395629882812,
"logits/rejected": 10.284080505371094,
"logps/chosen": -30.28765296936035,
"logps/rejected": -22.451499938964844,
"loss": 0.6389,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.02513519860804081,
"rewards/margins": 0.14448794722557068,
"rewards/rejected": -0.11935273557901382,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 15.287885727754697,
"learning_rate": 2.695417789757412e-07,
"logits/chosen": 18.602680206298828,
"logits/rejected": 10.510783195495605,
"logps/chosen": -27.710529327392578,
"logps/rejected": -20.73261260986328,
"loss": 0.6084,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0187116377055645,
"rewards/margins": 0.17055347561836243,
"rewards/rejected": -0.15184184908866882,
"step": 100
},
{
"epoch": 0.09,
"grad_norm": 14.136752828952837,
"learning_rate": 2.9649595687331536e-07,
"logits/chosen": 18.222576141357422,
"logits/rejected": 8.738725662231445,
"logps/chosen": -32.60607147216797,
"logps/rejected": -23.04232406616211,
"loss": 0.577,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.010421124286949635,
"rewards/margins": 0.258650541305542,
"rewards/rejected": -0.24822942912578583,
"step": 110
},
{
"epoch": 0.1,
"grad_norm": 10.395501571662873,
"learning_rate": 3.2345013477088946e-07,
"logits/chosen": 20.114803314208984,
"logits/rejected": 10.068379402160645,
"logps/chosen": -32.489681243896484,
"logps/rejected": -24.69915199279785,
"loss": 0.5631,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.022254476323723793,
"rewards/margins": 0.33683449029922485,
"rewards/rejected": -0.3145800232887268,
"step": 120
},
{
"epoch": 0.11,
"grad_norm": 13.16014124752717,
"learning_rate": 3.504043126684636e-07,
"logits/chosen": 20.067184448242188,
"logits/rejected": 9.164568901062012,
"logps/chosen": -32.826438903808594,
"logps/rejected": -23.3800106048584,
"loss": 0.5475,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.000534355640411377,
"rewards/margins": 0.368183434009552,
"rewards/rejected": -0.3676490783691406,
"step": 130
},
{
"epoch": 0.11,
"grad_norm": 9.132463704120546,
"learning_rate": 3.773584905660377e-07,
"logits/chosen": 17.273075103759766,
"logits/rejected": 7.174586296081543,
"logps/chosen": -35.4624137878418,
"logps/rejected": -26.397430419921875,
"loss": 0.5052,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": 0.009195957332849503,
"rewards/margins": 0.5245504379272461,
"rewards/rejected": -0.515354573726654,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 9.183413111659194,
"learning_rate": 4.043126684636118e-07,
"logits/chosen": 16.8369197845459,
"logits/rejected": 7.7970170974731445,
"logps/chosen": -32.14591979980469,
"logps/rejected": -30.26938819885254,
"loss": 0.4874,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.035380132496356964,
"rewards/margins": 0.7585235238075256,
"rewards/rejected": -0.723143458366394,
"step": 150
},
{
"epoch": 0.13,
"grad_norm": 7.878919640462125,
"learning_rate": 4.31266846361186e-07,
"logits/chosen": 19.754840850830078,
"logits/rejected": 9.633878707885742,
"logps/chosen": -31.04324722290039,
"logps/rejected": -27.710851669311523,
"loss": 0.4563,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.042704738676548004,
"rewards/margins": 0.838780403137207,
"rewards/rejected": -0.7960756421089172,
"step": 160
},
{
"epoch": 0.14,
"grad_norm": 7.2682115815248185,
"learning_rate": 4.582210242587601e-07,
"logits/chosen": 17.248498916625977,
"logits/rejected": 6.265856742858887,
"logps/chosen": -35.737918853759766,
"logps/rejected": -33.12517547607422,
"loss": 0.4045,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.042743489146232605,
"rewards/margins": 1.2479735612869263,
"rewards/rejected": -1.2052299976348877,
"step": 170
},
{
"epoch": 0.15,
"grad_norm": 6.655107331443258,
"learning_rate": 4.851752021563342e-07,
"logits/chosen": 15.79400634765625,
"logits/rejected": 7.4026994705200195,
"logps/chosen": -33.69775390625,
"logps/rejected": -33.79288864135742,
"loss": 0.415,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.022446583956480026,
"rewards/margins": 1.2890950441360474,
"rewards/rejected": -1.266648530960083,
"step": 180
},
{
"epoch": 0.15,
"grad_norm": 6.738689797871621,
"learning_rate": 5.121293800539083e-07,
"logits/chosen": 17.93876075744629,
"logits/rejected": 7.969454765319824,
"logps/chosen": -39.0764274597168,
"logps/rejected": -38.42572784423828,
"loss": 0.3938,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.012814655900001526,
"rewards/margins": 1.4817765951156616,
"rewards/rejected": -1.4945913553237915,
"step": 190
},
{
"epoch": 0.16,
"grad_norm": 7.571413646680112,
"learning_rate": 5.390835579514824e-07,
"logits/chosen": 15.535064697265625,
"logits/rejected": 7.618639945983887,
"logps/chosen": -29.554052352905273,
"logps/rejected": -38.369998931884766,
"loss": 0.3967,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.009946437552571297,
"rewards/margins": 1.5956742763519287,
"rewards/rejected": -1.5857279300689697,
"step": 200
},
{
"epoch": 0.17,
"grad_norm": 8.30487385022327,
"learning_rate": 5.660377358490566e-07,
"logits/chosen": 17.70827865600586,
"logits/rejected": 9.438637733459473,
"logps/chosen": -25.7026309967041,
"logps/rejected": -35.64281463623047,
"loss": 0.4122,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.03218810260295868,
"rewards/margins": 1.5774122476577759,
"rewards/rejected": -1.5452241897583008,
"step": 210
},
{
"epoch": 0.18,
"grad_norm": 10.394921980828073,
"learning_rate": 5.929919137466307e-07,
"logits/chosen": 14.426767349243164,
"logits/rejected": 6.211413383483887,
"logps/chosen": -30.019893646240234,
"logps/rejected": -32.835453033447266,
"loss": 0.3722,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.03471699357032776,
"rewards/margins": 1.463291049003601,
"rewards/rejected": -1.4285740852355957,
"step": 220
},
{
"epoch": 0.19,
"grad_norm": 6.375611768144224,
"learning_rate": 6.199460916442049e-07,
"logits/chosen": 13.590364456176758,
"logits/rejected": 5.274600028991699,
"logps/chosen": -32.46255111694336,
"logps/rejected": -35.932701110839844,
"loss": 0.3672,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.015460697934031487,
"rewards/margins": 1.697251558303833,
"rewards/rejected": -1.6817909479141235,
"step": 230
},
{
"epoch": 0.19,
"grad_norm": 7.0998588698532,
"learning_rate": 6.469002695417789e-07,
"logits/chosen": 15.448835372924805,
"logits/rejected": 7.325920104980469,
"logps/chosen": -31.609811782836914,
"logps/rejected": -36.574485778808594,
"loss": 0.3442,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.02785242535173893,
"rewards/margins": 1.6729406118392944,
"rewards/rejected": -1.7007930278778076,
"step": 240
},
{
"epoch": 0.2,
"grad_norm": 11.112020896957825,
"learning_rate": 6.738544474393531e-07,
"logits/chosen": 11.986245155334473,
"logits/rejected": 5.750561714172363,
"logps/chosen": -25.793066024780273,
"logps/rejected": -41.1673469543457,
"loss": 0.3791,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.025392189621925354,
"rewards/margins": 1.8996098041534424,
"rewards/rejected": -1.925002098083496,
"step": 250
},
{
"epoch": 0.21,
"grad_norm": 8.28256712940868,
"learning_rate": 7.008086253369272e-07,
"logits/chosen": 12.781267166137695,
"logits/rejected": 5.152982234954834,
"logps/chosen": -39.48822021484375,
"logps/rejected": -41.072654724121094,
"loss": 0.3437,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.0019520943751558661,
"rewards/margins": 2.153017520904541,
"rewards/rejected": -2.1549696922302246,
"step": 260
},
{
"epoch": 0.22,
"grad_norm": 7.89202268057561,
"learning_rate": 7.277628032345014e-07,
"logits/chosen": 9.854484558105469,
"logits/rejected": 4.801867485046387,
"logps/chosen": -28.593700408935547,
"logps/rejected": -42.71391677856445,
"loss": 0.3319,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.09453903138637543,
"rewards/margins": 2.2262840270996094,
"rewards/rejected": -2.3208229541778564,
"step": 270
},
{
"epoch": 0.23,
"grad_norm": 8.826305045755074,
"learning_rate": 7.547169811320754e-07,
"logits/chosen": 9.630620956420898,
"logits/rejected": 3.555039167404175,
"logps/chosen": -33.346229553222656,
"logps/rejected": -44.17544937133789,
"loss": 0.3312,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.12214686721563339,
"rewards/margins": 2.3439300060272217,
"rewards/rejected": -2.4660770893096924,
"step": 280
},
{
"epoch": 0.23,
"grad_norm": 8.653546928358537,
"learning_rate": 7.816711590296495e-07,
"logits/chosen": 10.433551788330078,
"logits/rejected": 3.2461979389190674,
"logps/chosen": -39.67103576660156,
"logps/rejected": -42.812984466552734,
"loss": 0.3321,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.12133828550577164,
"rewards/margins": 2.284976005554199,
"rewards/rejected": -2.4063143730163574,
"step": 290
},
{
"epoch": 0.24,
"grad_norm": 8.824713073330798,
"learning_rate": 8.086253369272237e-07,
"logits/chosen": 11.353474617004395,
"logits/rejected": 4.719082832336426,
"logps/chosen": -35.91315841674805,
"logps/rejected": -44.575462341308594,
"loss": 0.3245,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.12168803066015244,
"rewards/margins": 2.267569065093994,
"rewards/rejected": -2.3892571926116943,
"step": 300
},
{
"epoch": 0.25,
"grad_norm": 9.66741690133772,
"learning_rate": 8.355795148247978e-07,
"logits/chosen": 10.735493659973145,
"logits/rejected": 4.588931560516357,
"logps/chosen": -26.46687889099121,
"logps/rejected": -49.5245246887207,
"loss": 0.2771,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.17442038655281067,
"rewards/margins": 2.738029956817627,
"rewards/rejected": -2.9124507904052734,
"step": 310
},
{
"epoch": 0.26,
"grad_norm": 9.315896892682426,
"learning_rate": 8.62533692722372e-07,
"logits/chosen": 10.167337417602539,
"logits/rejected": 4.709428787231445,
"logps/chosen": -27.1495304107666,
"logps/rejected": -42.73828125,
"loss": 0.3354,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.22090284526348114,
"rewards/margins": 2.134673833847046,
"rewards/rejected": -2.355576515197754,
"step": 320
},
{
"epoch": 0.27,
"grad_norm": 13.726622644360368,
"learning_rate": 8.89487870619946e-07,
"logits/chosen": 10.056068420410156,
"logits/rejected": 3.6924197673797607,
"logps/chosen": -29.271936416625977,
"logps/rejected": -52.2385368347168,
"loss": 0.2851,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.25930890440940857,
"rewards/margins": 2.7558753490448,
"rewards/rejected": -3.0151844024658203,
"step": 330
},
{
"epoch": 0.28,
"grad_norm": 9.719990562865625,
"learning_rate": 9.164420485175202e-07,
"logits/chosen": 10.418781280517578,
"logits/rejected": 3.512908935546875,
"logps/chosen": -35.38787841796875,
"logps/rejected": -54.7783203125,
"loss": 0.3212,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.13537803292274475,
"rewards/margins": 3.10463285446167,
"rewards/rejected": -3.2400107383728027,
"step": 340
},
{
"epoch": 0.28,
"grad_norm": 7.713291117111928,
"learning_rate": 9.433962264150943e-07,
"logits/chosen": 11.484260559082031,
"logits/rejected": 2.651977300643921,
"logps/chosen": -40.273658752441406,
"logps/rejected": -58.427894592285156,
"loss": 0.3083,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.1973624974489212,
"rewards/margins": 3.379941940307617,
"rewards/rejected": -3.5773043632507324,
"step": 350
},
{
"epoch": 0.29,
"grad_norm": 8.823882065396496,
"learning_rate": 9.703504043126684e-07,
"logits/chosen": 9.80009937286377,
"logits/rejected": 3.363906145095825,
"logps/chosen": -35.430931091308594,
"logps/rejected": -53.07218551635742,
"loss": 0.2974,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.268148809671402,
"rewards/margins": 2.871591567993164,
"rewards/rejected": -3.1397411823272705,
"step": 360
},
{
"epoch": 0.3,
"grad_norm": 8.758247083429891,
"learning_rate": 9.973045822102425e-07,
"logits/chosen": 8.435894966125488,
"logits/rejected": 2.97558331489563,
"logps/chosen": -30.389368057250977,
"logps/rejected": -57.72784423828125,
"loss": 0.2906,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.27489131689071655,
"rewards/margins": 3.4869256019592285,
"rewards/rejected": -3.761817216873169,
"step": 370
},
{
"epoch": 0.31,
"grad_norm": 7.631748999946755,
"learning_rate": 9.973005398920215e-07,
"logits/chosen": 9.810181617736816,
"logits/rejected": 2.715953826904297,
"logps/chosen": -32.38187789916992,
"logps/rejected": -59.24934005737305,
"loss": 0.2515,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.26635074615478516,
"rewards/margins": 3.6957993507385254,
"rewards/rejected": -3.9621498584747314,
"step": 380
},
{
"epoch": 0.32,
"grad_norm": 9.640014523194331,
"learning_rate": 9.943011397720455e-07,
"logits/chosen": 6.572465419769287,
"logits/rejected": 1.5894476175308228,
"logps/chosen": -35.82884216308594,
"logps/rejected": -62.335121154785156,
"loss": 0.2944,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3674604296684265,
"rewards/margins": 3.553370952606201,
"rewards/rejected": -3.9208312034606934,
"step": 390
},
{
"epoch": 0.32,
"grad_norm": 8.447688684618234,
"learning_rate": 9.913017396520695e-07,
"logits/chosen": 8.614189147949219,
"logits/rejected": 2.1055407524108887,
"logps/chosen": -36.229373931884766,
"logps/rejected": -67.00838470458984,
"loss": 0.2475,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.36652085185050964,
"rewards/margins": 4.133388996124268,
"rewards/rejected": -4.499909400939941,
"step": 400
},
{
"epoch": 0.33,
"grad_norm": 10.308333518649784,
"learning_rate": 9.883023395320934e-07,
"logits/chosen": 7.3046979904174805,
"logits/rejected": 1.8474470376968384,
"logps/chosen": -33.466575622558594,
"logps/rejected": -54.38008499145508,
"loss": 0.2786,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.4685148298740387,
"rewards/margins": 2.9815940856933594,
"rewards/rejected": -3.450108766555786,
"step": 410
},
{
"epoch": 0.34,
"grad_norm": 11.066433923488011,
"learning_rate": 9.853029394121174e-07,
"logits/chosen": 8.033042907714844,
"logits/rejected": 1.4042919874191284,
"logps/chosen": -36.41872024536133,
"logps/rejected": -62.8419303894043,
"loss": 0.2887,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3985442519187927,
"rewards/margins": 3.7590441703796387,
"rewards/rejected": -4.157588481903076,
"step": 420
},
{
"epoch": 0.35,
"grad_norm": 9.436834774341992,
"learning_rate": 9.823035392921416e-07,
"logits/chosen": 8.418048858642578,
"logits/rejected": 2.3021135330200195,
"logps/chosen": -32.22361755371094,
"logps/rejected": -59.6855583190918,
"loss": 0.2838,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.2626524865627289,
"rewards/margins": 3.715350389480591,
"rewards/rejected": -3.9780030250549316,
"step": 430
},
{
"epoch": 0.36,
"grad_norm": 8.228341584616038,
"learning_rate": 9.793041391721656e-07,
"logits/chosen": 6.120682716369629,
"logits/rejected": 1.3484184741973877,
"logps/chosen": -33.420013427734375,
"logps/rejected": -50.40047836303711,
"loss": 0.2841,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.2531784176826477,
"rewards/margins": 3.000014305114746,
"rewards/rejected": -3.253192901611328,
"step": 440
},
{
"epoch": 0.36,
"grad_norm": 10.32480072105457,
"learning_rate": 9.763047390521895e-07,
"logits/chosen": 5.195191383361816,
"logits/rejected": 1.293041467666626,
"logps/chosen": -32.28490447998047,
"logps/rejected": -65.37628173828125,
"loss": 0.2243,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.3500588834285736,
"rewards/margins": 4.006264686584473,
"rewards/rejected": -4.356323719024658,
"step": 450
},
{
"epoch": 0.37,
"grad_norm": 9.684893504642167,
"learning_rate": 9.733053389322135e-07,
"logits/chosen": 6.817592620849609,
"logits/rejected": 1.8375227451324463,
"logps/chosen": -32.0588264465332,
"logps/rejected": -58.29685592651367,
"loss": 0.2894,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.3759368360042572,
"rewards/margins": 3.311121702194214,
"rewards/rejected": -3.687058687210083,
"step": 460
},
{
"epoch": 0.38,
"grad_norm": 10.551148814526442,
"learning_rate": 9.703059388122375e-07,
"logits/chosen": 7.666343688964844,
"logits/rejected": 0.895540714263916,
"logps/chosen": -40.66641616821289,
"logps/rejected": -62.7708625793457,
"loss": 0.266,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.376313179731369,
"rewards/margins": 3.9723620414733887,
"rewards/rejected": -4.34867525100708,
"step": 470
},
{
"epoch": 0.39,
"grad_norm": 13.955838544627463,
"learning_rate": 9.673065386922614e-07,
"logits/chosen": 7.45211935043335,
"logits/rejected": 1.4658912420272827,
"logps/chosen": -37.92814254760742,
"logps/rejected": -60.00164031982422,
"loss": 0.2695,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.5523335337638855,
"rewards/margins": 3.546916961669922,
"rewards/rejected": -4.099250793457031,
"step": 480
},
{
"epoch": 0.4,
"grad_norm": 6.509468723388655,
"learning_rate": 9.643071385722856e-07,
"logits/chosen": 6.468374729156494,
"logits/rejected": 1.589951992034912,
"logps/chosen": -31.105602264404297,
"logps/rejected": -68.32657623291016,
"loss": 0.233,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.3899196684360504,
"rewards/margins": 4.078752040863037,
"rewards/rejected": -4.4686713218688965,
"step": 490
},
{
"epoch": 0.4,
"grad_norm": 8.461512679967555,
"learning_rate": 9.613077384523096e-07,
"logits/chosen": 5.951247215270996,
"logits/rejected": 0.2273063212633133,
"logps/chosen": -40.5043830871582,
"logps/rejected": -66.83312225341797,
"loss": 0.2335,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.46840596199035645,
"rewards/margins": 4.177884578704834,
"rewards/rejected": -4.6462907791137695,
"step": 500
},
{
"epoch": 0.41,
"grad_norm": 11.967462722409747,
"learning_rate": 9.583083383323336e-07,
"logits/chosen": 7.466969966888428,
"logits/rejected": 2.0265185832977295,
"logps/chosen": -36.37960433959961,
"logps/rejected": -62.22391891479492,
"loss": 0.3024,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.5502637624740601,
"rewards/margins": 3.5663585662841797,
"rewards/rejected": -4.116622447967529,
"step": 510
},
{
"epoch": 0.42,
"grad_norm": 13.48790921542868,
"learning_rate": 9.553089382123575e-07,
"logits/chosen": 7.438709259033203,
"logits/rejected": 2.2369534969329834,
"logps/chosen": -31.806324005126953,
"logps/rejected": -62.761444091796875,
"loss": 0.2708,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.3073802590370178,
"rewards/margins": 3.9625587463378906,
"rewards/rejected": -4.269938945770264,
"step": 520
},
{
"epoch": 0.43,
"grad_norm": 8.131965384825092,
"learning_rate": 9.523095380923815e-07,
"logits/chosen": 6.172783851623535,
"logits/rejected": 1.3336482048034668,
"logps/chosen": -38.68606948852539,
"logps/rejected": -61.667015075683594,
"loss": 0.2684,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.5361462831497192,
"rewards/margins": 3.6847171783447266,
"rewards/rejected": -4.220863342285156,
"step": 530
},
{
"epoch": 0.44,
"grad_norm": 9.497892720299374,
"learning_rate": 9.493101379724055e-07,
"logits/chosen": 6.613364219665527,
"logits/rejected": 0.7903220057487488,
"logps/chosen": -38.494571685791016,
"logps/rejected": -69.2742919921875,
"loss": 0.2634,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.45714178681373596,
"rewards/margins": 4.283186912536621,
"rewards/rejected": -4.740328788757324,
"step": 540
},
{
"epoch": 0.45,
"grad_norm": 9.301689454835405,
"learning_rate": 9.463107378524294e-07,
"logits/chosen": 7.8855462074279785,
"logits/rejected": 1.7080109119415283,
"logps/chosen": -46.07327651977539,
"logps/rejected": -69.71966552734375,
"loss": 0.2601,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.4959110617637634,
"rewards/margins": 4.211344242095947,
"rewards/rejected": -4.7072553634643555,
"step": 550
},
{
"epoch": 0.45,
"grad_norm": 8.870511185695392,
"learning_rate": 9.433113377324534e-07,
"logits/chosen": 4.9687042236328125,
"logits/rejected": 0.5882563591003418,
"logps/chosen": -35.23982620239258,
"logps/rejected": -66.53153228759766,
"loss": 0.2627,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.6176945567131042,
"rewards/margins": 3.988445281982422,
"rewards/rejected": -4.606139659881592,
"step": 560
},
{
"epoch": 0.46,
"grad_norm": 9.088680603264963,
"learning_rate": 9.403119376124775e-07,
"logits/chosen": 4.735517501831055,
"logits/rejected": 0.7597037553787231,
"logps/chosen": -32.378013610839844,
"logps/rejected": -61.3135986328125,
"loss": 0.2795,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.517710268497467,
"rewards/margins": 3.588815212249756,
"rewards/rejected": -4.106525421142578,
"step": 570
},
{
"epoch": 0.47,
"grad_norm": 9.322483216606127,
"learning_rate": 9.373125374925015e-07,
"logits/chosen": 5.155986309051514,
"logits/rejected": 0.859477698802948,
"logps/chosen": -34.49011993408203,
"logps/rejected": -67.98521423339844,
"loss": 0.2787,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.48589468002319336,
"rewards/margins": 4.109589576721191,
"rewards/rejected": -4.595483779907227,
"step": 580
},
{
"epoch": 0.48,
"grad_norm": 8.369998864435798,
"learning_rate": 9.343131373725255e-07,
"logits/chosen": 6.183840751647949,
"logits/rejected": 0.43990451097488403,
"logps/chosen": -37.519309997558594,
"logps/rejected": -74.74612426757812,
"loss": 0.234,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.5745238065719604,
"rewards/margins": 4.902890205383301,
"rewards/rejected": -5.477413654327393,
"step": 590
},
{
"epoch": 0.49,
"grad_norm": 9.01122070108579,
"learning_rate": 9.313137372525495e-07,
"logits/chosen": 6.226532936096191,
"logits/rejected": 1.1327306032180786,
"logps/chosen": -43.671016693115234,
"logps/rejected": -65.06999206542969,
"loss": 0.2639,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5650517344474792,
"rewards/margins": 3.8689913749694824,
"rewards/rejected": -4.434042930603027,
"step": 600
},
{
"epoch": 0.49,
"grad_norm": 10.53898233882828,
"learning_rate": 9.283143371325735e-07,
"logits/chosen": 7.082172393798828,
"logits/rejected": 1.4899251461029053,
"logps/chosen": -39.97394561767578,
"logps/rejected": -67.15525817871094,
"loss": 0.2816,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.5089329481124878,
"rewards/margins": 4.089753150939941,
"rewards/rejected": -4.598686695098877,
"step": 610
},
{
"epoch": 0.5,
"grad_norm": 10.507702524930748,
"learning_rate": 9.253149370125974e-07,
"logits/chosen": 6.221233367919922,
"logits/rejected": 1.4244422912597656,
"logps/chosen": -37.602134704589844,
"logps/rejected": -63.385986328125,
"loss": 0.2623,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6569489240646362,
"rewards/margins": 3.7848668098449707,
"rewards/rejected": -4.441815376281738,
"step": 620
},
{
"epoch": 0.51,
"grad_norm": 7.547301709428263,
"learning_rate": 9.223155368926214e-07,
"logits/chosen": 5.5077009201049805,
"logits/rejected": 0.2255195677280426,
"logps/chosen": -44.82948684692383,
"logps/rejected": -70.71250915527344,
"loss": 0.2411,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.7187212705612183,
"rewards/margins": 4.245903491973877,
"rewards/rejected": -4.964625358581543,
"step": 630
},
{
"epoch": 0.52,
"grad_norm": 6.997225295696728,
"learning_rate": 9.193161367726454e-07,
"logits/chosen": 5.982936859130859,
"logits/rejected": 0.9599858522415161,
"logps/chosen": -37.054931640625,
"logps/rejected": -68.97079467773438,
"loss": 0.2675,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.6195434331893921,
"rewards/margins": 4.198363304138184,
"rewards/rejected": -4.817906379699707,
"step": 640
},
{
"epoch": 0.53,
"grad_norm": 12.60027391679346,
"learning_rate": 9.163167366526694e-07,
"logits/chosen": 3.3835721015930176,
"logits/rejected": -0.19015471637248993,
"logps/chosen": -28.28788185119629,
"logps/rejected": -66.44264221191406,
"loss": 0.2576,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.6566665768623352,
"rewards/margins": 4.069520473480225,
"rewards/rejected": -4.726187229156494,
"step": 650
},
{
"epoch": 0.53,
"grad_norm": 8.897497961790428,
"learning_rate": 9.133173365326934e-07,
"logits/chosen": 5.087882041931152,
"logits/rejected": 0.9116026163101196,
"logps/chosen": -33.74671173095703,
"logps/rejected": -73.04826354980469,
"loss": 0.2648,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.6395894289016724,
"rewards/margins": 4.4610419273376465,
"rewards/rejected": -5.100631237030029,
"step": 660
},
{
"epoch": 0.54,
"grad_norm": 10.358901625732027,
"learning_rate": 9.103179364127174e-07,
"logits/chosen": 5.677692413330078,
"logits/rejected": 0.2521089017391205,
"logps/chosen": -39.447689056396484,
"logps/rejected": -75.43064880371094,
"loss": 0.2183,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6245348453521729,
"rewards/margins": 4.826773643493652,
"rewards/rejected": -5.451308250427246,
"step": 670
},
{
"epoch": 0.55,
"grad_norm": 13.919487286435421,
"learning_rate": 9.073185362927415e-07,
"logits/chosen": 6.0167036056518555,
"logits/rejected": 0.1550506204366684,
"logps/chosen": -33.495323181152344,
"logps/rejected": -73.1063461303711,
"loss": 0.2139,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.6048420667648315,
"rewards/margins": 4.695935249328613,
"rewards/rejected": -5.300777435302734,
"step": 680
},
{
"epoch": 0.56,
"grad_norm": 9.811099990146761,
"learning_rate": 9.043191361727654e-07,
"logits/chosen": 5.033354759216309,
"logits/rejected": 0.25484520196914673,
"logps/chosen": -39.27431106567383,
"logps/rejected": -71.24162292480469,
"loss": 0.2634,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.5098114013671875,
"rewards/margins": 4.455938339233398,
"rewards/rejected": -4.965749740600586,
"step": 690
},
{
"epoch": 0.57,
"grad_norm": 8.017368985065696,
"learning_rate": 9.013197360527894e-07,
"logits/chosen": 5.161509990692139,
"logits/rejected": -0.10750408470630646,
"logps/chosen": -33.15007781982422,
"logps/rejected": -68.93727111816406,
"loss": 0.2397,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.6771873235702515,
"rewards/margins": 4.225132942199707,
"rewards/rejected": -4.90231990814209,
"step": 700
},
{
"epoch": 0.57,
"grad_norm": 9.515143348659938,
"learning_rate": 8.983203359328135e-07,
"logits/chosen": 6.287454605102539,
"logits/rejected": 0.1438266783952713,
"logps/chosen": -35.95096206665039,
"logps/rejected": -72.1221923828125,
"loss": 0.2619,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.5426121950149536,
"rewards/margins": 4.689238548278809,
"rewards/rejected": -5.231850624084473,
"step": 710
},
{
"epoch": 0.58,
"grad_norm": 8.853639892057634,
"learning_rate": 8.953209358128374e-07,
"logits/chosen": 6.399009704589844,
"logits/rejected": 0.3351520895957947,
"logps/chosen": -42.617916107177734,
"logps/rejected": -77.32844543457031,
"loss": 0.2486,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.6509742736816406,
"rewards/margins": 4.959528923034668,
"rewards/rejected": -5.610503673553467,
"step": 720
},
{
"epoch": 0.59,
"grad_norm": 6.236836391613244,
"learning_rate": 8.923215356928614e-07,
"logits/chosen": 4.475239276885986,
"logits/rejected": -0.2508452832698822,
"logps/chosen": -37.084800720214844,
"logps/rejected": -71.18461608886719,
"loss": 0.2146,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6816585063934326,
"rewards/margins": 4.352997779846191,
"rewards/rejected": -5.034656047821045,
"step": 730
},
{
"epoch": 0.6,
"grad_norm": 11.394264503228657,
"learning_rate": 8.893221355728854e-07,
"logits/chosen": 2.8366403579711914,
"logits/rejected": -0.25868746638298035,
"logps/chosen": -30.913869857788086,
"logps/rejected": -65.82793426513672,
"loss": 0.2565,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.47125107049942017,
"rewards/margins": 4.218214988708496,
"rewards/rejected": -4.6894659996032715,
"step": 740
},
{
"epoch": 0.61,
"grad_norm": 9.63703798333239,
"learning_rate": 8.863227354529093e-07,
"logits/chosen": 5.555853366851807,
"logits/rejected": 0.24771659076213837,
"logps/chosen": -42.00291061401367,
"logps/rejected": -71.82039642333984,
"loss": 0.2199,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.6926873922348022,
"rewards/margins": 4.374499320983887,
"rewards/rejected": -5.0671868324279785,
"step": 750
},
{
"epoch": 0.62,
"grad_norm": 12.475179147108308,
"learning_rate": 8.833233353329333e-07,
"logits/chosen": 4.9750657081604,
"logits/rejected": 0.9217138290405273,
"logps/chosen": -32.88280487060547,
"logps/rejected": -68.7645034790039,
"loss": 0.2622,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.6225011944770813,
"rewards/margins": 4.266197681427002,
"rewards/rejected": -4.888699054718018,
"step": 760
},
{
"epoch": 0.62,
"grad_norm": 13.961604966562444,
"learning_rate": 8.803239352129574e-07,
"logits/chosen": 4.99851655960083,
"logits/rejected": 0.3421913683414459,
"logps/chosen": -39.43068313598633,
"logps/rejected": -63.01778030395508,
"loss": 0.2288,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.5820320844650269,
"rewards/margins": 3.9034628868103027,
"rewards/rejected": -4.485495090484619,
"step": 770
},
{
"epoch": 0.63,
"grad_norm": 8.898695794199242,
"learning_rate": 8.773245350929814e-07,
"logits/chosen": 4.911359786987305,
"logits/rejected": -0.29061204195022583,
"logps/chosen": -38.77482986450195,
"logps/rejected": -71.60365295410156,
"loss": 0.2317,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.6680704951286316,
"rewards/margins": 4.3021674156188965,
"rewards/rejected": -4.970237731933594,
"step": 780
},
{
"epoch": 0.64,
"grad_norm": 7.082466242287932,
"learning_rate": 8.743251349730053e-07,
"logits/chosen": 8.117277145385742,
"logits/rejected": 0.22241690754890442,
"logps/chosen": -43.375308990478516,
"logps/rejected": -83.64913177490234,
"loss": 0.2283,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.4950665533542633,
"rewards/margins": 5.797509670257568,
"rewards/rejected": -6.292576789855957,
"step": 790
},
{
"epoch": 0.65,
"grad_norm": 11.152027144249061,
"learning_rate": 8.713257348530294e-07,
"logits/chosen": 3.5433716773986816,
"logits/rejected": 0.14973898231983185,
"logps/chosen": -37.43271255493164,
"logps/rejected": -64.81431579589844,
"loss": 0.2465,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.6468411087989807,
"rewards/margins": 3.858449935913086,
"rewards/rejected": -4.505290508270264,
"step": 800
},
{
"epoch": 0.66,
"grad_norm": 10.057327590247457,
"learning_rate": 8.683263347330534e-07,
"logits/chosen": 4.4199323654174805,
"logits/rejected": -0.7671071887016296,
"logps/chosen": -41.11504364013672,
"logps/rejected": -84.44219970703125,
"loss": 0.2022,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.7142581343650818,
"rewards/margins": 5.4674906730651855,
"rewards/rejected": -6.181748390197754,
"step": 810
},
{
"epoch": 0.66,
"grad_norm": 8.438998651633067,
"learning_rate": 8.653269346130773e-07,
"logits/chosen": 5.182823181152344,
"logits/rejected": 0.20817065238952637,
"logps/chosen": -36.99726486206055,
"logps/rejected": -67.30989074707031,
"loss": 0.2377,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.5765316486358643,
"rewards/margins": 4.2105937004089355,
"rewards/rejected": -4.787125587463379,
"step": 820
},
{
"epoch": 0.67,
"grad_norm": 11.4901243014845,
"learning_rate": 8.623275344931013e-07,
"logits/chosen": 5.666651725769043,
"logits/rejected": 0.33456581830978394,
"logps/chosen": -39.16795349121094,
"logps/rejected": -85.42790222167969,
"loss": 0.2227,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -0.4975081980228424,
"rewards/margins": 5.7258501052856445,
"rewards/rejected": -6.223358154296875,
"step": 830
},
{
"epoch": 0.68,
"grad_norm": 19.82780237798911,
"learning_rate": 8.593281343731254e-07,
"logits/chosen": 4.063232898712158,
"logits/rejected": -0.5040629506111145,
"logps/chosen": -34.486968994140625,
"logps/rejected": -70.23979187011719,
"loss": 0.2577,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6543490886688232,
"rewards/margins": 4.389010429382324,
"rewards/rejected": -5.043359279632568,
"step": 840
},
{
"epoch": 0.69,
"grad_norm": 13.135947133773865,
"learning_rate": 8.563287342531494e-07,
"logits/chosen": 4.684737682342529,
"logits/rejected": 0.15397313237190247,
"logps/chosen": -44.84247970581055,
"logps/rejected": -81.59874725341797,
"loss": 0.2415,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.934431254863739,
"rewards/margins": 4.985320091247559,
"rewards/rejected": -5.9197516441345215,
"step": 850
},
{
"epoch": 0.7,
"grad_norm": 9.942117794842815,
"learning_rate": 8.533293341331733e-07,
"logits/chosen": 5.942558765411377,
"logits/rejected": -0.4634874761104584,
"logps/chosen": -40.689144134521484,
"logps/rejected": -85.91549682617188,
"loss": 0.2115,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.8215259313583374,
"rewards/margins": 5.591064453125,
"rewards/rejected": -6.412590026855469,
"step": 860
},
{
"epoch": 0.7,
"grad_norm": 10.317389757107083,
"learning_rate": 8.503299340131973e-07,
"logits/chosen": 6.38779354095459,
"logits/rejected": -0.08178432285785675,
"logps/chosen": -44.51734161376953,
"logps/rejected": -84.4544677734375,
"loss": 0.235,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.6095073223114014,
"rewards/margins": 5.518308639526367,
"rewards/rejected": -6.127816200256348,
"step": 870
},
{
"epoch": 0.71,
"grad_norm": 10.455585529285335,
"learning_rate": 8.473305338932213e-07,
"logits/chosen": 4.026412010192871,
"logits/rejected": -0.11150656640529633,
"logps/chosen": -41.58991241455078,
"logps/rejected": -79.89984130859375,
"loss": 0.2029,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.7930614948272705,
"rewards/margins": 5.062989234924316,
"rewards/rejected": -5.856051445007324,
"step": 880
},
{
"epoch": 0.72,
"grad_norm": 12.413727527066243,
"learning_rate": 8.443311337732452e-07,
"logits/chosen": 3.686732769012451,
"logits/rejected": -0.9453102350234985,
"logps/chosen": -36.16538619995117,
"logps/rejected": -66.45039367675781,
"loss": 0.2538,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.6018454432487488,
"rewards/margins": 4.1831464767456055,
"rewards/rejected": -4.784992218017578,
"step": 890
},
{
"epoch": 0.73,
"grad_norm": 5.779220350203222,
"learning_rate": 8.413317336532693e-07,
"logits/chosen": 3.2173056602478027,
"logits/rejected": -0.6864817142486572,
"logps/chosen": -45.675899505615234,
"logps/rejected": -79.54066467285156,
"loss": 0.2205,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -1.37534499168396,
"rewards/margins": 4.585756301879883,
"rewards/rejected": -5.961101055145264,
"step": 900
},
{
"epoch": 0.74,
"grad_norm": 8.005991362378799,
"learning_rate": 8.383323335332934e-07,
"logits/chosen": 4.786983489990234,
"logits/rejected": 0.6237784624099731,
"logps/chosen": -35.7918815612793,
"logps/rejected": -78.17144012451172,
"loss": 0.2198,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5418484210968018,
"rewards/margins": 5.177511215209961,
"rewards/rejected": -5.719359397888184,
"step": 910
},
{
"epoch": 0.74,
"grad_norm": 10.754977883833703,
"learning_rate": 8.353329334133174e-07,
"logits/chosen": 4.775103569030762,
"logits/rejected": -0.6356438398361206,
"logps/chosen": -38.72078323364258,
"logps/rejected": -76.97815704345703,
"loss": 0.2345,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5932806730270386,
"rewards/margins": 5.025919437408447,
"rewards/rejected": -5.619199752807617,
"step": 920
},
{
"epoch": 0.75,
"grad_norm": 8.370671264923958,
"learning_rate": 8.323335332933413e-07,
"logits/chosen": 3.7196967601776123,
"logits/rejected": -0.6006534099578857,
"logps/chosen": -37.750770568847656,
"logps/rejected": -75.3314208984375,
"loss": 0.239,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.6637129783630371,
"rewards/margins": 4.798027992248535,
"rewards/rejected": -5.4617414474487305,
"step": 930
},
{
"epoch": 0.76,
"grad_norm": 6.856873898439941,
"learning_rate": 8.293341331733653e-07,
"logits/chosen": 4.082217216491699,
"logits/rejected": -0.9965828657150269,
"logps/chosen": -37.654178619384766,
"logps/rejected": -84.30792236328125,
"loss": 0.2356,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.9460653066635132,
"rewards/margins": 5.248031139373779,
"rewards/rejected": -6.194096088409424,
"step": 940
},
{
"epoch": 0.77,
"grad_norm": 12.985186244804439,
"learning_rate": 8.263347330533893e-07,
"logits/chosen": 3.5539650917053223,
"logits/rejected": -1.1321687698364258,
"logps/chosen": -38.285552978515625,
"logps/rejected": -74.53987121582031,
"loss": 0.2505,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -1.0940616130828857,
"rewards/margins": 4.375874996185303,
"rewards/rejected": -5.469937324523926,
"step": 950
},
{
"epoch": 0.78,
"grad_norm": 10.435750083651422,
"learning_rate": 8.233353329334133e-07,
"logits/chosen": 5.076735973358154,
"logits/rejected": 0.126987487077713,
"logps/chosen": -34.09984588623047,
"logps/rejected": -71.26011657714844,
"loss": 0.2292,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.7402558922767639,
"rewards/margins": 4.3730621337890625,
"rewards/rejected": -5.11331844329834,
"step": 960
},
{
"epoch": 0.79,
"grad_norm": 10.13777044080165,
"learning_rate": 8.203359328134373e-07,
"logits/chosen": 7.634838104248047,
"logits/rejected": 1.4603595733642578,
"logps/chosen": -40.903377532958984,
"logps/rejected": -76.29527282714844,
"loss": 0.1999,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.5216394066810608,
"rewards/margins": 5.088165283203125,
"rewards/rejected": -5.609805107116699,
"step": 970
},
{
"epoch": 0.79,
"grad_norm": 9.326208459350674,
"learning_rate": 8.173365326934613e-07,
"logits/chosen": 6.336162567138672,
"logits/rejected": 0.3762982487678528,
"logps/chosen": -33.36209487915039,
"logps/rejected": -74.17536926269531,
"loss": 0.2128,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.5508583784103394,
"rewards/margins": 4.918101787567139,
"rewards/rejected": -5.468959808349609,
"step": 980
},
{
"epoch": 0.8,
"grad_norm": 8.983054286726517,
"learning_rate": 8.143371325734852e-07,
"logits/chosen": 4.599123001098633,
"logits/rejected": -0.9218250513076782,
"logps/chosen": -43.528175354003906,
"logps/rejected": -74.83673095703125,
"loss": 0.196,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.7882202863693237,
"rewards/margins": 4.805144309997559,
"rewards/rejected": -5.593365669250488,
"step": 990
},
{
"epoch": 0.81,
"grad_norm": 7.586393164977657,
"learning_rate": 8.113377324535092e-07,
"logits/chosen": 5.207022666931152,
"logits/rejected": -0.7721199989318848,
"logps/chosen": -42.59235382080078,
"logps/rejected": -85.05528259277344,
"loss": 0.2199,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.7499576807022095,
"rewards/margins": 5.488531112670898,
"rewards/rejected": -6.238488674163818,
"step": 1000
},
{
"epoch": 0.82,
"grad_norm": 9.992746900458588,
"learning_rate": 8.083383323335332e-07,
"logits/chosen": 3.9228744506835938,
"logits/rejected": -0.4620266854763031,
"logps/chosen": -37.23911666870117,
"logps/rejected": -78.92009735107422,
"loss": 0.2303,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.8569334745407104,
"rewards/margins": 4.876101970672607,
"rewards/rejected": -5.733035087585449,
"step": 1010
},
{
"epoch": 0.83,
"grad_norm": 10.673195989249953,
"learning_rate": 8.053389322135572e-07,
"logits/chosen": 5.194386959075928,
"logits/rejected": -0.3306584060192108,
"logps/chosen": -37.87696075439453,
"logps/rejected": -72.78455352783203,
"loss": 0.2187,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.6252318024635315,
"rewards/margins": 4.7294392585754395,
"rewards/rejected": -5.354671478271484,
"step": 1020
},
{
"epoch": 0.83,
"grad_norm": 14.803696255288303,
"learning_rate": 8.023395320935813e-07,
"logits/chosen": 3.891000270843506,
"logits/rejected": -0.9068805575370789,
"logps/chosen": -40.6781120300293,
"logps/rejected": -81.7871322631836,
"loss": 0.1926,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.6426281929016113,
"rewards/margins": 5.368206977844238,
"rewards/rejected": -6.01083517074585,
"step": 1030
},
{
"epoch": 0.84,
"grad_norm": 7.25945056094058,
"learning_rate": 7.993401319736053e-07,
"logits/chosen": 3.9840798377990723,
"logits/rejected": -0.24789929389953613,
"logps/chosen": -33.29187774658203,
"logps/rejected": -69.07560729980469,
"loss": 0.2182,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.6687902212142944,
"rewards/margins": 4.272780418395996,
"rewards/rejected": -4.941570281982422,
"step": 1040
},
{
"epoch": 0.85,
"grad_norm": 9.78196474122683,
"learning_rate": 7.963407318536293e-07,
"logits/chosen": 5.05679988861084,
"logits/rejected": 0.055974699556827545,
"logps/chosen": -37.91151809692383,
"logps/rejected": -80.07110595703125,
"loss": 0.2448,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": -0.7068133354187012,
"rewards/margins": 5.139575958251953,
"rewards/rejected": -5.8463897705078125,
"step": 1050
},
{
"epoch": 0.86,
"grad_norm": 8.672914354007535,
"learning_rate": 7.933413317336532e-07,
"logits/chosen": 5.2890625,
"logits/rejected": 0.1258922517299652,
"logps/chosen": -37.52037048339844,
"logps/rejected": -84.71490478515625,
"loss": 0.1865,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5368348956108093,
"rewards/margins": 5.716190814971924,
"rewards/rejected": -6.253025531768799,
"step": 1060
},
{
"epoch": 0.87,
"grad_norm": 9.84113162499674,
"learning_rate": 7.903419316136772e-07,
"logits/chosen": 6.731717109680176,
"logits/rejected": 0.1761070042848587,
"logps/chosen": -46.707733154296875,
"logps/rejected": -83.32044219970703,
"loss": 0.2375,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8217447996139526,
"rewards/margins": 5.439145088195801,
"rewards/rejected": -6.260889530181885,
"step": 1070
},
{
"epoch": 0.87,
"grad_norm": 10.836630414800007,
"learning_rate": 7.873425314937012e-07,
"logits/chosen": 5.533178806304932,
"logits/rejected": 0.15569272637367249,
"logps/chosen": -37.35350799560547,
"logps/rejected": -78.94861602783203,
"loss": 0.1957,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.545146107673645,
"rewards/margins": 5.181343078613281,
"rewards/rejected": -5.726489067077637,
"step": 1080
},
{
"epoch": 0.88,
"grad_norm": 8.602866268180797,
"learning_rate": 7.843431313737253e-07,
"logits/chosen": 4.896660804748535,
"logits/rejected": -0.805033802986145,
"logps/chosen": -39.619224548339844,
"logps/rejected": -77.04666137695312,
"loss": 0.2214,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.47726893424987793,
"rewards/margins": 5.242520332336426,
"rewards/rejected": -5.719789981842041,
"step": 1090
},
{
"epoch": 0.89,
"grad_norm": 6.662307424265464,
"learning_rate": 7.813437312537492e-07,
"logits/chosen": 4.894705772399902,
"logits/rejected": -0.20809988677501678,
"logps/chosen": -31.3262996673584,
"logps/rejected": -81.197265625,
"loss": 0.176,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.6251105666160583,
"rewards/margins": 5.416481018066406,
"rewards/rejected": -6.041591644287109,
"step": 1100
},
{
"epoch": 0.9,
"grad_norm": 13.236914879599565,
"learning_rate": 7.783443311337732e-07,
"logits/chosen": 5.344097137451172,
"logits/rejected": 0.4349094331264496,
"logps/chosen": -41.192283630371094,
"logps/rejected": -73.71807861328125,
"loss": 0.2189,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.9201833605766296,
"rewards/margins": 4.493067741394043,
"rewards/rejected": -5.4132513999938965,
"step": 1110
},
{
"epoch": 0.91,
"grad_norm": 11.447379083352251,
"learning_rate": 7.753449310137972e-07,
"logits/chosen": 5.665997505187988,
"logits/rejected": -1.3642793893814087,
"logps/chosen": -40.07326889038086,
"logps/rejected": -86.33983612060547,
"loss": 0.1977,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.7224239110946655,
"rewards/margins": 5.889608860015869,
"rewards/rejected": -6.612032413482666,
"step": 1120
},
{
"epoch": 0.91,
"grad_norm": 13.326349881889897,
"learning_rate": 7.723455308938211e-07,
"logits/chosen": 6.115036487579346,
"logits/rejected": -0.4208458960056305,
"logps/chosen": -39.56875991821289,
"logps/rejected": -86.91368103027344,
"loss": 0.195,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.669539213180542,
"rewards/margins": 5.865464210510254,
"rewards/rejected": -6.535002708435059,
"step": 1130
},
{
"epoch": 0.92,
"grad_norm": 9.16173651681725,
"learning_rate": 7.693461307738452e-07,
"logits/chosen": 4.2573957443237305,
"logits/rejected": -1.075480580329895,
"logps/chosen": -40.893592834472656,
"logps/rejected": -78.26148223876953,
"loss": 0.2122,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.8861662745475769,
"rewards/margins": 5.005410671234131,
"rewards/rejected": -5.89157772064209,
"step": 1140
},
{
"epoch": 0.93,
"grad_norm": 8.806688464717144,
"learning_rate": 7.663467306538693e-07,
"logits/chosen": 4.599158763885498,
"logits/rejected": -0.7346502542495728,
"logps/chosen": -39.177040100097656,
"logps/rejected": -87.11119842529297,
"loss": 0.2062,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -0.8978098034858704,
"rewards/margins": 5.6298370361328125,
"rewards/rejected": -6.527646541595459,
"step": 1150
},
{
"epoch": 0.94,
"grad_norm": 25.10098990447531,
"learning_rate": 7.633473305338932e-07,
"logits/chosen": 3.0874524116516113,
"logits/rejected": -0.8507956266403198,
"logps/chosen": -35.36151885986328,
"logps/rejected": -74.7900390625,
"loss": 0.2189,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.6411249041557312,
"rewards/margins": 4.888244152069092,
"rewards/rejected": -5.529369831085205,
"step": 1160
},
{
"epoch": 0.95,
"grad_norm": 10.385014559881746,
"learning_rate": 7.603479304139172e-07,
"logits/chosen": 3.757722854614258,
"logits/rejected": -0.22128507494926453,
"logps/chosen": -37.696983337402344,
"logps/rejected": -73.14608001708984,
"loss": 0.2361,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": -0.9848117828369141,
"rewards/margins": 4.35174036026001,
"rewards/rejected": -5.336552143096924,
"step": 1170
},
{
"epoch": 0.96,
"grad_norm": 10.380895353757795,
"learning_rate": 7.573485302939412e-07,
"logits/chosen": 4.578551769256592,
"logits/rejected": -0.7743647694587708,
"logps/chosen": -39.889625549316406,
"logps/rejected": -82.70549774169922,
"loss": 0.2109,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.8430160284042358,
"rewards/margins": 5.3374342918396,
"rewards/rejected": -6.180450916290283,
"step": 1180
},
{
"epoch": 0.96,
"grad_norm": 12.046426752286843,
"learning_rate": 7.543491301739652e-07,
"logits/chosen": 4.2825775146484375,
"logits/rejected": -0.97093665599823,
"logps/chosen": -40.860382080078125,
"logps/rejected": -89.05497741699219,
"loss": 0.211,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -0.7804887890815735,
"rewards/margins": 5.945776462554932,
"rewards/rejected": -6.726265907287598,
"step": 1190
},
{
"epoch": 0.97,
"grad_norm": 6.32612572100064,
"learning_rate": 7.513497300539891e-07,
"logits/chosen": 4.0176568031311035,
"logits/rejected": -0.5003149509429932,
"logps/chosen": -38.51813507080078,
"logps/rejected": -80.8168716430664,
"loss": 0.2054,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.5089032649993896,
"rewards/margins": 5.293961048126221,
"rewards/rejected": -5.802863597869873,
"step": 1200
},
{
"epoch": 0.98,
"grad_norm": 5.9075949824072875,
"learning_rate": 7.483503299340131e-07,
"logits/chosen": 3.3083388805389404,
"logits/rejected": -1.7419131994247437,
"logps/chosen": -38.396018981933594,
"logps/rejected": -82.67097473144531,
"loss": 0.2315,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": -0.7572061419487,
"rewards/margins": 5.389390468597412,
"rewards/rejected": -6.146596431732178,
"step": 1210
},
{
"epoch": 0.99,
"grad_norm": 10.20062551022738,
"learning_rate": 7.453509298140372e-07,
"logits/chosen": 4.2033233642578125,
"logits/rejected": -0.8881649971008301,
"logps/chosen": -37.607303619384766,
"logps/rejected": -84.1115951538086,
"loss": 0.1816,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5505583882331848,
"rewards/margins": 5.680405616760254,
"rewards/rejected": -6.230964183807373,
"step": 1220
},
{
"epoch": 1.0,
"grad_norm": 7.177796631658208,
"learning_rate": 7.423515296940611e-07,
"logits/chosen": 6.914002418518066,
"logits/rejected": 0.5874744057655334,
"logps/chosen": -40.15135955810547,
"logps/rejected": -82.51225280761719,
"loss": 0.214,
"rewards/accuracies": 0.8374999761581421,
"rewards/chosen": -0.8042081594467163,
"rewards/margins": 5.3896894454956055,
"rewards/rejected": -6.193896770477295,
"step": 1230
},
{
"epoch": 1.0,
"eval_logits/chosen": 3.8774516582489014,
"eval_logits/rejected": -1.1117931604385376,
"eval_logps/chosen": -28.125341415405273,
"eval_logps/rejected": -61.24203872680664,
"eval_loss": 0.16555103659629822,
"eval_rewards/accuracies": 1.0,
"eval_rewards/chosen": -0.05321738123893738,
"eval_rewards/margins": 4.279094219207764,
"eval_rewards/rejected": -4.332311153411865,
"eval_runtime": 19.0346,
"eval_samples_per_second": 5.254,
"eval_steps_per_second": 0.368,
"step": 1235
}
],
"logging_steps": 10,
"max_steps": 3705,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}