weights_for_naj / trainer_state.json
derko83's picture
Upload folder using huggingface_hub
f3565a6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 4479,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033489618218352314,
"grad_norm": 78.33113098144531,
"learning_rate": 2.1875e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.2264862060547,
"logps/rejected": -218.9656982421875,
"loss": 0.6923,
"rewards/accuracies": 0.4137499928474426,
"rewards/chosen": 0.0005424434202723205,
"rewards/margins": 0.0029623538721352816,
"rewards/rejected": -0.0024199108593165874,
"step": 50
},
{
"epoch": 0.06697923643670463,
"grad_norm": 106.17163848876953,
"learning_rate": 4.419642857142857e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.5259246826172,
"logps/rejected": -224.7578887939453,
"loss": 0.6907,
"rewards/accuracies": 0.42124998569488525,
"rewards/chosen": -0.005094751715660095,
"rewards/margins": 0.00641661649569869,
"rewards/rejected": -0.011511369608342648,
"step": 100
},
{
"epoch": 0.10046885465505694,
"grad_norm": 86.04861450195312,
"learning_rate": 6.651785714285713e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -165.04095458984375,
"logps/rejected": -219.6518096923828,
"loss": 0.6756,
"rewards/accuracies": 0.5112500190734863,
"rewards/chosen": -0.026584235951304436,
"rewards/margins": 0.03996539115905762,
"rewards/rejected": -0.0665496289730072,
"step": 150
},
{
"epoch": 0.13395847287340926,
"grad_norm": 82.77224731445312,
"learning_rate": 8.88392857142857e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.75872802734375,
"logps/rejected": -223.51528930664062,
"loss": 0.6591,
"rewards/accuracies": 0.5099999904632568,
"rewards/chosen": -0.11609632521867752,
"rewards/margins": 0.10106377303600311,
"rewards/rejected": -0.21716010570526123,
"step": 200
},
{
"epoch": 0.16744809109176156,
"grad_norm": 135.95346069335938,
"learning_rate": 1.1116071428571427e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.67254638671875,
"logps/rejected": -226.42140197753906,
"loss": 0.6295,
"rewards/accuracies": 0.5099999904632568,
"rewards/chosen": -0.18336135149002075,
"rewards/margins": 0.20662552118301392,
"rewards/rejected": -0.38998690247535706,
"step": 250
},
{
"epoch": 0.20093770931011387,
"grad_norm": 89.77359771728516,
"learning_rate": 1.3348214285714285e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.88687133789062,
"logps/rejected": -226.58355712890625,
"loss": 0.602,
"rewards/accuracies": 0.5637500286102295,
"rewards/chosen": -0.29084426164627075,
"rewards/margins": 0.32938891649246216,
"rewards/rejected": -0.6202332377433777,
"step": 300
},
{
"epoch": 0.23442732752846618,
"grad_norm": 89.93605041503906,
"learning_rate": 1.558035714285714e-06,
"logits/chosen": NaN,
"logits/rejected": -1.608971118927002,
"logps/chosen": -176.1905059814453,
"logps/rejected": -231.0211944580078,
"loss": 0.5782,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.4454282522201538,
"rewards/margins": 0.5162708163261414,
"rewards/rejected": -0.9616988897323608,
"step": 350
},
{
"epoch": 0.2679169457468185,
"grad_norm": 113.58289337158203,
"learning_rate": 1.7812499999999999e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.52401733398438,
"logps/rejected": -236.76588439941406,
"loss": 0.5478,
"rewards/accuracies": 0.6150000095367432,
"rewards/chosen": -0.5549299120903015,
"rewards/margins": 0.8102107048034668,
"rewards/rejected": -1.3651405572891235,
"step": 400
},
{
"epoch": 0.3014065639651708,
"grad_norm": 100.28213500976562,
"learning_rate": 1.999999696300462e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.1260223388672,
"logps/rejected": -235.15631103515625,
"loss": 0.5635,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": -0.48344433307647705,
"rewards/margins": 0.770007848739624,
"rewards/rejected": -1.253452181816101,
"step": 450
},
{
"epoch": 0.33489618218352313,
"grad_norm": 90.32833099365234,
"learning_rate": 1.999210181452139e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.36907958984375,
"logps/rejected": -232.14285278320312,
"loss": 0.5376,
"rewards/accuracies": 0.6087499856948853,
"rewards/chosen": -0.5261387825012207,
"rewards/margins": 0.8372372984886169,
"rewards/rejected": -1.3633761405944824,
"step": 500
},
{
"epoch": 0.3683858004018754,
"grad_norm": 72.57466125488281,
"learning_rate": 1.996903560165487e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.88233947753906,
"logps/rejected": -242.15728759765625,
"loss": 0.5083,
"rewards/accuracies": 0.6225000023841858,
"rewards/chosen": -0.5493210554122925,
"rewards/margins": 1.0930429697036743,
"rewards/rejected": -1.6423640251159668,
"step": 550
},
{
"epoch": 0.40187541862022774,
"grad_norm": 47.55934143066406,
"learning_rate": 1.993083334596579e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.1678924560547,
"logps/rejected": -251.43661499023438,
"loss": 0.5193,
"rewards/accuracies": 0.6225000023841858,
"rewards/chosen": -0.7250985503196716,
"rewards/margins": 1.2086968421936035,
"rewards/rejected": -1.9337953329086304,
"step": 600
},
{
"epoch": 0.43536503683858,
"grad_norm": 90.7481460571289,
"learning_rate": 1.987755305015383e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -196.693359375,
"logps/rejected": -247.3010711669922,
"loss": 0.516,
"rewards/accuracies": 0.6137499809265137,
"rewards/chosen": -0.6984607577323914,
"rewards/margins": 1.173628807067871,
"rewards/rejected": -1.8720895051956177,
"step": 650
},
{
"epoch": 0.46885465505693236,
"grad_norm": 86.08389282226562,
"learning_rate": 1.980927560999178e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.29693603515625,
"logps/rejected": -245.04824829101562,
"loss": 0.5057,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.6868166327476501,
"rewards/margins": 1.367271900177002,
"rewards/rejected": -2.0540883541107178,
"step": 700
},
{
"epoch": 0.5023442732752846,
"grad_norm": 40.12553405761719,
"learning_rate": 1.9726104691501045e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.41378784179688,
"logps/rejected": -240.62547302246094,
"loss": 0.5132,
"rewards/accuracies": 0.5975000262260437,
"rewards/chosen": -0.5570769309997559,
"rewards/margins": 1.2463946342468262,
"rewards/rejected": -1.803471326828003,
"step": 750
},
{
"epoch": 0.535833891493637,
"grad_norm": 36.09309005737305,
"learning_rate": 1.9628166573554945e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -170.22169494628906,
"logps/rejected": -239.9406280517578,
"loss": 0.4553,
"rewards/accuracies": 0.6449999809265137,
"rewards/chosen": -0.5568282604217529,
"rewards/margins": 1.5600597858428955,
"rewards/rejected": -2.1168878078460693,
"step": 800
},
{
"epoch": 0.5693235097119893,
"grad_norm": 88.8606185913086,
"learning_rate": 1.951560995614879e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.4136199951172,
"logps/rejected": -241.44386291503906,
"loss": 0.4912,
"rewards/accuracies": 0.6175000071525574,
"rewards/chosen": -0.6789398193359375,
"rewards/margins": 1.448940634727478,
"rewards/rejected": -2.127880573272705,
"step": 850
},
{
"epoch": 0.6028131279303416,
"grad_norm": 37.501346588134766,
"learning_rate": 1.9388605734627843e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.4543914794922,
"logps/rejected": -241.45433044433594,
"loss": 0.505,
"rewards/accuracies": 0.6212499737739563,
"rewards/chosen": -0.719947338104248,
"rewards/margins": 1.5332283973693848,
"rewards/rejected": -2.253175735473633,
"step": 900
},
{
"epoch": 0.6363027461486939,
"grad_norm": 58.78173065185547,
"learning_rate": 1.9247346740215936e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.4608612060547,
"logps/rejected": -236.8692169189453,
"loss": 0.4756,
"rewards/accuracies": 0.6274999976158142,
"rewards/chosen": -0.5931037068367004,
"rewards/margins": 1.6174336671829224,
"rewards/rejected": -2.2105374336242676,
"step": 950
},
{
"epoch": 0.6697923643670463,
"grad_norm": 53.627410888671875,
"learning_rate": 1.909204744723877e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -169.64356994628906,
"logps/rejected": -238.07931518554688,
"loss": 0.4699,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5164381265640259,
"rewards/margins": 1.6023368835449219,
"rewards/rejected": -2.1187753677368164,
"step": 1000
},
{
"epoch": 0.7032819825853985,
"grad_norm": 47.64691162109375,
"learning_rate": 1.8922943647486314e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -174.08212280273438,
"logps/rejected": -251.6885223388672,
"loss": 0.4309,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.560505211353302,
"rewards/margins": 1.9433872699737549,
"rewards/rejected": -2.503892421722412,
"step": 1050
},
{
"epoch": 0.7367716008037508,
"grad_norm": 58.94224166870117,
"learning_rate": 1.8740292092208816e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -162.09487915039062,
"logps/rejected": -236.79824829101562,
"loss": 0.4293,
"rewards/accuracies": 0.6524999737739563,
"rewards/chosen": -0.6041057705879211,
"rewards/margins": 2.0014426708221436,
"rewards/rejected": -2.60554838180542,
"step": 1100
},
{
"epoch": 0.7702612190221031,
"grad_norm": 41.707763671875,
"learning_rate": 1.8544370102289943e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.0761260986328,
"logps/rejected": -240.7725067138672,
"loss": 0.4419,
"rewards/accuracies": 0.6612499952316284,
"rewards/chosen": -0.6522895097732544,
"rewards/margins": 1.7689578533172607,
"rewards/rejected": -2.4212474822998047,
"step": 1150
},
{
"epoch": 0.8037508372404555,
"grad_norm": 45.48369216918945,
"learning_rate": 1.83354751471889e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.2169952392578,
"logps/rejected": -264.9205322265625,
"loss": 0.4503,
"rewards/accuracies": 0.6549999713897705,
"rewards/chosen": -0.49645543098449707,
"rewards/margins": 2.04986572265625,
"rewards/rejected": -2.546321392059326,
"step": 1200
},
{
"epoch": 0.8372404554588078,
"grad_norm": 51.16058349609375,
"learning_rate": 1.8113924393290904e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.03074645996094,
"logps/rejected": -249.8163604736328,
"loss": 0.4319,
"rewards/accuracies": 0.6612499952316284,
"rewards/chosen": -0.6471911072731018,
"rewards/margins": 2.1099319458007812,
"rewards/rejected": -2.7571229934692383,
"step": 1250
},
{
"epoch": 0.87073007367716,
"grad_norm": 64.02259063720703,
"learning_rate": 1.7880054222351658e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.18972778320312,
"logps/rejected": -237.3641815185547,
"loss": 0.4155,
"rewards/accuracies": 0.6725000143051147,
"rewards/chosen": -0.38780125975608826,
"rewards/margins": 1.9852185249328613,
"rewards/rejected": -2.3730199337005615,
"step": 1300
},
{
"epoch": 0.9042196918955124,
"grad_norm": 35.12641525268555,
"learning_rate": 1.763421972076705e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.52285766601562,
"logps/rejected": -247.11244201660156,
"loss": 0.4359,
"rewards/accuracies": 0.6512500047683716,
"rewards/chosen": -0.493091344833374,
"rewards/margins": 1.8931076526641846,
"rewards/rejected": -2.3861987590789795,
"step": 1350
},
{
"epoch": 0.9377093101138647,
"grad_norm": 64.41110229492188,
"learning_rate": 1.7376794140443474e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.29629516601562,
"logps/rejected": -234.5249481201172,
"loss": 0.4512,
"rewards/accuracies": 0.6549999713897705,
"rewards/chosen": -0.4724200367927551,
"rewards/margins": 1.9340243339538574,
"rewards/rejected": -2.4064440727233887,
"step": 1400
},
{
"epoch": 0.971198928332217,
"grad_norm": 26.93653106689453,
"learning_rate": 1.7108168332087366e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.42259216308594,
"logps/rejected": -243.82032775878906,
"loss": 0.4343,
"rewards/accuracies": 0.6512500047683716,
"rewards/chosen": -0.3961036205291748,
"rewards/margins": 1.8803616762161255,
"rewards/rejected": -2.27646541595459,
"step": 1450
},
{
"epoch": 1.0046885465505693,
"grad_norm": 74.74053955078125,
"learning_rate": 1.682875015177438e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -174.56732177734375,
"logps/rejected": -246.36451721191406,
"loss": 0.3957,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": -0.34164169430732727,
"rewards/margins": 2.248396635055542,
"rewards/rejected": -2.590038537979126,
"step": 1500
},
{
"epoch": 1.0381781647689217,
"grad_norm": 58.65504455566406,
"learning_rate": 1.6538963841699207e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.5469207763672,
"logps/rejected": -258.92706298828125,
"loss": 0.2861,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.2739707827568054,
"rewards/margins": 3.0113985538482666,
"rewards/rejected": -3.2853691577911377,
"step": 1550
},
{
"epoch": 1.0716677829872738,
"grad_norm": 59.74324417114258,
"learning_rate": 1.6239249386046274e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.00692749023438,
"logps/rejected": -255.23556518554688,
"loss": 0.2914,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -0.4652925729751587,
"rewards/margins": 3.098710298538208,
"rewards/rejected": -3.564002752304077,
"step": 1600
},
{
"epoch": 1.1051574012056262,
"grad_norm": 37.80025863647461,
"learning_rate": 1.593006184295927e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -185.12716674804688,
"logps/rejected": -254.19509887695312,
"loss": 0.2798,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -0.28863173723220825,
"rewards/margins": 3.227825880050659,
"rewards/rejected": -3.516458034515381,
"step": 1650
},
{
"epoch": 1.1386470194239786,
"grad_norm": 40.97309875488281,
"learning_rate": 1.5611870653623825e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.1793975830078,
"logps/rejected": -245.0845184326172,
"loss": 0.2778,
"rewards/accuracies": 0.7450000047683716,
"rewards/chosen": -0.3949226438999176,
"rewards/margins": 3.3151471614837646,
"rewards/rejected": -3.7100696563720703,
"step": 1700
},
{
"epoch": 1.1721366376423308,
"grad_norm": 61.272247314453125,
"learning_rate": 1.5285158929512291e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -174.18487548828125,
"logps/rejected": -247.96957397460938,
"loss": 0.3048,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.4471362233161926,
"rewards/margins": 3.481740951538086,
"rewards/rejected": -3.928877830505371,
"step": 1750
},
{
"epoch": 1.2056262558606832,
"grad_norm": 20.384906768798828,
"learning_rate": 1.4950422718872916e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.91143798828125,
"logps/rejected": -264.8081970214844,
"loss": 0.2738,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -0.4734611213207245,
"rewards/margins": 3.4893076419830322,
"rewards/rejected": -3.962768793106079,
"step": 1800
},
{
"epoch": 1.2391158740790356,
"grad_norm": 46.84432601928711,
"learning_rate": 1.4608170253576945e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -171.02236938476562,
"logps/rejected": -259.7280578613281,
"loss": 0.2928,
"rewards/accuracies": 0.7262499928474426,
"rewards/chosen": -0.6498711109161377,
"rewards/margins": 3.556124210357666,
"rewards/rejected": -4.205995082855225,
"step": 1850
},
{
"epoch": 1.2726054922973877,
"grad_norm": 40.36602020263672,
"learning_rate": 1.4258921177467371e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.40257263183594,
"logps/rejected": -251.6402130126953,
"loss": 0.301,
"rewards/accuracies": 0.7325000166893005,
"rewards/chosen": -0.7374945878982544,
"rewards/margins": 3.618178606033325,
"rewards/rejected": -4.355673789978027,
"step": 1900
},
{
"epoch": 1.3060951105157401,
"grad_norm": 33.35322952270508,
"learning_rate": 1.3903205757380715e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.98854064941406,
"logps/rejected": -259.6983337402344,
"loss": 0.2985,
"rewards/accuracies": 0.7275000214576721,
"rewards/chosen": -0.7513535022735596,
"rewards/margins": 3.433237314224243,
"rewards/rejected": -4.184591293334961,
"step": 1950
},
{
"epoch": 1.3395847287340925,
"grad_norm": 31.858760833740234,
"learning_rate": 1.3541564078039942e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.97511291503906,
"logps/rejected": -267.1122131347656,
"loss": 0.307,
"rewards/accuracies": 0.7174999713897705,
"rewards/chosen": -0.6912581920623779,
"rewards/margins": 3.4836156368255615,
"rewards/rejected": -4.1748738288879395,
"step": 2000
},
{
"epoch": 1.3730743469524447,
"grad_norm": 40.272186279296875,
"learning_rate": 1.3174545222040757e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -181.2541046142578,
"logps/rejected": -267.8948974609375,
"loss": 0.2764,
"rewards/accuracies": 0.7612500190734863,
"rewards/chosen": -0.5613307356834412,
"rewards/margins": 3.6199841499328613,
"rewards/rejected": -4.181314468383789,
"step": 2050
},
{
"epoch": 1.406563965170797,
"grad_norm": 20.189088821411133,
"learning_rate": 1.2802706436176447e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.3399658203125,
"logps/rejected": -275.252685546875,
"loss": 0.2673,
"rewards/accuracies": 0.7512500286102295,
"rewards/chosen": -0.49821099638938904,
"rewards/margins": 3.6726813316345215,
"rewards/rejected": -4.170892238616943,
"step": 2100
},
{
"epoch": 1.4400535833891492,
"grad_norm": 28.09309196472168,
"learning_rate": 1.2426612285366904e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.54571533203125,
"logps/rejected": -272.14337158203125,
"loss": 0.2833,
"rewards/accuracies": 0.7649999856948853,
"rewards/chosen": -0.5274211168289185,
"rewards/margins": 3.785543203353882,
"rewards/rejected": -4.31296443939209,
"step": 2150
},
{
"epoch": 1.4735432016075016,
"grad_norm": 5.396151542663574,
"learning_rate": 1.2046833795476566e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.48960876464844,
"logps/rejected": -268.61944580078125,
"loss": 0.2594,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.3929290771484375,
"rewards/margins": 3.8942084312438965,
"rewards/rejected": -4.287137508392334,
"step": 2200
},
{
"epoch": 1.507032819825854,
"grad_norm": 26.636991500854492,
"learning_rate": 1.16639475863226e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.34547424316406,
"logps/rejected": -259.8311462402344,
"loss": 0.3026,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": -0.5500699281692505,
"rewards/margins": 3.565783739089966,
"rewards/rejected": -4.115853786468506,
"step": 2250
},
{
"epoch": 1.5405224380442064,
"grad_norm": 14.03653335571289,
"learning_rate": 1.1278534996189831e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.8995361328125,
"logps/rejected": -273.84112548828125,
"loss": 0.2603,
"rewards/accuracies": 0.7487499713897705,
"rewards/chosen": -0.5162584185600281,
"rewards/margins": 4.0679030418396,
"rewards/rejected": -4.584161758422852,
"step": 2300
},
{
"epoch": 1.5740120562625586,
"grad_norm": 67.45540618896484,
"learning_rate": 1.0891181199181518e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.06849670410156,
"logps/rejected": -265.9678649902344,
"loss": 0.272,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.5778465867042542,
"rewards/margins": 3.9320404529571533,
"rewards/rejected": -4.509886264801025,
"step": 2350
},
{
"epoch": 1.607501674480911,
"grad_norm": 21.127580642700195,
"learning_rate": 1.0502474316746242e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.6305694580078,
"logps/rejected": -265.5202331542969,
"loss": 0.2839,
"rewards/accuracies": 0.7462499737739563,
"rewards/chosen": -0.5587973594665527,
"rewards/margins": 3.9246935844421387,
"rewards/rejected": -4.48349142074585,
"step": 2400
},
{
"epoch": 1.6409912926992631,
"grad_norm": 47.24773025512695,
"learning_rate": 1.0113004524729797e-06,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -196.45948791503906,
"logps/rejected": -272.1256408691406,
"loss": 0.2791,
"rewards/accuracies": 0.7587500214576721,
"rewards/chosen": -0.5817875862121582,
"rewards/margins": 3.766108989715576,
"rewards/rejected": -4.347896099090576,
"step": 2450
},
{
"epoch": 1.6744809109176155,
"grad_norm": 20.178668975830078,
"learning_rate": 9.723363157307888e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.7681427001953,
"logps/rejected": -268.5182800292969,
"loss": 0.2744,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.5075680017471313,
"rewards/margins": 3.9134867191314697,
"rewards/rejected": -4.421054840087891,
"step": 2500
},
{
"epoch": 1.707970529135968,
"grad_norm": 31.073015213012695,
"learning_rate": 9.334141809160118e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.35658264160156,
"logps/rejected": -265.6587829589844,
"loss": 0.2405,
"rewards/accuracies": 0.7712500095367432,
"rewards/chosen": -0.6600850820541382,
"rewards/margins": 4.134018421173096,
"rewards/rejected": -4.794103622436523,
"step": 2550
},
{
"epoch": 1.7414601473543203,
"grad_norm": 36.3228759765625,
"learning_rate": 8.945931437248468e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.47000122070312,
"logps/rejected": -270.1788635253906,
"loss": 0.2674,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -0.6616349816322327,
"rewards/margins": 4.066000461578369,
"rewards/rejected": -4.727634906768799,
"step": 2600
},
{
"epoch": 1.7749497655726725,
"grad_norm": 27.108051300048828,
"learning_rate": 8.559321463564014e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.67808532714844,
"logps/rejected": -261.2061767578125,
"loss": 0.2494,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -0.5604009032249451,
"rewards/margins": 4.31578254699707,
"rewards/rejected": -4.876183032989502,
"step": 2650
},
{
"epoch": 1.8084393837910246,
"grad_norm": 54.821876525878906,
"learning_rate": 8.174898880204195e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.19236755371094,
"logps/rejected": -269.1416015625,
"loss": 0.2817,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": -0.5425779223442078,
"rewards/margins": 3.9950203895568848,
"rewards/rejected": -4.537598133087158,
"step": 2700
},
{
"epoch": 1.841929002009377,
"grad_norm": 36.13364791870117,
"learning_rate": 7.793247358139428e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.92677307128906,
"logps/rejected": -266.75799560546875,
"loss": 0.2885,
"rewards/accuracies": 0.7387499809265137,
"rewards/chosen": -0.5648588538169861,
"rewards/margins": 3.864666700363159,
"rewards/rejected": -4.429525852203369,
"step": 2750
},
{
"epoch": 1.8754186202277294,
"grad_norm": 24.641510009765625,
"learning_rate": 7.414946361022179e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -171.00909423828125,
"logps/rejected": -273.5279541015625,
"loss": 0.2695,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": -0.4850202798843384,
"rewards/margins": 4.063894271850586,
"rewards/rejected": -4.548914432525635,
"step": 2800
},
{
"epoch": 1.9089082384460818,
"grad_norm": 25.44546127319336,
"learning_rate": 7.040570265384029e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.17147827148438,
"logps/rejected": -272.64111328125,
"loss": 0.2881,
"rewards/accuracies": 0.7512500286102295,
"rewards/chosen": -0.5362930297851562,
"rewards/margins": 4.026025295257568,
"rewards/rejected": -4.562318325042725,
"step": 2850
},
{
"epoch": 1.942397856664434,
"grad_norm": 62.34092330932617,
"learning_rate": 6.670687488556586e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.8939208984375,
"logps/rejected": -270.8504943847656,
"loss": 0.2685,
"rewards/accuracies": 0.7337499856948853,
"rewards/chosen": -0.3625078499317169,
"rewards/margins": 4.072076797485352,
"rewards/rejected": -4.434584617614746,
"step": 2900
},
{
"epoch": 1.9758874748827864,
"grad_norm": 16.188819885253906,
"learning_rate": 6.305859625640224e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.49630737304688,
"logps/rejected": -280.4139404296875,
"loss": 0.2755,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.6155076026916504,
"rewards/margins": 4.242664337158203,
"rewards/rejected": -4.8581719398498535,
"step": 2950
},
{
"epoch": 2.0093770931011385,
"grad_norm": 35.435707092285156,
"learning_rate": 5.946640596831101e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -166.32289123535156,
"logps/rejected": -263.216552734375,
"loss": 0.2391,
"rewards/accuracies": 0.7712500095367432,
"rewards/chosen": -0.6572730541229248,
"rewards/margins": 4.339555740356445,
"rewards/rejected": -4.996828556060791,
"step": 3000
},
{
"epoch": 2.042866711319491,
"grad_norm": 42.23343276977539,
"learning_rate": 5.59357580640101e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.9312744140625,
"logps/rejected": -277.5908508300781,
"loss": 0.213,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.35315731167793274,
"rewards/margins": 4.545411586761475,
"rewards/rejected": -4.898569107055664,
"step": 3050
},
{
"epoch": 2.0763563295378433,
"grad_norm": 2.853132486343384,
"learning_rate": 5.247201314606984e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.59486389160156,
"logps/rejected": -276.373291015625,
"loss": 0.2047,
"rewards/accuracies": 0.7950000166893005,
"rewards/chosen": -0.3648325800895691,
"rewards/margins": 4.745596885681152,
"rewards/rejected": -5.110429763793945,
"step": 3100
},
{
"epoch": 2.1098459477561957,
"grad_norm": 22.07088851928711,
"learning_rate": 4.90804302378802e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.32708740234375,
"logps/rejected": -260.5697021484375,
"loss": 0.2054,
"rewards/accuracies": 0.7925000190734863,
"rewards/chosen": -0.48022788763046265,
"rewards/margins": 4.517958641052246,
"rewards/rejected": -4.998186111450195,
"step": 3150
},
{
"epoch": 2.1433355659745477,
"grad_norm": 50.728519439697266,
"learning_rate": 4.57661587988459e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.0932159423828,
"logps/rejected": -270.6129150390625,
"loss": 0.236,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -0.4882276654243469,
"rewards/margins": 4.606672286987305,
"rewards/rejected": -5.094900131225586,
"step": 3200
},
{
"epoch": 2.1768251841929,
"grad_norm": 19.410276412963867,
"learning_rate": 4.253423090593318e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -185.2410125732422,
"logps/rejected": -282.7039794921875,
"loss": 0.2242,
"rewards/accuracies": 0.7612500190734863,
"rewards/chosen": -0.5257070064544678,
"rewards/margins": 4.692570209503174,
"rewards/rejected": -5.218277454376221,
"step": 3250
},
{
"epoch": 2.2103148024112524,
"grad_norm": 45.68756103515625,
"learning_rate": 3.938955361343912e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.8925018310547,
"logps/rejected": -284.1990966796875,
"loss": 0.2259,
"rewards/accuracies": 0.7699999809265137,
"rewards/chosen": -0.605311930179596,
"rewards/margins": 4.8395843505859375,
"rewards/rejected": -5.444896221160889,
"step": 3300
},
{
"epoch": 2.243804420629605,
"grad_norm": 51.53227996826172,
"learning_rate": 3.6336901502583364e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.85601806640625,
"logps/rejected": -275.8158874511719,
"loss": 0.2048,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.6794506907463074,
"rewards/margins": 4.734764575958252,
"rewards/rejected": -5.414215087890625,
"step": 3350
},
{
"epoch": 2.2772940388479572,
"grad_norm": 3.569408893585205,
"learning_rate": 3.3380909432234807e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.00836181640625,
"logps/rejected": -280.286376953125,
"loss": 0.1999,
"rewards/accuracies": 0.7950000166893005,
"rewards/chosen": -0.6098263263702393,
"rewards/margins": 4.961060047149658,
"rewards/rejected": -5.570886611938477,
"step": 3400
},
{
"epoch": 2.3107836570663096,
"grad_norm": 27.362163543701172,
"learning_rate": 3.0526065501779184e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -172.97593688964844,
"logps/rejected": -275.5477600097656,
"loss": 0.2184,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6930285096168518,
"rewards/margins": 4.821885585784912,
"rewards/rejected": -5.514913558959961,
"step": 3450
},
{
"epoch": 2.3442732752846616,
"grad_norm": 28.243000030517578,
"learning_rate": 2.7776704236812454e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.44705200195312,
"logps/rejected": -277.888427734375,
"loss": 0.2128,
"rewards/accuracies": 0.7649999856948853,
"rewards/chosen": -0.6010170578956604,
"rewards/margins": 5.026294708251953,
"rewards/rejected": -5.6273112297058105,
"step": 3500
},
{
"epoch": 2.377762893503014,
"grad_norm": 14.03532886505127,
"learning_rate": 2.5137000008006437e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.77134704589844,
"logps/rejected": -279.57769775390625,
"loss": 0.21,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": -0.7788973450660706,
"rewards/margins": 5.022655010223389,
"rewards/rejected": -5.801552772521973,
"step": 3550
},
{
"epoch": 2.4112525117213663,
"grad_norm": 35.019554138183594,
"learning_rate": 2.261096069313816e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.45738220214844,
"logps/rejected": -281.279541015625,
"loss": 0.1887,
"rewards/accuracies": 0.8075000047683716,
"rewards/chosen": -0.7265406847000122,
"rewards/margins": 5.097284317016602,
"rewards/rejected": -5.823824882507324,
"step": 3600
},
{
"epoch": 2.4447421299397187,
"grad_norm": 25.041046142578125,
"learning_rate": 2.020242159190646e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.86915588378906,
"logps/rejected": -277.746826171875,
"loss": 0.2311,
"rewards/accuracies": 0.7587500214576721,
"rewards/chosen": -0.786669135093689,
"rewards/margins": 4.789151191711426,
"rewards/rejected": -5.575820446014404,
"step": 3650
},
{
"epoch": 2.478231748158071,
"grad_norm": 20.99360466003418,
"learning_rate": 1.7915039602775062e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.3199462890625,
"logps/rejected": -273.0755920410156,
"loss": 0.2429,
"rewards/accuracies": 0.7737500071525574,
"rewards/chosen": -0.8147923946380615,
"rewards/margins": 4.847590446472168,
"rewards/rejected": -5.66238260269165,
"step": 3700
},
{
"epoch": 2.511721366376423,
"grad_norm": 18.44826889038086,
"learning_rate": 1.5752287670682861e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -170.71795654296875,
"logps/rejected": -276.1592102050781,
"loss": 0.2043,
"rewards/accuracies": 0.7862499952316284,
"rewards/chosen": -0.638399064540863,
"rewards/margins": 5.212125301361084,
"rewards/rejected": -5.850524425506592,
"step": 3750
},
{
"epoch": 2.5452109845947755,
"grad_norm": 40.779659271240234,
"learning_rate": 1.3717449514052314e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.7264404296875,
"logps/rejected": -284.6885986328125,
"loss": 0.2033,
"rewards/accuracies": 0.7962499856948853,
"rewards/chosen": -0.882935106754303,
"rewards/margins": 5.128498554229736,
"rewards/rejected": -6.0114336013793945,
"step": 3800
},
{
"epoch": 2.578700602813128,
"grad_norm": 44.556678771972656,
"learning_rate": 1.1813614639101088e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.99533081054688,
"logps/rejected": -275.25518798828125,
"loss": 0.2274,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -0.703125,
"rewards/margins": 5.014428615570068,
"rewards/rejected": -5.717553615570068,
"step": 3850
},
{
"epoch": 2.6121902210314802,
"grad_norm": 61.39085388183594,
"learning_rate": 1.0043673649027517e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.3540802001953,
"logps/rejected": -282.1649475097656,
"loss": 0.2097,
"rewards/accuracies": 0.7662500143051147,
"rewards/chosen": -0.683403730392456,
"rewards/margins": 5.063638687133789,
"rewards/rejected": -5.747043609619141,
"step": 3900
},
{
"epoch": 2.6456798392498326,
"grad_norm": 58.0173454284668,
"learning_rate": 8.410313855191464e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.94400024414062,
"logps/rejected": -286.5594177246094,
"loss": 0.2042,
"rewards/accuracies": 0.7862499952316284,
"rewards/chosen": -0.8088821172714233,
"rewards/margins": 5.067000865936279,
"rewards/rejected": -5.875882625579834,
"step": 3950
},
{
"epoch": 2.679169457468185,
"grad_norm": 16.31562042236328,
"learning_rate": 6.916015196954383e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -185.46673583984375,
"logps/rejected": -288.2527770996094,
"loss": 0.217,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.7252050638198853,
"rewards/margins": 5.204960823059082,
"rewards/rejected": -5.930166244506836,
"step": 4000
},
{
"epoch": 2.7126590756865374,
"grad_norm": 20.799222946166992,
"learning_rate": 5.5630464763733787e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.50820922851562,
"logps/rejected": -288.9837646484375,
"loss": 0.2258,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -0.7981621026992798,
"rewards/margins": 5.062735557556152,
"rewards/rejected": -5.860898017883301,
"step": 4050
},
{
"epoch": 2.7461486939048894,
"grad_norm": 18.682947158813477,
"learning_rate": 4.353461913466405e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.44317626953125,
"logps/rejected": -266.35333251953125,
"loss": 0.2426,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -0.6803594827651978,
"rewards/margins": 4.8590497970581055,
"rewards/rejected": -5.539409160614014,
"step": 4100
},
{
"epoch": 2.7796383121232418,
"grad_norm": 54.06953048706055,
"learning_rate": 3.2890980272783255e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.65658569335938,
"logps/rejected": -280.3162536621094,
"loss": 0.2086,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.812857449054718,
"rewards/margins": 5.271449565887451,
"rewards/rejected": -6.0843071937561035,
"step": 4150
},
{
"epoch": 2.813127930341594,
"grad_norm": 12.436116218566895,
"learning_rate": 2.371570847483839e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.7625732421875,
"logps/rejected": -277.9272766113281,
"loss": 0.2046,
"rewards/accuracies": 0.78125,
"rewards/chosen": -0.6954517364501953,
"rewards/margins": 5.145771026611328,
"rewards/rejected": -5.841222763061523,
"step": 4200
},
{
"epoch": 2.8466175485599465,
"grad_norm": 66.9225845336914,
"learning_rate": 1.6022734607604393e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.79019165039062,
"logps/rejected": -282.13323974609375,
"loss": 0.2096,
"rewards/accuracies": 0.7925000190734863,
"rewards/chosen": -0.8357629179954529,
"rewards/margins": 5.103863716125488,
"rewards/rejected": -5.939626693725586,
"step": 4250
},
{
"epoch": 2.8801071667782985,
"grad_norm": 15.983145713806152,
"learning_rate": 9.823738956571182e-09,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.03807067871094,
"logps/rejected": -292.3773193359375,
"loss": 0.206,
"rewards/accuracies": 0.7912499904632568,
"rewards/chosen": -0.6932557821273804,
"rewards/margins": 5.146268367767334,
"rewards/rejected": -5.839523792266846,
"step": 4300
},
{
"epoch": 2.913596784996651,
"grad_norm": 33.383487701416016,
"learning_rate": 5.128133491700715e-09,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.7404327392578,
"logps/rejected": -289.3056945800781,
"loss": 0.1936,
"rewards/accuracies": 0.7975000143051147,
"rewards/chosen": -0.7487243413925171,
"rewards/margins": 5.300227642059326,
"rewards/rejected": -6.048952579498291,
"step": 4350
},
{
"epoch": 2.9470864032150033,
"grad_norm": 3.542743682861328,
"learning_rate": 1.9430475771796684e-09,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.31752014160156,
"logps/rejected": -268.015380859375,
"loss": 0.2124,
"rewards/accuracies": 0.7862499952316284,
"rewards/chosen": -0.6255255937576294,
"rewards/margins": 4.9648332595825195,
"rewards/rejected": -5.590358257293701,
"step": 4400
},
{
"epoch": 2.9805760214333556,
"grad_norm": 19.205642700195312,
"learning_rate": 2.733171468656259e-10,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.03684997558594,
"logps/rejected": -277.01495361328125,
"loss": 0.2059,
"rewards/accuracies": 0.7837499976158142,
"rewards/chosen": -0.7730162739753723,
"rewards/margins": 5.190572738647461,
"rewards/rejected": -5.963588714599609,
"step": 4450
}
],
"logging_steps": 50,
"max_steps": 4479,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}