hello8 / doo_poo_onion /600 /trainer_state.json
1001dinosaurs's picture
Upload folder using huggingface_hub
1e0ea83 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1813999342753863,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005915215248110417,
"grad_norm": 75.64228057861328,
"learning_rate": 6.535947712418302e-08,
"logits/chosen": 0.06831549108028412,
"logits/rejected": 0.22947487235069275,
"logps/chosen": -288.36907958984375,
"logps/rejected": -308.97442626953125,
"loss": 0.6936,
"rewards/accuracies": 0.15740741789340973,
"rewards/chosen": 0.0018866577884182334,
"rewards/margins": 0.00013007604866288602,
"rewards/rejected": 0.0017565814778208733,
"step": 3
},
{
"epoch": 0.011830430496220835,
"grad_norm": 85.5863265991211,
"learning_rate": 1.6339869281045755e-07,
"logits/chosen": 0.1193937212228775,
"logits/rejected": 0.14183634519577026,
"logps/chosen": -297.21484375,
"logps/rejected": -289.2657165527344,
"loss": 0.6933,
"rewards/accuracies": 0.513888955116272,
"rewards/chosen": 0.00845375657081604,
"rewards/margins": 0.003605001140385866,
"rewards/rejected": 0.004848754033446312,
"step": 6
},
{
"epoch": 0.017745645744331254,
"grad_norm": 92.01388549804688,
"learning_rate": 2.6143790849673207e-07,
"logits/chosen": 0.013294734060764313,
"logits/rejected": 0.10444601625204086,
"logps/chosen": -293.22174072265625,
"logps/rejected": -296.61444091796875,
"loss": 0.6879,
"rewards/accuracies": 0.5601852536201477,
"rewards/chosen": 0.0001924792304635048,
"rewards/margins": 0.014045190066099167,
"rewards/rejected": -0.013852710835635662,
"step": 9
},
{
"epoch": 0.02366086099244167,
"grad_norm": 63.192481994628906,
"learning_rate": 3.5947712418300653e-07,
"logits/chosen": 0.1236824318766594,
"logits/rejected": 0.056549232453107834,
"logps/chosen": -284.45782470703125,
"logps/rejected": -284.9747009277344,
"loss": 0.6899,
"rewards/accuracies": 0.5370370149612427,
"rewards/chosen": 0.007598253898322582,
"rewards/margins": 0.011467371135950089,
"rewards/rejected": -0.003869118168950081,
"step": 12
},
{
"epoch": 0.02957607624055209,
"grad_norm": 52.87683868408203,
"learning_rate": 4.5751633986928105e-07,
"logits/chosen": 0.1870713233947754,
"logits/rejected": 0.17200356721878052,
"logps/chosen": -278.11785888671875,
"logps/rejected": -288.7649841308594,
"loss": 0.6823,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": 0.018766043707728386,
"rewards/margins": 0.029558217152953148,
"rewards/rejected": -0.010792172513902187,
"step": 15
},
{
"epoch": 0.03549129148866251,
"grad_norm": 47.12651443481445,
"learning_rate": 5.555555555555555e-07,
"logits/chosen": 0.2681354880332947,
"logits/rejected": 0.2608181834220886,
"logps/chosen": -299.3112487792969,
"logps/rejected": -293.26654052734375,
"loss": 0.672,
"rewards/accuracies": 0.5972222685813904,
"rewards/chosen": 0.013350310735404491,
"rewards/margins": 0.058473195880651474,
"rewards/rejected": -0.04512288421392441,
"step": 18
},
{
"epoch": 0.04140650673677292,
"grad_norm": 46.12940216064453,
"learning_rate": 6.535947712418302e-07,
"logits/chosen": 0.21857470273971558,
"logits/rejected": 0.1517491191625595,
"logps/chosen": -297.955078125,
"logps/rejected": -291.7802429199219,
"loss": 0.6583,
"rewards/accuracies": 0.5787036418914795,
"rewards/chosen": -0.0388781875371933,
"rewards/margins": 0.12246696650981903,
"rewards/rejected": -0.16134515404701233,
"step": 21
},
{
"epoch": 0.04732172198488334,
"grad_norm": 43.86560821533203,
"learning_rate": 7.516339869281046e-07,
"logits/chosen": 0.24428892135620117,
"logits/rejected": 0.2012861967086792,
"logps/chosen": -294.25177001953125,
"logps/rejected": -295.1514892578125,
"loss": 0.658,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": 0.02628326043486595,
"rewards/margins": 0.15634408593177795,
"rewards/rejected": -0.1300608068704605,
"step": 24
},
{
"epoch": 0.053236937232993754,
"grad_norm": 34.92121124267578,
"learning_rate": 8.496732026143792e-07,
"logits/chosen": 0.19387498497962952,
"logits/rejected": 0.2192877233028412,
"logps/chosen": -297.76287841796875,
"logps/rejected": -298.34576416015625,
"loss": 0.6738,
"rewards/accuracies": 0.6064814925193787,
"rewards/chosen": -0.02720721624791622,
"rewards/margins": 0.20852135121822357,
"rewards/rejected": -0.23572856187820435,
"step": 27
},
{
"epoch": 0.05915215248110418,
"grad_norm": 35.40020751953125,
"learning_rate": 9.477124183006536e-07,
"logits/chosen": 0.0057902163825929165,
"logits/rejected": 0.068918377161026,
"logps/chosen": -284.17608642578125,
"logps/rejected": -289.56268310546875,
"loss": 0.5956,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": 0.17752066254615784,
"rewards/margins": 0.39666658639907837,
"rewards/rejected": -0.21914593875408173,
"step": 30
},
{
"epoch": 0.06506736772921459,
"grad_norm": 65.21533966064453,
"learning_rate": 1.0457516339869283e-06,
"logits/chosen": 0.17345206439495087,
"logits/rejected": 0.2315821498632431,
"logps/chosen": -274.51983642578125,
"logps/rejected": -297.6330261230469,
"loss": 0.6454,
"rewards/accuracies": 0.5601851940155029,
"rewards/chosen": 0.33469104766845703,
"rewards/margins": 0.3119283616542816,
"rewards/rejected": 0.022762654349207878,
"step": 33
},
{
"epoch": 0.07098258297732501,
"grad_norm": 52.888023376464844,
"learning_rate": 1.1437908496732026e-06,
"logits/chosen": 0.10116317868232727,
"logits/rejected": 0.12458281219005585,
"logps/chosen": -304.16064453125,
"logps/rejected": -330.84197998046875,
"loss": 0.6079,
"rewards/accuracies": 0.6342592835426331,
"rewards/chosen": 0.6159655451774597,
"rewards/margins": 0.4744771122932434,
"rewards/rejected": 0.14148837327957153,
"step": 36
},
{
"epoch": 0.07689779822543542,
"grad_norm": 44.073524475097656,
"learning_rate": 1.2418300653594772e-06,
"logits/chosen": 0.11387699842453003,
"logits/rejected": 0.11010300368070602,
"logps/chosen": -282.6150207519531,
"logps/rejected": -302.6578369140625,
"loss": 0.5832,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.7451757192611694,
"rewards/margins": 0.5253991484642029,
"rewards/rejected": 0.21977655589580536,
"step": 39
},
{
"epoch": 0.08281301347354585,
"grad_norm": 45.03438186645508,
"learning_rate": 1.3398692810457518e-06,
"logits/chosen": 0.14107277989387512,
"logits/rejected": 0.08299855887889862,
"logps/chosen": -289.40655517578125,
"logps/rejected": -301.38641357421875,
"loss": 0.5929,
"rewards/accuracies": 0.6759259700775146,
"rewards/chosen": 0.8502761125564575,
"rewards/margins": 0.6023391485214233,
"rewards/rejected": 0.24793694913387299,
"step": 42
},
{
"epoch": 0.08872822872165625,
"grad_norm": 34.897117614746094,
"learning_rate": 1.4379084967320261e-06,
"logits/chosen": 0.11066042631864548,
"logits/rejected": 0.18711869418621063,
"logps/chosen": -277.82086181640625,
"logps/rejected": -311.96185302734375,
"loss": 0.574,
"rewards/accuracies": 0.6435185670852661,
"rewards/chosen": 0.7543072700500488,
"rewards/margins": 0.595453679561615,
"rewards/rejected": 0.15885356068611145,
"step": 45
},
{
"epoch": 0.09464344396976668,
"grad_norm": 52.84747314453125,
"learning_rate": 1.535947712418301e-06,
"logits/chosen": 0.07594814896583557,
"logits/rejected": 0.06705646216869354,
"logps/chosen": -288.2305603027344,
"logps/rejected": -294.79473876953125,
"loss": 0.6463,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.5022045373916626,
"rewards/margins": 0.49619922041893005,
"rewards/rejected": 0.006005376577377319,
"step": 48
},
{
"epoch": 0.1005586592178771,
"grad_norm": 42.00156784057617,
"learning_rate": 1.6339869281045753e-06,
"logits/chosen": 0.28929704427719116,
"logits/rejected": 0.28439193964004517,
"logps/chosen": -295.6617126464844,
"logps/rejected": -297.2070617675781,
"loss": 0.6407,
"rewards/accuracies": 0.5925926566123962,
"rewards/chosen": 0.33414918184280396,
"rewards/margins": 0.5068655610084534,
"rewards/rejected": -0.17271637916564941,
"step": 51
},
{
"epoch": 0.10647387446598751,
"grad_norm": 41.05317687988281,
"learning_rate": 1.7320261437908499e-06,
"logits/chosen": 0.2613026797771454,
"logits/rejected": 0.25626230239868164,
"logps/chosen": -270.5616149902344,
"logps/rejected": -288.9526672363281,
"loss": 0.666,
"rewards/accuracies": 0.6111111044883728,
"rewards/chosen": 0.35354501008987427,
"rewards/margins": 0.516350507736206,
"rewards/rejected": -0.16280552744865417,
"step": 54
},
{
"epoch": 0.11238908971409793,
"grad_norm": 40.66948318481445,
"learning_rate": 1.8300653594771242e-06,
"logits/chosen": 0.14654496312141418,
"logits/rejected": 0.18361543118953705,
"logps/chosen": -276.3260498046875,
"logps/rejected": -286.315185546875,
"loss": 0.6593,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.19915954768657684,
"rewards/margins": 0.5710794925689697,
"rewards/rejected": -0.3719198703765869,
"step": 57
},
{
"epoch": 0.11830430496220835,
"grad_norm": 39.70878601074219,
"learning_rate": 1.928104575163399e-06,
"logits/chosen": 0.26200026273727417,
"logits/rejected": 0.2716706097126007,
"logps/chosen": -296.445068359375,
"logps/rejected": -302.8128356933594,
"loss": 0.6215,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": 0.1488932967185974,
"rewards/margins": 0.6068000793457031,
"rewards/rejected": -0.4579067826271057,
"step": 60
},
{
"epoch": 0.12421952021031876,
"grad_norm": 45.6502571105957,
"learning_rate": 2.0261437908496734e-06,
"logits/chosen": 0.16293856501579285,
"logits/rejected": 0.16650280356407166,
"logps/chosen": -279.3722839355469,
"logps/rejected": -302.0152282714844,
"loss": 0.5946,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.07076837122440338,
"rewards/margins": 0.6879870891571045,
"rewards/rejected": -0.6172187328338623,
"step": 63
},
{
"epoch": 0.13013473545842918,
"grad_norm": 37.381980895996094,
"learning_rate": 2.1241830065359477e-06,
"logits/chosen": 0.1667034924030304,
"logits/rejected": 0.11124895513057709,
"logps/chosen": -286.760986328125,
"logps/rejected": -294.1531982421875,
"loss": 0.5771,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": 0.002402188954874873,
"rewards/margins": 0.7446907758712769,
"rewards/rejected": -0.7422885298728943,
"step": 66
},
{
"epoch": 0.1360499507065396,
"grad_norm": 40.16853713989258,
"learning_rate": 2.222222222222222e-06,
"logits/chosen": 0.1928640604019165,
"logits/rejected": 0.2365691065788269,
"logps/chosen": -289.02642822265625,
"logps/rejected": -299.3502197265625,
"loss": 0.599,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": 0.0025644637644290924,
"rewards/margins": 0.755738377571106,
"rewards/rejected": -0.7531739473342896,
"step": 69
},
{
"epoch": 0.14196516595465003,
"grad_norm": 61.28244400024414,
"learning_rate": 2.320261437908497e-06,
"logits/chosen": 0.13621635735034943,
"logits/rejected": 0.2548728287220001,
"logps/chosen": -289.6425476074219,
"logps/rejected": -316.8446350097656,
"loss": 0.5755,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.012163564562797546,
"rewards/margins": 0.7862691879272461,
"rewards/rejected": -0.7741057276725769,
"step": 72
},
{
"epoch": 0.14788038120276042,
"grad_norm": 72.57213592529297,
"learning_rate": 2.4183006535947716e-06,
"logits/chosen": 0.05144810676574707,
"logits/rejected": 0.13899767398834229,
"logps/chosen": -280.32623291015625,
"logps/rejected": -301.4366760253906,
"loss": 0.5793,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.18221376836299896,
"rewards/margins": 0.7852480411529541,
"rewards/rejected": -0.603034257888794,
"step": 75
},
{
"epoch": 0.15379559645087085,
"grad_norm": 43.978668212890625,
"learning_rate": 2.516339869281046e-06,
"logits/chosen": 0.2053721845149994,
"logits/rejected": 0.301210880279541,
"logps/chosen": -293.8603820800781,
"logps/rejected": -311.3579406738281,
"loss": 0.5898,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": 0.04886661097407341,
"rewards/margins": 0.7296194434165955,
"rewards/rejected": -0.6807528734207153,
"step": 78
},
{
"epoch": 0.15971081169898127,
"grad_norm": 36.1211051940918,
"learning_rate": 2.6143790849673208e-06,
"logits/chosen": 0.0973750576376915,
"logits/rejected": 0.1262703239917755,
"logps/chosen": -286.0694274902344,
"logps/rejected": -307.6643371582031,
"loss": 0.5745,
"rewards/accuracies": 0.680555522441864,
"rewards/chosen": 0.17426416277885437,
"rewards/margins": 0.898255467414856,
"rewards/rejected": -0.7239912748336792,
"step": 81
},
{
"epoch": 0.1656260269470917,
"grad_norm": 34.10630416870117,
"learning_rate": 2.7124183006535947e-06,
"logits/chosen": 0.135942280292511,
"logits/rejected": 0.17899185419082642,
"logps/chosen": -282.0833740234375,
"logps/rejected": -294.80340576171875,
"loss": 0.5664,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.24072617292404175,
"rewards/margins": 0.7590986490249634,
"rewards/rejected": -0.5183724761009216,
"step": 84
},
{
"epoch": 0.17154124219520211,
"grad_norm": 101.0091323852539,
"learning_rate": 2.8104575163398695e-06,
"logits/chosen": 0.049379341304302216,
"logits/rejected": 0.22774741053581238,
"logps/chosen": -276.23651123046875,
"logps/rejected": -328.92730712890625,
"loss": 0.6363,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.24817490577697754,
"rewards/margins": 0.6880936026573181,
"rewards/rejected": -0.4399186372756958,
"step": 87
},
{
"epoch": 0.1774564574433125,
"grad_norm": 41.10043716430664,
"learning_rate": 2.9084967320261443e-06,
"logits/chosen": 0.04306400939822197,
"logits/rejected": 0.12012603133916855,
"logps/chosen": -273.14862060546875,
"logps/rejected": -297.67779541015625,
"loss": 0.6138,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": 0.2495533525943756,
"rewards/margins": 0.97026127576828,
"rewards/rejected": -0.720707893371582,
"step": 90
},
{
"epoch": 0.18337167269142293,
"grad_norm": 36.96125030517578,
"learning_rate": 3.0065359477124182e-06,
"logits/chosen": -0.0033540725708007812,
"logits/rejected": 0.15706853568553925,
"logps/chosen": -280.9652099609375,
"logps/rejected": -311.3870544433594,
"loss": 0.5552,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": 0.41701826453208923,
"rewards/margins": 1.1586750745773315,
"rewards/rejected": -0.7416568994522095,
"step": 93
},
{
"epoch": 0.18928688793953335,
"grad_norm": 34.02159118652344,
"learning_rate": 3.104575163398693e-06,
"logits/chosen": 0.055927395820617676,
"logits/rejected": 0.07919944822788239,
"logps/chosen": -277.48199462890625,
"logps/rejected": -295.92218017578125,
"loss": 0.5402,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": 0.7134115099906921,
"rewards/margins": 1.0553935766220093,
"rewards/rejected": -0.34198200702667236,
"step": 96
},
{
"epoch": 0.19520210318764378,
"grad_norm": 30.828338623046875,
"learning_rate": 3.2026143790849674e-06,
"logits/chosen": 0.02216392755508423,
"logits/rejected": 0.17252111434936523,
"logps/chosen": -274.6871337890625,
"logps/rejected": -308.18707275390625,
"loss": 0.5325,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": 0.7039467692375183,
"rewards/margins": 1.234229326248169,
"rewards/rejected": -0.5302824974060059,
"step": 99
},
{
"epoch": 0.2011173184357542,
"grad_norm": 55.14480972290039,
"learning_rate": 3.300653594771242e-06,
"logits/chosen": 0.0836678296327591,
"logits/rejected": 0.1754518747329712,
"logps/chosen": -285.959228515625,
"logps/rejected": -307.93658447265625,
"loss": 0.6502,
"rewards/accuracies": 0.6759259700775146,
"rewards/chosen": 0.4246281087398529,
"rewards/margins": 1.0537660121917725,
"rewards/rejected": -0.6291378736495972,
"step": 102
},
{
"epoch": 0.2070325336838646,
"grad_norm": 62.848453521728516,
"learning_rate": 3.398692810457517e-06,
"logits/chosen": 0.027146054431796074,
"logits/rejected": 0.0840519368648529,
"logps/chosen": -294.8350524902344,
"logps/rejected": -316.4007873535156,
"loss": 0.6039,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": 0.3796502947807312,
"rewards/margins": 1.3719719648361206,
"rewards/rejected": -0.9923217296600342,
"step": 105
},
{
"epoch": 0.21294774893197502,
"grad_norm": 34.59516143798828,
"learning_rate": 3.496732026143791e-06,
"logits/chosen": 0.18716061115264893,
"logits/rejected": 0.2005024403333664,
"logps/chosen": -294.5157470703125,
"logps/rejected": -318.09368896484375,
"loss": 0.646,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": 0.1613771915435791,
"rewards/margins": 1.098291277885437,
"rewards/rejected": -0.9369141459465027,
"step": 108
},
{
"epoch": 0.21886296418008544,
"grad_norm": 59.11061096191406,
"learning_rate": 3.5947712418300657e-06,
"logits/chosen": 0.02729523926973343,
"logits/rejected": 0.18032339215278625,
"logps/chosen": -279.9372863769531,
"logps/rejected": -314.1539306640625,
"loss": 0.6169,
"rewards/accuracies": 0.6944445371627808,
"rewards/chosen": 0.2143605351448059,
"rewards/margins": 1.1479460000991821,
"rewards/rejected": -0.933585524559021,
"step": 111
},
{
"epoch": 0.22477817942819586,
"grad_norm": 54.83635711669922,
"learning_rate": 3.6928104575163404e-06,
"logits/chosen": -0.07722613215446472,
"logits/rejected": 0.056545909494161606,
"logps/chosen": -265.40985107421875,
"logps/rejected": -308.90350341796875,
"loss": 0.5748,
"rewards/accuracies": 0.6944445371627808,
"rewards/chosen": 0.5514373183250427,
"rewards/margins": 1.499307632446289,
"rewards/rejected": -0.9478704333305359,
"step": 114
},
{
"epoch": 0.23069339467630628,
"grad_norm": 82.6873779296875,
"learning_rate": 3.7908496732026144e-06,
"logits/chosen": -0.05033176392316818,
"logits/rejected": 0.08841504901647568,
"logps/chosen": -284.0753479003906,
"logps/rejected": -315.85931396484375,
"loss": 0.6855,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": 0.38630211353302,
"rewards/margins": 1.3165416717529297,
"rewards/rejected": -0.9302395582199097,
"step": 117
},
{
"epoch": 0.2366086099244167,
"grad_norm": 34.905941009521484,
"learning_rate": 3.88888888888889e-06,
"logits/chosen": -0.014662293717265129,
"logits/rejected": 0.03569987416267395,
"logps/chosen": -269.83001708984375,
"logps/rejected": -294.5672912597656,
"loss": 0.5944,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": 0.29097482562065125,
"rewards/margins": 1.4026880264282227,
"rewards/rejected": -1.111713171005249,
"step": 120
},
{
"epoch": 0.2425238251725271,
"grad_norm": 34.07733917236328,
"learning_rate": 3.986928104575164e-06,
"logits/chosen": -0.08946999907493591,
"logits/rejected": 0.01624571532011032,
"logps/chosen": -283.3293151855469,
"logps/rejected": -312.3072509765625,
"loss": 0.554,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": 0.07897276431322098,
"rewards/margins": 1.4951574802398682,
"rewards/rejected": -1.416184663772583,
"step": 123
},
{
"epoch": 0.24843904042063752,
"grad_norm": 47.81401824951172,
"learning_rate": 4.084967320261438e-06,
"logits/chosen": -0.08563312888145447,
"logits/rejected": -0.123930923640728,
"logps/chosen": -308.0613098144531,
"logps/rejected": -300.32025146484375,
"loss": 0.6619,
"rewards/accuracies": 0.6342592835426331,
"rewards/chosen": -0.285559743642807,
"rewards/margins": 1.018710970878601,
"rewards/rejected": -1.304270625114441,
"step": 126
},
{
"epoch": 0.2543542556687479,
"grad_norm": 40.03066635131836,
"learning_rate": 4.183006535947713e-06,
"logits/chosen": 0.0017192339291796088,
"logits/rejected": 0.14679786562919617,
"logps/chosen": -316.32379150390625,
"logps/rejected": -339.078125,
"loss": 0.6979,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": -0.6922714710235596,
"rewards/margins": 0.9672110080718994,
"rewards/rejected": -1.659482717514038,
"step": 129
},
{
"epoch": 0.26026947091685837,
"grad_norm": 53.74517822265625,
"learning_rate": 4.281045751633987e-06,
"logits/chosen": -0.05079513043165207,
"logits/rejected": 0.07216000556945801,
"logps/chosen": -295.9747314453125,
"logps/rejected": -332.39013671875,
"loss": 0.5792,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -0.8141295909881592,
"rewards/margins": 1.5237984657287598,
"rewards/rejected": -2.337928295135498,
"step": 132
},
{
"epoch": 0.26618468616496876,
"grad_norm": 35.11404800415039,
"learning_rate": 4.379084967320262e-06,
"logits/chosen": -0.046733301132917404,
"logits/rejected": 0.0873221755027771,
"logps/chosen": -289.7781982421875,
"logps/rejected": -330.5262756347656,
"loss": 0.5458,
"rewards/accuracies": 0.7129629850387573,
"rewards/chosen": -0.5395014882087708,
"rewards/margins": 1.484311819076538,
"rewards/rejected": -2.023813247680664,
"step": 135
},
{
"epoch": 0.2720999014130792,
"grad_norm": 46.060089111328125,
"learning_rate": 4.477124183006537e-06,
"logits/chosen": -0.10985089838504791,
"logits/rejected": 0.007513361983001232,
"logps/chosen": -291.6156311035156,
"logps/rejected": -330.0778503417969,
"loss": 0.6452,
"rewards/accuracies": 0.6712963581085205,
"rewards/chosen": -0.8152337074279785,
"rewards/margins": 1.0877575874328613,
"rewards/rejected": -1.9029912948608398,
"step": 138
},
{
"epoch": 0.2780151166611896,
"grad_norm": 36.288475036621094,
"learning_rate": 4.5751633986928105e-06,
"logits/chosen": -0.12672923505306244,
"logits/rejected": -0.05195396766066551,
"logps/chosen": -291.0050048828125,
"logps/rejected": -328.9648742675781,
"loss": 0.622,
"rewards/accuracies": 0.6990740299224854,
"rewards/chosen": -0.6529079675674438,
"rewards/margins": 1.3228920698165894,
"rewards/rejected": -1.9758000373840332,
"step": 141
},
{
"epoch": 0.28393033190930006,
"grad_norm": 61.34006118774414,
"learning_rate": 4.673202614379085e-06,
"logits/chosen": -0.09269940853118896,
"logits/rejected": -0.09529760479927063,
"logps/chosen": -308.4407958984375,
"logps/rejected": -327.4217224121094,
"loss": 0.7128,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -0.8733962774276733,
"rewards/margins": 1.3924760818481445,
"rewards/rejected": -2.2658724784851074,
"step": 144
},
{
"epoch": 0.28984554715741045,
"grad_norm": 60.66278839111328,
"learning_rate": 4.77124183006536e-06,
"logits/chosen": -0.21165470778942108,
"logits/rejected": -0.08071193099021912,
"logps/chosen": -292.4040832519531,
"logps/rejected": -327.4679260253906,
"loss": 0.6824,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -1.0551072359085083,
"rewards/margins": 1.499154806137085,
"rewards/rejected": -2.5542619228363037,
"step": 147
},
{
"epoch": 0.29576076240552085,
"grad_norm": 37.254608154296875,
"learning_rate": 4.869281045751634e-06,
"logits/chosen": -0.09953123331069946,
"logits/rejected": -0.10298528522253036,
"logps/chosen": -307.6355285644531,
"logps/rejected": -334.2861328125,
"loss": 0.6938,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1909699440002441,
"rewards/margins": 1.456664800643921,
"rewards/rejected": -2.647634506225586,
"step": 150
},
{
"epoch": 0.3016759776536313,
"grad_norm": 29.970460891723633,
"learning_rate": 4.967320261437909e-06,
"logits/chosen": -0.16488048434257507,
"logits/rejected": -0.026979412883520126,
"logps/chosen": -292.1322021484375,
"logps/rejected": -334.13330078125,
"loss": 0.7198,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -1.4055306911468506,
"rewards/margins": 1.0318067073822021,
"rewards/rejected": -2.4373371601104736,
"step": 153
},
{
"epoch": 0.3075911929017417,
"grad_norm": 102.86775970458984,
"learning_rate": 4.999973746084687e-06,
"logits/chosen": -0.2688429355621338,
"logits/rejected": -0.08179165422916412,
"logps/chosen": -292.56201171875,
"logps/rejected": -334.35443115234375,
"loss": 0.6896,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": -1.556567907333374,
"rewards/margins": 1.6510968208312988,
"rewards/rejected": -3.2076644897460938,
"step": 156
},
{
"epoch": 0.31350640814985214,
"grad_norm": 35.398048400878906,
"learning_rate": 4.999835914537063e-06,
"logits/chosen": -0.1668413281440735,
"logits/rejected": -0.13912776112556458,
"logps/chosen": -304.44842529296875,
"logps/rejected": -323.0757141113281,
"loss": 0.5734,
"rewards/accuracies": 0.7546296119689941,
"rewards/chosen": -1.2236417531967163,
"rewards/margins": 1.9698877334594727,
"rewards/rejected": -3.1935291290283203,
"step": 159
},
{
"epoch": 0.31942162339796254,
"grad_norm": 54.54057312011719,
"learning_rate": 4.999579948383184e-06,
"logits/chosen": -0.15461772680282593,
"logits/rejected": -0.03271166980266571,
"logps/chosen": -295.79632568359375,
"logps/rejected": -327.92584228515625,
"loss": 0.7708,
"rewards/accuracies": 0.6620371341705322,
"rewards/chosen": -1.315285086631775,
"rewards/margins": 1.5617358684539795,
"rewards/rejected": -2.877020835876465,
"step": 162
},
{
"epoch": 0.32533683864607293,
"grad_norm": 34.624603271484375,
"learning_rate": 4.9992058597192255e-06,
"logits/chosen": -0.10605038702487946,
"logits/rejected": 0.021997269243001938,
"logps/chosen": -306.4567565917969,
"logps/rejected": -336.64208984375,
"loss": 0.6976,
"rewards/accuracies": 0.6620370745658875,
"rewards/chosen": -1.345373511314392,
"rewards/margins": 1.6246647834777832,
"rewards/rejected": -2.9700381755828857,
"step": 165
},
{
"epoch": 0.3312520538941834,
"grad_norm": 67.67698669433594,
"learning_rate": 4.9987136662234764e-06,
"logits/chosen": -0.0949287861585617,
"logits/rejected": -0.04236632585525513,
"logps/chosen": -310.03973388671875,
"logps/rejected": -338.93768310546875,
"loss": 0.7999,
"rewards/accuracies": 0.6851851940155029,
"rewards/chosen": -2.0794289112091064,
"rewards/margins": 1.357082724571228,
"rewards/rejected": -3.436511516571045,
"step": 168
},
{
"epoch": 0.3371672691422938,
"grad_norm": 32.37373733520508,
"learning_rate": 4.998103391155496e-06,
"logits/chosen": -0.0922163650393486,
"logits/rejected": 0.0038010727148503065,
"logps/chosen": -299.1915283203125,
"logps/rejected": -340.01202392578125,
"loss": 0.7605,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": -2.1031405925750732,
"rewards/margins": 1.5288575887680054,
"rewards/rejected": -3.631998062133789,
"step": 171
},
{
"epoch": 0.34308248439040423,
"grad_norm": 41.414024353027344,
"learning_rate": 4.997375063355021e-06,
"logits/chosen": -0.14530247449874878,
"logits/rejected": -0.017910713329911232,
"logps/chosen": -294.3675537109375,
"logps/rejected": -331.98138427734375,
"loss": 0.6571,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": -2.397458791732788,
"rewards/margins": 1.8552653789520264,
"rewards/rejected": -4.252723693847656,
"step": 174
},
{
"epoch": 0.3489976996385146,
"grad_norm": 48.10707473754883,
"learning_rate": 4.996528717240595e-06,
"logits/chosen": -0.05039427801966667,
"logits/rejected": -0.03388180956244469,
"logps/chosen": -334.423583984375,
"logps/rejected": -366.33087158203125,
"loss": 0.6121,
"rewards/accuracies": 0.7129630446434021,
"rewards/chosen": -2.6137490272521973,
"rewards/margins": 2.267016887664795,
"rewards/rejected": -4.880765438079834,
"step": 177
},
{
"epoch": 0.354912914886625,
"grad_norm": 48.403987884521484,
"learning_rate": 4.995564392807951e-06,
"logits/chosen": -0.12557795643806458,
"logits/rejected": -0.08240347355604172,
"logps/chosen": -307.76190185546875,
"logps/rejected": -336.8500671386719,
"loss": 0.8248,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": -2.706207036972046,
"rewards/margins": 1.6851190328598022,
"rewards/rejected": -4.391325950622559,
"step": 180
},
{
"epoch": 0.36082813013473547,
"grad_norm": 33.078346252441406,
"learning_rate": 4.994482135628115e-06,
"logits/chosen": -0.0540686696767807,
"logits/rejected": -0.02901211380958557,
"logps/chosen": -308.53997802734375,
"logps/rejected": -325.751220703125,
"loss": 0.6532,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -2.590834617614746,
"rewards/margins": 2.104125738143921,
"rewards/rejected": -4.694960594177246,
"step": 183
},
{
"epoch": 0.36674334538284586,
"grad_norm": 48.71003723144531,
"learning_rate": 4.993281996845253e-06,
"logits/chosen": -0.12156584858894348,
"logits/rejected": -0.009240781888365746,
"logps/chosen": -309.54833984375,
"logps/rejected": -357.3154296875,
"loss": 0.7973,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -3.1666088104248047,
"rewards/margins": 1.5774611234664917,
"rewards/rejected": -4.744070053100586,
"step": 186
},
{
"epoch": 0.3726585606309563,
"grad_norm": 34.78226089477539,
"learning_rate": 4.991964033174257e-06,
"logits/chosen": 0.03807515650987625,
"logits/rejected": 0.03681405261158943,
"logps/chosen": -311.2495422363281,
"logps/rejected": -324.69183349609375,
"loss": 0.7562,
"rewards/accuracies": 0.6388888359069824,
"rewards/chosen": -2.8808019161224365,
"rewards/margins": 1.3406963348388672,
"rewards/rejected": -4.221498489379883,
"step": 189
},
{
"epoch": 0.3785737758790667,
"grad_norm": 40.5612678527832,
"learning_rate": 4.990528306898062e-06,
"logits/chosen": -0.02013694867491722,
"logits/rejected": 0.06969591975212097,
"logps/chosen": -311.6338806152344,
"logps/rejected": -357.86932373046875,
"loss": 0.6847,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": -2.982748031616211,
"rewards/margins": 1.80600905418396,
"rewards/rejected": -4.78875732421875,
"step": 192
},
{
"epoch": 0.3844889911271771,
"grad_norm": 40.58036804199219,
"learning_rate": 4.988974885864706e-06,
"logits/chosen": -0.07019342482089996,
"logits/rejected": 0.034122247248888016,
"logps/chosen": -311.8460388183594,
"logps/rejected": -335.7353515625,
"loss": 0.6818,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -2.6682796478271484,
"rewards/margins": 2.0244059562683105,
"rewards/rejected": -4.692685127258301,
"step": 195
},
{
"epoch": 0.39040420637528755,
"grad_norm": 40.306243896484375,
"learning_rate": 4.987303843484119e-06,
"logits/chosen": -0.04457690566778183,
"logits/rejected": -0.04778536409139633,
"logps/chosen": -323.8233337402344,
"logps/rejected": -337.99658203125,
"loss": 0.8443,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": -2.713296413421631,
"rewards/margins": 2.1648170948028564,
"rewards/rejected": -4.878113269805908,
"step": 198
},
{
"epoch": 0.39631942162339795,
"grad_norm": 40.987098693847656,
"learning_rate": 4.985515258724657e-06,
"logits/chosen": -0.1199549064040184,
"logits/rejected": 0.04760899394750595,
"logps/chosen": -315.40380859375,
"logps/rejected": -365.968017578125,
"loss": 0.8342,
"rewards/accuracies": 0.6296296119689941,
"rewards/chosen": -2.873556137084961,
"rewards/margins": 1.9774025678634644,
"rewards/rejected": -4.850958824157715,
"step": 201
},
{
"epoch": 0.4022346368715084,
"grad_norm": 63.48135757446289,
"learning_rate": 4.983609216109371e-06,
"logits/chosen": -0.11053924262523651,
"logits/rejected": 0.05218297988176346,
"logps/chosen": -315.01361083984375,
"logps/rejected": -348.6982421875,
"loss": 0.8018,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": -3.1666953563690186,
"rewards/margins": 2.3253214359283447,
"rewards/rejected": -5.492016792297363,
"step": 204
},
{
"epoch": 0.4081498521196188,
"grad_norm": 38.437744140625,
"learning_rate": 4.981585805712011e-06,
"logits/chosen": 0.056118495762348175,
"logits/rejected": 0.09087176620960236,
"logps/chosen": -318.7930603027344,
"logps/rejected": -362.77825927734375,
"loss": 0.8394,
"rewards/accuracies": 0.638888955116272,
"rewards/chosen": -3.5864510536193848,
"rewards/margins": 2.024658203125,
"rewards/rejected": -5.611109733581543,
"step": 207
},
{
"epoch": 0.4140650673677292,
"grad_norm": 38.63540267944336,
"learning_rate": 4.979445123152767e-06,
"logits/chosen": 0.05023570358753204,
"logits/rejected": 0.07232505083084106,
"logps/chosen": -325.7311096191406,
"logps/rejected": -351.2863464355469,
"loss": 0.8521,
"rewards/accuracies": 0.6574074625968933,
"rewards/chosen": -3.6044719219207764,
"rewards/margins": 1.671539306640625,
"rewards/rejected": -5.2760114669799805,
"step": 210
},
{
"epoch": 0.41998028261583964,
"grad_norm": 41.875938415527344,
"learning_rate": 4.977187269593758e-06,
"logits/chosen": -0.009280918166041374,
"logits/rejected": 0.08061732351779938,
"logps/chosen": -309.8787841796875,
"logps/rejected": -338.63531494140625,
"loss": 0.9004,
"rewards/accuracies": 0.6481481790542603,
"rewards/chosen": -3.5854623317718506,
"rewards/margins": 1.1396470069885254,
"rewards/rejected": -4.725109100341797,
"step": 213
},
{
"epoch": 0.42589549786395003,
"grad_norm": 103.75460815429688,
"learning_rate": 4.974812351734241e-06,
"logits/chosen": -0.02067667804658413,
"logits/rejected": 0.05178874731063843,
"logps/chosen": -312.7908630371094,
"logps/rejected": -342.73443603515625,
"loss": 0.802,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -3.1982085704803467,
"rewards/margins": 1.6781682968139648,
"rewards/rejected": -4.876377105712891,
"step": 216
},
{
"epoch": 0.4318107131120605,
"grad_norm": 45.85707092285156,
"learning_rate": 4.972320481805578e-06,
"logits/chosen": -0.1770055741071701,
"logits/rejected": -0.07678120583295822,
"logps/chosen": -323.437744140625,
"logps/rejected": -376.6873474121094,
"loss": 0.7821,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -3.227814197540283,
"rewards/margins": 1.796374797821045,
"rewards/rejected": -5.024188995361328,
"step": 219
},
{
"epoch": 0.4377259283601709,
"grad_norm": 61.05466079711914,
"learning_rate": 4.969711777565928e-06,
"logits/chosen": -0.05364451929926872,
"logits/rejected": -0.02637811005115509,
"logps/chosen": -334.20367431640625,
"logps/rejected": -368.54693603515625,
"loss": 0.8648,
"rewards/accuracies": 0.6759259104728699,
"rewards/chosen": -3.613854169845581,
"rewards/margins": 1.6240665912628174,
"rewards/rejected": -5.237921237945557,
"step": 222
},
{
"epoch": 0.4436411436082813,
"grad_norm": 34.13490295410156,
"learning_rate": 4.96698636229468e-06,
"logits/chosen": -0.005055941641330719,
"logits/rejected": 0.06123928725719452,
"logps/chosen": -324.4971923828125,
"logps/rejected": -349.1205749511719,
"loss": 0.7293,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -3.4245200157165527,
"rewards/margins": 1.7457599639892578,
"rewards/rejected": -5.1702799797058105,
"step": 225
},
{
"epoch": 0.4495563588563917,
"grad_norm": 83.93833923339844,
"learning_rate": 4.964144364786632e-06,
"logits/chosen": 0.13881027698516846,
"logits/rejected": 0.12519869208335876,
"logps/chosen": -348.6081237792969,
"logps/rejected": -361.4942321777344,
"loss": 1.0233,
"rewards/accuracies": 0.6851851940155029,
"rewards/chosen": -4.027976036071777,
"rewards/margins": 1.528003454208374,
"rewards/rejected": -5.555978775024414,
"step": 228
},
{
"epoch": 0.4554715741045021,
"grad_norm": 97.00470733642578,
"learning_rate": 4.9611859193459015e-06,
"logits/chosen": 0.07493604719638824,
"logits/rejected": 0.14453333616256714,
"logps/chosen": -326.7818298339844,
"logps/rejected": -349.25189208984375,
"loss": 0.92,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -4.245251655578613,
"rewards/margins": 1.4085248708724976,
"rewards/rejected": -5.653777122497559,
"step": 231
},
{
"epoch": 0.46138678935261257,
"grad_norm": 42.000885009765625,
"learning_rate": 4.958111165779579e-06,
"logits/chosen": 0.0968238115310669,
"logits/rejected": 0.17590413987636566,
"logps/chosen": -324.4273376464844,
"logps/rejected": -368.86004638671875,
"loss": 0.8419,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -4.32670259475708,
"rewards/margins": 1.9024198055267334,
"rewards/rejected": -6.229121685028076,
"step": 234
},
{
"epoch": 0.46730200460072296,
"grad_norm": 34.4752197265625,
"learning_rate": 4.954920249391123e-06,
"logits/chosen": 0.035372521728277206,
"logits/rejected": 0.07403655350208282,
"logps/chosen": -336.0986633300781,
"logps/rejected": -353.7882995605469,
"loss": 0.6037,
"rewards/accuracies": 0.7546296119689941,
"rewards/chosen": -3.8972387313842773,
"rewards/margins": 1.817379355430603,
"rewards/rejected": -5.714618682861328,
"step": 237
},
{
"epoch": 0.4732172198488334,
"grad_norm": 28.94403648376465,
"learning_rate": 4.951613320973491e-06,
"logits/chosen": 0.0323605015873909,
"logits/rejected": 0.0774674192070961,
"logps/chosen": -324.99774169921875,
"logps/rejected": -344.53985595703125,
"loss": 0.7093,
"rewards/accuracies": 0.7268518805503845,
"rewards/chosen": -4.354706764221191,
"rewards/margins": 1.9214775562286377,
"rewards/rejected": -6.276185035705566,
"step": 240
},
{
"epoch": 0.4791324350969438,
"grad_norm": 30.506528854370117,
"learning_rate": 4.948190536802015e-06,
"logits/chosen": -0.06049029156565666,
"logits/rejected": 0.006028448697179556,
"logps/chosen": -324.240234375,
"logps/rejected": -350.5492858886719,
"loss": 0.7427,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": -4.099154472351074,
"rewards/margins": 1.713746428489685,
"rewards/rejected": -5.812901496887207,
"step": 243
},
{
"epoch": 0.4850476503450542,
"grad_norm": 73.70082092285156,
"learning_rate": 4.944652058627013e-06,
"logits/chosen": -0.08537846058607101,
"logits/rejected": -0.032611675560474396,
"logps/chosen": -336.5609130859375,
"logps/rejected": -372.4656677246094,
"loss": 0.6616,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -4.300870895385742,
"rewards/margins": 1.9437074661254883,
"rewards/rejected": -6.2445783615112305,
"step": 246
},
{
"epoch": 0.49096286559316465,
"grad_norm": 34.85869598388672,
"learning_rate": 4.9409980536661535e-06,
"logits/chosen": -0.06217961013317108,
"logits/rejected": -0.004529049154371023,
"logps/chosen": -340.4526062011719,
"logps/rejected": -369.7755126953125,
"loss": 0.8365,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": -4.246068000793457,
"rewards/margins": 1.704252004623413,
"rewards/rejected": -5.950319290161133,
"step": 249
},
{
"epoch": 0.49687808084127505,
"grad_norm": 39.66143035888672,
"learning_rate": 4.937228694596545e-06,
"logits/chosen": -0.12100633233785629,
"logits/rejected": -0.006573869846761227,
"logps/chosen": -320.8197021484375,
"logps/rejected": -355.7471923828125,
"loss": 0.6241,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -3.746490478515625,
"rewards/margins": 2.363032579421997,
"rewards/rejected": -6.109523296356201,
"step": 252
},
{
"epoch": 0.5027932960893855,
"grad_norm": 33.68574142456055,
"learning_rate": 4.933344159546577e-06,
"logits/chosen": -0.18371449410915375,
"logits/rejected": -0.022255782037973404,
"logps/chosen": -333.9938049316406,
"logps/rejected": -384.731689453125,
"loss": 0.8266,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -4.545222759246826,
"rewards/margins": 2.332798719406128,
"rewards/rejected": -6.878022193908691,
"step": 255
},
{
"epoch": 0.5087085113374958,
"grad_norm": 45.985260009765625,
"learning_rate": 4.929344632087506e-06,
"logits/chosen": -0.14562034606933594,
"logits/rejected": -0.04126621410250664,
"logps/chosen": -327.22882080078125,
"logps/rejected": -382.4744873046875,
"loss": 0.6009,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": -4.554243564605713,
"rewards/margins": 3.3480947017669678,
"rewards/rejected": -7.902338027954102,
"step": 258
},
{
"epoch": 0.5146237265856063,
"grad_norm": 59.11403274536133,
"learning_rate": 4.9252303012247775e-06,
"logits/chosen": -0.11811560392379761,
"logits/rejected": -0.01293177530169487,
"logps/chosen": -351.2400817871094,
"logps/rejected": -404.04693603515625,
"loss": 0.7836,
"rewards/accuracies": 0.7592592239379883,
"rewards/chosen": -5.45390510559082,
"rewards/margins": 3.5169780254364014,
"rewards/rejected": -8.970884323120117,
"step": 261
},
{
"epoch": 0.5205389418337167,
"grad_norm": 38.673831939697266,
"learning_rate": 4.921001361389096e-06,
"logits/chosen": -0.05326487869024277,
"logits/rejected": 0.009572159498929977,
"logps/chosen": -333.938232421875,
"logps/rejected": -371.80816650390625,
"loss": 0.7732,
"rewards/accuracies": 0.7175925970077515,
"rewards/chosen": -5.093016624450684,
"rewards/margins": 2.952821731567383,
"rewards/rejected": -8.045838356018066,
"step": 264
},
{
"epoch": 0.5264541570818272,
"grad_norm": 33.54100036621094,
"learning_rate": 4.916658012427235e-06,
"logits/chosen": -0.037946220487356186,
"logits/rejected": 0.06312233209609985,
"logps/chosen": -342.2742614746094,
"logps/rejected": -391.18646240234375,
"loss": 0.7494,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -5.872312545776367,
"rewards/margins": 2.313776969909668,
"rewards/rejected": -8.186089515686035,
"step": 267
},
{
"epoch": 0.5323693723299375,
"grad_norm": 37.88241195678711,
"learning_rate": 4.912200459592595e-06,
"logits/chosen": -0.06072680652141571,
"logits/rejected": 0.09704061597585678,
"logps/chosen": -343.1576843261719,
"logps/rejected": -407.43798828125,
"loss": 0.8544,
"rewards/accuracies": 0.6620370149612427,
"rewards/chosen": -6.514297008514404,
"rewards/margins": 2.3704113960266113,
"rewards/rejected": -8.884708404541016,
"step": 270
},
{
"epoch": 0.538284587578048,
"grad_norm": 71.56549072265625,
"learning_rate": 4.9076289135355e-06,
"logits/chosen": 0.02188403531908989,
"logits/rejected": 0.1214473694562912,
"logps/chosen": -355.06793212890625,
"logps/rejected": -388.5225830078125,
"loss": 0.873,
"rewards/accuracies": 0.6342592835426331,
"rewards/chosen": -6.959141731262207,
"rewards/margins": 1.8934102058410645,
"rewards/rejected": -8.852551460266113,
"step": 273
},
{
"epoch": 0.5441998028261584,
"grad_norm": 55.54237365722656,
"learning_rate": 4.902943590293245e-06,
"logits/chosen": 0.0230946596711874,
"logits/rejected": 0.08508029580116272,
"logps/chosen": -349.1761169433594,
"logps/rejected": -398.710693359375,
"loss": 0.8547,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -6.52820348739624,
"rewards/margins": 2.2471137046813965,
"rewards/rejected": -8.775317192077637,
"step": 276
},
{
"epoch": 0.5501150180742688,
"grad_norm": 48.57643127441406,
"learning_rate": 4.898144711279894e-06,
"logits/chosen": -0.07424692809581757,
"logits/rejected": 0.09649358689785004,
"logps/chosen": -339.84686279296875,
"logps/rejected": -393.98016357421875,
"loss": 0.8443,
"rewards/accuracies": 0.5972222685813904,
"rewards/chosen": -6.690239906311035,
"rewards/margins": 2.0843098163604736,
"rewards/rejected": -8.77454948425293,
"step": 279
},
{
"epoch": 0.5560302333223792,
"grad_norm": 28.025590896606445,
"learning_rate": 4.8932325032758006e-06,
"logits/chosen": -0.13962030410766602,
"logits/rejected": 0.05175274237990379,
"logps/chosen": -335.58966064453125,
"logps/rejected": -378.72833251953125,
"loss": 0.6595,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -5.493226528167725,
"rewards/margins": 2.881283760070801,
"rewards/rejected": -8.374510765075684,
"step": 282
},
{
"epoch": 0.5619454485704897,
"grad_norm": 42.00820541381836,
"learning_rate": 4.8882071984169055e-06,
"logits/chosen": 0.007744944654405117,
"logits/rejected": 0.10258468985557556,
"logps/chosen": -363.2281494140625,
"logps/rejected": -410.42266845703125,
"loss": 0.7705,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -6.42072868347168,
"rewards/margins": 2.2050187587738037,
"rewards/rejected": -8.625746726989746,
"step": 285
},
{
"epoch": 0.5678606638186001,
"grad_norm": 54.86595916748047,
"learning_rate": 4.8830690341837596e-06,
"logits/chosen": -0.032913923263549805,
"logits/rejected": 0.1072501391172409,
"logps/chosen": -358.8460693359375,
"logps/rejected": -409.0872802734375,
"loss": 0.9018,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": -6.712845802307129,
"rewards/margins": 2.6474528312683105,
"rewards/rejected": -9.360298156738281,
"step": 288
},
{
"epoch": 0.5737758790667105,
"grad_norm": 63.118587493896484,
"learning_rate": 4.877818253390303e-06,
"logits/chosen": -0.04406512528657913,
"logits/rejected": 0.006794461514800787,
"logps/chosen": -367.21063232421875,
"logps/rejected": -415.6984558105469,
"loss": 0.866,
"rewards/accuracies": 0.6944445371627808,
"rewards/chosen": -7.632047176361084,
"rewards/margins": 2.7398147583007812,
"rewards/rejected": -10.371862411499023,
"step": 291
},
{
"epoch": 0.5796910943148209,
"grad_norm": 35.43265914916992,
"learning_rate": 4.872455104172392e-06,
"logits/chosen": -0.018917741253972054,
"logits/rejected": 0.09038470685482025,
"logps/chosen": -353.57666015625,
"logps/rejected": -384.8489990234375,
"loss": 0.8107,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -7.3933916091918945,
"rewards/margins": 2.474519729614258,
"rewards/rejected": -9.867910385131836,
"step": 294
},
{
"epoch": 0.5856063095629314,
"grad_norm": 44.21418762207031,
"learning_rate": 4.866979839976068e-06,
"logits/chosen": -0.0317760705947876,
"logits/rejected": 0.05773278325796127,
"logps/chosen": -358.10247802734375,
"logps/rejected": -407.8479919433594,
"loss": 0.8849,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -8.095900535583496,
"rewards/margins": 2.4276022911071777,
"rewards/rejected": -10.523502349853516,
"step": 297
},
{
"epoch": 0.5915215248110417,
"grad_norm": 63.40266799926758,
"learning_rate": 4.861392719545586e-06,
"logits/chosen": -0.07806281745433807,
"logits/rejected": -0.02447107993066311,
"logps/chosen": -356.18218994140625,
"logps/rejected": -387.41180419921875,
"loss": 0.9313,
"rewards/accuracies": 0.6388888955116272,
"rewards/chosen": -7.356019020080566,
"rewards/margins": 1.7391393184661865,
"rewards/rejected": -9.095157623291016,
"step": 300
},
{
"epoch": 0.5974367400591521,
"grad_norm": 66.6312484741211,
"learning_rate": 4.855694006911184e-06,
"logits/chosen": -0.06037697196006775,
"logits/rejected": -0.07680558413267136,
"logps/chosen": -368.1618347167969,
"logps/rejected": -388.21026611328125,
"loss": 0.9553,
"rewards/accuracies": 0.6435185670852661,
"rewards/chosen": -7.906184673309326,
"rewards/margins": 1.781913161277771,
"rewards/rejected": -9.688097953796387,
"step": 303
},
{
"epoch": 0.6033519553072626,
"grad_norm": 50.65395736694336,
"learning_rate": 4.849883971376608e-06,
"logits/chosen": -0.036385513842105865,
"logits/rejected": 0.02312180958688259,
"logps/chosen": -360.32763671875,
"logps/rejected": -385.574462890625,
"loss": 0.8076,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -7.673541069030762,
"rewards/margins": 2.30534291267395,
"rewards/rejected": -9.97888469696045,
"step": 306
},
{
"epoch": 0.6092671705553729,
"grad_norm": 37.83553695678711,
"learning_rate": 4.843962887506382e-06,
"logits/chosen": -0.007246436085551977,
"logits/rejected": 0.10245460271835327,
"logps/chosen": -360.2029724121094,
"logps/rejected": -388.7747497558594,
"loss": 1.0146,
"rewards/accuracies": 0.638888955116272,
"rewards/chosen": -7.557069301605225,
"rewards/margins": 2.0892889499664307,
"rewards/rejected": -9.646357536315918,
"step": 309
},
{
"epoch": 0.6151823858034834,
"grad_norm": 67.90631103515625,
"learning_rate": 4.837931035112836e-06,
"logits/chosen": -0.021308597177267075,
"logits/rejected": 0.05145422741770744,
"logps/chosen": -340.68328857421875,
"logps/rejected": -396.016845703125,
"loss": 0.8041,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": -6.7121500968933105,
"rewards/margins": 3.0467376708984375,
"rewards/rejected": -9.758888244628906,
"step": 312
},
{
"epoch": 0.6210976010515938,
"grad_norm": 38.1522331237793,
"learning_rate": 4.831788699242882e-06,
"logits/chosen": 0.12459397315979004,
"logits/rejected": 0.07747067511081696,
"logps/chosen": -381.19317626953125,
"logps/rejected": -386.43023681640625,
"loss": 0.8322,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -7.670474529266357,
"rewards/margins": 2.657536268234253,
"rewards/rejected": -10.328010559082031,
"step": 315
},
{
"epoch": 0.6270128162997043,
"grad_norm": 37.3199462890625,
"learning_rate": 4.825536170164543e-06,
"logits/chosen": 0.0364363007247448,
"logits/rejected": 0.1072683334350586,
"logps/chosen": -384.99658203125,
"logps/rejected": -423.906494140625,
"loss": 0.8372,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -8.085029602050781,
"rewards/margins": 2.9126205444335938,
"rewards/rejected": -10.997650146484375,
"step": 318
},
{
"epoch": 0.6329280315478146,
"grad_norm": 84.70304870605469,
"learning_rate": 4.819173743353237e-06,
"logits/chosen": 0.07106878608465195,
"logits/rejected": 0.07161180675029755,
"logps/chosen": -353.46929931640625,
"logps/rejected": -382.072998046875,
"loss": 0.9152,
"rewards/accuracies": 0.6620370745658875,
"rewards/chosen": -8.362825393676758,
"rewards/margins": 2.396571636199951,
"rewards/rejected": -10.759397506713867,
"step": 321
},
{
"epoch": 0.6388432467959251,
"grad_norm": 41.621517181396484,
"learning_rate": 4.812701719477813e-06,
"logits/chosen": -0.042387984693050385,
"logits/rejected": -0.006728718988597393,
"logps/chosen": -397.9143981933594,
"logps/rejected": -417.5738220214844,
"loss": 0.7585,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -8.613710403442383,
"rewards/margins": 3.0014796257019043,
"rewards/rejected": -11.615188598632812,
"step": 324
},
{
"epoch": 0.6447584620440355,
"grad_norm": 34.132080078125,
"learning_rate": 4.80612040438634e-06,
"logits/chosen": -0.005274191033095121,
"logits/rejected": -0.0017657628050073981,
"logps/chosen": -391.5089111328125,
"logps/rejected": -413.4745788574219,
"loss": 0.8454,
"rewards/accuracies": 0.6759259700775146,
"rewards/chosen": -8.282991409301758,
"rewards/margins": 2.736060619354248,
"rewards/rejected": -11.019050598144531,
"step": 327
},
{
"epoch": 0.6506736772921459,
"grad_norm": 42.51496505737305,
"learning_rate": 4.799430109091659e-06,
"logits/chosen": -0.15326707065105438,
"logits/rejected": -0.08302909135818481,
"logps/chosen": -363.5447998046875,
"logps/rejected": -425.05645751953125,
"loss": 0.8109,
"rewards/accuracies": 0.7314814925193787,
"rewards/chosen": -8.056258201599121,
"rewards/margins": 3.1323180198669434,
"rewards/rejected": -11.188575744628906,
"step": 330
},
{
"epoch": 0.6565888925402563,
"grad_norm": 38.95637512207031,
"learning_rate": 4.792631149756683e-06,
"logits/chosen": -0.10734808444976807,
"logits/rejected": -0.13398586213588715,
"logps/chosen": -383.37957763671875,
"logps/rejected": -396.33050537109375,
"loss": 0.9087,
"rewards/accuracies": 0.6805557012557983,
"rewards/chosen": -9.402881622314453,
"rewards/margins": 2.247579574584961,
"rewards/rejected": -11.65046215057373,
"step": 333
},
{
"epoch": 0.6625041077883668,
"grad_norm": 30.90215301513672,
"learning_rate": 4.785723847679451e-06,
"logits/chosen": -0.17347650229930878,
"logits/rejected": -0.04830838367342949,
"logps/chosen": -365.1563415527344,
"logps/rejected": -424.2984619140625,
"loss": 0.7836,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": -9.815483093261719,
"rewards/margins": 2.839183807373047,
"rewards/rejected": -12.654666900634766,
"step": 336
},
{
"epoch": 0.6684193230364771,
"grad_norm": 47.471736907958984,
"learning_rate": 4.778708529277954e-06,
"logits/chosen": -0.1100246012210846,
"logits/rejected": -0.06617899239063263,
"logps/chosen": -385.6170654296875,
"logps/rejected": -420.5422058105469,
"loss": 0.9052,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -9.508901596069336,
"rewards/margins": 3.112328052520752,
"rewards/rejected": -12.621230125427246,
"step": 339
},
{
"epoch": 0.6743345382845876,
"grad_norm": 47.64808654785156,
"learning_rate": 4.7715855260747e-06,
"logits/chosen": -0.1708156168460846,
"logits/rejected": -0.07380948960781097,
"logps/chosen": -389.4483337402344,
"logps/rejected": -430.76080322265625,
"loss": 0.7454,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": -9.387763023376465,
"rewards/margins": 3.1104159355163574,
"rewards/rejected": -12.498178482055664,
"step": 342
},
{
"epoch": 0.680249753532698,
"grad_norm": 34.76079559326172,
"learning_rate": 4.764355174681056e-06,
"logits/chosen": -0.12020980566740036,
"logits/rejected": -0.09629341214895248,
"logps/chosen": -386.8775329589844,
"logps/rejected": -407.75299072265625,
"loss": 0.7826,
"rewards/accuracies": 0.7129629850387573,
"rewards/chosen": -9.380858421325684,
"rewards/margins": 2.675083637237549,
"rewards/rejected": -12.055941581726074,
"step": 345
},
{
"epoch": 0.6861649687808085,
"grad_norm": 31.153663635253906,
"learning_rate": 4.757017816781331e-06,
"logits/chosen": -0.16364365816116333,
"logits/rejected": -0.0642888993024826,
"logps/chosen": -379.285400390625,
"logps/rejected": -452.3611755371094,
"loss": 0.6606,
"rewards/accuracies": 0.7314814925193787,
"rewards/chosen": -9.602463722229004,
"rewards/margins": 3.611215591430664,
"rewards/rejected": -13.213679313659668,
"step": 348
},
{
"epoch": 0.6920801840289188,
"grad_norm": 49.72859191894531,
"learning_rate": 4.74957379911664e-06,
"logits/chosen": -0.07502593845129013,
"logits/rejected": 0.00437380513176322,
"logps/chosen": -395.549560546875,
"logps/rejected": -437.9383544921875,
"loss": 0.8434,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": -10.370527267456055,
"rewards/margins": 2.701253652572632,
"rewards/rejected": -13.07178020477295,
"step": 351
},
{
"epoch": 0.6979953992770292,
"grad_norm": 56.92310333251953,
"learning_rate": 4.7420234734685104e-06,
"logits/chosen": -0.03868694603443146,
"logits/rejected": -0.0014290250837802887,
"logps/chosen": -401.3062744140625,
"logps/rejected": -445.33807373046875,
"loss": 0.8181,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -10.017228126525879,
"rewards/margins": 3.0577263832092285,
"rewards/rejected": -13.07495403289795,
"step": 354
},
{
"epoch": 0.7039106145251397,
"grad_norm": 28.037378311157227,
"learning_rate": 4.7343671966422584e-06,
"logits/chosen": -0.19204241037368774,
"logits/rejected": -0.08173765987157822,
"logps/chosen": -388.4847106933594,
"logps/rejected": -435.093505859375,
"loss": 0.8228,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": -9.829904556274414,
"rewards/margins": 2.792029857635498,
"rewards/rejected": -12.621932983398438,
"step": 357
},
{
"epoch": 0.70982582977325,
"grad_norm": 40.521690368652344,
"learning_rate": 4.726605330450132e-06,
"logits/chosen": -0.13022971153259277,
"logits/rejected": -0.08149293065071106,
"logps/chosen": -379.46185302734375,
"logps/rejected": -414.7354736328125,
"loss": 0.7553,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -8.846694946289062,
"rewards/margins": 2.8018383979797363,
"rewards/rejected": -11.64853286743164,
"step": 360
},
{
"epoch": 0.7157410450213605,
"grad_norm": 58.53022003173828,
"learning_rate": 4.718738241694207e-06,
"logits/chosen": -0.1846962869167328,
"logits/rejected": -0.10722313821315765,
"logps/chosen": -353.771484375,
"logps/rejected": -411.9970703125,
"loss": 0.7709,
"rewards/accuracies": 0.708333432674408,
"rewards/chosen": -8.93757152557373,
"rewards/margins": 2.895968198776245,
"rewards/rejected": -11.833539962768555,
"step": 363
},
{
"epoch": 0.7216562602694709,
"grad_norm": 62.94578552246094,
"learning_rate": 4.710766302149059e-06,
"logits/chosen": -0.11727502197027206,
"logits/rejected": -0.07848824560642242,
"logps/chosen": -386.8691101074219,
"logps/rejected": -423.58203125,
"loss": 0.9148,
"rewards/accuracies": 0.6990741491317749,
"rewards/chosen": -9.200393676757812,
"rewards/margins": 2.5602023601531982,
"rewards/rejected": -11.76059627532959,
"step": 366
},
{
"epoch": 0.7275714755175814,
"grad_norm": 41.92021560668945,
"learning_rate": 4.7026898885441895e-06,
"logits/chosen": -0.23892198503017426,
"logits/rejected": -0.19355922937393188,
"logps/chosen": -371.76507568359375,
"logps/rejected": -428.6576843261719,
"loss": 0.5582,
"rewards/accuracies": 0.7685184478759766,
"rewards/chosen": -8.767149925231934,
"rewards/margins": 3.590496063232422,
"rewards/rejected": -12.357645988464355,
"step": 369
},
{
"epoch": 0.7334866907656917,
"grad_norm": 49.8127555847168,
"learning_rate": 4.694509382546225e-06,
"logits/chosen": -0.15272876620292664,
"logits/rejected": -0.16361907124519348,
"logps/chosen": -391.66729736328125,
"logps/rejected": -436.54638671875,
"loss": 0.7923,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -9.770153045654297,
"rewards/margins": 3.136183738708496,
"rewards/rejected": -12.906336784362793,
"step": 372
},
{
"epoch": 0.7394019060138022,
"grad_norm": 58.40044403076172,
"learning_rate": 4.686225170740881e-06,
"logits/chosen": -0.18054398894309998,
"logits/rejected": -0.10335493087768555,
"logps/chosen": -412.2997741699219,
"logps/rejected": -459.0295104980469,
"loss": 0.8095,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -9.97874641418457,
"rewards/margins": 2.9409613609313965,
"rewards/rejected": -12.919708251953125,
"step": 375
},
{
"epoch": 0.7453171212619126,
"grad_norm": 39.62379837036133,
"learning_rate": 4.677837644614692e-06,
"logits/chosen": -0.0909600779414177,
"logits/rejected": -0.023966720327734947,
"logps/chosen": -404.3166198730469,
"logps/rejected": -445.68218994140625,
"loss": 0.8851,
"rewards/accuracies": 0.6712962985038757,
"rewards/chosen": -10.392790794372559,
"rewards/margins": 2.6830623149871826,
"rewards/rejected": -13.07585334777832,
"step": 378
},
{
"epoch": 0.751232336510023,
"grad_norm": 70.16593170166016,
"learning_rate": 4.669347200536513e-06,
"logits/chosen": -0.22100476920604706,
"logits/rejected": -0.07667340338230133,
"logps/chosen": -388.3753662109375,
"logps/rejected": -459.80645751953125,
"loss": 0.9195,
"rewards/accuracies": 0.6944444179534912,
"rewards/chosen": -9.879241943359375,
"rewards/margins": 3.15310001373291,
"rewards/rejected": -13.032341003417969,
"step": 381
},
{
"epoch": 0.7571475517581334,
"grad_norm": 31.03739356994629,
"learning_rate": 4.660754239738784e-06,
"logits/chosen": -0.13154415786266327,
"logits/rejected": -0.14076797664165497,
"logps/chosen": -357.6185302734375,
"logps/rejected": -386.7919921875,
"loss": 0.7967,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -8.652280807495117,
"rewards/margins": 3.0989902019500732,
"rewards/rejected": -11.751270294189453,
"step": 384
},
{
"epoch": 0.7630627670062439,
"grad_norm": 36.11872482299805,
"learning_rate": 4.652059168298575e-06,
"logits/chosen": -0.1265685260295868,
"logits/rejected": -0.16448134183883667,
"logps/chosen": -372.2834167480469,
"logps/rejected": -404.4194030761719,
"loss": 0.8985,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": -8.796895980834961,
"rewards/margins": 2.566009521484375,
"rewards/rejected": -11.362905502319336,
"step": 387
},
{
"epoch": 0.7689779822543542,
"grad_norm": 46.44245910644531,
"learning_rate": 4.6432623971183914e-06,
"logits/chosen": -0.14210101962089539,
"logits/rejected": -0.10977351665496826,
"logps/chosen": -394.84918212890625,
"logps/rejected": -433.3570556640625,
"loss": 0.9522,
"rewards/accuracies": 0.6759259104728699,
"rewards/chosen": -9.386205673217773,
"rewards/margins": 2.8492612838745117,
"rewards/rejected": -12.235466003417969,
"step": 390
},
{
"epoch": 0.7748931975024647,
"grad_norm": 97.72257232666016,
"learning_rate": 4.634364341906758e-06,
"logits/chosen": -0.0367339588701725,
"logits/rejected": -0.006284890230745077,
"logps/chosen": -384.237548828125,
"logps/rejected": -411.18560791015625,
"loss": 0.9734,
"rewards/accuracies": 0.6435185670852661,
"rewards/chosen": -9.642322540283203,
"rewards/margins": 2.240882635116577,
"rewards/rejected": -11.88320541381836,
"step": 393
},
{
"epoch": 0.7808084127505751,
"grad_norm": 31.60021209716797,
"learning_rate": 4.6253654231585724e-06,
"logits/chosen": -0.04863632842898369,
"logits/rejected": -0.05471419543027878,
"logps/chosen": -393.5486755371094,
"logps/rejected": -420.154541015625,
"loss": 0.8569,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -10.027898788452148,
"rewards/margins": 2.8087966442108154,
"rewards/rejected": -12.83669662475586,
"step": 396
},
{
"epoch": 0.7867236279986856,
"grad_norm": 34.62734603881836,
"learning_rate": 4.616266066135236e-06,
"logits/chosen": 0.012388413771986961,
"logits/rejected": 0.08139034360647202,
"logps/chosen": -403.1490478515625,
"logps/rejected": -445.5652770996094,
"loss": 0.8866,
"rewards/accuracies": 0.7175925970077515,
"rewards/chosen": -10.999832153320312,
"rewards/margins": 2.9154164791107178,
"rewards/rejected": -13.915247917175293,
"step": 399
},
{
"epoch": 0.7926388432467959,
"grad_norm": 45.868507385253906,
"learning_rate": 4.6070667008445565e-06,
"logits/chosen": -0.1295236349105835,
"logits/rejected": -0.05852815508842468,
"logps/chosen": -410.78717041015625,
"logps/rejected": -466.6341247558594,
"loss": 0.8873,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": -11.442038536071777,
"rewards/margins": 2.919829845428467,
"rewards/rejected": -14.361867904663086,
"step": 402
},
{
"epoch": 0.7985540584949063,
"grad_norm": 57.16217041015625,
"learning_rate": 4.597767762020425e-06,
"logits/chosen": -0.09274892508983612,
"logits/rejected": -0.0731249749660492,
"logps/chosen": -414.71905517578125,
"logps/rejected": -454.8731384277344,
"loss": 0.8367,
"rewards/accuracies": 0.6759259104728699,
"rewards/chosen": -12.529696464538574,
"rewards/margins": 2.8657846450805664,
"rewards/rejected": -15.395480155944824,
"step": 405
},
{
"epoch": 0.8044692737430168,
"grad_norm": 40.70981979370117,
"learning_rate": 4.588369689102275e-06,
"logits/chosen": -0.19083881378173828,
"logits/rejected": -0.22435228526592255,
"logps/chosen": -420.8725280761719,
"logps/rejected": -443.5750732421875,
"loss": 0.8609,
"rewards/accuracies": 0.7037036418914795,
"rewards/chosen": -12.435295104980469,
"rewards/margins": 2.047469139099121,
"rewards/rejected": -14.48276424407959,
"step": 408
},
{
"epoch": 0.8103844889911271,
"grad_norm": 81.51207733154297,
"learning_rate": 4.578872926214312e-06,
"logits/chosen": -0.1143086701631546,
"logits/rejected": -0.12217384576797485,
"logps/chosen": -426.69769287109375,
"logps/rejected": -465.9218444824219,
"loss": 0.8227,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": -11.509513854980469,
"rewards/margins": 3.481245756149292,
"rewards/rejected": -14.990760803222656,
"step": 411
},
{
"epoch": 0.8162997042392376,
"grad_norm": 37.07810974121094,
"learning_rate": 4.569277922144531e-06,
"logits/chosen": -0.07632291316986084,
"logits/rejected": -0.09633226692676544,
"logps/chosen": -380.8441162109375,
"logps/rejected": -426.43780517578125,
"loss": 0.7403,
"rewards/accuracies": 0.7175925970077515,
"rewards/chosen": -10.021100997924805,
"rewards/margins": 3.8166158199310303,
"rewards/rejected": -13.83771800994873,
"step": 414
},
{
"epoch": 0.822214919487348,
"grad_norm": 40.48078155517578,
"learning_rate": 4.559585130323503e-06,
"logits/chosen": -0.11609819531440735,
"logits/rejected": -0.11777342855930328,
"logps/chosen": -382.35833740234375,
"logps/rejected": -425.1037292480469,
"loss": 0.7272,
"rewards/accuracies": 0.7453703880310059,
"rewards/chosen": -10.085898399353027,
"rewards/margins": 4.251326560974121,
"rewards/rejected": -14.337224960327148,
"step": 417
},
{
"epoch": 0.8281301347354584,
"grad_norm": 23.162534713745117,
"learning_rate": 4.549795008802951e-06,
"logits/chosen": -0.09667672216892242,
"logits/rejected": -0.16936007142066956,
"logps/chosen": -414.39794921875,
"logps/rejected": -467.24224853515625,
"loss": 0.71,
"rewards/accuracies": 0.75,
"rewards/chosen": -10.526082992553711,
"rewards/margins": 5.361236572265625,
"rewards/rejected": -15.887319564819336,
"step": 420
},
{
"epoch": 0.8340453499835688,
"grad_norm": 46.23252868652344,
"learning_rate": 4.539908020234101e-06,
"logits/chosen": -0.18942461907863617,
"logits/rejected": -0.20741616189479828,
"logps/chosen": -395.4661560058594,
"logps/rejected": -422.1636962890625,
"loss": 0.8233,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": -11.337644577026367,
"rewards/margins": 3.6029138565063477,
"rewards/rejected": -14.940558433532715,
"step": 423
},
{
"epoch": 0.8399605652316793,
"grad_norm": 39.3061637878418,
"learning_rate": 4.529924631845819e-06,
"logits/chosen": -0.22194555401802063,
"logits/rejected": -0.2846095860004425,
"logps/chosen": -391.3856201171875,
"logps/rejected": -436.1163024902344,
"loss": 0.7741,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -11.072221755981445,
"rewards/margins": 3.993955612182617,
"rewards/rejected": -15.066177368164062,
"step": 426
},
{
"epoch": 0.8458757804797897,
"grad_norm": 35.3477783203125,
"learning_rate": 4.5198453154225336e-06,
"logits/chosen": -0.20290356874465942,
"logits/rejected": -0.22691264748573303,
"logps/chosen": -401.7808532714844,
"logps/rejected": -431.0494384765625,
"loss": 0.9883,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": -11.68575668334961,
"rewards/margins": 3.1858057975769043,
"rewards/rejected": -14.871562957763672,
"step": 429
},
{
"epoch": 0.8517909957279001,
"grad_norm": 45.111228942871094,
"learning_rate": 4.509670547281938e-06,
"logits/chosen": -0.1648390144109726,
"logits/rejected": -0.15016193687915802,
"logps/chosen": -408.5751647949219,
"logps/rejected": -455.25286865234375,
"loss": 0.685,
"rewards/accuracies": 0.7129630446434021,
"rewards/chosen": -11.243772506713867,
"rewards/margins": 4.181289196014404,
"rewards/rejected": -15.425060272216797,
"step": 432
},
{
"epoch": 0.8577062109760105,
"grad_norm": 46.299076080322266,
"learning_rate": 4.499400808252481e-06,
"logits/chosen": -0.17850446701049805,
"logits/rejected": -0.17678868770599365,
"logps/chosen": -426.147216796875,
"logps/rejected": -462.2786560058594,
"loss": 0.8077,
"rewards/accuracies": 0.7453703880310059,
"rewards/chosen": -12.385601043701172,
"rewards/margins": 3.7832705974578857,
"rewards/rejected": -16.168874740600586,
"step": 435
},
{
"epoch": 0.863621426224121,
"grad_norm": 33.022735595703125,
"learning_rate": 4.489036583650649e-06,
"logits/chosen": -0.23058825731277466,
"logits/rejected": -0.1485554724931717,
"logps/chosen": -404.13458251953125,
"logps/rejected": -459.26434326171875,
"loss": 0.7394,
"rewards/accuracies": 0.6851851940155029,
"rewards/chosen": -12.277857780456543,
"rewards/margins": 4.416906833648682,
"rewards/rejected": -16.694765090942383,
"step": 438
},
{
"epoch": 0.8695366414722313,
"grad_norm": 35.275081634521484,
"learning_rate": 4.478578363258023e-06,
"logits/chosen": -0.16853290796279907,
"logits/rejected": -0.1521489918231964,
"logps/chosen": -415.5618591308594,
"logps/rejected": -451.77178955078125,
"loss": 0.8398,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": -12.673078536987305,
"rewards/margins": 3.461433172225952,
"rewards/rejected": -16.134510040283203,
"step": 441
},
{
"epoch": 0.8754518567203418,
"grad_norm": 33.70681381225586,
"learning_rate": 4.468026641298142e-06,
"logits/chosen": -0.1421818733215332,
"logits/rejected": -0.08376338332891464,
"logps/chosen": -427.12335205078125,
"logps/rejected": -484.59649658203125,
"loss": 0.7882,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": -13.382842063903809,
"rewards/margins": 3.6166725158691406,
"rewards/rejected": -16.999515533447266,
"step": 444
},
{
"epoch": 0.8813670719684522,
"grad_norm": 33.35516357421875,
"learning_rate": 4.457381916413141e-06,
"logits/chosen": -0.13292686641216278,
"logits/rejected": -0.18141326308250427,
"logps/chosen": -425.69287109375,
"logps/rejected": -463.3936767578125,
"loss": 0.8456,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -12.859245300292969,
"rewards/margins": 3.692777156829834,
"rewards/rejected": -16.55202293395996,
"step": 447
},
{
"epoch": 0.8872822872165625,
"grad_norm": 35.195640563964844,
"learning_rate": 4.4466446916401895e-06,
"logits/chosen": -0.22234384715557098,
"logits/rejected": -0.08050793409347534,
"logps/chosen": -418.515869140625,
"logps/rejected": -479.6258239746094,
"loss": 0.754,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -13.117304801940918,
"rewards/margins": 4.073483467102051,
"rewards/rejected": -17.19078826904297,
"step": 450
},
{
"epoch": 0.893197502464673,
"grad_norm": 57.9023551940918,
"learning_rate": 4.435815474387719e-06,
"logits/chosen": -0.117046058177948,
"logits/rejected": -0.09925241768360138,
"logps/chosen": -431.378662109375,
"logps/rejected": -481.478271484375,
"loss": 0.9017,
"rewards/accuracies": 0.6898148059844971,
"rewards/chosen": -13.18567943572998,
"rewards/margins": 3.816985845565796,
"rewards/rejected": -17.002666473388672,
"step": 453
},
{
"epoch": 0.8991127177127834,
"grad_norm": 32.576194763183594,
"learning_rate": 4.424894776411445e-06,
"logits/chosen": -0.14799581468105316,
"logits/rejected": -0.14989186823368073,
"logps/chosen": -423.6829833984375,
"logps/rejected": -472.8582458496094,
"loss": 0.751,
"rewards/accuracies": 0.7222222685813904,
"rewards/chosen": -12.279951095581055,
"rewards/margins": 4.995446681976318,
"rewards/rejected": -17.27539825439453,
"step": 456
},
{
"epoch": 0.9050279329608939,
"grad_norm": 42.71744155883789,
"learning_rate": 4.413883113790183e-06,
"logits/chosen": -0.1738191843032837,
"logits/rejected": -0.13758057355880737,
"logps/chosen": -426.9272155761719,
"logps/rejected": -479.89434814453125,
"loss": 0.9077,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -13.401716232299805,
"rewards/margins": 3.820394277572632,
"rewards/rejected": -17.22211265563965,
"step": 459
},
{
"epoch": 0.9109431482090042,
"grad_norm": 39.175025939941406,
"learning_rate": 4.402781006901457e-06,
"logits/chosen": -0.2070755809545517,
"logits/rejected": -0.14179669320583344,
"logps/chosen": -414.93804931640625,
"logps/rejected": -477.036376953125,
"loss": 0.8665,
"rewards/accuracies": 0.6805555820465088,
"rewards/chosen": -13.253950119018555,
"rewards/margins": 3.051947593688965,
"rewards/rejected": -16.305896759033203,
"step": 462
},
{
"epoch": 0.9168583634571147,
"grad_norm": 22.947998046875,
"learning_rate": 4.391588980396913e-06,
"logits/chosen": -0.1404464989900589,
"logits/rejected": -0.181797593832016,
"logps/chosen": -406.57586669921875,
"logps/rejected": -447.8778076171875,
"loss": 0.6584,
"rewards/accuracies": 0.7407407760620117,
"rewards/chosen": -11.844066619873047,
"rewards/margins": 4.259424209594727,
"rewards/rejected": -16.103490829467773,
"step": 465
},
{
"epoch": 0.9227735787052251,
"grad_norm": 27.205076217651367,
"learning_rate": 4.380307563177523e-06,
"logits/chosen": -0.27725404500961304,
"logits/rejected": -0.20193539559841156,
"logps/chosen": -409.8758239746094,
"logps/rejected": -500.9091491699219,
"loss": 0.7579,
"rewards/accuracies": 0.7361111640930176,
"rewards/chosen": -12.209704399108887,
"rewards/margins": 5.026104927062988,
"rewards/rejected": -17.235809326171875,
"step": 468
},
{
"epoch": 0.9286887939533355,
"grad_norm": 28.20140266418457,
"learning_rate": 4.36893728836859e-06,
"logits/chosen": -0.2689639925956726,
"logits/rejected": -0.20782801508903503,
"logps/chosen": -396.9151611328125,
"logps/rejected": -470.01593017578125,
"loss": 0.5835,
"rewards/accuracies": 0.7731481790542603,
"rewards/chosen": -12.019521713256836,
"rewards/margins": 5.0072855949401855,
"rewards/rejected": -17.02680778503418,
"step": 471
},
{
"epoch": 0.9346040092014459,
"grad_norm": 32.47650909423828,
"learning_rate": 4.357478693294557e-06,
"logits/chosen": -0.3439037799835205,
"logits/rejected": -0.28947803378105164,
"logps/chosen": -415.56072998046875,
"logps/rejected": -484.7381286621094,
"loss": 0.7827,
"rewards/accuracies": 0.763888955116272,
"rewards/chosen": -12.406390190124512,
"rewards/margins": 5.7553629875183105,
"rewards/rejected": -18.161752700805664,
"step": 474
},
{
"epoch": 0.9405192244495564,
"grad_norm": 37.89875793457031,
"learning_rate": 4.345932319453612e-06,
"logits/chosen": -0.3605978488922119,
"logits/rejected": -0.29322177171707153,
"logps/chosen": -429.87091064453125,
"logps/rejected": -483.867431640625,
"loss": 0.8029,
"rewards/accuracies": 0.703703761100769,
"rewards/chosen": -13.3162841796875,
"rewards/margins": 4.183923721313477,
"rewards/rejected": -17.500207901000977,
"step": 477
},
{
"epoch": 0.9464344396976668,
"grad_norm": 50.86394119262695,
"learning_rate": 4.334298712492098e-06,
"logits/chosen": -0.2936496138572693,
"logits/rejected": -0.32563602924346924,
"logps/chosen": -436.94317626953125,
"logps/rejected": -472.5697937011719,
"loss": 0.9131,
"rewards/accuracies": 0.6574074029922485,
"rewards/chosen": -13.809501647949219,
"rewards/margins": 3.8074746131896973,
"rewards/rejected": -17.61697769165039,
"step": 480
},
{
"epoch": 0.9523496549457772,
"grad_norm": 52.91617202758789,
"learning_rate": 4.32257842217873e-06,
"logits/chosen": -0.2928038239479065,
"logits/rejected": -0.25982213020324707,
"logps/chosen": -438.8931579589844,
"logps/rejected": -474.4143981933594,
"loss": 0.8308,
"rewards/accuracies": 0.7314814925193787,
"rewards/chosen": -14.169482231140137,
"rewards/margins": 3.389918327331543,
"rewards/rejected": -17.559402465820312,
"step": 483
},
{
"epoch": 0.9582648701938876,
"grad_norm": 40.86037826538086,
"learning_rate": 4.310772002378613e-06,
"logits/chosen": -0.33903825283050537,
"logits/rejected": -0.27782881259918213,
"logps/chosen": -430.27288818359375,
"logps/rejected": -489.8445739746094,
"loss": 0.7149,
"rewards/accuracies": 0.7407407164573669,
"rewards/chosen": -14.304798126220703,
"rewards/margins": 4.365303993225098,
"rewards/rejected": -18.670101165771484,
"step": 486
},
{
"epoch": 0.9641800854419981,
"grad_norm": 38.9904899597168,
"learning_rate": 4.298880011027067e-06,
"logits/chosen": -0.3663506507873535,
"logits/rejected": -0.2783759832382202,
"logps/chosen": -428.8545837402344,
"logps/rejected": -498.8543701171875,
"loss": 0.54,
"rewards/accuracies": 0.75,
"rewards/chosen": -14.275480270385742,
"rewards/margins": 4.776096343994141,
"rewards/rejected": -19.051578521728516,
"step": 489
},
{
"epoch": 0.9700953006901084,
"grad_norm": 33.28893280029297,
"learning_rate": 4.286903010103267e-06,
"logits/chosen": -0.34440621733665466,
"logits/rejected": -0.3880541920661926,
"logps/chosen": -448.9291687011719,
"logps/rejected": -492.80609130859375,
"loss": 0.7484,
"rewards/accuracies": 0.7546297311782837,
"rewards/chosen": -13.652152061462402,
"rewards/margins": 4.3886871337890625,
"rewards/rejected": -18.04084014892578,
"step": 492
},
{
"epoch": 0.9760105159382189,
"grad_norm": 23.393043518066406,
"learning_rate": 4.274841565603674e-06,
"logits/chosen": -0.3807776868343353,
"logits/rejected": -0.37542426586151123,
"logps/chosen": -420.848388671875,
"logps/rejected": -468.35150146484375,
"loss": 0.6653,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": -13.749267578125,
"rewards/margins": 3.9116599559783936,
"rewards/rejected": -17.660926818847656,
"step": 495
},
{
"epoch": 0.9819257311863293,
"grad_norm": 71.44178771972656,
"learning_rate": 4.262696247515298e-06,
"logits/chosen": -0.33630847930908203,
"logits/rejected": -0.3011205196380615,
"logps/chosen": -427.74493408203125,
"logps/rejected": -486.2608337402344,
"loss": 0.79,
"rewards/accuracies": 0.7453703284263611,
"rewards/chosen": -13.645870208740234,
"rewards/margins": 3.291267156600952,
"rewards/rejected": -16.937137603759766,
"step": 498
},
{
"epoch": 0.9878409464344396,
"grad_norm": 38.925453186035156,
"learning_rate": 4.250467629788758e-06,
"logits/chosen": -0.3359528183937073,
"logits/rejected": -0.32171425223350525,
"logps/chosen": -416.31011962890625,
"logps/rejected": -453.65667724609375,
"loss": 0.8185,
"rewards/accuracies": 0.7592593431472778,
"rewards/chosen": -13.421798706054688,
"rewards/margins": 3.002066135406494,
"rewards/rejected": -16.423866271972656,
"step": 501
},
{
"epoch": 0.9937561616825501,
"grad_norm": 43.50007247924805,
"learning_rate": 4.238156290311159e-06,
"logits/chosen": -0.2119762897491455,
"logits/rejected": -0.21785835921764374,
"logps/chosen": -419.2802734375,
"logps/rejected": -464.8529052734375,
"loss": 0.8978,
"rewards/accuracies": 0.7083333730697632,
"rewards/chosen": -12.489054679870605,
"rewards/margins": 2.8680572509765625,
"rewards/rejected": -15.357111930847168,
"step": 504
},
{
"epoch": 0.9996713769306605,
"grad_norm": 28.424711227416992,
"learning_rate": 4.2257628108787855e-06,
"logits/chosen": -0.3750268220901489,
"logits/rejected": -0.3536463975906372,
"logps/chosen": -406.7919921875,
"logps/rejected": -456.8607177734375,
"loss": 0.8355,
"rewards/accuracies": 0.75,
"rewards/chosen": -11.890769004821777,
"rewards/margins": 3.3902645111083984,
"rewards/rejected": -15.281034469604492,
"step": 507
},
{
"epoch": 1.0039434768320736,
"grad_norm": 9.685483932495117,
"learning_rate": 4.2132877771696e-06,
"logits/chosen": -0.44336915016174316,
"logits/rejected": -0.3426854610443115,
"logps/chosen": -401.211669921875,
"logps/rejected": -507.76446533203125,
"loss": 0.2311,
"rewards/accuracies": 0.9487179517745972,
"rewards/chosen": -8.906108856201172,
"rewards/margins": 9.99924087524414,
"rewards/rejected": -18.905351638793945,
"step": 510
},
{
"epoch": 1.009858692080184,
"grad_norm": 15.767539978027344,
"learning_rate": 4.200731778715575e-06,
"logits/chosen": -0.3226369619369507,
"logits/rejected": -0.23430070281028748,
"logps/chosen": -386.5018005371094,
"logps/rejected": -498.0735778808594,
"loss": 0.0674,
"rewards/accuracies": 0.9722223281860352,
"rewards/chosen": -8.576701164245605,
"rewards/margins": 10.921747207641602,
"rewards/rejected": -19.49844741821289,
"step": 513
},
{
"epoch": 1.0157739073282945,
"grad_norm": 8.739374160766602,
"learning_rate": 4.188095408874829e-06,
"logits/chosen": -0.3742499053478241,
"logits/rejected": -0.33725112676620483,
"logps/chosen": -354.2779235839844,
"logps/rejected": -456.7283935546875,
"loss": 0.0407,
"rewards/accuracies": 0.9907407760620117,
"rewards/chosen": -8.695253372192383,
"rewards/margins": 9.780828475952148,
"rewards/rejected": -18.47608184814453,
"step": 516
},
{
"epoch": 1.0216891225764049,
"grad_norm": 13.139928817749023,
"learning_rate": 4.175379264803587e-06,
"logits/chosen": -0.37535345554351807,
"logits/rejected": -0.31396183371543884,
"logps/chosen": -375.81396484375,
"logps/rejected": -471.1697692871094,
"loss": 0.0762,
"rewards/accuracies": 0.9768518805503845,
"rewards/chosen": -8.316873550415039,
"rewards/margins": 9.90269660949707,
"rewards/rejected": -18.219572067260742,
"step": 519
},
{
"epoch": 1.0276043378245152,
"grad_norm": 3.5331549644470215,
"learning_rate": 4.162583947427958e-06,
"logits/chosen": -0.40028223395347595,
"logits/rejected": -0.22156159579753876,
"logps/chosen": -378.267822265625,
"logps/rejected": -504.5182800292969,
"loss": 0.1258,
"rewards/accuracies": 0.953703761100769,
"rewards/chosen": -9.443648338317871,
"rewards/margins": 11.473689079284668,
"rewards/rejected": -20.917339324951172,
"step": 522
},
{
"epoch": 1.0335195530726258,
"grad_norm": 10.87333869934082,
"learning_rate": 4.149710061415542e-06,
"logits/chosen": -0.3893941044807434,
"logits/rejected": -0.22475658357143402,
"logps/chosen": -389.7325134277344,
"logps/rejected": -521.1325073242188,
"loss": 0.0873,
"rewards/accuracies": 0.9675926566123962,
"rewards/chosen": -9.745861053466797,
"rewards/margins": 12.743724822998047,
"rewards/rejected": -22.489585876464844,
"step": 525
},
{
"epoch": 1.0394347683207361,
"grad_norm": 10.995318412780762,
"learning_rate": 4.13675821514685e-06,
"logits/chosen": -0.39965152740478516,
"logits/rejected": -0.252058744430542,
"logps/chosen": -410.03411865234375,
"logps/rejected": -544.2578735351562,
"loss": 0.0969,
"rewards/accuracies": 0.9675926566123962,
"rewards/chosen": -11.346731185913086,
"rewards/margins": 11.341825485229492,
"rewards/rejected": -22.688552856445312,
"step": 528
},
{
"epoch": 1.0453499835688465,
"grad_norm": 8.572622299194336,
"learning_rate": 4.12372902068656e-06,
"logits/chosen": -0.4358088970184326,
"logits/rejected": -0.2031504511833191,
"logps/chosen": -403.5755310058594,
"logps/rejected": -527.644775390625,
"loss": 0.1168,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -11.759601593017578,
"rewards/margins": 10.76253890991211,
"rewards/rejected": -22.522140502929688,
"step": 531
},
{
"epoch": 1.051265198816957,
"grad_norm": 7.874849796295166,
"learning_rate": 4.110623093754585e-06,
"logits/chosen": -0.38974201679229736,
"logits/rejected": -0.25777122378349304,
"logps/chosen": -408.3241882324219,
"logps/rejected": -516.1395263671875,
"loss": 0.1062,
"rewards/accuracies": 0.9629629850387573,
"rewards/chosen": -11.630317687988281,
"rewards/margins": 10.122415542602539,
"rewards/rejected": -21.752735137939453,
"step": 534
},
{
"epoch": 1.0571804140650674,
"grad_norm": 7.267045974731445,
"learning_rate": 4.097441053696985e-06,
"logits/chosen": -0.44127148389816284,
"logits/rejected": -0.32113516330718994,
"logps/chosen": -430.0728759765625,
"logps/rejected": -561.5933837890625,
"loss": 0.0811,
"rewards/accuracies": 0.9722222685813904,
"rewards/chosen": -11.996893882751465,
"rewards/margins": 12.880267143249512,
"rewards/rejected": -24.87716293334961,
"step": 537
},
{
"epoch": 1.0630956293131777,
"grad_norm": 26.037012100219727,
"learning_rate": 4.08418352345669e-06,
"logits/chosen": -0.3767847418785095,
"logits/rejected": -0.31423503160476685,
"logps/chosen": -406.0875244140625,
"logps/rejected": -526.44482421875,
"loss": 0.0899,
"rewards/accuracies": 0.9768518805503845,
"rewards/chosen": -10.038455963134766,
"rewards/margins": 13.710319519042969,
"rewards/rejected": -23.748777389526367,
"step": 540
},
{
"epoch": 1.0690108445612883,
"grad_norm": 13.05281925201416,
"learning_rate": 4.070851129544065e-06,
"logits/chosen": -0.465512216091156,
"logits/rejected": -0.2587343454360962,
"logps/chosen": -399.33642578125,
"logps/rejected": -554.8599243164062,
"loss": 0.1257,
"rewards/accuracies": 0.953703761100769,
"rewards/chosen": -10.270711898803711,
"rewards/margins": 13.220477104187012,
"rewards/rejected": -23.491188049316406,
"step": 543
},
{
"epoch": 1.0749260598093986,
"grad_norm": 24.870304107666016,
"learning_rate": 4.057444502007306e-06,
"logits/chosen": -0.5171704888343811,
"logits/rejected": -0.33416056632995605,
"logps/chosen": -398.2466735839844,
"logps/rejected": -536.3380737304688,
"loss": 0.1337,
"rewards/accuracies": 0.953703761100769,
"rewards/chosen": -9.948308944702148,
"rewards/margins": 13.192928314208984,
"rewards/rejected": -23.1412353515625,
"step": 546
},
{
"epoch": 1.0808412750575092,
"grad_norm": 3.7352793216705322,
"learning_rate": 4.043964274402663e-06,
"logits/chosen": -0.4453051686286926,
"logits/rejected": -0.30837786197662354,
"logps/chosen": -386.0931091308594,
"logps/rejected": -503.6631774902344,
"loss": 0.0532,
"rewards/accuracies": 0.9768519401550293,
"rewards/chosen": -9.698603630065918,
"rewards/margins": 12.827930450439453,
"rewards/rejected": -22.526535034179688,
"step": 549
},
{
"epoch": 1.0867564903056195,
"grad_norm": 9.68443489074707,
"learning_rate": 4.030411083764498e-06,
"logits/chosen": -0.5020241737365723,
"logits/rejected": -0.28794384002685547,
"logps/chosen": -378.8649597167969,
"logps/rejected": -540.6756591796875,
"loss": 0.0671,
"rewards/accuracies": 0.9814814925193787,
"rewards/chosen": -9.302837371826172,
"rewards/margins": 13.408391952514648,
"rewards/rejected": -22.711231231689453,
"step": 552
},
{
"epoch": 1.0926717055537298,
"grad_norm": 20.763072967529297,
"learning_rate": 4.0167855705751855e-06,
"logits/chosen": -0.5195566415786743,
"logits/rejected": -0.35310834646224976,
"logps/chosen": -388.2412109375,
"logps/rejected": -517.09375,
"loss": 0.1577,
"rewards/accuracies": 0.9351853132247925,
"rewards/chosen": -9.682899475097656,
"rewards/margins": 12.611946105957031,
"rewards/rejected": -22.294845581054688,
"step": 555
},
{
"epoch": 1.0985869208018402,
"grad_norm": 10.18380069732666,
"learning_rate": 4.003088378734841e-06,
"logits/chosen": -0.5263486504554749,
"logits/rejected": -0.3212735056877136,
"logps/chosen": -395.2178649902344,
"logps/rejected": -554.233642578125,
"loss": 0.0954,
"rewards/accuracies": 0.9722222685813904,
"rewards/chosen": -11.4990234375,
"rewards/margins": 13.107860565185547,
"rewards/rejected": -24.606884002685547,
"step": 558
},
{
"epoch": 1.1045021360499507,
"grad_norm": 17.160139083862305,
"learning_rate": 3.989320155530894e-06,
"logits/chosen": -0.4392577111721039,
"logits/rejected": -0.2696951925754547,
"logps/chosen": -410.08343505859375,
"logps/rejected": -540.97509765625,
"loss": 0.1387,
"rewards/accuracies": 0.9583333730697632,
"rewards/chosen": -11.576102256774902,
"rewards/margins": 12.610298156738281,
"rewards/rejected": -24.186399459838867,
"step": 561
},
{
"epoch": 1.110417351298061,
"grad_norm": 16.328100204467773,
"learning_rate": 3.9754815516075e-06,
"logits/chosen": -0.49670523405075073,
"logits/rejected": -0.22816550731658936,
"logps/chosen": -402.760498046875,
"logps/rejected": -556.445556640625,
"loss": 0.1141,
"rewards/accuracies": 0.9490741491317749,
"rewards/chosen": -11.802339553833008,
"rewards/margins": 13.369563102722168,
"rewards/rejected": -25.171903610229492,
"step": 564
},
{
"epoch": 1.1163325665461716,
"grad_norm": 3.9692351818084717,
"learning_rate": 3.9615732209347925e-06,
"logits/chosen": -0.4443477988243103,
"logits/rejected": -0.30315929651260376,
"logps/chosen": -390.0570068359375,
"logps/rejected": -514.3804931640625,
"loss": 0.1019,
"rewards/accuracies": 0.9629630446434021,
"rewards/chosen": -11.126566886901855,
"rewards/margins": 10.942947387695312,
"rewards/rejected": -22.069515228271484,
"step": 567
},
{
"epoch": 1.122247781794282,
"grad_norm": 14.457867622375488,
"learning_rate": 3.947595820777978e-06,
"logits/chosen": -0.517672061920166,
"logits/rejected": -0.24748222529888153,
"logps/chosen": -386.17291259765625,
"logps/rejected": -539.0173950195312,
"loss": 0.1354,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -11.382842063903809,
"rewards/margins": 13.581473350524902,
"rewards/rejected": -24.96431541442871,
"step": 570
},
{
"epoch": 1.1281629970423923,
"grad_norm": 8.523018836975098,
"learning_rate": 3.933550011666275e-06,
"logits/chosen": -0.401355117559433,
"logits/rejected": -0.2663224935531616,
"logps/chosen": -422.0758972167969,
"logps/rejected": -550.5574951171875,
"loss": 0.1321,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": -12.331350326538086,
"rewards/margins": 13.582274436950684,
"rewards/rejected": -25.913619995117188,
"step": 573
},
{
"epoch": 1.1340782122905029,
"grad_norm": 37.025264739990234,
"learning_rate": 3.919436457361701e-06,
"logits/chosen": -0.3494156002998352,
"logits/rejected": -0.2177804559469223,
"logps/chosen": -423.7979431152344,
"logps/rejected": -559.4660034179688,
"loss": 0.1034,
"rewards/accuracies": 0.9675926566123962,
"rewards/chosen": -12.480727195739746,
"rewards/margins": 12.493712425231934,
"rewards/rejected": -24.97443962097168,
"step": 576
},
{
"epoch": 1.1399934275386132,
"grad_norm": 10.311182975769043,
"learning_rate": 3.905255824827703e-06,
"logits/chosen": -0.404990017414093,
"logits/rejected": -0.23333707451820374,
"logps/chosen": -400.90179443359375,
"logps/rejected": -549.761962890625,
"loss": 0.0563,
"rewards/accuracies": 0.9861111044883728,
"rewards/chosen": -12.794790267944336,
"rewards/margins": 12.089736938476562,
"rewards/rejected": -24.8845272064209,
"step": 579
},
{
"epoch": 1.1459086427867236,
"grad_norm": 16.51976203918457,
"learning_rate": 3.891008784197642e-06,
"logits/chosen": -0.4602348208427429,
"logits/rejected": -0.32792428135871887,
"logps/chosen": -443.4609069824219,
"logps/rejected": -569.1358642578125,
"loss": 0.1204,
"rewards/accuracies": 0.9490741491317749,
"rewards/chosen": -13.16263484954834,
"rewards/margins": 12.034723281860352,
"rewards/rejected": -25.197355270385742,
"step": 582
},
{
"epoch": 1.1518238580348341,
"grad_norm": 19.857742309570312,
"learning_rate": 3.87669600874312e-06,
"logits/chosen": -0.45605939626693726,
"logits/rejected": -0.2824591398239136,
"logps/chosen": -440.00982666015625,
"logps/rejected": -580.4444580078125,
"loss": 0.1661,
"rewards/accuracies": 0.9490741491317749,
"rewards/chosen": -13.881183624267578,
"rewards/margins": 12.989599227905273,
"rewards/rejected": -26.87078094482422,
"step": 585
},
{
"epoch": 1.1577390732829445,
"grad_norm": 13.731415748596191,
"learning_rate": 3.8623181748421705e-06,
"logits/chosen": -0.5049574375152588,
"logits/rejected": -0.3214029371738434,
"logps/chosen": -420.5501403808594,
"logps/rejected": -567.5693969726562,
"loss": 0.0816,
"rewards/accuracies": 0.9768518805503845,
"rewards/chosen": -13.632789611816406,
"rewards/margins": 13.632120132446289,
"rewards/rejected": -27.264907836914062,
"step": 588
},
{
"epoch": 1.1636542885310548,
"grad_norm": 8.935365676879883,
"learning_rate": 3.847875961947284e-06,
"logits/chosen": -0.4097817540168762,
"logits/rejected": -0.30726078152656555,
"logps/chosen": -445.2584228515625,
"logps/rejected": -565.4356689453125,
"loss": 0.0672,
"rewards/accuracies": 0.9629629850387573,
"rewards/chosen": -13.714117050170898,
"rewards/margins": 12.752127647399902,
"rewards/rejected": -26.466245651245117,
"step": 591
},
{
"epoch": 1.1695695037791654,
"grad_norm": 7.481489181518555,
"learning_rate": 3.833370052553311e-06,
"logits/chosen": -0.44872909784317017,
"logits/rejected": -0.2258618324995041,
"logps/chosen": -420.02825927734375,
"logps/rejected": -583.5780029296875,
"loss": 0.106,
"rewards/accuracies": 0.9583333730697632,
"rewards/chosen": -12.75123119354248,
"rewards/margins": 15.216802597045898,
"rewards/rejected": -27.968032836914062,
"step": 594
},
{
"epoch": 1.1754847190272757,
"grad_norm": 18.711483001708984,
"learning_rate": 3.818801132165203e-06,
"logits/chosen": -0.5571283102035522,
"logits/rejected": -0.3472014367580414,
"logps/chosen": -406.1875915527344,
"logps/rejected": -583.6386108398438,
"loss": 0.1648,
"rewards/accuracies": 0.953703761100769,
"rewards/chosen": -11.881487846374512,
"rewards/margins": 16.52305793762207,
"rewards/rejected": -28.404544830322266,
"step": 597
},
{
"epoch": 1.1813999342753863,
"grad_norm": 16.07924461364746,
"learning_rate": 3.804169889265615e-06,
"logits/chosen": -0.5486902594566345,
"logits/rejected": -0.3185918927192688,
"logps/chosen": -393.58740234375,
"logps/rejected": -564.5778198242188,
"loss": 0.1159,
"rewards/accuracies": 0.953703761100769,
"rewards/chosen": -11.905487060546875,
"rewards/margins": 15.971022605895996,
"rewards/rejected": -27.876510620117188,
"step": 600
}
],
"logging_steps": 3,
"max_steps": 1524,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.14720897968746e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}