{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010666666666666666, "grad_norm": 15.542097378420866, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -0.9908615350723267, "logits/rejected": -1.0208933353424072, "logps/chosen": -0.2650989294052124, "logps/rejected": -0.2679658532142639, "loss": 3.0616, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.650989294052124, "rewards/margins": 0.028668876737356186, "rewards/rejected": -2.6796584129333496, "step": 5 }, { "epoch": 0.021333333333333333, "grad_norm": 15.236258225562361, "learning_rate": 2.127659574468085e-07, "logits/chosen": -0.9872478246688843, "logits/rejected": -1.0333178043365479, "logps/chosen": -0.2723778486251831, "logps/rejected": -0.2769668400287628, "loss": 3.0091, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.723778486251831, "rewards/margins": 0.0458899661898613, "rewards/rejected": -2.7696685791015625, "step": 10 }, { "epoch": 0.032, "grad_norm": 16.04597079407282, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -1.054266333580017, "logits/rejected": -1.071502685546875, "logps/chosen": -0.27361491322517395, "logps/rejected": -0.2760830521583557, "loss": 3.0376, "rewards/accuracies": 0.46875, "rewards/chosen": -2.736149311065674, "rewards/margins": 0.02468101680278778, "rewards/rejected": -2.7608304023742676, "step": 15 }, { "epoch": 0.042666666666666665, "grad_norm": 17.377673929521983, "learning_rate": 4.25531914893617e-07, "logits/chosen": -1.0661036968231201, "logits/rejected": -1.10869300365448, "logps/chosen": -0.2736705541610718, "logps/rejected": -0.27885910868644714, "loss": 3.0339, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -2.7367053031921387, "rewards/margins": 0.051885683089494705, "rewards/rejected": -2.788590908050537, "step": 20 }, { "epoch": 0.05333333333333334, "grad_norm": 15.384527480517336, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.00022292137146, "logits/rejected": -1.0285676717758179, "logps/chosen": -0.27224716544151306, "logps/rejected": -0.2667531669139862, "loss": 3.0482, "rewards/accuracies": 0.46875, "rewards/chosen": -2.7224717140197754, "rewards/margins": -0.054939769208431244, "rewards/rejected": -2.667531967163086, "step": 25 }, { "epoch": 0.064, "grad_norm": 16.663291386460095, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0526418685913086, "logits/rejected": -1.0864059925079346, "logps/chosen": -0.27711886167526245, "logps/rejected": -0.28717002272605896, "loss": 3.0561, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.771188497543335, "rewards/margins": 0.10051168501377106, "rewards/rejected": -2.8717002868652344, "step": 30 }, { "epoch": 0.07466666666666667, "grad_norm": 15.627909071060607, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0521974563598633, "logits/rejected": -1.0903512239456177, "logps/chosen": -0.26880407333374023, "logps/rejected": -0.2833861708641052, "loss": 3.032, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.6880409717559814, "rewards/margins": 0.14582109451293945, "rewards/rejected": -2.833861827850342, "step": 35 }, { "epoch": 0.08533333333333333, "grad_norm": 18.064971688821167, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0548655986785889, "logits/rejected": -1.069032073020935, "logps/chosen": -0.27780383825302124, "logps/rejected": -0.2752782106399536, "loss": 3.0389, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.778038501739502, "rewards/margins": -0.025255998596549034, "rewards/rejected": -2.7527823448181152, "step": 40 }, { "epoch": 0.096, "grad_norm": 15.909843739204323, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0730093717575073, "logits/rejected": -1.0732877254486084, "logps/chosen": -0.27054473757743835, "logps/rejected": -0.28124722838401794, "loss": 3.0145, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.705447196960449, "rewards/margins": 0.10702502727508545, "rewards/rejected": -2.812472343444824, "step": 45 }, { "epoch": 0.10666666666666667, "grad_norm": 17.40081947428894, "learning_rate": 9.998747147528373e-07, "logits/chosen": -1.0844353437423706, "logits/rejected": -1.1193302869796753, "logps/chosen": -0.2774738669395447, "logps/rejected": -0.28697627782821655, "loss": 3.0146, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.774738311767578, "rewards/margins": 0.09502413868904114, "rewards/rejected": -2.869762420654297, "step": 50 }, { "epoch": 0.11733333333333333, "grad_norm": 15.576993262006905, "learning_rate": 9.991093100466482e-07, "logits/chosen": -1.0948281288146973, "logits/rejected": -1.1082046031951904, "logps/chosen": -0.2911163866519928, "logps/rejected": -0.29472410678863525, "loss": 3.0249, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.911163568496704, "rewards/margins": 0.03607722371816635, "rewards/rejected": -2.9472408294677734, "step": 55 }, { "epoch": 0.128, "grad_norm": 16.260673735173633, "learning_rate": 9.976491676662678e-07, "logits/chosen": -1.106910228729248, "logits/rejected": -1.1165117025375366, "logps/chosen": -0.28884169459342957, "logps/rejected": -0.2888728082180023, "loss": 3.0111, "rewards/accuracies": 0.53125, "rewards/chosen": -2.8884167671203613, "rewards/margins": 0.000311434268951416, "rewards/rejected": -2.888728618621826, "step": 60 }, { "epoch": 0.13866666666666666, "grad_norm": 15.259045978055722, "learning_rate": 9.95496320064109e-07, "logits/chosen": -1.1103830337524414, "logits/rejected": -1.1367188692092896, "logps/chosen": -0.28479719161987305, "logps/rejected": -0.2901866137981415, "loss": 3.0013, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.8479714393615723, "rewards/margins": 0.05389442294836044, "rewards/rejected": -2.9018661975860596, "step": 65 }, { "epoch": 0.14933333333333335, "grad_norm": 15.800187299464394, "learning_rate": 9.926537639070456e-07, "logits/chosen": -1.12952721118927, "logits/rejected": -1.1671946048736572, "logps/chosen": -0.2909308671951294, "logps/rejected": -0.32388028502464294, "loss": 2.996, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.909308671951294, "rewards/margins": 0.3294942080974579, "rewards/rejected": -3.2388031482696533, "step": 70 }, { "epoch": 0.16, "grad_norm": 17.626483067965456, "learning_rate": 9.891254559051884e-07, "logits/chosen": -1.212756633758545, "logits/rejected": -1.2210320234298706, "logps/chosen": -0.2985421419143677, "logps/rejected": -0.30502721667289734, "loss": 2.9708, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.985421657562256, "rewards/margins": 0.06485103070735931, "rewards/rejected": -3.0502724647521973, "step": 75 }, { "epoch": 0.17066666666666666, "grad_norm": 17.71391454265527, "learning_rate": 9.849163073043223e-07, "logits/chosen": -1.1545735597610474, "logits/rejected": -1.2085789442062378, "logps/chosen": -0.29359039664268494, "logps/rejected": -0.30682289600372314, "loss": 2.9847, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.9359042644500732, "rewards/margins": 0.13232454657554626, "rewards/rejected": -3.0682289600372314, "step": 80 }, { "epoch": 0.18133333333333335, "grad_norm": 16.3328352772127, "learning_rate": 9.800321770496724e-07, "logits/chosen": -1.2470288276672363, "logits/rejected": -1.2831312417984009, "logps/chosen": -0.3038308620452881, "logps/rejected": -0.3224955201148987, "loss": 3.0144, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.038308620452881, "rewards/margins": 0.18664616346359253, "rewards/rejected": -3.2249550819396973, "step": 85 }, { "epoch": 0.192, "grad_norm": 18.86796938836226, "learning_rate": 9.744798636305187e-07, "logits/chosen": -1.1841561794281006, "logits/rejected": -1.2237173318862915, "logps/chosen": -0.3177623152732849, "logps/rejected": -0.3389167785644531, "loss": 2.9578, "rewards/accuracies": 0.5, "rewards/chosen": -3.1776235103607178, "rewards/margins": 0.21154406666755676, "rewards/rejected": -3.3891677856445312, "step": 90 }, { "epoch": 0.20266666666666666, "grad_norm": 15.728961940733296, "learning_rate": 9.68267095617003e-07, "logits/chosen": -1.2409876585006714, "logits/rejected": -1.2001683712005615, "logps/chosen": -0.317212849855423, "logps/rejected": -0.3224230408668518, "loss": 2.9979, "rewards/accuracies": 0.46875, "rewards/chosen": -3.172128677368164, "rewards/margins": 0.052101828157901764, "rewards/rejected": -3.2242302894592285, "step": 95 }, { "epoch": 0.21333333333333335, "grad_norm": 15.929749034356965, "learning_rate": 9.614025209023083e-07, "logits/chosen": -1.243082046508789, "logits/rejected": -1.2563217878341675, "logps/chosen": -0.31486523151397705, "logps/rejected": -0.36377206444740295, "loss": 2.9804, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.1486525535583496, "rewards/margins": 0.48906850814819336, "rewards/rejected": -3.637720823287964, "step": 100 }, { "epoch": 0.224, "grad_norm": 18.523618354856197, "learning_rate": 9.538956946651815e-07, "logits/chosen": -1.287687063217163, "logits/rejected": -1.2784639596939087, "logps/chosen": -0.3346417546272278, "logps/rejected": -0.34726667404174805, "loss": 2.9939, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.346417188644409, "rewards/margins": 0.1262492835521698, "rewards/rejected": -3.4726665019989014, "step": 105 }, { "epoch": 0.23466666666666666, "grad_norm": 18.401939572120774, "learning_rate": 9.457570660695539e-07, "logits/chosen": -1.244755506515503, "logits/rejected": -1.249961018562317, "logps/chosen": -0.3117735981941223, "logps/rejected": -0.3277347981929779, "loss": 2.9245, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.1177361011505127, "rewards/margins": 0.15961191058158875, "rewards/rejected": -3.277348041534424, "step": 110 }, { "epoch": 0.24533333333333332, "grad_norm": 17.42187895151266, "learning_rate": 9.369979637197774e-07, "logits/chosen": -1.2914206981658936, "logits/rejected": -1.2996666431427002, "logps/chosen": -0.3168942332267761, "logps/rejected": -0.3301650881767273, "loss": 2.9149, "rewards/accuracies": 0.46875, "rewards/chosen": -3.1689422130584717, "rewards/margins": 0.13270840048789978, "rewards/rejected": -3.3016505241394043, "step": 115 }, { "epoch": 0.256, "grad_norm": 18.355579088710684, "learning_rate": 9.276305798917158e-07, "logits/chosen": -1.246483564376831, "logits/rejected": -1.266202688217163, "logps/chosen": -0.32255855202674866, "logps/rejected": -0.3426387906074524, "loss": 2.9557, "rewards/accuracies": 0.5, "rewards/chosen": -3.2255859375, "rewards/margins": 0.20080196857452393, "rewards/rejected": -3.4263877868652344, "step": 120 }, { "epoch": 0.26666666666666666, "grad_norm": 17.621346858524813, "learning_rate": 9.176679535616476e-07, "logits/chosen": -1.247054934501648, "logits/rejected": -1.2541028261184692, "logps/chosen": -0.3470916152000427, "logps/rejected": -0.368325799703598, "loss": 2.9694, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.4709160327911377, "rewards/margins": 0.2123415172100067, "rewards/rejected": -3.683258056640625, "step": 125 }, { "epoch": 0.2773333333333333, "grad_norm": 19.942213640703905, "learning_rate": 9.071239522565976e-07, "logits/chosen": -1.3265608549118042, "logits/rejected": -1.3420588970184326, "logps/chosen": -0.3686971664428711, "logps/rejected": -0.4225061535835266, "loss": 3.0102, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.686971664428711, "rewards/margins": 0.5380896925926208, "rewards/rejected": -4.225061416625977, "step": 130 }, { "epoch": 0.288, "grad_norm": 19.54726145620393, "learning_rate": 8.960132527513642e-07, "logits/chosen": -1.236342191696167, "logits/rejected": -1.2464872598648071, "logps/chosen": -0.3776538074016571, "logps/rejected": -0.373279333114624, "loss": 2.9791, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.776538133621216, "rewards/margins": -0.043744854629039764, "rewards/rejected": -3.732793092727661, "step": 135 }, { "epoch": 0.2986666666666667, "grad_norm": 19.40428586263726, "learning_rate": 8.8435132063911e-07, "logits/chosen": -1.2351807355880737, "logits/rejected": -1.2663322687149048, "logps/chosen": -0.3428255021572113, "logps/rejected": -0.3997129797935486, "loss": 2.9746, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.4282546043395996, "rewards/margins": 0.568874716758728, "rewards/rejected": -3.9971299171447754, "step": 140 }, { "epoch": 0.30933333333333335, "grad_norm": 19.21728993268386, "learning_rate": 8.721543888039532e-07, "logits/chosen": -1.2016265392303467, "logits/rejected": -1.214748501777649, "logps/chosen": -0.33607161045074463, "logps/rejected": -0.3531486988067627, "loss": 2.9215, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.3607163429260254, "rewards/margins": 0.17077085375785828, "rewards/rejected": -3.531487226486206, "step": 145 }, { "epoch": 0.32, "grad_norm": 19.767126676733028, "learning_rate": 8.594394348255237e-07, "logits/chosen": -1.2247461080551147, "logits/rejected": -1.2420233488082886, "logps/chosen": -0.3638969957828522, "logps/rejected": -0.3963164687156677, "loss": 3.037, "rewards/accuracies": 0.46875, "rewards/chosen": -3.638970136642456, "rewards/margins": 0.3241948187351227, "rewards/rejected": -3.963164806365967, "step": 150 }, { "epoch": 0.33066666666666666, "grad_norm": 19.61247251775275, "learning_rate": 8.462241573469377e-07, "logits/chosen": -1.2525568008422852, "logits/rejected": -1.2552361488342285, "logps/chosen": -0.35086172819137573, "logps/rejected": -0.4083561301231384, "loss": 2.9396, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.508617877960205, "rewards/margins": 0.5749433040618896, "rewards/rejected": -4.083560943603516, "step": 155 }, { "epoch": 0.3413333333333333, "grad_norm": 20.251747085669653, "learning_rate": 8.325269514390834e-07, "logits/chosen": -1.207936406135559, "logits/rejected": -1.2139708995819092, "logps/chosen": -0.32931679487228394, "logps/rejected": -0.35854530334472656, "loss": 2.994, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.29316782951355, "rewards/margins": 0.2922849655151367, "rewards/rejected": -3.5854530334472656, "step": 160 }, { "epoch": 0.352, "grad_norm": 19.996942830355515, "learning_rate": 8.183668829955111e-07, "logits/chosen": -1.1788911819458008, "logits/rejected": -1.208141803741455, "logps/chosen": -0.3463769853115082, "logps/rejected": -0.36146289110183716, "loss": 2.9678, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -3.4637699127197266, "rewards/margins": 0.15085917711257935, "rewards/rejected": -3.614629030227661, "step": 165 }, { "epoch": 0.3626666666666667, "grad_norm": 18.25970826279287, "learning_rate": 8.037636621935684e-07, "logits/chosen": -1.2193751335144043, "logits/rejected": -1.2422373294830322, "logps/chosen": -0.3798995614051819, "logps/rejected": -0.38932663202285767, "loss": 2.9514, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.7989959716796875, "rewards/margins": 0.09427039325237274, "rewards/rejected": -3.893266201019287, "step": 170 }, { "epoch": 0.37333333333333335, "grad_norm": 19.672992426759237, "learning_rate": 7.887376160587213e-07, "logits/chosen": -1.1973555088043213, "logits/rejected": -1.2301228046417236, "logps/chosen": -0.38230299949645996, "logps/rejected": -0.40916380286216736, "loss": 2.909, "rewards/accuracies": 0.53125, "rewards/chosen": -3.8230299949645996, "rewards/margins": 0.26860785484313965, "rewards/rejected": -4.09163761138916, "step": 175 }, { "epoch": 0.384, "grad_norm": 17.911399995564594, "learning_rate": 7.733096601702507e-07, "logits/chosen": -1.1881186962127686, "logits/rejected": -1.2008723020553589, "logps/chosen": -0.34471386671066284, "logps/rejected": -0.3660317063331604, "loss": 2.9854, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.447139024734497, "rewards/margins": 0.21317827701568604, "rewards/rejected": -3.6603171825408936, "step": 180 }, { "epoch": 0.39466666666666667, "grad_norm": 18.358466758766593, "learning_rate": 7.575012695477076e-07, "logits/chosen": -1.18712317943573, "logits/rejected": -1.2011723518371582, "logps/chosen": -0.3857135474681854, "logps/rejected": -0.4228752553462982, "loss": 2.9108, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.857135772705078, "rewards/margins": 0.37161707878112793, "rewards/rejected": -4.228753089904785, "step": 185 }, { "epoch": 0.4053333333333333, "grad_norm": 22.989081667421914, "learning_rate": 7.413344487586542e-07, "logits/chosen": -1.243378758430481, "logits/rejected": -1.2453285455703735, "logps/chosen": -0.3679312467575073, "logps/rejected": -0.3737437129020691, "loss": 2.9705, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -3.679312229156494, "rewards/margins": 0.058125365525484085, "rewards/rejected": -3.7374374866485596, "step": 190 }, { "epoch": 0.416, "grad_norm": 18.67773935679249, "learning_rate": 7.248317012892968e-07, "logits/chosen": -1.256320595741272, "logits/rejected": -1.2754420042037964, "logps/chosen": -0.4265909194946289, "logps/rejected": -0.5589956045150757, "loss": 2.947, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.265908718109131, "rewards/margins": 1.3240468502044678, "rewards/rejected": -5.589955806732178, "step": 195 }, { "epoch": 0.4266666666666667, "grad_norm": 18.689783800632924, "learning_rate": 7.08015998220647e-07, "logits/chosen": -1.2699859142303467, "logits/rejected": -1.2951858043670654, "logps/chosen": -0.3922530710697174, "logps/rejected": -0.4252193570137024, "loss": 2.9519, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.92253041267395, "rewards/margins": 0.3296629786491394, "rewards/rejected": -4.252193927764893, "step": 200 }, { "epoch": 0.43733333333333335, "grad_norm": 18.65842216703865, "learning_rate": 6.909107462538111e-07, "logits/chosen": -1.241351842880249, "logits/rejected": -1.2584552764892578, "logps/chosen": -0.39992719888687134, "logps/rejected": -0.4298134744167328, "loss": 2.9987, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.999271869659424, "rewards/margins": 0.2988627552986145, "rewards/rejected": -4.298134803771973, "step": 205 }, { "epoch": 0.448, "grad_norm": 18.291353097077145, "learning_rate": 6.735397551289178e-07, "logits/chosen": -1.2319579124450684, "logits/rejected": -1.2307006120681763, "logps/chosen": -0.37881526350975037, "logps/rejected": -0.40964803099632263, "loss": 2.9793, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -3.7881526947021484, "rewards/margins": 0.30832812190055847, "rewards/rejected": -4.096480846405029, "step": 210 }, { "epoch": 0.45866666666666667, "grad_norm": 22.707746897767183, "learning_rate": 6.559272044830316e-07, "logits/chosen": -1.2260886430740356, "logits/rejected": -1.2554783821105957, "logps/chosen": -0.37825000286102295, "logps/rejected": -0.41132181882858276, "loss": 3.0282, "rewards/accuracies": 0.5, "rewards/chosen": -3.7824997901916504, "rewards/margins": 0.33071866631507874, "rewards/rejected": -4.113218784332275, "step": 215 }, { "epoch": 0.4693333333333333, "grad_norm": 19.197902589030974, "learning_rate": 6.380976101931879e-07, "logits/chosen": -1.2666683197021484, "logits/rejected": -1.255789875984192, "logps/chosen": -0.4547974169254303, "logps/rejected": -0.4583858549594879, "loss": 3.0647, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.547974109649658, "rewards/margins": 0.035884302109479904, "rewards/rejected": -4.583858489990234, "step": 220 }, { "epoch": 0.48, "grad_norm": 18.843696932667417, "learning_rate": 6.200757902513962e-07, "logits/chosen": -1.2097585201263428, "logits/rejected": -1.2467796802520752, "logps/chosen": -0.39689359068870544, "logps/rejected": -0.46917811036109924, "loss": 2.9096, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.96893572807312, "rewards/margins": 0.7228449583053589, "rewards/rejected": -4.6917805671691895, "step": 225 }, { "epoch": 0.49066666666666664, "grad_norm": 20.181834593605693, "learning_rate": 6.018868302191139e-07, "logits/chosen": -1.1870830059051514, "logits/rejected": -1.196030855178833, "logps/chosen": -0.3868725299835205, "logps/rejected": -0.4080945551395416, "loss": 2.9443, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.868724822998047, "rewards/margins": 0.21222031116485596, "rewards/rejected": -4.0809454917907715, "step": 230 }, { "epoch": 0.5013333333333333, "grad_norm": 20.821870588498854, "learning_rate": 5.835560483092742e-07, "logits/chosen": -1.2740999460220337, "logits/rejected": -1.2843743562698364, "logps/chosen": -0.4087337553501129, "logps/rejected": -0.4453648030757904, "loss": 2.9423, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.087337017059326, "rewards/margins": 0.3663104176521301, "rewards/rejected": -4.453647613525391, "step": 235 }, { "epoch": 0.512, "grad_norm": 21.213279684684075, "learning_rate": 5.651089601444752e-07, "logits/chosen": -1.2432048320770264, "logits/rejected": -1.2379240989685059, "logps/chosen": -0.38207095861434937, "logps/rejected": -0.4122505784034729, "loss": 2.9824, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.820709705352783, "rewards/margins": 0.3017956614494324, "rewards/rejected": -4.122505187988281, "step": 240 }, { "epoch": 0.5226666666666666, "grad_norm": 20.797644682555266, "learning_rate": 5.465712432403811e-07, "logits/chosen": -1.2271026372909546, "logits/rejected": -1.2531237602233887, "logps/chosen": -0.4181506037712097, "logps/rejected": -0.44948238134384155, "loss": 2.9081, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.1815056800842285, "rewards/margins": 0.31331825256347656, "rewards/rejected": -4.494824409484863, "step": 245 }, { "epoch": 0.5333333333333333, "grad_norm": 23.47153560832651, "learning_rate": 5.279687012637798e-07, "logits/chosen": -1.2226316928863525, "logits/rejected": -1.2568190097808838, "logps/chosen": -0.4167971611022949, "logps/rejected": -0.44948896765708923, "loss": 2.9484, "rewards/accuracies": 0.53125, "rewards/chosen": -4.167971611022949, "rewards/margins": 0.3269180357456207, "rewards/rejected": -4.494889259338379, "step": 250 }, { "epoch": 0.544, "grad_norm": 18.38315194030945, "learning_rate": 5.093272281150382e-07, "logits/chosen": -1.2616848945617676, "logits/rejected": -1.2981878519058228, "logps/chosen": -0.391302227973938, "logps/rejected": -0.434838205575943, "loss": 2.9307, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.9130218029022217, "rewards/margins": 0.4353601336479187, "rewards/rejected": -4.348381996154785, "step": 255 }, { "epoch": 0.5546666666666666, "grad_norm": 21.27143313038359, "learning_rate": 4.906727718849618e-07, "logits/chosen": -1.2721519470214844, "logits/rejected": -1.2690547704696655, "logps/chosen": -0.3983103334903717, "logps/rejected": -0.4260452687740326, "loss": 2.8974, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.9831039905548096, "rewards/margins": 0.2773493230342865, "rewards/rejected": -4.260452747344971, "step": 260 }, { "epoch": 0.5653333333333334, "grad_norm": 20.34815938237593, "learning_rate": 4.7203129873622036e-07, "logits/chosen": -1.3212722539901733, "logits/rejected": -1.3238407373428345, "logps/chosen": -0.4332142770290375, "logps/rejected": -0.5484018325805664, "loss": 2.988, "rewards/accuracies": 0.53125, "rewards/chosen": -4.332143306732178, "rewards/margins": 1.1518752574920654, "rewards/rejected": -5.484018802642822, "step": 265 }, { "epoch": 0.576, "grad_norm": 22.40687761363269, "learning_rate": 4.534287567596188e-07, "logits/chosen": -1.2679193019866943, "logits/rejected": -1.2968288660049438, "logps/chosen": -0.406146377325058, "logps/rejected": -0.4383586049079895, "loss": 2.9959, "rewards/accuracies": 0.5, "rewards/chosen": -4.061463356018066, "rewards/margins": 0.3221224844455719, "rewards/rejected": -4.3835859298706055, "step": 270 }, { "epoch": 0.5866666666666667, "grad_norm": 20.942148890287285, "learning_rate": 4.348910398555249e-07, "logits/chosen": -1.2546627521514893, "logits/rejected": -1.2904515266418457, "logps/chosen": -0.4170510172843933, "logps/rejected": -0.459602415561676, "loss": 3.0177, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.1705098152160645, "rewards/margins": 0.4255140423774719, "rewards/rejected": -4.596024036407471, "step": 275 }, { "epoch": 0.5973333333333334, "grad_norm": 20.460111659042415, "learning_rate": 4.1644395169072575e-07, "logits/chosen": -1.235873818397522, "logits/rejected": -1.264666199684143, "logps/chosen": -0.3890388011932373, "logps/rejected": -0.3938077986240387, "loss": 3.0016, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.890387773513794, "rewards/margins": 0.04768957570195198, "rewards/rejected": -3.938077926635742, "step": 280 }, { "epoch": 0.608, "grad_norm": 19.621676810019572, "learning_rate": 3.9811316978088615e-07, "logits/chosen": -1.2465546131134033, "logits/rejected": -1.2484326362609863, "logps/chosen": -0.4200409948825836, "logps/rejected": -0.4409112334251404, "loss": 2.9329, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.200409889221191, "rewards/margins": 0.20870289206504822, "rewards/rejected": -4.409112453460693, "step": 285 }, { "epoch": 0.6186666666666667, "grad_norm": 21.426484978013306, "learning_rate": 3.799242097486038e-07, "logits/chosen": -1.313291072845459, "logits/rejected": -1.3121254444122314, "logps/chosen": -0.45259523391723633, "logps/rejected": -0.48710495233535767, "loss": 2.9039, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -4.525952339172363, "rewards/margins": 0.3450973331928253, "rewards/rejected": -4.871049404144287, "step": 290 }, { "epoch": 0.6293333333333333, "grad_norm": 19.38424744063439, "learning_rate": 3.619023898068123e-07, "logits/chosen": -1.2923152446746826, "logits/rejected": -1.304937720298767, "logps/chosen": -0.41213172674179077, "logps/rejected": -0.46351146697998047, "loss": 2.9142, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -4.121316909790039, "rewards/margins": 0.5137982964515686, "rewards/rejected": -4.635115146636963, "step": 295 }, { "epoch": 0.64, "grad_norm": 18.14370228888205, "learning_rate": 3.4407279551696846e-07, "logits/chosen": -1.2836002111434937, "logits/rejected": -1.309725046157837, "logps/chosen": -0.4251108169555664, "logps/rejected": -0.44700655341148376, "loss": 2.9205, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.251107215881348, "rewards/margins": 0.21895785629749298, "rewards/rejected": -4.470065593719482, "step": 300 }, { "epoch": 0.6506666666666666, "grad_norm": 18.496567614089848, "learning_rate": 3.2646024487108213e-07, "logits/chosen": -1.284744381904602, "logits/rejected": -1.2811096906661987, "logps/chosen": -0.4097173810005188, "logps/rejected": -0.4487836956977844, "loss": 2.9462, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -4.097173690795898, "rewards/margins": 0.39066314697265625, "rewards/rejected": -4.487837314605713, "step": 305 }, { "epoch": 0.6613333333333333, "grad_norm": 19.205847653386794, "learning_rate": 3.0908925374618887e-07, "logits/chosen": -1.26366126537323, "logits/rejected": -1.2661869525909424, "logps/chosen": -0.4183129668235779, "logps/rejected": -0.44671106338500977, "loss": 2.9514, "rewards/accuracies": 0.5625, "rewards/chosen": -4.18312931060791, "rewards/margins": 0.2839811444282532, "rewards/rejected": -4.467110633850098, "step": 310 }, { "epoch": 0.672, "grad_norm": 20.240569305536415, "learning_rate": 2.91984001779353e-07, "logits/chosen": -1.212679386138916, "logits/rejected": -1.2541916370391846, "logps/chosen": -0.40088850259780884, "logps/rejected": -0.4241718351840973, "loss": 2.884, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.008884906768799, "rewards/margins": 0.23283371329307556, "rewards/rejected": -4.241718292236328, "step": 315 }, { "epoch": 0.6826666666666666, "grad_norm": 19.477994840917976, "learning_rate": 2.751682987107029e-07, "logits/chosen": -1.3080363273620605, "logits/rejected": -1.33048415184021, "logps/chosen": -0.4709502160549164, "logps/rejected": -0.48716697096824646, "loss": 3.0093, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -4.709502220153809, "rewards/margins": 0.16216790676116943, "rewards/rejected": -4.871669769287109, "step": 320 }, { "epoch": 0.6933333333333334, "grad_norm": 19.392408354902603, "learning_rate": 2.5866555124134577e-07, "logits/chosen": -1.247816801071167, "logits/rejected": -1.2808668613433838, "logps/chosen": -0.40554946660995483, "logps/rejected": -0.41697534918785095, "loss": 2.9083, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.055495262145996, "rewards/margins": 0.11425850540399551, "rewards/rejected": -4.169753551483154, "step": 325 }, { "epoch": 0.704, "grad_norm": 24.255125187042516, "learning_rate": 2.424987304522924e-07, "logits/chosen": -1.2667433023452759, "logits/rejected": -1.2772916555404663, "logps/chosen": -0.4439450800418854, "logps/rejected": -0.44936904311180115, "loss": 3.0007, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.439450740814209, "rewards/margins": 0.05423973873257637, "rewards/rejected": -4.493690490722656, "step": 330 }, { "epoch": 0.7146666666666667, "grad_norm": 18.66997866326468, "learning_rate": 2.2669033982974944e-07, "logits/chosen": -1.2586668729782104, "logits/rejected": -1.2690373659133911, "logps/chosen": -0.4035263955593109, "logps/rejected": -0.4696914553642273, "loss": 2.9202, "rewards/accuracies": 0.59375, "rewards/chosen": -4.035264015197754, "rewards/margins": 0.6616507172584534, "rewards/rejected": -4.6969146728515625, "step": 335 }, { "epoch": 0.7253333333333334, "grad_norm": 19.194048967943004, "learning_rate": 2.1126238394127867e-07, "logits/chosen": -1.2616932392120361, "logits/rejected": -1.301735281944275, "logps/chosen": -0.4147927165031433, "logps/rejected": -0.4699038863182068, "loss": 2.9179, "rewards/accuracies": 0.53125, "rewards/chosen": -4.147927284240723, "rewards/margins": 0.5511118173599243, "rewards/rejected": -4.699038505554199, "step": 340 }, { "epoch": 0.736, "grad_norm": 20.972738719646298, "learning_rate": 1.9623633780643155e-07, "logits/chosen": -1.206508994102478, "logits/rejected": -1.2490206956863403, "logps/chosen": -0.3681824505329132, "logps/rejected": -0.3938142657279968, "loss": 2.9287, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.6818244457244873, "rewards/margins": 0.25631803274154663, "rewards/rejected": -3.9381422996520996, "step": 345 }, { "epoch": 0.7466666666666667, "grad_norm": 19.685667725326027, "learning_rate": 1.8163311700448898e-07, "logits/chosen": -1.2758309841156006, "logits/rejected": -1.298648476600647, "logps/chosen": -0.41422510147094727, "logps/rejected": -0.4361799359321594, "loss": 2.9367, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.142251014709473, "rewards/margins": 0.2195480316877365, "rewards/rejected": -4.3617987632751465, "step": 350 }, { "epoch": 0.7573333333333333, "grad_norm": 18.702359306429475, "learning_rate": 1.674730485609166e-07, "logits/chosen": -1.2573009729385376, "logits/rejected": -1.2752147912979126, "logps/chosen": -0.4381086230278015, "logps/rejected": -0.47269415855407715, "loss": 3.0636, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.381086349487305, "rewards/margins": 0.34585532546043396, "rewards/rejected": -4.7269415855407715, "step": 355 }, { "epoch": 0.768, "grad_norm": 19.15975497260745, "learning_rate": 1.537758426530622e-07, "logits/chosen": -1.2939916849136353, "logits/rejected": -1.2921050786972046, "logps/chosen": -0.4421107769012451, "logps/rejected": -0.4742702543735504, "loss": 2.9573, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.421108245849609, "rewards/margins": 0.3215946853160858, "rewards/rejected": -4.742702960968018, "step": 360 }, { "epoch": 0.7786666666666666, "grad_norm": 20.412109989540898, "learning_rate": 1.4056056517447634e-07, "logits/chosen": -1.2623586654663086, "logits/rejected": -1.2932734489440918, "logps/chosen": -0.4066384732723236, "logps/rejected": -0.44183507561683655, "loss": 2.9361, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.066384315490723, "rewards/margins": 0.3519664406776428, "rewards/rejected": -4.418351173400879, "step": 365 }, { "epoch": 0.7893333333333333, "grad_norm": 21.56479851024075, "learning_rate": 1.2784561119604682e-07, "logits/chosen": -1.2890852689743042, "logits/rejected": -1.2910048961639404, "logps/chosen": -0.4455583691596985, "logps/rejected": -0.5098209381103516, "loss": 2.96, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.455583572387695, "rewards/margins": 0.642626166343689, "rewards/rejected": -5.098209857940674, "step": 370 }, { "epoch": 0.8, "grad_norm": 18.271145361200055, "learning_rate": 1.156486793608899e-07, "logits/chosen": -1.3208215236663818, "logits/rejected": -1.3235687017440796, "logps/chosen": -0.41836491227149963, "logps/rejected": -0.45064491033554077, "loss": 2.9402, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.183648586273193, "rewards/margins": 0.32280001044273376, "rewards/rejected": -4.506449222564697, "step": 375 }, { "epoch": 0.8106666666666666, "grad_norm": 19.953182254510097, "learning_rate": 1.0398674724863581e-07, "logits/chosen": -1.3136857748031616, "logits/rejected": -1.3123706579208374, "logps/chosen": -0.4310145974159241, "logps/rejected": -0.49033123254776, "loss": 2.9092, "rewards/accuracies": 0.5625, "rewards/chosen": -4.310145378112793, "rewards/margins": 0.5931666493415833, "rewards/rejected": -4.9033122062683105, "step": 380 }, { "epoch": 0.8213333333333334, "grad_norm": 20.59979411954975, "learning_rate": 9.287604774340235e-08, "logits/chosen": -1.2864879369735718, "logits/rejected": -1.3099608421325684, "logps/chosen": -0.4408188760280609, "logps/rejected": -0.4660406708717346, "loss": 2.9032, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -4.408188819885254, "rewards/margins": 0.2522173821926117, "rewards/rejected": -4.660406112670898, "step": 385 }, { "epoch": 0.832, "grad_norm": 21.750100354501434, "learning_rate": 8.233204643835234e-08, "logits/chosen": -1.2784446477890015, "logits/rejected": -1.3241978883743286, "logps/chosen": -0.41723695397377014, "logps/rejected": -0.48806482553482056, "loss": 2.8978, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.172369480133057, "rewards/margins": 0.708278477191925, "rewards/rejected": -4.880647659301758, "step": 390 }, { "epoch": 0.8426666666666667, "grad_norm": 20.384595741085338, "learning_rate": 7.236942010828429e-08, "logits/chosen": -1.3349159955978394, "logits/rejected": -1.3055683374404907, "logps/chosen": -0.43574801087379456, "logps/rejected": -0.4621052145957947, "loss": 2.9866, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -4.357480525970459, "rewards/margins": 0.26357167959213257, "rewards/rejected": -4.621052265167236, "step": 395 }, { "epoch": 0.8533333333333334, "grad_norm": 18.501766671703546, "learning_rate": 6.300203628022271e-08, "logits/chosen": -1.248923897743225, "logits/rejected": -1.2598222494125366, "logps/chosen": -0.38643878698349, "logps/rejected": -0.39453864097595215, "loss": 2.9395, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.8643882274627686, "rewards/margins": 0.08099845796823502, "rewards/rejected": -3.9453868865966797, "step": 400 }, { "epoch": 0.864, "grad_norm": 19.839337640876096, "learning_rate": 5.42429339304461e-08, "logits/chosen": -1.2822444438934326, "logits/rejected": -1.3219239711761475, "logps/chosen": -0.43799424171447754, "logps/rejected": -0.4629386067390442, "loss": 3.0149, "rewards/accuracies": 0.5, "rewards/chosen": -4.379942417144775, "rewards/margins": 0.24944381415843964, "rewards/rejected": -4.629385948181152, "step": 405 }, { "epoch": 0.8746666666666667, "grad_norm": 18.208971956367737, "learning_rate": 4.610430533481857e-08, "logits/chosen": -1.266296148300171, "logits/rejected": -1.303065538406372, "logps/chosen": -0.4157203137874603, "logps/rejected": -0.47199106216430664, "loss": 2.8876, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.15720272064209, "rewards/margins": 0.5627071857452393, "rewards/rejected": -4.719910144805908, "step": 410 }, { "epoch": 0.8853333333333333, "grad_norm": 19.131190141361028, "learning_rate": 3.859747909769162e-08, "logits/chosen": -1.2796621322631836, "logits/rejected": -1.315230131149292, "logps/chosen": -0.4193892478942871, "logps/rejected": -0.48347288370132446, "loss": 2.8209, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.193892478942871, "rewards/margins": 0.6408361196517944, "rewards/rejected": -4.834728240966797, "step": 415 }, { "epoch": 0.896, "grad_norm": 19.4485490322355, "learning_rate": 3.173290438299697e-08, "logits/chosen": -1.295986294746399, "logits/rejected": -1.3151133060455322, "logps/chosen": -0.4371556341648102, "logps/rejected": -0.4686927795410156, "loss": 2.96, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.371556758880615, "rewards/margins": 0.31537097692489624, "rewards/rejected": -4.686927318572998, "step": 420 }, { "epoch": 0.9066666666666666, "grad_norm": 20.219307796360535, "learning_rate": 2.5520136369481194e-08, "logits/chosen": -1.2928402423858643, "logits/rejected": -1.3205971717834473, "logps/chosen": -0.46112942695617676, "logps/rejected": -0.51966392993927, "loss": 2.948, "rewards/accuracies": 0.53125, "rewards/chosen": -4.611294746398926, "rewards/margins": 0.5853451490402222, "rewards/rejected": -5.196639060974121, "step": 425 }, { "epoch": 0.9173333333333333, "grad_norm": 20.610456554777507, "learning_rate": 1.996782295032745e-08, "logits/chosen": -1.3009055852890015, "logits/rejected": -1.3030388355255127, "logps/chosen": -0.40775948762893677, "logps/rejected": -0.4260484576225281, "loss": 2.9981, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.07759428024292, "rewards/margins": 0.18288996815681458, "rewards/rejected": -4.260484218597412, "step": 430 }, { "epoch": 0.928, "grad_norm": 20.47490770082739, "learning_rate": 1.508369269567783e-08, "logits/chosen": -1.2779980897903442, "logits/rejected": -1.286833643913269, "logps/chosen": -0.4154781401157379, "logps/rejected": -0.45970821380615234, "loss": 2.9325, "rewards/accuracies": 0.5, "rewards/chosen": -4.154781341552734, "rewards/margins": 0.44230085611343384, "rewards/rejected": -4.597081661224365, "step": 435 }, { "epoch": 0.9386666666666666, "grad_norm": 21.154945951171282, "learning_rate": 1.0874544094811422e-08, "logits/chosen": -1.3095712661743164, "logits/rejected": -1.328394889831543, "logps/chosen": -0.46411657333374023, "logps/rejected": -0.5013660192489624, "loss": 2.9036, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -4.641165733337402, "rewards/margins": 0.3724942207336426, "rewards/rejected": -5.013659954071045, "step": 440 }, { "epoch": 0.9493333333333334, "grad_norm": 19.52631450128395, "learning_rate": 7.346236092954316e-09, "logits/chosen": -1.316333532333374, "logits/rejected": -1.3282887935638428, "logps/chosen": -0.4388408064842224, "logps/rejected": -0.479898065328598, "loss": 2.9824, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -4.388408184051514, "rewards/margins": 0.4105720520019531, "rewards/rejected": -4.798980712890625, "step": 445 }, { "epoch": 0.96, "grad_norm": 20.824863365709117, "learning_rate": 4.50367993589107e-09, "logits/chosen": -1.316052794456482, "logits/rejected": -1.3359907865524292, "logps/chosen": -0.4431839883327484, "logps/rejected": -0.4758357107639313, "loss": 3.0019, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -4.431839942932129, "rewards/margins": 0.32651659846305847, "rewards/rejected": -4.758356094360352, "step": 450 }, { "epoch": 0.9706666666666667, "grad_norm": 20.63831130072036, "learning_rate": 2.3508323337321224e-09, "logits/chosen": -1.3009384870529175, "logits/rejected": -1.3190171718597412, "logps/chosen": -0.4200662672519684, "logps/rejected": -0.4521716237068176, "loss": 2.9499, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.200663089752197, "rewards/margins": 0.3210526704788208, "rewards/rejected": -4.5217156410217285, "step": 455 }, { "epoch": 0.9813333333333333, "grad_norm": 22.995746894078476, "learning_rate": 8.906899533517864e-10, "logits/chosen": -1.2918171882629395, "logits/rejected": -1.2942984104156494, "logps/chosen": -0.4635673463344574, "logps/rejected": -0.4919341504573822, "loss": 2.9469, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -4.635673999786377, "rewards/margins": 0.283668577671051, "rewards/rejected": -4.919342041015625, "step": 460 }, { "epoch": 0.992, "grad_norm": 23.572574549303816, "learning_rate": 1.252852471625987e-10, "logits/chosen": -1.2587887048721313, "logits/rejected": -1.2847938537597656, "logps/chosen": -0.4143844246864319, "logps/rejected": -0.4563868045806885, "loss": 2.9443, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -4.143843650817871, "rewards/margins": 0.42002400755882263, "rewards/rejected": -4.563868522644043, "step": 465 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 181044392755200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }