{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 400, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010683760683760684, "grad_norm": 18.387579925622582, "learning_rate": 8.51063829787234e-08, "logits/chosen": -1.4812500476837158, "logits/rejected": -1.4343750476837158, "logps/chosen": -0.81640625, "logps/rejected": -0.850390613079071, "loss": 1.7676, "nll_loss": 0.0, "rewards/accuracies": 0.4671874940395355, "rewards/chosen": -2.0406250953674316, "rewards/margins": 0.08261718600988388, "rewards/rejected": -2.1226563453674316, "step": 5 }, { "epoch": 0.021367521367521368, "grad_norm": 21.659560893283086, "learning_rate": 1.9148936170212765e-07, "logits/chosen": -1.43359375, "logits/rejected": -1.396875023841858, "logps/chosen": -0.7269531488418579, "logps/rejected": -0.7250000238418579, "loss": 1.7614, "nll_loss": 0.0, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.818750023841858, "rewards/margins": -0.007250976748764515, "rewards/rejected": -1.810937523841858, "step": 10 }, { "epoch": 0.03205128205128205, "grad_norm": 14.389258634685925, "learning_rate": 2.978723404255319e-07, "logits/chosen": -1.489843726158142, "logits/rejected": -1.4617187976837158, "logps/chosen": -0.73046875, "logps/rejected": -0.7445312738418579, "loss": 1.6841, "nll_loss": 0.0, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.825781226158142, "rewards/margins": 0.03479614108800888, "rewards/rejected": -1.860937476158142, "step": 15 }, { "epoch": 0.042735042735042736, "grad_norm": 8.572904416071315, "learning_rate": 4.0425531914893614e-07, "logits/chosen": -1.4171874523162842, "logits/rejected": -1.403906226158142, "logps/chosen": -0.71875, "logps/rejected": -0.80859375, "loss": 1.6713, "nll_loss": 0.0, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -1.7960937023162842, "rewards/margins": 0.22412109375, "rewards/rejected": -2.0218749046325684, "step": 20 }, { "epoch": 0.053418803418803416, "grad_norm": 12.085534263291942, "learning_rate": 5.106382978723403e-07, "logits/chosen": -1.471093773841858, "logits/rejected": -1.44921875, "logps/chosen": -0.6435546875, "logps/rejected": -0.68359375, "loss": 1.6299, "nll_loss": 0.0, "rewards/accuracies": 0.526562511920929, "rewards/chosen": -1.610937476158142, "rewards/margins": 0.09737396240234375, "rewards/rejected": -1.7078125476837158, "step": 25 }, { "epoch": 0.0641025641025641, "grad_norm": 14.051949061509276, "learning_rate": 6.170212765957446e-07, "logits/chosen": -1.46484375, "logits/rejected": -1.4367187023162842, "logps/chosen": -0.5189453363418579, "logps/rejected": -0.5455077886581421, "loss": 1.5902, "nll_loss": 0.0, "rewards/accuracies": 0.4984374940395355, "rewards/chosen": -1.296875, "rewards/margins": 0.06621094048023224, "rewards/rejected": -1.3640625476837158, "step": 30 }, { "epoch": 0.07478632478632478, "grad_norm": 15.304062972124502, "learning_rate": 7.23404255319149e-07, "logits/chosen": -1.497656226158142, "logits/rejected": -1.46875, "logps/chosen": -0.4546875059604645, "logps/rejected": -0.47148436307907104, "loss": 1.6006, "nll_loss": 0.0, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.135156273841858, "rewards/margins": 0.04415283352136612, "rewards/rejected": -1.1796875, "step": 35 }, { "epoch": 0.08547008547008547, "grad_norm": 19.104391719681082, "learning_rate": 8.297872340425532e-07, "logits/chosen": -1.5343749523162842, "logits/rejected": -1.485937476158142, "logps/chosen": -0.41132813692092896, "logps/rejected": -0.4263671934604645, "loss": 1.5897, "nll_loss": 0.0, "rewards/accuracies": 0.520312488079071, "rewards/chosen": -1.0285155773162842, "rewards/margins": 0.03793945163488388, "rewards/rejected": -1.06640625, "step": 40 }, { "epoch": 0.09615384615384616, "grad_norm": 8.645908098858397, "learning_rate": 9.361702127659575e-07, "logits/chosen": -1.557031273841858, "logits/rejected": -1.52734375, "logps/chosen": -0.3822265565395355, "logps/rejected": -0.4048828184604645, "loss": 1.5691, "nll_loss": 0.0, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.955078125, "rewards/margins": 0.05756836012005806, "rewards/rejected": -1.0128905773162842, "step": 45 }, { "epoch": 0.10683760683760683, "grad_norm": 9.00331703799887, "learning_rate": 9.999443163759668e-07, "logits/chosen": -1.497656226158142, "logits/rejected": -1.4695312976837158, "logps/chosen": -0.3783203065395355, "logps/rejected": -0.421875, "loss": 1.5388, "nll_loss": 0.0, "rewards/accuracies": 0.535937488079071, "rewards/chosen": -0.946093738079071, "rewards/margins": 0.10857544094324112, "rewards/rejected": -1.055078148841858, "step": 50 }, { "epoch": 0.11752136752136752, "grad_norm": 12.767356159141897, "learning_rate": 9.993180180337126e-07, "logits/chosen": -1.4953124523162842, "logits/rejected": -1.46875, "logps/chosen": -0.3998046815395355, "logps/rejected": -0.4404296875, "loss": 1.5484, "nll_loss": 0.0, "rewards/accuracies": 0.5296875238418579, "rewards/chosen": -1.0007812976837158, "rewards/margins": 0.10200195014476776, "rewards/rejected": -1.1015625, "step": 55 }, { "epoch": 0.1282051282051282, "grad_norm": 10.195545822550427, "learning_rate": 9.979966915051516e-07, "logits/chosen": -1.52734375, "logits/rejected": -1.4914062023162842, "logps/chosen": -0.41523438692092896, "logps/rejected": -0.4541015625, "loss": 1.5552, "nll_loss": 0.0, "rewards/accuracies": 0.5390625, "rewards/chosen": -1.0398437976837158, "rewards/margins": 0.0967559814453125, "rewards/rejected": -1.135156273841858, "step": 60 }, { "epoch": 0.1388888888888889, "grad_norm": 10.716342096548614, "learning_rate": 9.959821760172848e-07, "logits/chosen": -1.506250023841858, "logits/rejected": -1.466406226158142, "logps/chosen": -0.3990234434604645, "logps/rejected": -0.44023436307907104, "loss": 1.5528, "nll_loss": 0.0, "rewards/accuracies": 0.5, "rewards/chosen": -0.999218761920929, "rewards/margins": 0.10208740085363388, "rewards/rejected": -1.099609375, "step": 65 }, { "epoch": 0.14957264957264957, "grad_norm": 18.26340093274305, "learning_rate": 9.932772756849152e-07, "logits/chosen": -1.532812476158142, "logits/rejected": -1.49609375, "logps/chosen": -0.4326171875, "logps/rejected": -0.49980467557907104, "loss": 1.5508, "nll_loss": 0.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.080468773841858, "rewards/margins": 0.16822509467601776, "rewards/rejected": -1.25, "step": 70 }, { "epoch": 0.16025641025641027, "grad_norm": 21.259278645333858, "learning_rate": 9.898857556074466e-07, "logits/chosen": -1.5671875476837158, "logits/rejected": -1.549218773841858, "logps/chosen": -0.44921875, "logps/rejected": -0.5394531488418579, "loss": 1.505, "nll_loss": 0.0, "rewards/accuracies": 0.59375, "rewards/chosen": -1.123437523841858, "rewards/margins": 0.22548827528953552, "rewards/rejected": -1.3484375476837158, "step": 75 }, { "epoch": 0.17094017094017094, "grad_norm": 12.244653076137343, "learning_rate": 9.858123366280356e-07, "logits/chosen": -1.588281273841858, "logits/rejected": -1.572656273841858, "logps/chosen": -0.4935546815395355, "logps/rejected": -0.571093738079071, "loss": 1.5305, "nll_loss": 0.0, "rewards/accuracies": 0.5234375, "rewards/chosen": -1.232812523841858, "rewards/margins": 0.19204100966453552, "rewards/rejected": -1.4265625476837158, "step": 80 }, { "epoch": 0.18162393162393162, "grad_norm": 10.444731750558073, "learning_rate": 9.810626887623898e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.5109374523162842, "logps/chosen": -0.4673828184604645, "logps/rejected": -0.542773425579071, "loss": 1.5526, "nll_loss": 0.0, "rewards/accuracies": 0.526562511920929, "rewards/chosen": -1.170312523841858, "rewards/margins": 0.18715819716453552, "rewards/rejected": -1.357031226158142, "step": 85 }, { "epoch": 0.19230769230769232, "grad_norm": 10.370326525798975, "learning_rate": 9.756434233063615e-07, "logits/chosen": -1.525781273841858, "logits/rejected": -1.489843726158142, "logps/chosen": -0.40058594942092896, "logps/rejected": -0.46113282442092896, "loss": 1.5482, "nll_loss": 0.0, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -1.001562476158142, "rewards/margins": 0.15218810737133026, "rewards/rejected": -1.1535155773162842, "step": 90 }, { "epoch": 0.202991452991453, "grad_norm": 11.39793809106751, "learning_rate": 9.695620836333219e-07, "logits/chosen": -1.575781226158142, "logits/rejected": -1.5671875476837158, "logps/chosen": -0.3656249940395355, "logps/rejected": -0.4404296875, "loss": 1.5179, "nll_loss": 0.0, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.9140625, "rewards/margins": 0.18659667670726776, "rewards/rejected": -1.1007812023162842, "step": 95 }, { "epoch": 0.21367521367521367, "grad_norm": 13.822852555772993, "learning_rate": 9.628271346941252e-07, "logits/chosen": -1.564062476158142, "logits/rejected": -1.517968773841858, "logps/chosen": -0.3890624940395355, "logps/rejected": -0.4488281309604645, "loss": 1.5532, "nll_loss": 0.0, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.9730468988418579, "rewards/margins": 0.14824219048023224, "rewards/rejected": -1.120703101158142, "step": 100 }, { "epoch": 0.22435897435897437, "grad_norm": 12.7396003923963, "learning_rate": 9.554479512342783e-07, "logits/chosen": -1.533593773841858, "logits/rejected": -1.498437523841858, "logps/chosen": -0.3792968690395355, "logps/rejected": -0.4404296875, "loss": 1.538, "nll_loss": 0.0, "rewards/accuracies": 0.542187511920929, "rewards/chosen": -0.9476562738418579, "rewards/margins": 0.15297850966453552, "rewards/rejected": -1.1003906726837158, "step": 105 }, { "epoch": 0.23504273504273504, "grad_norm": 21.261402234301844, "learning_rate": 9.474348047447176e-07, "logits/chosen": -1.568750023841858, "logits/rejected": -1.5234375, "logps/chosen": -0.38749998807907104, "logps/rejected": -0.43574219942092896, "loss": 1.546, "nll_loss": 0.0, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.967578113079071, "rewards/margins": 0.12104491889476776, "rewards/rejected": -1.0890624523162842, "step": 110 }, { "epoch": 0.24572649572649571, "grad_norm": 17.398796131106096, "learning_rate": 9.387988491643557e-07, "logits/chosen": -1.55859375, "logits/rejected": -1.540624976158142, "logps/chosen": -0.4195312559604645, "logps/rejected": -0.503710925579071, "loss": 1.5278, "nll_loss": 0.0, "rewards/accuracies": 0.5625, "rewards/chosen": -1.048437476158142, "rewards/margins": 0.21040038764476776, "rewards/rejected": -1.2585937976837158, "step": 115 }, { "epoch": 0.2564102564102564, "grad_norm": 11.87400607253576, "learning_rate": 9.295521053543019e-07, "logits/chosen": -1.583593726158142, "logits/rejected": -1.571874976158142, "logps/chosen": -0.439453125, "logps/rejected": -0.5162109136581421, "loss": 1.519, "nll_loss": 0.0, "rewards/accuracies": 0.573437511920929, "rewards/chosen": -1.0988280773162842, "rewards/margins": 0.19086913764476776, "rewards/rejected": -1.2882812023162842, "step": 120 }, { "epoch": 0.2670940170940171, "grad_norm": 11.436299073464317, "learning_rate": 9.197074443653642e-07, "logits/chosen": -1.58984375, "logits/rejected": -1.56640625, "logps/chosen": -0.5277343988418579, "logps/rejected": -0.623828113079071, "loss": 1.5477, "nll_loss": 0.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.318750023841858, "rewards/margins": 0.24033813178539276, "rewards/rejected": -1.5593750476837158, "step": 125 }, { "epoch": 0.2777777777777778, "grad_norm": 14.626979030522325, "learning_rate": 9.09278569522127e-07, "logits/chosen": -1.5968749523162842, "logits/rejected": -1.546875, "logps/chosen": -0.48027342557907104, "logps/rejected": -0.6021484136581421, "loss": 1.4972, "nll_loss": 0.0, "rewards/accuracies": 0.5625, "rewards/chosen": -1.200781226158142, "rewards/margins": 0.304443359375, "rewards/rejected": -1.505468726158142, "step": 130 }, { "epoch": 0.28846153846153844, "grad_norm": 11.845040694170025, "learning_rate": 8.982799973485406e-07, "logits/chosen": -1.5703125, "logits/rejected": -1.536718726158142, "logps/chosen": -0.4761718809604645, "logps/rejected": -0.6029297113418579, "loss": 1.4762, "nll_loss": 0.0, "rewards/accuracies": 0.5765625238418579, "rewards/chosen": -1.191015601158142, "rewards/margins": 0.31770020723342896, "rewards/rejected": -1.509374976158142, "step": 135 }, { "epoch": 0.29914529914529914, "grad_norm": 13.92232790418488, "learning_rate": 8.867270373615734e-07, "logits/chosen": -1.5460937023162842, "logits/rejected": -1.521875023841858, "logps/chosen": -0.4703125059604645, "logps/rejected": -0.5712890625, "loss": 1.5045, "nll_loss": 0.0, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -1.1749999523162842, "rewards/margins": 0.2523437440395355, "rewards/rejected": -1.4289062023162842, "step": 140 }, { "epoch": 0.30982905982905984, "grad_norm": 12.635415703289459, "learning_rate": 8.746357707610543e-07, "logits/chosen": -1.5359375476837158, "logits/rejected": -1.505468726158142, "logps/chosen": -0.4593749940395355, "logps/rejected": -0.563671886920929, "loss": 1.5233, "nll_loss": 0.0, "rewards/accuracies": 0.582812488079071, "rewards/chosen": -1.1476562023162842, "rewards/margins": 0.25996094942092896, "rewards/rejected": -1.4093749523162842, "step": 145 }, { "epoch": 0.32051282051282054, "grad_norm": 11.154844952921092, "learning_rate": 8.620230280453671e-07, "logits/chosen": -1.5203125476837158, "logits/rejected": -1.4921875, "logps/chosen": -0.48515623807907104, "logps/rejected": -0.586718738079071, "loss": 1.4909, "nll_loss": 0.0, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -1.213281273841858, "rewards/margins": 0.2548828125, "rewards/rejected": -1.46875, "step": 150 }, { "epoch": 0.3311965811965812, "grad_norm": 18.4281131810382, "learning_rate": 8.48906365584155e-07, "logits/chosen": -1.529687523841858, "logits/rejected": -1.4921875, "logps/chosen": -0.4828124940395355, "logps/rejected": -0.6187499761581421, "loss": 1.4699, "nll_loss": 0.0, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.2082030773162842, "rewards/margins": 0.3387695252895355, "rewards/rejected": -1.545312523841858, "step": 155 }, { "epoch": 0.3418803418803419, "grad_norm": 13.615385650176293, "learning_rate": 8.353040411806447e-07, "logits/chosen": -1.544531226158142, "logits/rejected": -1.5125000476837158, "logps/chosen": -0.5599609613418579, "logps/rejected": -0.6734374761581421, "loss": 1.5053, "nll_loss": 0.0, "rewards/accuracies": 0.5609375238418579, "rewards/chosen": -1.3992187976837158, "rewards/margins": 0.2828125059604645, "rewards/rejected": -1.6828124523162842, "step": 160 }, { "epoch": 0.3525641025641026, "grad_norm": 26.41527109835978, "learning_rate": 8.212349886576069e-07, "logits/chosen": -1.600000023841858, "logits/rejected": -1.5515625476837158, "logps/chosen": -0.535937488079071, "logps/rejected": -0.670703113079071, "loss": 1.477, "nll_loss": 0.0, "rewards/accuracies": 0.604687511920929, "rewards/chosen": -1.33984375, "rewards/margins": 0.3355468809604645, "rewards/rejected": -1.6749999523162842, "step": 165 }, { "epoch": 0.36324786324786323, "grad_norm": 13.57171264424383, "learning_rate": 8.067187915023281e-07, "logits/chosen": -1.553125023841858, "logits/rejected": -1.5265624523162842, "logps/chosen": -0.5589843988418579, "logps/rejected": -0.6937500238418579, "loss": 1.471, "nll_loss": 0.0, "rewards/accuracies": 0.567187488079071, "rewards/chosen": -1.3984375, "rewards/margins": 0.3372802734375, "rewards/rejected": -1.735937476158142, "step": 170 }, { "epoch": 0.37393162393162394, "grad_norm": 19.506176492341762, "learning_rate": 7.91775655607279e-07, "logits/chosen": -1.560156226158142, "logits/rejected": -1.541406273841858, "logps/chosen": -0.583984375, "logps/rejected": -0.7816406488418579, "loss": 1.4303, "nll_loss": 0.0, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4617187976837158, "rewards/margins": 0.49335938692092896, "rewards/rejected": -1.954687476158142, "step": 175 }, { "epoch": 0.38461538461538464, "grad_norm": 21.640230091832745, "learning_rate": 7.764263811444214e-07, "logits/chosen": -1.5851562023162842, "logits/rejected": -1.5539062023162842, "logps/chosen": -0.5960937738418579, "logps/rejected": -0.7250000238418579, "loss": 1.4788, "nll_loss": 0.0, "rewards/accuracies": 0.598437488079071, "rewards/chosen": -1.490625023841858, "rewards/margins": 0.322265625, "rewards/rejected": -1.8125, "step": 180 }, { "epoch": 0.3952991452991453, "grad_norm": 17.283034944275915, "learning_rate": 7.606923336123069e-07, "logits/chosen": -1.546875, "logits/rejected": -1.52734375, "logps/chosen": -0.6265624761581421, "logps/rejected": -0.7578125, "loss": 1.5019, "nll_loss": 0.0, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.5656249523162842, "rewards/margins": 0.328369140625, "rewards/rejected": -1.89453125, "step": 185 }, { "epoch": 0.405982905982906, "grad_norm": 16.927291550124732, "learning_rate": 7.445954140962649e-07, "logits/chosen": -1.595312476158142, "logits/rejected": -1.564843773841858, "logps/chosen": -0.592578113079071, "logps/rejected": -0.792187511920929, "loss": 1.4263, "nll_loss": 0.0, "rewards/accuracies": 0.629687488079071, "rewards/chosen": -1.4835937023162842, "rewards/margins": 0.49609375, "rewards/rejected": -1.9796874523162842, "step": 190 }, { "epoch": 0.4166666666666667, "grad_norm": 19.885105063023072, "learning_rate": 7.28158028783079e-07, "logits/chosen": -1.5578124523162842, "logits/rejected": -1.5304687023162842, "logps/chosen": -0.60546875, "logps/rejected": -0.7406250238418579, "loss": 1.4749, "nll_loss": 0.0, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -1.513281226158142, "rewards/margins": 0.33740234375, "rewards/rejected": -1.8523437976837158, "step": 195 }, { "epoch": 0.42735042735042733, "grad_norm": 16.181127748785606, "learning_rate": 7.114030577725835e-07, "logits/chosen": -1.576562523841858, "logits/rejected": -1.541406273841858, "logps/chosen": -0.6292968988418579, "logps/rejected": -0.7671874761581421, "loss": 1.4646, "nll_loss": 0.0, "rewards/accuracies": 0.5703125, "rewards/chosen": -1.5734374523162842, "rewards/margins": 0.34318846464157104, "rewards/rejected": -1.9171874523162842, "step": 200 }, { "epoch": 0.43803418803418803, "grad_norm": 20.82169481298227, "learning_rate": 6.943538232295964e-07, "logits/chosen": -1.607812523841858, "logits/rejected": -1.5859375, "logps/chosen": -0.665234386920929, "logps/rejected": -0.823437511920929, "loss": 1.4523, "nll_loss": 0.0, "rewards/accuracies": 0.6109374761581421, "rewards/chosen": -1.662500023841858, "rewards/margins": 0.3946289122104645, "rewards/rejected": -2.0562500953674316, "step": 205 }, { "epoch": 0.44871794871794873, "grad_norm": 25.003069519753712, "learning_rate": 6.770340569205157e-07, "logits/chosen": -1.6203124523162842, "logits/rejected": -1.5812499523162842, "logps/chosen": -0.7738281488418579, "logps/rejected": -0.957812488079071, "loss": 1.4483, "nll_loss": 0.0, "rewards/accuracies": 0.620312511920929, "rewards/chosen": -1.9343750476837158, "rewards/margins": 0.4595703184604645, "rewards/rejected": -2.393749952316284, "step": 210 }, { "epoch": 0.4594017094017094, "grad_norm": 18.610256654997034, "learning_rate": 6.594678671797703e-07, "logits/chosen": -1.6007812023162842, "logits/rejected": -1.572656273841858, "logps/chosen": -0.77734375, "logps/rejected": -1.0222656726837158, "loss": 1.4027, "nll_loss": 0.0, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.943750023841858, "rewards/margins": 0.610156238079071, "rewards/rejected": -2.5562500953674316, "step": 215 }, { "epoch": 0.4700854700854701, "grad_norm": 22.948907291803067, "learning_rate": 6.416797053521038e-07, "logits/chosen": -1.588281273841858, "logits/rejected": -1.5539062023162842, "logps/chosen": -0.78515625, "logps/rejected": -0.969531238079071, "loss": 1.4565, "nll_loss": 0.0, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1.9609375, "rewards/margins": 0.45917969942092896, "rewards/rejected": -2.421875, "step": 220 }, { "epoch": 0.4807692307692308, "grad_norm": 22.112843833650466, "learning_rate": 6.236943317574054e-07, "logits/chosen": -1.5476562976837158, "logits/rejected": -1.5203125476837158, "logps/chosen": -0.774609386920929, "logps/rejected": -0.942187488079071, "loss": 1.4957, "nll_loss": 0.0, "rewards/accuracies": 0.609375, "rewards/chosen": -1.9367187023162842, "rewards/margins": 0.4192871153354645, "rewards/rejected": -2.356250047683716, "step": 225 }, { "epoch": 0.49145299145299143, "grad_norm": 23.56709379174187, "learning_rate": 6.055367812254592e-07, "logits/chosen": -1.5867187976837158, "logits/rejected": -1.56640625, "logps/chosen": -0.749218761920929, "logps/rejected": -0.935546875, "loss": 1.4253, "nll_loss": 0.0, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -1.8757812976837158, "rewards/margins": 0.46416014432907104, "rewards/rejected": -2.335156202316284, "step": 230 }, { "epoch": 0.5021367521367521, "grad_norm": 22.251408755340613, "learning_rate": 5.872323282485888e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.549218773841858, "logps/chosen": -0.7632812261581421, "logps/rejected": -0.934374988079071, "loss": 1.4625, "nll_loss": 0.0, "rewards/accuracies": 0.609375, "rewards/chosen": -1.908593773841858, "rewards/margins": 0.4273437559604645, "rewards/rejected": -2.33203125, "step": 235 }, { "epoch": 0.5128205128205128, "grad_norm": 20.6813266166916, "learning_rate": 5.688064518007035e-07, "logits/chosen": -1.575781226158142, "logits/rejected": -1.564843773841858, "logps/chosen": -0.7757812738418579, "logps/rejected": -0.98828125, "loss": 1.3784, "nll_loss": 0.0, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -1.9382812976837158, "rewards/margins": 0.53125, "rewards/rejected": -2.4703125953674316, "step": 240 }, { "epoch": 0.5235042735042735, "grad_norm": 15.883505475500101, "learning_rate": 5.50284799871714e-07, "logits/chosen": -1.6062500476837158, "logits/rejected": -1.56640625, "logps/chosen": -0.735156238079071, "logps/rejected": -0.940234363079071, "loss": 1.4273, "nll_loss": 0.0, "rewards/accuracies": 0.6078125238418579, "rewards/chosen": -1.838281273841858, "rewards/margins": 0.5125976800918579, "rewards/rejected": -2.3499999046325684, "step": 245 }, { "epoch": 0.5341880341880342, "grad_norm": 25.5478315800542, "learning_rate": 5.316931537666856e-07, "logits/chosen": -1.61328125, "logits/rejected": -1.5828125476837158, "logps/chosen": -0.785937488079071, "logps/rejected": -0.9281250238418579, "loss": 1.4669, "nll_loss": 0.0, "rewards/accuracies": 0.5921875238418579, "rewards/chosen": -1.96484375, "rewards/margins": 0.35576170682907104, "rewards/rejected": -2.319531202316284, "step": 250 }, { "epoch": 0.5448717948717948, "grad_norm": 25.662663107269402, "learning_rate": 5.130573922194236e-07, "logits/chosen": -1.6218750476837158, "logits/rejected": -1.5945312976837158, "logps/chosen": -0.8207031488418579, "logps/rejected": -1.051171898841858, "loss": 1.3959, "nll_loss": 0.0, "rewards/accuracies": 0.645312488079071, "rewards/chosen": -2.0492186546325684, "rewards/margins": 0.574414074420929, "rewards/rejected": -2.6234374046325684, "step": 255 }, { "epoch": 0.5555555555555556, "grad_norm": 19.93174202171367, "learning_rate": 4.944034553704412e-07, "logits/chosen": -1.6171875, "logits/rejected": -1.6015625, "logps/chosen": -0.8949218988418579, "logps/rejected": -1.1492187976837158, "loss": 1.4123, "nll_loss": 0.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.235156297683716, "rewards/margins": 0.637499988079071, "rewards/rejected": -2.8734374046325684, "step": 260 }, { "epoch": 0.5662393162393162, "grad_norm": 25.25500473771292, "learning_rate": 4.7575730865945284e-07, "logits/chosen": -1.5906250476837158, "logits/rejected": -1.568750023841858, "logps/chosen": -0.8773437738418579, "logps/rejected": -1.0632812976837158, "loss": 1.4437, "nll_loss": 0.0, "rewards/accuracies": 0.651562511920929, "rewards/chosen": -2.1929688453674316, "rewards/margins": 0.4657226502895355, "rewards/rejected": -2.660937547683716, "step": 265 }, { "epoch": 0.5769230769230769, "grad_norm": 26.727173719859465, "learning_rate": 4.5714490668265237e-07, "logits/chosen": -1.59375, "logits/rejected": -1.575781226158142, "logps/chosen": -0.891406238079071, "logps/rejected": -1.0867187976837158, "loss": 1.4468, "nll_loss": 0.0, "rewards/accuracies": 0.609375, "rewards/chosen": -2.2281250953674316, "rewards/margins": 0.4881591796875, "rewards/rejected": -2.7171874046325684, "step": 270 }, { "epoch": 0.5876068376068376, "grad_norm": 23.107407158744333, "learning_rate": 4.385921570650829e-07, "logits/chosen": -1.626562476158142, "logits/rejected": -1.6007812023162842, "logps/chosen": -0.846875011920929, "logps/rejected": -1.054296851158142, "loss": 1.3979, "nll_loss": 0.0, "rewards/accuracies": 0.6734374761581421, "rewards/chosen": -2.116406202316284, "rewards/margins": 0.521728515625, "rewards/rejected": -2.637500047683716, "step": 275 }, { "epoch": 0.5982905982905983, "grad_norm": 24.943044733477908, "learning_rate": 4.201248843983918e-07, "logits/chosen": -1.5890624523162842, "logits/rejected": -1.5554687976837158, "logps/chosen": -0.8343750238418579, "logps/rejected": -1.064062476158142, "loss": 1.3962, "nll_loss": 0.0, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -2.085156202316284, "rewards/margins": 0.572070300579071, "rewards/rejected": -2.65625, "step": 280 }, { "epoch": 0.6089743589743589, "grad_norm": 20.205385409855005, "learning_rate": 4.0176879429416083e-07, "logits/chosen": -1.592187523841858, "logits/rejected": -1.544531226158142, "logps/chosen": -0.9039062261581421, "logps/rejected": -1.1015625, "loss": 1.4174, "nll_loss": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.262500047683716, "rewards/margins": 0.4927734434604645, "rewards/rejected": -2.753124952316284, "step": 285 }, { "epoch": 0.6196581196581197, "grad_norm": 37.04376569884301, "learning_rate": 3.8354943760285435e-07, "logits/chosen": -1.618749976158142, "logits/rejected": -1.6015625, "logps/chosen": -0.952343761920929, "logps/rejected": -1.205468773841858, "loss": 1.3805, "nll_loss": 0.0, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.378124952316284, "rewards/margins": 0.6346679925918579, "rewards/rejected": -3.0140624046325684, "step": 290 }, { "epoch": 0.6303418803418803, "grad_norm": 28.57757441969063, "learning_rate": 3.6549217484818573e-07, "logits/chosen": -1.6015625, "logits/rejected": -1.5828125476837158, "logps/chosen": -0.987500011920929, "logps/rejected": -1.204687476158142, "loss": 1.4307, "nll_loss": 0.0, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.46875, "rewards/margins": 0.5419921875, "rewards/rejected": -3.012500047683716, "step": 295 }, { "epoch": 0.6410256410256411, "grad_norm": 26.97457548170502, "learning_rate": 3.4762214092641096e-07, "logits/chosen": -1.5906250476837158, "logits/rejected": -1.56640625, "logps/chosen": -0.912109375, "logps/rejected": -1.1707031726837158, "loss": 1.3817, "nll_loss": 0.0, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -2.2796874046325684, "rewards/margins": 0.6458984613418579, "rewards/rejected": -2.926562547683716, "step": 300 }, { "epoch": 0.6517094017094017, "grad_norm": 18.40872102433568, "learning_rate": 3.299642101196854e-07, "logits/chosen": -1.626562476158142, "logits/rejected": -1.6007812023162842, "logps/chosen": -0.8511718511581421, "logps/rejected": -1.115234375, "loss": 1.3859, "nll_loss": 0.0, "rewards/accuracies": 0.6421874761581421, "rewards/chosen": -2.1273436546325684, "rewards/margins": 0.660351574420929, "rewards/rejected": -2.7890625, "step": 305 }, { "epoch": 0.6623931623931624, "grad_norm": 48.83431435177251, "learning_rate": 3.125429614721842e-07, "logits/chosen": -1.5773437023162842, "logits/rejected": -1.5656249523162842, "logps/chosen": -0.9214843511581421, "logps/rejected": -1.154296875, "loss": 1.4462, "nll_loss": 0.0, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -2.3023438453674316, "rewards/margins": 0.5805908441543579, "rewards/rejected": -2.8843750953674316, "step": 310 }, { "epoch": 0.6730769230769231, "grad_norm": 26.010483998640073, "learning_rate": 2.953826445771788e-07, "logits/chosen": -1.584375023841858, "logits/rejected": -1.5671875476837158, "logps/chosen": -0.907031238079071, "logps/rejected": -1.125, "loss": 1.4235, "nll_loss": 0.0, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.2679686546325684, "rewards/margins": 0.5430663824081421, "rewards/rejected": -2.8125, "step": 315 }, { "epoch": 0.6837606837606838, "grad_norm": 25.722431614221733, "learning_rate": 2.785071458226972e-07, "logits/chosen": -1.592187523841858, "logits/rejected": -1.5617187023162842, "logps/chosen": -0.90625, "logps/rejected": -1.146093726158142, "loss": 1.3948, "nll_loss": 0.0, "rewards/accuracies": 0.640625, "rewards/chosen": -2.2671875953674316, "rewards/margins": 0.595898449420929, "rewards/rejected": -2.864062547683716, "step": 320 }, { "epoch": 0.6944444444444444, "grad_norm": 27.64784221370248, "learning_rate": 2.6193995514274705e-07, "logits/chosen": -1.5851562023162842, "logits/rejected": -1.5625, "logps/chosen": -0.9371093511581421, "logps/rejected": -1.185156226158142, "loss": 1.3706, "nll_loss": 0.0, "rewards/accuracies": 0.6546875238418579, "rewards/chosen": -2.3414063453674316, "rewards/margins": 0.6197265386581421, "rewards/rejected": -2.9625000953674316, "step": 325 }, { "epoch": 0.7051282051282052, "grad_norm": 26.29403948884724, "learning_rate": 2.457041333203852e-07, "logits/chosen": -1.607812523841858, "logits/rejected": -1.58984375, "logps/chosen": -0.973437488079071, "logps/rejected": -1.2390625476837158, "loss": 1.397, "nll_loss": 0.0, "rewards/accuracies": 0.640625, "rewards/chosen": -2.434375047683716, "rewards/margins": 0.66748046875, "rewards/rejected": -3.1015625, "step": 330 }, { "epoch": 0.7158119658119658, "grad_norm": 20.070403111951517, "learning_rate": 2.2982227988814796e-07, "logits/chosen": -1.5625, "logits/rejected": -1.5578124523162842, "logps/chosen": -1.002343773841858, "logps/rejected": -1.299218773841858, "loss": 1.3534, "nll_loss": 0.0, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -2.503124952316284, "rewards/margins": 0.74609375, "rewards/rejected": -3.246875047683716, "step": 335 }, { "epoch": 0.7264957264957265, "grad_norm": 21.27637529256292, "learning_rate": 2.1431650167051918e-07, "logits/chosen": -1.6203124523162842, "logits/rejected": -1.599218726158142, "logps/chosen": -1.0, "logps/rejected": -1.2265625, "loss": 1.4229, "nll_loss": 0.0, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.5, "rewards/margins": 0.564257800579071, "rewards/rejected": -3.065624952316284, "step": 340 }, { "epoch": 0.7371794871794872, "grad_norm": 26.660141655685035, "learning_rate": 1.992083820122259e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.578125, "logps/chosen": -0.92578125, "logps/rejected": -1.1687500476837158, "loss": 1.3553, "nll_loss": 0.0, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -2.315624952316284, "rewards/margins": 0.6068359613418579, "rewards/rejected": -2.9203124046325684, "step": 345 }, { "epoch": 0.7478632478632479, "grad_norm": 28.652969469341908, "learning_rate": 1.845189507351964e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.56640625, "logps/chosen": -0.8785156011581421, "logps/rejected": -1.14453125, "loss": 1.3487, "nll_loss": 0.0, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -2.1968750953674316, "rewards/margins": 0.663867175579071, "rewards/rejected": -2.864062547683716, "step": 350 }, { "epoch": 0.7585470085470085, "grad_norm": 31.505789159719555, "learning_rate": 1.7026865486599374e-07, "logits/chosen": -1.603906273841858, "logits/rejected": -1.5734374523162842, "logps/chosen": -1.0070312023162842, "logps/rejected": -1.28515625, "loss": 1.3709, "nll_loss": 0.0, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -2.518749952316284, "rewards/margins": 0.6943359375, "rewards/rejected": -3.2125000953674316, "step": 355 }, { "epoch": 0.7692307692307693, "grad_norm": 26.452453101979053, "learning_rate": 1.564773301744774e-07, "logits/chosen": -1.5984375476837158, "logits/rejected": -1.571874976158142, "logps/chosen": -0.9917968511581421, "logps/rejected": -1.2703125476837158, "loss": 1.3662, "nll_loss": 0.0, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -2.4765625, "rewards/margins": 0.6996093988418579, "rewards/rejected": -3.1781249046325684, "step": 360 }, { "epoch": 0.7799145299145299, "grad_norm": 22.60419152131841, "learning_rate": 1.431641735633044e-07, "logits/chosen": -1.62109375, "logits/rejected": -1.5906250476837158, "logps/chosen": -1.000390648841858, "logps/rejected": -1.2742187976837158, "loss": 1.3257, "nll_loss": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5, "rewards/margins": 0.6851562261581421, "rewards/rejected": -3.184375047683716, "step": 365 }, { "epoch": 0.7905982905982906, "grad_norm": 31.959715203438616, "learning_rate": 1.30347716346706e-07, "logits/chosen": -1.587499976158142, "logits/rejected": -1.5554687976837158, "logps/chosen": -1.006250023841858, "logps/rejected": -1.2453124523162842, "loss": 1.3749, "nll_loss": 0.0, "rewards/accuracies": 0.6578124761581421, "rewards/chosen": -2.515625, "rewards/margins": 0.598828136920929, "rewards/rejected": -3.112499952316284, "step": 370 }, { "epoch": 0.8012820512820513, "grad_norm": 26.131565932019587, "learning_rate": 1.1804579845573287e-07, "logits/chosen": -1.5859375, "logits/rejected": -1.5625, "logps/chosen": -0.985156238079071, "logps/rejected": -1.212499976158142, "loss": 1.408, "nll_loss": 0.0, "rewards/accuracies": 0.6390625238418579, "rewards/chosen": -2.465625047683716, "rewards/margins": 0.566210925579071, "rewards/rejected": -3.0296874046325684, "step": 375 }, { "epoch": 0.811965811965812, "grad_norm": 22.064228308170097, "learning_rate": 1.0627554360587532e-07, "logits/chosen": -1.6296875476837158, "logits/rejected": -1.6007812023162842, "logps/chosen": -1.041406273841858, "logps/rejected": -1.36328125, "loss": 1.3429, "nll_loss": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6031250953674316, "rewards/margins": 0.803906261920929, "rewards/rejected": -3.4078125953674316, "step": 380 }, { "epoch": 0.8226495726495726, "grad_norm": 21.960589810609097, "learning_rate": 9.50533354616217e-08, "logits/chosen": -1.607031226158142, "logits/rejected": -1.58984375, "logps/chosen": -1.0390625, "logps/rejected": -1.306249976158142, "loss": 1.3265, "nll_loss": 0.0, "rewards/accuracies": 0.6953125, "rewards/chosen": -2.5953125953674316, "rewards/margins": 0.6675781011581421, "rewards/rejected": -3.262500047683716, "step": 385 }, { "epoch": 0.8333333333333334, "grad_norm": 31.587909830608158, "learning_rate": 8.439479483113682e-08, "logits/chosen": -1.610937476158142, "logits/rejected": -1.58203125, "logps/chosen": -1.0402343273162842, "logps/rejected": -1.338281273841858, "loss": 1.3555, "nll_loss": 0.0, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -2.598437547683716, "rewards/margins": 0.7417968511581421, "rewards/rejected": -3.339062452316284, "step": 390 }, { "epoch": 0.844017094017094, "grad_norm": 22.55165739610421, "learning_rate": 7.431475792280017e-08, "logits/chosen": -1.5867187976837158, "logits/rejected": -1.5656249523162842, "logps/chosen": -1.017187476158142, "logps/rejected": -1.252343773841858, "loss": 1.3629, "nll_loss": 0.0, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -2.543750047683716, "rewards/margins": 0.588671863079071, "rewards/rejected": -3.1328125, "step": 395 }, { "epoch": 0.8547008547008547, "grad_norm": 19.84309936265848, "learning_rate": 6.482725569387171e-08, "logits/chosen": -1.6015625, "logits/rejected": -1.5812499523162842, "logps/chosen": -1.046484351158142, "logps/rejected": -1.322656273841858, "loss": 1.3524, "nll_loss": 0.0, "rewards/accuracies": 0.6703125238418579, "rewards/chosen": -2.6171875, "rewards/margins": 0.6898437738418579, "rewards/rejected": -3.3062500953674316, "step": 400 }, { "epoch": 0.8547008547008547, "eval_logits/chosen": -1.4735382795333862, "eval_logits/rejected": -1.4497227668762207, "eval_logps/chosen": -1.0328880548477173, "eval_logps/rejected": -1.3099168539047241, "eval_loss": 1.3493971824645996, "eval_nll_loss": 0.0, "eval_rewards/accuracies": 0.6670587062835693, "eval_rewards/chosen": -2.5819051265716553, "eval_rewards/margins": 0.6942256689071655, "eval_rewards/rejected": -3.276461601257324, "eval_runtime": 26.5068, "eval_samples_per_second": 73.981, "eval_steps_per_second": 2.339, "step": 400 }, { "epoch": 0.8653846153846154, "grad_norm": 26.041292961951424, "learning_rate": 5.594549432003243e-08, "logits/chosen": -1.631250023841858, "logits/rejected": -1.59375, "logps/chosen": -1.0828125476837158, "logps/rejected": -1.334375023841858, "loss": 1.3903, "nll_loss": 0.0, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.7093749046325684, "rewards/margins": 0.6244140863418579, "rewards/rejected": -3.332812547683716, "step": 405 }, { "epoch": 0.8760683760683761, "grad_norm": 32.518651766511766, "learning_rate": 4.76818368129821e-08, "logits/chosen": -1.6015625, "logits/rejected": -1.571874976158142, "logps/chosen": -1.058984398841858, "logps/rejected": -1.3125, "loss": 1.3631, "nll_loss": 0.0, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -2.6484375, "rewards/margins": 0.633984386920929, "rewards/rejected": -3.2828125953674316, "step": 410 }, { "epoch": 0.8867521367521367, "grad_norm": 23.06725788674345, "learning_rate": 4.004778581168411e-08, "logits/chosen": -1.588281273841858, "logits/rejected": -1.572656273841858, "logps/chosen": -1.028906226158142, "logps/rejected": -1.3125, "loss": 1.3308, "nll_loss": 0.0, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.5703125, "rewards/margins": 0.7134765386581421, "rewards/rejected": -3.284374952316284, "step": 415 }, { "epoch": 0.8974358974358975, "grad_norm": 24.18638777242838, "learning_rate": 3.305396757121037e-08, "logits/chosen": -1.62890625, "logits/rejected": -1.6124999523162842, "logps/chosen": -1.075781226158142, "logps/rejected": -1.33984375, "loss": 1.3695, "nll_loss": 0.0, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6875, "rewards/margins": 0.6610351800918579, "rewards/rejected": -3.346874952316284, "step": 420 }, { "epoch": 0.9081196581196581, "grad_norm": 30.971842967518672, "learning_rate": 2.6710117171472757e-08, "logits/chosen": -1.571874976158142, "logits/rejected": -1.5421874523162842, "logps/chosen": -1.041015625, "logps/rejected": -1.30859375, "loss": 1.3354, "nll_loss": 0.0, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.6015625, "rewards/margins": 0.6728515625, "rewards/rejected": -3.2734375, "step": 425 }, { "epoch": 0.9188034188034188, "grad_norm": 24.277716317235853, "learning_rate": 2.1025064966430694e-08, "logits/chosen": -1.627343773841858, "logits/rejected": -1.610937476158142, "logps/chosen": -1.0441405773162842, "logps/rejected": -1.3624999523162842, "loss": 1.3138, "nll_loss": 0.0, "rewards/accuracies": 0.671875, "rewards/chosen": -2.612499952316284, "rewards/margins": 0.792187511920929, "rewards/rejected": -3.401562452316284, "step": 430 }, { "epoch": 0.9294871794871795, "grad_norm": 24.66405837343724, "learning_rate": 1.6006724292636166e-08, "logits/chosen": -1.6257812976837158, "logits/rejected": -1.6171875, "logps/chosen": -1.050390601158142, "logps/rejected": -1.3406250476837158, "loss": 1.3571, "nll_loss": 0.0, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -2.6265625953674316, "rewards/margins": 0.724414050579071, "rewards/rejected": -3.3515625, "step": 435 }, { "epoch": 0.9401709401709402, "grad_norm": 25.638182759285908, "learning_rate": 1.1662080454225509e-08, "logits/chosen": -1.5812499523162842, "logits/rejected": -1.5578124523162842, "logps/chosen": -1.0617187023162842, "logps/rejected": -1.337499976158142, "loss": 1.3339, "nll_loss": 0.0, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -2.653125047683716, "rewards/margins": 0.6910156011581421, "rewards/rejected": -3.3453125953674316, "step": 440 }, { "epoch": 0.9508547008547008, "grad_norm": 26.16854399233931, "learning_rate": 7.9971809996911e-09, "logits/chosen": -1.61328125, "logits/rejected": -1.603124976158142, "logps/chosen": -1.072656273841858, "logps/rejected": -1.350000023841858, "loss": 1.3319, "nll_loss": 0.0, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -2.6781249046325684, "rewards/margins": 0.694140613079071, "rewards/rejected": -3.371875047683716, "step": 445 }, { "epoch": 0.9615384615384616, "grad_norm": 26.197163635860893, "learning_rate": 5.017127303966084e-09, "logits/chosen": -1.6023437976837158, "logits/rejected": -1.5703125, "logps/chosen": -1.081640601158142, "logps/rejected": -1.359375, "loss": 1.3163, "nll_loss": 0.0, "rewards/accuracies": 0.676562488079071, "rewards/chosen": -2.703125, "rewards/margins": 0.699023425579071, "rewards/rejected": -3.4046874046325684, "step": 450 }, { "epoch": 0.9722222222222222, "grad_norm": 27.97092955661895, "learning_rate": 2.7260674675404496e-09, "logits/chosen": -1.62109375, "logits/rejected": -1.591406226158142, "logps/chosen": -1.099218726158142, "logps/rejected": -1.3468749523162842, "loss": 1.4061, "nll_loss": 0.0, "rewards/accuracies": 0.635937511920929, "rewards/chosen": -2.7484374046325684, "rewards/margins": 0.615039050579071, "rewards/rejected": -3.362499952316284, "step": 455 }, { "epoch": 0.9829059829059829, "grad_norm": 27.659006594737694, "learning_rate": 1.1271905424918293e-09, "logits/chosen": -1.587499976158142, "logits/rejected": -1.580468773841858, "logps/chosen": -1.107812523841858, "logps/rejected": -1.3742187023162842, "loss": 1.3842, "nll_loss": 0.0, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.7671875953674316, "rewards/margins": 0.6664062738418579, "rewards/rejected": -3.4359374046325684, "step": 460 }, { "epoch": 0.9935897435897436, "grad_norm": 26.950711678907783, "learning_rate": 2.227220934688523e-10, "logits/chosen": -1.614843726158142, "logits/rejected": -1.5890624523162842, "logps/chosen": -1.0382812023162842, "logps/rejected": -1.350000023841858, "loss": 1.3106, "nll_loss": 0.0, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -2.598437547683716, "rewards/margins": 0.7733398675918579, "rewards/rejected": -3.3734374046325684, "step": 465 }, { "epoch": 1.0, "step": 468, "total_flos": 0.0, "train_loss": 1.4527900891426282, "train_runtime": 3187.3315, "train_samples_per_second": 18.786, "train_steps_per_second": 0.147 } ], "logging_steps": 5, "max_steps": 468, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }