{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1856, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.40908203125, "epoch": 0.0053893829156561575, "grad_norm": 7.759696960449219, "learning_rate": 4.8387096774193546e-08, "logits/chosen": -0.44672375660898817, "logits/rejected": -0.39365523942633257, "logps/chosen": -107.6, "logps/rejected": -109.3, "loss": 0.6923, "mean_token_accuracy": 0.9019625961780549, "num_tokens": 365562.0, "rewards/accuracies": 0.328125, "rewards/chosen": 0.00010342597961425782, "rewards/margins": 0.0004913330078125, "rewards/rejected": -0.0003886222839355469, "step": 10 }, { "entropy": 0.41064453125, "epoch": 0.010778765831312315, "grad_norm": 7.587465286254883, "learning_rate": 1.0215053763440861e-07, "logits/chosen": -0.3728776172134608, "logits/rejected": -0.332460740198616, "logps/chosen": -100.825, "logps/rejected": -102.975, "loss": 0.693, "mean_token_accuracy": 0.9035390466451645, "num_tokens": 720708.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.00041942596435546876, "rewards/margins": -0.00022115707397460938, "rewards/rejected": -0.00019679069519042968, "step": 20 }, { "entropy": 0.4173828125, "epoch": 0.016168148746968473, "grad_norm": 6.577247142791748, "learning_rate": 1.5591397849462365e-07, "logits/chosen": -0.472471438697433, "logits/rejected": -0.39264794802016467, "logps/chosen": -96.45, "logps/rejected": -99.825, "loss": 0.6929, "mean_token_accuracy": 0.9041816651821136, "num_tokens": 1075168.0, "rewards/accuracies": 0.375, "rewards/chosen": -0.00027996301651000977, "rewards/margins": -0.00011727809906005859, "rewards/rejected": -0.00016357898712158204, "step": 30 }, { "entropy": 0.39150390625, "epoch": 0.02155753166262463, "grad_norm": 6.858296871185303, "learning_rate": 2.0967741935483871e-07, "logits/chosen": -0.3733890259921028, "logits/rejected": -0.3552958369517544, "logps/chosen": -99.3375, "logps/rejected": -103.175, "loss": 0.6929, "mean_token_accuracy": 0.9075490444898605, "num_tokens": 1443719.0, "rewards/accuracies": 0.35, "rewards/chosen": -0.0002532958984375, "rewards/margins": 0.000262451171875, "rewards/rejected": -0.0005157232284545898, "step": 40 }, { "entropy": 0.39990234375, "epoch": 0.026946914578280787, "grad_norm": 6.983979225158691, "learning_rate": 2.6344086021505376e-07, "logits/chosen": -0.3704425655981937, "logits/rejected": -0.3571443883083806, "logps/chosen": -89.5625, "logps/rejected": -92.6, "loss": 0.6927, "mean_token_accuracy": 0.9039196908473969, "num_tokens": 1782804.0, "rewards/accuracies": 0.428125, "rewards/chosen": -0.0011333465576171876, "rewards/margins": 0.0005399942398071289, "rewards/rejected": -0.0016736984252929688, "step": 50 }, { "entropy": 0.4013671875, "epoch": 0.03233629749393695, "grad_norm": 6.333939552307129, "learning_rate": 3.172043010752688e-07, "logits/chosen": -0.34019105871641864, "logits/rejected": -0.3218748638672174, "logps/chosen": -94.4875, "logps/rejected": -93.575, "loss": 0.6919, "mean_token_accuracy": 0.9046859920024872, "num_tokens": 2126112.0, "rewards/accuracies": 0.490625, "rewards/chosen": -0.0008403778076171875, "rewards/margins": 0.0018994331359863282, "rewards/rejected": -0.002740812301635742, "step": 60 }, { "entropy": 0.41484375, "epoch": 0.0377256804095931, "grad_norm": 7.4297566413879395, "learning_rate": 3.7096774193548384e-07, "logits/chosen": -0.3875610308110304, "logits/rejected": -0.35152331612465404, "logps/chosen": -97.375, "logps/rejected": -99.3625, "loss": 0.6914, "mean_token_accuracy": 0.9037097364664077, "num_tokens": 2470754.0, "rewards/accuracies": 0.540625, "rewards/chosen": -0.0022185802459716796, "rewards/margins": 0.002935457229614258, "rewards/rejected": -0.005158233642578125, "step": 70 }, { "entropy": 0.41259765625, "epoch": 0.04311506332524926, "grad_norm": 7.677794456481934, "learning_rate": 4.247311827956989e-07, "logits/chosen": -0.35103980570551985, "logits/rejected": -0.3017016560657181, "logps/chosen": -97.5, "logps/rejected": -99.3875, "loss": 0.6909, "mean_token_accuracy": 0.9047763884067536, "num_tokens": 2836116.0, "rewards/accuracies": 0.571875, "rewards/chosen": -0.004664897918701172, "rewards/margins": 0.004430198669433593, "rewards/rejected": -0.009100341796875, "step": 80 }, { "entropy": 0.41826171875, "epoch": 0.04850444624090541, "grad_norm": 6.765298366546631, "learning_rate": 4.78494623655914e-07, "logits/chosen": -0.4268290587468382, "logits/rejected": -0.37336222025804355, "logps/chosen": -108.175, "logps/rejected": -111.6, "loss": 0.6898, "mean_token_accuracy": 0.9004561603069305, "num_tokens": 3207605.0, "rewards/accuracies": 0.684375, "rewards/chosen": -0.008296608785167336, "rewards/margins": 0.006721109163481742, "rewards/rejected": -0.01501771821640432, "setc/cal_net_lr": 3.375e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.052821920812129976, "setc/logratio_margin_delta": -0.0747640784829855, "setc/logratio_margin_vanilla": 0.746875, "setc/logratio_margin_weighted": 0.6721109215170145, "step": 90 }, { "entropy": 0.44228515625, "epoch": 0.05389382915656157, "grad_norm": 7.393381595611572, "learning_rate": 5.322580645161289e-07, "logits/chosen": -0.4361902457762258, "logits/rejected": -0.3870189698671949, "logps/chosen": -101.75, "logps/rejected": -109.4, "loss": 0.6872, "mean_token_accuracy": 0.9016273647546769, "num_tokens": 3548981.0, "rewards/accuracies": 0.728125, "rewards/chosen": -0.013736124988645316, "rewards/margins": 0.01212767151882872, "rewards/rejected": -0.02586379610002041, "setc/cal_net_lr": 0.00011666666666666668, "setc/credit_mean": 1.0, "setc/credit_std": 0.053199329786002635, "setc/logratio_margin_delta": -0.07141252495348453, "setc/logratio_margin_vanilla": 1.2841796875, "setc/logratio_margin_weighted": 1.2127671625465155, "step": 100 }, { "epoch": 0.05389382915656157, "eval_entropy": 0.4267578125, "eval_logits/chosen": -0.450571413522897, "eval_logits/rejected": -0.3940343719054589, "eval_logps/chosen": -108.859375, "eval_logps/rejected": -108.640625, "eval_loss": 0.6871142387390137, "eval_mean_token_accuracy": 0.9000339470803738, "eval_num_tokens": 3548981.0, "eval_rewards/accuracies": 0.671875, "eval_rewards/chosen": -0.021255508123431355, "eval_rewards/margins": 0.012425659064319916, "eval_rewards/rejected": -0.03368116734782234, "eval_runtime": 14.2001, "eval_samples_per_second": 35.211, "eval_setc/cal_net_lr": 0.00015833333333333327, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.04981097369454801, "eval_setc/logratio_margin_delta": -0.06453109718859196, "eval_setc/logratio_margin_vanilla": 1.3050537109375, "eval_setc/logratio_margin_weighted": 1.240522613748908, "eval_steps_per_second": 2.254, "step": 100 }, { "entropy": 0.42685546875, "epoch": 0.059283212072217734, "grad_norm": 7.357357501983643, "learning_rate": 5.860215053763441e-07, "logits/chosen": -0.43970988910262926, "logits/rejected": -0.40844065752963693, "logps/chosen": -107.05, "logps/rejected": -113.025, "loss": 0.6854, "mean_token_accuracy": 0.899716067314148, "num_tokens": 3902446.0, "rewards/accuracies": 0.725, "rewards/chosen": -0.02834852202795446, "rewards/margins": 0.015831261326093228, "rewards/rejected": -0.04417978236451745, "setc/cal_net_lr": 0.0002, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.047479902021586896, "setc/logratio_margin_delta": -0.10251835882663726, "setc/logratio_margin_vanilla": 1.68564453125, "setc/logratio_margin_weighted": 1.5831261724233627, "step": 110 }, { "entropy": 0.42177734375, "epoch": 0.0646725949878739, "grad_norm": 7.097802639007568, "learning_rate": 6.397849462365591e-07, "logits/chosen": -0.41605972462656393, "logits/rejected": -0.3486092219206212, "logps/chosen": -97.4, "logps/rejected": -101.15, "loss": 0.6811, "mean_token_accuracy": 0.8998741269111633, "num_tokens": 4248207.0, "rewards/accuracies": 0.715625, "rewards/chosen": -0.0486921863630414, "rewards/margins": 0.02490431647747755, "rewards/rejected": -0.07359650395810605, "setc/cal_net_lr": 0.0002833333333333333, "setc/credit_mean": 1.0, "setc/credit_std": 0.040153346955776215, "setc/logratio_margin_delta": -0.10605261325836182, "setc/logratio_margin_vanilla": 2.596484375, "setc/logratio_margin_weighted": 2.4904317617416383, "step": 120 }, { "entropy": 0.4609375, "epoch": 0.07006197790353004, "grad_norm": 8.569380760192871, "learning_rate": 6.935483870967742e-07, "logits/chosen": -0.5107418885495224, "logits/rejected": -0.46866435782064403, "logps/chosen": -125.8, "logps/rejected": -125.425, "loss": 0.6705, "mean_token_accuracy": 0.8874441146850586, "num_tokens": 4601428.0, "rewards/accuracies": 0.721875, "rewards/chosen": -0.09584741443395614, "rewards/margins": 0.04766115248203277, "rewards/rejected": -0.14350856877863408, "setc/cal_net_lr": 0.0003666666666666668, "setc/credit_mean": 1.0, "setc/credit_std": 0.035092563927173616, "setc/logratio_margin_delta": -0.18232210874557495, "setc/logratio_margin_vanilla": 4.9484375, "setc/logratio_margin_weighted": 4.766115391254425, "step": 130 }, { "entropy": 0.46455078125, "epoch": 0.0754513608191862, "grad_norm": 7.780826091766357, "learning_rate": 7.473118279569892e-07, "logits/chosen": -0.4342919440785252, "logits/rejected": -0.42807028323097684, "logps/chosen": -100.9, "logps/rejected": -110.775, "loss": 0.6591, "mean_token_accuracy": 0.8856117874383926, "num_tokens": 4920681.0, "rewards/accuracies": 0.725, "rewards/chosen": -0.1356092609465122, "rewards/margins": 0.07387373577803373, "rewards/rejected": -0.2094829984009266, "setc/cal_net_lr": 0.00045, "setc/credit_mean": 1.0, "setc/credit_std": 0.022299187257885933, "setc/logratio_margin_delta": -0.09700125455856323, "setc/logratio_margin_vanilla": 7.484375, "setc/logratio_margin_weighted": 7.387373745441437, "step": 140 }, { "entropy": 0.45078125, "epoch": 0.08084074373484236, "grad_norm": 8.610173225402832, "learning_rate": 8.010752688172043e-07, "logits/chosen": -0.41425887318933086, "logits/rejected": -0.3917996553785628, "logps/chosen": -125.75, "logps/rejected": -139.075, "loss": 0.6385, "mean_token_accuracy": 0.88282710313797, "num_tokens": 5269262.0, "rewards/accuracies": 0.7125, "rewards/chosen": -0.2560404367744923, "rewards/margins": 0.1263975765556097, "rewards/rejected": -0.3824380189180374, "setc/cal_net_lr": 0.0005333333333333333, "setc/credit_mean": 1.0, "setc/credit_std": 0.012131102010607719, "setc/logratio_margin_delta": 0.07022677659988404, "setc/logratio_margin_vanilla": 12.56953125, "setc/logratio_margin_weighted": 12.639758026599884, "step": 150 }, { "entropy": 0.46650390625, "epoch": 0.08623012665049852, "grad_norm": 10.757467269897461, "learning_rate": 8.548387096774193e-07, "logits/chosen": -0.48324375921289925, "logits/rejected": -0.4760727307021881, "logps/chosen": -147.85, "logps/rejected": -173.725, "loss": 0.6143, "mean_token_accuracy": 0.8614455878734588, "num_tokens": 5635034.0, "rewards/accuracies": 0.725, "rewards/chosen": -0.46175644397735593, "rewards/margins": 0.20489476919174193, "rewards/rejected": -0.6666512250900268, "setc/cal_net_lr": 0.0006166666666666666, "setc/credit_mean": 1.0, "setc/credit_std": 0.01982680819928646, "setc/logratio_margin_delta": 0.5832273483276367, "setc/logratio_margin_vanilla": 19.90625, "setc/logratio_margin_weighted": 20.489477348327636, "step": 160 }, { "entropy": 0.4853515625, "epoch": 0.09161950956615468, "grad_norm": 12.929532051086426, "learning_rate": 9.086021505376343e-07, "logits/chosen": -0.5325447226004508, "logits/rejected": -0.533857912904264, "logps/chosen": -177.75, "logps/rejected": -207.0, "loss": 0.5938, "mean_token_accuracy": 0.8355448335409165, "num_tokens": 5996408.0, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7701589599251747, "rewards/margins": 0.2884057696908712, "rewards/rejected": -1.0585647374391556, "setc/cal_net_lr": 0.0007000000000000002, "setc/credit_mean": 1.0, "setc/credit_std": 0.04059908324852586, "setc/logratio_margin_delta": 1.1937029838562012, "setc/logratio_margin_vanilla": 27.646875, "setc/logratio_margin_weighted": 28.8405779838562, "step": 170 }, { "entropy": 0.4525390625, "epoch": 0.09700889248181083, "grad_norm": 13.670575141906738, "learning_rate": 9.623655913978494e-07, "logits/chosen": -0.5174588059832541, "logits/rejected": -0.4903172377790385, "logps/chosen": -212.5, "logps/rejected": -245.725, "loss": 0.5927, "mean_token_accuracy": 0.818328133225441, "num_tokens": 6344423.0, "rewards/accuracies": 0.69375, "rewards/chosen": -1.1680178940296173, "rewards/margins": 0.3832601685076952, "rewards/rejected": -1.5512780517339706, "setc/cal_net_lr": 0.0007833333333333333, "setc/credit_mean": 1.0, "setc/credit_std": 0.06093887090682983, "setc/logratio_margin_delta": 2.3182060956954955, "setc/logratio_margin_vanilla": 36.0078125, "setc/logratio_margin_weighted": 38.326018595695494, "step": 180 }, { "entropy": 0.4759765625, "epoch": 0.10239827539746699, "grad_norm": 13.61806583404541, "learning_rate": 9.999920375234096e-07, "logits/chosen": -0.5286533092264682, "logits/rejected": -0.5037254571131352, "logps/chosen": -254.55, "logps/rejected": -306.6, "loss": 0.5476, "mean_token_accuracy": 0.8002152472734452, "num_tokens": 6709991.0, "rewards/accuracies": 0.715625, "rewards/chosen": -1.5595814138650894, "rewards/margins": 0.5580403164029122, "rewards/rejected": -2.1176217913627626, "setc/cal_net_lr": 0.0008666666666666668, "setc/credit_mean": 1.0, "setc/credit_std": 0.07659839615225791, "setc/logratio_margin_delta": 3.3602832794189452, "setc/logratio_margin_vanilla": 52.44375, "setc/logratio_margin_weighted": 55.804033279418945, "step": 190 }, { "entropy": 0.43544921875, "epoch": 0.10778765831312315, "grad_norm": 14.541138648986816, "learning_rate": 9.998504894389128e-07, "logits/chosen": -0.527879368703732, "logits/rejected": -0.5018783955560024, "logps/chosen": -266.7, "logps/rejected": -320.55, "loss": 0.5664, "mean_token_accuracy": 0.800949826836586, "num_tokens": 7083816.0, "rewards/accuracies": 0.70625, "rewards/chosen": -1.7819188117980957, "rewards/margins": 0.5936955399811268, "rewards/rejected": -2.3756143033504484, "setc/cal_net_lr": 0.0009499999999999998, "setc/credit_mean": 1.0, "setc/credit_std": 0.0854849111288786, "setc/logratio_margin_delta": 3.891430473327637, "setc/logratio_margin_vanilla": 55.478125, "setc/logratio_margin_weighted": 59.36955547332764, "step": 200 }, { "epoch": 0.10778765831312315, "eval_entropy": 0.44390869140625, "eval_logits/chosen": -0.579585194957734, "eval_logits/rejected": -0.5690945474827908, "eval_logps/chosen": -275.25, "eval_logps/rejected": -336.9375, "eval_loss": 0.5417425036430359, "eval_mean_token_accuracy": 0.7958512287586927, "eval_num_tokens": 7083816.0, "eval_rewards/accuracies": 0.755859375, "eval_rewards/chosen": -1.8367619067430496, "eval_rewards/margins": 0.6809160728007555, "eval_rewards/rejected": -2.5176779851317406, "eval_runtime": 14.1879, "eval_samples_per_second": 35.241, "eval_setc/cal_net_lr": 0.0009916666666666676, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.09134258469566703, "eval_setc/logratio_margin_delta": 4.674029171466827, "eval_setc/logratio_margin_vanilla": 64.115234375, "eval_setc/logratio_margin_weighted": 68.78926354646683, "eval_steps_per_second": 2.255, "step": 200 }, { "entropy": 0.4421875, "epoch": 0.11317704122877931, "grad_norm": 15.038735389709473, "learning_rate": 9.995320550872586e-07, "logits/chosen": -0.5982554625710975, "logits/rejected": -0.5934455933800111, "logps/chosen": -288.05, "logps/rejected": -341.35, "loss": 0.5767, "mean_token_accuracy": 0.7952886909246445, "num_tokens": 7430056.0, "rewards/accuracies": 0.740625, "rewards/chosen": -1.9779862701892852, "rewards/margins": 0.5607165463268757, "rewards/rejected": -2.538702827692032, "setc/cal_net_lr": 0.0009994884161461147, "setc/credit_mean": 1.0, "setc/credit_std": 0.09023903161287308, "setc/logratio_margin_delta": 3.6450933456420898, "setc/logratio_margin_vanilla": 52.4265625, "setc/logratio_margin_weighted": 56.07165584564209, "step": 210 }, { "entropy": 0.42705078125, "epoch": 0.11856642414443547, "grad_norm": 16.99634552001953, "learning_rate": 9.990368471554702e-07, "logits/chosen": -0.5453219890997539, "logits/rejected": -0.5525475843850532, "logps/chosen": -261.55, "logps/rejected": -330.25, "loss": 0.5191, "mean_token_accuracy": 0.8015528202056885, "num_tokens": 7791142.0, "rewards/accuracies": 0.76875, "rewards/chosen": -1.7827031493186951, "rewards/margins": 0.7491906136274338, "rewards/rejected": -2.531893861293793, "setc/cal_net_lr": 0.0009992394884850806, "setc/credit_mean": 1.0, "setc/credit_std": 0.09283662959933281, "setc/logratio_margin_delta": 5.744063377380371, "setc/logratio_margin_vanilla": 69.175, "setc/logratio_margin_weighted": 74.91906337738037, "step": 220 }, { "entropy": 0.426953125, "epoch": 0.12395580706009161, "grad_norm": 13.820030212402344, "learning_rate": 9.98365040886914e-07, "logits/chosen": -0.5075058959811635, "logits/rejected": -0.5316922545794671, "logps/chosen": -297.45, "logps/rejected": -381.2, "loss": 0.5132, "mean_token_accuracy": 0.8004782497882843, "num_tokens": 8160190.0, "rewards/accuracies": 0.75625, "rewards/chosen": -2.1069876670837404, "rewards/margins": 0.8769942551851273, "rewards/rejected": -2.9839819371700287, "setc/cal_net_lr": 0.0009978552797828366, "setc/credit_mean": 1.0, "setc/credit_std": 0.10098463930189609, "setc/logratio_margin_delta": 6.56817512512207, "setc/logratio_margin_vanilla": 81.13125, "setc/logratio_margin_weighted": 87.69942512512208, "step": 230 }, { "entropy": 0.41240234375, "epoch": 0.1293451899757478, "grad_norm": 20.368221282958984, "learning_rate": 9.975168740192852e-07, "logits/chosen": -0.5192135050793214, "logits/rejected": -0.5336463823173456, "logps/chosen": -274.9, "logps/rejected": -356.6, "loss": 0.5139, "mean_token_accuracy": 0.7949420899152756, "num_tokens": 8525986.0, "rewards/accuracies": 0.753125, "rewards/chosen": -1.9865135312080384, "rewards/margins": 0.8216024398803711, "rewards/rejected": -2.8081159591674805, "setc/cal_net_lr": 0.00099575444916116, "setc/credit_mean": 1.0, "setc/credit_std": 0.1036558359861374, "setc/logratio_margin_delta": 7.082119369506836, "setc/logratio_margin_vanilla": 75.078125, "setc/logratio_margin_weighted": 82.16024436950684, "step": 240 }, { "entropy": 0.4126953125, "epoch": 0.13473457289140395, "grad_norm": 32.17527770996094, "learning_rate": 9.964926467004765e-07, "logits/chosen": -0.5428571750679503, "logits/rejected": -0.5702547242679802, "logps/chosen": -319.9, "logps/rejected": -424.2, "loss": 0.4687, "mean_token_accuracy": 0.782842355966568, "num_tokens": 8896270.0, "rewards/accuracies": 0.775, "rewards/chosen": -2.4224733889102934, "rewards/margins": 1.0895347505807877, "rewards/rejected": -3.5120081663131715, "setc/cal_net_lr": 0.000992940020593752, "setc/credit_mean": 1.0, "setc/credit_std": 0.10739997066557408, "setc/logratio_margin_delta": 8.453478050231933, "setc/logratio_margin_vanilla": 100.5, "setc/logratio_margin_weighted": 108.95347805023194, "step": 250 }, { "entropy": 0.40068359375, "epoch": 0.14012395580706008, "grad_norm": 26.627729415893555, "learning_rate": 9.952927213823627e-07, "logits/chosen": -0.4605946438072367, "logits/rejected": -0.490033485518619, "logps/chosen": -351.4, "logps/rejected": -466.9, "loss": 0.4693, "mean_token_accuracy": 0.7776162534952163, "num_tokens": 9245420.0, "rewards/accuracies": 0.76875, "rewards/chosen": -2.833879363536835, "rewards/margins": 1.184336504340172, "rewards/rejected": -4.018215835094452, "setc/cal_net_lr": 0.000989416045220137, "setc/credit_mean": 1.0, "setc/credit_std": 0.10721651390194893, "setc/logratio_margin_delta": 7.883653450012207, "setc/logratio_margin_vanilla": 110.55, "setc/logratio_margin_weighted": 118.4336534500122, "step": 260 }, { "entropy": 0.412109375, "epoch": 0.14551333872271624, "grad_norm": 35.465599060058594, "learning_rate": 9.939175226925377e-07, "logits/chosen": -0.5947771680223737, "logits/rejected": -0.609633545923327, "logps/chosen": -345.4, "logps/rejected": -476.9, "loss": 0.4478, "mean_token_accuracy": 0.7697073727846145, "num_tokens": 9603870.0, "rewards/accuracies": 0.78125, "rewards/chosen": -2.724973976612091, "rewards/margins": 1.3795409053564072, "rewards/rejected": -4.10451488494873, "setc/cal_net_lr": 0.000985187595514384, "setc/credit_mean": 1.0, "setc/credit_std": 0.10481685362756252, "setc/logratio_margin_delta": 10.8415922164917, "setc/logratio_margin_vanilla": 127.1125, "setc/logratio_margin_weighted": 137.9540922164917, "step": 270 }, { "entropy": 0.402734375, "epoch": 0.1509027216383724, "grad_norm": 20.82551383972168, "learning_rate": 9.92367537284046e-07, "logits/chosen": -0.5548631230339891, "logits/rejected": -0.5871126674101008, "logps/chosen": -330.9, "logps/rejected": -440.2, "loss": 0.4953, "mean_token_accuracy": 0.7791453748941422, "num_tokens": 9933926.0, "rewards/accuracies": 0.775, "rewards/chosen": -2.6326347470283507, "rewards/margins": 1.1489318370819093, "rewards/rejected": -3.7815666437149047, "setc/cal_net_lr": 0.0009802607579836912, "setc/credit_mean": 1.0, "setc/credit_std": 0.10762366987764835, "setc/logratio_margin_delta": 10.655691719055175, "setc/logratio_margin_vanilla": 104.2375, "setc/logratio_margin_weighted": 114.89319171905518, "step": 280 }, { "entropy": 0.3955078125, "epoch": 0.15629210455402856, "grad_norm": 18.485855102539062, "learning_rate": 9.906433136631696e-07, "logits/chosen": -0.543341739516008, "logits/rejected": -0.5576040736941209, "logps/chosen": -321.1, "logps/rejected": -424.9, "loss": 0.4656, "mean_token_accuracy": 0.7913450181484223, "num_tokens": 10300364.0, "rewards/accuracies": 0.796875, "rewards/chosen": -2.416798382997513, "rewards/margins": 1.1477631881833077, "rewards/rejected": -3.5645615816116334, "setc/cal_net_lr": 0.0009746426244073601, "setc/credit_mean": 1.0, "setc/credit_std": 0.1039271742105484, "setc/logratio_margin_delta": 9.02632179260254, "setc/logratio_margin_vanilla": 105.75, "setc/logratio_margin_weighted": 114.77632179260254, "step": 290 }, { "entropy": 0.37470703125, "epoch": 0.16168148746968472, "grad_norm": 19.445205688476562, "learning_rate": 9.887454619953203e-07, "logits/chosen": -0.5234257126029256, "logits/rejected": -0.5395859317035523, "logps/chosen": -333.7, "logps/rejected": -451.1, "loss": 0.4567, "mean_token_accuracy": 0.7912471622228623, "num_tokens": 10652272.0, "rewards/accuracies": 0.81875, "rewards/chosen": -2.56951225399971, "rewards/margins": 1.2171583086252213, "rewards/rejected": -3.7866706013679505, "setc/cal_net_lr": 0.0009683412816287581, "setc/credit_mean": 1.0, "setc/credit_std": 0.09890042394399642, "setc/logratio_margin_delta": 9.815832328796386, "setc/logratio_margin_vanilla": 111.9, "setc/logratio_margin_weighted": 121.71583232879638, "step": 300 }, { "epoch": 0.16168148746968472, "eval_entropy": 0.3961181640625, "eval_logits/chosen": -0.5485536702445084, "eval_logits/rejected": -0.5521437157571957, "eval_logps/chosen": -344.875, "eval_logps/rejected": -457.9375, "eval_loss": 0.45288482308387756, "eval_mean_token_accuracy": 0.7810891252011061, "eval_num_tokens": 10652272.0, "eval_rewards/accuracies": 0.810546875, "eval_rewards/chosen": -2.6322504356503487, "eval_rewards/margins": 1.2452996505890042, "eval_rewards/rejected": -3.877550110220909, "eval_runtime": 14.1906, "eval_samples_per_second": 35.235, "eval_setc/cal_net_lr": 0.000964790405843831, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.09871792490594089, "eval_setc/logratio_margin_delta": 9.83822700381279, "eval_setc/logratio_margin_vanilla": 115.9619140625, "eval_setc/logratio_margin_weighted": 125.80014106631279, "eval_steps_per_second": 2.255, "step": 300 }, { "entropy": 0.37734375, "epoch": 0.16707087038534088, "grad_norm": 25.576738357543945, "learning_rate": 9.866746538891172e-07, "logits/chosen": -0.5276412640451731, "logits/rejected": -0.5328882291269553, "logps/chosen": -361.3, "logps/rejected": -468.8, "loss": 0.5015, "mean_token_accuracy": 0.7804528266191483, "num_tokens": 11025201.0, "rewards/accuracies": 0.75, "rewards/chosen": -2.8542094469070434, "rewards/margins": 1.1937554821372032, "rewards/rejected": -4.047964870929718, "setc/cal_net_lr": 0.0009613657999149686, "setc/credit_mean": 1.0, "setc/credit_std": 0.09489493407309055, "setc/logratio_margin_delta": 11.188052368164062, "setc/logratio_margin_vanilla": 108.1875, "setc/logratio_margin_weighted": 119.37555236816407, "step": 310 }, { "entropy": 0.39609375, "epoch": 0.17246025330099704, "grad_norm": 25.457834243774414, "learning_rate": 9.844316221587202e-07, "logits/chosen": -0.547783353678369, "logits/rejected": -0.5447233904757801, "logps/chosen": -354.2, "logps/rejected": -475.9, "loss": 0.4576, "mean_token_accuracy": 0.7729141265153885, "num_tokens": 11378241.0, "rewards/accuracies": 0.775, "rewards/chosen": -2.7483198761940004, "rewards/margins": 1.3043047487735748, "rewards/rejected": -4.052624702453613, "setc/cal_net_lr": 0.0009537262199008873, "setc/credit_mean": 1.0, "setc/credit_std": 0.09067346304655075, "setc/logratio_margin_delta": 9.902355837821961, "setc/logratio_margin_vanilla": 120.528125, "setc/logratio_margin_weighted": 130.43048083782196, "step": 320 }, { "entropy": 0.3943359375, "epoch": 0.1778496362166532, "grad_norm": 26.59535789489746, "learning_rate": 9.82017160564499e-07, "logits/chosen": -0.5416634787030847, "logits/rejected": -0.5507818676121325, "logps/chosen": -370.9, "logps/rejected": -523.6, "loss": 0.4163, "mean_token_accuracy": 0.7765046328306198, "num_tokens": 11731948.0, "rewards/accuracies": 0.809375, "rewards/chosen": -2.9148443579673766, "rewards/margins": 1.6325711846351623, "rewards/rejected": -4.547415578365326, "setc/cal_net_lr": 0.0009454335381365463, "setc/credit_mean": 1.0, "setc/credit_std": 0.09494004286825657, "setc/logratio_margin_delta": 11.8196231842041, "setc/logratio_margin_vanilla": 151.4375, "setc/logratio_margin_weighted": 163.2571231842041, "step": 330 }, { "entropy": 0.35849609375, "epoch": 0.18323901913230936, "grad_norm": 32.58420944213867, "learning_rate": 9.79432123532143e-07, "logits/chosen": -0.49375742239443066, "logits/rejected": -0.5324902808324306, "logps/chosen": -475.7, "logps/rejected": -628.1, "loss": 0.4615, "mean_token_accuracy": 0.7430515229701996, "num_tokens": 12055728.0, "rewards/accuracies": 0.75625, "rewards/chosen": -4.24499009847641, "rewards/margins": 1.6139036893844605, "rewards/rejected": -5.858893728256225, "setc/cal_net_lr": 0.0009364996912584834, "setc/credit_mean": 1.0, "setc/credit_std": 0.09690484963357449, "setc/logratio_margin_delta": 11.415373611450196, "setc/logratio_margin_vanilla": 149.975, "setc/logratio_margin_weighted": 161.3903736114502, "step": 340 }, { "entropy": 0.3806640625, "epoch": 0.18862840204796552, "grad_norm": 27.812801361083984, "learning_rate": 9.76677425850295e-07, "logits/chosen": -0.6376107562818031, "logits/rejected": -0.6418548650284829, "logps/chosen": -465.6, "logps/rejected": -605.1, "loss": 0.4574, "mean_token_accuracy": 0.7482803016901016, "num_tokens": 12415175.0, "rewards/accuracies": 0.778125, "rewards/chosen": -3.9335139870643614, "rewards/margins": 1.5329814106225967, "rewards/rejected": -5.466495335102081, "setc/cal_net_lr": 0.000926937538807931, "setc/credit_mean": 1.0, "setc/credit_std": 0.08994356840848923, "setc/logratio_margin_delta": 11.082522773742676, "setc/logratio_margin_vanilla": 142.215625, "setc/logratio_margin_weighted": 153.29814777374267, "step": 350 }, { "entropy": 0.4140625, "epoch": 0.19401778496362165, "grad_norm": 21.86480712890625, "learning_rate": 9.737540423468298e-07, "logits/chosen": -0.6784440598334419, "logits/rejected": -0.7089506769710789, "logps/chosen": -366.3, "logps/rejected": -501.6, "loss": 0.4309, "mean_token_accuracy": 0.7626632928848267, "num_tokens": 12762716.0, "rewards/accuracies": 0.834375, "rewards/chosen": -2.877052891254425, "rewards/margins": 1.4415714114904403, "rewards/rejected": -4.318624341487885, "setc/cal_net_lr": 0.0009167608447205565, "setc/credit_mean": 1.0, "setc/credit_std": 0.08553993590176105, "setc/logratio_margin_delta": 10.219639015197753, "setc/logratio_margin_vanilla": 133.9375, "setc/logratio_margin_weighted": 144.15713901519774, "step": 360 }, { "entropy": 0.43505859375, "epoch": 0.1994071678792778, "grad_norm": 38.346282958984375, "learning_rate": 9.706630075438816e-07, "logits/chosen": -0.641167718623423, "logits/rejected": -0.6645962228342419, "logps/chosen": -407.6, "logps/rejected": -551.5, "loss": 0.4323, "mean_token_accuracy": 0.7579832553863526, "num_tokens": 13126907.0, "rewards/accuracies": 0.815625, "rewards/chosen": -3.3240499973297117, "rewards/margins": 1.5297267407178878, "rewards/rejected": -4.8537767171859745, "setc/cal_net_lr": 0.0009059842575144065, "setc/credit_mean": 1.0, "setc/credit_std": 0.08733033128082753, "setc/logratio_margin_delta": 11.610177040100098, "setc/logratio_margin_vanilla": 141.3625, "setc/logratio_margin_weighted": 152.9726770401001, "step": 370 }, { "entropy": 0.44599609375, "epoch": 0.20479655079493397, "grad_norm": 23.460803985595703, "learning_rate": 9.67405415291751e-07, "logits/chosen": -0.7566490767380819, "logits/rejected": -0.7921768754823747, "logps/chosen": -448.3, "logps/rejected": -605.6, "loss": 0.4256, "mean_token_accuracy": 0.7468535989522934, "num_tokens": 13476130.0, "rewards/accuracies": 0.828125, "rewards/chosen": -3.7169021725654603, "rewards/margins": 1.668231150507927, "rewards/rejected": -5.385133290290833, "setc/cal_net_lr": 0.0008946232892045628, "setc/credit_mean": 1.0, "setc/credit_std": 0.08953866809606552, "setc/logratio_margin_delta": 11.298118209838867, "setc/logratio_margin_vanilla": 155.525, "setc/logratio_margin_weighted": 166.82311820983887, "step": 380 }, { "entropy": 0.4015625, "epoch": 0.21018593371059013, "grad_norm": 37.17218780517578, "learning_rate": 9.639824183818136e-07, "logits/chosen": -0.6983920089322826, "logits/rejected": -0.7127737130064461, "logps/chosen": -405.7, "logps/rejected": -581.9, "loss": 0.3994, "mean_token_accuracy": 0.7641981273889542, "num_tokens": 13824865.0, "rewards/accuracies": 0.8375, "rewards/chosen": -3.3119530200958254, "rewards/margins": 1.9102674454450608, "rewards/rejected": -5.222220623493195, "setc/cal_net_lr": 0.0008826942929748675, "setc/credit_mean": 1.0, "setc/credit_std": 0.08975566849112511, "setc/logratio_margin_delta": 13.101746749877929, "setc/logratio_margin_vanilla": 177.925, "setc/logratio_margin_weighted": 191.02674674987793, "step": 390 }, { "entropy": 0.39140625, "epoch": 0.2155753166262463, "grad_norm": 31.108200073242188, "learning_rate": 9.603952281385731e-07, "logits/chosen": -0.6745283746311073, "logits/rejected": -0.7180434918542554, "logps/chosen": -434.1, "logps/rejected": -567.2, "loss": 0.4993, "mean_token_accuracy": 0.7623392403125763, "num_tokens": 14186289.0, "rewards/accuracies": 0.78125, "rewards/chosen": -3.6153177618980408, "rewards/margins": 1.431649774312973, "rewards/rejected": -5.046967601776123, "setc/cal_net_lr": 0.0008702144396388513, "setc/credit_mean": 1.0, "setc/credit_std": 0.0924822997301817, "setc/logratio_margin_delta": 9.014981460571288, "setc/logratio_margin_vanilla": 134.15, "setc/logratio_margin_weighted": 143.16498146057128, "step": 400 }, { "epoch": 0.2155753166262463, "eval_entropy": 0.39117431640625, "eval_logits/chosen": -0.7596684509609115, "eval_logits/rejected": -0.7672941152249027, "eval_logps/chosen": -379.0625, "eval_logps/rejected": -529.625, "eval_loss": 0.403584361076355, "eval_mean_token_accuracy": 0.7722007241100073, "eval_num_tokens": 14186289.0, "eval_rewards/accuracies": 0.833984375, "eval_rewards/chosen": -2.980880632996559, "eval_rewards/margins": 1.6366331987082958, "eval_rewards/rejected": -4.617513798177242, "eval_runtime": 14.2806, "eval_samples_per_second": 35.013, "eval_setc/cal_net_lr": 0.0008634697702573876, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.08779020188376307, "eval_setc/logratio_margin_delta": 11.151197791099548, "eval_setc/logratio_margin_vanilla": 153.91796875, "eval_setc/logratio_margin_weighted": 165.06916654109955, "eval_steps_per_second": 2.241, "step": 400 }, { "entropy": 0.3703125, "epoch": 0.22096469954190245, "grad_norm": 24.23314094543457, "learning_rate": 9.566451139909988e-07, "logits/chosen": -0.6472189487288608, "logits/rejected": -0.6706296399402472, "logps/chosen": -380.5, "logps/rejected": -509.3, "loss": 0.4374, "mean_token_accuracy": 0.7814467757940292, "num_tokens": 14563626.0, "rewards/accuracies": 0.809375, "rewards/chosen": -3.0536776065826414, "rewards/margins": 1.3719112485647202, "rewards/rejected": -4.425588870048523, "setc/cal_net_lr": 0.0008572016929237579, "setc/credit_mean": 1.0, "setc/credit_std": 0.08474965952336788, "setc/logratio_margin_delta": 9.878629875183105, "setc/logratio_margin_vanilla": 127.3125, "setc/logratio_margin_weighted": 137.1911298751831, "step": 410 }, { "entropy": 0.37265625, "epoch": 0.22635408245755861, "grad_norm": 28.370731353759766, "learning_rate": 9.527334030233028e-07, "logits/chosen": -0.6874033644949044, "logits/rejected": -0.6702958053090058, "logps/chosen": -365.85, "logps/rejected": -494.9, "loss": 0.486, "mean_token_accuracy": 0.7773175150156021, "num_tokens": 14919543.0, "rewards/accuracies": 0.775, "rewards/chosen": -2.9779480576515196, "rewards/margins": 1.3624024033546447, "rewards/rejected": -4.340350472927094, "setc/cal_net_lr": 0.0008436747836132224, "setc/credit_mean": 1.0, "setc/credit_std": 0.08348823450505734, "setc/logratio_margin_delta": 11.090241241455079, "setc/logratio_margin_vanilla": 125.15, "setc/logratio_margin_weighted": 136.24024124145507, "step": 420 }, { "entropy": 0.36806640625, "epoch": 0.23174346537321477, "grad_norm": 20.790348052978516, "learning_rate": 9.486614795053136e-07, "logits/chosen": -0.687221071504475, "logits/rejected": -0.6840851017343192, "logps/chosen": -355.1, "logps/rejected": -485.7, "loss": 0.4321, "mean_token_accuracy": 0.785921522974968, "num_tokens": 15277577.0, "rewards/accuracies": 0.825, "rewards/chosen": -2.7120434999465943, "rewards/margins": 1.4238930448889733, "rewards/rejected": -4.135936522483826, "setc/cal_net_lr": 0.0008296531825858478, "setc/credit_mean": 1.0, "setc/credit_std": 0.08098721131682396, "setc/logratio_margin_delta": 11.239310264587402, "setc/logratio_margin_vanilla": 131.15, "setc/logratio_margin_weighted": 142.3893102645874, "step": 430 }, { "entropy": 0.36962890625, "epoch": 0.23713284828887093, "grad_norm": 29.099376678466797, "learning_rate": 9.444307844026128e-07, "logits/chosen": -0.7001576273000311, "logits/rejected": -0.7564168572512255, "logps/chosen": -394.0, "logps/rejected": -555.2, "loss": 0.4012, "mean_token_accuracy": 0.777391391992569, "num_tokens": 15614929.0, "rewards/accuracies": 0.834375, "rewards/chosen": -3.16938259601593, "rewards/margins": 1.7262235552072525, "rewards/rejected": -4.8956061720848085, "setc/cal_net_lr": 0.0008151570727884652, "setc/credit_mean": 1.0, "setc/credit_std": 0.08620954863727093, "setc/logratio_margin_delta": 14.22236442565918, "setc/logratio_margin_vanilla": 158.4, "setc/logratio_margin_weighted": 172.62236442565919, "step": 440 }, { "entropy": 0.35576171875, "epoch": 0.2425222312045271, "grad_norm": 36.5993537902832, "learning_rate": 9.400428148666088e-07, "logits/chosen": -0.7156464373663424, "logits/rejected": -0.7045427588458122, "logps/chosen": -444.4, "logps/rejected": -642.3, "loss": 0.4123, "mean_token_accuracy": 0.7572789788246155, "num_tokens": 15977200.0, "rewards/accuracies": 0.821875, "rewards/chosen": -3.769306206703186, "rewards/margins": 2.0929836332798004, "rewards/rejected": -5.86228985786438, "setc/cal_net_lr": 0.0008002073201844367, "setc/credit_mean": 1.0, "setc/credit_std": 0.08339687958359718, "setc/logratio_margin_delta": 14.948371696472169, "setc/logratio_margin_vanilla": 194.35, "setc/logratio_margin_weighted": 209.29837169647217, "step": 450 }, { "entropy": 0.34697265625, "epoch": 0.24791161412018323, "grad_norm": 21.59111785888672, "learning_rate": 9.354991237047272e-07, "logits/chosen": -0.6320623481408233, "logits/rejected": -0.6675801158708582, "logps/chosen": -436.2, "logps/rejected": -619.0, "loss": 0.3781, "mean_token_accuracy": 0.7620709240436554, "num_tokens": 16318118.0, "rewards/accuracies": 0.828125, "rewards/chosen": -3.739296352863312, "rewards/margins": 1.971435186266899, "rewards/rejected": -5.710731565952301, "setc/cal_net_lr": 0.0007848254437188073, "setc/credit_mean": 1.0, "setc/credit_std": 0.08574292548000813, "setc/logratio_margin_delta": 13.068524169921876, "setc/logratio_margin_vanilla": 184.075, "setc/logratio_margin_weighted": 197.14352416992188, "step": 460 }, { "entropy": 0.35234375, "epoch": 0.2533009970358394, "grad_norm": 39.83766555786133, "learning_rate": 9.308013188309052e-07, "logits/chosen": -0.5782339558042321, "logits/rejected": -0.5778674119896376, "logps/chosen": -449.1, "logps/rejected": -629.1, "loss": 0.4338, "mean_token_accuracy": 0.7699876993894577, "num_tokens": 16695798.0, "rewards/accuracies": 0.828125, "rewards/chosen": -3.8279688358306885, "rewards/margins": 1.9190792471170426, "rewards/rejected": -5.747048032283783, "setc/cal_net_lr": 0.0007690335843435461, "setc/credit_mean": 1.0, "setc/credit_std": 0.08457843437790871, "setc/logratio_margin_delta": 15.239182376861573, "setc/logratio_margin_vanilla": 176.66875, "setc/logratio_margin_weighted": 191.90793237686157, "step": 470 }, { "entropy": 0.37353515625, "epoch": 0.2586903799514956, "grad_norm": 61.235111236572266, "learning_rate": 9.259510626965874e-07, "logits/chosen": -0.6445287935159719, "logits/rejected": -0.6687137779580832, "logps/chosen": -441.0, "logps/rejected": -700.9, "loss": 0.3949, "mean_token_accuracy": 0.7573456019163132, "num_tokens": 17051928.0, "rewards/accuracies": 0.846875, "rewards/chosen": -3.6645211458206175, "rewards/margins": 2.796916735172272, "rewards/rejected": -6.46143786907196, "setc/cal_net_lr": 0.0007528544731474588, "setc/credit_mean": 1.0, "setc/credit_std": 0.08809281550347806, "setc/logratio_margin_delta": 17.716680145263673, "setc/logratio_margin_vanilla": 261.975, "setc/logratio_margin_weighted": 279.69168014526366, "step": 480 }, { "entropy": 0.3984375, "epoch": 0.26407976286715173, "grad_norm": 51.56365966796875, "learning_rate": 9.209500717024184e-07, "logits/chosen": -0.5772588425384099, "logits/rejected": -0.5829906453335669, "logps/chosen": -433.8, "logps/rejected": -669.7, "loss": 0.3728, "mean_token_accuracy": 0.7556526750326157, "num_tokens": 17399473.0, "rewards/accuracies": 0.840625, "rewards/chosen": -3.6556755781173704, "rewards/margins": 2.525260826945305, "rewards/rejected": -6.180936527252197, "setc/cal_net_lr": 0.0007363113986366443, "setc/credit_mean": 1.0, "setc/credit_std": 0.08914664313197136, "setc/logratio_margin_delta": 16.12609519958496, "setc/logratio_margin_vanilla": 236.4, "setc/logratio_margin_weighted": 252.52609519958497, "step": 490 }, { "entropy": 0.3970703125, "epoch": 0.2694691457828079, "grad_norm": 41.8188591003418, "learning_rate": 9.158001155908463e-07, "logits/chosen": -0.5640562576190764, "logits/rejected": -0.5751849485254861, "logps/chosen": -457.0, "logps/rejected": -668.7, "loss": 0.4275, "mean_token_accuracy": 0.7455924898386002, "num_tokens": 17747161.0, "rewards/accuracies": 0.83125, "rewards/chosen": -3.902832806110382, "rewards/margins": 2.2202740639448164, "rewards/rejected": -6.123106646537781, "setc/cal_net_lr": 0.0007194281732126001, "setc/credit_mean": 1.0, "setc/credit_std": 0.0880829505622387, "setc/logratio_margin_delta": 13.539912796020507, "setc/logratio_margin_vanilla": 208.4875, "setc/logratio_margin_weighted": 222.0274127960205, "step": 500 }, { "epoch": 0.2694691457828079, "eval_entropy": 0.40399169921875, "eval_logits/chosen": -0.592532312686467, "eval_logits/rejected": -0.605988185624285, "eval_logps/chosen": -462.9375, "eval_logps/rejected": -684.0, "eval_loss": 0.35599419474601746, "eval_mean_token_accuracy": 0.7497607115656137, "eval_num_tokens": 17747161.0, "eval_rewards/accuracies": 0.849609375, "eval_rewards/chosen": -3.872654564678669, "eval_rewards/margins": 2.3638561107218266, "eval_rewards/rejected": -6.236510649323463, "eval_runtime": 14.2091, "eval_samples_per_second": 35.189, "eval_setc/cal_net_lr": 0.000710449074239924, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.08703692350536585, "eval_setc/logratio_margin_delta": 14.291290760040283, "eval_setc/logratio_margin_vanilla": 222.796875, "eval_setc/logratio_margin_weighted": 237.08816576004028, "eval_steps_per_second": 2.252, "step": 500 }, { "entropy": 0.3931640625, "epoch": 0.274858528698464, "grad_norm": 24.2525577545166, "learning_rate": 9.105030168198502e-07, "logits/chosen": -0.6082334317471639, "logits/rejected": -0.6164790579325305, "logps/chosen": -376.2, "logps/rejected": -538.2, "loss": 0.4458, "mean_token_accuracy": 0.7635719299316406, "num_tokens": 18074898.0, "rewards/accuracies": 0.809375, "rewards/chosen": -3.032584583759308, "rewards/margins": 1.7272971540689468, "rewards/rejected": -4.7598817348480225, "setc/cal_net_lr": 0.0007022290988962156, "setc/credit_mean": 1.0, "setc/credit_std": 0.08350460566580295, "setc/logratio_margin_delta": 12.80472412109375, "setc/logratio_margin_vanilla": 159.925, "setc/logratio_margin_weighted": 172.72972412109374, "step": 510 }, { "entropy": 0.418359375, "epoch": 0.28024791161412016, "grad_norm": 49.45343017578125, "learning_rate": 9.050606499180102e-07, "logits/chosen": -0.6519357377370888, "logits/rejected": -0.6423741925659583, "logps/chosen": -402.5, "logps/rejected": -578.4, "loss": 0.4126, "mean_token_accuracy": 0.7656619876623154, "num_tokens": 18445858.0, "rewards/accuracies": 0.834375, "rewards/chosen": -3.202549707889557, "rewards/margins": 1.860961028933525, "rewards/rejected": -5.063510704040527, "setc/cal_net_lr": 0.0006847389323470055, "setc/credit_mean": 1.0, "setc/credit_std": 0.0847404919564724, "setc/logratio_margin_delta": 11.646109008789063, "setc/logratio_margin_vanilla": 174.45, "setc/logratio_margin_weighted": 186.09610900878906, "step": 520 }, { "entropy": 0.43330078125, "epoch": 0.2856372945297763, "grad_norm": 29.773883819580078, "learning_rate": 8.994749408211536e-07, "logits/chosen": -0.7015092396795138, "logits/rejected": -0.7123617485729619, "logps/chosen": -520.0, "logps/rejected": -806.7, "loss": 0.3962, "mean_token_accuracy": 0.729384770989418, "num_tokens": 18791922.0, "rewards/accuracies": 0.815625, "rewards/chosen": -4.475372099876404, "rewards/margins": 3.0225441813468934, "rewards/rejected": -7.497916245460511, "setc/cal_net_lr": 0.0006669828492279234, "setc/credit_mean": 1.0, "setc/credit_std": 0.08948785215616226, "setc/logratio_margin_delta": 17.254426956176758, "setc/logratio_margin_vanilla": 285.0, "setc/logratio_margin_weighted": 302.25442695617676, "step": 530 }, { "entropy": 0.4228515625, "epoch": 0.2910266774454325, "grad_norm": 21.04458236694336, "learning_rate": 8.937478661908069e-07, "logits/chosen": -0.7061397320826232, "logits/rejected": -0.7431562334397835, "logps/chosen": -449.0, "logps/rejected": -646.0, "loss": 0.3719, "mean_token_accuracy": 0.7576641947031021, "num_tokens": 19156930.0, "rewards/accuracies": 0.828125, "rewards/chosen": -3.6808414697647094, "rewards/margins": 2.149125117063522, "rewards/rejected": -5.829966616630554, "setc/cal_net_lr": 0.0006489864079670575, "setc/credit_mean": 1.0, "setc/credit_std": 0.08795219510793686, "setc/logratio_margin_delta": 14.512511444091796, "setc/logratio_margin_vanilla": 200.4, "setc/logratio_margin_weighted": 214.9125114440918, "step": 540 }, { "entropy": 0.424609375, "epoch": 0.29641606036108864, "grad_norm": 67.26245880126953, "learning_rate": 8.878814527146985e-07, "logits/chosen": -0.6747327955549893, "logits/rejected": -0.729661536184305, "logps/chosen": -396.5, "logps/rejected": -612.5, "loss": 0.3854, "mean_token_accuracy": 0.7647815823554993, "num_tokens": 19505372.0, "rewards/accuracies": 0.8375, "rewards/chosen": -3.2515084028244017, "rewards/margins": 2.2733805954456328, "rewards/rejected": -5.52488911151886, "setc/cal_net_lr": 0.0006307755129683639, "setc/credit_mean": 1.0, "setc/credit_std": 0.09044467471539974, "setc/logratio_margin_delta": 15.188063812255859, "setc/logratio_margin_vanilla": 212.15, "setc/logratio_margin_weighted": 227.33806381225585, "step": 550 }, { "entropy": 0.410546875, "epoch": 0.3018054432767448, "grad_norm": 56.778804779052734, "learning_rate": 8.818777763895585e-07, "logits/chosen": -0.6860816122826272, "logits/rejected": -0.6895918590666351, "logps/chosen": -430.6, "logps/rejected": -624.7, "loss": 0.3921, "mean_token_accuracy": 0.7563153892755509, "num_tokens": 19863864.0, "rewards/accuracies": 0.83125, "rewards/chosen": -3.628837537765503, "rewards/margins": 2.037660950422287, "rewards/rejected": -5.666498494148255, "setc/cal_net_lr": 0.0006123763773243994, "setc/credit_mean": 1.0, "setc/credit_std": 0.08885525353252888, "setc/logratio_margin_delta": 13.316099929809571, "setc/logratio_margin_vanilla": 190.45, "setc/logratio_margin_weighted": 203.76609992980957, "step": 560 }, { "entropy": 0.3953125, "epoch": 0.30719482619240096, "grad_norm": 42.43838119506836, "learning_rate": 8.75738961786467e-07, "logits/chosen": -0.688892356215907, "logits/rejected": -0.7157972843996309, "logps/chosen": -461.9, "logps/rejected": -660.4, "loss": 0.4452, "mean_token_accuracy": 0.7513888716697693, "num_tokens": 20215920.0, "rewards/accuracies": 0.803125, "rewards/chosen": -3.9202752113342285, "rewards/margins": 2.134818637371063, "rewards/rejected": -6.055093717575073, "setc/cal_net_lr": 0.0005938154850847184, "setc/credit_mean": 1.0, "setc/credit_std": 0.08897339478135109, "setc/logratio_margin_delta": 14.231868362426757, "setc/logratio_margin_vanilla": 199.25, "setc/logratio_margin_weighted": 213.48186836242675, "step": 570 }, { "entropy": 0.3853515625, "epoch": 0.3125842091080571, "grad_norm": 39.56584167480469, "learning_rate": 8.694671812990155e-07, "logits/chosen": -0.643150039587863, "logits/rejected": -0.6802687248343776, "logps/chosen": -361.9, "logps/rejected": -505.1, "loss": 0.4191, "mean_token_accuracy": 0.7855405032634735, "num_tokens": 20574156.0, "rewards/accuracies": 0.796875, "rewards/chosen": -2.867064654827118, "rewards/margins": 1.4799500912427903, "rewards/rejected": -4.347014737129212, "setc/cal_net_lr": 0.0005751195531342541, "setc/credit_mean": 1.0, "setc/credit_std": 0.08569788709282875, "setc/logratio_margin_delta": 10.945015716552735, "setc/logratio_margin_vanilla": 137.05, "setc/logratio_margin_weighted": 147.99501571655273, "step": 580 }, { "entropy": 0.41494140625, "epoch": 0.3179735920237133, "grad_norm": 18.509885787963867, "learning_rate": 8.630646543745433e-07, "logits/chosen": -0.7564691341004269, "logits/rejected": -0.7964878528594659, "logps/chosen": -466.7, "logps/rejected": -751.8, "loss": 0.3255, "mean_token_accuracy": 0.7475934147834777, "num_tokens": 20942542.0, "rewards/accuracies": 0.85, "rewards/chosen": -3.9546597361564637, "rewards/margins": 3.00535169839859, "rewards/rejected": -6.960011458396911, "setc/cal_net_lr": 0.000556315492736548, "setc/credit_mean": 1.0, "setc/credit_std": 0.08982903957366943, "setc/logratio_margin_delta": 20.885171508789064, "setc/logratio_margin_vanilla": 279.65, "setc/logratio_margin_weighted": 300.53517150878906, "step": 590 }, { "entropy": 0.405859375, "epoch": 0.32336297493936944, "grad_norm": 26.894384384155273, "learning_rate": 8.565336467287235e-07, "logits/chosen": -0.7470103611470064, "logits/rejected": -0.7722877603949199, "logps/chosen": -531.8, "logps/rejected": -820.6, "loss": 0.3808, "mean_token_accuracy": 0.7415017306804657, "num_tokens": 21291866.0, "rewards/accuracies": 0.834375, "rewards/chosen": -4.668173170089721, "rewards/margins": 3.097328555583954, "rewards/rejected": -7.765501689910889, "setc/cal_net_lr": 0.0005374303707971904, "setc/credit_mean": 1.0, "setc/credit_std": 0.08936081901192665, "setc/logratio_margin_delta": 19.732860565185547, "setc/logratio_margin_vanilla": 290.0, "setc/logratio_margin_weighted": 309.73286056518555, "step": 600 }, { "epoch": 0.32336297493936944, "eval_entropy": 0.4246826171875, "eval_logits/chosen": -0.7950303357768591, "eval_logits/rejected": -0.809009124352729, "eval_logps/chosen": -570.9375, "eval_logps/rejected": -884.625, "eval_loss": 0.3066518008708954, "eval_mean_token_accuracy": 0.726261779665947, "eval_num_tokens": 21291866.0, "eval_rewards/accuracies": 0.875, "eval_rewards/chosen": -5.0408351719379425, "eval_rewards/margins": 3.342687487602234, "eval_rewards/rejected": -8.383522659540176, "eval_runtime": 14.2973, "eval_samples_per_second": 34.972, "eval_setc/cal_net_lr": 0.0005274940322432844, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.09106356324627995, "eval_setc/logratio_margin_delta": 19.50989866256714, "eval_setc/logratio_margin_vanilla": 315.8125, "eval_setc/logratio_margin_weighted": 335.32239866256714, "eval_steps_per_second": 2.238, "step": 600 }, { "entropy": 0.43974609375, "epoch": 0.3287523578550256, "grad_norm": 41.537105560302734, "learning_rate": 8.498764695437746e-07, "logits/chosen": -0.7988740761782873, "logits/rejected": -0.7901971178299118, "logps/chosen": -550.8, "logps/rejected": -910.0, "loss": 0.3901, "mean_token_accuracy": 0.7285310417413712, "num_tokens": 21645741.0, "rewards/accuracies": 0.8375, "rewards/chosen": -4.841351461410523, "rewards/margins": 3.7519346356391905, "rewards/rejected": -8.593285942077637, "setc/cal_net_lr": 0.0005184913709032242, "setc/credit_mean": 1.0, "setc/credit_std": 0.08994946219027042, "setc/logratio_margin_delta": 18.493466186523438, "setc/logratio_margin_vanilla": 356.7, "setc/logratio_margin_weighted": 375.19346618652344, "step": 610 }, { "entropy": 0.43916015625, "epoch": 0.33414174077068176, "grad_norm": 42.426490783691406, "learning_rate": 8.430954786505839e-07, "logits/chosen": -0.8440707111321254, "logits/rejected": -0.8533261529276125, "logps/chosen": -431.9, "logps/rejected": -656.1, "loss": 0.3473, "mean_token_accuracy": 0.7551120847463608, "num_tokens": 21998359.0, "rewards/accuracies": 0.878125, "rewards/chosen": -3.5407957673072814, "rewards/margins": 2.3734045028686523, "rewards/rejected": -5.914200258255005, "setc/cal_net_lr": 0.0004995257541945978, "setc/credit_mean": 1.0, "setc/credit_std": 0.08794644176959991, "setc/logratio_margin_delta": 15.790457153320313, "setc/logratio_margin_vanilla": 221.55, "setc/logratio_margin_weighted": 237.3404571533203, "step": 620 }, { "entropy": 0.4126953125, "epoch": 0.3395311236863379, "grad_norm": 24.879779815673828, "learning_rate": 8.361930736950299e-07, "logits/chosen": -0.8090851961587198, "logits/rejected": -0.8543913061734949, "logps/chosen": -432.2, "logps/rejected": -689.9, "loss": 0.3312, "mean_token_accuracy": 0.7620727181434631, "num_tokens": 22367339.0, "rewards/accuracies": 0.884375, "rewards/chosen": -3.6054784178733827, "rewards/margins": 2.696714848279953, "rewards/rejected": -6.302193331718445, "setc/cal_net_lr": 0.00048056082012398516, "setc/credit_mean": 1.0, "setc/credit_std": 0.0865101970732212, "setc/logratio_margin_delta": 17.24649543762207, "setc/logratio_margin_vanilla": 252.425, "setc/logratio_margin_weighted": 269.67149543762207, "step": 630 }, { "entropy": 0.402734375, "epoch": 0.3449205066019941, "grad_norm": 29.819805145263672, "learning_rate": 8.29171697288799e-07, "logits/chosen": -0.7799408370558261, "logits/rejected": -0.7945290786054311, "logps/chosen": -520.6, "logps/rejected": -788.4, "loss": 0.4063, "mean_token_accuracy": 0.7504911392927169, "num_tokens": 22746633.0, "rewards/accuracies": 0.825, "rewards/chosen": -4.477042305469513, "rewards/margins": 2.874571180343628, "rewards/rejected": -7.351613521575928, "setc/cal_net_lr": 0.00046162386716145867, "setc/credit_mean": 1.0, "setc/credit_std": 0.08807485699653625, "setc/logratio_margin_delta": 20.70712242126465, "setc/logratio_margin_vanilla": 266.75, "setc/logratio_margin_weighted": 287.45712242126467, "step": 640 }, { "entropy": 0.411328125, "epoch": 0.35030988951765024, "grad_norm": 45.361915588378906, "learning_rate": 8.220338341449986e-07, "logits/chosen": -0.7536949779449323, "logits/rejected": -0.797799150332984, "logps/chosen": -483.3, "logps/rejected": -779.8, "loss": 0.3723, "mean_token_accuracy": 0.7493287652730942, "num_tokens": 23123993.0, "rewards/accuracies": 0.84375, "rewards/chosen": -4.141227328777314, "rewards/margins": 3.1743425011634825, "rewards/rejected": -7.315569949150086, "setc/cal_net_lr": 0.00044274215350057644, "setc/credit_mean": 1.0, "setc/credit_std": 0.08726362958550453, "setc/logratio_margin_delta": 19.73426208496094, "setc/logratio_margin_vanilla": 297.7, "setc/logratio_margin_weighted": 317.43426208496095, "step": 650 }, { "entropy": 0.4083984375, "epoch": 0.3556992724333064, "grad_norm": 37.76664352416992, "learning_rate": 8.147820101988704e-07, "logits/chosen": -0.8120619076794536, "logits/rejected": -0.8494678070244117, "logps/chosen": -361.7, "logps/rejected": -565.3, "loss": 0.3474, "mean_token_accuracy": 0.766129943728447, "num_tokens": 23454227.0, "rewards/accuracies": 0.8375, "rewards/chosen": -2.9051371812820435, "rewards/margins": 2.1819508969783783, "rewards/rejected": -5.087088012695313, "setc/cal_net_lr": 0.00042394285782244395, "setc/credit_mean": 1.0, "setc/credit_std": 0.08217526227235794, "setc/logratio_margin_delta": 14.345091247558594, "setc/logratio_margin_vanilla": 203.85, "setc/logratio_margin_weighted": 218.1950912475586, "step": 660 }, { "entropy": 0.421875, "epoch": 0.36108865534896256, "grad_norm": 39.73728942871094, "learning_rate": 8.074187917139175e-07, "logits/chosen": -0.7319666812006074, "logits/rejected": -0.7562072698077329, "logps/chosen": -492.1, "logps/rejected": -771.8, "loss": 0.37, "mean_token_accuracy": 0.7497668653726578, "num_tokens": 23818594.0, "rewards/accuracies": 0.85, "rewards/chosen": -4.2121153473854065, "rewards/margins": 2.9903420001268386, "rewards/rejected": -7.202457308769226, "setc/cal_net_lr": 0.00040525304017422745, "setc/credit_mean": 1.0, "setc/credit_std": 0.08255718871951104, "setc/logratio_margin_delta": 18.634216690063475, "setc/logratio_margin_vanilla": 280.4, "setc/logratio_margin_weighted": 299.0342166900635, "step": 670 }, { "entropy": 0.42548828125, "epoch": 0.3664780382646187, "grad_norm": 40.21190643310547, "learning_rate": 7.999467843737582e-07, "logits/chosen": -0.8038282930095126, "logits/rejected": -0.8421265761188057, "logps/chosen": -460.9, "logps/rejected": -701.0, "loss": 0.3708, "mean_token_accuracy": 0.7515695452690124, "num_tokens": 24157407.0, "rewards/accuracies": 0.859375, "rewards/chosen": -3.902362608909607, "rewards/margins": 2.5381152600049974, "rewards/rejected": -6.440477788448334, "setc/cal_net_lr": 0.00038669960301843077, "setc/credit_mean": 1.0, "setc/credit_std": 0.0827824518084526, "setc/logratio_margin_delta": 16.236531066894532, "setc/logratio_margin_vanilla": 237.575, "setc/logratio_margin_weighted": 253.81153106689453, "step": 680 }, { "entropy": 0.39599609375, "epoch": 0.3718674211802749, "grad_norm": 20.98696517944336, "learning_rate": 7.92368632360032e-07, "logits/chosen": -0.8042416360177865, "logits/rejected": -0.8270914788970604, "logps/chosen": -469.8, "logps/rejected": -730.8, "loss": 0.344, "mean_token_accuracy": 0.7479289263486862, "num_tokens": 24493771.0, "rewards/accuracies": 0.859375, "rewards/chosen": -4.073775279521942, "rewards/margins": 2.744447809457779, "rewards/rejected": -6.8182231187820435, "setc/cal_net_lr": 0.000368309252509004, "setc/credit_mean": 1.0, "setc/credit_std": 0.08153500594198704, "setc/logratio_margin_delta": 16.7447868347168, "setc/logratio_margin_vanilla": 257.7, "setc/logratio_margin_weighted": 274.4447868347168, "step": 690 }, { "entropy": 0.3873046875, "epoch": 0.37725680409593104, "grad_norm": 58.19563674926758, "learning_rate": 7.846870174166803e-07, "logits/chosen": -0.751562047809603, "logits/rejected": -0.7788989722070948, "logps/chosen": -553.7, "logps/rejected": -904.5, "loss": 0.3499, "mean_token_accuracy": 0.7495994985103607, "num_tokens": 24861785.0, "rewards/accuracies": 0.8375, "rewards/chosen": -4.865620005130768, "rewards/margins": 3.707144260406494, "rewards/rejected": -8.57276439666748, "setc/cal_net_lr": 0.00035010846005002045, "setc/credit_mean": 1.0, "setc/credit_std": 0.0810060478746891, "setc/logratio_margin_delta": 21.064439392089845, "setc/logratio_margin_vanilla": 349.65, "setc/logratio_margin_weighted": 370.71443939208984, "step": 700 }, { "epoch": 0.37725680409593104, "eval_entropy": 0.4024658203125, "eval_logits/chosen": -0.8513818849761876, "eval_logits/rejected": -0.8638025440627952, "eval_logps/chosen": -570.4375, "eval_logps/rejected": -930.875, "eval_loss": 0.3007389008998871, "eval_mean_token_accuracy": 0.7318701520562172, "eval_num_tokens": 24861785.0, "eval_rewards/accuracies": 0.86328125, "eval_rewards/chosen": -5.0088541358709335, "eval_rewards/margins": 3.838240258395672, "eval_rewards/rejected": -8.847094357013702, "eval_runtime": 14.2035, "eval_samples_per_second": 35.202, "eval_setc/cal_net_lr": 0.00034062822999975144, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.08001079317182302, "eval_setc/logratio_margin_delta": 21.063920974731445, "eval_setc/logratio_margin_vanilla": 363.875, "eval_setc/logratio_margin_weighted": 384.93892097473145, "eval_steps_per_second": 2.253, "step": 700 }, { "entropy": 0.4060546875, "epoch": 0.38264618701158715, "grad_norm": 86.25070190429688, "learning_rate": 7.769046579009356e-07, "logits/chosen": -0.8918346681389394, "logits/rejected": -0.9074910639956075, "logps/chosen": -639.5, "logps/rejected": -989.4, "loss": 0.4045, "mean_token_accuracy": 0.7125069379806519, "num_tokens": 25201876.0, "rewards/accuracies": 0.821875, "rewards/chosen": -5.8089584589004515, "rewards/margins": 3.719921666383743, "rewards/rejected": -9.52888035774231, "setc/cal_net_lr": 0.0003321234241922574, "setc/credit_mean": 1.0, "setc/credit_std": 0.07928324565291404, "setc/logratio_margin_delta": 21.29217185974121, "setc/logratio_margin_vanilla": 350.7, "setc/logratio_margin_weighted": 371.99217185974123, "step": 710 }, { "entropy": 0.4359375, "epoch": 0.3880355699272433, "grad_norm": 45.142372131347656, "learning_rate": 7.690243078213531e-07, "logits/chosen": -0.9740272973330708, "logits/rejected": -1.0063528667573212, "logps/chosen": -689.2, "logps/rejected": -1139.4, "loss": 0.3136, "mean_token_accuracy": 0.6987293243408204, "num_tokens": 25567728.0, "rewards/accuracies": 0.88125, "rewards/chosen": -6.253266739845276, "rewards/margins": 4.749369788169861, "rewards/rejected": -11.002636575698853, "setc/cal_net_lr": 0.0003143800329225269, "setc/credit_mean": 1.0, "setc/credit_std": 0.0756493654102087, "setc/logratio_margin_delta": 25.636982727050782, "setc/logratio_margin_vanilla": 449.3, "setc/logratio_margin_weighted": 474.9369827270508, "step": 720 }, { "entropy": 0.45234375, "epoch": 0.39342495284289947, "grad_norm": 13.179286003112793, "learning_rate": 7.610487558632278e-07, "logits/chosen": -1.0633192171048695, "logits/rejected": -1.1078102349589132, "logps/chosen": -656.4, "logps/rejected": -1050.0, "loss": 0.4255, "mean_token_accuracy": 0.693815928697586, "num_tokens": 25901754.0, "rewards/accuracies": 0.85625, "rewards/chosen": -5.947020316123963, "rewards/margins": 4.109345865249634, "rewards/rejected": -10.056366229057312, "setc/cal_net_lr": 0.00029690382640003857, "setc/credit_mean": 1.0, "setc/credit_std": 0.07734486311674119, "setc/logratio_margin_delta": 20.284591674804688, "setc/logratio_margin_vanilla": 390.65, "setc/logratio_margin_weighted": 410.93459167480466, "step": 730 }, { "entropy": 0.44501953125, "epoch": 0.3988143357585556, "grad_norm": 73.25760650634766, "learning_rate": 7.529808244017382e-07, "logits/chosen": -1.0208730154549697, "logits/rejected": -1.055008389785436, "logps/chosen": -627.0, "logps/rejected": -961.0, "loss": 0.4673, "mean_token_accuracy": 0.70859514772892, "num_tokens": 26254733.0, "rewards/accuracies": 0.865625, "rewards/chosen": -5.5865224480628966, "rewards/margins": 3.537639796733856, "rewards/rejected": -9.124162173271179, "setc/cal_net_lr": 0.00027971996019343084, "setc/credit_mean": 1.0, "setc/credit_std": 0.07454265281558037, "setc/logratio_margin_delta": 17.0139892578125, "setc/logratio_margin_vanilla": 336.75, "setc/logratio_margin_weighted": 353.7639892578125, "step": 740 }, { "entropy": 0.45361328125, "epoch": 0.4042037186742118, "grad_norm": 94.29061126708984, "learning_rate": 7.448233685031693e-07, "logits/chosen": -1.0630623814562947, "logits/rejected": -1.1073087245931403, "logps/chosen": -550.3, "logps/rejected": -882.2, "loss": 0.3111, "mean_token_accuracy": 0.7219261229038239, "num_tokens": 26587461.0, "rewards/accuracies": 0.89375, "rewards/chosen": -4.7769605159759525, "rewards/margins": 3.535008490085602, "rewards/rejected": -8.311969017982483, "setc/cal_net_lr": 0.00026285316907138917, "setc/credit_mean": 1.0, "setc/credit_std": 0.07642085328698159, "setc/logratio_margin_delta": 19.450860595703126, "setc/logratio_margin_vanilla": 334.05, "setc/logratio_margin_weighted": 353.5008605957031, "step": 750 }, { "entropy": 0.40673828125, "epoch": 0.40959310158986795, "grad_norm": 35.175498962402344, "learning_rate": 7.365792749145662e-07, "logits/chosen": -1.0218951494827677, "logits/rejected": -1.048595356950924, "logps/chosen": -488.2, "logps/rejected": -758.2, "loss": 0.37, "mean_token_accuracy": 0.73828344643116, "num_tokens": 26953136.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.2265440940856935, "rewards/margins": 2.8608301758766173, "rewards/rejected": -7.08737428188324, "setc/cal_net_lr": 0.00024632773139897124, "setc/credit_mean": 1.0, "setc/credit_std": 0.07358978651463985, "setc/logratio_margin_delta": 16.033021926879883, "setc/logratio_margin_vanilla": 270.05, "setc/logratio_margin_weighted": 286.0830219268799, "step": 760 }, { "entropy": 0.40654296875, "epoch": 0.4149824845055241, "grad_norm": 37.247169494628906, "learning_rate": 7.28251461042177e-07, "logits/chosen": -1.0070175249471507, "logits/rejected": -1.0310765859216962, "logps/chosen": -496.5, "logps/rejected": -771.8, "loss": 0.37, "mean_token_accuracy": 0.7500552415847779, "num_tokens": 27312799.0, "rewards/accuracies": 0.84375, "rewards/chosen": -4.265070939064026, "rewards/margins": 2.9076979637145994, "rewards/rejected": -7.172768950462341, "setc/cal_net_lr": 0.00023016743419088836, "setc/credit_mean": 1.0, "setc/credit_std": 0.07328317761421203, "setc/logratio_margin_delta": 18.319805145263672, "setc/logratio_margin_vanilla": 272.45, "setc/logratio_margin_weighted": 290.76980514526366, "step": 770 }, { "entropy": 0.43095703125, "epoch": 0.42037186742118027, "grad_norm": 19.564472198486328, "learning_rate": 7.198428739190457e-07, "logits/chosen": -1.082859984019952, "logits/rejected": -1.0929271185391367, "logps/chosen": -557.8, "logps/rejected": -832.2, "loss": 0.3455, "mean_token_accuracy": 0.7363463073968888, "num_tokens": 27667656.0, "rewards/accuracies": 0.86875, "rewards/chosen": -4.847213423252105, "rewards/margins": 2.919535148143768, "rewards/rejected": -7.766748380661011, "setc/cal_net_lr": 0.00021439553887204476, "setc/credit_mean": 1.0, "setc/credit_std": 0.07149026319384574, "setc/logratio_margin_delta": 16.803517150878907, "setc/logratio_margin_vanilla": 275.15, "setc/logratio_margin_weighted": 291.9535171508789, "step": 780 }, { "entropy": 0.428515625, "epoch": 0.4257612503368364, "grad_norm": 24.025815963745117, "learning_rate": 7.113564891621212e-07, "logits/chosen": -0.9810840312218222, "logits/rejected": -1.0128754190678195, "logps/chosen": -626.1, "logps/rejected": -965.2, "loss": 0.3468, "mean_token_accuracy": 0.729249706864357, "num_tokens": 28037217.0, "rewards/accuracies": 0.86875, "rewards/chosen": -5.643828749656677, "rewards/margins": 3.574371063709259, "rewards/rejected": -9.218199872970581, "setc/cal_net_lr": 0.00019903474779462064, "setc/credit_mean": 1.0, "setc/credit_std": 0.07268696911633014, "setc/logratio_margin_delta": 19.687115478515626, "setc/logratio_margin_vanilla": 337.75, "setc/logratio_margin_weighted": 357.43711547851564, "step": 790 }, { "entropy": 0.4359375, "epoch": 0.4311506332524926, "grad_norm": 26.062593460083008, "learning_rate": 7.027953099192509e-07, "logits/chosen": -1.037471741150463, "logits/rejected": -1.0548466740521376, "logps/chosen": -552.5, "logps/rejected": -859.8, "loss": 0.2958, "mean_token_accuracy": 0.7349712908267975, "num_tokens": 28389736.0, "rewards/accuracies": 0.896875, "rewards/chosen": -4.908501935005188, "rewards/margins": 3.250082492828369, "rewards/rejected": -8.15858452320099, "setc/cal_net_lr": 0.0001841071715598952, "setc/credit_mean": 1.0, "setc/credit_std": 0.0715821348130703, "setc/logratio_margin_delta": 19.408258819580077, "setc/logratio_margin_vanilla": 305.6, "setc/logratio_margin_weighted": 325.0082588195801, "step": 800 }, { "epoch": 0.4311506332524926, "eval_entropy": 0.437255859375, "eval_logits/chosen": -1.0900919600946075, "eval_logits/rejected": -1.1059430813034792, "eval_logps/chosen": -572.875, "eval_logps/rejected": -897.125, "eval_loss": 0.29659560322761536, "eval_mean_token_accuracy": 0.7379258405417204, "eval_num_tokens": 28389736.0, "eval_rewards/accuracies": 0.890625, "eval_rewards/chosen": -5.015224754810333, "eval_rewards/margins": 3.451749622821808, "eval_rewards/rejected": -8.466974467039108, "eval_runtime": 14.2077, "eval_samples_per_second": 35.192, "eval_setc/cal_net_lr": 0.00017643152107359864, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.07336517237126827, "eval_setc/logratio_margin_delta": 19.209267139434814, "eval_setc/logratio_margin_vanilla": 328.59375, "eval_setc/logratio_margin_weighted": 347.8030171394348, "eval_steps_per_second": 2.252, "step": 800 }, { "entropy": 0.43359375, "epoch": 0.43654001616814875, "grad_norm": 70.45535278320312, "learning_rate": 6.941623658064312e-07, "logits/chosen": -1.0733893563106265, "logits/rejected": -1.0982346659335882, "logps/chosen": -633.1, "logps/rejected": -941.0, "loss": 0.3849, "mean_token_accuracy": 0.729145985841751, "num_tokens": 28733805.0, "rewards/accuracies": 0.834375, "rewards/chosen": -5.708724451065064, "rewards/margins": 3.248158019781113, "rewards/rejected": -8.956882286071778, "setc/cal_net_lr": 0.00016963429719184532, "setc/credit_mean": 1.0, "setc/credit_std": 0.07415341325104237, "setc/logratio_margin_delta": 17.66580581665039, "setc/logratio_margin_vanilla": 307.15, "setc/logratio_margin_weighted": 324.81580581665037, "step": 810 }, { "entropy": 0.4302734375, "epoch": 0.4419293990838049, "grad_norm": 40.56505584716797, "learning_rate": 6.854607118356928e-07, "logits/chosen": -1.1385733399006264, "logits/rejected": -1.183113374768871, "logps/chosen": -566.1, "logps/rejected": -896.6, "loss": 0.3152, "mean_token_accuracy": 0.7246308416128159, "num_tokens": 29076104.0, "rewards/accuracies": 0.859375, "rewards/chosen": -5.08197500705719, "rewards/margins": 3.4515433311462402, "rewards/rejected": -8.53351833820343, "setc/cal_net_lr": 0.0001556369572083343, "setc/credit_mean": 1.0, "setc/credit_std": 0.07320533730089665, "setc/logratio_margin_delta": 18.154342651367188, "setc/logratio_margin_vanilla": 327.0, "setc/logratio_margin_weighted": 345.1543426513672, "step": 820 }, { "entropy": 0.43427734375, "epoch": 0.44731878199946107, "grad_norm": 56.18523025512695, "learning_rate": 6.766934273339973e-07, "logits/chosen": -1.1405602822344687, "logits/rejected": -1.1867411468308247, "logps/chosen": -557.2, "logps/rejected": -863.8, "loss": 0.2993, "mean_token_accuracy": 0.7315610319375991, "num_tokens": 29438936.0, "rewards/accuracies": 0.88125, "rewards/chosen": -4.800099658966064, "rewards/margins": 3.2655778348445894, "rewards/rejected": -8.065677428245545, "setc/cal_net_lr": 0.00014213529963440723, "setc/credit_mean": 1.0, "setc/credit_std": 0.07172517664730549, "setc/logratio_margin_delta": 18.5077823638916, "setc/logratio_margin_vanilla": 308.05, "setc/logratio_margin_weighted": 326.5577823638916, "step": 830 }, { "entropy": 0.4123046875, "epoch": 0.45270816491511723, "grad_norm": 27.114397048950195, "learning_rate": 6.678636148535307e-07, "logits/chosen": -1.1366168963090462, "logits/rejected": -1.173395336965087, "logps/chosen": -496.6, "logps/rejected": -803.6, "loss": 0.3137, "mean_token_accuracy": 0.7537930309772491, "num_tokens": 29818328.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.169769895076752, "rewards/margins": 3.23821382522583, "rewards/rejected": -7.407983732223511, "setc/cal_net_lr": 0.0001291487590008584, "setc/credit_mean": 1.0, "setc/credit_std": 0.07007276304066182, "setc/logratio_margin_delta": 19.77139205932617, "setc/logratio_margin_vanilla": 304.05, "setc/logratio_margin_weighted": 323.8213920593262, "step": 840 }, { "entropy": 0.4216796875, "epoch": 0.4580975478307734, "grad_norm": 41.05332946777344, "learning_rate": 6.58974399073777e-07, "logits/chosen": -1.1544986734058926, "logits/rejected": -1.1544554202476698, "logps/chosen": -538.0, "logps/rejected": -871.0, "loss": 0.2942, "mean_token_accuracy": 0.736896401643753, "num_tokens": 30188501.0, "rewards/accuracies": 0.8875, "rewards/chosen": -4.634168601036071, "rewards/margins": 3.525930380821228, "rewards/rejected": -8.160099053382874, "setc/cal_net_lr": 0.00011669602836981566, "setc/credit_mean": 1.0, "setc/credit_std": 0.07121242731809616, "setc/logratio_margin_delta": 21.643047332763672, "setc/logratio_margin_vanilla": 330.95, "setc/logratio_margin_weighted": 352.59304733276366, "step": 850 }, { "entropy": 0.4259765625, "epoch": 0.46348693074642955, "grad_norm": 64.55622100830078, "learning_rate": 6.500289256957615e-07, "logits/chosen": -1.1011358195249479, "logits/rejected": -1.1499861979740114, "logps/chosen": -573.8, "logps/rejected": -874.0, "loss": 0.4537, "mean_token_accuracy": 0.7331288278102874, "num_tokens": 30547224.0, "rewards/accuracies": 0.803125, "rewards/chosen": -5.030296123027801, "rewards/margins": 3.2068679571151733, "rewards/rejected": -8.237163949012757, "setc/cal_net_lr": 0.00010479503242760742, "setc/credit_mean": 1.0, "setc/credit_std": 0.0720590002834797, "setc/logratio_margin_delta": 18.836805725097655, "setc/logratio_margin_vanilla": 301.85, "setc/logratio_margin_weighted": 320.68680572509766, "step": 860 }, { "entropy": 0.43505859375, "epoch": 0.4688763136620857, "grad_norm": 39.62507247924805, "learning_rate": 6.410303603288561e-07, "logits/chosen": -1.088289906345851, "logits/rejected": -1.0841832767082082, "logps/chosen": -404.8, "logps/rejected": -620.1, "loss": 0.3673, "mean_token_accuracy": 0.7633604764938354, "num_tokens": 30902393.0, "rewards/accuracies": 0.8625, "rewards/chosen": -3.1859230756759644, "rewards/margins": 2.2959958791732786, "rewards/rejected": -5.481918931007385, "setc/cal_net_lr": 9.346290168364385e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.06972141042351723, "setc/logratio_margin_delta": 14.549591827392579, "setc/logratio_margin_vanilla": 215.05, "setc/logratio_margin_weighted": 229.59959182739257, "step": 870 }, { "entropy": 0.41787109375, "epoch": 0.47426569657774187, "grad_norm": 33.14332962036133, "learning_rate": 6.319818873705377e-07, "logits/chosen": -1.089281081386084, "logits/rejected": -1.114244560013979, "logps/chosen": -462.6, "logps/rejected": -784.6, "loss": 0.3062, "mean_token_accuracy": 0.7454081952571869, "num_tokens": 31235617.0, "rewards/accuracies": 0.896875, "rewards/chosen": -3.9631998538970947, "rewards/margins": 3.428285652399063, "rewards/rejected": -7.391485595703125, "setc/cal_net_lr": 8.271594781245119e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07049608752131462, "setc/logratio_margin_delta": 20.25357093811035, "setc/logratio_margin_vanilla": 322.575, "setc/logratio_margin_weighted": 342.8285709381104, "step": 880 }, { "entropy": 0.4427734375, "epoch": 0.47965507949339803, "grad_norm": 25.35371208190918, "learning_rate": 6.228867088794997e-07, "logits/chosen": -1.117235813203115, "logits/rejected": -1.1379867493894777, "logps/chosen": -611.3, "logps/rejected": -992.4, "loss": 0.3308, "mean_token_accuracy": 0.7231000691652298, "num_tokens": 31611665.0, "rewards/accuracies": 0.85625, "rewards/chosen": -5.4193216323852536, "rewards/margins": 4.017894721031189, "rewards/rejected": -9.437216472625732, "setc/cal_net_lr": 7.256964017435153e-05, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.0703140676021576, "setc/logratio_margin_delta": 22.489488220214845, "setc/logratio_margin_vanilla": 379.3, "setc/logratio_margin_weighted": 401.7894882202148, "step": 890 }, { "entropy": 0.425390625, "epoch": 0.4850444624090542, "grad_norm": 21.312118530273438, "learning_rate": 6.137480434425124e-07, "logits/chosen": -1.063890301715667, "logits/rejected": -1.1046153500158946, "logps/chosen": -502.7, "logps/rejected": -788.2, "loss": 0.3396, "mean_token_accuracy": 0.7333939164876938, "num_tokens": 31947911.0, "rewards/accuracies": 0.853125, "rewards/chosen": -4.411977684497833, "rewards/margins": 2.9957467019557953, "rewards/rejected": -7.407724452018738, "setc/cal_net_lr": 6.303858354858501e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.06882553435862064, "setc/logratio_margin_delta": 16.82468185424805, "setc/logratio_margin_vanilla": 282.75, "setc/logratio_margin_weighted": 299.57468185424807, "step": 900 }, { "epoch": 0.4850444624090542, "eval_entropy": 0.458984375, "eval_logits/chosen": -1.136106022790833, "eval_logits/rejected": -1.1553218335773052, "eval_logps/chosen": -567.9375, "eval_logps/rejected": -919.125, "eval_loss": 0.2798229157924652, "eval_mean_token_accuracy": 0.7211284190416336, "eval_num_tokens": 31947911.0, "eval_rewards/accuracies": 0.888671875, "eval_rewards/chosen": -4.92750097066164, "eval_rewards/margins": 3.7332901805639267, "eval_rewards/rejected": -8.660791024565697, "eval_runtime": 14.2045, "eval_samples_per_second": 35.2, "eval_setc/cal_net_lr": 5.825929969209536e-05, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.07130194280762225, "eval_setc/logratio_margin_delta": 20.13284683227539, "eval_setc/logratio_margin_vanilla": 354.25, "eval_setc/logratio_margin_weighted": 374.3828468322754, "eval_steps_per_second": 2.253, "step": 900 }, { "entropy": 0.4380859375, "epoch": 0.4904338453247103, "grad_norm": 30.84346580505371, "learning_rate": 6.045691250354349e-07, "logits/chosen": -1.0498209270589363, "logits/rejected": -1.07947071859647, "logps/chosen": -607.5, "logps/rejected": -951.4, "loss": 0.3157, "mean_token_accuracy": 0.7191443383693695, "num_tokens": 32308377.0, "rewards/accuracies": 0.859375, "rewards/chosen": -5.468822598457336, "rewards/margins": 3.641323584318161, "rewards/rejected": -9.110146307945252, "setc/cal_net_lr": 5.413649711092565e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07014792338013649, "setc/logratio_margin_delta": 21.932370758056642, "setc/logratio_margin_vanilla": 342.2, "setc/logratio_margin_weighted": 364.13237075805665, "step": 910 }, { "entropy": 0.43798828125, "epoch": 0.49582322824036645, "grad_norm": 69.57173919677734, "learning_rate": 5.953532018787807e-07, "logits/chosen": -1.0614989872760174, "logits/rejected": -1.1125016970713177, "logps/chosen": -562.6, "logps/rejected": -910.6, "loss": 0.3801, "mean_token_accuracy": 0.7194907575845718, "num_tokens": 32667657.0, "rewards/accuracies": 0.834375, "rewards/chosen": -5.013018798828125, "rewards/margins": 3.655552077293396, "rewards/rejected": -8.668570947647094, "setc/cal_net_lr": 4.587619468605094e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07026223614811897, "setc/logratio_margin_delta": 19.905216217041016, "setc/logratio_margin_vanilla": 345.65, "setc/logratio_margin_weighted": 365.555216217041, "step": 920 }, { "entropy": 0.43251953125, "epoch": 0.5012126111560227, "grad_norm": 37.31387710571289, "learning_rate": 5.861035352882434e-07, "logits/chosen": -1.1028243420067865, "logits/rejected": -1.1239636597884728, "logps/chosen": -668.2, "logps/rejected": -1051.8, "loss": 0.3082, "mean_token_accuracy": 0.7101206243038177, "num_tokens": 33038205.0, "rewards/accuracies": 0.871875, "rewards/chosen": -6.0299333214759825, "rewards/margins": 4.101047283411026, "rewards/rejected": -10.130980563163757, "setc/cal_net_lr": 3.826956630309029e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07096938453614712, "setc/logratio_margin_delta": 25.654744720458986, "setc/logratio_margin_vanilla": 384.45, "setc/logratio_margin_weighted": 410.104744720459, "step": 930 }, { "entropy": 0.43154296875, "epoch": 0.5066019940716788, "grad_norm": 31.79937744140625, "learning_rate": 5.768233985205871e-07, "logits/chosen": -1.0400086846324763, "logits/rejected": -1.0826617979469901, "logps/chosen": -603.6, "logps/rejected": -1010.8, "loss": 0.3537, "mean_token_accuracy": 0.7205777823925018, "num_tokens": 33412806.0, "rewards/accuracies": 0.8375, "rewards/chosen": -5.440573036670685, "rewards/margins": 4.329031145572662, "rewards/rejected": -9.769603991508484, "setc/cal_net_lr": 3.1327561080901155e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07101482562720776, "setc/logratio_margin_delta": 24.953127288818358, "setc/logratio_margin_vanilla": 407.95, "setc/logratio_margin_weighted": 432.9031272888184, "step": 940 }, { "entropy": 0.44111328125, "epoch": 0.511991376987335, "grad_norm": 41.023216247558594, "learning_rate": 5.675160756153119e-07, "logits/chosen": -1.063226666319973, "logits/rejected": -1.103878181164049, "logps/chosen": -572.8, "logps/rejected": -937.8, "loss": 0.2939, "mean_token_accuracy": 0.7170851469039917, "num_tokens": 33771494.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.1082786321640015, "rewards/margins": 3.9003884315490724, "rewards/rejected": -9.0086669921875, "setc/cal_net_lr": 2.5060171467709125e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.0710456695407629, "setc/logratio_margin_delta": 24.788859558105468, "setc/logratio_margin_vanilla": 365.25, "setc/logratio_margin_weighted": 390.03885955810546, "step": 950 }, { "entropy": 0.44267578125, "epoch": 0.5173807599029911, "grad_norm": 52.85020446777344, "learning_rate": 5.581848602325041e-07, "logits/chosen": -1.0075269843195531, "logits/rejected": -1.0690200247116604, "logps/chosen": -536.4, "logps/rejected": -866.6, "loss": 0.3122, "mean_token_accuracy": 0.7294569611549377, "num_tokens": 34132231.0, "rewards/accuracies": 0.878125, "rewards/chosen": -4.7246493101119995, "rewards/margins": 3.508440887928009, "rewards/rejected": -8.233090209960938, "setc/cal_net_lr": 1.9476418857796457e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07001486346125603, "setc/logratio_margin_delta": 22.394107818603516, "setc/logratio_margin_vanilla": 328.45, "setc/logratio_margin_weighted": 350.8441078186035, "step": 960 }, { "entropy": 0.4556640625, "epoch": 0.5227701428186473, "grad_norm": 22.979949951171875, "learning_rate": 5.488330544872797e-07, "logits/chosen": -1.0871756321135606, "logits/rejected": -1.1191254896199978, "logps/chosen": -508.9, "logps/rejected": -850.4, "loss": 0.3046, "mean_token_accuracy": 0.7397548407316208, "num_tokens": 34502355.0, "rewards/accuracies": 0.859375, "rewards/chosen": -4.326072335243225, "rewards/margins": 3.6845016717910766, "rewards/rejected": -8.010574007034302, "setc/cal_net_lr": 1.4584340605943599e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07133265994489194, "setc/logratio_margin_delta": 22.850178527832032, "setc/logratio_margin_vanilla": 345.6, "setc/logratio_margin_weighted": 368.450178527832, "step": 970 }, { "entropy": 0.43857421875, "epoch": 0.5281595257343035, "grad_norm": 25.516042709350586, "learning_rate": 5.394639677812387e-07, "logits/chosen": -1.088331639351806, "logits/rejected": -1.134811225998461, "logps/chosen": -455.4, "logps/rejected": -754.1, "loss": 0.3234, "mean_token_accuracy": 0.7478211104869843, "num_tokens": 34852059.0, "rewards/accuracies": 0.871875, "rewards/chosen": -3.8503175020217895, "rewards/margins": 3.19860480427742, "rewards/rejected": -7.048922348022461, "setc/cal_net_lr": 1.0390978458315019e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07125458978116513, "setc/logratio_margin_delta": 19.160488891601563, "setc/logratio_margin_vanilla": 300.7, "setc/logratio_margin_weighted": 319.86048889160156, "step": 980 }, { "entropy": 0.41787109375, "epoch": 0.5335489086499596, "grad_norm": 22.045780181884766, "learning_rate": 5.300809156313389e-07, "logits/chosen": -1.1293688528513106, "logits/rejected": -1.1482995959404332, "logps/chosen": -498.4, "logps/rejected": -795.4, "loss": 0.3843, "mean_token_accuracy": 0.7528087586164475, "num_tokens": 35212012.0, "rewards/accuracies": 0.86875, "rewards/chosen": -4.3080038785934445, "rewards/margins": 3.206386703252792, "rewards/rejected": -7.514390516281128, "setc/cal_net_lr": 6.902368416441938e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07167959362268447, "setc/logratio_margin_delta": 22.038671493530273, "setc/logratio_margin_vanilla": 298.6, "setc/logratio_margin_weighted": 320.6386714935303, "step": 990 }, { "entropy": 0.4220703125, "epoch": 0.5389382915656158, "grad_norm": 29.74481773376465, "learning_rate": 5.206872184966065e-07, "logits/chosen": -1.2005332112522926, "logits/rejected": -1.223600991098159, "logps/chosen": -475.5, "logps/rejected": -771.0, "loss": 0.3032, "mean_token_accuracy": 0.7501377999782562, "num_tokens": 35559559.0, "rewards/accuracies": 0.884375, "rewards/chosen": -4.051128447055817, "rewards/margins": 3.163189709186554, "rewards/rejected": -7.214318156242371, "setc/cal_net_lr": 4.123532048892359e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07157905176281928, "setc/logratio_margin_delta": 21.718975830078126, "setc/logratio_margin_vanilla": 294.6, "setc/logratio_margin_weighted": 316.3189758300781, "step": 1000 }, { "epoch": 0.5389382915656158, "eval_entropy": 0.42791748046875, "eval_logits/chosen": -1.205733974360692, "eval_logits/rejected": -1.2256254141801155, "eval_logps/chosen": -520.25, "eval_logps/rejected": -881.125, "eval_loss": 0.26266777515411377, "eval_mean_token_accuracy": 0.7448298633098602, "eval_num_tokens": 35559559.0, "eval_rewards/accuracies": 0.912109375, "eval_rewards/chosen": -4.464685194194317, "eval_rewards/margins": 3.8784405440092087, "eval_rewards/rejected": -8.343125775456429, "eval_runtime": 14.2575, "eval_samples_per_second": 35.069, "eval_setc/cal_net_lr": 2.9204217399144684e-06, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.07189792324788868, "eval_setc/logratio_margin_delta": 25.44654369354248, "eval_setc/logratio_margin_vanilla": 364.4375, "eval_setc/logratio_margin_weighted": 389.8840436935425, "eval_steps_per_second": 2.244, "step": 1000 }, { "entropy": 0.4033203125, "epoch": 0.5443276744812718, "grad_norm": 84.13672637939453, "learning_rate": 5.11286200603097e-07, "logits/chosen": -1.1744355391727033, "logits/rejected": -1.1982522802884472, "logps/chosen": -589.9, "logps/rejected": -966.7, "loss": 0.3591, "mean_token_accuracy": 0.7260809451341629, "num_tokens": 35914588.0, "rewards/accuracies": 0.86875, "rewards/chosen": -5.367753648757935, "rewards/margins": 4.012299507856369, "rewards/rejected": -9.380053186416626, "setc/cal_net_lr": 2.058469263134096e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07252811081707478, "setc/logratio_margin_delta": 27.179954528808594, "setc/logratio_margin_vanilla": 374.05, "setc/logratio_margin_weighted": 401.2299545288086, "step": 1010 }, { "entropy": 0.41494140625, "epoch": 0.549717057396928, "grad_norm": 45.3025016784668, "learning_rate": 5.018811887675243e-07, "logits/chosen": -1.2095668156933808, "logits/rejected": -1.2754331190012376, "logps/chosen": -638.7, "logps/rejected": -1027.8, "loss": 0.2909, "mean_token_accuracy": 0.7233078300952911, "num_tokens": 36272220.0, "rewards/accuracies": 0.9, "rewards/chosen": -5.741470265388489, "rewards/margins": 4.172668009996414, "rewards/rejected": -9.914138317108154, "setc/cal_net_lr": 7.101525479954687e-07, "setc/credit_mean": 1.0, "setc/credit_std": 0.07240922413766385, "setc/logratio_margin_delta": 29.066813659667968, "setc/logratio_margin_vanilla": 388.2, "setc/logratio_margin_weighted": 417.26681365966795, "step": 1020 }, { "entropy": 0.41572265625, "epoch": 0.5551064403125842, "grad_norm": 54.05473327636719, "learning_rate": 4.924755112199719e-07, "logits/chosen": -1.1848740348447429, "logits/rejected": -1.2239745932740136, "logps/chosen": -530.6, "logps/rejected": -873.4, "loss": 0.3204, "mean_token_accuracy": 0.737355038523674, "num_tokens": 36608348.0, "rewards/accuracies": 0.846875, "rewards/chosen": -4.719290387630463, "rewards/margins": 3.6306647956371307, "rewards/rejected": -8.349955177307129, "setc/cal_net_lr": 8.052269501096502e-08, "setc/credit_mean": 1.0, "setc/credit_std": 0.07150659598410129, "setc/logratio_margin_delta": 23.716495513916016, "setc/logratio_margin_vanilla": 339.35, "setc/logratio_margin_weighted": 363.06649551391604, "step": 1030 }, { "entropy": 0.40263671875, "epoch": 0.5604958232282403, "grad_norm": 51.89723587036133, "learning_rate": 4.830724964261044e-07, "logits/chosen": -1.2390042148153568, "logits/rejected": -1.2885369769102084, "logps/chosen": -552.1, "logps/rejected": -873.2, "loss": 0.3291, "mean_token_accuracy": 0.7273021250963211, "num_tokens": 36931299.0, "rewards/accuracies": 0.8875, "rewards/chosen": -4.9631112813949585, "rewards/margins": 3.396042114496231, "rewards/rejected": -8.359153366088867, "setc/cal_net_lr": 1.704860048107271e-07, "setc/credit_mean": 1.0, "setc/credit_std": 0.0706510066986084, "setc/logratio_margin_delta": 24.204216766357423, "setc/logratio_margin_vanilla": 315.4, "setc/logratio_margin_weighted": 339.6042167663574, "step": 1040 }, { "entropy": 0.42197265625, "epoch": 0.5658852061438965, "grad_norm": 108.92530822753906, "learning_rate": 4.736754719092948e-07, "logits/chosen": -1.2478903284456264, "logits/rejected": -1.3215152839167994, "logps/chosen": -660.8, "logps/rejected": -1058.0, "loss": 0.317, "mean_token_accuracy": 0.7129073649644851, "num_tokens": 37295844.0, "rewards/accuracies": 0.8875, "rewards/chosen": -5.961016845703125, "rewards/margins": 4.262525498867035, "rewards/rejected": -10.223542261123658, "setc/cal_net_lr": 9.799129825749437e-07, "setc/credit_mean": 1.0, "setc/credit_std": 0.0731208048760891, "setc/logratio_margin_delta": 28.65255813598633, "setc/logratio_margin_vanilla": 397.6, "setc/logratio_margin_weighted": 426.2525581359863, "step": 1050 }, { "entropy": 0.40234375, "epoch": 0.5712745890595526, "grad_norm": 23.12958335876465, "learning_rate": 4.6428776307308795e-07, "logits/chosen": -1.2329498912657004, "logits/rejected": -1.247452417260519, "logps/chosen": -669.1, "logps/rejected": -1018.6, "loss": 0.3105, "mean_token_accuracy": 0.7226340204477311, "num_tokens": 37665756.0, "rewards/accuracies": 0.875, "rewards/chosen": -6.078487741947174, "rewards/margins": 3.7634204626083374, "rewards/rejected": -9.841908311843872, "setc/cal_net_lr": 2.5076385244310457e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07050310261547565, "setc/logratio_margin_delta": 26.592060089111328, "setc/logratio_margin_vanilla": 349.75, "setc/logratio_margin_weighted": 376.3420600891113, "step": 1060 }, { "entropy": 0.42890625, "epoch": 0.5766639719752088, "grad_norm": 54.92173385620117, "learning_rate": 4.5491269202441044e-07, "logits/chosen": -1.2268895919032963, "logits/rejected": -1.2605704569389382, "logps/chosen": -601.6, "logps/rejected": -936.8, "loss": 0.2924, "mean_token_accuracy": 0.7248776346445084, "num_tokens": 38031467.0, "rewards/accuracies": 0.871875, "rewards/chosen": -5.383153641223908, "rewards/margins": 3.53509316444397, "rewards/rejected": -8.918246793746949, "setc/cal_net_lr": 4.751463594525389e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07096486799418926, "setc/logratio_margin_delta": 21.009319305419922, "setc/logratio_margin_vanilla": 332.5, "setc/logratio_margin_weighted": 353.5093193054199, "step": 1070 }, { "entropy": 0.43212890625, "epoch": 0.582053354890865, "grad_norm": 29.253252029418945, "learning_rate": 4.455535763979489e-07, "logits/chosen": -1.2967908691930823, "logits/rejected": -1.334540072490827, "logps/chosen": -673.7, "logps/rejected": -1018.8, "loss": 0.276, "mean_token_accuracy": 0.7103154689073563, "num_tokens": 38406113.0, "rewards/accuracies": 0.8875, "rewards/chosen": -6.113679957389832, "rewards/margins": 3.689132344722748, "rewards/rejected": -9.802812147140504, "setc/cal_net_lr": 7.708158390355337e-06, "setc/credit_mean": 1.0, "setc/credit_std": 0.07218135371804238, "setc/logratio_margin_delta": 23.763233947753907, "setc/logratio_margin_vanilla": 345.15, "setc/logratio_margin_weighted": 368.9132339477539, "step": 1080 }, { "entropy": 0.43740234375, "epoch": 0.5874427378065211, "grad_norm": 63.62852096557617, "learning_rate": 4.3621372818211233e-07, "logits/chosen": -1.2966434675325627, "logits/rejected": -1.3388852385699441, "logps/chosen": -717.7, "logps/rejected": -1140.4, "loss": 0.2884, "mean_token_accuracy": 0.7034007340669632, "num_tokens": 38739807.0, "rewards/accuracies": 0.9, "rewards/chosen": -6.62959771156311, "rewards/margins": 4.472747385501862, "rewards/rejected": -11.1023451089859, "setc/cal_net_lr": 1.1373466991805524e-05, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.07208193615078926, "setc/logratio_margin_delta": 29.124752044677734, "setc/logratio_margin_vanilla": 418.15, "setc/logratio_margin_weighted": 447.2747520446777, "step": 1090 }, { "entropy": 0.43876953125, "epoch": 0.5928321207221773, "grad_norm": 58.927433013916016, "learning_rate": 4.2689645254698956e-07, "logits/chosen": -1.3082769209270118, "logits/rejected": -1.357060948196913, "logps/chosen": -709.7, "logps/rejected": -1137.4, "loss": 0.2492, "mean_token_accuracy": 0.713100990653038, "num_tokens": 39121036.0, "rewards/accuracies": 0.8875, "rewards/chosen": -6.4656531572341915, "rewards/margins": 4.571207290887832, "rewards/rejected": -11.03686032295227, "setc/cal_net_lr": 1.5742113487196524e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07217907235026359, "setc/logratio_margin_delta": 28.670735931396486, "setc/logratio_margin_vanilla": 428.45, "setc/logratio_margin_weighted": 457.1207359313965, "step": 1100 }, { "epoch": 0.5928321207221773, "eval_entropy": 0.44512939453125, "eval_logits/chosen": -1.3603613258581002, "eval_logits/rejected": -1.3904849397392145, "eval_logps/chosen": -694.0625, "eval_logps/rejected": -1149.25, "eval_loss": 0.2531175911426544, "eval_mean_token_accuracy": 0.7049690876156092, "eval_num_tokens": 39121036.0, "eval_rewards/accuracies": 0.904296875, "eval_rewards/chosen": -6.33006377518177, "eval_rewards/margins": 4.873635433614254, "eval_rewards/rejected": -11.20369903743267, "eval_runtime": 14.1864, "eval_samples_per_second": 35.245, "eval_setc/cal_net_lr": 1.8286307534312433e-05, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.07362057128921151, "eval_setc/logratio_margin_delta": 30.792871475219727, "eval_setc/logratio_margin_vanilla": 457.6875, "eval_setc/logratio_margin_weighted": 488.4803714752197, "eval_steps_per_second": 2.256, "step": 1100 }, { "entropy": 0.43310546875, "epoch": 0.5982215036378334, "grad_norm": 40.20852279663086, "learning_rate": 4.176050466747224e-07, "logits/chosen": -1.3270166713790517, "logits/rejected": -1.4020509422736323, "logps/chosen": -716.6, "logps/rejected": -1198.2, "loss": 0.2997, "mean_token_accuracy": 0.6954291999340058, "num_tokens": 39473664.0, "rewards/accuracies": 0.878125, "rewards/chosen": -6.649608445167542, "rewards/margins": 5.128565120697021, "rewards/rejected": -11.778173661231994, "setc/cal_net_lr": 2.0807809567527706e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07330171279609203, "setc/logratio_margin_delta": 34.506527709960935, "setc/logratio_margin_vanilla": 478.35, "setc/logratio_margin_weighted": 512.8565277099609, "step": 1110 }, { "entropy": 0.4423828125, "epoch": 0.6036108865534896, "grad_norm": 70.10661315917969, "learning_rate": 4.0834279859270284e-07, "logits/chosen": -1.3479278371743117, "logits/rejected": -1.402101682907808, "logps/chosen": -684.2, "logps/rejected": -1106.6, "loss": 0.2735, "mean_token_accuracy": 0.698912826180458, "num_tokens": 39821548.0, "rewards/accuracies": 0.8875, "rewards/chosen": -6.291271615028381, "rewards/margins": 4.475275766849518, "rewards/rejected": -10.766547250747681, "setc/cal_net_lr": 2.6563263577983417e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07238798253238202, "setc/logratio_margin_delta": 27.777584838867188, "setc/logratio_margin_vanilla": 419.75, "setc/logratio_margin_weighted": 447.5275848388672, "step": 1120 }, { "entropy": 0.43671875, "epoch": 0.6090002694691458, "grad_norm": 20.157817840576172, "learning_rate": 3.9911298601001273e-07, "logits/chosen": -1.3612183495667927, "logits/rejected": -1.3977843345025138, "logps/chosen": -601.0, "logps/rejected": -1085.4, "loss": 0.2369, "mean_token_accuracy": 0.7253735572099685, "num_tokens": 40192876.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.32529776096344, "rewards/margins": 5.10829610824585, "rewards/rejected": -10.433593916893006, "setc/cal_net_lr": 3.3000191013673035e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07107761949300766, "setc/logratio_margin_delta": 29.07962646484375, "setc/logratio_margin_vanilla": 481.75, "setc/logratio_margin_weighted": 510.8296264648437, "step": 1130 }, { "entropy": 0.45537109375, "epoch": 0.6143896523848019, "grad_norm": 47.181636810302734, "learning_rate": 3.8991887515751285e-07, "logits/chosen": -1.3811772662010782, "logits/rejected": -1.4073035922265509, "logps/chosen": -634.4, "logps/rejected": -992.8, "loss": 0.3292, "mean_token_accuracy": 0.7175350487232208, "num_tokens": 40559383.0, "rewards/accuracies": 0.890625, "rewards/chosen": -5.5887420058250425, "rewards/margins": 3.8901084780693056, "rewards/rejected": -9.478850364685059, "setc/cal_net_lr": 4.010932644449762e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07180496528744698, "setc/logratio_margin_delta": 26.260860443115234, "setc/logratio_margin_vanilla": 362.75, "setc/logratio_margin_weighted": 389.01086044311523, "step": 1140 }, { "entropy": 0.43125, "epoch": 0.6197790353004581, "grad_norm": 69.74315643310547, "learning_rate": 3.807637196319943e-07, "logits/chosen": -1.4106397734688008, "logits/rejected": -1.4677872876176674, "logps/chosen": -626.8, "logps/rejected": -1127.6, "loss": 0.3109, "mean_token_accuracy": 0.7143973648548126, "num_tokens": 40918488.0, "rewards/accuracies": 0.871875, "rewards/chosen": -5.679463398456574, "rewards/margins": 5.340268242359161, "rewards/rejected": -11.019731426239014, "setc/cal_net_lr": 4.7880436851978056e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07189065329730511, "setc/logratio_margin_delta": 37.0768325805664, "setc/logratio_margin_vanilla": 496.95, "setc/logratio_margin_weighted": 534.0268325805664, "step": 1150 }, { "entropy": 0.45703125, "epoch": 0.6251684182161142, "grad_norm": 98.07344055175781, "learning_rate": 3.716507592448015e-07, "logits/chosen": -1.388660761223864, "logits/rejected": -1.3975069124599455, "logps/chosen": -674.2, "logps/rejected": -1123.8, "loss": 0.2989, "mean_token_accuracy": 0.711270448565483, "num_tokens": 41284352.0, "rewards/accuracies": 0.890625, "rewards/chosen": -6.101946842670441, "rewards/margins": 4.818000304698944, "rewards/rejected": -10.919947171211243, "setc/cal_net_lr": 5.630233635884748e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07173714898526669, "setc/logratio_margin_delta": 30.45003662109375, "setc/logratio_margin_vanilla": 451.35, "setc/logratio_margin_weighted": 481.80003662109374, "step": 1160 }, { "entropy": 0.43251953125, "epoch": 0.6305578011317704, "grad_norm": 35.384769439697266, "learning_rate": 3.625832188753326e-07, "logits/chosen": -1.3047580708198727, "logits/rejected": -1.369543789182836, "logps/chosen": -584.7, "logps/rejected": -974.6, "loss": 0.3559, "mean_token_accuracy": 0.7312755823135376, "num_tokens": 41646831.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.220397734642029, "rewards/margins": 4.157508033514023, "rewards/rejected": -9.377905797958373, "setc/cal_net_lr": 6.536290233020582e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07165979258716107, "setc/logratio_margin_delta": 27.70081787109375, "setc/logratio_margin_vanilla": 388.05, "setc/logratio_margin_weighted": 415.7508178710938, "step": 1170 }, { "entropy": 0.45009765625, "epoch": 0.6359471840474266, "grad_norm": 32.074398040771484, "learning_rate": 3.5356430732982537e-07, "logits/chosen": -1.3558946550666882, "logits/rejected": -1.367336323455139, "logps/chosen": -480.1, "logps/rejected": -838.2, "loss": 0.3099, "mean_token_accuracy": 0.7344884812831879, "num_tokens": 41977848.0, "rewards/accuracies": 0.86875, "rewards/chosen": -4.130843257904052, "rewards/margins": 3.8041255950927733, "rewards/rejected": -7.934968829154968, "setc/cal_net_lr": 7.504909282306044e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.07002530321478843, "setc/logratio_margin_delta": 23.66255874633789, "setc/logratio_margin_vanilla": 356.75, "setc/logratio_margin_weighted": 380.4125587463379, "step": 1180 }, { "entropy": 0.4537109375, "epoch": 0.6413365669630827, "grad_norm": 93.54570007324219, "learning_rate": 3.445972162058286e-07, "logits/chosen": -1.348678425603877, "logits/rejected": -1.3903070843099647, "logps/chosen": -526.7, "logps/rejected": -908.2, "loss": 0.2711, "mean_token_accuracy": 0.7370507389307022, "num_tokens": 42336278.0, "rewards/accuracies": 0.9, "rewards/chosen": -4.534214103221894, "rewards/margins": 4.061459875106811, "rewards/rejected": -8.595674014091491, "setc/cal_net_lr": 8.534696535913519e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.0704065527766943, "setc/logratio_margin_delta": 27.095995330810545, "setc/logratio_margin_vanilla": 379.05, "setc/logratio_margin_weighted": 406.14599533081054, "step": 1190 }, { "entropy": 0.46689453125, "epoch": 0.6467259498787389, "grad_norm": 44.626190185546875, "learning_rate": 3.356851187627665e-07, "logits/chosen": -1.3881511589012898, "logits/rejected": -1.4340061933201436, "logps/chosen": -612.0, "logps/rejected": -1049.2, "loss": 0.2864, "mean_token_accuracy": 0.7160806059837341, "num_tokens": 42714490.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.426479089260101, "rewards/margins": 4.657109332084656, "rewards/rejected": -10.083588361740112, "setc/cal_net_lr": 9.624169699392697e-05, "setc/credit_mean": 1.0, "setc/credit_std": 0.0706709910184145, "setc/logratio_margin_delta": 29.010940551757812, "setc/logratio_margin_vanilla": 436.7, "setc/logratio_margin_weighted": 465.7109405517578, "step": 1200 }, { "epoch": 0.6467259498787389, "eval_entropy": 0.48931884765625, "eval_logits/chosen": -1.3905256226255422, "eval_logits/rejected": -1.424202928967488, "eval_logps/chosen": -595.625, "eval_logps/rejected": -1020.375, "eval_loss": 0.26192784309387207, "eval_mean_token_accuracy": 0.7144393119961023, "eval_num_tokens": 42714490.0, "eval_rewards/accuracies": 0.90625, "eval_rewards/chosen": -5.241253539919853, "eval_rewards/margins": 4.524848110973835, "eval_rewards/rejected": -9.766101628541946, "eval_runtime": 14.2072, "eval_samples_per_second": 35.193, "eval_setc/cal_net_lr": 0.0001021713083937391, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.07142715808004141, "eval_setc/logratio_margin_delta": 26.581721305847168, "eval_setc/logratio_margin_vanilla": 426.4375, "eval_setc/logratio_margin_weighted": 453.01922130584717, "eval_steps_per_second": 2.252, "step": 1200 }, { "entropy": 0.48623046875, "epoch": 0.652115332794395, "grad_norm": 33.88088607788086, "learning_rate": 3.2683116879898995e-07, "logits/chosen": -1.3838917142029392, "logits/rejected": -1.4172299467669451, "logps/chosen": -591.7, "logps/rejected": -991.4, "loss": 0.2971, "mean_token_accuracy": 0.7138142108917236, "num_tokens": 43061800.0, "rewards/accuracies": 0.8875, "rewards/chosen": -5.238682866096497, "rewards/margins": 4.275832235813141, "rewards/rejected": -9.514515256881714, "setc/cal_net_lr": 0.00010771760565312037, "setc/credit_mean": 1.0, "setc/credit_std": 0.07107691392302513, "setc/logratio_margin_delta": 23.933246612548828, "setc/logratio_margin_vanilla": 403.65, "setc/logratio_margin_weighted": 427.5832466125488, "step": 1210 }, { "entropy": 0.47412109375, "epoch": 0.6575047157100512, "grad_norm": 24.915206909179688, "learning_rate": 3.180384995357155e-07, "logits/chosen": -1.3262967787135937, "logits/rejected": -1.349477324233645, "logps/chosen": -513.3, "logps/rejected": -880.6, "loss": 0.282, "mean_token_accuracy": 0.7303446799516677, "num_tokens": 43422934.0, "rewards/accuracies": 0.890625, "rewards/chosen": -4.445479357242585, "rewards/margins": 3.892670524120331, "rewards/rejected": -8.338150024414062, "setc/cal_net_lr": 0.00011975817270565024, "setc/credit_mean": 1.0, "setc/credit_std": 0.06996985897421837, "setc/logratio_margin_delta": 25.342060852050782, "setc/logratio_margin_vanilla": 363.925, "setc/logratio_margin_weighted": 389.2670608520508, "step": 1220 }, { "entropy": 0.47568359375, "epoch": 0.6628940986257074, "grad_norm": 41.575965881347656, "learning_rate": 3.093102225082468e-07, "logits/chosen": -1.4003033108084804, "logits/rejected": -1.4641346348518416, "logps/chosen": -629.7, "logps/rejected": -1016.2, "loss": 0.3094, "mean_token_accuracy": 0.7089363127946854, "num_tokens": 43787087.0, "rewards/accuracies": 0.896875, "rewards/chosen": -5.634258687496185, "rewards/margins": 4.108717429637909, "rewards/rejected": -9.742976069450378, "setc/cal_net_lr": 0.00013234606674091868, "setc/credit_mean": 1.0, "setc/credit_std": 0.06970880702137947, "setc/logratio_margin_delta": 24.44675521850586, "setc/logratio_margin_vanilla": 386.425, "setc/logratio_margin_weighted": 410.8717552185059, "step": 1230 }, { "entropy": 0.4681640625, "epoch": 0.6682834815413635, "grad_norm": 42.247772216796875, "learning_rate": 3.006494264648687e-07, "logits/chosen": -1.5038021364170624, "logits/rejected": -1.5682670959321656, "logps/chosen": -746.6, "logps/rejected": -1242.8, "loss": 0.3417, "mean_token_accuracy": 0.6826083391904831, "num_tokens": 44151449.0, "rewards/accuracies": 0.875, "rewards/chosen": -6.940376424789429, "rewards/margins": 5.315678989887237, "rewards/rejected": -12.256055474281311, "setc/cal_net_lr": 0.00014546316851594206, "setc/credit_mean": 1.0, "setc/credit_std": 0.07185462042689324, "setc/logratio_margin_delta": 34.71790542602539, "setc/logratio_margin_vanilla": 496.85, "setc/logratio_margin_weighted": 531.5679054260254, "step": 1240 }, { "entropy": 0.45966796875, "epoch": 0.6736728644570197, "grad_norm": 16.57280731201172, "learning_rate": 2.9205917627380717e-07, "logits/chosen": -1.450344017268741, "logits/rejected": -1.4847681129953636, "logps/chosen": -670.9, "logps/rejected": -1120.8, "loss": 0.2943, "mean_token_accuracy": 0.7004325985908508, "num_tokens": 44493045.0, "rewards/accuracies": 0.8625, "rewards/chosen": -6.097113466262817, "rewards/margins": 4.771983242034912, "rewards/rejected": -10.869096612930297, "setc/cal_net_lr": 0.00015909059703651744, "setc/credit_mean": 1.0, "setc/credit_std": 0.07053840048611164, "setc/logratio_margin_delta": 30.54833221435547, "setc/logratio_margin_vanilla": 446.65, "setc/logratio_margin_weighted": 477.19833221435545, "step": 1250 }, { "entropy": 0.4638671875, "epoch": 0.6790622473726758, "grad_norm": 22.801801681518555, "learning_rate": 2.8354251183863833e-07, "logits/chosen": -1.4518458887210037, "logits/rejected": -1.505511142314074, "logps/chosen": -650.8, "logps/rejected": -1089.8, "loss": 0.2263, "mean_token_accuracy": 0.709148183465004, "num_tokens": 44862628.0, "rewards/accuracies": 0.896875, "rewards/chosen": -5.8279621481895445, "rewards/margins": 4.705725991725922, "rewards/rejected": -10.533688116073609, "setc/cal_net_lr": 0.00017320873673486848, "setc/credit_mean": 1.0, "setc/credit_std": 0.06929908581078052, "setc/logratio_margin_delta": 31.472618103027344, "setc/logratio_margin_vanilla": 439.1, "setc/logratio_margin_weighted": 470.57261810302737, "step": 1260 }, { "entropy": 0.45927734375, "epoch": 0.684451630288332, "grad_norm": 81.56884002685547, "learning_rate": 2.7510244702253166e-07, "logits/chosen": -1.5361301765199689, "logits/rejected": -1.6149365987857098, "logps/chosen": -617.4, "logps/rejected": -1038.0, "loss": 0.3026, "mean_token_accuracy": 0.7010406374931335, "num_tokens": 45189795.0, "rewards/accuracies": 0.8875, "rewards/chosen": -5.554684376716613, "rewards/margins": 4.473910105228424, "rewards/rejected": -10.028594422340394, "setc/cal_net_lr": 0.0001877972657046496, "setc/credit_mean": 1.0, "setc/credit_std": 0.0700082466006279, "setc/logratio_margin_delta": 30.791018676757812, "setc/logratio_margin_vanilla": 416.6, "setc/logratio_margin_weighted": 447.3910186767578, "step": 1270 }, { "entropy": 0.43818359375, "epoch": 0.6898410132039882, "grad_norm": 42.3488883972168, "learning_rate": 2.667419685817105e-07, "logits/chosen": -1.4125931971850059, "logits/rejected": -1.4648399384142876, "logps/chosen": -500.7, "logps/rejected": -882.1, "loss": 0.3085, "mean_token_accuracy": 0.7451159060001373, "num_tokens": 45554415.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.311119174957275, "rewards/margins": 4.081181025505066, "rewards/rejected": -8.392300033569336, "setc/cal_net_lr": 0.00020283518495266647, "setc/credit_mean": 1.0, "setc/credit_std": 0.06918111927807331, "setc/logratio_margin_delta": 28.018105697631835, "setc/logratio_margin_vanilla": 380.1, "setc/logratio_margin_weighted": 408.11810569763185, "step": 1280 }, { "entropy": 0.4404296875, "epoch": 0.6952303961196443, "grad_norm": 52.7696647644043, "learning_rate": 2.5846403510850225e-07, "logits/chosen": -1.5529430524670726, "logits/rejected": -1.5987173557025316, "logps/chosen": -522.1, "logps/rejected": -968.8, "loss": 0.2509, "mean_token_accuracy": 0.7253134727478028, "num_tokens": 45901757.0, "rewards/accuracies": 0.896875, "rewards/chosen": -4.538894402980804, "rewards/margins": 4.75179933309555, "rewards/rejected": -9.290693855285644, "setc/cal_net_lr": 0.00021830084862520705, "setc/credit_mean": 1.0, "setc/credit_std": 0.0703603059053421, "setc/logratio_margin_delta": 34.279936981201175, "setc/logratio_margin_vanilla": 440.9, "setc/logratio_margin_weighted": 475.1799369812012, "step": 1290 }, { "entropy": 0.44638671875, "epoch": 0.7006197790353005, "grad_norm": 31.051944732666016, "learning_rate": 2.5027157598435777e-07, "logits/chosen": -1.612937918631245, "logits/rejected": -1.7009304808047285, "logps/chosen": -665.4, "logps/rejected": -1133.0, "loss": 0.358, "mean_token_accuracy": 0.7091849476099015, "num_tokens": 46262002.0, "rewards/accuracies": 0.8625, "rewards/chosen": -5.984306502342224, "rewards/margins": 5.062739598751068, "rewards/rejected": -11.047045946121216, "setc/cal_net_lr": 0.00023417199516547726, "setc/credit_mean": 1.0, "setc/credit_std": 0.07216440998017788, "setc/logratio_margin_delta": 34.07397308349609, "setc/logratio_margin_vanilla": 472.2, "setc/logratio_margin_weighted": 506.2739730834961, "step": 1300 }, { "epoch": 0.7006197790353005, "eval_entropy": 0.45013427734375, "eval_logits/chosen": -1.5569858902716183, "eval_logits/rejected": -1.6061105394898563, "eval_logps/chosen": -589.3125, "eval_logps/rejected": -1073.625, "eval_loss": 0.2594917416572571, "eval_mean_token_accuracy": 0.721536174416542, "eval_num_tokens": 46262002.0, "eval_rewards/accuracies": 0.90234375, "eval_rewards/chosen": -5.197996735572815, "eval_rewards/margins": 5.20976897329092, "eval_rewards/rejected": -10.407765805721283, "eval_runtime": 14.2113, "eval_samples_per_second": 35.183, "eval_setc/cal_net_lr": 0.00024264359402975666, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.0713688328396529, "eval_setc/logratio_margin_delta": 34.81822967529297, "eval_setc/logratio_margin_vanilla": 487.0, "eval_setc/logratio_margin_weighted": 521.818229675293, "eval_steps_per_second": 2.252, "step": 1300 }, { "entropy": 0.41630859375, "epoch": 0.7060091619509566, "grad_norm": 77.33255004882812, "learning_rate": 2.4216749034320574e-07, "logits/chosen": -1.5246676903481826, "logits/rejected": -1.6122385093574638, "logps/chosen": -567.6, "logps/rejected": -984.0, "loss": 0.3325, "mean_token_accuracy": 0.728985533118248, "num_tokens": 46607558.0, "rewards/accuracies": 0.878125, "rewards/chosen": -5.136111176013946, "rewards/margins": 4.455406987667084, "rewards/rejected": -9.591518354415893, "setc/cal_net_lr": 0.0002504257793572883, "setc/credit_mean": 1.0, "setc/credit_std": 0.07067357562482357, "setc/logratio_margin_delta": 31.340717315673828, "setc/logratio_margin_vanilla": 414.2, "setc/logratio_margin_weighted": 445.5407173156738, "step": 1310 }, { "entropy": 0.440625, "epoch": 0.7113985448666128, "grad_norm": 23.72609519958496, "learning_rate": 2.341546460455126e-07, "logits/chosen": -1.540273190267993, "logits/rejected": -1.6056650116281355, "logps/chosen": -530.0, "logps/rejected": -924.0, "loss": 0.2819, "mean_token_accuracy": 0.7307059586048126, "num_tokens": 46949311.0, "rewards/accuracies": 0.9, "rewards/chosen": -4.646138024330139, "rewards/margins": 4.262345945835113, "rewards/rejected": -8.908483982086182, "setc/cal_net_lr": 0.0002670388052088759, "setc/credit_mean": 1.0, "setc/credit_std": 0.06920704822987318, "setc/logratio_margin_delta": 31.384617614746094, "setc/logratio_margin_vanilla": 394.85, "setc/logratio_margin_weighted": 426.23461761474607, "step": 1320 }, { "entropy": 0.47109375, "epoch": 0.716787927782269, "grad_norm": 20.47443389892578, "learning_rate": 2.2623587866340888e-07, "logits/chosen": -1.5787250099693941, "logits/rejected": -1.6378218184439, "logps/chosen": -555.9, "logps/rejected": -959.8, "loss": 0.3328, "mean_token_accuracy": 0.7292061507701874, "num_tokens": 47301716.0, "rewards/accuracies": 0.90625, "rewards/chosen": -4.8078773021698, "rewards/margins": 4.328281128406525, "rewards/rejected": -9.13615837097168, "setc/cal_net_lr": 0.00028398715962951504, "setc/credit_mean": 1.0, "setc/credit_std": 0.06858081389218569, "setc/logratio_margin_delta": 31.928114318847655, "setc/logratio_margin_vanilla": 400.9, "setc/logratio_margin_weighted": 432.82811431884767, "step": 1330 }, { "entropy": 0.44931640625, "epoch": 0.7221773106979251, "grad_norm": 30.538745880126953, "learning_rate": 2.1841399047724047e-07, "logits/chosen": -1.52773450533801, "logits/rejected": -1.58617710115334, "logps/chosen": -509.1, "logps/rejected": -886.6, "loss": 0.2378, "mean_token_accuracy": 0.7392058879137039, "num_tokens": 47673875.0, "rewards/accuracies": 0.9125, "rewards/chosen": -4.407375812530518, "rewards/margins": 4.048559367656708, "rewards/rejected": -8.455935263633728, "setc/cal_net_lr": 0.0003012464468504569, "setc/credit_mean": 1.0, "setc/credit_std": 0.06842790953814984, "setc/logratio_margin_delta": 29.705943298339843, "setc/logratio_margin_vanilla": 375.15, "setc/logratio_margin_weighted": 404.85594329833987, "step": 1340 }, { "entropy": 0.442578125, "epoch": 0.7275666936135813, "grad_norm": 72.51182556152344, "learning_rate": 2.1069174948390435e-07, "logits/chosen": -1.584456418416361, "logits/rejected": -1.6523068810983297, "logps/chosen": -554.8, "logps/rejected": -955.6, "loss": 0.3051, "mean_token_accuracy": 0.7066467195749283, "num_tokens": 47987918.0, "rewards/accuracies": 0.90625, "rewards/chosen": -4.997482490539551, "rewards/margins": 4.298661887645721, "rewards/rejected": -9.296144366264343, "setc/cal_net_lr": 0.00031879182354063983, "setc/credit_mean": 1.0, "setc/credit_std": 0.07268548682332039, "setc/logratio_margin_delta": 33.21618576049805, "setc/logratio_margin_vanilla": 396.65, "setc/logratio_margin_weighted": 429.86618576049807, "step": 1350 }, { "entropy": 0.46162109375, "epoch": 0.7329560765292374, "grad_norm": 53.63888931274414, "learning_rate": 2.0307188841731193e-07, "logits/chosen": -1.6495496355737422, "logits/rejected": -1.6986198259558798, "logps/chosen": -670.8, "logps/rejected": -1126.8, "loss": 0.3292, "mean_token_accuracy": 0.7082957863807678, "num_tokens": 48363901.0, "rewards/accuracies": 0.884375, "rewards/chosen": -6.070537447929382, "rewards/margins": 4.876463973522187, "rewards/rejected": -10.947001552581787, "setc/cal_net_lr": 0.0003365980345666318, "setc/credit_mean": 1.0, "setc/credit_std": 0.0717622920870781, "setc/logratio_margin_delta": 33.046409606933594, "setc/logratio_margin_vanilla": 454.6, "setc/logratio_margin_weighted": 487.6464096069336, "step": 1360 }, { "entropy": 0.45615234375, "epoch": 0.7383454594448936, "grad_norm": 68.25013732910156, "learning_rate": 1.9555710378133584e-07, "logits/chosen": -1.5475741416931696, "logits/rejected": -1.5981262042934623, "logps/chosen": -553.0, "logps/rejected": -915.8, "loss": 0.3524, "mean_token_accuracy": 0.719152620434761, "num_tokens": 48721520.0, "rewards/accuracies": 0.89375, "rewards/chosen": -4.9144437432289125, "rewards/margins": 3.8710513949394225, "rewards/rejected": -8.785495042800903, "setc/cal_net_lr": 0.00035463944934532626, "setc/credit_mean": 1.0, "setc/credit_std": 0.07162475697696209, "setc/logratio_margin_delta": 26.60514907836914, "setc/logratio_margin_vanilla": 360.5, "setc/logratio_margin_weighted": 387.10514907836915, "step": 1370 }, { "entropy": 0.47626953125, "epoch": 0.7437348423605498, "grad_norm": 27.951370239257812, "learning_rate": 1.88150054895574e-07, "logits/chosen": -1.5752492477710134, "logits/rejected": -1.6267678817590063, "logps/chosen": -530.9, "logps/rejected": -860.2, "loss": 0.331, "mean_token_accuracy": 0.7230644017457962, "num_tokens": 49073907.0, "rewards/accuracies": 0.878125, "rewards/chosen": -4.60977201461792, "rewards/margins": 3.515028989315033, "rewards/rejected": -8.12480103969574, "setc/cal_net_lr": 0.00037289009873706997, "setc/credit_mean": 1.0, "setc/credit_std": 0.0722591146826744, "setc/logratio_margin_delta": 24.0529052734375, "setc/logratio_margin_vanilla": 327.45, "setc/logratio_margin_weighted": 351.5029052734375, "step": 1380 }, { "entropy": 0.4615234375, "epoch": 0.7491242252762059, "grad_norm": 55.398651123046875, "learning_rate": 1.808533629542751e-07, "logits/chosen": -1.519858605305458, "logits/rejected": -1.6093533244431188, "logps/chosen": -509.7, "logps/rejected": -855.6, "loss": 0.3028, "mean_token_accuracy": 0.7340264290571212, "num_tokens": 49418680.0, "rewards/accuracies": 0.890625, "rewards/chosen": -4.401390933990479, "rewards/margins": 3.69234881401062, "rewards/rejected": -8.093739652633667, "setc/cal_net_lr": 0.0003913237124261124, "setc/credit_mean": 1.0, "setc/credit_std": 0.071388328820467, "setc/logratio_margin_delta": 25.53488998413086, "setc/logratio_margin_vanilla": 343.7, "setc/logratio_margin_weighted": 369.23488998413086, "step": 1390 }, { "entropy": 0.47451171875, "epoch": 0.7545136081918621, "grad_norm": 20.171005249023438, "learning_rate": 1.736696100987543e-07, "logits/chosen": -1.6546586863128225, "logits/rejected": -1.7063404244684879, "logps/chosen": -534.9, "logps/rejected": -856.2, "loss": 0.3403, "mean_token_accuracy": 0.7271544426679611, "num_tokens": 49772876.0, "rewards/accuracies": 0.859375, "rewards/chosen": -4.6236349701881405, "rewards/margins": 3.4730340123176573, "rewards/rejected": -8.096669101715088, "setc/cal_net_lr": 0.0004099137567345749, "setc/credit_mean": 1.0, "setc/credit_std": 0.07029190417379141, "setc/logratio_margin_delta": 24.003412628173827, "setc/logratio_margin_vanilla": 323.3, "setc/logratio_margin_weighted": 347.3034126281738, "step": 1400 }, { "epoch": 0.7545136081918621, "eval_entropy": 0.4881591796875, "eval_logits/chosen": -1.5865094371675443, "eval_logits/rejected": -1.6286456849256612, "eval_logps/chosen": -503.875, "eval_logps/rejected": -873.75, "eval_loss": 0.25063228607177734, "eval_mean_token_accuracy": 0.7336005866527557, "eval_num_tokens": 49772876.0, "eval_rewards/accuracies": 0.9140625, "eval_rewards/chosen": -4.2661414965987206, "eval_rewards/margins": 3.9734192714095116, "eval_rewards/rejected": -8.239560678601265, "eval_runtime": 14.2136, "eval_samples_per_second": 35.178, "eval_setc/cal_net_lr": 0.00041972234003674093, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.06988485460169613, "eval_setc/logratio_margin_delta": 25.711304664611816, "eval_setc/logratio_margin_vanilla": 373.5625, "eval_setc/logratio_margin_weighted": 399.2738046646118, "eval_steps_per_second": 2.251, "step": 1400 }, { "entropy": 0.46767578125, "epoch": 0.7599029911075182, "grad_norm": 59.77590560913086, "learning_rate": 1.6660133850362878e-07, "logits/chosen": -1.5703001768437603, "logits/rejected": -1.6146571741836044, "logps/chosen": -567.2, "logps/rejected": -932.8, "loss": 0.3168, "mean_token_accuracy": 0.7329632073640824, "num_tokens": 50163681.0, "rewards/accuracies": 0.884375, "rewards/chosen": -4.9076042652130125, "rewards/margins": 3.9311932802200316, "rewards/rejected": -8.838797640800475, "setc/cal_net_lr": 0.00042863347281550787, "setc/credit_mean": 1.0, "setc/credit_std": 0.07002903185784817, "setc/logratio_margin_delta": 26.31934051513672, "setc/logratio_margin_vanilla": 366.8, "setc/logratio_margin_weighted": 393.11934051513674, "step": 1410 }, { "entropy": 0.46171875, "epoch": 0.7652923740231743, "grad_norm": 56.6906852722168, "learning_rate": 1.5965104947719816e-07, "logits/chosen": -1.5351415663599084, "logits/rejected": -1.622101327043853, "logps/chosen": -546.1, "logps/rejected": -945.4, "loss": 0.2704, "mean_token_accuracy": 0.7306698024272918, "num_tokens": 50536109.0, "rewards/accuracies": 0.896875, "rewards/chosen": -4.773424637317658, "rewards/margins": 4.259200811386108, "rewards/rejected": -9.032625222206116, "setc/cal_net_lr": 0.0004474559151700609, "setc/credit_mean": 1.0, "setc/credit_std": 0.07033420763909817, "setc/logratio_margin_delta": 29.820078659057618, "setc/logratio_margin_vanilla": 396.1, "setc/logratio_margin_weighted": 425.9200786590576, "step": 1420 }, { "entropy": 0.4712890625, "epoch": 0.7706817569388305, "grad_norm": 27.157922744750977, "learning_rate": 1.5282120257628493e-07, "logits/chosen": -1.5769850546803867, "logits/rejected": -1.675950265945526, "logps/chosen": -572.0, "logps/rejected": -961.6, "loss": 0.3383, "mean_token_accuracy": 0.7177787780761719, "num_tokens": 50898774.0, "rewards/accuracies": 0.85, "rewards/chosen": -5.021525096893311, "rewards/margins": 4.185178649425507, "rewards/rejected": -9.206703734397887, "setc/cal_net_lr": 0.00046635399043332197, "setc/credit_mean": 1.0, "setc/credit_std": 0.07177232243120671, "setc/logratio_margin_delta": 28.917872619628906, "setc/logratio_margin_vanilla": 389.6, "setc/logratio_margin_weighted": 418.5178726196289, "step": 1430 }, { "entropy": 0.46376953125, "epoch": 0.7760711398544866, "grad_norm": 40.695457458496094, "learning_rate": 1.46114214735851e-07, "logits/chosen": -1.6385950412691188, "logits/rejected": -1.686338215651635, "logps/chosen": -551.6, "logps/rejected": -926.2, "loss": 0.3597, "mean_token_accuracy": 0.7252315312623978, "num_tokens": 51264449.0, "rewards/accuracies": 0.859375, "rewards/chosen": -4.829545390605927, "rewards/margins": 3.9724243760108946, "rewards/rejected": -8.801969718933105, "setc/cal_net_lr": 0.00048530049637299856, "setc/credit_mean": 1.0, "setc/credit_std": 0.06923934705555439, "setc/logratio_margin_delta": 24.39244613647461, "setc/logratio_margin_vanilla": 372.85, "setc/logratio_margin_weighted": 397.2424461364746, "step": 1440 }, { "entropy": 0.4931640625, "epoch": 0.7814605227701428, "grad_norm": 35.001766204833984, "learning_rate": 1.3953245941369606e-07, "logits/chosen": -1.5720843194136846, "logits/rejected": -1.6047685664358426, "logps/chosen": -596.7, "logps/rejected": -962.6, "loss": 0.4049, "mean_token_accuracy": 0.7161958605051041, "num_tokens": 51627142.0, "rewards/accuracies": 0.86875, "rewards/chosen": -5.245314371585846, "rewards/margins": 3.897673261165619, "rewards/rejected": -9.142987942695617, "setc/cal_net_lr": 0.0005042681610448041, "setc/credit_mean": 1.0, "setc/credit_std": 0.06911584474146366, "setc/logratio_margin_delta": 24.167342376708984, "setc/logratio_margin_vanilla": 365.6, "setc/logratio_margin_weighted": 389.767342376709, "step": 1450 }, { "entropy": 0.46611328125, "epoch": 0.7868499056857989, "grad_norm": 25.934104919433594, "learning_rate": 1.3307826575054303e-07, "logits/chosen": -1.5709140280737057, "logits/rejected": -1.6279985828979888, "logps/chosen": -520.7, "logps/rejected": -859.0, "loss": 0.2908, "mean_token_accuracy": 0.7397780060768128, "num_tokens": 51996636.0, "rewards/accuracies": 0.8875, "rewards/chosen": -4.433149778842926, "rewards/margins": 3.6351953089237212, "rewards/rejected": -8.068345022201537, "setc/cal_net_lr": 0.0005232296820481889, "setc/credit_mean": 1.0, "setc/credit_std": 0.06630576048046351, "setc/logratio_margin_delta": 23.869540405273437, "setc/logratio_margin_vanilla": 339.65, "setc/logratio_margin_weighted": 363.51954040527346, "step": 1460 }, { "entropy": 0.4646484375, "epoch": 0.7922392886014551, "grad_norm": 109.72233581542969, "learning_rate": 1.267539177458053e-07, "logits/chosen": -1.5302456922524228, "logits/rejected": -1.5671663859152356, "logps/chosen": -500.8, "logps/rejected": -826.2, "loss": 0.3222, "mean_token_accuracy": 0.7304188579320907, "num_tokens": 52363307.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.318540203571319, "rewards/margins": 3.475985234975815, "rewards/rejected": -7.794525456428528, "setc/cal_net_lr": 0.0005421577658259117, "setc/credit_mean": 1.0, "setc/credit_std": 0.06553097013384104, "setc/logratio_margin_delta": 22.448529052734376, "setc/logratio_margin_vanilla": 325.15, "setc/logratio_margin_weighted": 347.59852905273436, "step": 1470 }, { "entropy": 0.46298828125, "epoch": 0.7976286715171113, "grad_norm": 28.780488967895508, "learning_rate": 1.2056165344932827e-07, "logits/chosen": -1.4931537332170401, "logits/rejected": -1.569671151563729, "logps/chosen": -528.1, "logps/rejected": -940.6, "loss": 0.2665, "mean_token_accuracy": 0.7405376106500625, "num_tokens": 52725552.0, "rewards/accuracies": 0.90625, "rewards/chosen": -4.563779902458191, "rewards/margins": 4.378486478328705, "rewards/rejected": -8.942266464233398, "setc/cal_net_lr": 0.0005610251669508799, "setc/credit_mean": 1.0, "setc/credit_std": 0.06495412047952413, "setc/logratio_margin_delta": 26.19866180419922, "setc/logratio_margin_vanilla": 411.65, "setc/logratio_margin_weighted": 437.8486618041992, "step": 1480 }, { "entropy": 0.44052734375, "epoch": 0.8030180544327674, "grad_norm": 62.47037124633789, "learning_rate": 1.1450366416939295e-07, "logits/chosen": -1.611906114241489, "logits/rejected": -1.6829238200447194, "logps/chosen": -531.7, "logps/rejected": -886.6, "loss": 0.286, "mean_token_accuracy": 0.7277172565460205, "num_tokens": 53058665.0, "rewards/accuracies": 0.890625, "rewards/chosen": -4.697575926780701, "rewards/margins": 3.8217047095298766, "rewards/rejected": -8.519280743598937, "setc/cal_net_lr": 0.000579804727343712, "setc/credit_mean": 1.0, "setc/credit_std": 0.06567572541534901, "setc/logratio_margin_delta": 26.470491790771483, "setc/logratio_margin_vanilla": 355.7, "setc/logratio_margin_weighted": 382.1704917907715, "step": 1490 }, { "entropy": 0.46953125, "epoch": 0.8084074373484236, "grad_norm": 42.8690185546875, "learning_rate": 1.0858209369725851e-07, "logits/chosen": -1.5465363961992153, "logits/rejected": -1.6092895249530645, "logps/chosen": -617.6, "logps/rejected": -1037.6, "loss": 0.2758, "mean_token_accuracy": 0.7125173568725586, "num_tokens": 53427519.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.522391545772552, "rewards/margins": 4.461647641658783, "rewards/rejected": -9.984039378166198, "setc/cal_net_lr": 0.0005984694153645696, "setc/credit_mean": 1.0, "setc/credit_std": 0.06612777952104806, "setc/logratio_margin_delta": 29.16478729248047, "setc/logratio_margin_vanilla": 417.0, "setc/logratio_margin_weighted": 446.16478729248047, "step": 1500 }, { "epoch": 0.8084074373484236, "eval_entropy": 0.4835205078125, "eval_logits/chosen": -1.574804083909467, "eval_logits/rejected": -1.6192383259931051, "eval_logps/chosen": -594.5625, "eval_logps/rejected": -1023.125, "eval_loss": 0.24590468406677246, "eval_mean_token_accuracy": 0.7153119649738073, "eval_num_tokens": 53427519.0, "eval_rewards/accuracies": 0.908203125, "eval_rewards/chosen": -5.237039923667908, "eval_rewards/margins": 4.583509214222431, "eval_rewards/rejected": -9.820549130439758, "eval_runtime": 14.2433, "eval_samples_per_second": 35.104, "eval_setc/cal_net_lr": 0.0006082198069690511, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.06576518306974322, "eval_setc/logratio_margin_delta": 28.384427070617676, "eval_setc/logratio_margin_vanilla": 431.625, "eval_setc/logratio_margin_weighted": 460.0094270706177, "eval_steps_per_second": 2.247, "step": 1500 }, { "entropy": 0.4947265625, "epoch": 0.8137968202640797, "grad_norm": 25.9521427154541, "learning_rate": 1.0279903754852164e-07, "logits/chosen": -1.5614659093580356, "logits/rejected": -1.6606548524269127, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.2877, "mean_token_accuracy": 0.7104504853487015, "num_tokens": 53792050.0, "rewards/accuracies": 0.9125, "rewards/chosen": -5.629306566715241, "rewards/margins": 4.753253519535065, "rewards/rejected": -10.382560396194458, "setc/cal_net_lr": 0.0006169923647229892, "setc/credit_mean": 1.0, "setc/credit_std": 0.0657088104635477, "setc/logratio_margin_delta": 28.475372314453125, "setc/logratio_margin_vanilla": 446.85, "setc/logratio_margin_weighted": 475.32537231445315, "step": 1510 }, { "entropy": 0.484375, "epoch": 0.8191862031797359, "grad_norm": 33.998130798339844, "learning_rate": 9.715654222155811e-08, "logits/chosen": -1.5534200931238813, "logits/rejected": -1.5930939373874997, "logps/chosen": -587.5, "logps/rejected": -1001.8, "loss": 0.3289, "mean_token_accuracy": 0.72098990380764, "num_tokens": 54136636.0, "rewards/accuracies": 0.878125, "rewards/chosen": -5.229832494258881, "rewards/margins": 4.401166546344757, "rewards/rejected": -9.63099913597107, "setc/cal_net_lr": 0.0006353469131497097, "setc/credit_mean": 1.0, "setc/credit_std": 0.06840050462633371, "setc/logratio_margin_delta": 29.116664123535156, "setc/logratio_margin_vanilla": 411.0, "setc/logratio_margin_weighted": 440.11666412353514, "step": 1520 }, { "entropy": 0.4716796875, "epoch": 0.824575586095392, "grad_norm": 50.3288688659668, "learning_rate": 9.16566044733122e-08, "logits/chosen": -1.5786561767257699, "logits/rejected": -1.642224351067974, "logps/chosen": -626.7, "logps/rejected": -1025.2, "loss": 0.2522, "mean_token_accuracy": 0.7102588206529618, "num_tokens": 54516451.0, "rewards/accuracies": 0.896875, "rewards/chosen": -5.588171422481537, "rewards/margins": 4.30382000207901, "rewards/rejected": -9.891991662979127, "setc/cal_net_lr": 0.0006535066407748258, "setc/credit_mean": 1.0, "setc/credit_std": 0.0688111089169979, "setc/logratio_margin_delta": 28.632022094726562, "setc/logratio_margin_vanilla": 401.75, "setc/logratio_margin_weighted": 430.38202209472655, "step": 1530 }, { "entropy": 0.4650390625, "epoch": 0.8299649690110482, "grad_norm": 61.28715515136719, "learning_rate": 8.630117061268733e-08, "logits/chosen": -1.6015775772223662, "logits/rejected": -1.6506740047739197, "logps/chosen": -673.1, "logps/rejected": -1067.2, "loss": 0.3355, "mean_token_accuracy": 0.7012999951839447, "num_tokens": 54867398.0, "rewards/accuracies": 0.8625, "rewards/chosen": -6.10805311203003, "rewards/margins": 4.206076884269715, "rewards/rejected": -10.314130067825317, "setc/cal_net_lr": 0.0006714454081570259, "setc/credit_mean": 1.0, "setc/credit_std": 0.06854588873684406, "setc/logratio_margin_delta": 26.507701110839843, "setc/logratio_margin_vanilla": 394.1, "setc/logratio_margin_weighted": 420.60770111083986, "step": 1540 }, { "entropy": 0.46708984375, "epoch": 0.8353543519267044, "grad_norm": 66.6949234008789, "learning_rate": 8.109213581178897e-08, "logits/chosen": -1.519720828447748, "logits/rejected": -1.5808756968990825, "logps/chosen": -617.7, "logps/rejected": -967.2, "loss": 0.3575, "mean_token_accuracy": 0.7194855481386184, "num_tokens": 55239887.0, "rewards/accuracies": 0.9, "rewards/chosen": -5.524090123176575, "rewards/margins": 3.7780436635017396, "rewards/rejected": -9.302133798599243, "setc/cal_net_lr": 0.0006891373939091811, "setc/credit_mean": 1.0, "setc/credit_std": 0.0681806318461895, "setc/logratio_margin_delta": 26.4543701171875, "setc/logratio_margin_vanilla": 351.35, "setc/logratio_margin_weighted": 377.8043701171875, "step": 1550 }, { "entropy": 0.4662109375, "epoch": 0.8407437348423605, "grad_norm": 27.215456008911133, "learning_rate": 7.603134343526502e-08, "logits/chosen": -1.5236826772256906, "logits/rejected": -1.570188853218213, "logps/chosen": -587.4, "logps/rejected": -975.4, "loss": 0.233, "mean_token_accuracy": 0.7267722725868225, "num_tokens": 55609384.0, "rewards/accuracies": 0.90625, "rewards/chosen": -5.191406297683716, "rewards/margins": 4.120009970664978, "rewards/rejected": -9.311416292190552, "setc/cal_net_lr": 0.0007065571318661164, "setc/credit_mean": 1.0, "setc/credit_std": 0.06620657239109277, "setc/logratio_margin_delta": 27.050999450683594, "setc/logratio_margin_vanilla": 384.95, "setc/logratio_margin_weighted": 412.0009994506836, "step": 1560 }, { "entropy": 0.46328125, "epoch": 0.8461331177580167, "grad_norm": 47.70095443725586, "learning_rate": 7.112058438797858e-08, "logits/chosen": -1.5296358009335131, "logits/rejected": -1.5978488752687707, "logps/chosen": -614.6, "logps/rejected": -1019.2, "loss": 0.2631, "mean_token_accuracy": 0.7203928977251053, "num_tokens": 55970954.0, "rewards/accuracies": 0.903125, "rewards/chosen": -5.481227231025696, "rewards/margins": 4.321574485301971, "rewards/rejected": -9.80280179977417, "setc/cal_net_lr": 0.0007236795477410702, "setc/credit_mean": 1.0, "setc/credit_std": 0.06795285735279322, "setc/logratio_margin_delta": 27.957456970214842, "setc/logratio_margin_vanilla": 404.2, "setc/logratio_margin_weighted": 432.15745697021487, "step": 1570 }, { "entropy": 0.46845703125, "epoch": 0.8515225006736729, "grad_norm": 36.03289794921875, "learning_rate": 6.636159648124557e-08, "logits/chosen": -1.5813003009716322, "logits/rejected": -1.6186931994037377, "logps/chosen": -607.3, "logps/rejected": -967.4, "loss": 0.3036, "mean_token_accuracy": 0.7082692325115204, "num_tokens": 56337079.0, "rewards/accuracies": 0.85, "rewards/chosen": -5.434537315368653, "rewards/margins": 3.795243227481842, "rewards/rejected": -9.229780340194703, "setc/cal_net_lr": 0.0007404799952180797, "setc/credit_mean": 1.0, "setc/credit_std": 0.06792235635221004, "setc/logratio_margin_delta": 25.124325561523438, "setc/logratio_margin_vanilla": 354.4, "setc/logratio_margin_weighted": 379.5243255615234, "step": 1580 }, { "entropy": 0.4748046875, "epoch": 0.856911883589329, "grad_norm": 24.535364151000977, "learning_rate": 6.175606381786069e-08, "logits/chosen": -1.5430637306828987, "logits/rejected": -1.6079818626695932, "logps/chosen": -650.9, "logps/rejected": -1084.8, "loss": 0.2913, "mean_token_accuracy": 0.7224538773298264, "num_tokens": 56731554.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.72764368057251, "rewards/margins": 4.639845299720764, "rewards/rejected": -10.367488980293274, "setc/cal_net_lr": 0.0007569342914283324, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.06582386922091246, "setc/logratio_margin_delta": 29.334539794921874, "setc/logratio_margin_vanilla": 434.65, "setc/logratio_margin_weighted": 463.9845397949219, "step": 1590 }, { "entropy": 0.48642578125, "epoch": 0.8623012665049852, "grad_norm": 79.2051773071289, "learning_rate": 5.7305616196130334e-08, "logits/chosen": -1.6237664047229114, "logits/rejected": -1.6430690136847275, "logps/chosen": -653.3, "logps/rejected": -1039.8, "loss": 0.2893, "mean_token_accuracy": 0.7086077839136123, "num_tokens": 57110285.0, "rewards/accuracies": 0.85625, "rewards/chosen": -5.711839783191681, "rewards/margins": 4.166626226902008, "rewards/rejected": -9.87846598625183, "setc/cal_net_lr": 0.0007730187517594288, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.06541631631553173, "setc/logratio_margin_delta": 25.612635803222656, "setc/logratio_margin_vanilla": 391.05, "setc/logratio_margin_weighted": 416.66263580322266, "step": 1600 }, { "epoch": 0.8623012665049852, "eval_entropy": 0.4923095703125, "eval_logits/chosen": -1.603833879515406, "eval_logits/rejected": -1.6448864097538396, "eval_logps/chosen": -600.5, "eval_logps/rejected": -1020.375, "eval_loss": 0.24327774345874786, "eval_mean_token_accuracy": 0.7129223179072142, "eval_num_tokens": 57110285.0, "eval_rewards/accuracies": 0.916015625, "eval_rewards/chosen": -5.2806950733065605, "eval_rewards/margins": 4.483514308929443, "eval_rewards/rejected": -9.764209300279617, "eval_runtime": 14.2982, "eval_samples_per_second": 34.969, "eval_setc/cal_net_lr": 0.0007813240527369953, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.06434770347550511, "eval_setc/logratio_margin_delta": 26.474138259887695, "eval_setc/logratio_margin_vanilla": 423.375, "eval_setc/logratio_margin_weighted": 449.8491382598877, "eval_steps_per_second": 2.238, "step": 1600 }, { "entropy": 0.48271484375, "epoch": 0.8676906494206413, "grad_norm": 70.79142761230469, "learning_rate": 5.3011828533122024e-08, "logits/chosen": -1.6149875326285763, "logits/rejected": -1.6959871557083672, "logps/chosen": -639.3, "logps/rejected": -1038.2, "loss": 0.2599, "mean_token_accuracy": 0.713002547621727, "num_tokens": 57472462.0, "rewards/accuracies": 0.878125, "rewards/chosen": -5.632159161567688, "rewards/margins": 4.25579309463501, "rewards/rejected": -9.887952256202698, "setc/cal_net_lr": 0.0007887102239474405, "setc/credit_mean": 1.0, "setc/credit_std": 0.06394433472305536, "setc/logratio_margin_delta": 26.4293212890625, "setc/logratio_margin_vanilla": 399.15, "setc/logratio_margin_weighted": 425.5793212890625, "step": 1610 }, { "entropy": 0.475, "epoch": 0.8730800323362975, "grad_norm": 39.525123596191406, "learning_rate": 4.887622030733507e-08, "logits/chosen": -1.5679981820658682, "logits/rejected": -1.6499297438183103, "logps/chosen": -620.7, "logps/rejected": -1020.0, "loss": 0.2992, "mean_token_accuracy": 0.7138593286275864, "num_tokens": 57826454.0, "rewards/accuracies": 0.90625, "rewards/chosen": -5.5272397756576535, "rewards/margins": 4.316784369945526, "rewards/rejected": -9.84402413368225, "setc/cal_net_lr": 0.0008039861214027002, "setc/credit_mean": 1.0, "setc/credit_std": 0.060222255811095235, "setc/logratio_margin_delta": 27.97843780517578, "setc/logratio_margin_vanilla": 403.7, "setc/logratio_margin_weighted": 431.6784378051758, "step": 1620 }, { "entropy": 0.4849609375, "epoch": 0.8784694152519537, "grad_norm": 50.614253997802734, "learning_rate": 4.4900255020990764e-08, "logits/chosen": -1.554491747697552, "logits/rejected": -1.6372942528114076, "logps/chosen": -619.1, "logps/rejected": -1022.0, "loss": 0.3179, "mean_token_accuracy": 0.7143799781799316, "num_tokens": 58225092.0, "rewards/accuracies": 0.871875, "rewards/chosen": -5.4253825664520265, "rewards/margins": 4.251791071891785, "rewards/rejected": -9.677173686027526, "setc/cal_net_lr": 0.0008188244557213483, "setc/credit_mean": 1.0, "setc/credit_std": 0.05570842530578375, "setc/logratio_margin_delta": 23.57911911010742, "setc/logratio_margin_vanilla": 401.6, "setc/logratio_margin_weighted": 425.1791191101074, "step": 1630 }, { "entropy": 0.47099609375, "epoch": 0.8838587981676098, "grad_norm": 33.20830535888672, "learning_rate": 4.108533968212963e-08, "logits/chosen": -1.5388461767468533, "logits/rejected": -1.583654628206283, "logps/chosen": -595.7, "logps/rejected": -1035.6, "loss": 0.2626, "mean_token_accuracy": 0.713904058933258, "num_tokens": 58584790.0, "rewards/accuracies": 0.896875, "rewards/chosen": -5.293009197711944, "rewards/margins": 4.6426942348480225, "rewards/rejected": -9.93570351600647, "setc/cal_net_lr": 0.000833203868335841, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.05715073775500059, "setc/logratio_margin_delta": 29.46942901611328, "setc/logratio_margin_vanilla": 434.8, "setc/logratio_margin_weighted": 464.2694290161133, "step": 1640 }, { "entropy": 0.4619140625, "epoch": 0.889248181083266, "grad_norm": 49.0516242980957, "learning_rate": 3.743282430670286e-08, "logits/chosen": -1.5955134523755936, "logits/rejected": -1.631971502049342, "logps/chosen": -559.5, "logps/rejected": -978.0, "loss": 0.278, "mean_token_accuracy": 0.7260301500558853, "num_tokens": 58947907.0, "rewards/accuracies": 0.88125, "rewards/chosen": -4.951177608966828, "rewards/margins": 4.426893401145935, "rewards/rejected": -9.378070974349976, "setc/cal_net_lr": 0.0008471036612588582, "setc/credit_mean": 1.0, "setc/credit_std": 0.057837378792464735, "setc/logratio_margin_delta": 26.0393424987793, "setc/logratio_margin_vanilla": 416.65, "setc/logratio_margin_weighted": 442.6893424987793, "step": 1650 }, { "entropy": 0.476953125, "epoch": 0.8946375639989221, "grad_norm": 61.91923141479492, "learning_rate": 3.3944001440829986e-08, "logits/chosen": -1.5750558117502387, "logits/rejected": -1.657317549975772, "logps/chosen": -623.0, "logps/rejected": -1002.8, "loss": 0.3331, "mean_token_accuracy": 0.7019858151674271, "num_tokens": 59303281.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.602843141555786, "rewards/margins": 4.078434026241302, "rewards/rejected": -9.681277179718018, "setc/cal_net_lr": 0.0008605038268763623, "setc/credit_mean": 1.0, "setc/credit_std": 0.059012738056480885, "setc/logratio_margin_delta": 26.393409729003906, "setc/logratio_margin_vanilla": 381.45, "setc/logratio_margin_weighted": 407.8434097290039, "step": 1660 }, { "entropy": 0.4888671875, "epoch": 0.9000269469145783, "grad_norm": 40.752716064453125, "learning_rate": 3.0620105703395075e-08, "logits/chosen": -1.6270705285389657, "logits/rejected": -1.6553936166815, "logps/chosen": -588.7, "logps/rejected": -1051.8, "loss": 0.2932, "mean_token_accuracy": 0.7158574312925339, "num_tokens": 59687228.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.1385871052742, "rewards/margins": 4.8742870688438416, "rewards/rejected": -10.012874150276184, "setc/cal_net_lr": 0.0008733850767469216, "setc/credit_mean": 1.0, "setc/credit_std": 0.05472625363618135, "setc/logratio_margin_delta": 24.0787109375, "setc/logratio_margin_vanilla": 463.35, "setc/logratio_margin_weighted": 487.4287109375, "step": 1670 }, { "entropy": 0.47392578125, "epoch": 0.9054163298302345, "grad_norm": 57.972511291503906, "learning_rate": 2.7462313349142063e-08, "logits/chosen": -1.5235790048606868, "logits/rejected": -1.5691862969507655, "logps/chosen": -614.3, "logps/rejected": -1054.2, "loss": 0.2667, "mean_token_accuracy": 0.7157090991735459, "num_tokens": 60046942.0, "rewards/accuracies": 0.878125, "rewards/chosen": -5.51959844827652, "rewards/margins": 4.6187913656234745, "rewards/rejected": -10.138389611244202, "setc/cal_net_lr": 0.0008857288693658431, "setc/credit_mean": 1.0, "setc/credit_std": 0.05935143679380417, "setc/logratio_margin_delta": 24.97913818359375, "setc/logratio_margin_vanilla": 436.9, "setc/logratio_margin_weighted": 461.87913818359374, "step": 1680 }, { "entropy": 0.478125, "epoch": 0.9108057127458906, "grad_norm": 56.78584289550781, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.6328370435487536, "logits/rejected": -1.6685099427783272, "logps/chosen": -598.0, "logps/rejected": -951.4, "loss": 0.3008, "mean_token_accuracy": 0.7114303112030029, "num_tokens": 60420768.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.270635271072388, "rewards/margins": 3.7526545763015746, "rewards/rejected": -9.023289966583253, "setc/cal_net_lr": 0.0008975174368541483, "setc/credit_mean": 1.0, "setc/credit_std": 0.06038776412606239, "setc/logratio_margin_delta": 21.315467834472656, "setc/logratio_margin_vanilla": 353.95, "setc/logratio_margin_weighted": 375.26546783447264, "step": 1690 }, { "entropy": 0.4927734375, "epoch": 0.9161950956615468, "grad_norm": 57.52949142456055, "learning_rate": 2.1649449511749796e-08, "logits/chosen": -1.5933153416721983, "logits/rejected": -1.6209732453760946, "logps/chosen": -600.1, "logps/rejected": -1020.8, "loss": 0.2691, "mean_token_accuracy": 0.7082908779382706, "num_tokens": 60787745.0, "rewards/accuracies": 0.890625, "rewards/chosen": -5.318318796157837, "rewards/margins": 4.3899754106998445, "rewards/rejected": -9.708294129371643, "setc/cal_net_lr": 0.0009087338105339832, "setc/credit_mean": 1.0, "setc/credit_std": 0.06079798210412264, "setc/logratio_margin_delta": 25.297548675537108, "setc/logratio_margin_vanilla": 413.7, "setc/logratio_margin_weighted": 438.99754867553713, "step": 1700 }, { "epoch": 0.9161950956615468, "eval_entropy": 0.49981689453125, "eval_logits/chosen": -1.64129217363471, "eval_logits/rejected": -1.6824805209804925, "eval_logps/chosen": -589.3125, "eval_logps/rejected": -1005.5, "eval_loss": 0.24332645535469055, "eval_mean_token_accuracy": 0.7128801997750998, "eval_num_tokens": 60787745.0, "eval_rewards/accuracies": 0.916015625, "eval_rewards/chosen": -5.122073702514172, "eval_rewards/margins": 4.437938891351223, "eval_rewards/rejected": -9.560012698173523, "eval_runtime": 14.1962, "eval_samples_per_second": 35.221, "eval_setc/cal_net_lr": 0.0009144126721245054, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.05909399001393467, "eval_setc/logratio_margin_delta": 25.14619731903076, "eval_setc/logratio_margin_vanilla": 420.25, "eval_setc/logratio_margin_weighted": 445.39619731903076, "eval_steps_per_second": 2.254, "step": 1700 }, { "entropy": 0.47744140625, "epoch": 0.9215844785772029, "grad_norm": 32.23965072631836, "learning_rate": 1.8996435075282425e-08, "logits/chosen": -1.589121973571888, "logits/rejected": -1.6433100488728811, "logps/chosen": -585.4, "logps/rejected": -1005.6, "loss": 0.2847, "mean_token_accuracy": 0.7224965184926987, "num_tokens": 61140539.0, "rewards/accuracies": 0.896875, "rewards/chosen": -5.167207801342011, "rewards/margins": 4.406006598472596, "rewards/rejected": -9.573214316368103, "setc/cal_net_lr": 0.0009193618453536389, "setc/credit_mean": 1.0, "setc/credit_std": 0.05848235320299864, "setc/logratio_margin_delta": 24.300669860839843, "setc/logratio_margin_vanilla": 416.3, "setc/logratio_margin_weighted": 440.60066986083984, "step": 1710 }, { "entropy": 0.49404296875, "epoch": 0.9269738614928591, "grad_norm": 104.77435302734375, "learning_rate": 1.6513637387396138e-08, "logits/chosen": -1.5879688472104307, "logits/rejected": -1.6660038253212073, "logps/chosen": -603.2, "logps/rejected": -1010.6, "loss": 0.3055, "mean_token_accuracy": 0.7246152043342591, "num_tokens": 61515724.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.219302845001221, "rewards/margins": 4.304101645946503, "rewards/rejected": -9.5234046459198, "setc/cal_net_lr": 0.0009293862431270326, "setc/credit_mean": 1.0, "setc/credit_std": 0.060384664312005044, "setc/logratio_margin_delta": 24.260169982910156, "setc/logratio_margin_vanilla": 406.15, "setc/logratio_margin_weighted": 430.41016998291013, "step": 1720 }, { "entropy": 0.47578125, "epoch": 0.9323632444085153, "grad_norm": 62.20964813232422, "learning_rate": 1.4201935056443681e-08, "logits/chosen": -1.6094559658891932, "logits/rejected": -1.6524878440333168, "logps/chosen": -615.5, "logps/rejected": -1023.2, "loss": 0.2376, "mean_token_accuracy": 0.714499345421791, "num_tokens": 61880478.0, "rewards/accuracies": 0.9125, "rewards/chosen": -5.445954358577728, "rewards/margins": 4.328825449943542, "rewards/rejected": -9.774779748916625, "setc/cal_net_lr": 0.000938792574554196, "setc/credit_mean": 1.0, "setc/credit_std": 0.05732750911265612, "setc/logratio_margin_delta": 26.48255157470703, "setc/logratio_margin_vanilla": 406.4, "setc/logratio_margin_weighted": 432.882551574707, "step": 1730 }, { "entropy": 0.47080078125, "epoch": 0.9377526273241714, "grad_norm": 26.240835189819336, "learning_rate": 1.2062146143834939e-08, "logits/chosen": -1.6424857519172662, "logits/rejected": -1.6840320911940327, "logps/chosen": -589.7, "logps/rejected": -973.6, "loss": 0.3045, "mean_token_accuracy": 0.710444837808609, "num_tokens": 62237198.0, "rewards/accuracies": 0.865625, "rewards/chosen": -5.231074094772339, "rewards/margins": 4.04384742975235, "rewards/rejected": -9.274921464920045, "setc/cal_net_lr": 0.0009475672999910659, "setc/credit_mean": 1.0, "setc/credit_std": 0.05614407397806644, "setc/logratio_margin_delta": 23.08474655151367, "setc/logratio_margin_vanilla": 381.3, "setc/logratio_margin_weighted": 404.3847465515137, "step": 1740 }, { "entropy": 0.47841796875, "epoch": 0.9431420102398276, "grad_norm": 25.031240463256836, "learning_rate": 1.009502787454264e-08, "logits/chosen": -1.628094637586519, "logits/rejected": -1.6946888854450202, "logps/chosen": -561.6, "logps/rejected": -962.4, "loss": 0.2709, "mean_token_accuracy": 0.7199163377285004, "num_tokens": 62613336.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.926135575771331, "rewards/margins": 4.213748967647552, "rewards/rejected": -9.13988437652588, "setc/cal_net_lr": 0.0009556977889387003, "setc/credit_mean": 1.0, "setc/credit_std": 0.05476319156587124, "setc/logratio_margin_delta": 26.124908447265625, "setc/logratio_margin_vanilla": 395.25, "setc/logratio_margin_weighted": 421.3749084472656, "step": 1750 }, { "entropy": 0.47578125, "epoch": 0.9485313931554837, "grad_norm": 33.9154052734375, "learning_rate": 8.301276369136912e-09, "logits/chosen": -1.5798610346038533, "logits/rejected": -1.6392516028325452, "logps/chosen": -602.1, "logps/rejected": -1041.4, "loss": 0.2397, "mean_token_accuracy": 0.7180195420980453, "num_tokens": 63005622.0, "rewards/accuracies": 0.9, "rewards/chosen": -5.272761929035187, "rewards/margins": 4.6579270362854, "rewards/rejected": -9.930689001083374, "setc/cal_net_lr": 0.0009631723382238432, "setc/credit_mean": 1.0, "setc/credit_std": 0.056856663711369035, "setc/logratio_margin_delta": 27.44272232055664, "setc/logratio_margin_vanilla": 438.35, "setc/logratio_margin_weighted": 465.79272232055666, "step": 1760 }, { "entropy": 0.48935546875, "epoch": 0.9539207760711399, "grad_norm": 71.26800537109375, "learning_rate": 6.68152639744346e-09, "logits/chosen": -1.650358383302045, "logits/rejected": -1.7143002367810154, "logps/chosen": -613.1, "logps/rejected": -987.6, "loss": 0.2931, "mean_token_accuracy": 0.7065669685602188, "num_tokens": 63373491.0, "rewards/accuracies": 0.875, "rewards/chosen": -5.3662453413009645, "rewards/margins": 3.9946064949035645, "rewards/rejected": -9.360851907730103, "setc/cal_net_lr": 0.0009699801888446867, "setc/credit_mean": 1.0, "setc/credit_std": 0.05508008189499378, "setc/logratio_margin_delta": 24.06066436767578, "setc/logratio_margin_vanilla": 375.4, "setc/logratio_margin_weighted": 399.4606643676758, "step": 1770 }, { "entropy": 0.46416015625, "epoch": 0.9593101589867961, "grad_norm": 25.516454696655273, "learning_rate": 5.236351153911855e-09, "logits/chosen": -1.7053886135754528, "logits/rejected": -1.753958930302091, "logps/chosen": -552.3, "logps/rejected": -933.8, "loss": 0.2468, "mean_token_accuracy": 0.7095615237951278, "num_tokens": 63708180.0, "rewards/accuracies": 0.884375, "rewards/chosen": -4.928412961959839, "rewards/margins": 4.00942040681839, "rewards/rejected": -8.937833166122436, "setc/cal_net_lr": 0.0009761115414575713, "setc/credit_mean": 1.0, "setc/credit_std": 0.05466606095433235, "setc/logratio_margin_delta": 26.192051696777344, "setc/logratio_margin_vanilla": 374.75, "setc/logratio_margin_weighted": 400.94205169677736, "step": 1780 }, { "entropy": 0.46875, "epoch": 0.9646995419024522, "grad_norm": 46.8420295715332, "learning_rate": 3.966262054774794e-09, "logits/chosen": -1.686366542637256, "logits/rejected": -1.7361481268819614, "logps/chosen": -585.3, "logps/rejected": -962.2, "loss": 0.2608, "mean_token_accuracy": 0.7172308325767517, "num_tokens": 64063107.0, "rewards/accuracies": 0.8875, "rewards/chosen": -5.106739115715027, "rewards/margins": 4.049427050352096, "rewards/rejected": -9.156166195869446, "setc/cal_net_lr": 0.0009815575704823403, "setc/credit_mean": 1.0, "setc/credit_std": 0.055073692835867404, "setc/logratio_margin_delta": 27.29270706176758, "setc/logratio_margin_vanilla": 377.65, "setc/logratio_margin_weighted": 404.9427070617676, "step": 1790 }, { "entropy": 0.47333984375, "epoch": 0.9700889248181084, "grad_norm": 41.61103820800781, "learning_rate": 2.8717085570689194e-09, "logits/chosen": -1.6285714864267278, "logits/rejected": -1.7007568628321508, "logps/chosen": -543.7, "logps/rejected": -972.0, "loss": 0.2488, "mean_token_accuracy": 0.7220604807138443, "num_tokens": 64415287.0, "rewards/accuracies": 0.915625, "rewards/chosen": -4.766039276123047, "rewards/margins": 4.599205017089844, "rewards/rejected": -9.365244293212891, "setc/cal_net_lr": 0.0009863104368060393, "setc/credit_mean": 1.0, "setc/credit_std": 0.0533838925883174, "setc/logratio_margin_delta": 31.27052230834961, "setc/logratio_margin_vanilla": 428.65, "setc/logratio_margin_weighted": 459.92052230834963, "step": 1800 }, { "epoch": 0.9700889248181084, "eval_entropy": 0.496337890625, "eval_logits/chosen": -1.6497335267505044, "eval_logits/rejected": -1.691259456206689, "eval_logps/chosen": -595.0, "eval_logps/rejected": -1017.0, "eval_loss": 0.242402583360672, "eval_mean_token_accuracy": 0.7130081094801426, "eval_num_tokens": 64415287.0, "eval_rewards/accuracies": 0.916015625, "eval_rewards/chosen": -5.198269315063953, "eval_rewards/margins": 4.524538166821003, "eval_rewards/rejected": -9.722807422280312, "eval_runtime": 14.196, "eval_samples_per_second": 35.221, "eval_setc/cal_net_lr": 0.0009885550958571609, "eval_setc/credit_mean": 1.0, "eval_setc/credit_std": 0.05235194368287921, "eval_setc/logratio_margin_delta": 28.659388542175293, "eval_setc/logratio_margin_vanilla": 425.4375, "eval_setc/logratio_margin_weighted": 454.0968885421753, "eval_steps_per_second": 2.254, "step": 1800 }, { "entropy": 0.49453125, "epoch": 0.9754783077337645, "grad_norm": 58.74081039428711, "learning_rate": 1.953077999581498e-09, "logits/chosen": -1.5973200334126898, "logits/rejected": -1.6452927645084596, "logps/chosen": -579.2, "logps/rejected": -1085.2, "loss": 0.2545, "mean_token_accuracy": 0.7099454998970032, "num_tokens": 64753784.0, "rewards/accuracies": 0.915625, "rewards/chosen": -5.223498034477234, "rewards/margins": 5.342882573604584, "rewards/rejected": -10.566380524635315, "setc/cal_net_lr": 0.0009903632990666783, "setc/credit_mean": 1.0, "setc/credit_std": 0.05157312601804733, "setc/logratio_margin_delta": 32.93826675415039, "setc/logratio_margin_vanilla": 501.35, "setc/logratio_margin_weighted": 534.2882667541504, "step": 1810 }, { "entropy": 0.4802734375, "epoch": 0.9808676906494206, "grad_norm": 35.12337112426758, "learning_rate": 1.210695465779732e-09, "logits/chosen": -1.6434376954136094, "logits/rejected": -1.6868111194134685, "logps/chosen": -599.4, "logps/rejected": -951.4, "loss": 0.31, "mean_token_accuracy": 0.7225580751895905, "num_tokens": 65114010.0, "rewards/accuracies": 0.88125, "rewards/chosen": -5.201381134986877, "rewards/margins": 3.772454595565796, "rewards/rejected": -8.973835754394532, "setc/cal_net_lr": 0.0009937103235008124, "setc/credit_mean": 1.0, "setc/credit_std": 0.04924789797514677, "setc/logratio_margin_delta": 25.745468139648438, "setc/logratio_margin_vanilla": 351.5, "setc/logratio_margin_weighted": 377.24546813964844, "step": 1820 }, { "entropy": 0.48134765625, "epoch": 0.9862570735650767, "grad_norm": 41.82625198364258, "learning_rate": 6.44823668770833e-10, "logits/chosen": -1.6893693610067781, "logits/rejected": -1.7599484026922272, "logps/chosen": -573.5, "logps/rejected": -927.0, "loss": 0.2913, "mean_token_accuracy": 0.7189024925231934, "num_tokens": 65453240.0, "rewards/accuracies": 0.871875, "rewards/chosen": -4.990872526168824, "rewards/margins": 3.8168386936187746, "rewards/rejected": -8.807711195945739, "setc/cal_net_lr": 0.000996346692340764, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.04604421444237232, "setc/logratio_margin_delta": 27.133870697021486, "setc/logratio_margin_vanilla": 354.55, "setc/logratio_margin_weighted": 381.68387069702146, "step": 1830 }, { "entropy": 0.4810546875, "epoch": 0.9916464564807329, "grad_norm": 64.11349487304688, "learning_rate": 2.556628583335052e-10, "logits/chosen": -1.6629026317813094, "logits/rejected": -1.702124887555161, "logps/chosen": -631.7, "logps/rejected": -1014.0, "loss": 0.2856, "mean_token_accuracy": 0.7035313218832016, "num_tokens": 65813540.0, "rewards/accuracies": 0.859375, "rewards/chosen": -5.657264351844788, "rewards/margins": 4.0819092988967896, "rewards/rejected": -9.739173555374146, "setc/cal_net_lr": 0.0009982686107494103, "setc/credit_mean": 1.0, "setc/credit_std": 0.04834742061793804, "setc/logratio_margin_delta": 25.690938568115236, "setc/logratio_margin_vanilla": 382.5, "setc/logratio_margin_weighted": 408.19093856811526, "step": 1840 }, { "entropy": 0.474609375, "epoch": 0.9970358393963891, "grad_norm": 32.58039855957031, "learning_rate": 4.3350750053627874e-11, "logits/chosen": -1.6136593090507105, "logits/rejected": -1.6928662572741515, "logps/chosen": -604.8, "logps/rejected": -1012.2, "loss": 0.294, "mean_token_accuracy": 0.712560287117958, "num_tokens": 66170915.0, "rewards/accuracies": 0.846875, "rewards/chosen": -5.385611450672149, "rewards/margins": 4.348533475399018, "rewards/rejected": -9.734145045280457, "setc/cal_net_lr": 0.0009994733122825335, "setc/credit_mean": 0.9999999970197677, "setc/credit_std": 0.04455969799309969, "setc/logratio_margin_delta": 27.10336227416992, "setc/logratio_margin_vanilla": 407.75, "setc/logratio_margin_weighted": 434.8533622741699, "step": 1850 } ], "logging_steps": 10, "max_steps": 1856, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3990617254086574e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }