{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 63, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 10.069208002389907, "learning_rate": 2e-05, "logits/chosen": 1.40966796875, "logits/rejected": 1.90234375, "logps/chosen": -202.875, "logps/rejected": -39.71875, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.032, "grad_norm": 12.366009999221836, "learning_rate": 4e-05, "logits/chosen": 1.37548828125, "logits/rejected": 2.0009765625, "logps/chosen": -251.25, "logps/rejected": -40.09375, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.048, "grad_norm": 0.7217222726873707, "learning_rate": 6e-05, "logits/chosen": 2.0615234375, "logits/rejected": 1.708984375, "logps/chosen": -253.8125, "logps/rejected": -70.875, "loss": 0.0181, "rewards/accuracies": 1.0, "rewards/chosen": 1.26220703125, "rewards/margins": 4.46875, "rewards/rejected": -3.208984375, "step": 3 }, { "epoch": 0.064, "grad_norm": 0.9046680440747444, "learning_rate": 8e-05, "logits/chosen": 1.35107421875, "logits/rejected": 1.4130859375, "logps/chosen": -194.25, "logps/rejected": -140.25, "loss": 0.0225, "rewards/accuracies": 1.0, "rewards/chosen": 0.402099609375, "rewards/margins": 10.078125, "rewards/rejected": -9.6875, "step": 4 }, { "epoch": 0.08, "grad_norm": 19.759479094786197, "learning_rate": 0.0001, "logits/chosen": 0.591796875, "logits/rejected": 0.6298599243164062, "logps/chosen": -431.25, "logps/rejected": -201.25, "loss": 0.7488, "rewards/accuracies": 0.8125, "rewards/chosen": -12.28125, "rewards/margins": 3.73828125, "rewards/rejected": -16.046875, "step": 5 }, { "epoch": 0.096, "grad_norm": 1.0315492671962288e-06, "learning_rate": 0.00012, "logits/chosen": 2.171875, "logits/rejected": 0.79638671875, "logps/chosen": -320.375, "logps/rejected": -269.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7626953125, "rewards/margins": 20.34375, "rewards/rejected": -23.15625, "step": 6 }, { "epoch": 0.112, "grad_norm": 0.0007913850356246983, "learning_rate": 0.00014, "logits/chosen": 2.4765625, "logits/rejected": 1.0927734375, "logps/chosen": -311.5, "logps/rejected": -300.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -7.390625, "rewards/margins": 18.8203125, "rewards/rejected": -26.234375, "step": 7 }, { "epoch": 0.128, "grad_norm": 3.55577097734532, "learning_rate": 0.00016, "logits/chosen": 2.388671875, "logits/rejected": 1.0126953125, "logps/chosen": -396.25, "logps/rejected": -314.0, "loss": 0.0956, "rewards/accuracies": 0.9375, "rewards/chosen": -14.8359375, "rewards/margins": 12.71875, "rewards/rejected": -27.546875, "step": 8 }, { "epoch": 0.144, "grad_norm": 1.1632494807287445e-10, "learning_rate": 0.00018, "logits/chosen": 3.025390625, "logits/rejected": 0.689453125, "logps/chosen": -277.0, "logps/rejected": -431.75, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.91796875, "rewards/margins": 33.296875, "rewards/rejected": -39.28125, "step": 9 }, { "epoch": 0.16, "grad_norm": 1.62273040339937e-05, "learning_rate": 0.0002, "logits/chosen": 3.20703125, "logits/rejected": 0.896484375, "logps/chosen": -288.25, "logps/rejected": -448.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -10.029296875, "rewards/margins": 30.765625, "rewards/rejected": -40.75, "step": 10 }, { "epoch": 0.176, "grad_norm": 29.485616401220902, "learning_rate": 0.00019622641509433963, "logits/chosen": 3.3828125, "logits/rejected": 1.173828125, "logps/chosen": -562.375, "logps/rejected": -450.75, "loss": 0.7779, "rewards/accuracies": 0.6875, "rewards/chosen": -28.53125, "rewards/margins": 12.4609375, "rewards/rejected": -40.9375, "step": 11 }, { "epoch": 0.192, "grad_norm": 1.934753970465984e-12, "learning_rate": 0.00019245283018867927, "logits/chosen": 3.322265625, "logits/rejected": 0.859375, "logps/chosen": -412.25, "logps/rejected": -626.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -17.078125, "rewards/margins": 41.640625, "rewards/rejected": -58.75, "step": 12 }, { "epoch": 0.208, "grad_norm": 12.013076102617397, "learning_rate": 0.00018867924528301889, "logits/chosen": 2.99609375, "logits/rejected": 0.555908203125, "logps/chosen": -595.375, "logps/rejected": -626.5, "loss": 0.5391, "rewards/accuracies": 0.9375, "rewards/chosen": -32.6640625, "rewards/margins": 25.96875, "rewards/rejected": -58.625, "step": 13 }, { "epoch": 0.224, "grad_norm": 7.6760618149121e-10, "learning_rate": 0.0001849056603773585, "logits/chosen": 2.908203125, "logits/rejected": 2.189453125, "logps/chosen": -475.25, "logps/rejected": -759.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.125, "rewards/margins": 47.96875, "rewards/rejected": -72.0, "step": 14 }, { "epoch": 0.24, "grad_norm": 890.1553140771243, "learning_rate": 0.00018113207547169812, "logits/chosen": 3.0703125, "logits/rejected": 2.83203125, "logps/chosen": -1423.0, "logps/rejected": -737.0, "loss": 57.7188, "rewards/accuracies": 0.375, "rewards/chosen": -117.1875, "rewards/margins": -48.046875, "rewards/rejected": -68.90625, "step": 15 }, { "epoch": 0.256, "grad_norm": 2.3766667756398814e-08, "learning_rate": 0.00017735849056603776, "logits/chosen": 4.076171875, "logits/rejected": 3.078125, "logps/chosen": -521.75, "logps/rejected": -752.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.234375, "rewards/margins": 43.96875, "rewards/rejected": -71.125, "step": 16 }, { "epoch": 0.272, "grad_norm": 3.6434206196266317e-06, "learning_rate": 0.00017358490566037738, "logits/chosen": 4.32421875, "logits/rejected": 3.45703125, "logps/chosen": -449.0, "logps/rejected": -740.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -24.625, "rewards/margins": 44.890625, "rewards/rejected": -69.3125, "step": 17 }, { "epoch": 0.288, "grad_norm": 0.0001148921285248143, "learning_rate": 0.000169811320754717, "logits/chosen": 4.5546875, "logits/rejected": 3.6796875, "logps/chosen": -527.0, "logps/rejected": -753.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.4140625, "rewards/margins": 41.578125, "rewards/rejected": -71.0, "step": 18 }, { "epoch": 0.304, "grad_norm": 14.827452248587537, "learning_rate": 0.0001660377358490566, "logits/chosen": 4.703125, "logits/rejected": 4.08203125, "logps/chosen": -643.25, "logps/rejected": -741.5, "loss": 0.4417, "rewards/accuracies": 0.875, "rewards/chosen": -39.75, "rewards/margins": 30.328125, "rewards/rejected": -70.0625, "step": 19 }, { "epoch": 0.32, "grad_norm": 2.2454827573591818e-18, "learning_rate": 0.00016226415094339625, "logits/chosen": 2.88671875, "logits/rejected": 3.017578125, "logps/chosen": -506.75, "logps/rejected": -834.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.97265625, "rewards/margins": 57.4375, "rewards/rejected": -79.5, "step": 20 }, { "epoch": 0.336, "grad_norm": 7.951357148069105e-13, "learning_rate": 0.00015849056603773587, "logits/chosen": 1.654296875, "logits/rejected": 2.53125, "logps/chosen": -409.0, "logps/rejected": -849.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -21.46484375, "rewards/margins": 59.28125, "rewards/rejected": -80.875, "step": 21 }, { "epoch": 0.352, "grad_norm": 0.002673619933126348, "learning_rate": 0.0001547169811320755, "logits/chosen": 0.790283203125, "logits/rejected": 2.2431640625, "logps/chosen": -725.0, "logps/rejected": -822.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.8125, "rewards/margins": 33.734375, "rewards/rejected": -77.625, "step": 22 }, { "epoch": 0.368, "grad_norm": 28.803556786399305, "learning_rate": 0.0001509433962264151, "logits/chosen": 0.34479522705078125, "logits/rejected": 2.234375, "logps/chosen": -807.5, "logps/rejected": -829.0, "loss": 0.737, "rewards/accuracies": 0.8125, "rewards/chosen": -52.953125, "rewards/margins": 25.828125, "rewards/rejected": -78.75, "step": 23 }, { "epoch": 0.384, "grad_norm": 6.616760062152765e-18, "learning_rate": 0.00014716981132075472, "logits/chosen": 1.5146484375, "logits/rejected": 1.8779296875, "logps/chosen": -495.75, "logps/rejected": -890.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.546875, "rewards/margins": 62.40625, "rewards/rejected": -85.0625, "step": 24 }, { "epoch": 0.4, "grad_norm": 5.053256029955234e-22, "learning_rate": 0.00014339622641509434, "logits/chosen": 1.84375, "logits/rejected": 1.8896484375, "logps/chosen": -420.25, "logps/rejected": -919.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -20.05078125, "rewards/margins": 67.875, "rewards/rejected": -87.9375, "step": 25 }, { "epoch": 0.416, "grad_norm": 2.9563266933229072e-15, "learning_rate": 0.00013962264150943395, "logits/chosen": 2.0087890625, "logits/rejected": 1.9580078125, "logps/chosen": -558.0, "logps/rejected": -932.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.9375, "rewards/margins": 59.375, "rewards/rejected": -89.1875, "step": 26 }, { "epoch": 0.432, "grad_norm": 1.477158681297205e-12, "learning_rate": 0.0001358490566037736, "logits/chosen": 2.072265625, "logits/rejected": 2.03515625, "logps/chosen": -596.5, "logps/rejected": -940.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.125, "rewards/margins": 56.96875, "rewards/rejected": -90.1875, "step": 27 }, { "epoch": 0.448, "grad_norm": 1.1066552941906446e-16, "learning_rate": 0.0001320754716981132, "logits/chosen": 2.1240234375, "logits/rejected": 2.109375, "logps/chosen": -408.0, "logps/rejected": -944.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -22.03125, "rewards/margins": 68.375, "rewards/rejected": -90.375, "step": 28 }, { "epoch": 0.464, "grad_norm": 9.271918418710529e-10, "learning_rate": 0.00012830188679245283, "logits/chosen": 2.138671875, "logits/rejected": 2.205078125, "logps/chosen": -733.5, "logps/rejected": -941.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.875, "rewards/margins": 46.34375, "rewards/rejected": -90.1875, "step": 29 }, { "epoch": 0.48, "grad_norm": 1.8897943489643426e-08, "learning_rate": 0.00012452830188679244, "logits/chosen": 2.1201171875, "logits/rejected": 2.29296875, "logps/chosen": -630.5, "logps/rejected": -936.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -35.59375, "rewards/margins": 53.625, "rewards/rejected": -89.1875, "step": 30 }, { "epoch": 0.496, "grad_norm": 3.4735316128098106e-10, "learning_rate": 0.00012075471698113207, "logits/chosen": 2.0654296875, "logits/rejected": 2.3076171875, "logps/chosen": -639.0, "logps/rejected": -939.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.546875, "rewards/margins": 52.625, "rewards/rejected": -89.375, "step": 31 }, { "epoch": 0.512, "grad_norm": 2.30972605929251e-07, "learning_rate": 0.0001169811320754717, "logits/chosen": 2.197265625, "logits/rejected": 2.4921875, "logps/chosen": -516.25, "logps/rejected": -920.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.734375, "rewards/margins": 58.25, "rewards/rejected": -88.0, "step": 32 }, { "epoch": 0.528, "grad_norm": 5.090800553960521e-08, "learning_rate": 0.00011320754716981132, "logits/chosen": 2.0634765625, "logits/rejected": 2.587890625, "logps/chosen": -614.0, "logps/rejected": -912.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.8359375, "rewards/margins": 50.3125, "rewards/rejected": -87.125, "step": 33 }, { "epoch": 0.544, "grad_norm": 7.860715190818793e-09, "learning_rate": 0.00010943396226415095, "logits/chosen": 2.130859375, "logits/rejected": 2.65234375, "logps/chosen": -661.0, "logps/rejected": -904.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -39.125, "rewards/margins": 47.484375, "rewards/rejected": -86.5, "step": 34 }, { "epoch": 0.56, "grad_norm": 0.14310502335505565, "learning_rate": 0.00010566037735849057, "logits/chosen": 2.1650390625, "logits/rejected": 2.712890625, "logps/chosen": -625.5, "logps/rejected": -895.5, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -37.859375, "rewards/margins": 47.5625, "rewards/rejected": -85.375, "step": 35 }, { "epoch": 0.576, "grad_norm": 2.342334837346181e-10, "learning_rate": 0.0001018867924528302, "logits/chosen": 1.9853515625, "logits/rejected": 2.67578125, "logps/chosen": -549.25, "logps/rejected": -891.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -31.53125, "rewards/margins": 53.4375, "rewards/rejected": -85.0, "step": 36 }, { "epoch": 0.592, "grad_norm": 0.3363564536472393, "learning_rate": 9.811320754716981e-05, "logits/chosen": 1.845703125, "logits/rejected": 2.556640625, "logps/chosen": -494.25, "logps/rejected": -847.5, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -28.6015625, "rewards/margins": 51.90625, "rewards/rejected": -80.5, "step": 37 }, { "epoch": 0.608, "grad_norm": 1.3263620018804851e-11, "learning_rate": 9.433962264150944e-05, "logits/chosen": 1.615234375, "logits/rejected": 2.5078125, "logps/chosen": -561.25, "logps/rejected": -882.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.765625, "rewards/margins": 53.28125, "rewards/rejected": -84.0, "step": 38 }, { "epoch": 0.624, "grad_norm": 1.0577545951792833, "learning_rate": 9.056603773584906e-05, "logits/chosen": 1.38671875, "logits/rejected": 2.419921875, "logps/chosen": -612.75, "logps/rejected": -878.5, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": -36.984375, "rewards/margins": 46.96875, "rewards/rejected": -84.0625, "step": 39 }, { "epoch": 0.64, "grad_norm": 3.7701859084434684e-16, "learning_rate": 8.679245283018869e-05, "logits/chosen": 0.78564453125, "logits/rejected": 1.27978515625, "logps/chosen": -524.25, "logps/rejected": -960.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.9765625, "rewards/margins": 60.71875, "rewards/rejected": -90.8125, "step": 40 }, { "epoch": 0.656, "grad_norm": 2.01326547033749e-15, "learning_rate": 8.30188679245283e-05, "logits/chosen": 0.541259765625, "logits/rejected": 1.248046875, "logps/chosen": -575.0, "logps/rejected": -967.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.984375, "rewards/margins": 61.8125, "rewards/rejected": -92.8125, "step": 41 }, { "epoch": 0.672, "grad_norm": 9.322472149670949e-15, "learning_rate": 7.924528301886794e-05, "logits/chosen": 0.2445068359375, "logits/rejected": 1.1640625, "logps/chosen": -507.25, "logps/rejected": -960.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.421875, "rewards/margins": 64.65625, "rewards/rejected": -92.125, "step": 42 }, { "epoch": 0.688, "grad_norm": 7.769637270952163e-08, "learning_rate": 7.547169811320755e-05, "logits/chosen": -0.0588526725769043, "logits/rejected": 1.035888671875, "logps/chosen": -499.75, "logps/rejected": -1004.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.546875, "rewards/margins": 66.328125, "rewards/rejected": -95.875, "step": 43 }, { "epoch": 0.704, "grad_norm": 4.25507341439961e-09, "learning_rate": 7.169811320754717e-05, "logits/chosen": -0.14769744873046875, "logits/rejected": 1.09765625, "logps/chosen": -733.25, "logps/rejected": -949.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -43.71875, "rewards/margins": 47.25, "rewards/rejected": -91.0, "step": 44 }, { "epoch": 0.72, "grad_norm": 7.60966895891367e-15, "learning_rate": 6.79245283018868e-05, "logits/chosen": -0.33404541015625, "logits/rejected": 1.099609375, "logps/chosen": -505.5, "logps/rejected": -944.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.4609375, "rewards/margins": 60.8125, "rewards/rejected": -90.25, "step": 45 }, { "epoch": 0.736, "grad_norm": 92.0641578664817, "learning_rate": 6.415094339622641e-05, "logits/chosen": -0.5057373046875, "logits/rejected": 1.1044921875, "logps/chosen": -745.25, "logps/rejected": -936.0, "loss": 1.5391, "rewards/accuracies": 0.875, "rewards/chosen": -48.71875, "rewards/margins": 40.84375, "rewards/rejected": -89.5625, "step": 46 }, { "epoch": 0.752, "grad_norm": 0.002803042076571359, "learning_rate": 6.037735849056604e-05, "logits/chosen": -0.39910888671875, "logits/rejected": 0.93603515625, "logps/chosen": -528.0, "logps/rejected": -915.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.0546875, "rewards/margins": 57.125, "rewards/rejected": -87.0625, "step": 47 }, { "epoch": 0.768, "grad_norm": 5.448283044130183e-16, "learning_rate": 5.660377358490566e-05, "logits/chosen": -0.558837890625, "logits/rejected": 1.12890625, "logps/chosen": -529.5, "logps/rejected": -944.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -28.7890625, "rewards/margins": 61.96875, "rewards/rejected": -90.8125, "step": 48 }, { "epoch": 0.784, "grad_norm": 1.6427966944713224e-12, "learning_rate": 5.283018867924528e-05, "logits/chosen": -0.394775390625, "logits/rejected": 1.146484375, "logps/chosen": -676.25, "logps/rejected": -947.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -38.453125, "rewards/margins": 52.25, "rewards/rejected": -90.75, "step": 49 }, { "epoch": 0.8, "grad_norm": 3.810907085088454e-16, "learning_rate": 4.9056603773584906e-05, "logits/chosen": -0.5797119140625, "logits/rejected": 1.154296875, "logps/chosen": -526.0, "logps/rejected": -946.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -29.6640625, "rewards/margins": 61.1875, "rewards/rejected": -90.875, "step": 50 }, { "epoch": 0.816, "grad_norm": 6.364667179704401e-12, "learning_rate": 4.528301886792453e-05, "logits/chosen": -0.573974609375, "logits/rejected": 1.1572265625, "logps/chosen": -588.5, "logps/rejected": -948.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.515625, "rewards/margins": 57.21875, "rewards/rejected": -90.8125, "step": 51 }, { "epoch": 0.832, "grad_norm": 1.4796603970496823e-15, "learning_rate": 4.150943396226415e-05, "logits/chosen": -0.509521484375, "logits/rejected": 1.1650390625, "logps/chosen": -624.75, "logps/rejected": -946.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.671875, "rewards/margins": 56.1875, "rewards/rejected": -90.9375, "step": 52 }, { "epoch": 0.848, "grad_norm": 2.3490727781381015e-13, "learning_rate": 3.7735849056603776e-05, "logits/chosen": -0.517822265625, "logits/rejected": 1.177734375, "logps/chosen": -665.625, "logps/rejected": -949.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -37.21875, "rewards/margins": 53.5, "rewards/rejected": -90.6875, "step": 53 }, { "epoch": 0.864, "grad_norm": 2.3715956941641845e-14, "learning_rate": 3.39622641509434e-05, "logits/chosen": -0.52734375, "logits/rejected": 1.181640625, "logps/chosen": -669.5, "logps/rejected": -946.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -39.078125, "rewards/margins": 51.59375, "rewards/rejected": -90.8125, "step": 54 }, { "epoch": 0.88, "grad_norm": 3.890201643859771e-08, "learning_rate": 3.018867924528302e-05, "logits/chosen": -0.495361328125, "logits/rejected": 1.17578125, "logps/chosen": -729.5, "logps/rejected": -948.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -44.0625, "rewards/margins": 46.671875, "rewards/rejected": -91.0, "step": 55 }, { "epoch": 0.896, "grad_norm": 8.799502093565177e-14, "learning_rate": 2.641509433962264e-05, "logits/chosen": -0.48876953125, "logits/rejected": 1.1767578125, "logps/chosen": -607.25, "logps/rejected": -950.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -34.78125, "rewards/margins": 56.15625, "rewards/rejected": -90.9375, "step": 56 }, { "epoch": 0.912, "grad_norm": 8.933140987778416e-16, "learning_rate": 2.2641509433962265e-05, "logits/chosen": -0.56591796875, "logits/rejected": 1.1875, "logps/chosen": -461.75, "logps/rejected": -950.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -27.2734375, "rewards/margins": 63.78125, "rewards/rejected": -91.0, "step": 57 }, { "epoch": 0.928, "grad_norm": 6.596288118090753e-11, "learning_rate": 1.8867924528301888e-05, "logits/chosen": -0.572998046875, "logits/rejected": 1.1953125, "logps/chosen": -578.25, "logps/rejected": -951.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.890625, "rewards/margins": 56.96875, "rewards/rejected": -90.875, "step": 58 }, { "epoch": 0.944, "grad_norm": 3.7756990031247874e-10, "learning_rate": 1.509433962264151e-05, "logits/chosen": -0.56298828125, "logits/rejected": 1.040863037109375, "logps/chosen": -521.5, "logps/rejected": -947.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -30.6875, "rewards/margins": 59.34375, "rewards/rejected": -90.125, "step": 59 }, { "epoch": 0.96, "grad_norm": 1.7401296070209917e-09, "learning_rate": 1.1320754716981132e-05, "logits/chosen": -0.534423828125, "logits/rejected": 1.189453125, "logps/chosen": -612.75, "logps/rejected": -951.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -36.921875, "rewards/margins": 53.96875, "rewards/rejected": -90.9375, "step": 60 }, { "epoch": 0.976, "grad_norm": 1.3595843194900355e-10, "learning_rate": 7.547169811320755e-06, "logits/chosen": -0.457275390625, "logits/rejected": 1.19140625, "logps/chosen": -684.5, "logps/rejected": -951.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -40.59375, "rewards/margins": 50.40625, "rewards/rejected": -91.0, "step": 61 }, { "epoch": 0.992, "grad_norm": 1.292032339993031e-10, "learning_rate": 3.7735849056603773e-06, "logits/chosen": -0.46923828125, "logits/rejected": 1.1953125, "logps/chosen": -576.0, "logps/rejected": -950.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -33.90625, "rewards/margins": 57.03125, "rewards/rejected": -91.0, "step": 62 }, { "epoch": 1.0, "grad_norm": 1.292032339993031e-10, "learning_rate": 0.0, "logits/chosen": -0.58837890625, "logits/rejected": 1.1875, "logps/chosen": -526.5, "logps/rejected": -951.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -32.609375, "rewards/margins": 58.15625, "rewards/rejected": -91.0, "step": 63 } ], "logging_steps": 1, "max_steps": 63, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }