{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 100, "global_step": 1135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04419889502762431, "grad_norm": 31.912578120558575, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -34.45624923706055, "logits/rejected": -34.537498474121094, "logps/chosen": -63.181251525878906, "logps/rejected": -63.20624923706055, "loss": 0.7006, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.01593170128762722, "rewards/margins": -0.0059608458541333675, "rewards/rejected": 0.021885300055146217, "step": 10 }, { "epoch": 0.08839779005524862, "grad_norm": 156.60291460571648, "learning_rate": 8.333333333333333e-07, "logits/chosen": -34.21875, "logits/rejected": -33.884376525878906, "logps/chosen": -62.34375, "logps/rejected": -62.75, "loss": 0.7024, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.04129333421587944, "rewards/margins": -0.010060501284897327, "rewards/rejected": 0.051354218274354935, "step": 20 }, { "epoch": 0.13259668508287292, "grad_norm": 98.29416525426021, "learning_rate": 1.2719298245614037e-06, "logits/chosen": -34.19062423706055, "logits/rejected": -33.912498474121094, "logps/chosen": -61.806251525878906, "logps/rejected": -62.71875, "loss": 0.6988, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.03524055331945419, "rewards/margins": -0.0033996582496911287, "rewards/rejected": 0.03864631801843643, "step": 30 }, { "epoch": 0.17679558011049723, "grad_norm": 188.22174498321468, "learning_rate": 1.710526315789474e-06, "logits/chosen": -34.28437423706055, "logits/rejected": -34.318748474121094, "logps/chosen": -62.73125076293945, "logps/rejected": -63.17499923706055, "loss": 0.6965, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.028417300432920456, "rewards/margins": 0.0014274597633630037, "rewards/rejected": 0.026988601312041283, "step": 40 }, { "epoch": 0.22099447513812154, "grad_norm": 93.03764687186249, "learning_rate": 2.149122807017544e-06, "logits/chosen": -34.39374923706055, "logits/rejected": -34.037498474121094, "logps/chosen": -62.40625, "logps/rejected": -62.35625076293945, "loss": 0.7024, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.02354583702981472, "rewards/margins": -0.00987930316478014, "rewards/rejected": 0.03342132642865181, "step": 50 }, { "epoch": 0.26519337016574585, "grad_norm": 212.86610633385393, "learning_rate": 2.5877192982456147e-06, "logits/chosen": -34.96562576293945, "logits/rejected": -34.71875, "logps/chosen": -64.2125015258789, "logps/rejected": -63.89374923706055, "loss": 0.6975, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.023821258917450905, "rewards/margins": 0.00054931640625, "rewards/rejected": 0.023278426378965378, "step": 60 }, { "epoch": 0.30939226519337015, "grad_norm": 333.4212323543566, "learning_rate": 3.0263157894736843e-06, "logits/chosen": -34.368751525878906, "logits/rejected": -34.334373474121094, "logps/chosen": -61.3125, "logps/rejected": -62.849998474121094, "loss": 0.6932, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 0.035521697252988815, "rewards/margins": 0.009041977114975452, "rewards/rejected": 0.026560593396425247, "step": 70 }, { "epoch": 0.35359116022099446, "grad_norm": 69.05772643062294, "learning_rate": 3.464912280701755e-06, "logits/chosen": -34.415626525878906, "logits/rejected": -34.90937423706055, "logps/chosen": -64.21875, "logps/rejected": -63.86249923706055, "loss": 0.7031, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.007965469732880592, "rewards/margins": -0.012713241390883923, "rewards/rejected": 0.020727921277284622, "step": 80 }, { "epoch": 0.39779005524861877, "grad_norm": 75.94384074445908, "learning_rate": 3.903508771929825e-06, "logits/chosen": -34.974998474121094, "logits/rejected": -34.453125, "logps/chosen": -62.4375, "logps/rejected": -63.14374923706055, "loss": 0.6972, "rewards/accuracies": 0.46875, "rewards/chosen": 0.03527069091796875, "rewards/margins": -0.0014644622569903731, "rewards/rejected": 0.036772917956113815, "step": 90 }, { "epoch": 0.4419889502762431, "grad_norm": 90.31215319727629, "learning_rate": 4.342105263157895e-06, "logits/chosen": -35.16875076293945, "logits/rejected": -35.353126525878906, "logps/chosen": -63.29999923706055, "logps/rejected": -65.1875, "loss": 0.697, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.015990447252988815, "rewards/margins": 0.0013881683116778731, "rewards/rejected": 0.014550590887665749, "step": 100 }, { "epoch": 0.4419889502762431, "eval_logits/chosen": -34.13163757324219, "eval_logits/rejected": -33.74336242675781, "eval_logps/chosen": -64.11504364013672, "eval_logps/rejected": -67.4137191772461, "eval_loss": 0.6939815878868103, "eval_rewards/accuracies": 0.39988937973976135, "eval_rewards/chosen": 0.005282680504024029, "eval_rewards/margins": -0.0008932130294851959, "eval_rewards/rejected": 0.006183590739965439, "eval_runtime": 145.4485, "eval_samples_per_second": 12.431, "eval_steps_per_second": 0.777, "step": 100 }, { "epoch": 0.4861878453038674, "grad_norm": 101.63630769144744, "learning_rate": 4.780701754385965e-06, "logits/chosen": -34.06562423706055, "logits/rejected": -33.978126525878906, "logps/chosen": -62.349998474121094, "logps/rejected": -64.55000305175781, "loss": 0.6906, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.04169921949505806, "rewards/margins": 0.012572860345244408, "rewards/rejected": 0.02917175367474556, "step": 110 }, { "epoch": 0.5303867403314917, "grad_norm": 13.504639212397363, "learning_rate": 4.9997041376403694e-06, "logits/chosen": -32.359375, "logits/rejected": -32.415626525878906, "logps/chosen": -59.11249923706055, "logps/rejected": -60.025001525878906, "loss": 0.6932, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.07001037895679474, "rewards/margins": 0.007287025451660156, "rewards/rejected": 0.06273727118968964, "step": 120 }, { "epoch": 0.574585635359116, "grad_norm": 38.93851829526993, "learning_rate": 4.997337658912519e-06, "logits/chosen": -31.584375381469727, "logits/rejected": -31.568750381469727, "logps/chosen": -57.65625, "logps/rejected": -57.193748474121094, "loss": 0.7, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.08158035576343536, "rewards/margins": -0.007588195614516735, "rewards/rejected": 0.08919067680835724, "step": 130 }, { "epoch": 0.6187845303867403, "grad_norm": 79.40195510873717, "learning_rate": 4.992606941810579e-06, "logits/chosen": -31.971874237060547, "logits/rejected": -32.06562423706055, "logps/chosen": -54.493751525878906, "logps/rejected": -56.506248474121094, "loss": 0.6939, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.09824828803539276, "rewards/margins": 0.00538558978587389, "rewards/rejected": 0.09286651760339737, "step": 140 }, { "epoch": 0.6629834254143646, "grad_norm": 140.17380459713385, "learning_rate": 4.985516464921125e-06, "logits/chosen": -32.240623474121094, "logits/rejected": -32.12812423706055, "logps/chosen": -54.25, "logps/rejected": -57.23125076293945, "loss": 0.6902, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.10339584201574326, "rewards/margins": 0.01259765587747097, "rewards/rejected": 0.09071807563304901, "step": 150 }, { "epoch": 0.7071823204419889, "grad_norm": 70.5462452292945, "learning_rate": 4.9760729408236466e-06, "logits/chosen": -33.51874923706055, "logits/rejected": -33.734375, "logps/chosen": -56.875, "logps/rejected": -57.42499923706055, "loss": 0.6979, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.07443694770336151, "rewards/margins": -0.001873016357421875, "rewards/rejected": 0.0762714371085167, "step": 160 }, { "epoch": 0.7513812154696132, "grad_norm": 45.228951661428596, "learning_rate": 4.964285309735732e-06, "logits/chosen": -34.931251525878906, "logits/rejected": -35.32500076293945, "logps/chosen": -62.006248474121094, "logps/rejected": -62.224998474121094, "loss": 0.7053, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 0.027483750134706497, "rewards/margins": -0.01620330847799778, "rewards/rejected": 0.043679047375917435, "step": 170 }, { "epoch": 0.7955801104972375, "grad_norm": 27.486138227592058, "learning_rate": 4.9501647310493275e-06, "logits/chosen": -35.20624923706055, "logits/rejected": -35.64374923706055, "logps/chosen": -64.5, "logps/rejected": -64.83125305175781, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": 0.028644943609833717, "rewards/margins": -0.0013900756603106856, "rewards/rejected": 0.030038069933652878, "step": 180 }, { "epoch": 0.8397790055248618, "grad_norm": 18.549640140702518, "learning_rate": 4.933724572766102e-06, "logits/chosen": -35.36249923706055, "logits/rejected": -35.17499923706055, "logps/chosen": -63.75, "logps/rejected": -61.900001525878906, "loss": 0.7112, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.008522415533661842, "rewards/margins": -0.029364775866270065, "rewards/rejected": 0.03787269443273544, "step": 190 }, { "epoch": 0.8839779005524862, "grad_norm": 38.9951177948575, "learning_rate": 4.914980398841915e-06, "logits/chosen": -35.821876525878906, "logits/rejected": -35.66875076293945, "logps/chosen": -58.181251525878906, "logps/rejected": -60.118751525878906, "loss": 0.6944, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 0.07456474006175995, "rewards/margins": 0.0073871612548828125, "rewards/rejected": 0.06726684421300888, "step": 200 }, { "epoch": 0.8839779005524862, "eval_logits/chosen": -34.85619354248047, "eval_logits/rejected": -34.648231506347656, "eval_logps/chosen": -55.311946868896484, "eval_logps/rejected": -56.56858444213867, "eval_loss": 0.7062707543373108, "eval_rewards/accuracies": 0.4148229956626892, "eval_rewards/chosen": 0.09352908283472061, "eval_rewards/margins": -0.021051863208413124, "eval_rewards/rejected": 0.11459620296955109, "eval_runtime": 145.1634, "eval_samples_per_second": 12.455, "eval_steps_per_second": 0.778, "step": 200 }, { "epoch": 0.9281767955801105, "grad_norm": 193.47798110649993, "learning_rate": 4.8939499544523635e-06, "logits/chosen": -33.82500076293945, "logits/rejected": -33.98125076293945, "logps/chosen": -54.45624923706055, "logps/rejected": -55.76874923706055, "loss": 0.692, "rewards/accuracies": 0.546875, "rewards/chosen": 0.11307983100414276, "rewards/margins": 0.009380340576171875, "rewards/rejected": 0.10376129299402237, "step": 210 }, { "epoch": 0.9723756906077348, "grad_norm": 115.37637912313947, "learning_rate": 4.870653149193363e-06, "logits/chosen": -33.193748474121094, "logits/rejected": -33.32500076293945, "logps/chosen": -52.537498474121094, "logps/rejected": -52.71875, "loss": 0.6961, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.12864074110984802, "rewards/margins": 0.0001373291015625, "rewards/rejected": 0.12855835258960724, "step": 220 }, { "epoch": 1.0132596685082873, "grad_norm": 108.55588797717557, "learning_rate": 4.845112038232657e-06, "logits/chosen": -33.42567443847656, "logits/rejected": -33.885135650634766, "logps/chosen": -51.58108139038086, "logps/rejected": -52.9054069519043, "loss": 0.7, "rewards/accuracies": 0.5135135054588318, "rewards/chosen": 0.12744469940662384, "rewards/margins": -0.005753800738602877, "rewards/rejected": 0.133209228515625, "step": 230 }, { "epoch": 1.0574585635359115, "grad_norm": 44.644728918778526, "learning_rate": 4.817350801430122e-06, "logits/chosen": -32.58124923706055, "logits/rejected": -32.65937423706055, "logps/chosen": -53.65625, "logps/rejected": -53.98125076293945, "loss": 0.6967, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.11483154445886612, "rewards/margins": -0.001277923583984375, "rewards/rejected": 0.11615677177906036, "step": 240 }, { "epoch": 1.101657458563536, "grad_norm": 111.3089775307173, "learning_rate": 4.7873957204466e-06, "logits/chosen": -33.037498474121094, "logits/rejected": -32.765625, "logps/chosen": -56.35625076293945, "logps/rejected": -56.431251525878906, "loss": 0.7031, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.09192390739917755, "rewards/margins": -0.0130157470703125, "rewards/rejected": 0.10481490939855576, "step": 250 }, { "epoch": 1.1458563535911601, "grad_norm": 54.614100005608506, "learning_rate": 4.75527515386296e-06, "logits/chosen": -33.875, "logits/rejected": -33.412498474121094, "logps/chosen": -59.381248474121094, "logps/rejected": -60.66875076293945, "loss": 0.6929, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.06636963039636612, "rewards/margins": 0.007328033447265625, "rewards/rejected": 0.05897827073931694, "step": 260 }, { "epoch": 1.1900552486187845, "grad_norm": 66.75356578243974, "learning_rate": 4.721019510332931e-06, "logits/chosen": -33.68437576293945, "logits/rejected": -34.087501525878906, "logps/chosen": -58.5, "logps/rejected": -59.96875, "loss": 0.6894, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.06654815375804901, "rewards/margins": 0.015409087762236595, "rewards/rejected": 0.05117645114660263, "step": 270 }, { "epoch": 1.234254143646409, "grad_norm": 71.46398525887813, "learning_rate": 4.684661219795123e-06, "logits/chosen": -35.400001525878906, "logits/rejected": -34.94062423706055, "logps/chosen": -58.568748474121094, "logps/rejected": -62.881248474121094, "loss": 0.6822, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.057286836206912994, "rewards/margins": 0.03053588792681694, "rewards/rejected": 0.02678680419921875, "step": 280 }, { "epoch": 1.2784530386740331, "grad_norm": 969.0135287018081, "learning_rate": 4.646234702771485e-06, "logits/chosen": -36.443748474121094, "logits/rejected": -36.256248474121094, "logps/chosen": -63.78125, "logps/rejected": -65.1875, "loss": 0.6928, "rewards/accuracies": 0.546875, "rewards/chosen": 0.010564422234892845, "rewards/margins": 0.00908737163990736, "rewards/rejected": 0.0014781951904296875, "step": 290 }, { "epoch": 1.3226519337016573, "grad_norm": 177.59084774019237, "learning_rate": 4.6057763377812795e-06, "logits/chosen": -35.65625, "logits/rejected": -35.806251525878906, "logps/chosen": -61.662498474121094, "logps/rejected": -62.61249923706055, "loss": 0.6976, "rewards/accuracies": 0.53125, "rewards/chosen": 0.04006652906537056, "rewards/margins": -0.0002792358282022178, "rewards/rejected": 0.04036102443933487, "step": 300 }, { "epoch": 1.3226519337016573, "eval_logits/chosen": -36.53982162475586, "eval_logits/rejected": -36.362831115722656, "eval_logps/chosen": -64.61504364013672, "eval_logps/rejected": -66.25, "eval_loss": 0.7048534154891968, "eval_rewards/accuracies": 0.3949114978313446, "eval_rewards/chosen": 0.0005568107590079308, "eval_rewards/margins": -0.017061756923794746, "eval_rewards/rejected": 0.01762552186846733, "eval_runtime": 144.7893, "eval_samples_per_second": 12.487, "eval_steps_per_second": 0.78, "step": 300 }, { "epoch": 1.3668508287292818, "grad_norm": 306.3599660205783, "learning_rate": 4.56332442690141e-06, "logits/chosen": -35.66875076293945, "logits/rejected": -35.58124923706055, "logps/chosen": -61.912498474121094, "logps/rejected": -64.42500305175781, "loss": 0.6863, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.04250755161046982, "rewards/margins": 0.024246979504823685, "rewards/rejected": 0.01823272742331028, "step": 310 }, { "epoch": 1.4110497237569062, "grad_norm": 249.5041102060508, "learning_rate": 4.5189191595057056e-06, "logits/chosen": -36.712501525878906, "logits/rejected": -36.212501525878906, "logps/chosen": -64.3187484741211, "logps/rejected": -65.1937484741211, "loss": 0.699, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035425187088549137, "rewards/margins": -0.0027992248069494963, "rewards/rejected": -0.0007007598760537803, "step": 320 }, { "epoch": 1.4552486187845304, "grad_norm": 130.9635465605912, "learning_rate": 4.472602574217504e-06, "logits/chosen": -36.86249923706055, "logits/rejected": -36.400001525878906, "logps/chosen": -65.63749694824219, "logps/rejected": -67.20625305175781, "loss": 0.6993, "rewards/accuracies": 0.5, "rewards/chosen": -0.010945891961455345, "rewards/margins": -0.004943275358527899, "rewards/rejected": -0.005972099490463734, "step": 330 }, { "epoch": 1.4994475138121546, "grad_norm": 98.7663555456692, "learning_rate": 4.424418519111536e-06, "logits/chosen": -38.0625, "logits/rejected": -38.48125076293945, "logps/chosen": -68.2125015258789, "logps/rejected": -71.4312515258789, "loss": 0.6849, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.013086128048598766, "rewards/margins": 0.02442779578268528, "rewards/rejected": -0.03748359531164169, "step": 340 }, { "epoch": 1.543646408839779, "grad_norm": 155.42214285049903, "learning_rate": 4.374412610202799e-06, "logits/chosen": -37.14374923706055, "logits/rejected": -37.58124923706055, "logps/chosen": -68.3687515258789, "logps/rejected": -71.1500015258789, "loss": 0.687, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0279083251953125, "rewards/margins": 0.020545577630400658, "rewards/rejected": -0.0484277717769146, "step": 350 }, { "epoch": 1.5878453038674034, "grad_norm": 101.97845702466351, "learning_rate": 4.322632188261711e-06, "logits/chosen": -37.79375076293945, "logits/rejected": -38.099998474121094, "logps/chosen": -70.03125, "logps/rejected": -72.0374984741211, "loss": 0.6927, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.05641632154583931, "rewards/margins": 0.009358977898955345, "rewards/rejected": -0.06578445434570312, "step": 360 }, { "epoch": 1.6320441988950276, "grad_norm": 160.8558514395795, "learning_rate": 4.2691262739964456e-06, "logits/chosen": -39.08124923706055, "logits/rejected": -39.193748474121094, "logps/chosen": -73.82499694824219, "logps/rejected": -75.375, "loss": 0.6909, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -0.07808151096105576, "rewards/margins": 0.012683868408203125, "rewards/rejected": -0.09075317531824112, "step": 370 }, { "epoch": 1.6762430939226518, "grad_norm": 166.58956587251402, "learning_rate": 4.213945521644842e-06, "logits/chosen": -39.15625, "logits/rejected": -38.96875, "logps/chosen": -71.2750015258789, "logps/rejected": -72.76249694824219, "loss": 0.6977, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.05317115783691406, "rewards/margins": -0.0004726409970317036, "rewards/rejected": -0.05269470065832138, "step": 380 }, { "epoch": 1.7204419889502762, "grad_norm": 44.485924324406845, "learning_rate": 4.15714217101987e-06, "logits/chosen": -38.712501525878906, "logits/rejected": -38.71875, "logps/chosen": -71.29374694824219, "logps/rejected": -72.26249694824219, "loss": 0.6955, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.04309577867388725, "rewards/margins": 0.0034702301491051912, "rewards/rejected": -0.04664192348718643, "step": 390 }, { "epoch": 1.7646408839779006, "grad_norm": 500.6584131917625, "learning_rate": 4.09876999805401e-06, "logits/chosen": -39.79375076293945, "logits/rejected": -39.26250076293945, "logps/chosen": -70.94999694824219, "logps/rejected": -71.39375305175781, "loss": 0.6939, "rewards/accuracies": 0.53125, "rewards/chosen": -0.05061149597167969, "rewards/margins": 0.0070930481888353825, "rewards/rejected": -0.05766182020306587, "step": 400 }, { "epoch": 1.7646408839779006, "eval_logits/chosen": -39.59292221069336, "eval_logits/rejected": -39.47123718261719, "eval_logps/chosen": -71.54425048828125, "eval_logps/rejected": -72.84513092041016, "eval_loss": 0.7072386741638184, "eval_rewards/accuracies": 0.4131637215614319, "eval_rewards/chosen": -0.06897148489952087, "eval_rewards/margins": -0.02090170606970787, "eval_rewards/rejected": -0.04804648458957672, "eval_runtime": 144.5628, "eval_samples_per_second": 12.507, "eval_steps_per_second": 0.782, "step": 400 }, { "epoch": 1.8088397790055248, "grad_norm": 126.98042944165913, "learning_rate": 4.038884263889384e-06, "logits/chosen": -40.724998474121094, "logits/rejected": -40.368751525878906, "logps/chosen": -69.98750305175781, "logps/rejected": -72.7750015258789, "loss": 0.6926, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.050659943372011185, "rewards/margins": 0.0110015869140625, "rewards/rejected": -0.06168022006750107, "step": 410 }, { "epoch": 1.853038674033149, "grad_norm": 224.1849196235128, "learning_rate": 3.97754166256185e-06, "logits/chosen": -40.931251525878906, "logits/rejected": -41.099998474121094, "logps/chosen": -69.2750015258789, "logps/rejected": -71.5875015258789, "loss": 0.6902, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.045252226293087006, "rewards/margins": 0.01509780902415514, "rewards/rejected": -0.06038818508386612, "step": 420 }, { "epoch": 1.8972375690607735, "grad_norm": 78.43379440112741, "learning_rate": 3.9148002673285425e-06, "logits/chosen": -38.95624923706055, "logits/rejected": -39.131248474121094, "logps/chosen": -69.6312484741211, "logps/rejected": -70.7249984741211, "loss": 0.6954, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.023590469732880592, "rewards/margins": 0.00233879080042243, "rewards/rejected": -0.02593536302447319, "step": 430 }, { "epoch": 1.9414364640883979, "grad_norm": 154.88166237242064, "learning_rate": 3.850719475689726e-06, "logits/chosen": -37.51250076293945, "logits/rejected": -37.587501525878906, "logps/chosen": -65.7750015258789, "logps/rejected": -66.9937515258789, "loss": 0.689, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.0014175415271893144, "rewards/margins": 0.01654663123190403, "rewards/rejected": -0.017965316772460938, "step": 440 }, { "epoch": 1.985635359116022, "grad_norm": 242.09434718829198, "learning_rate": 3.7853599531569684e-06, "logits/chosen": -37.556251525878906, "logits/rejected": -37.79375076293945, "logps/chosen": -66.65625, "logps/rejected": -69.625, "loss": 0.6897, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01039886474609375, "rewards/margins": 0.013865661807358265, "rewards/rejected": -0.024304961785674095, "step": 450 }, { "epoch": 2.0265193370165746, "grad_norm": 104.62758142861014, "learning_rate": 3.718783575820887e-06, "logits/chosen": -38.003379821777344, "logits/rejected": -37.97972869873047, "logps/chosen": -64.50675964355469, "logps/rejected": -68.62837982177734, "loss": 0.6771, "rewards/accuracies": 0.5810810923576355, "rewards/chosen": 0.012018152512609959, "rewards/margins": 0.04104779288172722, "rewards/rejected": -0.029030464589595795, "step": 460 }, { "epoch": 2.070718232044199, "grad_norm": 74.84252998438447, "learning_rate": 3.6510533717728337e-06, "logits/chosen": -39.26874923706055, "logits/rejected": -38.806251525878906, "logps/chosen": -65.8499984741211, "logps/rejected": -68.2562484741211, "loss": 0.6846, "rewards/accuracies": 0.5625, "rewards/chosen": -0.010594558902084827, "rewards/margins": 0.026428604498505592, "rewards/rejected": -0.03700122982263565, "step": 470 }, { "epoch": 2.114917127071823, "grad_norm": 36.07688880775278, "learning_rate": 3.5822334614359826e-06, "logits/chosen": -39.087501525878906, "logits/rejected": -39.70624923706055, "logps/chosen": -65.4937515258789, "logps/rejected": -67.84375, "loss": 0.69, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.000919342041015625, "rewards/margins": 0.015228271484375, "rewards/rejected": -0.014283562079071999, "step": 480 }, { "epoch": 2.1591160220994476, "grad_norm": 439.09968550456875, "learning_rate": 3.512388996862296e-06, "logits/chosen": -41.66875076293945, "logits/rejected": -41.681251525878906, "logps/chosen": -69.0625, "logps/rejected": -70.14375305175781, "loss": 0.7005, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.02707977220416069, "rewards/margins": -0.004337310791015625, "rewards/rejected": -0.022744368761777878, "step": 490 }, { "epoch": 2.203314917127072, "grad_norm": 284.7475721910987, "learning_rate": 3.441586100052845e-06, "logits/chosen": -41.724998474121094, "logits/rejected": -42.54375076293945, "logps/chosen": -71.23750305175781, "logps/rejected": -69.6937484741211, "loss": 0.7082, "rewards/accuracies": 0.484375, "rewards/chosen": -0.051213644444942474, "rewards/margins": -0.01878051832318306, "rewards/rejected": -0.032381821423769, "step": 500 }, { "epoch": 2.203314917127072, "eval_logits/chosen": -40.48893737792969, "eval_logits/rejected": -39.898231506347656, "eval_logps/chosen": -68.44247436523438, "eval_logps/rejected": -73.43584442138672, "eval_loss": 0.6894358396530151, "eval_rewards/accuracies": 0.49225664138793945, "eval_rewards/chosen": -0.03768670931458473, "eval_rewards/margins": 0.016151901334524155, "eval_rewards/rejected": -0.05382031574845314, "eval_runtime": 143.5469, "eval_samples_per_second": 12.595, "eval_steps_per_second": 0.787, "step": 500 }, { "epoch": 2.247513812154696, "grad_norm": 366.70198013560713, "learning_rate": 3.3698918003598844e-06, "logits/chosen": -40.65625, "logits/rejected": -40.53125, "logps/chosen": -67.51875305175781, "logps/rejected": -70.64375305175781, "loss": 0.6883, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -0.033694077283144, "rewards/margins": 0.01934356614947319, "rewards/rejected": -0.05306548997759819, "step": 510 }, { "epoch": 2.2917127071823202, "grad_norm": 839.8512440303829, "learning_rate": 3.297373971029921e-06, "logits/chosen": -40.462501525878906, "logits/rejected": -40.25, "logps/chosen": -69.8812484741211, "logps/rejected": -72.1937484741211, "loss": 0.6847, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -0.04071960598230362, "rewards/margins": 0.02692871168255806, "rewards/rejected": -0.06762619316577911, "step": 520 }, { "epoch": 2.335911602209945, "grad_norm": 379.4415230614182, "learning_rate": 3.2241012649478783e-06, "logits/chosen": -42.13750076293945, "logits/rejected": -42.29375076293945, "logps/chosen": -73.125, "logps/rejected": -75.05000305175781, "loss": 0.6928, "rewards/accuracies": 0.546875, "rewards/chosen": -0.07470321655273438, "rewards/margins": 0.01118316687643528, "rewards/rejected": -0.08594703674316406, "step": 530 }, { "epoch": 2.380110497237569, "grad_norm": 95.96944740198023, "learning_rate": 3.1501430496431605e-06, "logits/chosen": -42.65625, "logits/rejected": -42.693748474121094, "logps/chosen": -72.0, "logps/rejected": -74.4437484741211, "loss": 0.6966, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07560767978429794, "rewards/margins": 0.0027114867698401213, "rewards/rejected": -0.07832489162683487, "step": 540 }, { "epoch": 2.4243093922651933, "grad_norm": 18.45847682665282, "learning_rate": 3.0755693416191755e-06, "logits/chosen": -42.36249923706055, "logits/rejected": -42.525001525878906, "logps/chosen": -70.9312515258789, "logps/rejected": -71.3187484741211, "loss": 0.6994, "rewards/accuracies": 0.484375, "rewards/chosen": -0.050330352038145065, "rewards/margins": -0.0023559569381177425, "rewards/rejected": -0.04796638339757919, "step": 550 }, { "epoch": 2.468508287292818, "grad_norm": 36.6489451315173, "learning_rate": 3.0004507400684593e-06, "logits/chosen": -42.42499923706055, "logits/rejected": -42.29375076293945, "logps/chosen": -69.9000015258789, "logps/rejected": -69.9124984741211, "loss": 0.7018, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.03403320163488388, "rewards/margins": -0.009571838192641735, "rewards/rejected": -0.024443816393613815, "step": 560 }, { "epoch": 2.512707182320442, "grad_norm": 397.2747975598668, "learning_rate": 2.9248583600361707e-06, "logits/chosen": -41.88750076293945, "logits/rejected": -41.8125, "logps/chosen": -69.23124694824219, "logps/rejected": -67.42500305175781, "loss": 0.7083, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.04715003818273544, "rewards/margins": -0.02280883863568306, "rewards/rejected": -0.02434692345559597, "step": 570 }, { "epoch": 2.5569060773480663, "grad_norm": 127.0485458678845, "learning_rate": 2.848863765095231e-06, "logits/chosen": -42.14374923706055, "logits/rejected": -41.91875076293945, "logps/chosen": -69.7437515258789, "logps/rejected": -69.2249984741211, "loss": 0.7048, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.03314819186925888, "rewards/margins": -0.01586761511862278, "rewards/rejected": -0.017250824719667435, "step": 580 }, { "epoch": 2.6011049723756905, "grad_norm": 44.71968810042943, "learning_rate": 2.772538899596835e-06, "logits/chosen": -42.09375, "logits/rejected": -42.068748474121094, "logps/chosen": -69.67500305175781, "logps/rejected": -69.05000305175781, "loss": 0.7011, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.02761382982134819, "rewards/margins": -0.009915923699736595, "rewards/rejected": -0.017643356695771217, "step": 590 }, { "epoch": 2.6453038674033147, "grad_norm": 33.565381388381525, "learning_rate": 2.6959560205604785e-06, "logits/chosen": -41.11249923706055, "logits/rejected": -41.51874923706055, "logps/chosen": -67.375, "logps/rejected": -68.38749694824219, "loss": 0.6972, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.018046189099550247, "rewards/margins": -0.0015205383533611894, "rewards/rejected": -0.016547393053770065, "step": 600 }, { "epoch": 2.6453038674033147, "eval_logits/chosen": -41.26106262207031, "eval_logits/rejected": -40.43362808227539, "eval_logps/chosen": -65.07964324951172, "eval_logps/rejected": -71.23672485351562, "eval_loss": 0.681640625, "eval_rewards/accuracies": 0.571349561214447, "eval_rewards/chosen": -0.004132971167564392, "eval_rewards/margins": 0.028063207864761353, "eval_rewards/rejected": -0.03223608061671257, "eval_runtime": 142.6548, "eval_samples_per_second": 12.674, "eval_steps_per_second": 0.792, "step": 600 }, { "epoch": 2.6895027624309393, "grad_norm": 141.96003727662844, "learning_rate": 2.6191876292679836e-06, "logits/chosen": -40.931251525878906, "logits/rejected": -41.0625, "logps/chosen": -65.5687484741211, "logps/rejected": -66.8687515258789, "loss": 0.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00627212505787611, "rewards/margins": 0.0151519775390625, "rewards/rejected": -0.00893325824290514, "step": 610 }, { "epoch": 2.7337016574585635, "grad_norm": 19.115852764851674, "learning_rate": 2.5423064026262817e-06, "logits/chosen": -40.875, "logits/rejected": -41.04375076293945, "logps/chosen": -66.0562515258789, "logps/rejected": -66.58125305175781, "loss": 0.6962, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.0024814605712890625, "rewards/margins": 0.0004646301385946572, "rewards/rejected": 0.0020057677756994963, "step": 620 }, { "epoch": 2.7779005524861877, "grad_norm": 59.80069984593265, "learning_rate": 2.465385124363926e-06, "logits/chosen": -40.92499923706055, "logits/rejected": -41.068748474121094, "logps/chosen": -65.6812515258789, "logps/rejected": -68.0, "loss": 0.6918, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.009976196102797985, "rewards/margins": 0.00931396521627903, "rewards/rejected": -0.019283294677734375, "step": 630 }, { "epoch": 2.8220994475138124, "grad_norm": 798.2218656544982, "learning_rate": 2.388496616126481e-06, "logits/chosen": -41.243751525878906, "logits/rejected": -41.412498474121094, "logps/chosen": -68.125, "logps/rejected": -67.3499984741211, "loss": 0.7028, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.024826431646943092, "rewards/margins": -0.011798858642578125, "rewards/rejected": -0.01299362163990736, "step": 640 }, { "epoch": 2.8662983425414366, "grad_norm": 40.514443198818554, "learning_rate": 2.311713668536013e-06, "logits/chosen": -40.79375076293945, "logits/rejected": -41.01874923706055, "logps/chosen": -66.4749984741211, "logps/rejected": -66.0, "loss": 0.7021, "rewards/accuracies": 0.46875, "rewards/chosen": -0.018305206671357155, "rewards/margins": -0.010966491885483265, "rewards/rejected": -0.007343292236328125, "step": 650 }, { "epoch": 2.9104972375690608, "grad_norm": 433.7012474084816, "learning_rate": 2.235108972279951e-06, "logits/chosen": -40.818748474121094, "logits/rejected": -40.71875, "logps/chosen": -64.42500305175781, "logps/rejected": -65.8187484741211, "loss": 0.6982, "rewards/accuracies": 0.515625, "rewards/chosen": -0.00847015343606472, "rewards/margins": -0.00159454345703125, "rewards/rejected": -0.00689010601490736, "step": 660 }, { "epoch": 2.954696132596685, "grad_norm": 93.1289398635236, "learning_rate": 2.158755049294557e-06, "logits/chosen": -40.84375, "logits/rejected": -40.95000076293945, "logps/chosen": -64.5, "logps/rejected": -66.04374694824219, "loss": 0.6947, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0042327879928052425, "rewards/margins": 0.0054077147506177425, "rewards/rejected": -0.0012054443359375, "step": 670 }, { "epoch": 2.998895027624309, "grad_norm": 634.9306184331582, "learning_rate": 2.082724184108152e-06, "logits/chosen": -41.25, "logits/rejected": -41.36249923706055, "logps/chosen": -64.9937515258789, "logps/rejected": -67.01249694824219, "loss": 0.6951, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0009216308826580644, "rewards/margins": 0.0038085938431322575, "rewards/rejected": -0.0029159546829760075, "step": 680 }, { "epoch": 3.0397790055248617, "grad_norm": 20.17782198882316, "learning_rate": 2.0070883554091004e-06, "logits/chosen": -41.14864730834961, "logits/rejected": -41.5405387878418, "logps/chosen": -66.60134887695312, "logps/rejected": -65.85810852050781, "loss": 0.7032, "rewards/accuracies": 0.43581080436706543, "rewards/chosen": -0.021340448409318924, "rewards/margins": -0.017934437841176987, "rewards/rejected": -0.003451579250395298, "step": 690 }, { "epoch": 3.0839779005524863, "grad_norm": 53.09914070994327, "learning_rate": 1.9319191679033283e-06, "logits/chosen": -41.743751525878906, "logits/rejected": -41.881248474121094, "logps/chosen": -67.36250305175781, "logps/rejected": -67.48124694824219, "loss": 0.7037, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.013952255249023438, "rewards/margins": -0.01392288226634264, "rewards/rejected": -2.937316821771674e-05, "step": 700 }, { "epoch": 3.0839779005524863, "eval_logits/chosen": -41.19247817993164, "eval_logits/rejected": -40.51548767089844, "eval_logps/chosen": -64.54646301269531, "eval_logps/rejected": -70.73008728027344, "eval_loss": 0.6816233396530151, "eval_rewards/accuracies": 0.5724557638168335, "eval_rewards/chosen": 0.0012067608768120408, "eval_rewards/margins": 0.02834569849073887, "eval_rewards/rejected": -0.02716905064880848, "eval_runtime": 145.2175, "eval_samples_per_second": 12.45, "eval_steps_per_second": 0.778, "step": 700 }, { "epoch": 3.1281767955801105, "grad_norm": 83.07623338013184, "learning_rate": 1.8572877845258983e-06, "logits/chosen": -41.58124923706055, "logits/rejected": -41.587501525878906, "logps/chosen": -67.33125305175781, "logps/rejected": -66.89375305175781, "loss": 0.6995, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.03415717929601669, "rewards/margins": -0.005290222354233265, "rewards/rejected": -0.028838729485869408, "step": 710 }, { "epoch": 3.1723756906077347, "grad_norm": 270.7790200465776, "learning_rate": 1.7832648590708063e-06, "logits/chosen": -41.537498474121094, "logits/rejected": -41.79375076293945, "logps/chosen": -66.5562515258789, "logps/rejected": -68.4312515258789, "loss": 0.6916, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.015944670885801315, "rewards/margins": 0.010485077276825905, "rewards/rejected": -0.026327896863222122, "step": 720 }, { "epoch": 3.216574585635359, "grad_norm": 356.88462524345476, "learning_rate": 1.7099204693027871e-06, "logits/chosen": -41.662498474121094, "logits/rejected": -41.95624923706055, "logps/chosen": -67.48124694824219, "logps/rejected": -66.8125, "loss": 0.6989, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.019288253039121628, "rewards/margins": -0.005435180850327015, "rewards/rejected": -0.013879776000976562, "step": 730 }, { "epoch": 3.2607734806629836, "grad_norm": 47.425574675096684, "learning_rate": 1.6373240506144453e-06, "logits/chosen": -41.662498474121094, "logits/rejected": -41.787498474121094, "logps/chosen": -66.61250305175781, "logps/rejected": -69.9437484741211, "loss": 0.6872, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0036094665993005037, "rewards/margins": 0.01754150353372097, "rewards/rejected": -0.021148681640625, "step": 740 }, { "epoch": 3.3049723756906078, "grad_norm": 37.03036848840057, "learning_rate": 1.5655443302915258e-06, "logits/chosen": -41.34375, "logits/rejected": -41.443748474121094, "logps/chosen": -66.89375305175781, "logps/rejected": -67.88749694824219, "loss": 0.6938, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.02233428880572319, "rewards/margins": 0.0054473876953125, "rewards/rejected": -0.027825165539979935, "step": 750 }, { "epoch": 3.349171270718232, "grad_norm": 149.3165966656791, "learning_rate": 1.4946492624485478e-06, "logits/chosen": -41.349998474121094, "logits/rejected": -41.32500076293945, "logps/chosen": -67.11250305175781, "logps/rejected": -67.3187484741211, "loss": 0.7031, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.024369429796934128, "rewards/margins": -0.01361160259693861, "rewards/rejected": -0.01075592078268528, "step": 760 }, { "epoch": 3.393370165745856, "grad_norm": 43.24521581781515, "learning_rate": 1.4247059636964079e-06, "logits/chosen": -41.23749923706055, "logits/rejected": -41.368751525878906, "logps/chosen": -65.94999694824219, "logps/rejected": -66.82499694824219, "loss": 0.6926, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.0015350341564044356, "rewards/margins": 0.0068450928665697575, "rewards/rejected": -0.008346939459443092, "step": 770 }, { "epoch": 3.437569060773481, "grad_norm": 498.84758722956946, "learning_rate": 1.3557806496028442e-06, "logits/chosen": -40.78125, "logits/rejected": -40.86249923706055, "logps/chosen": -64.98750305175781, "logps/rejected": -66.04374694824219, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.017464447766542435, "rewards/margins": 0.006153869442641735, "rewards/rejected": 0.011297988705337048, "step": 780 }, { "epoch": 3.481767955801105, "grad_norm": 130.55316193535066, "learning_rate": 1.2879385720059262e-06, "logits/chosen": -41.193748474121094, "logits/rejected": -41.34375, "logps/chosen": -66.5, "logps/rejected": -65.6875, "loss": 0.7004, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.0043357848189771175, "rewards/margins": -0.009280395694077015, "rewards/rejected": 0.004947662353515625, "step": 790 }, { "epoch": 3.525966850828729, "grad_norm": 29.71050417570976, "learning_rate": 1.221243957239912e-06, "logits/chosen": -41.8125, "logits/rejected": -41.618751525878906, "logps/chosen": -67.1500015258789, "logps/rejected": -67.08125305175781, "loss": 0.7002, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.01057281531393528, "rewards/margins": -0.00801162701100111, "rewards/rejected": -0.002597808837890625, "step": 800 }, { "epoch": 3.525966850828729, "eval_logits/chosen": -41.719024658203125, "eval_logits/rejected": -40.88495635986328, "eval_logps/chosen": -64.07964324951172, "eval_logps/rejected": -70.1283187866211, "eval_loss": 0.6822801232337952, "eval_rewards/accuracies": 0.5896017551422119, "eval_rewards/chosen": 0.005799082573503256, "eval_rewards/margins": 0.026615582406520844, "eval_rewards/rejected": -0.020812584087252617, "eval_runtime": 144.2857, "eval_samples_per_second": 12.531, "eval_steps_per_second": 0.783, "step": 800 }, { "epoch": 3.570165745856354, "grad_norm": 17.46880215371985, "learning_rate": 1.155759945331945e-06, "logits/chosen": -41.681251525878906, "logits/rejected": -41.59375, "logps/chosen": -65.51249694824219, "logps/rejected": -66.5999984741211, "loss": 0.6977, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.0007347107166424394, "rewards/margins": -0.003330230712890625, "rewards/rejected": 0.00410804757848382, "step": 810 }, { "epoch": 3.614364640883978, "grad_norm": 49.64772209236825, "learning_rate": 1.0915485302271756e-06, "logits/chosen": -41.712501525878906, "logits/rejected": -41.92499923706055, "logps/chosen": -65.91874694824219, "logps/rejected": -67.2562484741211, "loss": 0.6933, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.0008491516346111894, "rewards/margins": 0.00578994769603014, "rewards/rejected": -0.00494461040943861, "step": 820 }, { "epoch": 3.658563535911602, "grad_norm": 59.68512440192332, "learning_rate": 1.028670501098865e-06, "logits/chosen": -41.48749923706055, "logits/rejected": -41.91875076293945, "logps/chosen": -67.4625015258789, "logps/rejected": -66.01875305175781, "loss": 0.7008, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.009090423583984375, "rewards/margins": -0.010057831183075905, "rewards/rejected": 0.0009643554803915322, "step": 830 }, { "epoch": 3.7027624309392264, "grad_norm": 24.97546610779972, "learning_rate": 9.671853847990682e-07, "logits/chosen": -41.78125, "logits/rejected": -42.08124923706055, "logps/chosen": -66.7750015258789, "logps/rejected": -67.7750015258789, "loss": 0.6957, "rewards/accuracies": 0.46875, "rewards/chosen": -0.012136459350585938, "rewards/margins": 0.000629425048828125, "rewards/rejected": -0.012753295712172985, "step": 840 }, { "epoch": 3.7469613259668506, "grad_norm": 43.276395845465316, "learning_rate": 9.071513895043508e-07, "logits/chosen": -41.92499923706055, "logits/rejected": -41.98749923706055, "logps/chosen": -66.05000305175781, "logps/rejected": -68.6500015258789, "loss": 0.691, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.0017818451160565019, "rewards/margins": 0.011096191592514515, "rewards/rejected": -0.009303664788603783, "step": 850 }, { "epoch": 3.7911602209944752, "grad_norm": 1454.47188507446, "learning_rate": 8.486253496098995e-07, "logits/chosen": -41.787498474121094, "logits/rejected": -42.318748474121094, "logps/chosen": -66.8125, "logps/rejected": -68.13749694824219, "loss": 0.6949, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.03312988206744194, "rewards/margins": 0.0031730651389807463, "rewards/rejected": -0.03627509996294975, "step": 860 }, { "epoch": 3.8353591160220994, "grad_norm": 172.07031679781718, "learning_rate": 7.916626719242052e-07, "logits/chosen": -42.17499923706055, "logits/rejected": -42.181251525878906, "logps/chosen": -65.58125305175781, "logps/rejected": -68.4937515258789, "loss": 0.6837, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.017704010009765625, "rewards/margins": 0.024872589856386185, "rewards/rejected": -0.0071617127396166325, "step": 870 }, { "epoch": 3.8795580110497236, "grad_norm": 146.89516485406568, "learning_rate": 7.363172832152388e-07, "logits/chosen": -41.98749923706055, "logits/rejected": -41.98125076293945, "logps/chosen": -65.70625305175781, "logps/rejected": -67.4749984741211, "loss": 0.6933, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.012438202276825905, "rewards/margins": 0.006200027652084827, "rewards/rejected": -0.018678665161132812, "step": 880 }, { "epoch": 3.9237569060773483, "grad_norm": 42.15227869155502, "learning_rate": 6.826415791577878e-07, "logits/chosen": -41.71875, "logits/rejected": -41.92499923706055, "logps/chosen": -65.10624694824219, "logps/rejected": -67.07499694824219, "loss": 0.6907, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.004158401396125555, "rewards/margins": 0.010738372802734375, "rewards/rejected": -0.014864349737763405, "step": 890 }, { "epoch": 3.9679558011049725, "grad_norm": 93.91352863787084, "learning_rate": 6.306863747302913e-07, "logits/chosen": -42.01874923706055, "logits/rejected": -42.03125, "logps/chosen": -65.63749694824219, "logps/rejected": -66.85624694824219, "loss": 0.6978, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -0.017127227038145065, "rewards/margins": -0.0032379149924963713, "rewards/rejected": -0.01387634314596653, "step": 900 }, { "epoch": 3.9679558011049725, "eval_logits/chosen": -42.41814041137695, "eval_logits/rejected": -41.70132827758789, "eval_logps/chosen": -64.80088806152344, "eval_logps/rejected": -71.0685806274414, "eval_loss": 0.6813294887542725, "eval_rewards/accuracies": 0.5978982448577881, "eval_rewards/chosen": -0.0017169513739645481, "eval_rewards/margins": 0.028534069657325745, "eval_rewards/rejected": -0.030240826308727264, "eval_runtime": 331.8628, "eval_samples_per_second": 5.448, "eval_steps_per_second": 0.341, "step": 900 }, { "epoch": 4.008839779005525, "grad_norm": 31.1737202253311, "learning_rate": 5.80500856108114e-07, "logits/chosen": -42.202701568603516, "logits/rejected": -42.16216278076172, "logps/chosen": -65.99324035644531, "logps/rejected": -67.97297668457031, "loss": 0.6917, "rewards/accuracies": 0.5304054021835327, "rewards/chosen": 0.007128329016268253, "rewards/margins": 0.010696823708713055, "rewards/rejected": -0.003579010721296072, "step": 910 }, { "epoch": 4.053038674033149, "grad_norm": 54.05162873046317, "learning_rate": 5.321325340988281e-07, "logits/chosen": -42.243751525878906, "logits/rejected": -42.381248474121094, "logps/chosen": -67.8125, "logps/rejected": -66.8187484741211, "loss": 0.7016, "rewards/accuracies": 0.43437498807907104, "rewards/chosen": -0.01435089111328125, "rewards/margins": -0.01076507568359375, "rewards/rejected": -0.0035995482467114925, "step": 920 }, { "epoch": 4.097237569060773, "grad_norm": 472.2147407153316, "learning_rate": 4.856271991635561e-07, "logits/chosen": -41.75, "logits/rejected": -42.01874923706055, "logps/chosen": -65.7125015258789, "logps/rejected": -66.9375, "loss": 0.6918, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.00778274517506361, "rewards/margins": 0.00872116070240736, "rewards/rejected": -0.016512298956513405, "step": 930 }, { "epoch": 4.141436464088398, "grad_norm": 79.02308592152725, "learning_rate": 4.410288780669869e-07, "logits/chosen": -42.349998474121094, "logits/rejected": -42.59375, "logps/chosen": -66.7125015258789, "logps/rejected": -68.05000305175781, "loss": 0.6913, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0019371032249182463, "rewards/margins": 0.009870529174804688, "rewards/rejected": -0.011784744448959827, "step": 940 }, { "epoch": 4.185635359116022, "grad_norm": 253.8366027535183, "learning_rate": 3.9837979219707586e-07, "logits/chosen": -42.07500076293945, "logits/rejected": -42.306251525878906, "logps/chosen": -66.3125, "logps/rejected": -67.33125305175781, "loss": 0.6932, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.001434326171875, "rewards/margins": 0.005306243896484375, "rewards/rejected": -0.0067539215087890625, "step": 950 }, { "epoch": 4.229834254143646, "grad_norm": 63.092721095914776, "learning_rate": 3.5772031759391424e-07, "logits/chosen": -42.34375, "logits/rejected": -41.84375, "logps/chosen": -66.20625305175781, "logps/rejected": -67.7125015258789, "loss": 0.6935, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.012112045660614967, "rewards/margins": 0.005307006649672985, "rewards/rejected": -0.017436599358916283, "step": 960 }, { "epoch": 4.274033149171271, "grad_norm": 47.24648477264553, "learning_rate": 3.1908894672558337e-07, "logits/chosen": -42.162498474121094, "logits/rejected": -42.23749923706055, "logps/chosen": -66.58125305175781, "logps/rejected": -66.58125305175781, "loss": 0.6984, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.014201355166733265, "rewards/margins": -0.003826141357421875, "rewards/rejected": -0.01035232562571764, "step": 970 }, { "epoch": 4.318232044198895, "grad_norm": 38.970734821334965, "learning_rate": 2.8252225204720317e-07, "logits/chosen": -42.400001525878906, "logits/rejected": -42.51874923706055, "logps/chosen": -66.08125305175781, "logps/rejected": -67.76249694824219, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.00645866384729743, "rewards/margins": 0.0046592713333666325, "rewards/rejected": -0.011082458309829235, "step": 980 }, { "epoch": 4.3624309392265195, "grad_norm": 38.49964922853884, "learning_rate": 2.4805485137766067e-07, "logits/chosen": -42.287498474121094, "logits/rejected": -42.66875076293945, "logps/chosen": -66.7750015258789, "logps/rejected": -67.23124694824219, "loss": 0.697, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.015133285894989967, "rewards/margins": -0.0010040283668786287, "rewards/rejected": -0.014149474911391735, "step": 990 }, { "epoch": 4.406629834254144, "grad_norm": 121.48672863295695, "learning_rate": 2.1571937512679386e-07, "logits/chosen": -42.54375076293945, "logits/rejected": -42.381248474121094, "logps/chosen": -67.375, "logps/rejected": -68.92500305175781, "loss": 0.694, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.022901535034179688, "rewards/margins": 0.004276275634765625, "rewards/rejected": -0.027169037610292435, "step": 1000 }, { "epoch": 4.406629834254144, "eval_logits/chosen": -42.646018981933594, "eval_logits/rejected": -41.95354080200195, "eval_logps/chosen": -65.31636810302734, "eval_logps/rejected": -71.64159393310547, "eval_loss": 0.6808455586433411, "eval_rewards/accuracies": 0.5918141603469849, "eval_rewards/chosen": -0.006461219396442175, "eval_rewards/margins": 0.029684286564588547, "eval_rewards/rejected": -0.036145709455013275, "eval_runtime": 1030.4577, "eval_samples_per_second": 1.755, "eval_steps_per_second": 0.11, "step": 1000 }, { "epoch": 4.450828729281768, "grad_norm": 15.62284752324105, "learning_rate": 1.8554643540407313e-07, "logits/chosen": -42.15625, "logits/rejected": -42.38750076293945, "logps/chosen": -67.32499694824219, "logps/rejected": -67.04374694824219, "loss": 0.7006, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": -0.012189483270049095, "rewards/margins": -0.00953521765768528, "rewards/rejected": -0.00260505685582757, "step": 1010 }, { "epoch": 4.495027624309392, "grad_norm": 86.19253420737768, "learning_rate": 1.5756459703800493e-07, "logits/chosen": -42.63750076293945, "logits/rejected": -42.66875076293945, "logps/chosen": -66.65625, "logps/rejected": -69.8375015258789, "loss": 0.69, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.011952591128647327, "rewards/margins": 0.01207809429615736, "rewards/rejected": -0.023981858044862747, "step": 1020 }, { "epoch": 4.539226519337016, "grad_norm": 71.19069299148003, "learning_rate": 1.318003505337115e-07, "logits/chosen": -42.20000076293945, "logits/rejected": -42.443748474121094, "logps/chosen": -67.46875, "logps/rejected": -66.60624694824219, "loss": 0.7003, "rewards/accuracies": 0.453125, "rewards/chosen": -0.020994950085878372, "rewards/margins": -0.0084381103515625, "rewards/rejected": -0.012523651123046875, "step": 1030 }, { "epoch": 4.5834254143646405, "grad_norm": 535.1584549621664, "learning_rate": 1.0827808699427233e-07, "logits/chosen": -42.39374923706055, "logits/rejected": -42.23125076293945, "logps/chosen": -67.5562515258789, "logps/rejected": -67.7562484741211, "loss": 0.7, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.01778564415872097, "rewards/margins": -0.0076812743209302425, "rewards/rejected": -0.01009521447122097, "step": 1040 }, { "epoch": 4.6276243093922655, "grad_norm": 273.9823477024351, "learning_rate": 8.702007502958354e-08, "logits/chosen": -42.32500076293945, "logits/rejected": -42.306251525878906, "logps/chosen": -66.9437484741211, "logps/rejected": -68.4375, "loss": 0.6939, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.01179580669850111, "rewards/margins": 0.004271316342055798, "rewards/rejected": -0.016061019152402878, "step": 1050 }, { "epoch": 4.67182320441989, "grad_norm": 434.9491733788703, "learning_rate": 6.804643967458614e-08, "logits/chosen": -42.087501525878906, "logits/rejected": -42.01250076293945, "logps/chosen": -65.5562515258789, "logps/rejected": -67.8375015258789, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": -0.022443007677793503, "rewards/margins": 0.009143066592514515, "rewards/rejected": -0.03163337707519531, "step": 1060 }, { "epoch": 4.716022099447514, "grad_norm": 31.682645252253273, "learning_rate": 5.137514333682286e-08, "logits/chosen": -42.16875076293945, "logits/rejected": -42.40625, "logps/chosen": -66.5062484741211, "logps/rejected": -68.4437484741211, "loss": 0.6909, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.009397697634994984, "rewards/margins": 0.010795975103974342, "rewards/rejected": -0.020191192626953125, "step": 1070 }, { "epoch": 4.760220994475138, "grad_norm": 46.396153976694045, "learning_rate": 3.702196879136505e-08, "logits/chosen": -42.506248474121094, "logits/rejected": -42.474998474121094, "logps/chosen": -66.6312484741211, "logps/rejected": -68.38749694824219, "loss": 0.6921, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.009720039553940296, "rewards/margins": 0.008687591180205345, "rewards/rejected": -0.018411636352539062, "step": 1080 }, { "epoch": 4.804419889502762, "grad_norm": 55.91984272430012, "learning_rate": 2.5000504239203194e-08, "logits/chosen": -42.60625076293945, "logits/rejected": -42.67499923706055, "logps/chosen": -67.61250305175781, "logps/rejected": -69.1812515258789, "loss": 0.6954, "rewards/accuracies": 0.515625, "rewards/chosen": -0.027659988030791283, "rewards/margins": 0.0024513243697583675, "rewards/rejected": -0.03017120435833931, "step": 1090 }, { "epoch": 4.8486187845303865, "grad_norm": 108.34311486393362, "learning_rate": 1.532213044324937e-08, "logits/chosen": -42.26250076293945, "logits/rejected": -42.443748474121094, "logps/chosen": -65.88749694824219, "logps/rejected": -68.375, "loss": 0.69, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.0031299591064453125, "rewards/margins": 0.01154174841940403, "rewards/rejected": -0.014626693911850452, "step": 1100 }, { "epoch": 4.8486187845303865, "eval_logits/chosen": -42.639381408691406, "eval_logits/rejected": -41.89380645751953, "eval_logps/chosen": -65.3429183959961, "eval_logps/rejected": -71.5177001953125, "eval_loss": 0.6816751956939697, "eval_rewards/accuracies": 0.5907079577445984, "eval_rewards/chosen": -0.006776117719709873, "eval_rewards/margins": 0.0282170120626688, "eval_rewards/rejected": -0.03500568866729736, "eval_runtime": 143.7357, "eval_samples_per_second": 12.579, "eval_steps_per_second": 0.786, "step": 1100 }, { "epoch": 4.892817679558011, "grad_norm": 45.89082361078265, "learning_rate": 7.996009954127914e-09, "logits/chosen": -42.693748474121094, "logits/rejected": -42.70000076293945, "logps/chosen": -66.41874694824219, "logps/rejected": -68.61250305175781, "loss": 0.6889, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009090423700399697, "rewards/margins": 0.01406021136790514, "rewards/rejected": -0.013134384527802467, "step": 1110 }, { "epoch": 4.937016574585636, "grad_norm": 106.62343812338698, "learning_rate": 3.0290784359582327e-09, "logits/chosen": -42.15625, "logits/rejected": -42.506248474121094, "logps/chosen": -65.7874984741211, "logps/rejected": -68.48124694824219, "loss": 0.6853, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.004611968994140625, "rewards/margins": 0.021500397473573685, "rewards/rejected": -0.026135634630918503, "step": 1120 }, { "epoch": 4.98121546961326, "grad_norm": 476.4838996406854, "learning_rate": 4.2603810033514657e-10, "logits/chosen": -42.23125076293945, "logits/rejected": -42.5625, "logps/chosen": -66.28125, "logps/rejected": -67.6875, "loss": 0.6926, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.009279632940888405, "rewards/margins": 0.007526397705078125, "rewards/rejected": -0.016846846789121628, "step": 1130 }, { "epoch": 5.0, "step": 1135, "total_flos": 0.0, "train_loss": 0.6952488298458149, "train_runtime": 23018.5818, "train_samples_per_second": 1.571, "train_steps_per_second": 0.049 } ], "logging_steps": 10, "max_steps": 1135, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }