{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 1.3157894736842104e-08, "logits/chosen": -0.5324900150299072, "logits/rejected": -0.5734304189682007, "logps/chosen": -543.2296752929688, "logps/rejected": -325.48358154296875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/mix_margin": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "learning_rate": 1.3157894736842104e-07, "logits/chosen": -0.48575523495674133, "logits/rejected": -0.5831019878387451, "logps/chosen": -334.6309509277344, "logps/rejected": -278.2859802246094, "loss": 0.6997, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.012853524647653103, "rewards/confidence": -0.0746772438287735, "rewards/confidence_mean_diff": 0.0746772438287735, "rewards/confidence_moving_diff": 0.0021637948229908943, "rewards/margins": -0.007044664584100246, "rewards/mix_margin": -0.007044283673167229, "rewards/real_percentage": 14.129032135009766, "rewards/rejected": -0.005808859597891569, "step": 10 }, { "epoch": 0.16, "learning_rate": 2.631578947368421e-07, "logits/chosen": -0.45206984877586365, "logits/rejected": -0.4436320662498474, "logps/chosen": -378.46478271484375, "logps/rejected": -291.097412109375, "loss": 0.687, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.019860025495290756, "rewards/confidence": -0.07699747383594513, "rewards/confidence_mean_diff": 0.07699747383594513, "rewards/confidence_moving_diff": -6.244657561182976e-05, "rewards/margins": 0.010339610278606415, "rewards/mix_margin": 0.010339389555156231, "rewards/real_percentage": 11.975000381469727, "rewards/rejected": 0.009520411491394043, "step": 20 }, { "epoch": 0.24, "learning_rate": 3.9473684210526315e-07, "logits/chosen": -0.48425692319869995, "logits/rejected": -0.5238968133926392, "logps/chosen": -363.4825439453125, "logps/rejected": -330.880859375, "loss": 0.6746, "rewards/accuracies": 0.625, "rewards/chosen": 0.07081757485866547, "rewards/confidence": -0.0583333782851696, "rewards/confidence_mean_diff": 0.0583333782851696, "rewards/confidence_moving_diff": 0.00017116544768214226, "rewards/margins": 0.04097529500722885, "rewards/mix_margin": 0.04097532853484154, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": 0.029842281714081764, "step": 30 }, { "epoch": 0.32, "learning_rate": 4.999565492409831e-07, "logits/chosen": -0.47305864095687866, "logits/rejected": -0.582284152507782, "logps/chosen": -335.81610107421875, "logps/rejected": -256.0378723144531, "loss": 0.6474, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.16260090470314026, "rewards/confidence": -0.11778082698583603, "rewards/confidence_mean_diff": 0.11778082698583603, "rewards/confidence_moving_diff": 0.0008547043544240296, "rewards/margins": 0.05970517918467522, "rewards/mix_margin": 0.05970512703061104, "rewards/real_percentage": 12.074999809265137, "rewards/rejected": 0.10289572179317474, "step": 40 }, { "epoch": 0.4, "learning_rate": 4.984373579809777e-07, "logits/chosen": -0.5092490911483765, "logits/rejected": -0.5690798163414001, "logps/chosen": -329.53302001953125, "logps/rejected": -295.02294921875, "loss": 0.5866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.3996647000312805, "rewards/confidence": -0.14608541131019592, "rewards/confidence_mean_diff": 0.14608541131019592, "rewards/confidence_moving_diff": -0.00040556181920692325, "rewards/margins": 0.20342092216014862, "rewards/mix_margin": 0.2034207135438919, "rewards/real_percentage": 12.0, "rewards/rejected": 0.1962437778711319, "step": 50 }, { "epoch": 0.48, "learning_rate": 4.947607089353757e-07, "logits/chosen": -0.4855988025665283, "logits/rejected": -0.5692173838615417, "logps/chosen": -365.7965393066406, "logps/rejected": -290.7939147949219, "loss": 0.6262, "rewards/accuracies": 0.75, "rewards/chosen": 0.6372241973876953, "rewards/confidence": -0.27295011281967163, "rewards/confidence_mean_diff": 0.27295011281967163, "rewards/confidence_moving_diff": -8.605476614320651e-05, "rewards/margins": 0.25993281602859497, "rewards/mix_margin": 0.2599331736564636, "rewards/real_percentage": 12.100000381469727, "rewards/rejected": 0.37729138135910034, "step": 60 }, { "epoch": 0.56, "learning_rate": 4.889585305354435e-07, "logits/chosen": -0.511881411075592, "logits/rejected": -0.5559085607528687, "logps/chosen": -374.42559814453125, "logps/rejected": -350.49285888671875, "loss": 0.5776, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.938610851764679, "rewards/confidence": -0.17002172768115997, "rewards/confidence_mean_diff": 0.17002172768115997, "rewards/confidence_moving_diff": 0.006227460689842701, "rewards/margins": 0.39301368594169617, "rewards/mix_margin": 0.39301276206970215, "rewards/real_percentage": 12.199999809265137, "rewards/rejected": 0.5455971360206604, "step": 70 }, { "epoch": 0.64, "learning_rate": 4.810812095469401e-07, "logits/chosen": -0.4341855049133301, "logits/rejected": -0.4922330975532532, "logps/chosen": -382.85986328125, "logps/rejected": -316.0860595703125, "loss": 0.4931, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 1.085030198097229, "rewards/confidence": -0.36220604181289673, "rewards/confidence_mean_diff": 0.36220604181289673, "rewards/confidence_moving_diff": -0.004994163755327463, "rewards/margins": 0.727383017539978, "rewards/mix_margin": 0.7273828387260437, "rewards/real_percentage": 11.899999618530273, "rewards/rejected": 0.3576471507549286, "step": 80 }, { "epoch": 0.72, "learning_rate": 4.711971535058109e-07, "logits/chosen": -0.4119408130645752, "logits/rejected": -0.5046309232711792, "logps/chosen": -335.9080810546875, "logps/rejected": -228.6096649169922, "loss": 0.5641, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.9102222323417664, "rewards/confidence": -0.18118831515312195, "rewards/confidence_mean_diff": 0.18118831515312195, "rewards/confidence_moving_diff": 0.0009786130394786596, "rewards/margins": 0.6580663919448853, "rewards/mix_margin": 0.6580665707588196, "rewards/real_percentage": 12.074999809265137, "rewards/rejected": 0.2521558403968811, "step": 90 }, { "epoch": 0.8, "learning_rate": 4.593921966594997e-07, "logits/chosen": -0.4459192752838135, "logits/rejected": -0.4894910454750061, "logps/chosen": -371.5416259765625, "logps/rejected": -310.64239501953125, "loss": 0.5469, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 1.2468544244766235, "rewards/confidence": -0.45958179235458374, "rewards/confidence_mean_diff": 0.45958179235458374, "rewards/confidence_moving_diff": 0.004259251989424229, "rewards/margins": 0.7577625513076782, "rewards/mix_margin": 0.757762610912323, "rewards/real_percentage": 12.100000381469727, "rewards/rejected": 0.4890917241573334, "step": 100 }, { "epoch": 0.88, "learning_rate": 4.457688545727496e-07, "logits/chosen": -0.5113216042518616, "logits/rejected": -0.5288140177726746, "logps/chosen": -352.3919982910156, "logps/rejected": -276.9599304199219, "loss": 0.5222, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.0378813743591309, "rewards/confidence": -0.29791101813316345, "rewards/confidence_mean_diff": 0.29791101813316345, "rewards/confidence_moving_diff": -0.0015016455436125398, "rewards/margins": 0.7134403586387634, "rewards/mix_margin": 0.7134405374526978, "rewards/real_percentage": 11.949999809265137, "rewards/rejected": 0.3244408965110779, "step": 110 }, { "epoch": 0.96, "learning_rate": 4.3044543387098026e-07, "logits/chosen": -0.5033639669418335, "logits/rejected": -0.5167360901832581, "logps/chosen": -323.29119873046875, "logps/rejected": -265.58221435546875, "loss": 0.5039, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 1.0098955631256104, "rewards/confidence": -0.42422398924827576, "rewards/confidence_mean_diff": 0.42422398924827576, "rewards/confidence_moving_diff": 0.0008432863396592438, "rewards/margins": 0.9732195138931274, "rewards/mix_margin": 0.9732197523117065, "rewards/real_percentage": 11.949999809265137, "rewards/rejected": 0.036676160991191864, "step": 120 }, { "epoch": 1.04, "learning_rate": 4.1355500485232917e-07, "logits/chosen": -0.4795234203338623, "logits/rejected": -0.5551981329917908, "logps/chosen": -367.8242492675781, "logps/rejected": -284.45062255859375, "loss": 0.381, "rewards/accuracies": 0.875, "rewards/chosen": 1.1803219318389893, "rewards/confidence": -0.1641966998577118, "rewards/confidence_mean_diff": 0.1641966998577118, "rewards/confidence_moving_diff": -0.009292250499129295, "rewards/margins": 1.2318060398101807, "rewards/mix_margin": 1.2318063974380493, "rewards/real_percentage": 11.875, "rewards/rejected": -0.051483988761901855, "step": 130 }, { "epoch": 1.12, "learning_rate": 3.9524424589030863e-07, "logits/chosen": -0.47544917464256287, "logits/rejected": -0.45598697662353516, "logps/chosen": -368.21197509765625, "logps/rejected": -327.8885803222656, "loss": 0.2637, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.4148461818695068, "rewards/confidence": 0.12870559096336365, "rewards/confidence_mean_diff": -0.12870559096336365, "rewards/confidence_moving_diff": -4.9034319090424106e-05, "rewards/margins": 1.949605941772461, "rewards/mix_margin": 1.9496057033538818, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": -0.5347597599029541, "step": 140 }, { "epoch": 1.2, "learning_rate": 3.7567216966241556e-07, "logits/chosen": -0.5132138133049011, "logits/rejected": -0.5720852613449097, "logps/chosen": -349.05706787109375, "logps/rejected": -309.68194580078125, "loss": 0.2546, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3815641403198242, "rewards/confidence": 0.15192195773124695, "rewards/confidence_mean_diff": -0.15192195773124695, "rewards/confidence_moving_diff": -0.004711526446044445, "rewards/margins": 1.766579031944275, "rewards/mix_margin": 1.766579031944275, "rewards/real_percentage": 11.925000190734863, "rewards/rejected": -0.3850148320198059, "step": 150 }, { "epoch": 1.28, "learning_rate": 3.5500874226626633e-07, "logits/chosen": -0.41593313217163086, "logits/rejected": -0.47519993782043457, "logps/chosen": -424.2110290527344, "logps/rejected": -386.99688720703125, "loss": 0.2319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.594541072845459, "rewards/confidence": 0.3976772129535675, "rewards/confidence_mean_diff": -0.3976772129535675, "rewards/confidence_moving_diff": 0.0034655616618692875, "rewards/margins": 2.2624146938323975, "rewards/mix_margin": 2.2624149322509766, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": -0.667873740196228, "step": 160 }, { "epoch": 1.36, "learning_rate": 3.334334072150074e-07, "logits/chosen": -0.4277438223361969, "logits/rejected": -0.44190508127212524, "logps/chosen": -359.75262451171875, "logps/rejected": -304.85107421875, "loss": 0.244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3704854249954224, "rewards/confidence": 0.43457871675491333, "rewards/confidence_mean_diff": -0.43457871675491333, "rewards/confidence_moving_diff": 0.0003442527668084949, "rewards/margins": 2.137328624725342, "rewards/mix_margin": 2.137328624725342, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": -0.766843318939209, "step": 170 }, { "epoch": 1.44, "learning_rate": 3.1113352712978995e-07, "logits/chosen": -0.4778042733669281, "logits/rejected": -0.5502051115036011, "logps/chosen": -285.4638671875, "logps/rejected": -259.30963134765625, "loss": 0.2673, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1096071004867554, "rewards/confidence": 0.36355599761009216, "rewards/confidence_mean_diff": -0.36355599761009216, "rewards/confidence_moving_diff": 2.6996247470378876e-05, "rewards/margins": 1.878248929977417, "rewards/mix_margin": 1.878249168395996, "rewards/real_percentage": 11.975000381469727, "rewards/rejected": -0.7686418294906616, "step": 180 }, { "epoch": 1.52, "learning_rate": 2.8830275666182565e-07, "logits/chosen": -0.5888835191726685, "logits/rejected": -0.5946951508522034, "logps/chosen": -345.4639587402344, "logps/rejected": -269.433349609375, "loss": 0.2581, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4412751197814941, "rewards/confidence": 0.27181780338287354, "rewards/confidence_mean_diff": -0.27181780338287354, "rewards/confidence_moving_diff": -0.0013337878044694662, "rewards/margins": 1.7757008075714111, "rewards/mix_margin": 1.7756999731063843, "rewards/real_percentage": 11.975000381469727, "rewards/rejected": -0.3344256579875946, "step": 190 }, { "epoch": 1.6, "learning_rate": 2.651393607737495e-07, "logits/chosen": -0.43257981538772583, "logits/rejected": -0.5586498975753784, "logps/chosen": -332.6167907714844, "logps/rejected": -258.5675354003906, "loss": 0.2367, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6590750217437744, "rewards/confidence": 0.20384028553962708, "rewards/confidence_mean_diff": -0.20384028553962708, "rewards/confidence_moving_diff": 0.0033724855165928602, "rewards/margins": 2.299750804901123, "rewards/mix_margin": 2.299750804901123, "rewards/real_percentage": 12.074999809265137, "rewards/rejected": -0.6406754851341248, "step": 200 }, { "epoch": 1.68, "learning_rate": 2.418444929845241e-07, "logits/chosen": -0.5128785371780396, "logits/rejected": -0.5602482557296753, "logps/chosen": -347.55145263671875, "logps/rejected": -316.63287353515625, "loss": 0.2366, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4995661973953247, "rewards/confidence": 0.29339924454689026, "rewards/confidence_mean_diff": -0.29339924454689026, "rewards/confidence_moving_diff": -0.002348523121327162, "rewards/margins": 2.240088939666748, "rewards/mix_margin": 2.2400896549224854, "rewards/real_percentage": 11.949999809265137, "rewards/rejected": -0.7405228018760681, "step": 210 }, { "epoch": 1.76, "learning_rate": 2.186204485297965e-07, "logits/chosen": -0.5206685066223145, "logits/rejected": -0.49740782380104065, "logps/chosen": -327.6163024902344, "logps/rejected": -312.23345947265625, "loss": 0.259, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3457434177398682, "rewards/confidence": 0.5061102509498596, "rewards/confidence_mean_diff": -0.5061102509498596, "rewards/confidence_moving_diff": 0.0032404728699475527, "rewards/margins": 2.1838455200195312, "rewards/mix_margin": 2.183845281600952, "rewards/real_percentage": 12.125, "rewards/rejected": -0.8381019830703735, "step": 220 }, { "epoch": 1.84, "learning_rate": 1.956689076074607e-07, "logits/chosen": -0.47606563568115234, "logits/rejected": -0.5649515986442566, "logps/chosen": -359.9063415527344, "logps/rejected": -272.35333251953125, "loss": 0.2392, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2465178966522217, "rewards/confidence": 0.15909627079963684, "rewards/confidence_mean_diff": -0.15909627079963684, "rewards/confidence_moving_diff": -0.005671085324138403, "rewards/margins": 2.0437004566192627, "rewards/mix_margin": 2.043700695037842, "rewards/real_percentage": 11.899999618530273, "rewards/rejected": -0.7971823811531067, "step": 230 }, { "epoch": 1.92, "learning_rate": 1.7318918396427674e-07, "logits/chosen": -0.5379046201705933, "logits/rejected": -0.5706161260604858, "logps/chosen": -386.26861572265625, "logps/rejected": -303.8609619140625, "loss": 0.2138, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.7834584712982178, "rewards/confidence": 0.13540206849575043, "rewards/confidence_mean_diff": -0.13540206849575043, "rewards/confidence_moving_diff": 0.0014076533261686563, "rewards/margins": 2.3947689533233643, "rewards/mix_margin": 2.394768476486206, "rewards/real_percentage": 11.975000381469727, "rewards/rejected": -0.6113101840019226, "step": 240 }, { "epoch": 2.0, "learning_rate": 1.513764940330155e-07, "logits/chosen": -0.39151811599731445, "logits/rejected": -0.473433256149292, "logps/chosen": -336.6163024902344, "logps/rejected": -306.16046142578125, "loss": 0.2558, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.611301064491272, "rewards/confidence": 0.2504025101661682, "rewards/confidence_mean_diff": -0.2504025101661682, "rewards/confidence_moving_diff": 0.000840538355987519, "rewards/margins": 2.209441661834717, "rewards/mix_margin": 2.209441661834717, "rewards/real_percentage": 12.050000190734863, "rewards/rejected": -0.5981408357620239, "step": 250 }, { "epoch": 2.08, "learning_rate": 1.304202616511362e-07, "logits/chosen": -0.5112959742546082, "logits/rejected": -0.5279114842414856, "logps/chosen": -377.9098815917969, "logps/rejected": -316.77825927734375, "loss": 0.1627, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.204784631729126, "rewards/confidence": 0.5538536906242371, "rewards/confidence_mean_diff": -0.5538536906242371, "rewards/confidence_moving_diff": -0.005331903696060181, "rewards/margins": 2.7410061359405518, "rewards/mix_margin": 2.7410056591033936, "rewards/real_percentage": 11.774999618530273, "rewards/rejected": -0.5362212657928467, "step": 260 }, { "epoch": 2.16, "learning_rate": 1.1050247308300944e-07, "logits/chosen": -0.48956188559532166, "logits/rejected": -0.5282370448112488, "logps/chosen": -370.67767333984375, "logps/rejected": -370.94476318359375, "loss": 0.1444, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.979029893875122, "rewards/confidence": 0.7768798470497131, "rewards/confidence_mean_diff": -0.7768798470497131, "rewards/confidence_moving_diff": -0.008751118555665016, "rewards/margins": 5.2986369132995605, "rewards/mix_margin": 5.298637866973877, "rewards/real_percentage": 11.875, "rewards/rejected": -3.3196072578430176, "step": 270 }, { "epoch": 2.24, "learning_rate": 9.179609663085594e-08, "logits/chosen": -0.478290855884552, "logits/rejected": -0.5842245817184448, "logps/chosen": -354.32220458984375, "logps/rejected": -323.82830810546875, "loss": 0.1632, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.003836154937744, "rewards/confidence": 1.0290786027908325, "rewards/confidence_mean_diff": -1.0290786027908325, "rewards/confidence_moving_diff": 0.009490849450230598, "rewards/margins": 2.7980473041534424, "rewards/mix_margin": 2.7980475425720215, "rewards/real_percentage": 12.175000190734863, "rewards/rejected": -0.7942115068435669, "step": 280 }, { "epoch": 2.32, "learning_rate": 7.446358055867688e-08, "logits/chosen": -0.4719129502773285, "logits/rejected": -0.5351340174674988, "logps/chosen": -284.57977294921875, "logps/rejected": -244.1188507080078, "loss": 0.1959, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3237833976745605, "rewards/confidence": 0.36271917819976807, "rewards/confidence_mean_diff": -0.36271917819976807, "rewards/confidence_moving_diff": 0.002586688846349716, "rewards/margins": 2.3666577339172363, "rewards/mix_margin": 2.3666574954986572, "rewards/real_percentage": 12.074999809265137, "rewards/rejected": -1.0428742170333862, "step": 290 }, { "epoch": 2.4, "learning_rate": 5.8655442373371164e-08, "logits/chosen": -0.581800639629364, "logits/rejected": -0.6199262142181396, "logps/chosen": -420.638671875, "logps/rejected": -355.60736083984375, "loss": 0.152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8940696716308594, "rewards/confidence": 0.7953528165817261, "rewards/confidence_mean_diff": -0.7953528165817261, "rewards/confidence_moving_diff": -0.006861658301204443, "rewards/margins": 2.8314507007598877, "rewards/mix_margin": 2.831450939178467, "rewards/real_percentage": 11.875, "rewards/rejected": -0.9373809695243835, "step": 300 }, { "epoch": 2.48, "learning_rate": 4.450896171388219e-08, "logits/chosen": -0.5456718802452087, "logits/rejected": -0.5629149079322815, "logps/chosen": -384.376953125, "logps/rejected": -332.3739318847656, "loss": 0.1365, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1317899227142334, "rewards/confidence": 0.8724759221076965, "rewards/confidence_mean_diff": -0.8724759221076965, "rewards/confidence_moving_diff": 0.006575644016265869, "rewards/margins": 3.160860300064087, "rewards/mix_margin": 3.160860061645508, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": -1.0290701389312744, "step": 310 }, { "epoch": 2.56, "learning_rate": 3.214698819946879e-08, "logits/chosen": -0.5238803625106812, "logits/rejected": -0.5871630907058716, "logps/chosen": -375.8655090332031, "logps/rejected": -300.987548828125, "loss": 0.1748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7292721271514893, "rewards/confidence": 0.5153323411941528, "rewards/confidence_mean_diff": -0.5153323411941528, "rewards/confidence_moving_diff": 0.0004902526852674782, "rewards/margins": 2.5365805625915527, "rewards/mix_margin": 2.5365803241729736, "rewards/real_percentage": 11.949999809265137, "rewards/rejected": -0.8073086738586426, "step": 320 }, { "epoch": 2.64, "learning_rate": 2.1676874589879908e-08, "logits/chosen": -0.49646130204200745, "logits/rejected": -0.5325660705566406, "logps/chosen": -361.79986572265625, "logps/rejected": -292.0426940917969, "loss": 0.1878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4343092441558838, "rewards/confidence": 0.5628241300582886, "rewards/confidence_mean_diff": -0.5628241300582886, "rewards/confidence_moving_diff": 0.0001227855682373047, "rewards/margins": 2.628760576248169, "rewards/mix_margin": 2.628760814666748, "rewards/real_percentage": 12.024999618530273, "rewards/rejected": -1.1944514513015747, "step": 330 }, { "epoch": 2.72, "learning_rate": 1.3189544521990032e-08, "logits/chosen": -0.5395928025245667, "logits/rejected": -0.5778788328170776, "logps/chosen": -332.5323791503906, "logps/rejected": -296.8447265625, "loss": 0.1826, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4775941371917725, "rewards/confidence": 0.5103145837783813, "rewards/confidence_mean_diff": -0.5103145837783813, "rewards/confidence_moving_diff": -0.003172731725499034, "rewards/margins": 2.3591160774230957, "rewards/mix_margin": 2.3591160774230957, "rewards/real_percentage": 11.875, "rewards/rejected": -0.8815220594406128, "step": 340 }, { "epoch": 2.8, "learning_rate": 6.7587029187732014e-09, "logits/chosen": -0.5066567659378052, "logits/rejected": -0.5228812098503113, "logps/chosen": -346.0731201171875, "logps/rejected": -309.6691589355469, "loss": 0.1769, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6007888317108154, "rewards/confidence": 0.6578723788261414, "rewards/confidence_mean_diff": -0.6578723788261414, "rewards/confidence_moving_diff": 0.004312982317060232, "rewards/margins": 2.8280742168426514, "rewards/mix_margin": 2.8280739784240723, "rewards/real_percentage": 12.125, "rewards/rejected": -1.2272855043411255, "step": 350 }, { "epoch": 2.88, "learning_rate": 2.4401959275140437e-09, "logits/chosen": -0.4290226399898529, "logits/rejected": -0.4782096743583679, "logps/chosen": -323.8050231933594, "logps/rejected": -286.50225830078125, "loss": 0.1644, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5553691387176514, "rewards/confidence": 0.5026761889457703, "rewards/confidence_mean_diff": -0.5026761889457703, "rewards/confidence_moving_diff": -0.0073528410866856575, "rewards/margins": 2.44854474067688, "rewards/mix_margin": 2.4485442638397217, "rewards/real_percentage": 11.899999618530273, "rewards/rejected": -0.8931753039360046, "step": 360 }, { "epoch": 2.96, "learning_rate": 2.715259456224084e-10, "logits/chosen": -0.5070622563362122, "logits/rejected": -0.5159127712249756, "logps/chosen": -368.29248046875, "logps/rejected": -357.6094665527344, "loss": 0.1676, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.714816689491272, "rewards/confidence": 0.8204771876335144, "rewards/confidence_mean_diff": -0.8204771876335144, "rewards/confidence_moving_diff": 0.002654359443113208, "rewards/margins": 2.7493791580200195, "rewards/mix_margin": 2.7493796348571777, "rewards/real_percentage": 12.125, "rewards/rejected": -1.034562587738037, "step": 370 }, { "epoch": 3.0, "step": 375, "total_flos": 0.0, "train_loss": 0.3348727149963379, "train_runtime": 2536.3599, "train_samples_per_second": 2.366, "train_steps_per_second": 0.148 } ], "logging_steps": 10, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }