{ "best_metric": 21.83156394958496, "best_model_checkpoint": "./output/checkpoints/2024-05-27_09-04-31/checkpoint-100", "epoch": 1.0, "eval_steps": 100, "global_step": 198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025252525252525252, "grad_norm": 26.353445053100586, "learning_rate": 4.000000000000001e-06, "logits/chosen": 0.14427797496318817, "logits/rejected": -0.5873457193374634, "logps/chosen": -0.901843249797821, "logps/rejected": -1.3607301712036133, "loss": 24.9998, "rewards/accuracies": 0.21250000596046448, "rewards/chosen": -1.578416777192615e-05, "rewards/margins": 2.430938138786587e-06, "rewards/rejected": -1.8215103409602307e-05, "step": 5 }, { "epoch": 0.050505050505050504, "grad_norm": null, "learning_rate": 8.000000000000001e-06, "logits/chosen": -0.10329052060842514, "logits/rejected": -0.4683811664581299, "logps/chosen": -0.9063997268676758, "logps/rejected": -1.461859107017517, "loss": 24.9337, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.00036737616755999625, "rewards/margins": 0.0006723683327436447, "rewards/rejected": -0.0010397445876151323, "step": 10 }, { "epoch": 0.07575757575757576, "grad_norm": 13.749723434448242, "learning_rate": 1.3000000000000001e-05, "logits/chosen": -0.2425023317337036, "logits/rejected": -0.6693668365478516, "logps/chosen": -0.8707982897758484, "logps/rejected": -1.1566194295883179, "loss": 24.9041, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.0014928742311894894, "rewards/margins": 0.0009821585845202208, "rewards/rejected": -0.00247503281570971, "step": 15 }, { "epoch": 0.10101010101010101, "grad_norm": 25.53832244873047, "learning_rate": 1.8e-05, "logits/chosen": -0.46215763688087463, "logits/rejected": -0.9008939862251282, "logps/chosen": -0.959465503692627, "logps/rejected": -1.5446056127548218, "loss": 24.2631, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0067633287981152534, 
"rewards/margins": 0.007808461785316467, "rewards/rejected": -0.014571788720786572, "step": 20 }, { "epoch": 0.12626262626262627, "grad_norm": 45.06657791137695, "learning_rate": 1.9985985720017786e-05, "logits/chosen": -0.04087737202644348, "logits/rejected": -0.5188297033309937, "logps/chosen": -0.9965022802352905, "logps/rejected": -1.3733254671096802, "loss": 24.1692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.014551234431564808, "rewards/margins": 0.009625318460166454, "rewards/rejected": -0.024176552891731262, "step": 25 }, { "epoch": 0.15151515151515152, "grad_norm": 28.255924224853516, "learning_rate": 1.9900485105144544e-05, "logits/chosen": -0.14505064487457275, "logits/rejected": -0.5278365015983582, "logps/chosen": -1.0397828817367554, "logps/rejected": -1.44753897190094, "loss": 24.1349, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.018694132566452026, "rewards/margins": 0.01500606257468462, "rewards/rejected": -0.03370019793510437, "step": 30 }, { "epoch": 0.17676767676767677, "grad_norm": null, "learning_rate": 1.9776556239997146e-05, "logits/chosen": -0.4809038043022156, "logits/rejected": -0.9093053936958313, "logps/chosen": -1.3904650211334229, "logps/rejected": -2.406257390975952, "loss": 23.5774, "rewards/accuracies": 0.75, "rewards/chosen": -0.04243111237883568, "rewards/margins": 0.052741266787052155, "rewards/rejected": -0.09517236799001694, "step": 35 }, { "epoch": 0.20202020202020202, "grad_norm": 46.83095932006836, "learning_rate": 1.955324742088516e-05, "logits/chosen": -0.6266540288925171, "logits/rejected": -1.0290076732635498, "logps/chosen": -1.2514160871505737, "logps/rejected": -2.1771531105041504, "loss": 22.3291, "rewards/accuracies": 0.625, "rewards/chosen": -0.03719799965620041, "rewards/margins": 0.04375718533992767, "rewards/rejected": -0.08095519244670868, "step": 40 }, { "epoch": 0.22727272727272727, "grad_norm": 76.44580841064453, "learning_rate": 1.9255590665712214e-05, "logits/chosen": 
-0.6130943894386292, "logits/rejected": -1.143413782119751, "logps/chosen": -1.5433876514434814, "logps/rejected": -2.6532750129699707, "loss": 21.656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05903216451406479, "rewards/margins": 0.05665038153529167, "rewards/rejected": -0.11568254232406616, "step": 45 }, { "epoch": 0.25252525252525254, "grad_norm": 78.35297393798828, "learning_rate": 1.8965472436868288e-05, "logits/chosen": -0.757357656955719, "logits/rejected": -1.0666834115982056, "logps/chosen": -1.3742765188217163, "logps/rejected": -3.0053694248199463, "loss": 22.6627, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.05383248254656792, "rewards/margins": 0.09586457908153534, "rewards/rejected": -0.14969706535339355, "step": 50 }, { "epoch": 0.2777777777777778, "grad_norm": 82.29180145263672, "learning_rate": 1.8540204424421264e-05, "logits/chosen": -0.8564749956130981, "logits/rejected": -1.3737789392471313, "logps/chosen": -1.733337163925171, "logps/rejected": -3.3698067665100098, "loss": 19.3611, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08400858938694, "rewards/margins": 0.10341653972864151, "rewards/rejected": -0.18742512166500092, "step": 55 }, { "epoch": 0.30303030303030304, "grad_norm": 128.23907470703125, "learning_rate": 1.804847246055326e-05, "logits/chosen": -0.9640189409255981, "logits/rejected": -1.1732914447784424, "logps/chosen": -2.538499593734741, "logps/rejected": -3.0090465545654297, "loss": 29.7881, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14390432834625244, "rewards/margins": 0.02762184664607048, "rewards/rejected": -0.17152616381645203, "step": 60 }, { "epoch": 0.3282828282828283, "grad_norm": 49.566158294677734, "learning_rate": 1.7494103438361252e-05, "logits/chosen": -0.7158849239349365, "logits/rejected": -1.0623328685760498, "logps/chosen": -1.4396604299545288, "logps/rejected": -2.000624179840088, "loss": 21.9915, "rewards/accuracies": 0.7124999761581421, 
"rewards/chosen": -0.06708293408155441, "rewards/margins": 0.03939032554626465, "rewards/rejected": -0.10647325217723846, "step": 65 }, { "epoch": 0.35353535353535354, "grad_norm": 76.89603424072266, "learning_rate": 1.6881411722458688e-05, "logits/chosen": -0.8769875764846802, "logits/rejected": -1.1103827953338623, "logps/chosen": -2.1615917682647705, "logps/rejected": -3.0439255237579346, "loss": 21.986, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.12244679778814316, "rewards/margins": 0.06274138391017914, "rewards/rejected": -0.1851881742477417, "step": 70 }, { "epoch": 0.3787878787878788, "grad_norm": 134.9673309326172, "learning_rate": 1.6215165572528598e-05, "logits/chosen": -1.343915581703186, "logits/rejected": -1.401227355003357, "logps/chosen": -2.2325069904327393, "logps/rejected": -3.132831573486328, "loss": 21.2106, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13722343742847443, "rewards/margins": 0.06202084943652153, "rewards/rejected": -0.19924426078796387, "step": 75 }, { "epoch": 0.40404040404040403, "grad_norm": 137.21859741210938, "learning_rate": 1.5500550034448415e-05, "logits/chosen": -1.3024094104766846, "logits/rejected": -1.5494719743728638, "logps/chosen": -2.4210548400878906, "logps/rejected": -3.3495230674743652, "loss": 23.1095, "rewards/accuracies": 0.625, "rewards/chosen": -0.15112502872943878, "rewards/margins": 0.07425413280725479, "rewards/rejected": -0.22537918388843536, "step": 80 }, { "epoch": 0.4292929292929293, "grad_norm": 351.0603942871094, "learning_rate": 1.5050862598575474e-05, "logits/chosen": -1.311993956565857, "logits/rejected": -1.6289136409759521, "logps/chosen": -2.586198091506958, "logps/rejected": -5.161986827850342, "loss": 25.0728, "rewards/accuracies": 0.75, "rewards/chosen": -0.16497337818145752, "rewards/margins": 0.17613837122917175, "rewards/rejected": -0.3411117494106293, "step": 85 }, { "epoch": 0.45454545454545453, "grad_norm": 376.21038818359375, "learning_rate": 
1.4270564388663761e-05, "logits/chosen": -1.4695305824279785, "logits/rejected": -1.5699679851531982, "logps/chosen": -3.0274829864501953, "logps/rejected": -3.7816379070281982, "loss": 24.3757, "rewards/accuracies": 0.625, "rewards/chosen": -0.20180432498455048, "rewards/margins": 0.06594176590442657, "rewards/rejected": -0.26774606108665466, "step": 90 }, { "epoch": 0.4797979797979798, "grad_norm": 83.94548034667969, "learning_rate": 1.3457030606163564e-05, "logits/chosen": -1.542257308959961, "logits/rejected": -1.640545129776001, "logps/chosen": -3.1931662559509277, "logps/rejected": -4.362542152404785, "loss": 21.7905, "rewards/accuracies": 0.625, "rewards/chosen": -0.21774420142173767, "rewards/margins": 0.09037742763757706, "rewards/rejected": -0.30812162160873413, "step": 95 }, { "epoch": 0.5050505050505051, "grad_norm": 174.58786010742188, "learning_rate": 1.2616592559684408e-05, "logits/chosen": -1.5426051616668701, "logits/rejected": -1.7211687564849854, "logps/chosen": -2.798499345779419, "logps/rejected": -3.3964920043945312, "loss": 25.8166, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.17160701751708984, "rewards/margins": 0.05040215328335762, "rewards/rejected": -0.22200918197631836, "step": 100 }, { "epoch": 0.5050505050505051, "eval_logits/chosen": -1.919495940208435, "eval_logits/rejected": -2.218794584274292, "eval_logps/chosen": -2.5173401832580566, "eval_logps/rejected": -3.3597702980041504, "eval_loss": 21.83156394958496, "eval_rewards/accuracies": 0.6421874761581421, "eval_rewards/chosen": -0.15560917556285858, "eval_rewards/margins": 0.05931411311030388, "eval_rewards/rejected": -0.21492330729961395, "eval_runtime": 256.4168, "eval_samples_per_second": 2.496, "eval_steps_per_second": 0.156, "step": 100 }, { "epoch": 0.5303030303030303, "grad_norm": 94.69363403320312, "learning_rate": 1.1755790939673208e-05, "logits/chosen": -1.6892818212509155, "logits/rejected": -1.860984206199646, "logps/chosen": -2.6088526248931885, 
"logps/rejected": -3.893810272216797, "loss": 24.6292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1631762683391571, "rewards/margins": 0.0721951425075531, "rewards/rejected": -0.235371395945549, "step": 105 }, { "epoch": 0.5555555555555556, "grad_norm": 299.636962890625, "learning_rate": 1.088132491563602e-05, "logits/chosen": -1.6523587703704834, "logits/rejected": -1.648794412612915, "logps/chosen": -2.3819022178649902, "logps/rejected": -3.91084623336792, "loss": 24.6609, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.14854103326797485, "rewards/margins": 0.08199040591716766, "rewards/rejected": -0.23053142428398132, "step": 110 }, { "epoch": 0.5808080808080808, "grad_norm": 179.6541748046875, "learning_rate": 1e-05, "logits/chosen": -1.7479238510131836, "logits/rejected": -1.8762273788452148, "logps/chosen": -2.4850611686706543, "logps/rejected": -3.9139976501464844, "loss": 21.9825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1599404662847519, "rewards/margins": 0.0621558353304863, "rewards/rejected": -0.2220962941646576, "step": 115 }, { "epoch": 0.6060606060606061, "grad_norm": 416.4597473144531, "learning_rate": 9.118675084363986e-06, "logits/chosen": -1.6893389225006104, "logits/rejected": -1.9248136281967163, "logps/chosen": -2.431549549102783, "logps/rejected": -3.4075489044189453, "loss": 23.7008, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.1548345983028412, "rewards/margins": 0.03470990061759949, "rewards/rejected": -0.18954448401927948, "step": 120 }, { "epoch": 0.6313131313131313, "grad_norm": 106.20417022705078, "learning_rate": 8.244209060326794e-06, "logits/chosen": -1.6689144372940063, "logits/rejected": -1.932077407836914, "logps/chosen": -2.1763813495635986, "logps/rejected": -4.08168888092041, "loss": 20.9314, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.13834789395332336, "rewards/margins": 0.07367957383394241, "rewards/rejected": -0.21202746033668518, 
"step": 125 }, { "epoch": 0.6565656565656566, "grad_norm": 628.0269775390625, "learning_rate": 7.383407440315595e-06, "logits/chosen": -1.7707713842391968, "logits/rejected": -1.8211300373077393, "logps/chosen": -2.591797351837158, "logps/rejected": -4.223265647888184, "loss": 19.9625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.1714746505022049, "rewards/margins": 0.10012316703796387, "rewards/rejected": -0.27159780263900757, "step": 130 }, { "epoch": 0.6818181818181818, "grad_norm": 166.4376220703125, "learning_rate": 6.542969393836436e-06, "logits/chosen": -1.6975538730621338, "logits/rejected": -1.7919883728027344, "logps/chosen": -2.655794858932495, "logps/rejected": -3.9739787578582764, "loss": 19.885, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17945000529289246, "rewards/margins": 0.08590926975011826, "rewards/rejected": -0.2653592824935913, "step": 135 }, { "epoch": 0.7070707070707071, "grad_norm": 924.48388671875, "learning_rate": 5.729435611336239e-06, "logits/chosen": -1.6683040857315063, "logits/rejected": -1.8297067880630493, "logps/chosen": -3.389685869216919, "logps/rejected": -4.693975925445557, "loss": 21.4041, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.22584636509418488, "rewards/margins": 0.08828467130661011, "rewards/rejected": -0.3141310513019562, "step": 140 }, { "epoch": 0.7323232323232324, "grad_norm": 208.90626525878906, "learning_rate": 4.949137401424527e-06, "logits/chosen": -1.690625786781311, "logits/rejected": -1.8179527521133423, "logps/chosen": -3.1737165451049805, "logps/rejected": -4.919283866882324, "loss": 19.995, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22326549887657166, "rewards/margins": 0.10102611780166626, "rewards/rejected": -0.3242916166782379, "step": 145 }, { "epoch": 0.7575757575757576, "grad_norm": 243.00192260742188, "learning_rate": 4.208147417604665e-06, "logits/chosen": -1.6386387348175049, "logits/rejected": -1.7950681447982788, "logps/chosen": 
-3.373720645904541, "logps/rejected": -4.483418941497803, "loss": 20.3863, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24290914833545685, "rewards/margins": 0.07580031454563141, "rewards/rejected": -0.31870946288108826, "step": 150 }, { "epoch": 0.7828282828282829, "grad_norm": 205.0689697265625, "learning_rate": 3.51223239798274e-06, "logits/chosen": -1.7644588947296143, "logits/rejected": -1.792384147644043, "logps/chosen": -2.8454086780548096, "logps/rejected": -4.108365058898926, "loss": 22.1816, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19682423770427704, "rewards/margins": 0.07537179440259933, "rewards/rejected": -0.2721960246562958, "step": 155 }, { "epoch": 0.8080808080808081, "grad_norm": 202.64425659179688, "learning_rate": 2.8668082857562006e-06, "logits/chosen": -1.7155227661132812, "logits/rejected": -1.7265026569366455, "logps/chosen": -3.2442708015441895, "logps/rejected": -5.168461799621582, "loss": 20.5007, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.22376994788646698, "rewards/margins": 0.11927111446857452, "rewards/rejected": -0.3430410623550415, "step": 160 }, { "epoch": 0.8333333333333334, "grad_norm": 146.06727600097656, "learning_rate": 2.2768980797561125e-06, "logits/chosen": -1.5448095798492432, "logits/rejected": -1.6818040609359741, "logps/chosen": -3.1757941246032715, "logps/rejected": -4.661167144775391, "loss": 23.3162, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.22800321877002716, "rewards/margins": 0.10849568992853165, "rewards/rejected": -0.336498886346817, "step": 165 }, { "epoch": 0.8585858585858586, "grad_norm": 266.4602966308594, "learning_rate": 1.7470927430702277e-06, "logits/chosen": -1.77353036403656, "logits/rejected": -1.8091161251068115, "logps/chosen": -3.679595470428467, "logps/rejected": -5.641579627990723, "loss": 21.0313, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2708187699317932, "rewards/margins": 0.11891458183526993, 
"rewards/rejected": -0.38973334431648254, "step": 170 }, { "epoch": 0.8838383838383839, "grad_norm": 191.99391174316406, "learning_rate": 1.281515473974614e-06, "logits/chosen": -1.7262178659439087, "logits/rejected": -1.7621949911117554, "logps/chosen": -3.6915946006774902, "logps/rejected": -4.522196292877197, "loss": 23.1575, "rewards/accuracies": 0.625, "rewards/chosen": -0.2737148106098175, "rewards/margins": 0.06611393392086029, "rewards/rejected": -0.339828759431839, "step": 175 }, { "epoch": 0.9090909090909091, "grad_norm": 445.5780334472656, "learning_rate": 8.837896172345827e-07, "logits/chosen": -1.7799314260482788, "logits/rejected": -1.758079171180725, "logps/chosen": -3.744454860687256, "logps/rejected": -5.533487319946289, "loss": 22.4579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.26133713126182556, "rewards/margins": 0.10725338757038116, "rewards/rejected": -0.36859050393104553, "step": 180 }, { "epoch": 0.9343434343434344, "grad_norm": 351.77313232421875, "learning_rate": 5.570104655044428e-07, "logits/chosen": -1.8014914989471436, "logits/rejected": -1.8869857788085938, "logps/chosen": -3.1039249897003174, "logps/rejected": -4.702515602111816, "loss": 23.8499, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.22470612823963165, "rewards/margins": 0.12236537039279938, "rewards/rejected": -0.34707149863243103, "step": 185 }, { "epoch": 0.9595959595959596, "grad_norm": 203.9517059326172, "learning_rate": 3.0372117028111825e-07, "logits/chosen": -1.650368332862854, "logits/rejected": -1.7378900051116943, "logps/chosen": -3.5179672241210938, "logps/rejected": -4.001964092254639, "loss": 25.7021, "rewards/accuracies": 0.625, "rewards/chosen": -0.26367172598838806, "rewards/margins": 0.021851424127817154, "rewards/rejected": -0.2855231761932373, "step": 190 }, { "epoch": 0.9848484848484849, "grad_norm": 358.923095703125, "learning_rate": 1.2589294988404887e-07, "logits/chosen": -1.6349338293075562, "logits/rejected": 
-1.7700506448745728, "logps/chosen": -3.084740161895752, "logps/rejected": -4.999522686004639, "loss": 18.5645, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.22231082618236542, "rewards/margins": 0.12103237211704254, "rewards/rejected": -0.34334319829940796, "step": 195 }, { "epoch": 1.0, "step": 198, "total_flos": 0.0, "train_loss": 22.75462433786103, "train_runtime": 3251.7686, "train_samples_per_second": 0.973, "train_steps_per_second": 0.061 } ], "logging_steps": 5, "max_steps": 198, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }