{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018685040289618125, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00046712600724045314, "grad_norm": 29.624664306640625, "learning_rate": 0.0, "logits/chosen": -3.153887987136841, "logits/rejected": -3.3905792236328125, "logps/chosen": -164.62596130371094, "logps/rejected": -154.77557373046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0009342520144809063, "grad_norm": 29.32240104675293, "learning_rate": 1.5e-06, "logits/chosen": -3.1478431224823, "logits/rejected": -3.0448203086853027, "logps/chosen": -156.60809326171875, "logps/rejected": -134.02630615234375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0014013780217213593, "grad_norm": 39.618228912353516, "learning_rate": 3e-06, "logits/chosen": -3.099796772003174, "logits/rejected": -3.1112475395202637, "logps/chosen": -139.82913208007812, "logps/rejected": -142.367919921875, "loss": 0.6916, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0017557624960318208, "rewards/margins": 0.0038681034930050373, "rewards/rejected": -0.0021123411133885384, "step": 3 }, { "epoch": 0.0018685040289618125, "grad_norm": 48.11921691894531, "learning_rate": 4.5e-06, "logits/chosen": -2.7930030822753906, "logits/rejected": -3.279337167739868, "logps/chosen": -147.18673706054688, "logps/rejected": -148.54122924804688, "loss": 0.6996, "rewards/accuracies": 0.375, "rewards/chosen": -0.033105019479990005, "rewards/margins": -0.008252889849245548, "rewards/rejected": -0.024852126836776733, "step": 4 }, { "epoch": 0.0023356300362022656, "grad_norm": 33.73259353637695, "learning_rate": 6e-06, "logits/chosen": -3.058988094329834, "logits/rejected": -2.9058432579040527, "logps/chosen": -157.9241180419922, "logps/rejected": -181.765380859375, "loss": 0.728, "rewards/accuracies": 0.5, "rewards/chosen": -0.08487213402986526, "rewards/margins": -0.055237509310245514, "rewards/rejected": -0.029634615406394005, "step": 5 }, { "epoch": 0.0028027560434427186, "grad_norm": 33.588905334472656, "learning_rate": 7.5e-06, "logits/chosen": -3.073216199874878, "logits/rejected": -2.8430886268615723, "logps/chosen": -158.05972290039062, "logps/rejected": -150.7171630859375, "loss": 0.759, "rewards/accuracies": 0.5, "rewards/chosen": -0.1916729062795639, "rewards/margins": -0.05884008854627609, "rewards/rejected": -0.1328328400850296, "step": 6 }, { "epoch": 0.0032698820506831716, "grad_norm": 34.93351745605469, "learning_rate": 9e-06, "logits/chosen": -3.4525327682495117, "logits/rejected": -3.336601495742798, "logps/chosen": -164.04249572753906, "logps/rejected": -165.49948120117188, "loss": 0.769, "rewards/accuracies": 0.5, "rewards/chosen": -0.322049081325531, "rewards/margins": -0.07949253916740417, "rewards/rejected": -0.24255654215812683, "step": 7 }, { "epoch": 0.003737008057923625, "grad_norm": 32.109195709228516, "learning_rate": 1.05e-05, "logits/chosen": -3.057377338409424, "logits/rejected": -2.9476959705352783, "logps/chosen": -179.3075408935547, "logps/rejected": -163.44024658203125, "loss": 0.6912, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4484987258911133, "rewards/margins": 0.13945631682872772, "rewards/rejected": -0.5879549980163574, "step": 8 }, { "epoch": 0.004204134065164078, "grad_norm": 38.499698638916016, "learning_rate": 1.2e-05, "logits/chosen": -3.4018871784210205, "logits/rejected": -2.770911455154419, "logps/chosen": -138.96697998046875, "logps/rejected": -155.197509765625, "loss": 0.7849, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5717523097991943, "rewards/margins": 0.018239814788103104, "rewards/rejected": -0.589992105960846, "step": 9 }, { "epoch": 0.004671260072404531, "grad_norm": 29.717857360839844, "learning_rate": 1.3500000000000001e-05, "logits/chosen": -3.2118136882781982, "logits/rejected": -2.8460254669189453, "logps/chosen": -158.110107421875, "logps/rejected": -147.25413513183594, "loss": 0.8467, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4080456793308258, "rewards/margins": -0.05621982365846634, "rewards/rejected": -0.3518258333206177, "step": 10 }, { "epoch": 0.005138386079644984, "grad_norm": 35.5257682800293, "learning_rate": 1.5e-05, "logits/chosen": -2.7199769020080566, "logits/rejected": -3.1992592811584473, "logps/chosen": -182.67335510253906, "logps/rejected": -205.45220947265625, "loss": 0.7824, "rewards/accuracies": 0.625, "rewards/chosen": -0.34646672010421753, "rewards/margins": 0.4741722643375397, "rewards/rejected": -0.8206390142440796, "step": 11 }, { "epoch": 0.005605512086885437, "grad_norm": 33.356773376464844, "learning_rate": 1.65e-05, "logits/chosen": -3.128831386566162, "logits/rejected": -3.167382001876831, "logps/chosen": -157.21823120117188, "logps/rejected": -169.51663208007812, "loss": 0.6211, "rewards/accuracies": 0.625, "rewards/chosen": -0.16993041336536407, "rewards/margins": 0.4490576684474945, "rewards/rejected": -0.6189880967140198, "step": 12 }, { "epoch": 0.00607263809412589, "grad_norm": 43.2087516784668, "learning_rate": 1.8e-05, "logits/chosen": -3.032745838165283, "logits/rejected": -3.1566920280456543, "logps/chosen": -160.5401153564453, "logps/rejected": -156.610107421875, "loss": 0.8161, "rewards/accuracies": 0.53125, "rewards/chosen": -0.329096257686615, "rewards/margins": 0.2510414123535156, "rewards/rejected": -0.5801376700401306, "step": 13 }, { "epoch": 0.006539764101366343, "grad_norm": 44.841331481933594, "learning_rate": 1.95e-05, "logits/chosen": -2.9390594959259033, "logits/rejected": -2.639657974243164, "logps/chosen": -158.00621032714844, "logps/rejected": -205.90988159179688, "loss": 0.6193, "rewards/accuracies": 0.625, "rewards/chosen": -0.4224599599838257, "rewards/margins": 0.8774706721305847, "rewards/rejected": -1.2999305725097656, "step": 14 }, { "epoch": 0.007006890108606796, "grad_norm": 60.16761779785156, "learning_rate": 2.1e-05, "logits/chosen": -2.844076156616211, "logits/rejected": -3.0058369636535645, "logps/chosen": -174.17816162109375, "logps/rejected": -162.8614959716797, "loss": 0.9068, "rewards/accuracies": 0.46875, "rewards/chosen": -0.4890695810317993, "rewards/margins": 0.31313809752464294, "rewards/rejected": -0.8022076487541199, "step": 15 }, { "epoch": 0.00747401611584725, "grad_norm": 42.41041946411133, "learning_rate": 2.25e-05, "logits/chosen": -2.8459668159484863, "logits/rejected": -2.870767593383789, "logps/chosen": -159.3683624267578, "logps/rejected": -137.30758666992188, "loss": 1.0076, "rewards/accuracies": 0.5, "rewards/chosen": -0.8330074548721313, "rewards/margins": -0.10208512842655182, "rewards/rejected": -0.7309223413467407, "step": 16 }, { "epoch": 0.007941142123087703, "grad_norm": 46.14881134033203, "learning_rate": 2.4e-05, "logits/chosen": -3.2765953540802, "logits/rejected": -3.2238590717315674, "logps/chosen": -165.30923461914062, "logps/rejected": -130.40892028808594, "loss": 0.8999, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6034781336784363, "rewards/margins": 0.04197956249117851, "rewards/rejected": -0.6454576849937439, "step": 17 }, { "epoch": 0.008408268130328156, "grad_norm": 36.91789245605469, "learning_rate": 2.55e-05, "logits/chosen": -2.8234622478485107, "logits/rejected": -3.0411720275878906, "logps/chosen": -175.682373046875, "logps/rejected": -149.590576171875, "loss": 1.2818, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4478638172149658, "rewards/margins": -0.04692135751247406, "rewards/rejected": -1.4009425640106201, "step": 18 }, { "epoch": 0.00887539413756861, "grad_norm": 53.27765655517578, "learning_rate": 2.7000000000000002e-05, "logits/chosen": -3.185443639755249, "logits/rejected": -3.126272439956665, "logps/chosen": -187.12286376953125, "logps/rejected": -188.28640747070312, "loss": 0.7968, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0597200393676758, "rewards/margins": 0.5409759283065796, "rewards/rejected": -1.6006958484649658, "step": 19 }, { "epoch": 0.009342520144809062, "grad_norm": 40.505897521972656, "learning_rate": 2.8499999999999998e-05, "logits/chosen": -2.845191478729248, "logits/rejected": -2.9458093643188477, "logps/chosen": -177.2282257080078, "logps/rejected": -210.2263641357422, "loss": 1.3512, "rewards/accuracies": 0.4375, "rewards/chosen": -2.458928108215332, "rewards/margins": -0.08396562933921814, "rewards/rejected": -2.374962329864502, "step": 20 }, { "epoch": 0.009809646152049515, "grad_norm": 68.49879455566406, "learning_rate": 3e-05, "logits/chosen": -3.1144859790802, "logits/rejected": -2.9034385681152344, "logps/chosen": -216.12496948242188, "logps/rejected": -190.55833435058594, "loss": 1.5832, "rewards/accuracies": 0.46875, "rewards/chosen": -3.5558087825775146, "rewards/margins": -0.48250633478164673, "rewards/rejected": -3.0733022689819336, "step": 21 }, { "epoch": 0.010276772159289968, "grad_norm": 61.08313751220703, "learning_rate": 2.9999922925895862e-05, "logits/chosen": -3.1215250492095947, "logits/rejected": -2.597733974456787, "logps/chosen": -219.85137939453125, "logps/rejected": -196.4637908935547, "loss": 1.4738, "rewards/accuracies": 0.40625, "rewards/chosen": -3.155580997467041, "rewards/margins": -0.5727983117103577, "rewards/rejected": -2.582782745361328, "step": 22 }, { "epoch": 0.010743898166530421, "grad_norm": 41.851783752441406, "learning_rate": 2.999969170437549e-05, "logits/chosen": -3.0340187549591064, "logits/rejected": -2.84521484375, "logps/chosen": -177.37728881835938, "logps/rejected": -183.36668395996094, "loss": 1.0908, "rewards/accuracies": 0.46875, "rewards/chosen": -1.8575204610824585, "rewards/margins": 0.3737794756889343, "rewards/rejected": -2.231299877166748, "step": 23 }, { "epoch": 0.011211024173770874, "grad_norm": 39.802371978759766, "learning_rate": 2.9999306337815055e-05, "logits/chosen": -2.8932125568389893, "logits/rejected": -3.1646995544433594, "logps/chosen": -176.2078399658203, "logps/rejected": -199.5233612060547, "loss": 1.0934, "rewards/accuracies": 0.53125, "rewards/chosen": -1.9304168224334717, "rewards/margins": 0.2455454170703888, "rewards/rejected": -2.175962448120117, "step": 24 }, { "epoch": 0.011678150181011327, "grad_norm": 29.43556785583496, "learning_rate": 2.999876683017479e-05, "logits/chosen": -2.940061092376709, "logits/rejected": -2.888075590133667, "logps/chosen": -193.4315948486328, "logps/rejected": -197.52081298828125, "loss": 1.1539, "rewards/accuracies": 0.5, "rewards/chosen": -1.5905036926269531, "rewards/margins": 0.9262561202049255, "rewards/rejected": -2.5167598724365234, "step": 25 }, { "epoch": 0.01214527618825178, "grad_norm": 54.44322204589844, "learning_rate": 2.999807318699897e-05, "logits/chosen": -2.6563143730163574, "logits/rejected": -2.6990714073181152, "logps/chosen": -156.08848571777344, "logps/rejected": -182.99139404296875, "loss": 1.2892, "rewards/accuracies": 0.4375, "rewards/chosen": -1.465782642364502, "rewards/margins": 0.17339667677879333, "rewards/rejected": -1.6391793489456177, "step": 26 }, { "epoch": 0.012612402195492234, "grad_norm": 33.6212272644043, "learning_rate": 2.999722541541585e-05, "logits/chosen": -2.61735200881958, "logits/rejected": -2.4723963737487793, "logps/chosen": -155.68017578125, "logps/rejected": -158.19589233398438, "loss": 1.0627, "rewards/accuracies": 0.34375, "rewards/chosen": -1.1587157249450684, "rewards/margins": 0.1697230488061905, "rewards/rejected": -1.3284387588500977, "step": 27 }, { "epoch": 0.013079528202732687, "grad_norm": 19.592470169067383, "learning_rate": 2.99962235241376e-05, "logits/chosen": -2.5871520042419434, "logits/rejected": -2.7644474506378174, "logps/chosen": -184.37257385253906, "logps/rejected": -169.0127716064453, "loss": 0.8552, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5363420248031616, "rewards/margins": 0.4760582447052002, "rewards/rejected": -1.0124002695083618, "step": 28 }, { "epoch": 0.01354665420997314, "grad_norm": 20.270267486572266, "learning_rate": 2.9995067523460198e-05, "logits/chosen": -2.8748791217803955, "logits/rejected": -3.1334304809570312, "logps/chosen": -157.93455505371094, "logps/rejected": -176.13912963867188, "loss": 0.9041, "rewards/accuracies": 0.5625, "rewards/chosen": -0.36636897921562195, "rewards/margins": 0.27877098321914673, "rewards/rejected": -0.6451399922370911, "step": 29 }, { "epoch": 0.014013780217213593, "grad_norm": 43.62104415893555, "learning_rate": 2.9993757425263343e-05, "logits/chosen": -2.7570629119873047, "logits/rejected": -2.771437644958496, "logps/chosen": -157.29891967773438, "logps/rejected": -166.71913146972656, "loss": 1.5401, "rewards/accuracies": 0.5625, "rewards/chosen": -0.970974862575531, "rewards/margins": -0.46308204531669617, "rewards/rejected": -0.5078927874565125, "step": 30 }, { "epoch": 0.014480906224454046, "grad_norm": 21.04092025756836, "learning_rate": 2.999229324301032e-05, "logits/chosen": -2.814603328704834, "logits/rejected": -2.561202049255371, "logps/chosen": -126.19819641113281, "logps/rejected": -131.07566833496094, "loss": 0.7815, "rewards/accuracies": 0.5, "rewards/chosen": 0.11197692900896072, "rewards/margins": 0.5568960309028625, "rewards/rejected": -0.4449191689491272, "step": 31 }, { "epoch": 0.0149480322316945, "grad_norm": 23.462085723876953, "learning_rate": 2.9990674991747865e-05, "logits/chosen": -2.608139753341675, "logits/rejected": -3.061997175216675, "logps/chosen": -158.59423828125, "logps/rejected": -130.51840209960938, "loss": 1.2301, "rewards/accuracies": 0.46875, "rewards/chosen": -0.5943432450294495, "rewards/margins": -0.38749587535858154, "rewards/rejected": -0.2068473994731903, "step": 32 }, { "epoch": 0.015415158238934953, "grad_norm": 27.918336868286133, "learning_rate": 2.9988902688106014e-05, "logits/chosen": -2.9852523803710938, "logits/rejected": -2.5511088371276855, "logps/chosen": -166.7327880859375, "logps/rejected": -155.55520629882812, "loss": 0.8347, "rewards/accuracies": 0.65625, "rewards/chosen": -0.009264327585697174, "rewards/margins": 0.6062629222869873, "rewards/rejected": -0.615527331829071, "step": 33 }, { "epoch": 0.015882284246175406, "grad_norm": 25.436614990234375, "learning_rate": 2.9986976350297933e-05, "logits/chosen": -2.850193500518799, "logits/rejected": -2.510417938232422, "logps/chosen": -149.20751953125, "logps/rejected": -155.28610229492188, "loss": 0.9801, "rewards/accuracies": 0.53125, "rewards/chosen": -0.5431436896324158, "rewards/margins": 0.01562432199716568, "rewards/rejected": -0.5587680339813232, "step": 34 }, { "epoch": 0.01634941025341586, "grad_norm": 38.4265022277832, "learning_rate": 2.9984895998119723e-05, "logits/chosen": -2.4000887870788574, "logits/rejected": -2.29819393157959, "logps/chosen": -171.50152587890625, "logps/rejected": -191.80938720703125, "loss": 1.2524, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1593430042266846, "rewards/margins": 0.3696390390396118, "rewards/rejected": -1.5289819240570068, "step": 35 }, { "epoch": 0.016816536260656312, "grad_norm": 31.203420639038086, "learning_rate": 2.998266165295021e-05, "logits/chosen": -2.8619065284729004, "logits/rejected": -2.9606268405914307, "logps/chosen": -144.86415100097656, "logps/rejected": -180.39205932617188, "loss": 0.7866, "rewards/accuracies": 0.59375, "rewards/chosen": -0.18791238963603973, "rewards/margins": 0.6746108531951904, "rewards/rejected": -0.8625231981277466, "step": 36 }, { "epoch": 0.017283662267896766, "grad_norm": 24.40894889831543, "learning_rate": 2.9980273337750767e-05, "logits/chosen": -2.757246255874634, "logits/rejected": -2.466960906982422, "logps/chosen": -192.2352294921875, "logps/rejected": -181.03878784179688, "loss": 0.8876, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6490573287010193, "rewards/margins": 0.9436599016189575, "rewards/rejected": -1.592717170715332, "step": 37 }, { "epoch": 0.01775078827513722, "grad_norm": 27.00850486755371, "learning_rate": 2.9977731077065013e-05, "logits/chosen": -2.9453818798065186, "logits/rejected": -2.8008580207824707, "logps/chosen": -152.46038818359375, "logps/rejected": -173.5645751953125, "loss": 0.6833, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2769155204296112, "rewards/margins": 1.4065779447555542, "rewards/rejected": -1.6834933757781982, "step": 38 }, { "epoch": 0.01821791428237767, "grad_norm": 30.273073196411133, "learning_rate": 2.9975034897018614e-05, "logits/chosen": -2.7667531967163086, "logits/rejected": -2.948810338973999, "logps/chosen": -123.97679901123047, "logps/rejected": -174.9097900390625, "loss": 1.0514, "rewards/accuracies": 0.53125, "rewards/chosen": -0.6435041427612305, "rewards/margins": 0.23302727937698364, "rewards/rejected": -0.8765315413475037, "step": 39 }, { "epoch": 0.018685040289618125, "grad_norm": 33.39718246459961, "learning_rate": 2.9972184825318994e-05, "logits/chosen": -2.9786558151245117, "logits/rejected": -2.691629409790039, "logps/chosen": -186.82821655273438, "logps/rejected": -167.65603637695312, "loss": 1.2466, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0377531051635742, "rewards/margins": 0.2545713186264038, "rewards/rejected": -1.292324423789978, "step": 40 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }