{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "global_step": 804, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4e-09, "logits/chosen": -0.4962873160839081, "logits/rejected": -0.49837109446525574, "logps/chosen": -150.21646118164062, "logps/rejected": -189.03659057617188, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 8e-09, "logits/chosen": -0.22341781854629517, "logits/rejected": -0.20124006271362305, "logps/chosen": -162.60159301757812, "logps/rejected": -176.5699462890625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.01, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -0.35328492522239685, "logits/rejected": -0.33261561393737793, "logps/chosen": -139.79942321777344, "logps/rejected": -166.12020874023438, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": 0.013110923580825329, "rewards/margins": 0.0425657294690609, "rewards/rejected": -0.029454804956912994, "step": 3 }, { "epoch": 0.01, "learning_rate": 1.6e-08, "logits/chosen": -0.360799103975296, "logits/rejected": -0.31551846861839294, "logps/chosen": -147.8756103515625, "logps/rejected": -133.46592712402344, "loss": 0.6897, "rewards/accuracies": 0.25, "rewards/chosen": -0.005436897277832031, "rewards/margins": -0.023321915417909622, "rewards/rejected": 0.01788501627743244, "step": 4 }, { "epoch": 0.01, "learning_rate": 2e-08, "logits/chosen": -0.4000914394855499, "logits/rejected": -0.38184449076652527, "logps/chosen": -152.05393981933594, "logps/rejected": -150.4818115234375, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.0015483865281566978, "rewards/margins": -0.008856202475726604, "rewards/rejected": 0.007307816296815872, "step": 5 }, { "epoch": 0.01, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -0.3961685597896576, "logits/rejected": -0.3689308166503906, "logps/chosen": -138.93011474609375, "logps/rejected": -151.27462768554688, "loss": 0.6936, "rewards/accuracies": 0.25, "rewards/chosen": -0.007588958367705345, "rewards/margins": -0.010619353502988815, "rewards/rejected": 0.003030395833775401, "step": 6 }, { "epoch": 0.02, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -0.2514069974422455, "logits/rejected": -0.27424749732017517, "logps/chosen": -139.21969604492188, "logps/rejected": -138.87496948242188, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.008938790298998356, "rewards/margins": 0.0064008720219135284, "rewards/rejected": 0.0025379187427461147, "step": 7 }, { "epoch": 0.02, "learning_rate": 3.2e-08, "logits/chosen": -0.3427843451499939, "logits/rejected": -0.2883191704750061, "logps/chosen": -151.87918090820312, "logps/rejected": -139.20993041992188, "loss": 0.6986, "rewards/accuracies": 0.5, "rewards/chosen": 0.007439421955496073, "rewards/margins": -0.005039215553551912, "rewards/rejected": 0.01247863844037056, "step": 8 }, { "epoch": 0.02, "learning_rate": 3.6e-08, "logits/chosen": -0.2080179750919342, "logits/rejected": -0.1723359376192093, "logps/chosen": -133.36578369140625, "logps/rejected": -165.78176879882812, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.020877456292510033, "rewards/margins": -0.01300353929400444, "rewards/rejected": -0.007873916998505592, "step": 9 }, { "epoch": 0.02, "learning_rate": 4e-08, "logits/chosen": -0.2047394961118698, "logits/rejected": -0.18247832357883453, "logps/chosen": -151.28887939453125, "logps/rejected": -175.97695922851562, "loss": 0.6947, "rewards/accuracies": 0.75, "rewards/chosen": 0.008915710262954235, "rewards/margins": 0.0227890033274889, "rewards/rejected": -0.01387329027056694, "step": 10 }, { "epoch": 0.03, "learning_rate": 4.4e-08, "logits/chosen": -0.3620399832725525, "logits/rejected": -0.28811830282211304, "logps/chosen": -144.39288330078125, "logps/rejected": -168.36328125, "loss": 0.6919, "rewards/accuracies": 0.25, "rewards/chosen": -0.02282085455954075, "rewards/margins": -0.009278678335249424, "rewards/rejected": -0.013542176224291325, "step": 11 }, { "epoch": 0.03, "learning_rate": 4.799999999999999e-08, "logits/chosen": -0.33209267258644104, "logits/rejected": -0.3309582769870758, "logps/chosen": -152.83868408203125, "logps/rejected": -145.3541717529297, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.004748534876853228, "rewards/margins": -0.007934188470244408, "rewards/rejected": 0.00318565359339118, "step": 12 }, { "epoch": 0.03, "learning_rate": 5.2e-08, "logits/chosen": -0.299163281917572, "logits/rejected": -0.2886192798614502, "logps/chosen": -118.86265563964844, "logps/rejected": -135.7462158203125, "loss": 0.6995, "rewards/accuracies": 0.25, "rewards/chosen": 0.011396599002182484, "rewards/margins": -0.01143341138958931, "rewards/rejected": 0.022830011323094368, "step": 13 }, { "epoch": 0.03, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -0.27951520681381226, "logits/rejected": -0.27927958965301514, "logps/chosen": -142.81582641601562, "logps/rejected": -159.80348205566406, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": -0.026047516614198685, "rewards/margins": -0.04367789998650551, "rewards/rejected": 0.017630387097597122, "step": 14 }, { "epoch": 0.04, "learning_rate": 6e-08, "logits/chosen": -0.3053646981716156, "logits/rejected": -0.28793495893478394, "logps/chosen": -143.90509033203125, "logps/rejected": -172.55523681640625, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": 0.04072323068976402, "rewards/margins": 0.0424077995121479, "rewards/rejected": -0.0016845706850290298, "step": 15 }, { "epoch": 0.04, "learning_rate": 6.4e-08, "logits/chosen": -0.3732811510562897, "logits/rejected": -0.3632149398326874, "logps/chosen": -174.78147888183594, "logps/rejected": -200.48023986816406, "loss": 0.6977, "rewards/accuracies": 1.0, "rewards/chosen": -0.013237381353974342, "rewards/margins": 0.010766220279037952, "rewards/rejected": -0.02400360070168972, "step": 16 }, { "epoch": 0.04, "learning_rate": 6.8e-08, "logits/chosen": -0.34142476320266724, "logits/rejected": -0.34322112798690796, "logps/chosen": -173.00559997558594, "logps/rejected": -160.32388305664062, "loss": 0.6962, "rewards/accuracies": 0.75, "rewards/chosen": 0.01610088348388672, "rewards/margins": 0.03824291005730629, "rewards/rejected": -0.02214203029870987, "step": 17 }, { "epoch": 0.04, "learning_rate": 7.2e-08, "logits/chosen": -0.24628128111362457, "logits/rejected": -0.20813001692295074, "logps/chosen": -165.75299072265625, "logps/rejected": -214.90179443359375, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": -0.020873259752988815, "rewards/margins": 0.03388366848230362, "rewards/rejected": -0.054756928235292435, "step": 18 }, { "epoch": 0.05, "learning_rate": 7.599999999999999e-08, "logits/chosen": -0.35483860969543457, "logits/rejected": -0.3476817309856415, "logps/chosen": -149.76210021972656, "logps/rejected": -184.39366149902344, "loss": 0.6935, "rewards/accuracies": 0.25, "rewards/chosen": -0.022196579724550247, "rewards/margins": -0.011256029829382896, "rewards/rejected": -0.010940550826489925, "step": 19 }, { "epoch": 0.05, "learning_rate": 8e-08, "logits/chosen": -0.1971217691898346, "logits/rejected": -0.2220887541770935, "logps/chosen": -161.53179931640625, "logps/rejected": -153.67034912109375, "loss": 0.6926, "rewards/accuracies": 0.25, "rewards/chosen": 0.01009521447122097, "rewards/margins": -0.00626106234267354, "rewards/rejected": 0.016356278210878372, "step": 20 }, { "epoch": 0.05, "learning_rate": 8.4e-08, "logits/chosen": -0.28008049726486206, "logits/rejected": -0.2634870111942291, "logps/chosen": -146.1951141357422, "logps/rejected": -145.47476196289062, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.01348896138370037, "rewards/margins": 0.03354340046644211, "rewards/rejected": -0.02005443535745144, "step": 21 }, { "epoch": 0.05, "learning_rate": 8.8e-08, "logits/chosen": -0.22695393860340118, "logits/rejected": -0.2168578803539276, "logps/chosen": -140.8927001953125, "logps/rejected": -198.36923217773438, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.027412796393036842, "rewards/margins": 0.04466133192181587, "rewards/rejected": -0.01724853552877903, "step": 22 }, { "epoch": 0.06, "learning_rate": 9.2e-08, "logits/chosen": -0.08803988248109818, "logits/rejected": -0.0640474259853363, "logps/chosen": -152.2414093017578, "logps/rejected": -170.63723754882812, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006290432065725327, "rewards/margins": 0.014466860331594944, "rewards/rejected": -0.015095902606844902, "step": 23 }, { "epoch": 0.06, "learning_rate": 9.599999999999999e-08, "logits/chosen": -0.29258304834365845, "logits/rejected": -0.30887216329574585, "logps/chosen": -129.28440856933594, "logps/rejected": -168.91831970214844, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": 0.03726138919591904, "rewards/margins": 0.045609284192323685, "rewards/rejected": -0.008347893133759499, "step": 24 }, { "epoch": 0.06, "learning_rate": 1e-07, "logits/chosen": -0.07482028007507324, "logits/rejected": -0.06703196465969086, "logps/chosen": -133.05142211914062, "logps/rejected": -143.7042694091797, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": 0.0017726896330714226, "rewards/margins": 0.009843063540756702, "rewards/rejected": -0.00807037390768528, "step": 25 }, { "epoch": 0.06, "learning_rate": 9.999959340292496e-08, "logits/chosen": -0.3578854501247406, "logits/rejected": -0.35245877504348755, "logps/chosen": -143.47195434570312, "logps/rejected": -158.78030395507812, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": -0.01499271485954523, "rewards/margins": -0.006640435196459293, "rewards/rejected": -0.008352279663085938, "step": 26 }, { "epoch": 0.07, "learning_rate": 9.999837361831269e-08, "logits/chosen": -0.3789975643157959, "logits/rejected": -0.3620936870574951, "logps/chosen": -173.5669708251953, "logps/rejected": -160.1005096435547, "loss": 0.6914, "rewards/accuracies": 0.25, "rewards/chosen": 0.006267547607421875, "rewards/margins": -0.010199356824159622, "rewards/rejected": 0.016466904431581497, "step": 27 }, { "epoch": 0.07, "learning_rate": 9.99963406660016e-08, "logits/chosen": -0.41671866178512573, "logits/rejected": -0.4008921980857849, "logps/chosen": -139.25662231445312, "logps/rejected": -173.21099853515625, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.020322799682617188, "rewards/margins": 0.018172074109315872, "rewards/rejected": 0.0021507274359464645, "step": 28 }, { "epoch": 0.07, "learning_rate": 9.999349457905543e-08, "logits/chosen": -0.3743477165699005, "logits/rejected": -0.3526586592197418, "logps/chosen": -128.85629272460938, "logps/rejected": -128.87142944335938, "loss": 0.6929, "rewards/accuracies": 0.75, "rewards/chosen": -0.0037338254041969776, "rewards/margins": -0.00042934343218803406, "rewards/rejected": -0.003304482437670231, "step": 29 }, { "epoch": 0.07, "learning_rate": 9.99898354037626e-08, "logits/chosen": -0.31104058027267456, "logits/rejected": -0.3162185549736023, "logps/chosen": -150.50228881835938, "logps/rejected": -170.6068115234375, "loss": 0.6929, "rewards/accuracies": 0.25, "rewards/chosen": -0.0009609225671738386, "rewards/margins": 0.011268997564911842, "rewards/rejected": -0.01222991943359375, "step": 30 }, { "epoch": 0.08, "learning_rate": 9.998536319963547e-08, "logits/chosen": -0.17147815227508545, "logits/rejected": -0.16753524541854858, "logps/chosen": -152.0363311767578, "logps/rejected": -160.18603515625, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.017721176147460938, "rewards/margins": 0.05072307586669922, "rewards/rejected": -0.03300189971923828, "step": 31 }, { "epoch": 0.08, "learning_rate": 9.998007803940948e-08, "logits/chosen": -0.24063293635845184, "logits/rejected": -0.24194712936878204, "logps/chosen": -134.60165405273438, "logps/rejected": -166.09005737304688, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.021681595593690872, "rewards/margins": 0.012991715222597122, "rewards/rejected": 0.00868988037109375, "step": 32 }, { "epoch": 0.08, "learning_rate": 9.997398000904185e-08, "logits/chosen": -0.35478857159614563, "logits/rejected": -0.3133459687232971, "logps/chosen": -139.64395141601562, "logps/rejected": -163.80136108398438, "loss": 0.6932, "rewards/accuracies": 0.25, "rewards/chosen": -0.005427169613540173, "rewards/margins": -0.025377273559570312, "rewards/rejected": 0.019950103014707565, "step": 33 }, { "epoch": 0.08, "learning_rate": 9.996706920771024e-08, "logits/chosen": -0.4562496542930603, "logits/rejected": -0.4394756555557251, "logps/chosen": -142.20472717285156, "logps/rejected": -173.4610595703125, "loss": 0.6921, "rewards/accuracies": 0.25, "rewards/chosen": 0.01098785363137722, "rewards/margins": -0.00013465853407979012, "rewards/rejected": 0.011122512631118298, "step": 34 }, { "epoch": 0.09, "learning_rate": 9.995934574781107e-08, "logits/chosen": -0.2133214771747589, "logits/rejected": -0.19974152743816376, "logps/chosen": -147.60159301757812, "logps/rejected": -203.64663696289062, "loss": 0.6955, "rewards/accuracies": 0.75, "rewards/chosen": 0.00575332622975111, "rewards/margins": -0.005796052049845457, "rewards/rejected": 0.01154937781393528, "step": 35 }, { "epoch": 0.09, "learning_rate": 9.995080975495785e-08, "logits/chosen": -0.3352283835411072, "logits/rejected": -0.33982113003730774, "logps/chosen": -145.7166748046875, "logps/rejected": -152.26947021484375, "loss": 0.6985, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006116868462413549, "rewards/margins": -0.011242294684052467, "rewards/rejected": 0.011853981763124466, "step": 36 }, { "epoch": 0.09, "learning_rate": 9.994146136797892e-08, "logits/chosen": -0.2772245407104492, "logits/rejected": -0.25728997588157654, "logps/chosen": -150.11605834960938, "logps/rejected": -166.9167938232422, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": -0.018875883892178535, "rewards/margins": 0.0072044371627271175, "rewards/rejected": -0.02608032338321209, "step": 37 }, { "epoch": 0.09, "learning_rate": 9.993130073891539e-08, "logits/chosen": -0.19611378014087677, "logits/rejected": -0.20715470612049103, "logps/chosen": -166.33685302734375, "logps/rejected": -170.9593505859375, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": -0.019779587164521217, "rewards/margins": -0.001500321552157402, "rewards/rejected": -0.018279267475008965, "step": 38 }, { "epoch": 0.1, "learning_rate": 9.99203280330185e-08, "logits/chosen": -0.24442417919635773, "logits/rejected": -0.2567129135131836, "logps/chosen": -173.0302276611328, "logps/rejected": -176.58009338378906, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": 0.02691955491900444, "rewards/margins": 0.014598846435546875, "rewards/rejected": 0.01232070941478014, "step": 39 }, { "epoch": 0.1, "learning_rate": 9.990854342874711e-08, "logits/chosen": -0.3511047065258026, "logits/rejected": -0.35639142990112305, "logps/chosen": -156.0603485107422, "logps/rejected": -159.26405334472656, "loss": 0.6865, "rewards/accuracies": 0.25, "rewards/chosen": 0.0013706209138035774, "rewards/margins": -0.007876968942582607, "rewards/rejected": 0.00924758892506361, "step": 40 }, { "epoch": 0.1, "learning_rate": 9.98959471177646e-08, "logits/chosen": -0.2305653840303421, "logits/rejected": -0.23628276586532593, "logps/chosen": -149.8102264404297, "logps/rejected": -147.64048767089844, "loss": 0.697, "rewards/accuracies": 0.25, "rewards/chosen": -0.01943359524011612, "rewards/margins": -0.021520424634218216, "rewards/rejected": 0.0020868300925940275, "step": 41 }, { "epoch": 0.1, "learning_rate": 9.988253930493591e-08, "logits/chosen": -0.12143755704164505, "logits/rejected": -0.11446834355592728, "logps/chosen": -157.3408660888672, "logps/rejected": -162.25926208496094, "loss": 0.7011, "rewards/accuracies": 0.5, "rewards/chosen": 0.018420029431581497, "rewards/margins": 0.018954847007989883, "rewards/rejected": -0.0005348213016986847, "step": 42 }, { "epoch": 0.11, "learning_rate": 9.986832020832414e-08, "logits/chosen": -0.24778984487056732, "logits/rejected": -0.21429601311683655, "logps/chosen": -149.16366577148438, "logps/rejected": -185.1569366455078, "loss": 0.6908, "rewards/accuracies": 0.25, "rewards/chosen": -0.006295776925981045, "rewards/margins": -0.006380082573741674, "rewards/rejected": 8.430494926869869e-05, "step": 43 }, { "epoch": 0.11, "learning_rate": 9.985329005918702e-08, "logits/chosen": -0.25823649764060974, "logits/rejected": -0.2498096525669098, "logps/chosen": -162.36563110351562, "logps/rejected": -160.0772705078125, "loss": 0.6919, "rewards/accuracies": 0.75, "rewards/chosen": 0.021771620959043503, "rewards/margins": 0.018832018598914146, "rewards/rejected": 0.002939604688435793, "step": 44 }, { "epoch": 0.11, "learning_rate": 9.983744910197314e-08, "logits/chosen": -0.23687925934791565, "logits/rejected": -0.21196295320987701, "logps/chosen": -169.20556640625, "logps/rejected": -165.78797912597656, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.04402484744787216, "rewards/margins": 0.029891395941376686, "rewards/rejected": 0.014133453369140625, "step": 45 }, { "epoch": 0.11, "learning_rate": 9.982079759431795e-08, "logits/chosen": -0.21729090809822083, "logits/rejected": -0.18091966211795807, "logps/chosen": -126.867919921875, "logps/rejected": -155.25270080566406, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": 0.007825279608368874, "rewards/margins": 0.003461647778749466, "rewards/rejected": 0.004363631829619408, "step": 46 }, { "epoch": 0.12, "learning_rate": 9.980333580703966e-08, "logits/chosen": -0.22514842450618744, "logits/rejected": -0.21636372804641724, "logps/chosen": -151.24911499023438, "logps/rejected": -146.35067749023438, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.003968429286032915, "rewards/margins": -0.0016761773731559515, "rewards/rejected": -0.0022922521457076073, "step": 47 }, { "epoch": 0.12, "learning_rate": 9.97850640241347e-08, "logits/chosen": -0.21405835449695587, "logits/rejected": -0.17152239382266998, "logps/chosen": -146.0931854248047, "logps/rejected": -162.51242065429688, "loss": 0.6985, "rewards/accuracies": 0.5, "rewards/chosen": -0.005923651624470949, "rewards/margins": -0.025274472311139107, "rewards/rejected": 0.019350815564393997, "step": 48 }, { "epoch": 0.12, "learning_rate": 9.976598254277323e-08, "logits/chosen": -0.26784366369247437, "logits/rejected": -0.24137525260448456, "logps/chosen": -150.0193328857422, "logps/rejected": -148.4168243408203, "loss": 0.6923, "rewards/accuracies": 0.75, "rewards/chosen": 0.004791641142219305, "rewards/margins": 0.021852493286132812, "rewards/rejected": -0.01706085354089737, "step": 49 }, { "epoch": 0.12, "learning_rate": 9.974609167329423e-08, "logits/chosen": -0.18986591696739197, "logits/rejected": -0.1786285787820816, "logps/chosen": -180.76397705078125, "logps/rejected": -125.85525512695312, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": 0.012887192890048027, "rewards/margins": 0.02186756208539009, "rewards/rejected": -0.008980369195342064, "step": 50 }, { "epoch": 0.13, "learning_rate": 9.972539173920047e-08, "logits/chosen": -0.37089788913726807, "logits/rejected": -0.3884039521217346, "logps/chosen": -124.50447082519531, "logps/rejected": -162.40817260742188, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": 0.027512358501553535, "rewards/margins": 0.008275602012872696, "rewards/rejected": 0.01923675648868084, "step": 51 }, { "epoch": 0.13, "learning_rate": 9.970388307715326e-08, "logits/chosen": -0.38164597749710083, "logits/rejected": -0.39152416586875916, "logps/chosen": -134.7378387451172, "logps/rejected": -193.10971069335938, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.007334900088608265, "rewards/margins": -0.01278305146843195, "rewards/rejected": 0.020117951557040215, "step": 52 }, { "epoch": 0.13, "learning_rate": 9.968156603696693e-08, "logits/chosen": -0.2888343036174774, "logits/rejected": -0.27756690979003906, "logps/chosen": -163.6429443359375, "logps/rejected": -149.7277374267578, "loss": 0.6957, "rewards/accuracies": 0.25, "rewards/chosen": -0.0055141448974609375, "rewards/margins": -0.03826809301972389, "rewards/rejected": 0.032753944396972656, "step": 53 }, { "epoch": 0.13, "learning_rate": 9.965844098160325e-08, "logits/chosen": -0.29018983244895935, "logits/rejected": -0.2738547623157501, "logps/chosen": -162.8462371826172, "logps/rejected": -157.67333984375, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.016312791034579277, "rewards/margins": 0.04274292290210724, "rewards/rejected": -0.026430130004882812, "step": 54 }, { "epoch": 0.14, "learning_rate": 9.963450828716542e-08, "logits/chosen": -0.31751933693885803, "logits/rejected": -0.31416407227516174, "logps/chosen": -155.7648468017578, "logps/rejected": -199.19033813476562, "loss": 0.6915, "rewards/accuracies": 0.0, "rewards/chosen": -0.017585182562470436, "rewards/margins": -0.02729511260986328, "rewards/rejected": 0.009709930047392845, "step": 55 }, { "epoch": 0.14, "learning_rate": 9.960976834289197e-08, "logits/chosen": -0.26783809065818787, "logits/rejected": -0.27707698941230774, "logps/chosen": -154.0221405029297, "logps/rejected": -138.12969970703125, "loss": 0.6995, "rewards/accuracies": 0.5, "rewards/chosen": -0.011828422546386719, "rewards/margins": -0.019256021827459335, "rewards/rejected": 0.007427597418427467, "step": 56 }, { "epoch": 0.14, "learning_rate": 9.958422155115042e-08, "logits/chosen": -0.17630869150161743, "logits/rejected": -0.16279417276382446, "logps/chosen": -158.38980102539062, "logps/rejected": -160.736083984375, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014921189285814762, "rewards/margins": -0.0018571848049759865, "rewards/rejected": 0.0003650663420557976, "step": 57 }, { "epoch": 0.14, "learning_rate": 9.955786832743088e-08, "logits/chosen": -0.39351290464401245, "logits/rejected": -0.3709285855293274, "logps/chosen": -143.82757568359375, "logps/rejected": -155.3314971923828, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": 0.023796461522579193, "rewards/margins": 0.026210403069853783, "rewards/rejected": -0.0024139401502907276, "step": 58 }, { "epoch": 0.15, "learning_rate": 9.953070910033903e-08, "logits/chosen": -0.20270496606826782, "logits/rejected": -0.18284977972507477, "logps/chosen": -137.42755126953125, "logps/rejected": -156.3745574951172, "loss": 0.697, "rewards/accuracies": 0.5, "rewards/chosen": -0.01662578620016575, "rewards/margins": -0.013142586685717106, "rewards/rejected": -0.00348319998010993, "step": 59 }, { "epoch": 0.15, "learning_rate": 9.950274431158938e-08, "logits/chosen": -0.3322741389274597, "logits/rejected": -0.304931104183197, "logps/chosen": -168.2028350830078, "logps/rejected": -180.17543029785156, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 0.011013411916792393, "rewards/margins": 0.01991577073931694, "rewards/rejected": -0.008902359753847122, "step": 60 }, { "epoch": 0.15, "learning_rate": 9.947397441599799e-08, "logits/chosen": -0.3836684226989746, "logits/rejected": -0.3632274568080902, "logps/chosen": -156.5909881591797, "logps/rejected": -179.7236328125, "loss": 0.6916, "rewards/accuracies": 0.75, "rewards/chosen": 0.016911696642637253, "rewards/margins": 0.016721725463867188, "rewards/rejected": 0.00018997222650796175, "step": 61 }, { "epoch": 0.15, "learning_rate": 9.944439988147509e-08, "logits/chosen": -0.3882240355014801, "logits/rejected": -0.37218722701072693, "logps/chosen": -128.0855712890625, "logps/rejected": -146.33535766601562, "loss": 0.6968, "rewards/accuracies": 0.25, "rewards/chosen": -0.012256432324647903, "rewards/margins": -0.011716270819306374, "rewards/rejected": -0.0005401610396802425, "step": 62 }, { "epoch": 0.16, "learning_rate": 9.941402118901743e-08, "logits/chosen": -0.2441699057817459, "logits/rejected": -0.22044947743415833, "logps/chosen": -140.18943786621094, "logps/rejected": -153.59710693359375, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.01848926767706871, "rewards/margins": 0.009290125221014023, "rewards/rejected": 0.009199142456054688, "step": 63 }, { "epoch": 0.16, "learning_rate": 9.938283883270049e-08, "logits/chosen": -0.2405036985874176, "logits/rejected": -0.19697965681552887, "logps/chosen": -166.9427490234375, "logps/rejected": -202.09750366210938, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.000481033930554986, "rewards/margins": -0.015175246633589268, "rewards/rejected": 0.015656281262636185, "step": 64 }, { "epoch": 0.16, "learning_rate": 9.935085331967052e-08, "logits/chosen": -0.47255995869636536, "logits/rejected": -0.457670658826828, "logps/chosen": -138.23728942871094, "logps/rejected": -195.58535766601562, "loss": 0.6907, "rewards/accuracies": 0.75, "rewards/chosen": 0.026976395398378372, "rewards/margins": 0.034961700439453125, "rewards/rejected": -0.007985305972397327, "step": 65 }, { "epoch": 0.16, "learning_rate": 9.931806517013611e-08, "logits/chosen": -0.19689185917377472, "logits/rejected": -0.20818914473056793, "logps/chosen": -144.07293701171875, "logps/rejected": -138.6442413330078, "loss": 0.697, "rewards/accuracies": 0.5, "rewards/chosen": 0.011381912045180798, "rewards/margins": 0.013867570087313652, "rewards/rejected": -0.002485656877979636, "step": 66 }, { "epoch": 0.17, "learning_rate": 9.928447491735991e-08, "logits/chosen": -0.3118274509906769, "logits/rejected": -0.28908345103263855, "logps/chosen": -162.4678955078125, "logps/rejected": -162.7849578857422, "loss": 0.6866, "rewards/accuracies": 0.75, "rewards/chosen": 0.030529404059052467, "rewards/margins": 0.026997758075594902, "rewards/rejected": 0.0035316459834575653, "step": 67 }, { "epoch": 0.17, "learning_rate": 9.925008310764987e-08, "logits/chosen": -0.2185005098581314, "logits/rejected": -0.22403375804424286, "logps/chosen": -125.71537017822266, "logps/rejected": -202.24842834472656, "loss": 0.6998, "rewards/accuracies": 0.25, "rewards/chosen": -0.016918182373046875, "rewards/margins": -0.027882765978574753, "rewards/rejected": 0.010964585468173027, "step": 68 }, { "epoch": 0.17, "learning_rate": 9.921489030035036e-08, "logits/chosen": -0.3686932623386383, "logits/rejected": -0.32250848412513733, "logps/chosen": -146.2086639404297, "logps/rejected": -174.25192260742188, "loss": 0.6933, "rewards/accuracies": 0.75, "rewards/chosen": 0.0022951122373342514, "rewards/margins": 0.01683788374066353, "rewards/rejected": -0.014542770572006702, "step": 69 }, { "epoch": 0.17, "learning_rate": 9.917889706783303e-08, "logits/chosen": -0.1994059681892395, "logits/rejected": -0.16973429918289185, "logps/chosen": -174.61605834960938, "logps/rejected": -196.6368408203125, "loss": 0.6891, "rewards/accuracies": 0.75, "rewards/chosen": -1.2779724784195423e-05, "rewards/margins": 0.0027841562405228615, "rewards/rejected": -0.0027969353832304478, "step": 70 }, { "epoch": 0.18, "learning_rate": 9.914210399548767e-08, "logits/chosen": -0.16957195103168488, "logits/rejected": -0.15411511063575745, "logps/chosen": -124.71367645263672, "logps/rejected": -165.56991577148438, "loss": 0.6933, "rewards/accuracies": 0.25, "rewards/chosen": -0.01412220112979412, "rewards/margins": -0.02884845621883869, "rewards/rejected": 0.01472625695168972, "step": 71 }, { "epoch": 0.18, "learning_rate": 9.910451168171247e-08, "logits/chosen": -0.4468238651752472, "logits/rejected": -0.41528990864753723, "logps/chosen": -154.30038452148438, "logps/rejected": -192.40492248535156, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008886344730854034, "rewards/margins": -0.006963921710848808, "rewards/rejected": 0.006075287237763405, "step": 72 }, { "epoch": 0.18, "learning_rate": 9.906612073790442e-08, "logits/chosen": -0.28385451436042786, "logits/rejected": -0.2817462384700775, "logps/chosen": -141.93038940429688, "logps/rejected": -141.3412628173828, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": -0.010276030749082565, "rewards/margins": 0.007571985013782978, "rewards/rejected": -0.017848016694188118, "step": 73 }, { "epoch": 0.18, "learning_rate": 9.902693178844935e-08, "logits/chosen": -0.3422333896160126, "logits/rejected": -0.3018242120742798, "logps/chosen": -180.1243896484375, "logps/rejected": -197.69857788085938, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.027043532580137253, "rewards/margins": 0.06180611252784729, "rewards/rejected": -0.03476257249712944, "step": 74 }, { "epoch": 0.19, "learning_rate": 9.898694547071176e-08, "logits/chosen": -0.253986656665802, "logits/rejected": -0.2582909166812897, "logps/chosen": -156.06793212890625, "logps/rejected": -147.52340698242188, "loss": 0.6914, "rewards/accuracies": 0.75, "rewards/chosen": -0.012973977252840996, "rewards/margins": 0.00835113599896431, "rewards/rejected": -0.021325113251805305, "step": 75 }, { "epoch": 0.19, "learning_rate": 9.89461624350244e-08, "logits/chosen": -0.3354797661304474, "logits/rejected": -0.3117402493953705, "logps/chosen": -170.35955810546875, "logps/rejected": -162.82464599609375, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": 0.008334731683135033, "rewards/margins": -0.008158111944794655, "rewards/rejected": 0.016492843627929688, "step": 76 }, { "epoch": 0.19, "learning_rate": 9.890458334467784e-08, "logits/chosen": -0.25440508127212524, "logits/rejected": -0.23017673194408417, "logps/chosen": -144.4886474609375, "logps/rejected": -145.5629119873047, "loss": 0.6907, "rewards/accuracies": 0.0, "rewards/chosen": -0.039559364318847656, "rewards/margins": -0.04143714904785156, "rewards/rejected": 0.00187778496183455, "step": 77 }, { "epoch": 0.19, "learning_rate": 9.886220887590952e-08, "logits/chosen": -0.24799248576164246, "logits/rejected": -0.21055585145950317, "logps/chosen": -144.97344970703125, "logps/rejected": -174.5103302001953, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018579494208097458, "rewards/margins": 0.01112346537411213, "rewards/rejected": -0.012981414794921875, "step": 78 }, { "epoch": 0.2, "learning_rate": 9.881903971789284e-08, "logits/chosen": -0.24160897731781006, "logits/rejected": -0.20235231518745422, "logps/chosen": -153.71102905273438, "logps/rejected": -167.36981201171875, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": -0.02213592454791069, "rewards/margins": -0.01738414727151394, "rewards/rejected": -0.004751777276396751, "step": 79 }, { "epoch": 0.2, "learning_rate": 9.877507657272594e-08, "logits/chosen": -0.3746883273124695, "logits/rejected": -0.355406254529953, "logps/chosen": -155.84442138671875, "logps/rejected": -156.59449768066406, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": -0.02110767364501953, "rewards/margins": -0.008690644055604935, "rewards/rejected": -0.012417029589414597, "step": 80 }, { "epoch": 0.2, "learning_rate": 9.873032015542027e-08, "logits/chosen": -0.1153663694858551, "logits/rejected": -0.09608101844787598, "logps/chosen": -157.50933837890625, "logps/rejected": -145.6007843017578, "loss": 0.689, "rewards/accuracies": 0.75, "rewards/chosen": -0.004255866166204214, "rewards/margins": 0.004931452218443155, "rewards/rejected": -0.00918731838464737, "step": 81 }, { "epoch": 0.2, "learning_rate": 9.868477119388895e-08, "logits/chosen": -0.1781589686870575, "logits/rejected": -0.14442168176174164, "logps/chosen": -156.14892578125, "logps/rejected": -146.93206787109375, "loss": 0.6995, "rewards/accuracies": 0.5, "rewards/chosen": -0.008284377865493298, "rewards/margins": -0.00472641084343195, "rewards/rejected": -0.0035579672548919916, "step": 82 }, { "epoch": 0.21, "learning_rate": 9.863843042893498e-08, "logits/chosen": -0.4777001738548279, "logits/rejected": -0.4727758467197418, "logps/chosen": -182.97569274902344, "logps/rejected": -158.31190490722656, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006984705105423927, "rewards/margins": 0.024896621704101562, "rewards/rejected": -0.024198152124881744, "step": 83 }, { "epoch": 0.21, "learning_rate": 9.859129861423913e-08, "logits/chosen": -0.371181845664978, "logits/rejected": -0.3538655936717987, "logps/chosen": -176.64227294921875, "logps/rejected": -155.55096435546875, "loss": 0.6934, "rewards/accuracies": 0.25, "rewards/chosen": -0.039404675364494324, "rewards/margins": -0.046667858958244324, "rewards/rejected": 0.007263181731104851, "step": 84 }, { "epoch": 0.21, "learning_rate": 9.854337651634772e-08, "logits/chosen": -0.41097021102905273, "logits/rejected": -0.3717851936817169, "logps/chosen": -111.05171203613281, "logps/rejected": -170.90025329589844, "loss": 0.6912, "rewards/accuracies": 0.75, "rewards/chosen": -0.008296394720673561, "rewards/margins": -0.00039367692079395056, "rewards/rejected": -0.007902718149125576, "step": 85 }, { "epoch": 0.21, "learning_rate": 9.849466491466016e-08, "logits/chosen": -0.32062047719955444, "logits/rejected": -0.2680882215499878, "logps/chosen": -162.65292358398438, "logps/rejected": -142.61094665527344, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": 0.010434724390506744, "rewards/margins": 0.03001098334789276, "rewards/rejected": -0.019576262682676315, "step": 86 }, { "epoch": 0.22, "learning_rate": 9.844516460141621e-08, "logits/chosen": -0.10039065033197403, "logits/rejected": -0.07310805469751358, "logps/chosen": -133.62246704101562, "logps/rejected": -179.91407775878906, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": -0.006416702177375555, "rewards/margins": -0.0005016322247684002, "rewards/rejected": -0.005915070418268442, "step": 87 }, { "epoch": 0.22, "learning_rate": 9.83948763816832e-08, "logits/chosen": -0.2936316728591919, "logits/rejected": -0.29500117897987366, "logps/chosen": -125.15013885498047, "logps/rejected": -190.67715454101562, "loss": 0.6957, "rewards/accuracies": 0.25, "rewards/chosen": 0.012711906805634499, "rewards/margins": -0.026453591883182526, "rewards/rejected": 0.039165496826171875, "step": 88 }, { "epoch": 0.22, "learning_rate": 9.834380107334284e-08, "logits/chosen": -0.4575427174568176, "logits/rejected": -0.4617311358451843, "logps/chosen": -150.00390625, "logps/rejected": -148.1856689453125, "loss": 0.6907, "rewards/accuracies": 0.75, "rewards/chosen": 0.029901504516601562, "rewards/margins": 0.0031476952135562897, "rewards/rejected": 0.026753809303045273, "step": 89 }, { "epoch": 0.22, "learning_rate": 9.829193950707798e-08, "logits/chosen": -0.41929033398628235, "logits/rejected": -0.4043019711971283, "logps/chosen": -137.93710327148438, "logps/rejected": -153.40020751953125, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.019017411395907402, "rewards/margins": 0.015438269823789597, "rewards/rejected": 0.0035791397094726562, "step": 90 }, { "epoch": 0.23, "learning_rate": 9.823929252635903e-08, "logits/chosen": -0.36602362990379333, "logits/rejected": -0.3416110873222351, "logps/chosen": -148.98875427246094, "logps/rejected": -187.26405334472656, "loss": 0.6946, "rewards/accuracies": 0.25, "rewards/chosen": -0.02917938306927681, "rewards/margins": -0.04241008684039116, "rewards/rejected": 0.013230707496404648, "step": 91 }, { "epoch": 0.23, "learning_rate": 9.818586098743036e-08, "logits/chosen": -0.2519412040710449, "logits/rejected": -0.25213170051574707, "logps/chosen": -146.01922607421875, "logps/rejected": -148.7308807373047, "loss": 0.6906, "rewards/accuracies": 0.75, "rewards/chosen": 0.007200623396784067, "rewards/margins": 0.02349071577191353, "rewards/rejected": -0.0162900909781456, "step": 92 }, { "epoch": 0.23, "learning_rate": 9.813164575929627e-08, "logits/chosen": -0.3663918673992157, "logits/rejected": -0.3760688900947571, "logps/chosen": -139.53932189941406, "logps/rejected": -167.04995727539062, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.0013017654418945312, "rewards/margins": 0.018860628828406334, "rewards/rejected": -0.017558859661221504, "step": 93 }, { "epoch": 0.23, "learning_rate": 9.807664772370687e-08, "logits/chosen": -0.3947628438472748, "logits/rejected": -0.40614479780197144, "logps/chosen": -172.17037963867188, "logps/rejected": -181.38906860351562, "loss": 0.6955, "rewards/accuracies": 0.5, "rewards/chosen": -0.007013130933046341, "rewards/margins": -0.016595270484685898, "rewards/rejected": 0.009582138620316982, "step": 94 }, { "epoch": 0.24, "learning_rate": 9.80208677751438e-08, "logits/chosen": -0.26973527669906616, "logits/rejected": -0.27586808800697327, "logps/chosen": -155.80996704101562, "logps/rejected": -170.13143920898438, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": 0.008857345208525658, "rewards/margins": 0.0027519213035702705, "rewards/rejected": 0.006105423904955387, "step": 95 }, { "epoch": 0.24, "learning_rate": 9.796430682080559e-08, "logits/chosen": -0.30572283267974854, "logits/rejected": -0.3045867383480072, "logps/chosen": -151.87255859375, "logps/rejected": -163.6187744140625, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": -0.013500021770596504, "rewards/margins": -0.016722295433282852, "rewards/rejected": 0.00322227505967021, "step": 96 }, { "epoch": 0.24, "learning_rate": 9.790696578059299e-08, "logits/chosen": -0.42095860838890076, "logits/rejected": -0.33871662616729736, "logps/chosen": -176.0961151123047, "logps/rejected": -140.3403778076172, "loss": 0.691, "rewards/accuracies": 0.25, "rewards/chosen": -0.00157241802662611, "rewards/margins": -0.006381798069924116, "rewards/rejected": 0.004809379577636719, "step": 97 }, { "epoch": 0.24, "learning_rate": 9.784884558709397e-08, "logits/chosen": -0.43498530983924866, "logits/rejected": -0.417625367641449, "logps/chosen": -141.5167236328125, "logps/rejected": -145.97329711914062, "loss": 0.6919, "rewards/accuracies": 0.25, "rewards/chosen": -0.014270974323153496, "rewards/margins": -0.020430946722626686, "rewards/rejected": 0.006159973796457052, "step": 98 }, { "epoch": 0.25, "learning_rate": 9.778994718556856e-08, "logits/chosen": -0.1744564175605774, "logits/rejected": -0.17072224617004395, "logps/chosen": -118.09506225585938, "logps/rejected": -154.33230590820312, "loss": 0.6952, "rewards/accuracies": 0.25, "rewards/chosen": -0.011281395331025124, "rewards/margins": 0.0008911131881177425, "rewards/rejected": -0.012172508984804153, "step": 99 }, { "epoch": 0.25, "learning_rate": 9.773027153393347e-08, "logits/chosen": -0.25895658135414124, "logits/rejected": -0.23419491946697235, "logps/chosen": -131.33847045898438, "logps/rejected": -161.32183837890625, "loss": 0.6919, "rewards/accuracies": 0.75, "rewards/chosen": 0.033390045166015625, "rewards/margins": 0.03457527235150337, "rewards/rejected": -0.0011852262541651726, "step": 100 }, { "epoch": 0.25, "learning_rate": 9.766981960274652e-08, "logits/chosen": -0.1907324343919754, "logits/rejected": -0.186609148979187, "logps/chosen": -150.22177124023438, "logps/rejected": -190.63943481445312, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.013790512457489967, "rewards/margins": 0.001450730487704277, "rewards/rejected": 0.01233978196978569, "step": 101 }, { "epoch": 0.25, "learning_rate": 9.760859237519086e-08, "logits/chosen": -0.4652169942855835, "logits/rejected": -0.443112850189209, "logps/chosen": -126.5323715209961, "logps/rejected": -171.57228088378906, "loss": 0.6922, "rewards/accuracies": 0.25, "rewards/chosen": -0.03251476213335991, "rewards/margins": -0.026034735143184662, "rewards/rejected": -0.006480026990175247, "step": 102 }, { "epoch": 0.26, "learning_rate": 9.754659084705892e-08, "logits/chosen": -0.34174174070358276, "logits/rejected": -0.29712915420532227, "logps/chosen": -155.11158752441406, "logps/rejected": -183.2632293701172, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.00024299649521708488, "rewards/margins": 0.04131794348359108, "rewards/rejected": -0.04107494279742241, "step": 103 }, { "epoch": 0.26, "learning_rate": 9.748381602673633e-08, "logits/chosen": -0.3331354856491089, "logits/rejected": -0.31883135437965393, "logps/chosen": -164.22824096679688, "logps/rejected": -174.50527954101562, "loss": 0.6968, "rewards/accuracies": 0.75, "rewards/chosen": -0.011514663696289062, "rewards/margins": 0.015154839493334293, "rewards/rejected": -0.02666950225830078, "step": 104 }, { "epoch": 0.26, "learning_rate": 9.74202689351854e-08, "logits/chosen": -0.28574031591415405, "logits/rejected": -0.2337077260017395, "logps/chosen": -141.7529296875, "logps/rejected": -154.24615478515625, "loss": 0.6954, "rewards/accuracies": 0.25, "rewards/chosen": -0.012181473895907402, "rewards/margins": -0.015651894733309746, "rewards/rejected": 0.0034704208374023438, "step": 105 }, { "epoch": 0.26, "learning_rate": 9.735595060592861e-08, "logits/chosen": -0.25011715292930603, "logits/rejected": -0.2584919035434723, "logps/chosen": -149.55274963378906, "logps/rejected": -142.13572692871094, "loss": 0.6979, "rewards/accuracies": 0.25, "rewards/chosen": 0.0004024496302008629, "rewards/margins": -0.003561209887266159, "rewards/rejected": 0.003963661380112171, "step": 106 }, { "epoch": 0.27, "learning_rate": 9.729086208503173e-08, "logits/chosen": -0.43358027935028076, "logits/rejected": -0.4148816764354706, "logps/chosen": -145.31915283203125, "logps/rejected": -191.8268280029297, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": -0.015350341796875, "rewards/margins": -0.03379669412970543, "rewards/rejected": 0.01844635047018528, "step": 107 }, { "epoch": 0.27, "learning_rate": 9.722500443108686e-08, "logits/chosen": -0.19421431422233582, "logits/rejected": -0.17046543955802917, "logps/chosen": -141.7122802734375, "logps/rejected": -196.25497436523438, "loss": 0.6904, "rewards/accuracies": 0.25, "rewards/chosen": -0.007384109310805798, "rewards/margins": -0.019423672929406166, "rewards/rejected": 0.012039565481245518, "step": 108 }, { "epoch": 0.27, "learning_rate": 9.715837871519516e-08, "logits/chosen": -0.22910308837890625, "logits/rejected": -0.21572650969028473, "logps/chosen": -164.15652465820312, "logps/rejected": -161.12728881835938, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": 0.011440087109804153, "rewards/margins": 0.028164861723780632, "rewards/rejected": -0.016724776476621628, "step": 109 }, { "epoch": 0.27, "learning_rate": 9.709098602094951e-08, "logits/chosen": -0.336404949426651, "logits/rejected": -0.3283347487449646, "logps/chosen": -138.89793395996094, "logps/rejected": -153.00485229492188, "loss": 0.6977, "rewards/accuracies": 0.5, "rewards/chosen": 0.006772804073989391, "rewards/margins": 0.014703558757901192, "rewards/rejected": -0.007930755615234375, "step": 110 }, { "epoch": 0.28, "learning_rate": 9.702282744441679e-08, "logits/chosen": -0.3401263654232025, "logits/rejected": -0.3445747196674347, "logps/chosen": -156.12765502929688, "logps/rejected": -166.0361785888672, "loss": 0.6977, "rewards/accuracies": 0.5, "rewards/chosen": 0.024407576769590378, "rewards/margins": 0.021756168454885483, "rewards/rejected": 0.002651405520737171, "step": 111 }, { "epoch": 0.28, "learning_rate": 9.69539040941201e-08, "logits/chosen": -0.16539815068244934, "logits/rejected": -0.20632390677928925, "logps/chosen": -159.5687255859375, "logps/rejected": -161.35906982421875, "loss": 0.6979, "rewards/accuracies": 0.5, "rewards/chosen": -0.004578590393066406, "rewards/margins": -0.01151371095329523, "rewards/rejected": 0.006935120094567537, "step": 112 }, { "epoch": 0.28, "learning_rate": 9.688421709102075e-08, "logits/chosen": -0.3966646194458008, "logits/rejected": -0.3915117383003235, "logps/chosen": -150.79624938964844, "logps/rejected": -178.249755859375, "loss": 0.6941, "rewards/accuracies": 0.25, "rewards/chosen": 0.008501816540956497, "rewards/margins": -0.03547515720129013, "rewards/rejected": 0.043976977467536926, "step": 113 }, { "epoch": 0.28, "learning_rate": 9.681376756850002e-08, "logits/chosen": -0.6171132922172546, "logits/rejected": -0.5883930325508118, "logps/chosen": -164.0187225341797, "logps/rejected": -162.74098205566406, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005641935858875513, "rewards/margins": 0.0006298072403296828, "rewards/rejected": -0.0011940003605559468, "step": 114 }, { "epoch": 0.29, "learning_rate": 9.674255667234069e-08, "logits/chosen": -0.21323342621326447, "logits/rejected": -0.19038806855678558, "logps/chosen": -135.5599822998047, "logps/rejected": -149.8820037841797, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.0022590633016079664, "rewards/margins": -0.029465101659297943, "rewards/rejected": 0.03172416612505913, "step": 115 }, { "epoch": 0.29, "learning_rate": 9.667058556070844e-08, "logits/chosen": -0.31623342633247375, "logits/rejected": -0.29219749569892883, "logps/chosen": -145.1917724609375, "logps/rejected": -204.3798828125, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.0018198012840002775, "rewards/margins": 0.0549963004887104, "rewards/rejected": -0.05317649990320206, "step": 116 }, { "epoch": 0.29, "learning_rate": 9.659785540413302e-08, "logits/chosen": -0.39331501722335815, "logits/rejected": -0.3952291011810303, "logps/chosen": -146.72683715820312, "logps/rejected": -153.60067749023438, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": -0.00435905484482646, "rewards/margins": -0.011377525515854359, "rewards/rejected": 0.007018471602350473, "step": 117 }, { "epoch": 0.29, "learning_rate": 9.652436738548917e-08, "logits/chosen": -0.36518093943595886, "logits/rejected": -0.3534892499446869, "logps/chosen": -179.52780151367188, "logps/rejected": -183.69444274902344, "loss": 0.6894, "rewards/accuracies": 0.25, "rewards/chosen": -0.0012176514137536287, "rewards/margins": -0.012096976861357689, "rewards/rejected": 0.010879327543079853, "step": 118 }, { "epoch": 0.3, "learning_rate": 9.645012269997745e-08, "logits/chosen": -0.18816417455673218, "logits/rejected": -0.12960399687290192, "logps/chosen": -155.5513153076172, "logps/rejected": -140.69244384765625, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": 0.006253814324736595, "rewards/margins": 0.0026994701474905014, "rewards/rejected": 0.0035543441772460938, "step": 119 }, { "epoch": 0.3, "learning_rate": 9.637512255510474e-08, "logits/chosen": -0.5402763485908508, "logits/rejected": -0.5160578489303589, "logps/chosen": -154.0438995361328, "logps/rejected": -157.64151000976562, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": -0.029417801648378372, "rewards/margins": -0.013045500963926315, "rewards/rejected": -0.016372298821806908, "step": 120 }, { "epoch": 0.3, "learning_rate": 9.629936817066458e-08, "logits/chosen": -0.21445423364639282, "logits/rejected": -0.21844062209129333, "logps/chosen": -118.53437805175781, "logps/rejected": -141.606201171875, "loss": 0.6874, "rewards/accuracies": 0.0, "rewards/chosen": -0.021203424781560898, "rewards/margins": -0.012639617547392845, "rewards/rejected": -0.008563803508877754, "step": 121 }, { "epoch": 0.3, "learning_rate": 9.622286077871746e-08, "logits/chosen": -0.21758998930454254, "logits/rejected": -0.16971047222614288, "logps/chosen": -150.9545135498047, "logps/rejected": -161.45169067382812, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.0012676240876317024, "rewards/margins": 0.02548503875732422, "rewards/rejected": -0.02421741560101509, "step": 122 }, { "epoch": 0.31, "learning_rate": 9.614560162357064e-08, "logits/chosen": -0.2550797760486603, "logits/rejected": -0.2329166978597641, "logps/chosen": -133.5430908203125, "logps/rejected": -159.45338439941406, "loss": 0.7005, "rewards/accuracies": 0.5, "rewards/chosen": -0.017516708001494408, "rewards/margins": -0.03262748941779137, "rewards/rejected": 0.01511077955365181, "step": 123 }, { "epoch": 0.31, "learning_rate": 9.606759196175797e-08, "logits/chosen": -0.3040820062160492, "logits/rejected": -0.2687731981277466, "logps/chosen": -154.45071411132812, "logps/rejected": -192.50515747070312, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.022226333618164062, "rewards/margins": 0.029782678931951523, "rewards/rejected": -0.007556342985481024, "step": 124 }, { "epoch": 0.31, "learning_rate": 9.598883306201948e-08, "logits/chosen": -0.23629455268383026, "logits/rejected": -0.21156544983386993, "logps/chosen": -152.44081115722656, "logps/rejected": -194.99188232421875, "loss": 0.6973, "rewards/accuracies": 0.25, "rewards/chosen": -0.010418510064482689, "rewards/margins": -0.03306408226490021, "rewards/rejected": 0.02264557033777237, "step": 125 }, { "epoch": 0.31, "learning_rate": 9.590932620528068e-08, "logits/chosen": -0.25638389587402344, "logits/rejected": -0.24923522770404816, "logps/chosen": -146.87020874023438, "logps/rejected": -176.79733276367188, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.023680686950683594, "rewards/margins": 0.046227842569351196, "rewards/rejected": -0.022547150030732155, "step": 126 }, { "epoch": 0.32, "learning_rate": 9.582907268463178e-08, "logits/chosen": -0.15167850255966187, "logits/rejected": -0.14892958104610443, "logps/chosen": -166.8184051513672, "logps/rejected": -161.88340759277344, "loss": 0.6877, "rewards/accuracies": 0.75, "rewards/chosen": -0.009617231786251068, "rewards/margins": 0.0010597240179777145, "rewards/rejected": -0.010676956735551357, "step": 127 }, { "epoch": 0.32, "learning_rate": 9.574807380530669e-08, "logits/chosen": -0.2745174169540405, "logits/rejected": -0.24899765849113464, "logps/chosen": -145.4764862060547, "logps/rejected": -175.3505401611328, "loss": 0.6909, "rewards/accuracies": 0.75, "rewards/chosen": 0.010774994269013405, "rewards/margins": 0.011483765207231045, "rewards/rejected": -0.0007087700068950653, "step": 128 }, { "epoch": 0.32, "learning_rate": 9.566633088466167e-08, "logits/chosen": -0.3516853451728821, "logits/rejected": -0.3166256844997406, "logps/chosen": -136.60403442382812, "logps/rejected": -176.60012817382812, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": -0.011772537603974342, "rewards/margins": -0.002423095516860485, "rewards/rejected": -0.009349441155791283, "step": 129 }, { "epoch": 0.32, "learning_rate": 9.558384525215404e-08, "logits/chosen": -0.24614661931991577, "logits/rejected": -0.23213164508342743, "logps/chosen": -144.8609619140625, "logps/rejected": -169.16017150878906, "loss": 0.701, "rewards/accuracies": 0.25, "rewards/chosen": -0.0057468414306640625, "rewards/margins": -0.02018890343606472, "rewards/rejected": 0.014442062936723232, "step": 130 }, { "epoch": 0.33, "learning_rate": 9.550061824932045e-08, "logits/chosen": -0.2766852080821991, "logits/rejected": -0.2566676437854767, "logps/chosen": -122.31771850585938, "logps/rejected": -132.9036865234375, "loss": 0.6854, "rewards/accuracies": 0.25, "rewards/chosen": -0.008678436279296875, "rewards/margins": -0.015573501586914062, "rewards/rejected": 0.0068950653076171875, "step": 131 }, { "epoch": 0.33, "learning_rate": 9.541665122975518e-08, "logits/chosen": -0.26356223225593567, "logits/rejected": -0.24112221598625183, "logps/chosen": -191.07040405273438, "logps/rejected": -153.02310180664062, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.02226867713034153, "rewards/margins": 0.045044708997011185, "rewards/rejected": -0.022776031866669655, "step": 132 }, { "epoch": 0.33, "learning_rate": 9.533194555908796e-08, "logits/chosen": -0.31920361518859863, "logits/rejected": -0.30487146973609924, "logps/chosen": -114.75049591064453, "logps/rejected": -159.18466186523438, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.01940021477639675, "rewards/margins": 0.03252658620476723, "rewards/rejected": -0.013126373291015625, "step": 133 }, { "epoch": 0.33, "learning_rate": 9.524650261496195e-08, "logits/chosen": -0.270704984664917, "logits/rejected": -0.2773374915122986, "logps/chosen": -182.27671813964844, "logps/rejected": -189.02906799316406, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": 0.010899734683334827, "rewards/margins": 0.015352440066635609, "rewards/rejected": -0.004452704451978207, "step": 134 }, { "epoch": 0.34, "learning_rate": 9.516032378701115e-08, "logits/chosen": -0.5010266900062561, "logits/rejected": -0.48914676904678345, "logps/chosen": -192.9725799560547, "logps/rejected": -191.6255340576172, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035388946998864412, "rewards/margins": 0.007260513957589865, "rewards/rejected": -0.01079940889030695, "step": 135 }, { "epoch": 0.34, "learning_rate": 9.507341047683799e-08, "logits/chosen": -0.1816280484199524, "logits/rejected": -0.17018242180347443, "logps/chosen": -138.62155151367188, "logps/rejected": -141.3739776611328, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.007838058285415173, "rewards/margins": 0.04363422468304634, "rewards/rejected": -0.035796165466308594, "step": 136 }, { "epoch": 0.34, "learning_rate": 9.498576409799034e-08, "logits/chosen": -0.10129430890083313, "logits/rejected": -0.10133526474237442, "logps/chosen": -139.69415283203125, "logps/rejected": -182.526123046875, "loss": 0.6881, "rewards/accuracies": 0.25, "rewards/chosen": -0.0030639651231467724, "rewards/margins": -0.012491228990256786, "rewards/rejected": 0.009427262470126152, "step": 137 }, { "epoch": 0.34, "learning_rate": 9.489738607593866e-08, "logits/chosen": -0.28406083583831787, "logits/rejected": -0.2547295391559601, "logps/chosen": -122.43465423583984, "logps/rejected": -148.68115234375, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.006805801298469305, "rewards/margins": 0.04203071817755699, "rewards/rejected": -0.03522491455078125, "step": 138 }, { "epoch": 0.35, "learning_rate": 9.480827784805277e-08, "logits/chosen": -0.3729887306690216, "logits/rejected": -0.3755929470062256, "logps/chosen": -145.5623016357422, "logps/rejected": -152.45726013183594, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.0043357862159609795, "rewards/margins": -0.005061530973762274, "rewards/rejected": 0.009397316724061966, "step": 139 }, { "epoch": 0.35, "learning_rate": 9.471844086357846e-08, "logits/chosen": -0.24995191395282745, "logits/rejected": -0.24493180215358734, "logps/chosen": -142.64501953125, "logps/rejected": -148.45814514160156, "loss": 0.7003, "rewards/accuracies": 0.5, "rewards/chosen": -0.016196060925722122, "rewards/margins": -0.015368842519819736, "rewards/rejected": -0.0008272160775959492, "step": 140 }, { "epoch": 0.35, "learning_rate": 9.462787658361393e-08, "logits/chosen": -0.3065277338027954, "logits/rejected": -0.3245694935321808, "logps/chosen": -143.80792236328125, "logps/rejected": -169.42396545410156, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": 0.004491806495934725, "rewards/margins": 0.0051105511374771595, "rewards/rejected": -0.0006187441758811474, "step": 141 }, { "epoch": 0.35, "learning_rate": 9.453658648108604e-08, "logits/chosen": -0.3156987726688385, "logits/rejected": -0.291363924741745, "logps/chosen": -135.6706085205078, "logps/rejected": -147.3319549560547, "loss": 0.6997, "rewards/accuracies": 0.5, "rewards/chosen": 0.002561187371611595, "rewards/margins": -0.019367408007383347, "rewards/rejected": 0.02192859724164009, "step": 142 }, { "epoch": 0.36, "learning_rate": 9.444457204072632e-08, "logits/chosen": -0.22625431418418884, "logits/rejected": -0.2325943261384964, "logps/chosen": -146.1162109375, "logps/rejected": -154.71800231933594, "loss": 0.6931, "rewards/accuracies": 0.75, "rewards/chosen": 0.021827315911650658, "rewards/margins": 0.04821052402257919, "rewards/rejected": -0.026383209973573685, "step": 143 }, { "epoch": 0.36, "learning_rate": 9.435183475904687e-08, "logits/chosen": -0.3245464265346527, "logits/rejected": -0.3040432035923004, "logps/chosen": -135.3441162109375, "logps/rejected": -145.66500854492188, "loss": 0.6899, "rewards/accuracies": 0.0, "rewards/chosen": -0.003575134091079235, "rewards/margins": -0.040207862854003906, "rewards/rejected": 0.0366327278316021, "step": 144 }, { "epoch": 0.36, "learning_rate": 9.4258376144316e-08, "logits/chosen": -0.2764669954776764, "logits/rejected": -0.24179092049598694, "logps/chosen": -188.1370086669922, "logps/rejected": -180.19805908203125, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.0003816606476902962, "rewards/margins": 0.01737804524600506, "rewards/rejected": -0.016996383666992188, "step": 145 }, { "epoch": 0.36, "learning_rate": 9.416419771653367e-08, "logits/chosen": -0.28469136357307434, "logits/rejected": -0.2549521327018738, "logps/chosen": -163.19927978515625, "logps/rejected": -171.7887725830078, "loss": 0.6957, "rewards/accuracies": 0.5, "rewards/chosen": 0.01136550959199667, "rewards/margins": 0.014376259408891201, "rewards/rejected": -0.003010750748217106, "step": 146 }, { "epoch": 0.37, "learning_rate": 9.406930100740684e-08, "logits/chosen": -0.13574805855751038, "logits/rejected": -0.13847067952156067, "logps/chosen": -143.40440368652344, "logps/rejected": -158.70094299316406, "loss": 0.6873, "rewards/accuracies": 0.75, "rewards/chosen": 0.010013963095843792, "rewards/margins": 0.03662509843707085, "rewards/rejected": -0.026611139997839928, "step": 147 }, { "epoch": 0.37, "learning_rate": 9.397368756032444e-08, "logits/chosen": -0.2812151312828064, "logits/rejected": -0.24271562695503235, "logps/chosen": -150.60772705078125, "logps/rejected": -156.51295471191406, "loss": 0.6913, "rewards/accuracies": 0.25, "rewards/chosen": -0.00368576031178236, "rewards/margins": -0.027833174914121628, "rewards/rejected": 0.02414741739630699, "step": 148 }, { "epoch": 0.37, "learning_rate": 9.387735893033243e-08, "logits/chosen": -0.3302271366119385, "logits/rejected": -0.3157057762145996, "logps/chosen": -143.56289672851562, "logps/rejected": -173.41888427734375, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": 0.0072498326189816, "rewards/margins": 0.0020248424261808395, "rewards/rejected": 0.005224990658462048, "step": 149 }, { "epoch": 0.37, "learning_rate": 9.378031668410835e-08, "logits/chosen": -0.34182924032211304, "logits/rejected": -0.33371883630752563, "logps/chosen": -129.1358642578125, "logps/rejected": -139.93710327148438, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": 0.004435348324477673, "rewards/margins": -0.004554747603833675, "rewards/rejected": 0.008990096859633923, "step": 150 }, { "epoch": 0.38, "learning_rate": 9.368256239993596e-08, "logits/chosen": -0.42186275124549866, "logits/rejected": -0.41082480549812317, "logps/chosen": -150.76596069335938, "logps/rejected": -181.23046875, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.004304314963519573, "rewards/margins": -0.012043571099638939, "rewards/rejected": 0.016347885131835938, "step": 151 }, { "epoch": 0.38, "learning_rate": 9.358409766767945e-08, "logits/chosen": -0.2442975640296936, "logits/rejected": -0.2554421126842499, "logps/chosen": -145.49732971191406, "logps/rejected": -170.7120819091797, "loss": 0.6952, "rewards/accuracies": 0.25, "rewards/chosen": -0.008858299814164639, "rewards/margins": -0.03100299835205078, "rewards/rejected": 0.022144699469208717, "step": 152 }, { "epoch": 0.38, "learning_rate": 9.348492408875777e-08, "logits/chosen": -0.02903611585497856, "logits/rejected": -0.03056098148226738, "logps/chosen": -160.7294464111328, "logps/rejected": -152.97630310058594, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": -0.0010848995298147202, "rewards/margins": 0.008261488750576973, "rewards/rejected": -0.009346390143036842, "step": 153 }, { "epoch": 0.38, "learning_rate": 9.338504327611838e-08, "logits/chosen": -0.45194897055625916, "logits/rejected": -0.4321906566619873, "logps/chosen": -146.51248168945312, "logps/rejected": -158.2119140625, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006542205810546875, "rewards/margins": 0.005028534680604935, "rewards/rejected": -0.005682754795998335, "step": 154 }, { "epoch": 0.39, "learning_rate": 9.328445685421112e-08, "logits/chosen": -0.28994399309158325, "logits/rejected": -0.29715695977211, "logps/chosen": -168.133056640625, "logps/rejected": -163.4918670654297, "loss": 0.6864, "rewards/accuracies": 0.75, "rewards/chosen": 0.041481971740722656, "rewards/margins": 0.04745616763830185, "rewards/rejected": -0.005974197760224342, "step": 155 }, { "epoch": 0.39, "learning_rate": 9.31831664589618e-08, "logits/chosen": -0.2538711726665497, "logits/rejected": -0.22484783828258514, "logps/chosen": -114.23457336425781, "logps/rejected": -153.146240234375, "loss": 0.6958, "rewards/accuracies": 0.5, "rewards/chosen": -0.007152747828513384, "rewards/margins": 0.008051872253417969, "rewards/rejected": -0.015204621478915215, "step": 156 }, { "epoch": 0.39, "learning_rate": 9.308117373774554e-08, "logits/chosen": -0.42413759231567383, "logits/rejected": -0.4337882995605469, "logps/chosen": -169.5879669189453, "logps/rejected": -226.20526123046875, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.014446640387177467, "rewards/margins": 0.010656356811523438, "rewards/rejected": 0.003790283342823386, "step": 157 }, { "epoch": 0.39, "learning_rate": 9.297848034936006e-08, "logits/chosen": -0.259633332490921, "logits/rejected": -0.23337768018245697, "logps/chosen": -182.9020233154297, "logps/rejected": -171.70611572265625, "loss": 0.6905, "rewards/accuracies": 0.25, "rewards/chosen": -0.00380954728461802, "rewards/margins": -0.01790447346866131, "rewards/rejected": 0.014094924554228783, "step": 158 }, { "epoch": 0.4, "learning_rate": 9.287508796399856e-08, "logits/chosen": -0.2297215312719345, "logits/rejected": -0.21108274161815643, "logps/chosen": -174.08676147460938, "logps/rejected": -157.05368041992188, "loss": 0.6986, "rewards/accuracies": 0.25, "rewards/chosen": -0.006026268471032381, "rewards/margins": -0.043831825256347656, "rewards/rejected": 0.03780555725097656, "step": 159 }, { "epoch": 0.4, "learning_rate": 9.277099826322276e-08, "logits/chosen": -0.2083878517150879, "logits/rejected": -0.207004114985466, "logps/chosen": -141.16390991210938, "logps/rejected": -156.12879943847656, "loss": 0.6949, "rewards/accuracies": 0.75, "rewards/chosen": 0.0038234712556004524, "rewards/margins": 0.027445605024695396, "rewards/rejected": -0.02362213097512722, "step": 160 }, { "epoch": 0.4, "learning_rate": 9.266621293993534e-08, "logits/chosen": -0.2081228792667389, "logits/rejected": -0.1932353526353836, "logps/chosen": -157.94383239746094, "logps/rejected": -141.09075927734375, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": 0.02842102199792862, "rewards/margins": 0.03871574625372887, "rewards/rejected": -0.010294724255800247, "step": 161 }, { "epoch": 0.4, "learning_rate": 9.256073369835253e-08, "logits/chosen": -0.17554543912410736, "logits/rejected": -0.16500480473041534, "logps/chosen": -137.97802734375, "logps/rejected": -145.7951202392578, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0013280860148370266, "rewards/margins": -0.0009246817789971828, "rewards/rejected": -0.00040340377017855644, "step": 162 }, { "epoch": 0.41, "learning_rate": 9.245456225397641e-08, "logits/chosen": -0.14572647213935852, "logits/rejected": -0.12821556627750397, "logps/chosen": -155.29782104492188, "logps/rejected": -163.60076904296875, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.015210342593491077, "rewards/margins": -0.02555542066693306, "rewards/rejected": 0.010345078073441982, "step": 163 }, { "epoch": 0.41, "learning_rate": 9.234770033356689e-08, "logits/chosen": -0.22252331674098969, "logits/rejected": -0.18472380936145782, "logps/chosen": -140.96144104003906, "logps/rejected": -169.89295959472656, "loss": 0.6888, "rewards/accuracies": 0.25, "rewards/chosen": -0.01068267785012722, "rewards/margins": -0.02151947095990181, "rewards/rejected": 0.010836792178452015, "step": 164 }, { "epoch": 0.41, "learning_rate": 9.224014967511376e-08, "logits/chosen": -0.33721721172332764, "logits/rejected": -0.3231666088104248, "logps/chosen": -157.8468475341797, "logps/rejected": -176.82455444335938, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 0.003184128552675247, "rewards/margins": 0.005944443866610527, "rewards/rejected": -0.002760313916951418, "step": 165 }, { "epoch": 0.41, "learning_rate": 9.213191202780835e-08, "logits/chosen": -0.21546459197998047, "logits/rejected": -0.18148013949394226, "logps/chosen": -128.91848754882812, "logps/rejected": -176.64747619628906, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.01769409328699112, "rewards/margins": -0.01394271943718195, "rewards/rejected": -0.003751372452825308, "step": 166 }, { "epoch": 0.42, "learning_rate": 9.202298915201509e-08, "logits/chosen": -0.408473938703537, "logits/rejected": -0.4015934467315674, "logps/chosen": -138.2218017578125, "logps/rejected": -134.11898803710938, "loss": 0.6918, "rewards/accuracies": 0.75, "rewards/chosen": 0.021255873143672943, "rewards/margins": 0.016519926488399506, "rewards/rejected": 0.0047359466552734375, "step": 167 }, { "epoch": 0.42, "learning_rate": 9.191338281924287e-08, "logits/chosen": -0.28697535395622253, "logits/rejected": -0.26992788910865784, "logps/chosen": -158.20025634765625, "logps/rejected": -180.904052734375, "loss": 0.689, "rewards/accuracies": 0.25, "rewards/chosen": 0.009634971618652344, "rewards/margins": -0.0059820180758833885, "rewards/rejected": 0.015616989694535732, "step": 168 }, { "epoch": 0.42, "learning_rate": 9.180309481211628e-08, "logits/chosen": -0.234433114528656, "logits/rejected": -0.22176101803779602, "logps/chosen": -157.01734924316406, "logps/rejected": -177.50201416015625, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.010254288092255592, "rewards/margins": -0.00222091656178236, "rewards/rejected": -0.008033370599150658, "step": 169 }, { "epoch": 0.42, "learning_rate": 9.169212692434657e-08, "logits/chosen": -0.16415660083293915, "logits/rejected": -0.15068577229976654, "logps/chosen": -143.95071411132812, "logps/rejected": -201.31198120117188, "loss": 0.6878, "rewards/accuracies": 0.25, "rewards/chosen": -0.015251160599291325, "rewards/margins": -0.011804962530732155, "rewards/rejected": -0.0034461980685591698, "step": 170 }, { "epoch": 0.43, "learning_rate": 9.158048096070248e-08, "logits/chosen": -0.30363351106643677, "logits/rejected": -0.2980118691921234, "logps/chosen": -143.74783325195312, "logps/rejected": -166.1083984375, "loss": 0.6985, "rewards/accuracies": 0.5, "rewards/chosen": -0.01860675774514675, "rewards/margins": -0.009488106705248356, "rewards/rejected": -0.00911865197122097, "step": 171 }, { "epoch": 0.43, "learning_rate": 9.146815873698088e-08, "logits/chosen": -0.2872503101825714, "logits/rejected": -0.2757498025894165, "logps/chosen": -159.1553955078125, "logps/rejected": -154.7501220703125, "loss": 0.6975, "rewards/accuracies": 0.75, "rewards/chosen": -0.011978913098573685, "rewards/margins": 0.005714416969567537, "rewards/rejected": -0.017693329602479935, "step": 172 }, { "epoch": 0.43, "learning_rate": 9.135516207997729e-08, "logits/chosen": -0.3976166844367981, "logits/rejected": -0.3419801890850067, "logps/chosen": -130.193359375, "logps/rejected": -150.24893188476562, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.020309830084443092, "rewards/margins": 0.03950195387005806, "rewards/rejected": -0.019192123785614967, "step": 173 }, { "epoch": 0.43, "learning_rate": 9.124149282745612e-08, "logits/chosen": -0.3725080192089081, "logits/rejected": -0.3422316908836365, "logps/chosen": -144.84124755859375, "logps/rejected": -155.5181121826172, "loss": 0.6983, "rewards/accuracies": 0.75, "rewards/chosen": 0.010934066958725452, "rewards/margins": 0.013394737616181374, "rewards/rejected": -0.0024606711231172085, "step": 174 }, { "epoch": 0.44, "learning_rate": 9.112715282812081e-08, "logits/chosen": -0.19931119680404663, "logits/rejected": -0.17014236748218536, "logps/chosen": -128.51605224609375, "logps/rejected": -153.43023681640625, "loss": 0.6909, "rewards/accuracies": 0.5, "rewards/chosen": -0.0188446044921875, "rewards/margins": 0.008853339590132236, "rewards/rejected": -0.02769794501364231, "step": 175 }, { "epoch": 0.44, "learning_rate": 9.10121439415837e-08, "logits/chosen": -0.26165908575057983, "logits/rejected": -0.2703009247779846, "logps/chosen": -125.99543762207031, "logps/rejected": -144.9744110107422, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": 0.01848144643008709, "rewards/margins": 0.026509668678045273, "rewards/rejected": -0.008028222247958183, "step": 176 }, { "epoch": 0.44, "learning_rate": 9.089646803833588e-08, "logits/chosen": -0.37032899260520935, "logits/rejected": -0.34720391035079956, "logps/chosen": -150.687744140625, "logps/rejected": -161.06051635742188, "loss": 0.6945, "rewards/accuracies": 0.5, "rewards/chosen": -0.0077362060546875, "rewards/margins": 0.0074075693264603615, "rewards/rejected": -0.015143776312470436, "step": 177 }, { "epoch": 0.44, "learning_rate": 9.078012699971671e-08, "logits/chosen": -0.3263015151023865, "logits/rejected": -0.28149518370628357, "logps/chosen": -142.41978454589844, "logps/rejected": -165.60923767089844, "loss": 0.6926, "rewards/accuracies": 0.0, "rewards/chosen": -0.02440948598086834, "rewards/margins": -0.03346671909093857, "rewards/rejected": 0.009057234972715378, "step": 178 }, { "epoch": 0.45, "learning_rate": 9.066312271788323e-08, "logits/chosen": -0.47916045784950256, "logits/rejected": -0.4515325129032135, "logps/chosen": -124.21101379394531, "logps/rejected": -178.23707580566406, "loss": 0.6892, "rewards/accuracies": 0.25, "rewards/chosen": -0.003984833136200905, "rewards/margins": -0.02550964429974556, "rewards/rejected": 0.021524811163544655, "step": 179 }, { "epoch": 0.45, "learning_rate": 9.054545709577937e-08, "logits/chosen": -0.26819097995758057, "logits/rejected": -0.2516954839229584, "logps/chosen": -198.58111572265625, "logps/rejected": -133.6155548095703, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.001816178672015667, "rewards/margins": -0.0033168792724609375, "rewards/rejected": 0.00513305701315403, "step": 180 }, { "epoch": 0.45, "learning_rate": 9.042713204710507e-08, "logits/chosen": -0.4271756410598755, "logits/rejected": -0.4080996811389923, "logps/chosen": -141.5867156982422, "logps/rejected": -191.0716094970703, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.026133157312870026, "rewards/margins": 0.07673434913158417, "rewards/rejected": -0.05060119926929474, "step": 181 }, { "epoch": 0.45, "learning_rate": 9.030814949628506e-08, "logits/chosen": -0.2771576941013336, "logits/rejected": -0.25074881315231323, "logps/chosen": -125.70806121826172, "logps/rejected": -156.3560333251953, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.0009721757960505784, "rewards/margins": -0.0012197485193610191, "rewards/rejected": 0.0021919249556958675, "step": 182 }, { "epoch": 0.46, "learning_rate": 9.018851137843765e-08, "logits/chosen": -0.3067985773086548, "logits/rejected": -0.3205060362815857, "logps/chosen": -138.55825805664062, "logps/rejected": -181.81304931640625, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": 0.015453720465302467, "rewards/margins": 0.010844804346561432, "rewards/rejected": 0.00460891705006361, "step": 183 }, { "epoch": 0.46, "learning_rate": 9.006821963934315e-08, "logits/chosen": -0.27493807673454285, "logits/rejected": -0.2710685729980469, "logps/chosen": -148.1483154296875, "logps/rejected": -156.87881469726562, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": -0.0016765594482421875, "rewards/margins": -0.012602806091308594, "rewards/rejected": 0.010926246643066406, "step": 184 }, { "epoch": 0.46, "learning_rate": 8.994727623541236e-08, "logits/chosen": -0.17000003159046173, "logits/rejected": -0.1816650629043579, "logps/chosen": -150.04696655273438, "logps/rejected": -175.59521484375, "loss": 0.6893, "rewards/accuracies": 0.75, "rewards/chosen": 0.004690170753747225, "rewards/margins": 0.02449340932071209, "rewards/rejected": -0.019803239032626152, "step": 185 }, { "epoch": 0.46, "learning_rate": 8.982568313365467e-08, "logits/chosen": -0.39151108264923096, "logits/rejected": -0.3401334881782532, "logps/chosen": -154.47323608398438, "logps/rejected": -178.03195190429688, "loss": 0.6868, "rewards/accuracies": 0.5, "rewards/chosen": -0.0054950718767941, "rewards/margins": 0.011688231490552425, "rewards/rejected": -0.017183303833007812, "step": 186 }, { "epoch": 0.47, "learning_rate": 8.970344231164601e-08, "logits/chosen": -0.30120381712913513, "logits/rejected": -0.22506801784038544, "logps/chosen": -131.82669067382812, "logps/rejected": -184.17433166503906, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": 0.01617450825870037, "rewards/margins": 0.041491128504276276, "rewards/rejected": -0.025316620245575905, "step": 187 }, { "epoch": 0.47, "learning_rate": 8.958055575749684e-08, "logits/chosen": -0.24132317304611206, "logits/rejected": -0.24876552820205688, "logps/chosen": -142.4204864501953, "logps/rejected": -146.37539672851562, "loss": 0.6959, "rewards/accuracies": 0.25, "rewards/chosen": -0.012402724474668503, "rewards/margins": -0.03136100620031357, "rewards/rejected": 0.018958285450935364, "step": 188 }, { "epoch": 0.47, "learning_rate": 8.945702546981968e-08, "logits/chosen": -0.09466602653265, "logits/rejected": -0.10396377742290497, "logps/chosen": -173.03382873535156, "logps/rejected": -154.3691864013672, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": -0.03820457309484482, "rewards/margins": -0.0014080042019486427, "rewards/rejected": -0.03679656982421875, "step": 189 }, { "epoch": 0.47, "learning_rate": 8.93328534576967e-08, "logits/chosen": -0.32102739810943604, "logits/rejected": -0.33298614621162415, "logps/chosen": -197.45506286621094, "logps/rejected": -148.83407592773438, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.0004993434995412827, "rewards/margins": 0.014816664159297943, "rewards/rejected": -0.014317321591079235, "step": 190 }, { "epoch": 0.48, "learning_rate": 8.920804174064695e-08, "logits/chosen": -0.3059934675693512, "logits/rejected": -0.28352177143096924, "logps/chosen": -160.1165771484375, "logps/rejected": -171.6696014404297, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.008139037527143955, "rewards/margins": 0.024008560925722122, "rewards/rejected": -0.0321476012468338, "step": 191 }, { "epoch": 0.48, "learning_rate": 8.908259234859364e-08, "logits/chosen": -0.3504178822040558, "logits/rejected": -0.35807478427886963, "logps/chosen": -138.7818145751953, "logps/rejected": -178.67733764648438, "loss": 0.6937, "rewards/accuracies": 0.75, "rewards/chosen": 0.026667023077607155, "rewards/margins": 0.018050767481327057, "rewards/rejected": 0.008616255596280098, "step": 192 }, { "epoch": 0.48, "learning_rate": 8.895650732183093e-08, "logits/chosen": -0.1461472362279892, "logits/rejected": -0.1414792388677597, "logps/chosen": -158.22418212890625, "logps/rejected": -173.317138671875, "loss": 0.687, "rewards/accuracies": 0.25, "rewards/chosen": 0.008144950494170189, "rewards/margins": -0.025119589641690254, "rewards/rejected": 0.03326454386115074, "step": 193 }, { "epoch": 0.48, "learning_rate": 8.882978871099103e-08, "logits/chosen": -0.23885707557201385, "logits/rejected": -0.22217115759849548, "logps/chosen": -199.278564453125, "logps/rejected": -185.5076904296875, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.008636856451630592, "rewards/margins": -0.003162955865263939, "rewards/rejected": 0.011799812316894531, "step": 194 }, { "epoch": 0.49, "learning_rate": 8.870243857701053e-08, "logits/chosen": -0.17128491401672363, "logits/rejected": -0.1555423140525818, "logps/chosen": -140.20245361328125, "logps/rejected": -177.27418518066406, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.01890258863568306, "rewards/margins": 0.028610993176698685, "rewards/rejected": -0.0097084054723382, "step": 195 }, { "epoch": 0.49, "learning_rate": 8.857445899109715e-08, "logits/chosen": -0.21408778429031372, "logits/rejected": -0.2111765593290329, "logps/chosen": -147.46542358398438, "logps/rejected": -171.0230712890625, "loss": 0.6922, "rewards/accuracies": 0.75, "rewards/chosen": 0.0005344394594430923, "rewards/margins": 0.0019191727042198181, "rewards/rejected": -0.001384735107421875, "step": 196 }, { "epoch": 0.49, "learning_rate": 8.844585203469587e-08, "logits/chosen": -0.3441316783428192, "logits/rejected": -0.32611948251724243, "logps/chosen": -162.23489379882812, "logps/rejected": -154.06924438476562, "loss": 0.694, "rewards/accuracies": 0.5, "rewards/chosen": 0.01202316302806139, "rewards/margins": 0.023552702739834785, "rewards/rejected": -0.011529541574418545, "step": 197 }, { "epoch": 0.49, "learning_rate": 8.831661979945522e-08, "logits/chosen": -0.2840718626976013, "logits/rejected": -0.24774426221847534, "logps/chosen": -168.66140747070312, "logps/rejected": -228.5927276611328, "loss": 0.6912, "rewards/accuracies": 0.75, "rewards/chosen": 0.03636512905359268, "rewards/margins": 0.020107649266719818, "rewards/rejected": 0.016257477924227715, "step": 198 }, { "epoch": 0.5, "learning_rate": 8.818676438719313e-08, "logits/chosen": -0.2668088972568512, "logits/rejected": -0.2537690997123718, "logps/chosen": -152.79136657714844, "logps/rejected": -176.28370666503906, "loss": 0.6963, "rewards/accuracies": 0.25, "rewards/chosen": -0.013895988464355469, "rewards/margins": -0.03183326870203018, "rewards/rejected": 0.017937280237674713, "step": 199 }, { "epoch": 0.5, "learning_rate": 8.805628790986283e-08, "logits/chosen": -0.2449340522289276, "logits/rejected": -0.2309950888156891, "logps/chosen": -178.4908447265625, "logps/rejected": -165.49008178710938, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.00504570035263896, "rewards/margins": 0.020589066669344902, "rewards/rejected": -0.015543365851044655, "step": 200 }, { "epoch": 0.5, "learning_rate": 8.792519248951851e-08, "logits/chosen": -0.25436049699783325, "logits/rejected": -0.2535378336906433, "logps/chosen": -165.18589782714844, "logps/rejected": -180.26272583007812, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": -0.010652161203324795, "rewards/margins": -0.003285979852080345, "rewards/rejected": -0.007366180419921875, "step": 201 }, { "epoch": 0.5, "learning_rate": 8.77934802582807e-08, "logits/chosen": -0.37953081727027893, "logits/rejected": -0.3853200078010559, "logps/chosen": -155.43736267089844, "logps/rejected": -174.0952606201172, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.04207611083984375, "rewards/margins": 0.03683280944824219, "rewards/rejected": 0.005243300925940275, "step": 202 }, { "epoch": 0.5, "learning_rate": 8.766115335830177e-08, "logits/chosen": -0.3466634750366211, "logits/rejected": -0.34189656376838684, "logps/chosen": -153.22650146484375, "logps/rejected": -201.28738403320312, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": -0.00419082585722208, "rewards/margins": 0.034391023218631744, "rewards/rejected": -0.03858184814453125, "step": 203 }, { "epoch": 0.51, "learning_rate": 8.75282139417309e-08, "logits/chosen": -0.4072871208190918, "logits/rejected": -0.36522626876831055, "logps/chosen": -151.84884643554688, "logps/rejected": -197.31735229492188, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.03134498745203018, "rewards/margins": 0.01486683078110218, "rewards/rejected": 0.016478156670928, "step": 204 }, { "epoch": 0.51, "learning_rate": 8.739466417067924e-08, "logits/chosen": -0.28011173009872437, "logits/rejected": -0.27503934502601624, "logps/chosen": -150.87515258789062, "logps/rejected": -197.0723876953125, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.013455389998853207, "rewards/margins": 0.042475320398807526, "rewards/rejected": -0.029019927605986595, "step": 205 }, { "epoch": 0.51, "learning_rate": 8.726050621718462e-08, "logits/chosen": -0.15364505350589752, "logits/rejected": -0.14804771542549133, "logps/chosen": -155.15867614746094, "logps/rejected": -154.38150024414062, "loss": 0.6941, "rewards/accuracies": 1.0, "rewards/chosen": 0.0034187305718660355, "rewards/margins": 0.018777085468173027, "rewards/rejected": -0.015358353033661842, "step": 206 }, { "epoch": 0.51, "learning_rate": 8.712574226317629e-08, "logits/chosen": -0.2739425003528595, "logits/rejected": -0.24394555389881134, "logps/chosen": -122.85674285888672, "logps/rejected": -175.16143798828125, "loss": 0.6932, "rewards/accuracies": 0.75, "rewards/chosen": 0.019469834864139557, "rewards/margins": 0.04627037048339844, "rewards/rejected": -0.02680053748190403, "step": 207 }, { "epoch": 0.52, "learning_rate": 8.699037450043945e-08, "logits/chosen": -0.30535149574279785, "logits/rejected": -0.2932226061820984, "logps/chosen": -185.9344024658203, "logps/rejected": -172.353759765625, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.029937362298369408, "rewards/margins": -0.000634385272860527, "rewards/rejected": 0.030571747571229935, "step": 208 }, { "epoch": 0.52, "learning_rate": 8.685440513057954e-08, "logits/chosen": -0.38558828830718994, "logits/rejected": -0.38467803597450256, "logps/chosen": -171.85472106933594, "logps/rejected": -168.11993408203125, "loss": 0.6925, "rewards/accuracies": 0.25, "rewards/chosen": -0.0092710480093956, "rewards/margins": -0.03081970103085041, "rewards/rejected": 0.021548651158809662, "step": 209 }, { "epoch": 0.52, "learning_rate": 8.671783636498651e-08, "logits/chosen": -0.31061115860939026, "logits/rejected": -0.2680967450141907, "logps/chosen": -145.0992431640625, "logps/rejected": -155.5098114013672, "loss": 0.6952, "rewards/accuracies": 0.25, "rewards/chosen": 0.0113283172249794, "rewards/margins": -0.002647971734404564, "rewards/rejected": 0.01397628802806139, "step": 210 }, { "epoch": 0.52, "learning_rate": 8.658067042479877e-08, "logits/chosen": -0.1541130244731903, "logits/rejected": -0.1385078877210617, "logps/chosen": -128.39527893066406, "logps/rejected": -137.38067626953125, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": -0.011347388848662376, "rewards/margins": 0.006174278445541859, "rewards/rejected": -0.01752166822552681, "step": 211 }, { "epoch": 0.53, "learning_rate": 8.644290954086711e-08, "logits/chosen": -0.3003908395767212, "logits/rejected": -0.2765299677848816, "logps/chosen": -148.5131072998047, "logps/rejected": -183.835205078125, "loss": 0.6935, "rewards/accuracies": 0.25, "rewards/chosen": -0.028921127319335938, "rewards/margins": -0.07755470275878906, "rewards/rejected": 0.048633575439453125, "step": 212 }, { "epoch": 0.53, "learning_rate": 8.630455595371843e-08, "logits/chosen": -0.16155289113521576, "logits/rejected": -0.13493509590625763, "logps/chosen": -131.00900268554688, "logps/rejected": -153.5987548828125, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": -0.0009895320981740952, "rewards/margins": 0.029336169362068176, "rewards/rejected": -0.030325699597597122, "step": 213 }, { "epoch": 0.53, "learning_rate": 8.616561191351933e-08, "logits/chosen": -0.25977206230163574, "logits/rejected": -0.24244268238544464, "logps/chosen": -130.84266662597656, "logps/rejected": -174.52890014648438, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": -0.0057027810253202915, "rewards/margins": -0.007425880059599876, "rewards/rejected": 0.0017230990342795849, "step": 214 }, { "epoch": 0.53, "learning_rate": 8.602607968003934e-08, "logits/chosen": -0.233361154794693, "logits/rejected": -0.19934475421905518, "logps/chosen": -143.21310424804688, "logps/rejected": -168.82907104492188, "loss": 0.6911, "rewards/accuracies": 0.75, "rewards/chosen": 0.014992142096161842, "rewards/margins": 0.01727905124425888, "rewards/rejected": -0.002286910079419613, "step": 215 }, { "epoch": 0.54, "learning_rate": 8.588596152261445e-08, "logits/chosen": -0.32230454683303833, "logits/rejected": -0.29356303811073303, "logps/chosen": -163.3397216796875, "logps/rejected": -180.52740478515625, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": -0.0005641939351335168, "rewards/margins": 0.036463163793087006, "rewards/rejected": -0.03702735900878906, "step": 216 }, { "epoch": 0.54, "learning_rate": 8.574525972010996e-08, "logits/chosen": -0.3436434864997864, "logits/rejected": -0.3304600417613983, "logps/chosen": -136.96627807617188, "logps/rejected": -157.19186401367188, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": -0.010885430499911308, "rewards/margins": 0.007408715784549713, "rewards/rejected": -0.018294144421815872, "step": 217 }, { "epoch": 0.54, "learning_rate": 8.560397656088353e-08, "logits/chosen": -0.17096391320228577, "logits/rejected": -0.15828007459640503, "logps/chosen": -136.08975219726562, "logps/rejected": -188.43321228027344, "loss": 0.6973, "rewards/accuracies": 0.25, "rewards/chosen": -0.020798109471797943, "rewards/margins": -0.02488689497113228, "rewards/rejected": 0.004088783636689186, "step": 218 }, { "epoch": 0.54, "learning_rate": 8.54621143427479e-08, "logits/chosen": -0.43058961629867554, "logits/rejected": -0.4108070433139801, "logps/chosen": -170.86534118652344, "logps/rejected": -174.75555419921875, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": -0.0017459872178733349, "rewards/margins": 0.01482849195599556, "rewards/rejected": -0.016574479639530182, "step": 219 }, { "epoch": 0.55, "learning_rate": 8.531967537293364e-08, "logits/chosen": -0.29536211490631104, "logits/rejected": -0.29335689544677734, "logps/chosen": -149.11984252929688, "logps/rejected": -178.7754669189453, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.027805710211396217, "rewards/margins": 0.023530960083007812, "rewards/rejected": 0.00427474919706583, "step": 220 }, { "epoch": 0.55, "learning_rate": 8.517666196805143e-08, "logits/chosen": -0.27560651302337646, "logits/rejected": -0.23712679743766785, "logps/chosen": -131.60372924804688, "logps/rejected": -146.3184814453125, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": -0.004833984188735485, "rewards/margins": 0.004703140817582607, "rewards/rejected": -0.009537125006318092, "step": 221 }, { "epoch": 0.55, "learning_rate": 8.50330764540546e-08, "logits/chosen": -0.30152183771133423, "logits/rejected": -0.2962966561317444, "logps/chosen": -140.4833984375, "logps/rejected": -140.76959228515625, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": 0.01984863355755806, "rewards/margins": 0.03547363355755806, "rewards/rejected": -0.015625, "step": 222 }, { "epoch": 0.55, "learning_rate": 8.488892116620112e-08, "logits/chosen": -0.3459860384464264, "logits/rejected": -0.3270363211631775, "logps/chosen": -170.92086791992188, "logps/rejected": -151.10702514648438, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": -0.02292327955365181, "rewards/margins": -0.022016143426299095, "rewards/rejected": -0.0009071352542378008, "step": 223 }, { "epoch": 0.56, "learning_rate": 8.474419844901575e-08, "logits/chosen": -0.26102715730667114, "logits/rejected": -0.25543493032455444, "logps/chosen": -150.6943359375, "logps/rejected": -184.22494506835938, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": 0.023082733154296875, "rewards/margins": 0.024673080071806908, "rewards/rejected": -0.001590348081663251, "step": 224 }, { "epoch": 0.56, "learning_rate": 8.459891065625183e-08, "logits/chosen": -0.30029889941215515, "logits/rejected": -0.27650025486946106, "logps/chosen": -155.9093475341797, "logps/rejected": -190.60577392578125, "loss": 0.6916, "rewards/accuracies": 0.5, "rewards/chosen": 0.012753486633300781, "rewards/margins": 0.025758171454072, "rewards/rejected": -0.013004684820771217, "step": 225 }, { "epoch": 0.56, "learning_rate": 8.445306015085301e-08, "logits/chosen": -0.14024370908737183, "logits/rejected": -0.1590750813484192, "logps/chosen": -150.9900360107422, "logps/rejected": -159.62100219726562, "loss": 0.6976, "rewards/accuracies": 0.5, "rewards/chosen": 0.0069705951027572155, "rewards/margins": -0.00193367013707757, "rewards/rejected": 0.00890426617115736, "step": 226 }, { "epoch": 0.56, "learning_rate": 8.430664930491484e-08, "logits/chosen": -0.36621546745300293, "logits/rejected": -0.3235943615436554, "logps/chosen": -142.5799560546875, "logps/rejected": -158.28384399414062, "loss": 0.6896, "rewards/accuracies": 0.75, "rewards/chosen": -0.006791877560317516, "rewards/margins": 0.043393515050411224, "rewards/rejected": -0.050185393542051315, "step": 227 }, { "epoch": 0.57, "learning_rate": 8.415968049964622e-08, "logits/chosen": -0.2829079031944275, "logits/rejected": -0.29490455985069275, "logps/chosen": -126.27693176269531, "logps/rejected": -176.7613525390625, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": -0.004841231741011143, "rewards/margins": -0.0014591198414564133, "rewards/rejected": -0.003382110968232155, "step": 228 }, { "epoch": 0.57, "learning_rate": 8.401215612533055e-08, "logits/chosen": -0.43210989236831665, "logits/rejected": -0.4095042049884796, "logps/chosen": -143.42742919921875, "logps/rejected": -153.03961181640625, "loss": 0.6828, "rewards/accuracies": 0.75, "rewards/chosen": 0.00586814945563674, "rewards/margins": 0.04012298583984375, "rewards/rejected": -0.03425483778119087, "step": 229 }, { "epoch": 0.57, "learning_rate": 8.386407858128706e-08, "logits/chosen": -0.12420197576284409, "logits/rejected": -0.10459509491920471, "logps/chosen": -154.53030395507812, "logps/rejected": -149.61282348632812, "loss": 0.6981, "rewards/accuracies": 0.25, "rewards/chosen": -0.009903526864945889, "rewards/margins": -0.03326892852783203, "rewards/rejected": 0.023365400731563568, "step": 230 }, { "epoch": 0.57, "learning_rate": 8.371545027583154e-08, "logits/chosen": -0.2859150767326355, "logits/rejected": -0.2511066496372223, "logps/chosen": -126.17161560058594, "logps/rejected": -170.82626342773438, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": 0.03050212934613228, "rewards/margins": 0.0715673416852951, "rewards/rejected": -0.041065216064453125, "step": 231 }, { "epoch": 0.58, "learning_rate": 8.35662736262374e-08, "logits/chosen": -0.29945066571235657, "logits/rejected": -0.2580132484436035, "logps/chosen": -174.21218872070312, "logps/rejected": -155.91455078125, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": 0.010237312875688076, "rewards/margins": -0.023096658289432526, "rewards/rejected": 0.033333972096443176, "step": 232 }, { "epoch": 0.58, "learning_rate": 8.34165510586962e-08, "logits/chosen": -0.3889542818069458, "logits/rejected": -0.37248632311820984, "logps/chosen": -147.13648986816406, "logps/rejected": -175.71646118164062, "loss": 0.6875, "rewards/accuracies": 0.25, "rewards/chosen": -0.014188003726303577, "rewards/margins": -0.023169327527284622, "rewards/rejected": 0.00898132286965847, "step": 233 }, { "epoch": 0.58, "learning_rate": 8.326628500827825e-08, "logits/chosen": -0.16865184903144836, "logits/rejected": -0.1813051849603653, "logps/chosen": -141.02194213867188, "logps/rejected": -174.4136962890625, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.018965531140565872, "rewards/margins": 0.05213604122400284, "rewards/rejected": -0.033170510083436966, "step": 234 }, { "epoch": 0.58, "learning_rate": 8.311547791889306e-08, "logits/chosen": -0.16011416912078857, "logits/rejected": -0.15381017327308655, "logps/chosen": -161.5579833984375, "logps/rejected": -159.60702514648438, "loss": 0.6842, "rewards/accuracies": 0.75, "rewards/chosen": 0.019543077796697617, "rewards/margins": 0.03657035529613495, "rewards/rejected": -0.01702728308737278, "step": 235 }, { "epoch": 0.59, "learning_rate": 8.296413224324943e-08, "logits/chosen": -0.33341559767723083, "logits/rejected": -0.2818996012210846, "logps/chosen": -138.02886962890625, "logps/rejected": -157.51564025878906, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": -0.004295158665627241, "rewards/margins": -0.008184622973203659, "rewards/rejected": 0.00388946570456028, "step": 236 }, { "epoch": 0.59, "learning_rate": 8.281225044281577e-08, "logits/chosen": -0.40232259035110474, "logits/rejected": -0.3795336186885834, "logps/chosen": -127.94477844238281, "logps/rejected": -163.4525146484375, "loss": 0.6905, "rewards/accuracies": 0.75, "rewards/chosen": 0.004008864983916283, "rewards/margins": 0.022313689813017845, "rewards/rejected": -0.018304824829101562, "step": 237 }, { "epoch": 0.59, "learning_rate": 8.265983498777987e-08, "logits/chosen": -0.1553625613451004, "logits/rejected": -0.1264062523841858, "logps/chosen": -134.90402221679688, "logps/rejected": -158.65211486816406, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": -0.00829391460865736, "rewards/margins": 0.0017238631844520569, "rewards/rejected": -0.010017778724431992, "step": 238 }, { "epoch": 0.59, "learning_rate": 8.250688835700888e-08, "logits/chosen": -0.26636219024658203, "logits/rejected": -0.24389076232910156, "logps/chosen": -150.15318298339844, "logps/rejected": -156.50779724121094, "loss": 0.6958, "rewards/accuracies": 1.0, "rewards/chosen": 0.01179123017936945, "rewards/margins": 0.04647655785083771, "rewards/rejected": -0.034685324877500534, "step": 239 }, { "epoch": 0.6, "learning_rate": 8.235341303800891e-08, "logits/chosen": -0.2696017622947693, "logits/rejected": -0.23296892642974854, "logps/chosen": -187.68663024902344, "logps/rejected": -186.88575744628906, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": -0.002984619000926614, "rewards/margins": 0.022164154797792435, "rewards/rejected": -0.025148771703243256, "step": 240 }, { "epoch": 0.6, "learning_rate": 8.219941152688458e-08, "logits/chosen": -0.33015701174736023, "logits/rejected": -0.3030988872051239, "logps/chosen": -161.37908935546875, "logps/rejected": -179.6246337890625, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": -0.009602165780961514, "rewards/margins": -0.0006093978881835938, "rewards/rejected": -0.00899276789277792, "step": 241 }, { "epoch": 0.6, "learning_rate": 8.204488632829847e-08, "logits/chosen": -0.25366243720054626, "logits/rejected": -0.2453373819589615, "logps/chosen": -132.3609619140625, "logps/rejected": -145.31561279296875, "loss": 0.6934, "rewards/accuracies": 0.25, "rewards/chosen": -0.009150886908173561, "rewards/margins": -0.03901462256908417, "rewards/rejected": 0.029863737523555756, "step": 242 }, { "epoch": 0.6, "learning_rate": 8.188983995543031e-08, "logits/chosen": -0.4123667776584625, "logits/rejected": -0.41396525502204895, "logps/chosen": -145.4718017578125, "logps/rejected": -165.02020263671875, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": 0.01014099083840847, "rewards/margins": 0.012747573666274548, "rewards/rejected": -0.002606583759188652, "step": 243 }, { "epoch": 0.61, "learning_rate": 8.173427492993616e-08, "logits/chosen": -0.1479879915714264, "logits/rejected": -0.16540715098381042, "logps/chosen": -132.1468505859375, "logps/rejected": -135.96263122558594, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": -0.0010593414772301912, "rewards/margins": 0.002113533904775977, "rewards/rejected": -0.0031728744506835938, "step": 244 }, { "epoch": 0.61, "learning_rate": 8.157819378190742e-08, "logits/chosen": -0.13831354677677155, "logits/rejected": -0.11686693876981735, "logps/chosen": -145.63551330566406, "logps/rejected": -168.92276000976562, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -0.012825011275708675, "rewards/margins": 0.011493301019072533, "rewards/rejected": -0.024318313226103783, "step": 245 }, { "epoch": 0.61, "learning_rate": 8.142159904982961e-08, "logits/chosen": -0.21087297797203064, "logits/rejected": -0.19900411367416382, "logps/chosen": -146.36614990234375, "logps/rejected": -150.9007568359375, "loss": 0.6886, "rewards/accuracies": 0.75, "rewards/chosen": -0.0019466402009129524, "rewards/margins": 0.008954240009188652, "rewards/rejected": -0.01090087927877903, "step": 246 }, { "epoch": 0.61, "learning_rate": 8.126449328054114e-08, "logits/chosen": -0.27438339591026306, "logits/rejected": -0.2605949342250824, "logps/chosen": -127.73243713378906, "logps/rejected": -167.54037475585938, "loss": 0.6945, "rewards/accuracies": 0.75, "rewards/chosen": -0.0021636965684592724, "rewards/margins": 0.011970900930464268, "rewards/rejected": -0.014134597964584827, "step": 247 }, { "epoch": 0.62, "learning_rate": 8.110687902919185e-08, "logits/chosen": -0.32070818543434143, "logits/rejected": -0.29506000876426697, "logps/chosen": -148.28076171875, "logps/rejected": -188.26214599609375, "loss": 0.6892, "rewards/accuracies": 0.25, "rewards/chosen": 0.004436111077666283, "rewards/margins": 0.003472518175840378, "rewards/rejected": 0.0009635919705033302, "step": 248 }, { "epoch": 0.62, "learning_rate": 8.094875885920148e-08, "logits/chosen": -0.3538576662540436, "logits/rejected": -0.3172139525413513, "logps/chosen": -131.60948181152344, "logps/rejected": -167.05796813964844, "loss": 0.6979, "rewards/accuracies": 0.25, "rewards/chosen": -0.01565570943057537, "rewards/margins": -0.019545555114746094, "rewards/rejected": 0.0038898466154932976, "step": 249 }, { "epoch": 0.62, "learning_rate": 8.079013534221797e-08, "logits/chosen": -0.24160262942314148, "logits/rejected": -0.249856635928154, "logps/chosen": -157.8656768798828, "logps/rejected": -162.59201049804688, "loss": 0.6951, "rewards/accuracies": 0.75, "rewards/chosen": 0.017561722546815872, "rewards/margins": 0.008451842702925205, "rewards/rejected": 0.009109877981245518, "step": 250 }, { "epoch": 0.62, "learning_rate": 8.063101105807567e-08, "logits/chosen": -0.3522174060344696, "logits/rejected": -0.31834840774536133, "logps/chosen": -160.25808715820312, "logps/rejected": -158.46726989746094, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.026763152331113815, "rewards/margins": 0.018315505236387253, "rewards/rejected": 0.008447647094726562, "step": 251 }, { "epoch": 0.63, "learning_rate": 8.047138859475327e-08, "logits/chosen": -0.3441225588321686, "logits/rejected": -0.32750222086906433, "logps/chosen": -150.3926544189453, "logps/rejected": -158.37086486816406, "loss": 0.6871, "rewards/accuracies": 0.5, "rewards/chosen": 0.004871368408203125, "rewards/margins": -0.0026754382997751236, "rewards/rejected": 0.007546806707978249, "step": 252 }, { "epoch": 0.63, "learning_rate": 8.03112705483319e-08, "logits/chosen": -0.25539687275886536, "logits/rejected": -0.25824159383773804, "logps/chosen": -152.025634765625, "logps/rejected": -141.61114501953125, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": 0.009497642517089844, "rewards/margins": 0.02222595363855362, "rewards/rejected": -0.012728309258818626, "step": 253 }, { "epoch": 0.63, "learning_rate": 8.01506595229527e-08, "logits/chosen": -0.1252366453409195, "logits/rejected": -0.08082355558872223, "logps/chosen": -136.4565887451172, "logps/rejected": -184.69805908203125, "loss": 0.6949, "rewards/accuracies": 1.0, "rewards/chosen": 0.0053577427752316, "rewards/margins": 0.0322813056409359, "rewards/rejected": -0.026923561468720436, "step": 254 }, { "epoch": 0.63, "learning_rate": 7.998955813077457e-08, "logits/chosen": -0.20865240693092346, "logits/rejected": -0.21084700524806976, "logps/chosen": -136.85903930664062, "logps/rejected": -157.24447631835938, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": 0.007633018773049116, "rewards/margins": -0.008599091321229935, "rewards/rejected": 0.01623210869729519, "step": 255 }, { "epoch": 0.64, "learning_rate": 7.982796899193176e-08, "logits/chosen": -0.3077308237552643, "logits/rejected": -0.31084224581718445, "logps/chosen": -144.90084838867188, "logps/rejected": -183.50277709960938, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": 0.006693458184599876, "rewards/margins": 0.019202232360839844, "rewards/rejected": -0.012508774176239967, "step": 256 }, { "epoch": 0.64, "learning_rate": 7.966589473449107e-08, "logits/chosen": -0.19042982161045074, "logits/rejected": -0.20488393306732178, "logps/chosen": -155.40847778320312, "logps/rejected": -154.50779724121094, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": -0.0084686279296875, "rewards/margins": -0.02720661088824272, "rewards/rejected": 0.01873798295855522, "step": 257 }, { "epoch": 0.64, "learning_rate": 7.95033379944093e-08, "logits/chosen": -0.4381045997142792, "logits/rejected": -0.40324243903160095, "logps/chosen": -157.655517578125, "logps/rejected": -177.00238037109375, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.029889296740293503, "rewards/margins": 0.05300483852624893, "rewards/rejected": -0.02311554178595543, "step": 258 }, { "epoch": 0.64, "learning_rate": 7.934030141549022e-08, "logits/chosen": -0.4864521324634552, "logits/rejected": -0.4416276216506958, "logps/chosen": -138.46087646484375, "logps/rejected": -151.03140258789062, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": 0.00728454627096653, "rewards/margins": 0.017644500359892845, "rewards/rejected": -0.010359954088926315, "step": 259 }, { "epoch": 0.65, "learning_rate": 7.917678764934168e-08, "logits/chosen": -0.22581568360328674, "logits/rejected": -0.23668459057807922, "logps/chosen": -168.4586944580078, "logps/rejected": -161.40097045898438, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.008243941701948643, "rewards/margins": 0.02468414232134819, "rewards/rejected": -0.016440199688076973, "step": 260 }, { "epoch": 0.65, "learning_rate": 7.901279935533247e-08, "logits/chosen": -0.35168641805648804, "logits/rejected": -0.3575977683067322, "logps/chosen": -137.90643310546875, "logps/rejected": -184.3732147216797, "loss": 0.6945, "rewards/accuracies": 0.5, "rewards/chosen": -0.021769333630800247, "rewards/margins": 0.0006835926324129105, "rewards/rejected": -0.022452926263213158, "step": 261 }, { "epoch": 0.65, "learning_rate": 7.884833920054899e-08, "logits/chosen": -0.11308161914348602, "logits/rejected": -0.12035804241895676, "logps/chosen": -151.87440490722656, "logps/rejected": -174.13462829589844, "loss": 0.6971, "rewards/accuracies": 0.5, "rewards/chosen": 0.0036613461561501026, "rewards/margins": -0.01117248460650444, "rewards/rejected": 0.01483383122831583, "step": 262 }, { "epoch": 0.65, "learning_rate": 7.868340985975195e-08, "logits/chosen": -0.1928914189338684, "logits/rejected": -0.14270998537540436, "logps/chosen": -171.16441345214844, "logps/rejected": -147.5443115234375, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": -0.02403850667178631, "rewards/margins": 0.000564957968890667, "rewards/rejected": -0.0246034637093544, "step": 263 }, { "epoch": 0.66, "learning_rate": 7.851801401533287e-08, "logits/chosen": -0.4249282479286194, "logits/rejected": -0.4143523573875427, "logps/chosen": -136.8401641845703, "logps/rejected": -157.11415100097656, "loss": 0.6948, "rewards/accuracies": 1.0, "rewards/chosen": 0.005238913930952549, "rewards/margins": 0.049553871154785156, "rewards/rejected": -0.04431495815515518, "step": 264 }, { "epoch": 0.66, "learning_rate": 7.83521543572704e-08, "logits/chosen": -0.2377447783946991, "logits/rejected": -0.2108308970928192, "logps/chosen": -149.1475830078125, "logps/rejected": -153.94198608398438, "loss": 0.6858, "rewards/accuracies": 0.75, "rewards/chosen": 0.01277847308665514, "rewards/margins": 0.04623451456427574, "rewards/rejected": -0.03345603868365288, "step": 265 }, { "epoch": 0.66, "learning_rate": 7.818583358308664e-08, "logits/chosen": -0.2204708307981491, "logits/rejected": -0.19737458229064941, "logps/chosen": -156.72293090820312, "logps/rejected": -170.6324920654297, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.004814529791474342, "rewards/margins": 0.0693700835108757, "rewards/rejected": -0.06455555558204651, "step": 266 }, { "epoch": 0.66, "learning_rate": 7.801905439780316e-08, "logits/chosen": -0.3260376751422882, "logits/rejected": -0.32337135076522827, "logps/chosen": -161.92422485351562, "logps/rejected": -165.54684448242188, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.0180740375071764, "rewards/margins": -0.025815963745117188, "rewards/rejected": 0.0077419281005859375, "step": 267 }, { "epoch": 0.67, "learning_rate": 7.785181951389717e-08, "logits/chosen": -0.14612039923667908, "logits/rejected": -0.14131347835063934, "logps/chosen": -144.90162658691406, "logps/rejected": -160.24143981933594, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": -0.014644432812929153, "rewards/margins": 0.005269431509077549, "rewards/rejected": -0.019913863390684128, "step": 268 }, { "epoch": 0.67, "learning_rate": 7.768413165125718e-08, "logits/chosen": -0.1283440738916397, "logits/rejected": -0.12405892461538315, "logps/chosen": -130.9295654296875, "logps/rejected": -161.78363037109375, "loss": 0.6917, "rewards/accuracies": 0.75, "rewards/chosen": -0.00119438162073493, "rewards/margins": 0.007559775374829769, "rewards/rejected": -0.008754157461225986, "step": 269 }, { "epoch": 0.67, "learning_rate": 7.751599353713904e-08, "logits/chosen": -0.17581616342067719, "logits/rejected": -0.13777482509613037, "logps/chosen": -151.3377685546875, "logps/rejected": -159.37705993652344, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": 0.013359833508729935, "rewards/margins": 0.03871707618236542, "rewards/rejected": -0.02535724639892578, "step": 270 }, { "epoch": 0.67, "learning_rate": 7.734740790612135e-08, "logits/chosen": -0.4508454203605652, "logits/rejected": -0.4346962571144104, "logps/chosen": -122.75418090820312, "logps/rejected": -181.01876831054688, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": -0.007392692845314741, "rewards/margins": -0.008040240034461021, "rewards/rejected": 0.0006475450936704874, "step": 271 }, { "epoch": 0.68, "learning_rate": 7.717837750006105e-08, "logits/chosen": -0.18670175969600677, "logits/rejected": -0.14529138803482056, "logps/chosen": -162.68179321289062, "logps/rejected": -181.7730712890625, "loss": 0.6897, "rewards/accuracies": 0.75, "rewards/chosen": -0.002113342983648181, "rewards/margins": -0.0068092355504632, "rewards/rejected": 0.004695892333984375, "step": 272 }, { "epoch": 0.68, "learning_rate": 7.700890506804893e-08, "logits/chosen": -0.18321318924427032, "logits/rejected": -0.17591121792793274, "logps/chosen": -143.46954345703125, "logps/rejected": -181.87908935546875, "loss": 0.6896, "rewards/accuracies": 0.75, "rewards/chosen": -0.0006450660293921828, "rewards/margins": -0.009531785733997822, "rewards/rejected": 0.00888671912252903, "step": 273 }, { "epoch": 0.68, "learning_rate": 7.68389933663648e-08, "logits/chosen": -0.27662646770477295, "logits/rejected": -0.258461058139801, "logps/chosen": -133.740478515625, "logps/rejected": -166.43881225585938, "loss": 0.698, "rewards/accuracies": 0.5, "rewards/chosen": -0.0021085739135742188, "rewards/margins": -0.00012149754911661148, "rewards/rejected": -0.0019870756659656763, "step": 274 }, { "epoch": 0.68, "learning_rate": 7.666864515843265e-08, "logits/chosen": -0.27110227942466736, "logits/rejected": -0.22605834901332855, "logps/chosen": -154.05831909179688, "logps/rejected": -143.2511444091797, "loss": 0.6929, "rewards/accuracies": 0.0, "rewards/chosen": 0.005143356043845415, "rewards/margins": -0.030505754053592682, "rewards/rejected": 0.03564910963177681, "step": 275 }, { "epoch": 0.69, "learning_rate": 7.649786321477584e-08, "logits/chosen": -0.27189868688583374, "logits/rejected": -0.290927529335022, "logps/chosen": -175.7695770263672, "logps/rejected": -174.43881225585938, "loss": 0.6786, "rewards/accuracies": 0.75, "rewards/chosen": 0.025469208136200905, "rewards/margins": 0.030678177252411842, "rewards/rejected": -0.00520896865054965, "step": 276 }, { "epoch": 0.69, "learning_rate": 7.632665031297192e-08, "logits/chosen": -0.5060807466506958, "logits/rejected": -0.5094146728515625, "logps/chosen": -136.1002655029297, "logps/rejected": -141.43304443359375, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 0.003987121395766735, "rewards/margins": 0.0012763983104377985, "rewards/rejected": 0.0027107233181595802, "step": 277 }, { "epoch": 0.69, "learning_rate": 7.615500923760747e-08, "logits/chosen": -0.23150008916854858, "logits/rejected": -0.18472586572170258, "logps/chosen": -132.13475036621094, "logps/rejected": -164.06695556640625, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.0019090650603175163, "rewards/margins": -0.03460102155804634, "rewards/rejected": 0.03651008754968643, "step": 278 }, { "epoch": 0.69, "learning_rate": 7.59829427802329e-08, "logits/chosen": -0.33416619896888733, "logits/rejected": -0.3268323242664337, "logps/chosen": -129.24891662597656, "logps/rejected": -171.6980743408203, "loss": 0.688, "rewards/accuracies": 0.5, "rewards/chosen": 0.029984282329678535, "rewards/margins": 0.027544403448700905, "rewards/rejected": 0.0024398802779614925, "step": 279 }, { "epoch": 0.7, "learning_rate": 7.58104537393169e-08, "logits/chosen": -0.3016921579837799, "logits/rejected": -0.25761881470680237, "logps/chosen": -171.5423583984375, "logps/rejected": -129.4259033203125, "loss": 0.6879, "rewards/accuracies": 0.25, "rewards/chosen": -0.006281472276896238, "rewards/margins": -0.021506883203983307, "rewards/rejected": 0.015225410461425781, "step": 280 }, { "epoch": 0.7, "learning_rate": 7.563754492020107e-08, "logits/chosen": -0.31874290108680725, "logits/rejected": -0.31214913725852966, "logps/chosen": -142.83470153808594, "logps/rejected": -189.57427978515625, "loss": 0.6827, "rewards/accuracies": 0.5, "rewards/chosen": -0.008983230218291283, "rewards/margins": -0.01840667799115181, "rewards/rejected": 0.009423446841537952, "step": 281 }, { "epoch": 0.7, "learning_rate": 7.546421913505418e-08, "logits/chosen": -0.3408128023147583, "logits/rejected": -0.3305201530456543, "logps/chosen": -136.98880004882812, "logps/rejected": -161.15008544921875, "loss": 0.6888, "rewards/accuracies": 0.75, "rewards/chosen": 0.04198627546429634, "rewards/margins": 0.0624937079846859, "rewards/rejected": -0.020507432520389557, "step": 282 }, { "epoch": 0.7, "learning_rate": 7.529047920282659e-08, "logits/chosen": -0.1725953221321106, "logits/rejected": -0.17046473920345306, "logps/chosen": -142.81402587890625, "logps/rejected": -147.59347534179688, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.00728950509801507, "rewards/margins": 0.04526710510253906, "rewards/rejected": -0.03797760233283043, "step": 283 }, { "epoch": 0.71, "learning_rate": 7.511632794920418e-08, "logits/chosen": -0.3297581970691681, "logits/rejected": -0.2797606289386749, "logps/chosen": -135.39883422851562, "logps/rejected": -161.43145751953125, "loss": 0.6912, "rewards/accuracies": 0.25, "rewards/chosen": -0.01105499267578125, "rewards/margins": -0.020788192749023438, "rewards/rejected": 0.009733200073242188, "step": 284 }, { "epoch": 0.71, "learning_rate": 7.494176820656257e-08, "logits/chosen": -0.3248550593852997, "logits/rejected": -0.33291953802108765, "logps/chosen": -144.35009765625, "logps/rejected": -152.51101684570312, "loss": 0.6826, "rewards/accuracies": 0.25, "rewards/chosen": 0.006376457400619984, "rewards/margins": -0.015984535217285156, "rewards/rejected": 0.022360993549227715, "step": 285 }, { "epoch": 0.71, "learning_rate": 7.476680281392101e-08, "logits/chosen": -0.14163720607757568, "logits/rejected": -0.10671669989824295, "logps/chosen": -146.43685913085938, "logps/rejected": -175.69412231445312, "loss": 0.6863, "rewards/accuracies": 0.75, "rewards/chosen": -0.001655197236686945, "rewards/margins": 0.031159020960330963, "rewards/rejected": -0.03281421959400177, "step": 286 }, { "epoch": 0.71, "learning_rate": 7.459143461689614e-08, "logits/chosen": -0.27805042266845703, "logits/rejected": -0.20530128479003906, "logps/chosen": -138.04624938964844, "logps/rejected": -161.89276123046875, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.020868683233857155, "rewards/margins": 0.03305111080408096, "rewards/rejected": -0.012182426638901234, "step": 287 }, { "epoch": 0.72, "learning_rate": 7.441566646765583e-08, "logits/chosen": -0.25653713941574097, "logits/rejected": -0.24623090028762817, "logps/chosen": -143.7225341796875, "logps/rejected": -180.1238555908203, "loss": 0.6876, "rewards/accuracies": 0.75, "rewards/chosen": 0.007796096615493298, "rewards/margins": 0.05420989915728569, "rewards/rejected": -0.04641380161046982, "step": 288 }, { "epoch": 0.72, "learning_rate": 7.423950122487267e-08, "logits/chosen": -0.21841417253017426, "logits/rejected": -0.22265994548797607, "logps/chosen": -146.0672607421875, "logps/rejected": -163.59866333007812, "loss": 0.6892, "rewards/accuracies": 0.75, "rewards/chosen": 0.03489360958337784, "rewards/margins": 0.02281932905316353, "rewards/rejected": 0.01207428053021431, "step": 289 }, { "epoch": 0.72, "learning_rate": 7.406294175367757e-08, "logits/chosen": -0.1890469789505005, "logits/rejected": -0.16699306666851044, "logps/chosen": -121.36961364746094, "logps/rejected": -145.07595825195312, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.01571674458682537, "rewards/margins": 0.025329019874334335, "rewards/rejected": -0.009612271562218666, "step": 290 }, { "epoch": 0.72, "learning_rate": 7.388599092561314e-08, "logits/chosen": -0.25326451659202576, "logits/rejected": -0.19595396518707275, "logps/chosen": -125.90462493896484, "logps/rejected": -157.79342651367188, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.017676543444395065, "rewards/margins": 0.02339801751077175, "rewards/rejected": -0.005721474066376686, "step": 291 }, { "epoch": 0.73, "learning_rate": 7.370865161858691e-08, "logits/chosen": -0.16216090321540833, "logits/rejected": -0.17558252811431885, "logps/chosen": -164.77037048339844, "logps/rejected": -157.60028076171875, "loss": 0.6886, "rewards/accuracies": 0.25, "rewards/chosen": -0.0277557373046875, "rewards/margins": -0.011358833871781826, "rewards/rejected": -0.016396906226873398, "step": 292 }, { "epoch": 0.73, "learning_rate": 7.353092671682463e-08, "logits/chosen": -0.20518358051776886, "logits/rejected": -0.19826637208461761, "logps/chosen": -136.74014282226562, "logps/rejected": -166.71575927734375, "loss": 0.6868, "rewards/accuracies": 0.75, "rewards/chosen": 0.02195262722671032, "rewards/margins": 0.03229046240448952, "rewards/rejected": -0.010337830521166325, "step": 293 }, { "epoch": 0.73, "learning_rate": 7.33528191108233e-08, "logits/chosen": -0.35182538628578186, "logits/rejected": -0.33779454231262207, "logps/chosen": -145.4327850341797, "logps/rejected": -151.11581420898438, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": -5.95098827034235e-05, "rewards/margins": -0.005146408453583717, "rewards/rejected": 0.0050868988037109375, "step": 294 }, { "epoch": 0.73, "learning_rate": 7.31743316973042e-08, "logits/chosen": -0.34481149911880493, "logits/rejected": -0.34978654980659485, "logps/chosen": -177.19937133789062, "logps/rejected": -179.49871826171875, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": 0.009244538843631744, "rewards/margins": 0.01805267482995987, "rewards/rejected": -0.008808135986328125, "step": 295 }, { "epoch": 0.74, "learning_rate": 7.299546737916574e-08, "logits/chosen": -0.2378493696451187, "logits/rejected": -0.19701893627643585, "logps/chosen": -141.17982482910156, "logps/rejected": -178.77255249023438, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.03002777136862278, "rewards/margins": 0.034966278821229935, "rewards/rejected": -0.00493850652128458, "step": 296 }, { "epoch": 0.74, "learning_rate": 7.281622906543624e-08, "logits/chosen": -0.32651838660240173, "logits/rejected": -0.32795652747154236, "logps/chosen": -150.79141235351562, "logps/rejected": -153.7373046875, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.03448543697595596, "rewards/margins": 0.04821758717298508, "rewards/rejected": -0.01373214740306139, "step": 297 }, { "epoch": 0.74, "learning_rate": 7.263661967122669e-08, "logits/chosen": -0.3384554088115692, "logits/rejected": -0.3053477108478546, "logps/chosen": -139.67919921875, "logps/rejected": -173.1238555908203, "loss": 0.6868, "rewards/accuracies": 0.75, "rewards/chosen": -0.00043563731014728546, "rewards/margins": 0.002759171649813652, "rewards/rejected": -0.0031948089599609375, "step": 298 }, { "epoch": 0.74, "learning_rate": 7.245664211768326e-08, "logits/chosen": -0.15016932785511017, "logits/rejected": -0.11838862299919128, "logps/chosen": -160.58358764648438, "logps/rejected": -139.15968322753906, "loss": 0.6923, "rewards/accuracies": 0.0, "rewards/chosen": -0.00374603271484375, "rewards/margins": -0.02444152906537056, "rewards/rejected": 0.02069549635052681, "step": 299 }, { "epoch": 0.75, "learning_rate": 7.227629933193983e-08, "logits/chosen": -0.3871018588542938, "logits/rejected": -0.37207135558128357, "logps/chosen": -158.733154296875, "logps/rejected": -154.4622802734375, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.010270309634506702, "rewards/margins": 0.00095367431640625, "rewards/rejected": -0.011223983950912952, "step": 300 }, { "epoch": 0.75, "learning_rate": 7.209559424707033e-08, "logits/chosen": -0.3147459626197815, "logits/rejected": -0.3223874568939209, "logps/chosen": -135.83627319335938, "logps/rejected": -143.19610595703125, "loss": 0.6894, "rewards/accuracies": 0.75, "rewards/chosen": 0.000631141709163785, "rewards/margins": 0.005097199231386185, "rewards/rejected": -0.004466055892407894, "step": 301 }, { "epoch": 0.75, "learning_rate": 7.191452980204118e-08, "logits/chosen": -0.15708109736442566, "logits/rejected": -0.12934155762195587, "logps/chosen": -127.29246520996094, "logps/rejected": -128.69992065429688, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.024342728778719902, "rewards/margins": 0.03311748802661896, "rewards/rejected": -0.008774757385253906, "step": 302 }, { "epoch": 0.75, "learning_rate": 7.173310894166326e-08, "logits/chosen": -0.4301181435585022, "logits/rejected": -0.4047245383262634, "logps/chosen": -164.865966796875, "logps/rejected": -171.3487548828125, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": -0.017482377588748932, "rewards/margins": -0.013854408636689186, "rewards/rejected": -0.0036279670894145966, "step": 303 }, { "epoch": 0.76, "learning_rate": 7.155133461654428e-08, "logits/chosen": -0.2850263714790344, "logits/rejected": -0.2891694903373718, "logps/chosen": -171.63906860351562, "logps/rejected": -187.6642608642578, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": -0.012075805105268955, "rewards/margins": 0.001221848651766777, "rewards/rejected": -0.013297654688358307, "step": 304 }, { "epoch": 0.76, "learning_rate": 7.136920978304055e-08, "logits/chosen": -0.2697795331478119, "logits/rejected": -0.26727110147476196, "logps/chosen": -172.6083221435547, "logps/rejected": -179.4028778076172, "loss": 0.6893, "rewards/accuracies": 0.75, "rewards/chosen": 0.00652351463213563, "rewards/margins": 0.02021827921271324, "rewards/rejected": -0.01369476318359375, "step": 305 }, { "epoch": 0.76, "learning_rate": 7.118673740320906e-08, "logits/chosen": -0.2376105636358261, "logits/rejected": -0.20015926659107208, "logps/chosen": -144.03829956054688, "logps/rejected": -158.98605346679688, "loss": 0.6861, "rewards/accuracies": 0.75, "rewards/chosen": 0.018773460760712624, "rewards/margins": 0.01841564103960991, "rewards/rejected": 0.00035781870246864855, "step": 306 }, { "epoch": 0.76, "learning_rate": 7.100392044475929e-08, "logits/chosen": -0.2335495948791504, "logits/rejected": -0.23841440677642822, "logps/chosen": -142.861328125, "logps/rejected": -154.40206909179688, "loss": 0.6792, "rewards/accuracies": 0.75, "rewards/chosen": 0.011588859371840954, "rewards/margins": 0.01133060548454523, "rewards/rejected": 0.0002582548186182976, "step": 307 }, { "epoch": 0.77, "learning_rate": 7.082076188100482e-08, "logits/chosen": -0.3006908595561981, "logits/rejected": -0.2827400863170624, "logps/chosen": -161.46226501464844, "logps/rejected": -193.9744110107422, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": 0.0031265257857739925, "rewards/margins": 0.007107161451131105, "rewards/rejected": -0.0039806365966796875, "step": 308 }, { "epoch": 0.77, "learning_rate": 7.06372646908151e-08, "logits/chosen": -0.2696700990200043, "logits/rejected": -0.25373005867004395, "logps/chosen": -195.02606201171875, "logps/rejected": -133.04058837890625, "loss": 0.6845, "rewards/accuracies": 0.5, "rewards/chosen": -0.011331558227539062, "rewards/margins": -0.00313415564596653, "rewards/rejected": -0.008197402581572533, "step": 309 }, { "epoch": 0.77, "learning_rate": 7.0453431858567e-08, "logits/chosen": -0.21082288026809692, "logits/rejected": -0.20169463753700256, "logps/chosen": -131.9996337890625, "logps/rejected": -158.50941467285156, "loss": 0.6838, "rewards/accuracies": 0.75, "rewards/chosen": 0.00891876220703125, "rewards/margins": 0.02249450981616974, "rewards/rejected": -0.01357574574649334, "step": 310 }, { "epoch": 0.77, "learning_rate": 7.026926637409614e-08, "logits/chosen": -0.30012941360473633, "logits/rejected": -0.2880440354347229, "logps/chosen": -127.71986389160156, "logps/rejected": -173.98236083984375, "loss": 0.6873, "rewards/accuracies": 0.75, "rewards/chosen": 0.017081450670957565, "rewards/margins": 0.022528840228915215, "rewards/rejected": -0.0054473876953125, "step": 311 }, { "epoch": 0.78, "learning_rate": 7.008477123264848e-08, "logits/chosen": -0.2798824608325958, "logits/rejected": -0.2592822015285492, "logps/chosen": -191.36322021484375, "logps/rejected": -181.21774291992188, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 0.00326614361256361, "rewards/margins": 0.008475113660097122, "rewards/rejected": -0.0052089691162109375, "step": 312 }, { "epoch": 0.78, "learning_rate": 6.989994943483135e-08, "logits/chosen": -0.31929412484169006, "logits/rejected": -0.2876964807510376, "logps/chosen": -129.9084930419922, "logps/rejected": -149.2738494873047, "loss": 0.6812, "rewards/accuracies": 0.75, "rewards/chosen": 0.02843322791159153, "rewards/margins": 0.05197563394904137, "rewards/rejected": -0.023542404174804688, "step": 313 }, { "epoch": 0.78, "learning_rate": 6.971480398656487e-08, "logits/chosen": -0.340955525636673, "logits/rejected": -0.3354381322860718, "logps/chosen": -145.92446899414062, "logps/rejected": -168.5406494140625, "loss": 0.6898, "rewards/accuracies": 0.75, "rewards/chosen": 0.012357139959931374, "rewards/margins": 0.020522119477391243, "rewards/rejected": -0.00816497765481472, "step": 314 }, { "epoch": 0.78, "learning_rate": 6.952933789903298e-08, "logits/chosen": -0.26365047693252563, "logits/rejected": -0.26520147919654846, "logps/chosen": -147.3225860595703, "logps/rejected": -137.53253173828125, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.019404983147978783, "rewards/margins": 0.02574462816119194, "rewards/rejected": -0.006339645013213158, "step": 315 }, { "epoch": 0.79, "learning_rate": 6.93435541886344e-08, "logits/chosen": -0.32259970903396606, "logits/rejected": -0.25456318259239197, "logps/chosen": -170.91693115234375, "logps/rejected": -167.07968139648438, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 0.003729629097506404, "rewards/margins": 0.005923078395426273, "rewards/rejected": -0.002193450927734375, "step": 316 }, { "epoch": 0.79, "learning_rate": 6.915745587693364e-08, "logits/chosen": -0.24620762467384338, "logits/rejected": -0.2140384167432785, "logps/chosen": -185.3272705078125, "logps/rejected": -155.64073181152344, "loss": 0.6949, "rewards/accuracies": 0.5, "rewards/chosen": -0.027968216687440872, "rewards/margins": -0.010698318481445312, "rewards/rejected": -0.01726989820599556, "step": 317 }, { "epoch": 0.79, "learning_rate": 6.89710459906119e-08, "logits/chosen": -0.27702128887176514, "logits/rejected": -0.28760695457458496, "logps/chosen": -148.6781005859375, "logps/rejected": -148.0454559326172, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.020235253497958183, "rewards/margins": 0.002300452906638384, "rewards/rejected": 0.01793479733169079, "step": 318 }, { "epoch": 0.79, "learning_rate": 6.878432756141775e-08, "logits/chosen": -0.3034825623035431, "logits/rejected": -0.29435667395591736, "logps/chosen": -132.8773956298828, "logps/rejected": -157.78843688964844, "loss": 0.688, "rewards/accuracies": 0.5, "rewards/chosen": 0.009839629754424095, "rewards/margins": -0.012009048834443092, "rewards/rejected": 0.021848678588867188, "step": 319 }, { "epoch": 0.8, "learning_rate": 6.859730362611788e-08, "logits/chosen": -0.21969380974769592, "logits/rejected": -0.21817569434642792, "logps/chosen": -159.6400146484375, "logps/rejected": -149.80422973632812, "loss": 0.6913, "rewards/accuracies": 0.25, "rewards/chosen": 0.004973411560058594, "rewards/margins": -0.014077185653150082, "rewards/rejected": 0.01905059814453125, "step": 320 }, { "epoch": 0.8, "learning_rate": 6.840997722644768e-08, "logits/chosen": -0.2871119976043701, "logits/rejected": -0.2761380970478058, "logps/chosen": -141.6320343017578, "logps/rejected": -191.8581085205078, "loss": 0.6897, "rewards/accuracies": 0.75, "rewards/chosen": 0.01703071780502796, "rewards/margins": 0.01784515380859375, "rewards/rejected": -0.0008144378662109375, "step": 321 }, { "epoch": 0.8, "learning_rate": 6.822235140906182e-08, "logits/chosen": -0.29278433322906494, "logits/rejected": -0.26775112748146057, "logps/chosen": -136.64317321777344, "logps/rejected": -198.63494873046875, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.007469368167221546, "rewards/margins": 0.0043977731838822365, "rewards/rejected": 0.0030715942848473787, "step": 322 }, { "epoch": 0.8, "learning_rate": 6.803442922548461e-08, "logits/chosen": -0.29926878213882446, "logits/rejected": -0.2825644612312317, "logps/chosen": -123.6905746459961, "logps/rejected": -143.47262573242188, "loss": 0.691, "rewards/accuracies": 0.25, "rewards/chosen": -0.009222602471709251, "rewards/margins": -0.02316303178668022, "rewards/rejected": 0.01394042931497097, "step": 323 }, { "epoch": 0.81, "learning_rate": 6.78462137320605e-08, "logits/chosen": -0.2819550335407257, "logits/rejected": -0.26497647166252136, "logps/chosen": -140.44480895996094, "logps/rejected": -179.17251586914062, "loss": 0.6916, "rewards/accuracies": 0.75, "rewards/chosen": 0.023299217224121094, "rewards/margins": 0.022881504148244858, "rewards/rejected": 0.0004177093505859375, "step": 324 }, { "epoch": 0.81, "learning_rate": 6.765770798990422e-08, "logits/chosen": -0.14662609994411469, "logits/rejected": -0.11066768318414688, "logps/chosen": -153.19166564941406, "logps/rejected": -150.35302734375, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": -0.00347976665943861, "rewards/margins": 0.02643890492618084, "rewards/rejected": -0.029918670654296875, "step": 325 }, { "epoch": 0.81, "learning_rate": 6.746891506485111e-08, "logits/chosen": -0.459929496049881, "logits/rejected": -0.44970253109931946, "logps/chosen": -156.84396362304688, "logps/rejected": -163.0205078125, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.011661147698760033, "rewards/margins": -0.017184067517518997, "rewards/rejected": 0.02884521521627903, "step": 326 }, { "epoch": 0.81, "learning_rate": 6.727983802740722e-08, "logits/chosen": -0.5098133683204651, "logits/rejected": -0.4847020208835602, "logps/chosen": -141.37808227539062, "logps/rejected": -135.517578125, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": -0.00447769183665514, "rewards/margins": 0.014820479787886143, "rewards/rejected": -0.019298173487186432, "step": 327 }, { "epoch": 0.82, "learning_rate": 6.709047995269937e-08, "logits/chosen": -0.31755319237709045, "logits/rejected": -0.30586639046669006, "logps/chosen": -131.04705810546875, "logps/rejected": -146.08477783203125, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": -0.008494378067553043, "rewards/margins": 0.01799144595861435, "rewards/rejected": -0.026485824957489967, "step": 328 }, { "epoch": 0.82, "learning_rate": 6.690084392042513e-08, "logits/chosen": -0.32358673214912415, "logits/rejected": -0.31914737820625305, "logps/chosen": -153.02322387695312, "logps/rejected": -159.22830200195312, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": -0.011424255557358265, "rewards/margins": -0.017609406262636185, "rewards/rejected": 0.0061851502396166325, "step": 329 }, { "epoch": 0.82, "learning_rate": 6.671093301480275e-08, "logits/chosen": -0.2944464683532715, "logits/rejected": -0.29077959060668945, "logps/chosen": -185.3714599609375, "logps/rejected": -186.38711547851562, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.01912078820168972, "rewards/margins": 0.031784821301698685, "rewards/rejected": -0.01266403216868639, "step": 330 }, { "epoch": 0.82, "learning_rate": 6.652075032452096e-08, "logits/chosen": -0.23162448406219482, "logits/rejected": -0.23530130088329315, "logps/chosen": -131.11181640625, "logps/rejected": -179.89773559570312, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": 0.001079559326171875, "rewards/margins": 0.011326026171445847, "rewards/rejected": -0.010246466845273972, "step": 331 }, { "epoch": 0.83, "learning_rate": 6.633029894268878e-08, "logits/chosen": -0.21607355773448944, "logits/rejected": -0.19596712291240692, "logps/chosen": -118.93038940429688, "logps/rejected": -163.18548583984375, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": -0.0014316556043922901, "rewards/margins": -0.00036220578476786613, "rewards/rejected": -0.001069449819624424, "step": 332 }, { "epoch": 0.83, "learning_rate": 6.613958196678525e-08, "logits/chosen": -0.4220534563064575, "logits/rejected": -0.40012553334236145, "logps/chosen": -118.59180450439453, "logps/rejected": -161.42613220214844, "loss": 0.6917, "rewards/accuracies": 0.5, "rewards/chosen": 0.017593763768672943, "rewards/margins": 0.010073087178170681, "rewards/rejected": 0.0075206756591796875, "step": 333 }, { "epoch": 0.83, "learning_rate": 6.594860249860887e-08, "logits/chosen": -0.3144893944263458, "logits/rejected": -0.30503252148628235, "logps/chosen": -127.30734252929688, "logps/rejected": -141.81088256835938, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003406526520848274, "rewards/margins": 0.0005697254091501236, "rewards/rejected": -0.00022907275706529617, "step": 334 }, { "epoch": 0.83, "learning_rate": 6.575736364422746e-08, "logits/chosen": -0.2669622302055359, "logits/rejected": -0.2654730975627899, "logps/chosen": -150.8343963623047, "logps/rejected": -176.5886993408203, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": -0.009309006854891777, "rewards/margins": -0.018013764172792435, "rewards/rejected": 0.008704757317900658, "step": 335 }, { "epoch": 0.84, "learning_rate": 6.556586851392729e-08, "logits/chosen": -0.2423519343137741, "logits/rejected": -0.21280045807361603, "logps/chosen": -138.35580444335938, "logps/rejected": -172.50079345703125, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.0036285403184592724, "rewards/margins": 3.852788358926773e-05, "rewards/rejected": 0.0035900124348700047, "step": 336 }, { "epoch": 0.84, "learning_rate": 6.53741202221628e-08, "logits/chosen": -0.29869505763053894, "logits/rejected": -0.29253914952278137, "logps/chosen": -122.28984069824219, "logps/rejected": -162.43768310546875, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.004367447458207607, "rewards/margins": -0.021445848047733307, "rewards/rejected": 0.02581329643726349, "step": 337 }, { "epoch": 0.84, "learning_rate": 6.518212188750578e-08, "logits/chosen": -0.1878884732723236, "logits/rejected": -0.21235786378383636, "logps/chosen": -153.82379150390625, "logps/rejected": -150.42935180664062, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.007975387386977673, "rewards/margins": -0.004745863378047943, "rewards/rejected": -0.0032295221462845802, "step": 338 }, { "epoch": 0.84, "learning_rate": 6.498987663259466e-08, "logits/chosen": -0.30196428298950195, "logits/rejected": -0.2838250994682312, "logps/chosen": -149.5569610595703, "logps/rejected": -151.07559204101562, "loss": 0.6957, "rewards/accuracies": 0.75, "rewards/chosen": -0.015181350521743298, "rewards/margins": 0.022324945777654648, "rewards/rejected": -0.03750629350543022, "step": 339 }, { "epoch": 0.85, "learning_rate": 6.479738758408378e-08, "logits/chosen": -0.4103127717971802, "logits/rejected": -0.38814565539360046, "logps/chosen": -154.9079132080078, "logps/rejected": -169.32843017578125, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.008892822079360485, "rewards/margins": 0.022696303203701973, "rewards/rejected": -0.013803482986986637, "step": 340 }, { "epoch": 0.85, "learning_rate": 6.46046578725925e-08, "logits/chosen": -0.33965998888015747, "logits/rejected": -0.3100195527076721, "logps/chosen": -147.24024963378906, "logps/rejected": -181.55633544921875, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": 0.00545654259622097, "rewards/margins": 0.03369598463177681, "rewards/rejected": -0.02823944017291069, "step": 341 }, { "epoch": 0.85, "learning_rate": 6.441169063265429e-08, "logits/chosen": -0.19639413058757782, "logits/rejected": -0.1650552600622177, "logps/chosen": -152.89837646484375, "logps/rejected": -203.32830810546875, "loss": 0.6841, "rewards/accuracies": 0.75, "rewards/chosen": 0.014898300170898438, "rewards/margins": 0.028778076171875, "rewards/rejected": -0.013879776000976562, "step": 342 }, { "epoch": 0.85, "learning_rate": 6.42184890026658e-08, "logits/chosen": -0.20559574663639069, "logits/rejected": -0.2027675211429596, "logps/chosen": -132.7511749267578, "logps/rejected": -125.82939147949219, "loss": 0.6911, "rewards/accuracies": 0.0, "rewards/chosen": -0.020386315882205963, "rewards/margins": -0.013251114636659622, "rewards/rejected": -0.007135200314223766, "step": 343 }, { "epoch": 0.86, "learning_rate": 6.402505612483569e-08, "logits/chosen": -0.26869186758995056, "logits/rejected": -0.24253499507904053, "logps/chosen": -144.974609375, "logps/rejected": -145.93035888671875, "loss": 0.6919, "rewards/accuracies": 0.0, "rewards/chosen": -0.007283782586455345, "rewards/margins": -0.02226886712014675, "rewards/rejected": 0.014985084533691406, "step": 344 }, { "epoch": 0.86, "learning_rate": 6.383139514513367e-08, "logits/chosen": -0.24135564267635345, "logits/rejected": -0.2323184758424759, "logps/chosen": -135.3694610595703, "logps/rejected": -155.9537353515625, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.008386420086026192, "rewards/margins": 0.0050285328179597855, "rewards/rejected": 0.0033578877337276936, "step": 345 }, { "epoch": 0.86, "learning_rate": 6.363750921323928e-08, "logits/chosen": -0.3537995219230652, "logits/rejected": -0.3171985447406769, "logps/chosen": -151.13662719726562, "logps/rejected": -161.82489013671875, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.01337966974824667, "rewards/margins": 0.021019363775849342, "rewards/rejected": -0.007639694958925247, "step": 346 }, { "epoch": 0.86, "learning_rate": 6.34434014824906e-08, "logits/chosen": -0.25661635398864746, "logits/rejected": -0.27793172001838684, "logps/chosen": -158.03439331054688, "logps/rejected": -160.9471893310547, "loss": 0.6901, "rewards/accuracies": 0.75, "rewards/chosen": -0.004362679086625576, "rewards/margins": 0.03298816829919815, "rewards/rejected": -0.0373508483171463, "step": 347 }, { "epoch": 0.87, "learning_rate": 6.324907510983309e-08, "logits/chosen": -0.31726983189582825, "logits/rejected": -0.2595425546169281, "logps/chosen": -156.88409423828125, "logps/rejected": -149.5876922607422, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": -0.011295318603515625, "rewards/margins": -0.0001518242061138153, "rewards/rejected": -0.01114349253475666, "step": 348 }, { "epoch": 0.87, "learning_rate": 6.305453325576809e-08, "logits/chosen": -0.2720155417919159, "logits/rejected": -0.24376578629016876, "logps/chosen": -147.04150390625, "logps/rejected": -182.63087463378906, "loss": 0.681, "rewards/accuracies": 0.75, "rewards/chosen": 0.017029762268066406, "rewards/margins": 0.008805274963378906, "rewards/rejected": 0.008224486373364925, "step": 349 }, { "epoch": 0.87, "learning_rate": 6.285977908430158e-08, "logits/chosen": -0.2788761854171753, "logits/rejected": -0.3170336186885834, "logps/chosen": -143.9376220703125, "logps/rejected": -169.24916076660156, "loss": 0.6814, "rewards/accuracies": 0.5, "rewards/chosen": -0.0008255001157522202, "rewards/margins": 0.006516266614198685, "rewards/rejected": -0.007341766729950905, "step": 350 }, { "epoch": 0.87, "learning_rate": 6.266481576289261e-08, "logits/chosen": -0.15854841470718384, "logits/rejected": -0.16366548836231232, "logps/chosen": -150.04006958007812, "logps/rejected": -136.51756286621094, "loss": 0.6811, "rewards/accuracies": 0.75, "rewards/chosen": 0.025774383917450905, "rewards/margins": 0.0320066437125206, "rewards/rejected": -0.006232261657714844, "step": 351 }, { "epoch": 0.88, "learning_rate": 6.246964646240185e-08, "logits/chosen": -0.25463050603866577, "logits/rejected": -0.2361709475517273, "logps/chosen": -119.09761047363281, "logps/rejected": -168.5689697265625, "loss": 0.6939, "rewards/accuracies": 1.0, "rewards/chosen": 0.018095780164003372, "rewards/margins": 0.06539249420166016, "rewards/rejected": -0.04729671776294708, "step": 352 }, { "epoch": 0.88, "learning_rate": 6.227427435703996e-08, "logits/chosen": -0.2393677830696106, "logits/rejected": -0.23240779340267181, "logps/chosen": -143.42897033691406, "logps/rejected": -148.79185485839844, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.02924957312643528, "rewards/margins": 0.009974670596420765, "rewards/rejected": 0.01927490159869194, "step": 353 }, { "epoch": 0.88, "learning_rate": 6.207870262431598e-08, "logits/chosen": -0.4100196957588196, "logits/rejected": -0.36022087931632996, "logps/chosen": -147.70956420898438, "logps/rejected": -161.21975708007812, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.02192058600485325, "rewards/margins": 0.051328469067811966, "rewards/rejected": -0.029407883062958717, "step": 354 }, { "epoch": 0.88, "learning_rate": 6.188293444498573e-08, "logits/chosen": -0.43768709897994995, "logits/rejected": -0.4099644720554352, "logps/chosen": -164.7976531982422, "logps/rejected": -166.37921142578125, "loss": 0.683, "rewards/accuracies": 0.5, "rewards/chosen": 0.010435676202178001, "rewards/margins": 0.016027258709073067, "rewards/rejected": -0.005591582506895065, "step": 355 }, { "epoch": 0.89, "learning_rate": 6.168697300299994e-08, "logits/chosen": -0.18835365772247314, "logits/rejected": -0.1590977907180786, "logps/chosen": -138.63629150390625, "logps/rejected": -174.58013916015625, "loss": 0.6819, "rewards/accuracies": 0.75, "rewards/chosen": 0.0031909942626953125, "rewards/margins": 0.0440242774784565, "rewards/rejected": -0.040833283215761185, "step": 356 }, { "epoch": 0.89, "learning_rate": 6.149082148545257e-08, "logits/chosen": -0.32303866744041443, "logits/rejected": -0.29321756958961487, "logps/chosen": -148.59129333496094, "logps/rejected": -138.0511474609375, "loss": 0.6831, "rewards/accuracies": 0.5, "rewards/chosen": 0.025600243359804153, "rewards/margins": 0.017734527587890625, "rewards/rejected": 0.007865714840590954, "step": 357 }, { "epoch": 0.89, "learning_rate": 6.129448308252899e-08, "logits/chosen": -0.5158710479736328, "logits/rejected": -0.5058121681213379, "logps/chosen": -130.06402587890625, "logps/rejected": -136.27020263671875, "loss": 0.69, "rewards/accuracies": 0.25, "rewards/chosen": -0.01463031955063343, "rewards/margins": 0.0026157372631132603, "rewards/rejected": -0.017246056348085403, "step": 358 }, { "epoch": 0.89, "learning_rate": 6.109796098745397e-08, "logits/chosen": -0.35153642296791077, "logits/rejected": -0.2862270176410675, "logps/chosen": -138.31053161621094, "logps/rejected": -179.52174377441406, "loss": 0.6832, "rewards/accuracies": 0.75, "rewards/chosen": 0.014247514307498932, "rewards/margins": 0.031034469604492188, "rewards/rejected": -0.016786957159638405, "step": 359 }, { "epoch": 0.9, "learning_rate": 6.090125839643991e-08, "logits/chosen": -0.2833433449268341, "logits/rejected": -0.2729432284832001, "logps/chosen": -158.3718719482422, "logps/rejected": -144.03128051757812, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 0.023208238184452057, "rewards/margins": 0.03015136905014515, "rewards/rejected": -0.006943130865693092, "step": 360 }, { "epoch": 0.9, "learning_rate": 6.070437850863472e-08, "logits/chosen": -0.2236456722021103, "logits/rejected": -0.2183639109134674, "logps/chosen": -129.0577850341797, "logps/rejected": -145.51480102539062, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": -0.015596963465213776, "rewards/margins": 0.00019855424761772156, "rewards/rejected": -0.015795517712831497, "step": 361 }, { "epoch": 0.9, "learning_rate": 6.050732452606985e-08, "logits/chosen": -0.2381472885608673, "logits/rejected": -0.2416631430387497, "logps/chosen": -149.8536376953125, "logps/rejected": -169.5762939453125, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.021961595863103867, "rewards/margins": -0.005927848629653454, "rewards/rejected": -0.01603374443948269, "step": 362 }, { "epoch": 0.9, "learning_rate": 6.031009965360824e-08, "logits/chosen": -0.27770763635635376, "logits/rejected": -0.2847503125667572, "logps/chosen": -147.86627197265625, "logps/rejected": -148.46356201171875, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 0.00782699603587389, "rewards/margins": 0.02292461320757866, "rewards/rejected": -0.01509761717170477, "step": 363 }, { "epoch": 0.91, "learning_rate": 6.011270709889213e-08, "logits/chosen": -0.24423371255397797, "logits/rejected": -0.2335626780986786, "logps/chosen": -134.89256286621094, "logps/rejected": -176.87225341796875, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": 0.010548019781708717, "rewards/margins": 0.04046668857336044, "rewards/rejected": -0.029918670654296875, "step": 364 }, { "epoch": 0.91, "learning_rate": 5.991515007229092e-08, "logits/chosen": -0.29664504528045654, "logits/rejected": -0.30107381939888, "logps/chosen": -114.81784057617188, "logps/rejected": -204.8734588623047, "loss": 0.6849, "rewards/accuracies": 0.75, "rewards/chosen": -0.007299614138901234, "rewards/margins": 0.004866981878876686, "rewards/rejected": -0.012166595086455345, "step": 365 }, { "epoch": 0.91, "learning_rate": 5.9717431786849e-08, "logits/chosen": -0.3109840452671051, "logits/rejected": -0.3111136555671692, "logps/chosen": -164.4296875, "logps/rejected": -169.23028564453125, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": 0.002585791517049074, "rewards/margins": 0.007693672552704811, "rewards/rejected": -0.005107879638671875, "step": 366 }, { "epoch": 0.91, "learning_rate": 5.9519555458233416e-08, "logits/chosen": -0.33236220479011536, "logits/rejected": -0.31515198945999146, "logps/chosen": -161.90692138671875, "logps/rejected": -148.18841552734375, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": 0.019072342664003372, "rewards/margins": 0.022620391100645065, "rewards/rejected": -0.003548049833625555, "step": 367 }, { "epoch": 0.92, "learning_rate": 5.9321524304681636e-08, "logits/chosen": -0.3025957942008972, "logits/rejected": -0.28277894854545593, "logps/chosen": -167.50917053222656, "logps/rejected": -168.6441650390625, "loss": 0.6766, "rewards/accuracies": 0.5, "rewards/chosen": -0.003797531593590975, "rewards/margins": -0.004277801141142845, "rewards/rejected": 0.00048027001321315765, "step": 368 }, { "epoch": 0.92, "learning_rate": 5.912334154694918e-08, "logits/chosen": -0.2373957484960556, "logits/rejected": -0.22318467497825623, "logps/chosen": -145.83221435546875, "logps/rejected": -167.86843872070312, "loss": 0.6907, "rewards/accuracies": 0.5, "rewards/chosen": 0.026496123522520065, "rewards/margins": 0.03252754360437393, "rewards/rejected": -0.006031418219208717, "step": 369 }, { "epoch": 0.92, "learning_rate": 5.8925010408257205e-08, "logits/chosen": -0.27252766489982605, "logits/rejected": -0.23249930143356323, "logps/chosen": -135.58291625976562, "logps/rejected": -173.8125457763672, "loss": 0.6897, "rewards/accuracies": 0.75, "rewards/chosen": 0.01874084584414959, "rewards/margins": 0.031122971326112747, "rewards/rejected": -0.012382125481963158, "step": 370 }, { "epoch": 0.92, "learning_rate": 5.8726534114240165e-08, "logits/chosen": -0.2407573014497757, "logits/rejected": -0.2188364714384079, "logps/chosen": -127.37445831298828, "logps/rejected": -163.03695678710938, "loss": 0.6842, "rewards/accuracies": 0.75, "rewards/chosen": 0.023225022479891777, "rewards/margins": 0.031954575330019, "rewards/rejected": -0.008729553781449795, "step": 371 }, { "epoch": 0.93, "learning_rate": 5.85279158928933e-08, "logits/chosen": -0.2110321819782257, "logits/rejected": -0.227130725979805, "logps/chosen": -145.1591796875, "logps/rejected": -156.26292419433594, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": -0.002439117291942239, "rewards/margins": 0.023968124762177467, "rewards/rejected": -0.026407241821289062, "step": 372 }, { "epoch": 0.93, "learning_rate": 5.832915897452008e-08, "logits/chosen": -0.2963154911994934, "logits/rejected": -0.27632197737693787, "logps/chosen": -172.1112060546875, "logps/rejected": -156.72808837890625, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.05883197858929634, "rewards/margins": 0.05720462650060654, "rewards/rejected": 0.0016273499932140112, "step": 373 }, { "epoch": 0.93, "learning_rate": 5.813026659167981e-08, "logits/chosen": -0.35182395577430725, "logits/rejected": -0.33492299914360046, "logps/chosen": -145.2403106689453, "logps/rejected": -145.70626831054688, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": -0.025806045159697533, "rewards/margins": -0.005769345909357071, "rewards/rejected": -0.020036697387695312, "step": 374 }, { "epoch": 0.93, "learning_rate": 5.7931241979134914e-08, "logits/chosen": -0.4178738296031952, "logits/rejected": -0.4064503610134125, "logps/chosen": -161.0409698486328, "logps/rejected": -154.9353790283203, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": 0.012017440982162952, "rewards/margins": 0.016335677355527878, "rewards/rejected": -0.004318237770348787, "step": 375 }, { "epoch": 0.94, "learning_rate": 5.7732088373798424e-08, "logits/chosen": -0.3233989179134369, "logits/rejected": -0.30929049849510193, "logps/chosen": -117.18815612792969, "logps/rejected": -142.359130859375, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": 0.007401275914162397, "rewards/margins": 0.011991120874881744, "rewards/rejected": -0.00458984449505806, "step": 376 }, { "epoch": 0.94, "learning_rate": 5.753280901468125e-08, "logits/chosen": -0.2317524403333664, "logits/rejected": -0.23095571994781494, "logps/chosen": -166.931884765625, "logps/rejected": -184.88314819335938, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.0015460969880223274, "rewards/margins": 0.03398093953728676, "rewards/rejected": -0.032434847205877304, "step": 377 }, { "epoch": 0.94, "learning_rate": 5.733340714283958e-08, "logits/chosen": -0.12190411239862442, "logits/rejected": -0.12927336990833282, "logps/chosen": -197.7294921875, "logps/rejected": -180.0089111328125, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": -0.006522751413285732, "rewards/margins": 0.031439974904060364, "rewards/rejected": -0.03796272352337837, "step": 378 }, { "epoch": 0.94, "learning_rate": 5.713388600132216e-08, "logits/chosen": -0.5150523781776428, "logits/rejected": -0.4669954478740692, "logps/chosen": -133.3107452392578, "logps/rejected": -148.522216796875, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.019606972113251686, "rewards/margins": 0.06208210438489914, "rewards/rejected": -0.042475126683712006, "step": 379 }, { "epoch": 0.95, "learning_rate": 5.693424883511747e-08, "logits/chosen": -0.30378907918930054, "logits/rejected": -0.29691463708877563, "logps/chosen": -165.95388793945312, "logps/rejected": -166.8492889404297, "loss": 0.6875, "rewards/accuracies": 0.25, "rewards/chosen": -0.0002964017912745476, "rewards/margins": -0.0020545953884720802, "rewards/rejected": 0.00175819406285882, "step": 380 }, { "epoch": 0.95, "learning_rate": 5.6734498891101005e-08, "logits/chosen": -0.3297489285469055, "logits/rejected": -0.31478726863861084, "logps/chosen": -120.26414489746094, "logps/rejected": -174.33535766601562, "loss": 0.6923, "rewards/accuracies": 0.75, "rewards/chosen": -0.01181869488209486, "rewards/margins": -0.005229569971561432, "rewards/rejected": -0.006589126773178577, "step": 381 }, { "epoch": 0.95, "learning_rate": 5.6534639417982513e-08, "logits/chosen": -0.4331267178058624, "logits/rejected": -0.4167942702770233, "logps/chosen": -180.66241455078125, "logps/rejected": -180.63372802734375, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": 0.016769029200077057, "rewards/margins": 0.020850373432040215, "rewards/rejected": -0.004081345163285732, "step": 382 }, { "epoch": 0.95, "learning_rate": 5.6334673666253054e-08, "logits/chosen": -0.3383549749851227, "logits/rejected": -0.32620862126350403, "logps/chosen": -125.46138000488281, "logps/rejected": -184.62753295898438, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": -0.004977036267518997, "rewards/margins": 0.04506683349609375, "rewards/rejected": -0.05004386976361275, "step": 383 }, { "epoch": 0.96, "learning_rate": 5.613460488813224e-08, "logits/chosen": -0.3283133804798126, "logits/rejected": -0.297524631023407, "logps/chosen": -141.740966796875, "logps/rejected": -154.634765625, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": 0.019074440002441406, "rewards/margins": -0.006908608600497246, "rewards/rejected": 0.025983048602938652, "step": 384 }, { "epoch": 0.96, "learning_rate": 5.593443633751527e-08, "logits/chosen": -0.21257595717906952, "logits/rejected": -0.20411695539951324, "logps/chosen": -127.48888397216797, "logps/rejected": -166.86410522460938, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": -0.0015047071501612663, "rewards/margins": -0.017706871032714844, "rewards/rejected": 0.016202162951231003, "step": 385 }, { "epoch": 0.96, "learning_rate": 5.5734171269920025e-08, "logits/chosen": -0.3495387136936188, "logits/rejected": -0.35712844133377075, "logps/chosen": -144.152099609375, "logps/rejected": -149.45916748046875, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.00885620154440403, "rewards/margins": -0.0026943208649754524, "rewards/rejected": 0.011550520546734333, "step": 386 }, { "epoch": 0.96, "learning_rate": 5.553381294243412e-08, "logits/chosen": -0.4139474630355835, "logits/rejected": -0.3941117227077484, "logps/chosen": -142.09042358398438, "logps/rejected": -156.83660888671875, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": -0.001422310248017311, "rewards/margins": 0.03882160410284996, "rewards/rejected": -0.04024391248822212, "step": 387 }, { "epoch": 0.97, "learning_rate": 5.533336461366199e-08, "logits/chosen": -0.27657651901245117, "logits/rejected": -0.25089386105537415, "logps/chosen": -171.19093322753906, "logps/rejected": -172.11849975585938, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.020765304565429688, "rewards/margins": -0.005755234509706497, "rewards/rejected": -0.01501007191836834, "step": 388 }, { "epoch": 0.97, "learning_rate": 5.513282954367178e-08, "logits/chosen": -0.3574690520763397, "logits/rejected": -0.34899359941482544, "logps/chosen": -166.58676147460938, "logps/rejected": -163.39552307128906, "loss": 0.689, "rewards/accuracies": 0.25, "rewards/chosen": -0.012101555243134499, "rewards/margins": -0.02978801727294922, "rewards/rejected": 0.01768646389245987, "step": 389 }, { "epoch": 0.97, "learning_rate": 5.493221099394239e-08, "logits/chosen": -0.2717186212539673, "logits/rejected": -0.20323744416236877, "logps/chosen": -137.66378784179688, "logps/rejected": -156.96243286132812, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": 0.003407288109883666, "rewards/margins": 0.0036102295853197575, "rewards/rejected": -0.00020294217392802238, "step": 390 }, { "epoch": 0.97, "learning_rate": 5.4731512227310437e-08, "logits/chosen": -0.41080591082572937, "logits/rejected": -0.3731396496295929, "logps/chosen": -153.28765869140625, "logps/rejected": -169.6854248046875, "loss": 0.6887, "rewards/accuracies": 0.25, "rewards/chosen": -0.03673133999109268, "rewards/margins": -0.03338813781738281, "rewards/rejected": -0.0033431993797421455, "step": 391 }, { "epoch": 0.98, "learning_rate": 5.453073650791723e-08, "logits/chosen": -0.5055679678916931, "logits/rejected": -0.5184247493743896, "logps/chosen": -153.9371337890625, "logps/rejected": -168.031494140625, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": -0.015539170242846012, "rewards/margins": 0.008207892999053001, "rewards/rejected": -0.023747064173221588, "step": 392 }, { "epoch": 0.98, "learning_rate": 5.4329887101155525e-08, "logits/chosen": -0.28352367877960205, "logits/rejected": -0.289715975522995, "logps/chosen": -117.79191589355469, "logps/rejected": -146.524658203125, "loss": 0.6831, "rewards/accuracies": 0.5, "rewards/chosen": 0.025947952643036842, "rewards/margins": 0.03984985500574112, "rewards/rejected": -0.013901901431381702, "step": 393 }, { "epoch": 0.98, "learning_rate": 5.4128967273616623e-08, "logits/chosen": -0.3262452781200409, "logits/rejected": -0.3285953402519226, "logps/chosen": -139.6022186279297, "logps/rejected": -155.60865783691406, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": -0.01960144191980362, "rewards/margins": 0.0038009630516171455, "rewards/rejected": -0.02340240590274334, "step": 394 }, { "epoch": 0.98, "learning_rate": 5.392798029303705e-08, "logits/chosen": -0.2755436897277832, "logits/rejected": -0.247481569647789, "logps/chosen": -128.03334045410156, "logps/rejected": -173.80462646484375, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": 0.00035896338522434235, "rewards/margins": 0.0002880091778934002, "rewards/rejected": 7.095345063135028e-05, "step": 395 }, { "epoch": 0.99, "learning_rate": 5.372692942824555e-08, "logits/chosen": -0.2168724089860916, "logits/rejected": -0.19696547091007233, "logps/chosen": -112.45630645751953, "logps/rejected": -157.48486328125, "loss": 0.6885, "rewards/accuracies": 0.75, "rewards/chosen": 0.008276748470962048, "rewards/margins": 0.0211854949593544, "rewards/rejected": -0.012908745557069778, "step": 396 }, { "epoch": 0.99, "learning_rate": 5.3525817949109884e-08, "logits/chosen": -0.28183919191360474, "logits/rejected": -0.274641752243042, "logps/chosen": -140.20278930664062, "logps/rejected": -198.18179321289062, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": -0.01975708082318306, "rewards/margins": -0.010359574109315872, "rewards/rejected": -0.009397506713867188, "step": 397 }, { "epoch": 0.99, "learning_rate": 5.332464912648361e-08, "logits/chosen": -0.27677085995674133, "logits/rejected": -0.2748771011829376, "logps/chosen": -138.90187072753906, "logps/rejected": -149.75978088378906, "loss": 0.6967, "rewards/accuracies": 0.25, "rewards/chosen": 0.009492112323641777, "rewards/margins": -0.00878601148724556, "rewards/rejected": 0.018278121948242188, "step": 398 }, { "epoch": 0.99, "learning_rate": 5.312342623215291e-08, "logits/chosen": -0.16639003157615662, "logits/rejected": -0.17365986108779907, "logps/chosen": -172.105224609375, "logps/rejected": -156.21453857421875, "loss": 0.6948, "rewards/accuracies": 1.0, "rewards/chosen": -0.0004337308928370476, "rewards/margins": 0.027239609509706497, "rewards/rejected": -0.02767334133386612, "step": 399 }, { "epoch": 1.0, "learning_rate": 5.292215253878343e-08, "logits/chosen": -0.31481629610061646, "logits/rejected": -0.304355263710022, "logps/chosen": -158.88052368164062, "logps/rejected": -154.33172607421875, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.005457687191665173, "rewards/margins": 0.01750793494284153, "rewards/rejected": -0.012050246819853783, "step": 400 }, { "epoch": 1.0, "learning_rate": 5.272083131986692e-08, "logits/chosen": -0.2191314399242401, "logits/rejected": -0.16566720604896545, "logps/chosen": -160.6725311279297, "logps/rejected": -190.1116180419922, "loss": 0.6868, "rewards/accuracies": 0.0, "rewards/chosen": -0.030547523871064186, "rewards/margins": -0.03711872175335884, "rewards/rejected": 0.0065711974166333675, "step": 401 }, { "epoch": 1.0, "learning_rate": 5.2519465849668174e-08, "logits/chosen": -0.3490391969680786, "logits/rejected": -0.325375497341156, "logps/chosen": -135.93893432617188, "logps/rejected": -145.23484802246094, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": -0.007262038998305798, "rewards/margins": 0.011168671771883965, "rewards/rejected": -0.018430709838867188, "step": 402 }, { "epoch": 1.0, "learning_rate": 5.2318059403171624e-08, "logits/chosen": -0.25223544239997864, "logits/rejected": -0.24327678978443146, "logps/chosen": -152.73580932617188, "logps/rejected": -151.47113037109375, "loss": 0.6902, "rewards/accuracies": 0.75, "rewards/chosen": 0.021676255390048027, "rewards/margins": 0.031125642359256744, "rewards/rejected": -0.009449386969208717, "step": 403 }, { "epoch": 1.0, "learning_rate": 5.211661525602813e-08, "logits/chosen": -0.312659353017807, "logits/rejected": -0.3089643120765686, "logps/chosen": -125.43807220458984, "logps/rejected": -160.4003143310547, "loss": 0.681, "rewards/accuracies": 0.75, "rewards/chosen": -0.002525901887565851, "rewards/margins": -0.0028772358782589436, "rewards/rejected": 0.00035133445635437965, "step": 404 }, { "epoch": 1.01, "learning_rate": 5.191513668450177e-08, "logits/chosen": -0.21477071940898895, "logits/rejected": -0.23838064074516296, "logps/chosen": -157.63064575195312, "logps/rejected": -133.40895080566406, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.03170490264892578, "rewards/margins": 0.040155984461307526, "rewards/rejected": -0.008451081812381744, "step": 405 }, { "epoch": 1.01, "learning_rate": 5.171362696541643e-08, "logits/chosen": -0.42011842131614685, "logits/rejected": -0.400948166847229, "logps/chosen": -138.65213012695312, "logps/rejected": -160.17813110351562, "loss": 0.6881, "rewards/accuracies": 0.0, "rewards/chosen": -0.017650794237852097, "rewards/margins": -0.02034607157111168, "rewards/rejected": 0.002695275004953146, "step": 406 }, { "epoch": 1.01, "learning_rate": 5.151208937610263e-08, "logits/chosen": -0.06795604526996613, "logits/rejected": -0.04549897089600563, "logps/chosen": -140.5522918701172, "logps/rejected": -141.76104736328125, "loss": 0.69, "rewards/accuracies": 0.25, "rewards/chosen": 0.000614548334851861, "rewards/margins": -0.02151184156537056, "rewards/rejected": 0.022126389667391777, "step": 407 }, { "epoch": 1.01, "learning_rate": 5.131052719434412e-08, "logits/chosen": -0.23819629848003387, "logits/rejected": -0.2142365276813507, "logps/chosen": -137.44235229492188, "logps/rejected": -186.45704650878906, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018718717619776726, "rewards/margins": -0.005964660085737705, "rewards/rejected": 0.004092788323760033, "step": 408 }, { "epoch": 1.02, "learning_rate": 5.110894369832465e-08, "logits/chosen": -0.11583978682756424, "logits/rejected": -0.12938320636749268, "logps/chosen": -160.88583374023438, "logps/rejected": -157.98526000976562, "loss": 0.6795, "rewards/accuracies": 0.75, "rewards/chosen": 0.0071582798846066, "rewards/margins": 0.02057647705078125, "rewards/rejected": -0.013418197631835938, "step": 409 }, { "epoch": 1.02, "learning_rate": 5.090734216657462e-08, "logits/chosen": -0.27238157391548157, "logits/rejected": -0.24627035856246948, "logps/chosen": -152.76296997070312, "logps/rejected": -171.91456604003906, "loss": 0.6866, "rewards/accuracies": 0.5, "rewards/chosen": 0.028651809319853783, "rewards/margins": 0.029474256560206413, "rewards/rejected": -0.0008224491029977798, "step": 410 }, { "epoch": 1.02, "learning_rate": 5.0705725877917746e-08, "logits/chosen": -0.3032512962818146, "logits/rejected": -0.2830606698989868, "logps/chosen": -167.57870483398438, "logps/rejected": -178.730224609375, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.013912202790379524, "rewards/margins": 0.05007210001349449, "rewards/rejected": -0.03615989536046982, "step": 411 }, { "epoch": 1.02, "learning_rate": 5.050409811141777e-08, "logits/chosen": -0.15545779466629028, "logits/rejected": -0.129365012049675, "logps/chosen": -144.59722900390625, "logps/rejected": -214.84597778320312, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.00778961181640625, "rewards/margins": 0.02811431884765625, "rewards/rejected": -0.02032470889389515, "step": 412 }, { "epoch": 1.03, "learning_rate": 5.030246214632508e-08, "logits/chosen": -0.253824383020401, "logits/rejected": -0.21987974643707275, "logps/chosen": -165.3353271484375, "logps/rejected": -165.6888427734375, "loss": 0.6942, "rewards/accuracies": 0.75, "rewards/chosen": 0.005537986755371094, "rewards/margins": 0.02100849151611328, "rewards/rejected": -0.015470505692064762, "step": 413 }, { "epoch": 1.03, "learning_rate": 5.010082126202343e-08, "logits/chosen": -0.45335009694099426, "logits/rejected": -0.4214354157447815, "logps/chosen": -139.32781982421875, "logps/rejected": -148.6710662841797, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": -0.005760765168815851, "rewards/margins": 0.008824538439512253, "rewards/rejected": -0.014585305005311966, "step": 414 }, { "epoch": 1.03, "learning_rate": 4.9899178737976575e-08, "logits/chosen": -0.367992103099823, "logits/rejected": -0.36022746562957764, "logps/chosen": -157.1563720703125, "logps/rejected": -163.40719604492188, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": -0.00236854562535882, "rewards/margins": 0.023421861231327057, "rewards/rejected": -0.02579040452837944, "step": 415 }, { "epoch": 1.03, "learning_rate": 4.969753785367493e-08, "logits/chosen": -0.34548017382621765, "logits/rejected": -0.3167077600955963, "logps/chosen": -145.0408935546875, "logps/rejected": -180.2486572265625, "loss": 0.6827, "rewards/accuracies": 0.25, "rewards/chosen": 0.007814216427505016, "rewards/margins": -0.018580438569188118, "rewards/rejected": 0.02639465406537056, "step": 416 }, { "epoch": 1.04, "learning_rate": 4.949590188858223e-08, "logits/chosen": -0.23495666682720184, "logits/rejected": -0.2227388620376587, "logps/chosen": -142.75180053710938, "logps/rejected": -114.5872802734375, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": -0.0010179521050304174, "rewards/margins": 0.033118437975645065, "rewards/rejected": -0.034136392176151276, "step": 417 }, { "epoch": 1.04, "learning_rate": 4.929427412208225e-08, "logits/chosen": -0.24171233177185059, "logits/rejected": -0.24868975579738617, "logps/chosen": -154.09872436523438, "logps/rejected": -120.01252746582031, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.03710785135626793, "rewards/margins": 0.06528415530920029, "rewards/rejected": -0.028176309540867805, "step": 418 }, { "epoch": 1.04, "learning_rate": 4.909265783342539e-08, "logits/chosen": -0.28964707255363464, "logits/rejected": -0.2516990005970001, "logps/chosen": -172.5787811279297, "logps/rejected": -164.59881591796875, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.03607521206140518, "rewards/margins": 0.01019287109375, "rewards/rejected": 0.025882340967655182, "step": 419 }, { "epoch": 1.04, "learning_rate": 4.8891056301675346e-08, "logits/chosen": -0.3227195739746094, "logits/rejected": -0.30802658200263977, "logps/chosen": -150.77674865722656, "logps/rejected": -174.2827911376953, "loss": 0.6899, "rewards/accuracies": 0.75, "rewards/chosen": 0.005997848697006702, "rewards/margins": 0.01930541917681694, "rewards/rejected": -0.013307570479810238, "step": 420 }, { "epoch": 1.05, "learning_rate": 4.868947280565588e-08, "logits/chosen": -0.20686569809913635, "logits/rejected": -0.1979060173034668, "logps/chosen": -150.58209228515625, "logps/rejected": -177.0712432861328, "loss": 0.6935, "rewards/accuracies": 0.25, "rewards/chosen": -0.0020822519436478615, "rewards/margins": -0.013976857997477055, "rewards/rejected": 0.011894606053829193, "step": 421 }, { "epoch": 1.05, "learning_rate": 4.8487910623897385e-08, "logits/chosen": -0.22876782715320587, "logits/rejected": -0.22552242875099182, "logps/chosen": -167.9886474609375, "logps/rejected": -164.04806518554688, "loss": 0.681, "rewards/accuracies": 0.75, "rewards/chosen": 0.010951233096420765, "rewards/margins": 0.01977691799402237, "rewards/rejected": -0.008825683034956455, "step": 422 }, { "epoch": 1.05, "learning_rate": 4.828637303458356e-08, "logits/chosen": -0.2283521294593811, "logits/rejected": -0.20264165103435516, "logps/chosen": -112.83631134033203, "logps/rejected": -138.70608520507812, "loss": 0.6842, "rewards/accuracies": 0.25, "rewards/chosen": 0.008046722039580345, "rewards/margins": 0.00480880681425333, "rewards/rejected": 0.0032379154581576586, "step": 423 }, { "epoch": 1.05, "learning_rate": 4.808486331549823e-08, "logits/chosen": -0.2689732015132904, "logits/rejected": -0.2613367438316345, "logps/chosen": -151.8926544189453, "logps/rejected": -171.47128295898438, "loss": 0.6875, "rewards/accuracies": 0.25, "rewards/chosen": -0.01940765604376793, "rewards/margins": -0.027012255042791367, "rewards/rejected": 0.007604599464684725, "step": 424 }, { "epoch": 1.06, "learning_rate": 4.788338474397187e-08, "logits/chosen": -0.23995482921600342, "logits/rejected": -0.2154190093278885, "logps/chosen": -151.68466186523438, "logps/rejected": -167.5225067138672, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.0182647705078125, "rewards/margins": 0.05400428920984268, "rewards/rejected": -0.03573951870203018, "step": 425 }, { "epoch": 1.06, "learning_rate": 4.7681940596828385e-08, "logits/chosen": -0.24672725796699524, "logits/rejected": -0.22174501419067383, "logps/chosen": -211.1099090576172, "logps/rejected": -131.77804565429688, "loss": 0.6927, "rewards/accuracies": 0.25, "rewards/chosen": -0.005263901315629482, "rewards/margins": 0.0006847381591796875, "rewards/rejected": -0.005948638543486595, "step": 426 }, { "epoch": 1.06, "learning_rate": 4.748053415033183e-08, "logits/chosen": -0.3357540965080261, "logits/rejected": -0.3245833218097687, "logps/chosen": -143.04580688476562, "logps/rejected": -156.8082275390625, "loss": 0.6944, "rewards/accuracies": 0.25, "rewards/chosen": -0.009367752820253372, "rewards/margins": -0.028126144781708717, "rewards/rejected": 0.018758393824100494, "step": 427 }, { "epoch": 1.06, "learning_rate": 4.727916868013309e-08, "logits/chosen": -0.27296003699302673, "logits/rejected": -0.2470063865184784, "logps/chosen": -140.68020629882812, "logps/rejected": -149.87210083007812, "loss": 0.6797, "rewards/accuracies": 0.5, "rewards/chosen": -0.002784156706184149, "rewards/margins": 0.008922386914491653, "rewards/rejected": -0.01170654408633709, "step": 428 }, { "epoch": 1.07, "learning_rate": 4.7077847461216594e-08, "logits/chosen": -0.2807213068008423, "logits/rejected": -0.2788940966129303, "logps/chosen": -120.3724365234375, "logps/rejected": -145.092041015625, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.02442607656121254, "rewards/margins": 0.07832546532154083, "rewards/rejected": -0.053899385035037994, "step": 429 }, { "epoch": 1.07, "learning_rate": 4.6876573767847076e-08, "logits/chosen": -0.3932054042816162, "logits/rejected": -0.36225906014442444, "logps/chosen": -131.5428466796875, "logps/rejected": -171.45230102539062, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": -0.016547774896025658, "rewards/margins": -7.553119212388992e-05, "rewards/rejected": -0.016472244635224342, "step": 430 }, { "epoch": 1.07, "learning_rate": 4.66753508735164e-08, "logits/chosen": -0.27839192748069763, "logits/rejected": -0.23608747124671936, "logps/chosen": -150.8208465576172, "logps/rejected": -149.77182006835938, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": -0.015976333990693092, "rewards/margins": 0.015749357640743256, "rewards/rejected": -0.0317256934940815, "step": 431 }, { "epoch": 1.07, "learning_rate": 4.647418205089012e-08, "logits/chosen": -0.31709644198417664, "logits/rejected": -0.30711308121681213, "logps/chosen": -143.33627319335938, "logps/rejected": -136.26614379882812, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.05244484171271324, "rewards/margins": 0.10769806802272797, "rewards/rejected": -0.055253222584724426, "step": 432 }, { "epoch": 1.08, "learning_rate": 4.6273070571754435e-08, "logits/chosen": -0.30149397253990173, "logits/rejected": -0.2761725187301636, "logps/chosen": -148.1707763671875, "logps/rejected": -152.85848999023438, "loss": 0.6906, "rewards/accuracies": 0.75, "rewards/chosen": 0.003445817157626152, "rewards/margins": 0.001558113843202591, "rewards/rejected": 0.001887703314423561, "step": 433 }, { "epoch": 1.08, "learning_rate": 4.607201970696295e-08, "logits/chosen": -0.2954486012458801, "logits/rejected": -0.3019717037677765, "logps/chosen": -152.1460723876953, "logps/rejected": -176.7606658935547, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.021599579602479935, "rewards/margins": 0.05184517055749893, "rewards/rejected": -0.030245592817664146, "step": 434 }, { "epoch": 1.08, "learning_rate": 4.5871032726383385e-08, "logits/chosen": -0.4802934527397156, "logits/rejected": -0.46721580624580383, "logps/chosen": -146.81573486328125, "logps/rejected": -173.54483032226562, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": 0.011566163040697575, "rewards/margins": 0.0737251341342926, "rewards/rejected": -0.062158964574337006, "step": 435 }, { "epoch": 1.08, "learning_rate": 4.567011289884446e-08, "logits/chosen": -0.319240540266037, "logits/rejected": -0.3411575257778168, "logps/chosen": -158.7672119140625, "logps/rejected": -170.92684936523438, "loss": 0.6923, "rewards/accuracies": 0.25, "rewards/chosen": 0.0038169873878359795, "rewards/margins": -0.021318435668945312, "rewards/rejected": 0.025135422125458717, "step": 436 }, { "epoch": 1.09, "learning_rate": 4.546926349208277e-08, "logits/chosen": -0.34978801012039185, "logits/rejected": -0.3409239649772644, "logps/chosen": -119.12225341796875, "logps/rejected": -168.1797637939453, "loss": 0.6907, "rewards/accuracies": 0.25, "rewards/chosen": -0.024711037054657936, "rewards/margins": -0.03684291988611221, "rewards/rejected": 0.012131881900131702, "step": 437 }, { "epoch": 1.09, "learning_rate": 4.526848777268956e-08, "logits/chosen": -0.3300843834877014, "logits/rejected": -0.3306199312210083, "logps/chosen": -146.4730682373047, "logps/rejected": -195.95120239257812, "loss": 0.692, "rewards/accuracies": 0.25, "rewards/chosen": -0.00091552734375, "rewards/margins": -0.017619704827666283, "rewards/rejected": 0.016704177483916283, "step": 438 }, { "epoch": 1.09, "learning_rate": 4.5067789006057635e-08, "logits/chosen": -0.22803817689418793, "logits/rejected": -0.20285676419734955, "logps/chosen": -160.19993591308594, "logps/rejected": -167.2554168701172, "loss": 0.6802, "rewards/accuracies": 0.25, "rewards/chosen": 0.004535293206572533, "rewards/margins": 0.002489089034497738, "rewards/rejected": 0.0020462041720747948, "step": 439 }, { "epoch": 1.09, "learning_rate": 4.486717045632823e-08, "logits/chosen": -0.2841097116470337, "logits/rejected": -0.27393674850463867, "logps/chosen": -139.43997192382812, "logps/rejected": -155.24798583984375, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.018376922234892845, "rewards/margins": 0.026174545288085938, "rewards/rejected": -0.007797622587531805, "step": 440 }, { "epoch": 1.1, "learning_rate": 4.466663538633801e-08, "logits/chosen": -0.30387774109840393, "logits/rejected": -0.26912450790405273, "logps/chosen": -163.84524536132812, "logps/rejected": -174.91030883789062, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": -0.005013275891542435, "rewards/margins": -0.0011323951184749603, "rewards/rejected": -0.0038808819372206926, "step": 441 }, { "epoch": 1.1, "learning_rate": 4.4466187057565874e-08, "logits/chosen": -0.4027409851551056, "logits/rejected": -0.36318010091781616, "logps/chosen": -151.11239624023438, "logps/rejected": -141.8481903076172, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": 0.0015680305659770966, "rewards/margins": 0.023792268708348274, "rewards/rejected": -0.02222423627972603, "step": 442 }, { "epoch": 1.1, "learning_rate": 4.426582873007998e-08, "logits/chosen": -0.19299112260341644, "logits/rejected": -0.18022224307060242, "logps/chosen": -143.6571502685547, "logps/rejected": -153.28314208984375, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": 0.010234260000288486, "rewards/margins": 0.0026708601508289576, "rewards/rejected": 0.007563400082290173, "step": 443 }, { "epoch": 1.1, "learning_rate": 4.406556366248473e-08, "logits/chosen": -0.13218005001544952, "logits/rejected": -0.13384974002838135, "logps/chosen": -147.25567626953125, "logps/rejected": -192.12301635742188, "loss": 0.6758, "rewards/accuracies": 0.75, "rewards/chosen": 0.013935661874711514, "rewards/margins": 0.04096011817455292, "rewards/rejected": -0.027024460956454277, "step": 444 }, { "epoch": 1.11, "learning_rate": 4.3865395111867767e-08, "logits/chosen": -0.3758508563041687, "logits/rejected": -0.3702693581581116, "logps/chosen": -152.7460174560547, "logps/rejected": -200.8943328857422, "loss": 0.685, "rewards/accuracies": 0.75, "rewards/chosen": 0.002988815074786544, "rewards/margins": 0.038302231580019, "rewards/rejected": -0.03531341627240181, "step": 445 }, { "epoch": 1.11, "learning_rate": 4.3665326333746935e-08, "logits/chosen": -0.27704426646232605, "logits/rejected": -0.2670464515686035, "logps/chosen": -182.75193786621094, "logps/rejected": -136.47950744628906, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": -0.045247647911310196, "rewards/margins": -0.04414176940917969, "rewards/rejected": -0.0011058805976063013, "step": 446 }, { "epoch": 1.11, "learning_rate": 4.3465360582017495e-08, "logits/chosen": -0.3146645426750183, "logits/rejected": -0.31072238087654114, "logps/chosen": -124.87754821777344, "logps/rejected": -142.68258666992188, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.00301189417950809, "rewards/margins": 0.022633744403719902, "rewards/rejected": -0.019621849060058594, "step": 447 }, { "epoch": 1.11, "learning_rate": 4.3265501108899e-08, "logits/chosen": -0.34116098284721375, "logits/rejected": -0.3222542703151703, "logps/chosen": -141.79261779785156, "logps/rejected": -151.22035217285156, "loss": 0.6861, "rewards/accuracies": 0.25, "rewards/chosen": -0.006075095850974321, "rewards/margins": -0.003839683486148715, "rewards/rejected": -0.00223541259765625, "step": 448 }, { "epoch": 1.12, "learning_rate": 4.3065751164882534e-08, "logits/chosen": -0.24974744021892548, "logits/rejected": -0.2754310071468353, "logps/chosen": -152.01919555664062, "logps/rejected": -164.30023193359375, "loss": 0.6918, "rewards/accuracies": 0.75, "rewards/chosen": 0.04511737823486328, "rewards/margins": 0.021411705762147903, "rewards/rejected": 0.023705672472715378, "step": 449 }, { "epoch": 1.12, "learning_rate": 4.286611399867784e-08, "logits/chosen": -0.3475452959537506, "logits/rejected": -0.2990334928035736, "logps/chosen": -141.4932098388672, "logps/rejected": -169.84536743164062, "loss": 0.6875, "rewards/accuracies": 0.25, "rewards/chosen": 0.00864715501666069, "rewards/margins": -0.011856650933623314, "rewards/rejected": 0.020503805950284004, "step": 450 }, { "epoch": 1.12, "learning_rate": 4.2666592857160414e-08, "logits/chosen": -0.37756818532943726, "logits/rejected": -0.34867042303085327, "logps/chosen": -146.9675750732422, "logps/rejected": -182.0978240966797, "loss": 0.6879, "rewards/accuracies": 0.75, "rewards/chosen": 0.010199548676609993, "rewards/margins": 0.012228965759277344, "rewards/rejected": -0.0020294198766350746, "step": 451 }, { "epoch": 1.12, "learning_rate": 4.2467190985318765e-08, "logits/chosen": -0.28295183181762695, "logits/rejected": -0.2806604504585266, "logps/chosen": -168.1715850830078, "logps/rejected": -132.32009887695312, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": -0.008933259174227715, "rewards/margins": 0.0048791877925395966, "rewards/rejected": -0.013812446035444736, "step": 452 }, { "epoch": 1.13, "learning_rate": 4.226791162620158e-08, "logits/chosen": -0.23533563315868378, "logits/rejected": -0.16374456882476807, "logps/chosen": -149.90342712402344, "logps/rejected": -169.86021423339844, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025802617892622948, "rewards/margins": 0.0013549814466387033, "rewards/rejected": 0.0012252808082848787, "step": 453 }, { "epoch": 1.13, "learning_rate": 4.206875802086509e-08, "logits/chosen": -0.3029292821884155, "logits/rejected": -0.28949564695358276, "logps/chosen": -155.91998291015625, "logps/rejected": -151.87466430664062, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": -0.006638718768954277, "rewards/margins": -0.01326217781752348, "rewards/rejected": 0.006623458117246628, "step": 454 }, { "epoch": 1.13, "learning_rate": 4.18697334083202e-08, "logits/chosen": -0.4261821210384369, "logits/rejected": -0.4137859046459198, "logps/chosen": -160.1018829345703, "logps/rejected": -167.47116088867188, "loss": 0.6854, "rewards/accuracies": 0.5, "rewards/chosen": -0.012719535268843174, "rewards/margins": 0.004297446459531784, "rewards/rejected": -0.017016984522342682, "step": 455 }, { "epoch": 1.13, "learning_rate": 4.167084102547991e-08, "logits/chosen": -0.365837037563324, "logits/rejected": -0.3579105734825134, "logps/chosen": -144.99856567382812, "logps/rejected": -165.7381591796875, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.02811451070010662, "rewards/margins": 0.007578850258141756, "rewards/rejected": 0.020535659044981003, "step": 456 }, { "epoch": 1.14, "learning_rate": 4.147208410710671e-08, "logits/chosen": -0.2847432494163513, "logits/rejected": -0.2765738070011139, "logps/chosen": -130.51126098632812, "logps/rejected": -153.5640106201172, "loss": 0.6826, "rewards/accuracies": 0.5, "rewards/chosen": 0.014338493347167969, "rewards/margins": 0.018437005579471588, "rewards/rejected": -0.00409851036965847, "step": 457 }, { "epoch": 1.14, "learning_rate": 4.127346588575983e-08, "logits/chosen": -0.2856087386608124, "logits/rejected": -0.2737632989883423, "logps/chosen": -151.2877197265625, "logps/rejected": -178.12875366210938, "loss": 0.6813, "rewards/accuracies": 0.75, "rewards/chosen": -0.011826515197753906, "rewards/margins": 0.007711790502071381, "rewards/rejected": -0.019538305699825287, "step": 458 }, { "epoch": 1.14, "learning_rate": 4.107498959174279e-08, "logits/chosen": -0.22828355431556702, "logits/rejected": -0.20918801426887512, "logps/chosen": -137.83358764648438, "logps/rejected": -192.49240112304688, "loss": 0.6876, "rewards/accuracies": 0.5, "rewards/chosen": 0.0044044493697583675, "rewards/margins": 0.00917511060833931, "rewards/rejected": -0.00477065984159708, "step": 459 }, { "epoch": 1.14, "learning_rate": 4.0876658453050825e-08, "logits/chosen": -0.1188543289899826, "logits/rejected": -0.12245139479637146, "logps/chosen": -139.78057861328125, "logps/rejected": -171.82269287109375, "loss": 0.6924, "rewards/accuracies": 0.0, "rewards/chosen": -0.01355667132884264, "rewards/margins": -0.028017615899443626, "rewards/rejected": 0.014460945501923561, "step": 460 }, { "epoch": 1.15, "learning_rate": 4.0678475695318366e-08, "logits/chosen": -0.1465557962656021, "logits/rejected": -0.1479463279247284, "logps/chosen": -140.74880981445312, "logps/rejected": -165.22463989257812, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -0.009416580200195312, "rewards/margins": 0.01556167658418417, "rewards/rejected": -0.024978257715702057, "step": 461 }, { "epoch": 1.15, "learning_rate": 4.048044454176657e-08, "logits/chosen": -0.2592134475708008, "logits/rejected": -0.22423423826694489, "logps/chosen": -146.7341766357422, "logps/rejected": -167.3494873046875, "loss": 0.6898, "rewards/accuracies": 0.25, "rewards/chosen": -0.009665871039032936, "rewards/margins": -0.014539910480380058, "rewards/rejected": 0.004874038510024548, "step": 462 }, { "epoch": 1.15, "learning_rate": 4.0282568213151e-08, "logits/chosen": -0.21237249672412872, "logits/rejected": -0.1980186253786087, "logps/chosen": -140.42127990722656, "logps/rejected": -163.5313262939453, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": -0.0001400010660290718, "rewards/margins": 0.005516812205314636, "rewards/rejected": -0.005656814202666283, "step": 463 }, { "epoch": 1.15, "learning_rate": 4.008484992770908e-08, "logits/chosen": -0.21632929146289825, "logits/rejected": -0.1945597529411316, "logps/chosen": -132.65936279296875, "logps/rejected": -162.94581604003906, "loss": 0.6757, "rewards/accuracies": 0.75, "rewards/chosen": 0.004865454975515604, "rewards/margins": 0.041506003588438034, "rewards/rejected": -0.03664054721593857, "step": 464 }, { "epoch": 1.16, "learning_rate": 3.988729290110789e-08, "logits/chosen": -0.18047736585140228, "logits/rejected": -0.1588555872440338, "logps/chosen": -133.6154022216797, "logps/rejected": -132.6431884765625, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.017917253077030182, "rewards/margins": 0.0014289859682321548, "rewards/rejected": 0.016488265246152878, "step": 465 }, { "epoch": 1.16, "learning_rate": 3.968990034639177e-08, "logits/chosen": -0.3868981599807739, "logits/rejected": -0.334175169467926, "logps/chosen": -112.07160186767578, "logps/rejected": -162.97250366210938, "loss": 0.6838, "rewards/accuracies": 0.75, "rewards/chosen": -0.01087036170065403, "rewards/margins": 0.020163346081972122, "rewards/rejected": -0.031033707782626152, "step": 466 }, { "epoch": 1.16, "learning_rate": 3.949267547393016e-08, "logits/chosen": -0.2574305236339569, "logits/rejected": -0.2568013668060303, "logps/chosen": -147.23590087890625, "logps/rejected": -147.0992431640625, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": -0.004633522592484951, "rewards/margins": 0.038610268384218216, "rewards/rejected": -0.04324379190802574, "step": 467 }, { "epoch": 1.16, "learning_rate": 3.92956214913653e-08, "logits/chosen": -0.29132771492004395, "logits/rejected": -0.29925087094306946, "logps/chosen": -175.79559326171875, "logps/rejected": -163.78167724609375, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": -0.03010559268295765, "rewards/margins": -0.02290363423526287, "rewards/rejected": -0.007201956585049629, "step": 468 }, { "epoch": 1.17, "learning_rate": 3.9098741603560095e-08, "logits/chosen": -0.28383779525756836, "logits/rejected": -0.3020591735839844, "logps/chosen": -153.0868682861328, "logps/rejected": -137.07186889648438, "loss": 0.6841, "rewards/accuracies": 0.5, "rewards/chosen": 0.012605668045580387, "rewards/margins": -0.004120255820453167, "rewards/rejected": 0.016725922003388405, "step": 469 }, { "epoch": 1.17, "learning_rate": 3.8902039012546036e-08, "logits/chosen": -0.4138200283050537, "logits/rejected": -0.3950973153114319, "logps/chosen": -130.4630126953125, "logps/rejected": -161.3141326904297, "loss": 0.6887, "rewards/accuracies": 0.75, "rewards/chosen": -0.01744365692138672, "rewards/margins": -0.0023584365844726562, "rewards/rejected": -0.015085220336914062, "step": 470 }, { "epoch": 1.17, "learning_rate": 3.870551691747103e-08, "logits/chosen": -0.2337123155593872, "logits/rejected": -0.21923524141311646, "logps/chosen": -150.5911407470703, "logps/rejected": -180.6041717529297, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": 0.013124849647283554, "rewards/margins": 0.026681140065193176, "rewards/rejected": -0.013556290417909622, "step": 471 }, { "epoch": 1.17, "learning_rate": 3.8509178514547424e-08, "logits/chosen": -0.20491641759872437, "logits/rejected": -0.2009429633617401, "logps/chosen": -149.3642578125, "logps/rejected": -127.1915283203125, "loss": 0.685, "rewards/accuracies": 0.75, "rewards/chosen": -0.010222816839814186, "rewards/margins": 0.01345672644674778, "rewards/rejected": -0.023679541423916817, "step": 472 }, { "epoch": 1.18, "learning_rate": 3.8313026997000066e-08, "logits/chosen": -0.3173534572124481, "logits/rejected": -0.29291555285453796, "logps/chosen": -126.49061584472656, "logps/rejected": -158.16493225097656, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": 0.014625359326601028, "rewards/margins": -0.0014913585036993027, "rewards/rejected": 0.016116715967655182, "step": 473 }, { "epoch": 1.18, "learning_rate": 3.811706555501428e-08, "logits/chosen": -0.30698898434638977, "logits/rejected": -0.26805439591407776, "logps/chosen": -181.66058349609375, "logps/rejected": -129.58082580566406, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": -0.00023231515660881996, "rewards/margins": 0.00894546415656805, "rewards/rejected": -0.009177779778838158, "step": 474 }, { "epoch": 1.18, "learning_rate": 3.792129737568402e-08, "logits/chosen": -0.4402541220188141, "logits/rejected": -0.4393285810947418, "logps/chosen": -172.07510375976562, "logps/rejected": -167.51043701171875, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": 0.009970474988222122, "rewards/margins": 0.017763901501893997, "rewards/rejected": -0.007793426513671875, "step": 475 }, { "epoch": 1.18, "learning_rate": 3.7725725642960044e-08, "logits/chosen": -0.24240317940711975, "logits/rejected": -0.2094256728887558, "logps/chosen": -133.65634155273438, "logps/rejected": -151.72865295410156, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": -0.00723381107673049, "rewards/margins": 0.02504711225628853, "rewards/rejected": -0.032280921936035156, "step": 476 }, { "epoch": 1.19, "learning_rate": 3.753035353759815e-08, "logits/chosen": -0.17416346073150635, "logits/rejected": -0.17423230409622192, "logps/chosen": -143.8734130859375, "logps/rejected": -152.97283935546875, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": 0.004405021667480469, "rewards/margins": -0.0037963856011629105, "rewards/rejected": 0.008201408199965954, "step": 477 }, { "epoch": 1.19, "learning_rate": 3.733518423710739e-08, "logits/chosen": -0.17490865290164948, "logits/rejected": -0.16602027416229248, "logps/chosen": -165.720703125, "logps/rejected": -155.05149841308594, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.020901869982481003, "rewards/margins": -0.006134033203125, "rewards/rejected": -0.014767837710678577, "step": 478 }, { "epoch": 1.19, "learning_rate": 3.714022091569841e-08, "logits/chosen": -0.32308128476142883, "logits/rejected": -0.28792279958724976, "logps/chosen": -145.6859130859375, "logps/rejected": -133.42825317382812, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": -0.0052429200150072575, "rewards/margins": -0.014289474114775658, "rewards/rejected": 0.009046554565429688, "step": 479 }, { "epoch": 1.19, "learning_rate": 3.6945466744231914e-08, "logits/chosen": -0.1822632998228073, "logits/rejected": -0.1503564566373825, "logps/chosen": -130.87075805664062, "logps/rejected": -149.65667724609375, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.03134803846478462, "rewards/margins": 0.03891010582447052, "rewards/rejected": -0.007562066428363323, "step": 480 }, { "epoch": 1.2, "learning_rate": 3.675092489016692e-08, "logits/chosen": -0.14631415903568268, "logits/rejected": -0.14147941768169403, "logps/chosen": -184.51950073242188, "logps/rejected": -176.75933837890625, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": -0.0013561251107603312, "rewards/margins": 0.037514880299568176, "rewards/rejected": -0.038871001452207565, "step": 481 }, { "epoch": 1.2, "learning_rate": 3.655659851750938e-08, "logits/chosen": -0.26541757583618164, "logits/rejected": -0.24085155129432678, "logps/chosen": -156.2366485595703, "logps/rejected": -170.5728759765625, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": 0.010323334485292435, "rewards/margins": 0.018515396863222122, "rewards/rejected": -0.008192062377929688, "step": 482 }, { "epoch": 1.2, "learning_rate": 3.636249078676071e-08, "logits/chosen": -0.3892604112625122, "logits/rejected": -0.34375226497650146, "logps/chosen": -131.53012084960938, "logps/rejected": -158.9450225830078, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": 0.02230529673397541, "rewards/margins": 0.024855805560946465, "rewards/rejected": -0.0025505064986646175, "step": 483 }, { "epoch": 1.2, "learning_rate": 3.6168604854866325e-08, "logits/chosen": -0.464087575674057, "logits/rejected": -0.45465731620788574, "logps/chosen": -147.35052490234375, "logps/rejected": -172.1685791015625, "loss": 0.6842, "rewards/accuracies": 0.75, "rewards/chosen": -0.003220940474420786, "rewards/margins": 0.02485218271613121, "rewards/rejected": -0.02807312272489071, "step": 484 }, { "epoch": 1.21, "learning_rate": 3.5974943875164296e-08, "logits/chosen": -0.27621719241142273, "logits/rejected": -0.29660603404045105, "logps/chosen": -144.27862548828125, "logps/rejected": -171.61692810058594, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.007654381915926933, "rewards/margins": 0.012719152495265007, "rewards/rejected": -0.02037353441119194, "step": 485 }, { "epoch": 1.21, "learning_rate": 3.578151099733421e-08, "logits/chosen": -0.30198466777801514, "logits/rejected": -0.2625545859336853, "logps/chosen": -176.49058532714844, "logps/rejected": -208.35284423828125, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.0003841407597064972, "rewards/margins": 0.0004489906132221222, "rewards/rejected": -6.484962068498135e-05, "step": 486 }, { "epoch": 1.21, "learning_rate": 3.5588309367345705e-08, "logits/chosen": -0.33079349994659424, "logits/rejected": -0.3434094488620758, "logps/chosen": -160.20578002929688, "logps/rejected": -163.48971557617188, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": -0.010322188958525658, "rewards/margins": 0.00936965923756361, "rewards/rejected": -0.019691849127411842, "step": 487 }, { "epoch": 1.21, "learning_rate": 3.539534212740751e-08, "logits/chosen": -0.38274794816970825, "logits/rejected": -0.37483251094818115, "logps/chosen": -187.68011474609375, "logps/rejected": -182.268798828125, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": -0.013478850945830345, "rewards/margins": -0.02575721964240074, "rewards/rejected": 0.012278366833925247, "step": 488 }, { "epoch": 1.22, "learning_rate": 3.5202612415916215e-08, "logits/chosen": -0.28142860531806946, "logits/rejected": -0.2560575604438782, "logps/chosen": -142.0404052734375, "logps/rejected": -147.89450073242188, "loss": 0.6761, "rewards/accuracies": 0.75, "rewards/chosen": -0.00116653461009264, "rewards/margins": 0.026440050452947617, "rewards/rejected": -0.027606584131717682, "step": 489 }, { "epoch": 1.22, "learning_rate": 3.5010123367405335e-08, "logits/chosen": -0.299935519695282, "logits/rejected": -0.29129156470298767, "logps/chosen": -158.70065307617188, "logps/rejected": -188.6485595703125, "loss": 0.6791, "rewards/accuracies": 0.75, "rewards/chosen": 0.016464995220303535, "rewards/margins": 0.023587416857481003, "rewards/rejected": -0.007122421637177467, "step": 490 }, { "epoch": 1.22, "learning_rate": 3.4817878112494216e-08, "logits/chosen": -0.4000106453895569, "logits/rejected": -0.3714662194252014, "logps/chosen": -142.55360412597656, "logps/rejected": -153.0328369140625, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": -0.0032110214233398438, "rewards/margins": 0.043770600110292435, "rewards/rejected": -0.04698162153363228, "step": 491 }, { "epoch": 1.22, "learning_rate": 3.4625879777837184e-08, "logits/chosen": -0.1677270531654358, "logits/rejected": -0.16764092445373535, "logps/chosen": -146.79379272460938, "logps/rejected": -137.40113830566406, "loss": 0.6816, "rewards/accuracies": 0.75, "rewards/chosen": -0.010049819946289062, "rewards/margins": 0.02713947370648384, "rewards/rejected": -0.0371892936527729, "step": 492 }, { "epoch": 1.23, "learning_rate": 3.4434131486072704e-08, "logits/chosen": -0.23827701807022095, "logits/rejected": -0.23564176261425018, "logps/chosen": -153.28994750976562, "logps/rejected": -164.74671936035156, "loss": 0.6906, "rewards/accuracies": 0.75, "rewards/chosen": 0.02686462365090847, "rewards/margins": 0.03416290134191513, "rewards/rejected": -0.007298278622329235, "step": 493 }, { "epoch": 1.23, "learning_rate": 3.424263635577256e-08, "logits/chosen": -0.3416938781738281, "logits/rejected": -0.34228837490081787, "logps/chosen": -146.3660888671875, "logps/rejected": -172.26605224609375, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.019389726221561432, "rewards/margins": 0.05564422905445099, "rewards/rejected": -0.03625450283288956, "step": 494 }, { "epoch": 1.23, "learning_rate": 3.405139750139111e-08, "logits/chosen": -0.31058889627456665, "logits/rejected": -0.3049522638320923, "logps/chosen": -142.7027587890625, "logps/rejected": -129.8819580078125, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": 0.00687751779332757, "rewards/margins": 0.05610942840576172, "rewards/rejected": -0.049231912940740585, "step": 495 }, { "epoch": 1.23, "learning_rate": 3.3860418033214764e-08, "logits/chosen": -0.17397867143154144, "logits/rejected": -0.14429160952568054, "logps/chosen": -140.85194396972656, "logps/rejected": -147.6016082763672, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": 0.02536201663315296, "rewards/margins": 0.008783916011452675, "rewards/rejected": 0.016578102484345436, "step": 496 }, { "epoch": 1.24, "learning_rate": 3.366970105731121e-08, "logits/chosen": -0.2952461838722229, "logits/rejected": -0.2724323868751526, "logps/chosen": -153.266845703125, "logps/rejected": -166.36721801757812, "loss": 0.6939, "rewards/accuracies": 0.5, "rewards/chosen": 0.005629729945212603, "rewards/margins": -0.006874085403978825, "rewards/rejected": 0.012503815814852715, "step": 497 }, { "epoch": 1.24, "learning_rate": 3.347924967547904e-08, "logits/chosen": -0.3627849221229553, "logits/rejected": -0.35580867528915405, "logps/chosen": -183.02679443359375, "logps/rejected": -173.35992431640625, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.021966170519590378, "rewards/margins": 0.033031463623046875, "rewards/rejected": -0.011065291240811348, "step": 498 }, { "epoch": 1.24, "learning_rate": 3.328906698519726e-08, "logits/chosen": -0.23006637394428253, "logits/rejected": -0.18852034211158752, "logps/chosen": -128.5794677734375, "logps/rejected": -155.70333862304688, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": 0.031325533986091614, "rewards/margins": 0.036963462829589844, "rewards/rejected": -0.0056379311718046665, "step": 499 }, { "epoch": 1.24, "learning_rate": 3.309915607957487e-08, "logits/chosen": -0.3412286341190338, "logits/rejected": -0.30958738923072815, "logps/chosen": -123.14350891113281, "logps/rejected": -138.606689453125, "loss": 0.6792, "rewards/accuracies": 0.75, "rewards/chosen": 0.015648460015654564, "rewards/margins": 0.033701132982969284, "rewards/rejected": -0.01805267482995987, "step": 500 }, { "epoch": 1.25, "learning_rate": 3.290952004730063e-08, "logits/chosen": -0.24498242139816284, "logits/rejected": -0.22662508487701416, "logps/chosen": -121.46688842773438, "logps/rejected": -139.05038452148438, "loss": 0.6948, "rewards/accuracies": 1.0, "rewards/chosen": 0.00849227886646986, "rewards/margins": 0.05660457909107208, "rewards/rejected": -0.048112303018569946, "step": 501 }, { "epoch": 1.25, "learning_rate": 3.272016197259277e-08, "logits/chosen": -0.18724165856838226, "logits/rejected": -0.17657025158405304, "logps/chosen": -151.99636840820312, "logps/rejected": -204.08172607421875, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.025555802509188652, "rewards/margins": 0.06801185756921768, "rewards/rejected": -0.04245605692267418, "step": 502 }, { "epoch": 1.25, "learning_rate": 3.253108493514889e-08, "logits/chosen": -0.270722359418869, "logits/rejected": -0.24658335745334625, "logps/chosen": -154.43080139160156, "logps/rejected": -168.76939392089844, "loss": 0.6888, "rewards/accuracies": 0.25, "rewards/chosen": -0.010981559753417969, "rewards/margins": -0.007911491207778454, "rewards/rejected": -0.003070068545639515, "step": 503 }, { "epoch": 1.25, "learning_rate": 3.234229201009579e-08, "logits/chosen": -0.23582017421722412, "logits/rejected": -0.2245529741048813, "logps/chosen": -172.19406127929688, "logps/rejected": -183.62863159179688, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": 0.010894774459302425, "rewards/margins": 0.02584800496697426, "rewards/rejected": -0.014953232370316982, "step": 504 }, { "epoch": 1.26, "learning_rate": 3.2153786267939506e-08, "logits/chosen": -0.30500754714012146, "logits/rejected": -0.29500752687454224, "logps/chosen": -129.5596160888672, "logps/rejected": -150.65963745117188, "loss": 0.6859, "rewards/accuracies": 0.75, "rewards/chosen": 0.005618667230010033, "rewards/margins": 0.046111300587654114, "rewards/rejected": -0.04049263149499893, "step": 505 }, { "epoch": 1.26, "learning_rate": 3.196557077451539e-08, "logits/chosen": -0.31834009289741516, "logits/rejected": -0.29662489891052246, "logps/chosen": -141.5240936279297, "logps/rejected": -143.04931640625, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": -0.003181838896125555, "rewards/margins": 0.01852760650217533, "rewards/rejected": -0.021709444001317024, "step": 506 }, { "epoch": 1.26, "learning_rate": 3.1777648590938196e-08, "logits/chosen": -0.30595144629478455, "logits/rejected": -0.2953758239746094, "logps/chosen": -157.86090087890625, "logps/rejected": -144.5273895263672, "loss": 0.6823, "rewards/accuracies": 0.5, "rewards/chosen": 0.005939865950495005, "rewards/margins": -0.0010580066591501236, "rewards/rejected": 0.006997871212661266, "step": 507 }, { "epoch": 1.26, "learning_rate": 3.159002277355232e-08, "logits/chosen": -0.21321788430213928, "logits/rejected": -0.1687745302915573, "logps/chosen": -161.12030029296875, "logps/rejected": -172.64443969726562, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": -0.016986846923828125, "rewards/margins": -0.04252052307128906, "rewards/rejected": 0.025533676147460938, "step": 508 }, { "epoch": 1.27, "learning_rate": 3.1402696373882125e-08, "logits/chosen": -0.18215884268283844, "logits/rejected": -0.13644102215766907, "logps/chosen": -126.64824676513672, "logps/rejected": -163.351806640625, "loss": 0.6842, "rewards/accuracies": 0.75, "rewards/chosen": -0.0015794765204191208, "rewards/margins": 0.04812869802117348, "rewards/rejected": -0.049708180129528046, "step": 509 }, { "epoch": 1.27, "learning_rate": 3.121567243858226e-08, "logits/chosen": -0.2122691571712494, "logits/rejected": -0.2103949338197708, "logps/chosen": -160.42257690429688, "logps/rejected": -170.3743438720703, "loss": 0.694, "rewards/accuracies": 0.25, "rewards/chosen": -0.021817587316036224, "rewards/margins": -0.021986961364746094, "rewards/rejected": 0.0001693723606877029, "step": 510 }, { "epoch": 1.27, "learning_rate": 3.102895400938811e-08, "logits/chosen": -0.38633909821510315, "logits/rejected": -0.3622205853462219, "logps/chosen": -189.28109741210938, "logps/rejected": -154.5752410888672, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": -0.009952927008271217, "rewards/margins": 0.004849052056670189, "rewards/rejected": -0.014801979064941406, "step": 511 }, { "epoch": 1.27, "learning_rate": 3.084254412306636e-08, "logits/chosen": -0.4227776527404785, "logits/rejected": -0.3694923520088196, "logps/chosen": -158.0902862548828, "logps/rejected": -179.1208038330078, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.0025760652497410774, "rewards/margins": 0.013630294241011143, "rewards/rejected": -0.01105422992259264, "step": 512 }, { "epoch": 1.28, "learning_rate": 3.065644581136561e-08, "logits/chosen": -0.2663496434688568, "logits/rejected": -0.24034051597118378, "logps/chosen": -163.9968719482422, "logps/rejected": -206.324462890625, "loss": 0.6815, "rewards/accuracies": 0.25, "rewards/chosen": -0.004749298095703125, "rewards/margins": -0.01431884616613388, "rewards/rejected": 0.009569549933075905, "step": 513 }, { "epoch": 1.28, "learning_rate": 3.047066210096703e-08, "logits/chosen": -0.3420736789703369, "logits/rejected": -0.3163813054561615, "logps/chosen": -173.84938049316406, "logps/rejected": -154.167236328125, "loss": 0.6883, "rewards/accuracies": 0.25, "rewards/chosen": -0.008117293938994408, "rewards/margins": -0.006128313019871712, "rewards/rejected": -0.001988983014598489, "step": 514 }, { "epoch": 1.28, "learning_rate": 3.028519601343511e-08, "logits/chosen": -0.29210907220840454, "logits/rejected": -0.27174749970436096, "logps/chosen": -146.13511657714844, "logps/rejected": -179.2949981689453, "loss": 0.6788, "rewards/accuracies": 0.75, "rewards/chosen": 0.03633403778076172, "rewards/margins": 0.07287921756505966, "rewards/rejected": -0.03654517978429794, "step": 515 }, { "epoch": 1.28, "learning_rate": 3.010005056516865e-08, "logits/chosen": -0.22003404796123505, "logits/rejected": -0.18601547181606293, "logps/chosen": -160.61349487304688, "logps/rejected": -153.73873901367188, "loss": 0.6858, "rewards/accuracies": 0.5, "rewards/chosen": 0.0034606936387717724, "rewards/margins": 0.005670357495546341, "rewards/rejected": -0.0022096638567745686, "step": 516 }, { "epoch": 1.29, "learning_rate": 2.991522876735154e-08, "logits/chosen": -0.18577955663204193, "logits/rejected": -0.14639918506145477, "logps/chosen": -135.37631225585938, "logps/rejected": -182.795166015625, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.03529644012451172, "rewards/margins": 0.059063151478767395, "rewards/rejected": -0.023766707628965378, "step": 517 }, { "epoch": 1.29, "learning_rate": 2.9730733625903844e-08, "logits/chosen": -0.4326886534690857, "logits/rejected": -0.4370286166667938, "logps/chosen": -146.94967651367188, "logps/rejected": -162.5266571044922, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": 0.009684372693300247, "rewards/margins": 0.039202116429805756, "rewards/rejected": -0.029517745599150658, "step": 518 }, { "epoch": 1.29, "learning_rate": 2.9546568141433003e-08, "logits/chosen": -0.29615530371665955, "logits/rejected": -0.2759881317615509, "logps/chosen": -132.77218627929688, "logps/rejected": -141.67446899414062, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.048841096460819244, "rewards/margins": 0.06255340576171875, "rewards/rejected": -0.013712311163544655, "step": 519 }, { "epoch": 1.29, "learning_rate": 2.9362735309184894e-08, "logits/chosen": -0.38372766971588135, "logits/rejected": -0.36532115936279297, "logps/chosen": -119.29953002929688, "logps/rejected": -149.82131958007812, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": -2.6703346520662308e-06, "rewards/margins": 0.027231600135564804, "rewards/rejected": -0.027234269306063652, "step": 520 }, { "epoch": 1.3, "learning_rate": 2.9179238118995175e-08, "logits/chosen": -0.25620076060295105, "logits/rejected": -0.2221994251012802, "logps/chosen": -159.02178955078125, "logps/rejected": -141.4084014892578, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": 0.0021888730116188526, "rewards/margins": 0.03339214622974396, "rewards/rejected": -0.031203269958496094, "step": 521 }, { "epoch": 1.3, "learning_rate": 2.8996079555240704e-08, "logits/chosen": -0.1365508884191513, "logits/rejected": -0.13679203391075134, "logps/chosen": -160.1322021484375, "logps/rejected": -152.14471435546875, "loss": 0.688, "rewards/accuracies": 0.25, "rewards/chosen": -0.024632643908262253, "rewards/margins": -0.039606474339962006, "rewards/rejected": 0.014973831363022327, "step": 522 }, { "epoch": 1.3, "learning_rate": 2.8813262596790922e-08, "logits/chosen": -0.3435290455818176, "logits/rejected": -0.36167967319488525, "logps/chosen": -125.87947845458984, "logps/rejected": -157.59506225585938, "loss": 0.6879, "rewards/accuracies": 0.75, "rewards/chosen": 0.0011468883603811264, "rewards/margins": 0.016614913940429688, "rewards/rejected": -0.015468025580048561, "step": 523 }, { "epoch": 1.3, "learning_rate": 2.8630790216959455e-08, "logits/chosen": -0.16195707023143768, "logits/rejected": -0.1616470068693161, "logps/chosen": -188.98731994628906, "logps/rejected": -157.29324340820312, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.02692565880715847, "rewards/margins": 0.042483139783144, "rewards/rejected": -0.015557480044662952, "step": 524 }, { "epoch": 1.31, "learning_rate": 2.8448665383455737e-08, "logits/chosen": -0.31600621342658997, "logits/rejected": -0.31785500049591064, "logps/chosen": -153.9947509765625, "logps/rejected": -145.56417846679688, "loss": 0.6754, "rewards/accuracies": 0.5, "rewards/chosen": -0.022335052490234375, "rewards/margins": -0.0017650588415563107, "rewards/rejected": -0.020569993183016777, "step": 525 }, { "epoch": 1.31, "learning_rate": 2.8266891058336727e-08, "logits/chosen": -0.2525014281272888, "logits/rejected": -0.2301851361989975, "logps/chosen": -132.78704833984375, "logps/rejected": -154.06175231933594, "loss": 0.6755, "rewards/accuracies": 0.5, "rewards/chosen": 0.015330887399613857, "rewards/margins": 0.025237657129764557, "rewards/rejected": -0.009906768798828125, "step": 526 }, { "epoch": 1.31, "learning_rate": 2.8085470197958827e-08, "logits/chosen": -0.33967188000679016, "logits/rejected": -0.32232844829559326, "logps/chosen": -133.3948974609375, "logps/rejected": -205.01553344726562, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.0025167460553348064, "rewards/margins": 0.046007730066776276, "rewards/rejected": -0.04349098354578018, "step": 527 }, { "epoch": 1.31, "learning_rate": 2.790440575292966e-08, "logits/chosen": -0.1149989664554596, "logits/rejected": -0.11082039773464203, "logps/chosen": -105.77297973632812, "logps/rejected": -174.06539916992188, "loss": 0.6807, "rewards/accuracies": 0.75, "rewards/chosen": 0.02067394182085991, "rewards/margins": 0.05944003909826279, "rewards/rejected": -0.03876609727740288, "step": 528 }, { "epoch": 1.32, "learning_rate": 2.772370066806018e-08, "logits/chosen": -0.3665832281112671, "logits/rejected": -0.3744984567165375, "logps/chosen": -134.68775939941406, "logps/rejected": -165.1165771484375, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": 0.018506623804569244, "rewards/margins": 0.00986633449792862, "rewards/rejected": 0.008640289306640625, "step": 529 }, { "epoch": 1.32, "learning_rate": 2.7543357882316742e-08, "logits/chosen": -0.26199036836624146, "logits/rejected": -0.22963954508304596, "logps/chosen": -142.74298095703125, "logps/rejected": -143.3136444091797, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": -0.0019077304750680923, "rewards/margins": 0.03345794603228569, "rewards/rejected": -0.03536567836999893, "step": 530 }, { "epoch": 1.32, "learning_rate": 2.73633803287733e-08, "logits/chosen": -0.1981583684682846, "logits/rejected": -0.16853047907352448, "logps/chosen": -141.67332458496094, "logps/rejected": -152.58592224121094, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": -0.00467681884765625, "rewards/margins": 0.014140892773866653, "rewards/rejected": -0.018817711621522903, "step": 531 }, { "epoch": 1.32, "learning_rate": 2.7183770934563754e-08, "logits/chosen": -0.3444339632987976, "logits/rejected": -0.3025806248188019, "logps/chosen": -147.90878295898438, "logps/rejected": -189.789794921875, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005207066424190998, "rewards/margins": 0.002461053431034088, "rewards/rejected": -0.002981758676469326, "step": 532 }, { "epoch": 1.33, "learning_rate": 2.7004532620834273e-08, "logits/chosen": -0.24984893202781677, "logits/rejected": -0.2630960941314697, "logps/chosen": -149.2742919921875, "logps/rejected": -178.5689239501953, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.01201934739947319, "rewards/margins": 0.03453369066119194, "rewards/rejected": -0.02251434326171875, "step": 533 }, { "epoch": 1.33, "learning_rate": 2.682566830269579e-08, "logits/chosen": -0.2277928590774536, "logits/rejected": -0.22289814054965973, "logps/chosen": -157.95751953125, "logps/rejected": -155.55113220214844, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": -0.006528091616928577, "rewards/margins": -0.002188110491260886, "rewards/rejected": -0.004339981358498335, "step": 534 }, { "epoch": 1.33, "learning_rate": 2.6647180889176696e-08, "logits/chosen": -0.21791964769363403, "logits/rejected": -0.19589614868164062, "logps/chosen": -150.17843627929688, "logps/rejected": -116.93685150146484, "loss": 0.6905, "rewards/accuracies": 0.25, "rewards/chosen": -0.010780144482851028, "rewards/margins": -0.026894759386777878, "rewards/rejected": 0.016114616766572, "step": 535 }, { "epoch": 1.33, "learning_rate": 2.6469073283175374e-08, "logits/chosen": -0.19779925048351288, "logits/rejected": -0.16164326667785645, "logps/chosen": -149.5416259765625, "logps/rejected": -152.37693786621094, "loss": 0.6784, "rewards/accuracies": 0.75, "rewards/chosen": -0.006720161996781826, "rewards/margins": 0.015967942774295807, "rewards/rejected": -0.02268810197710991, "step": 536 }, { "epoch": 1.34, "learning_rate": 2.6291348381413092e-08, "logits/chosen": -0.18103864789009094, "logits/rejected": -0.13577033579349518, "logps/chosen": -156.2101593017578, "logps/rejected": -181.27777099609375, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": -0.011618995107710361, "rewards/margins": 0.026421548798680305, "rewards/rejected": -0.03804054483771324, "step": 537 }, { "epoch": 1.34, "learning_rate": 2.6114009074386845e-08, "logits/chosen": -0.16279618442058563, "logits/rejected": -0.15363749861717224, "logps/chosen": -147.57542419433594, "logps/rejected": -189.48294067382812, "loss": 0.6894, "rewards/accuracies": 0.0, "rewards/chosen": -0.014261437579989433, "rewards/margins": -0.01134796068072319, "rewards/rejected": -0.002913475502282381, "step": 538 }, { "epoch": 1.34, "learning_rate": 2.5937058246322407e-08, "logits/chosen": -0.26418957114219666, "logits/rejected": -0.25097978115081787, "logps/chosen": -123.70403289794922, "logps/rejected": -186.43035888671875, "loss": 0.6812, "rewards/accuracies": 0.0, "rewards/chosen": -0.03223590925335884, "rewards/margins": -0.02242870256304741, "rewards/rejected": -0.009807205758988857, "step": 539 }, { "epoch": 1.34, "learning_rate": 2.5760498775127325e-08, "logits/chosen": -0.27830970287323, "logits/rejected": -0.2583749294281006, "logps/chosen": -134.6124267578125, "logps/rejected": -138.63436889648438, "loss": 0.676, "rewards/accuracies": 0.75, "rewards/chosen": -0.015203094109892845, "rewards/margins": 0.030229952186346054, "rewards/rejected": -0.04543304443359375, "step": 540 }, { "epoch": 1.35, "learning_rate": 2.5584333532344183e-08, "logits/chosen": -0.17090056836605072, "logits/rejected": -0.1360650211572647, "logps/chosen": -114.83134460449219, "logps/rejected": -163.92965698242188, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.038639262318611145, "rewards/margins": 0.06653576344251633, "rewards/rejected": -0.027896501123905182, "step": 541 }, { "epoch": 1.35, "learning_rate": 2.5408565383103876e-08, "logits/chosen": -0.13630622625350952, "logits/rejected": -0.12043177336454391, "logps/chosen": -130.7660675048828, "logps/rejected": -128.01412963867188, "loss": 0.6741, "rewards/accuracies": 0.75, "rewards/chosen": 0.024403763934969902, "rewards/margins": 0.027032282203435898, "rewards/rejected": -0.002628517337143421, "step": 542 }, { "epoch": 1.35, "learning_rate": 2.5233197186079015e-08, "logits/chosen": -0.2403082549571991, "logits/rejected": -0.2567209005355835, "logps/chosen": -150.49147033691406, "logps/rejected": -192.16673278808594, "loss": 0.6766, "rewards/accuracies": 0.5, "rewards/chosen": -0.025243569165468216, "rewards/margins": 0.0005598068237304688, "rewards/rejected": -0.025803374126553535, "step": 543 }, { "epoch": 1.35, "learning_rate": 2.505823179343743e-08, "logits/chosen": -0.3770117461681366, "logits/rejected": -0.3687165379524231, "logps/chosen": -161.9558868408203, "logps/rejected": -136.8254852294922, "loss": 0.6863, "rewards/accuracies": 0.75, "rewards/chosen": 0.00830764789134264, "rewards/margins": 0.022754861041903496, "rewards/rejected": -0.014447213150560856, "step": 544 }, { "epoch": 1.36, "learning_rate": 2.4883672050795822e-08, "logits/chosen": -0.14249394834041595, "logits/rejected": -0.1639118194580078, "logps/chosen": -154.18406677246094, "logps/rejected": -160.20974731445312, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.02064666897058487, "rewards/margins": 0.014205741696059704, "rewards/rejected": 0.006440926808863878, "step": 545 }, { "epoch": 1.36, "learning_rate": 2.4709520797173415e-08, "logits/chosen": -0.11797182261943817, "logits/rejected": -0.09517889469861984, "logps/chosen": -122.46220397949219, "logps/rejected": -150.09326171875, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.011813163757324219, "rewards/margins": 0.015600396320223808, "rewards/rejected": -0.003787231631577015, "step": 546 }, { "epoch": 1.36, "learning_rate": 2.453578086494582e-08, "logits/chosen": -0.1741308569908142, "logits/rejected": -0.20592698454856873, "logps/chosen": -141.9237060546875, "logps/rejected": -138.8103790283203, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.019477462396025658, "rewards/margins": 0.011877631768584251, "rewards/rejected": 0.007599831558763981, "step": 547 }, { "epoch": 1.36, "learning_rate": 2.4362455079798937e-08, "logits/chosen": -0.3225948214530945, "logits/rejected": -0.29871922731399536, "logps/chosen": -155.17893981933594, "logps/rejected": -174.18527221679688, "loss": 0.6857, "rewards/accuracies": 0.75, "rewards/chosen": -0.0009464267641305923, "rewards/margins": 0.021758651360869408, "rewards/rejected": -0.022705078125, "step": 548 }, { "epoch": 1.37, "learning_rate": 2.418954626068311e-08, "logits/chosen": -0.3078724443912506, "logits/rejected": -0.2980741560459137, "logps/chosen": -124.24716186523438, "logps/rejected": -138.6820068359375, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.004253196530044079, "rewards/margins": -0.007610702887177467, "rewards/rejected": 0.011863898485898972, "step": 549 }, { "epoch": 1.37, "learning_rate": 2.4017057219767116e-08, "logits/chosen": -0.30146175622940063, "logits/rejected": -0.291853129863739, "logps/chosen": -152.1803436279297, "logps/rejected": -214.1934356689453, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.008594894781708717, "rewards/margins": 0.0015205368399620056, "rewards/rejected": 0.0070743560791015625, "step": 550 }, { "epoch": 1.37, "learning_rate": 2.3844990762392515e-08, "logits/chosen": -0.13557079434394836, "logits/rejected": -0.1561446487903595, "logps/chosen": -126.48365020751953, "logps/rejected": -155.19618225097656, "loss": 0.6819, "rewards/accuracies": 0.25, "rewards/chosen": 0.004221916198730469, "rewards/margins": 0.00247802771627903, "rewards/rejected": 0.001743888482451439, "step": 551 }, { "epoch": 1.37, "learning_rate": 2.367334968702808e-08, "logits/chosen": -0.1695711761713028, "logits/rejected": -0.16036830842494965, "logps/chosen": -132.53587341308594, "logps/rejected": -141.67489624023438, "loss": 0.6903, "rewards/accuracies": 0.75, "rewards/chosen": 0.0331851989030838, "rewards/margins": 0.04812049865722656, "rewards/rejected": -0.01493530347943306, "step": 552 }, { "epoch": 1.38, "learning_rate": 2.3502136785224164e-08, "logits/chosen": -0.25373661518096924, "logits/rejected": -0.24326671659946442, "logps/chosen": -157.42050170898438, "logps/rejected": -167.8223876953125, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.008018875494599342, "rewards/margins": 0.03117828257381916, "rewards/rejected": -0.023159408941864967, "step": 553 }, { "epoch": 1.38, "learning_rate": 2.333135484156734e-08, "logits/chosen": -0.28112778067588806, "logits/rejected": -0.26040783524513245, "logps/chosen": -158.44044494628906, "logps/rejected": -177.8938751220703, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": 0.020076369866728783, "rewards/margins": 0.05351676791906357, "rewards/rejected": -0.033440399914979935, "step": 554 }, { "epoch": 1.38, "learning_rate": 2.3161006633635205e-08, "logits/chosen": -0.05135172978043556, "logits/rejected": -0.05139927566051483, "logps/chosen": -118.4894790649414, "logps/rejected": -171.46075439453125, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.027885818853974342, "rewards/margins": 0.044040679931640625, "rewards/rejected": -0.016154862940311432, "step": 555 }, { "epoch": 1.38, "learning_rate": 2.299109493195106e-08, "logits/chosen": -0.36934030055999756, "logits/rejected": -0.35472676157951355, "logps/chosen": -177.62677001953125, "logps/rejected": -189.3009033203125, "loss": 0.6754, "rewards/accuracies": 0.75, "rewards/chosen": 0.018930815160274506, "rewards/margins": 0.061949919909238815, "rewards/rejected": -0.04301910474896431, "step": 556 }, { "epoch": 1.39, "learning_rate": 2.282162249993895e-08, "logits/chosen": -0.23722119629383087, "logits/rejected": -0.19733746349811554, "logps/chosen": -190.54063415527344, "logps/rejected": -161.6134796142578, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": -0.01055603101849556, "rewards/margins": 0.06236878037452698, "rewards/rejected": -0.07292480766773224, "step": 557 }, { "epoch": 1.39, "learning_rate": 2.2652592093878663e-08, "logits/chosen": -0.3848608136177063, "logits/rejected": -0.36559420824050903, "logps/chosen": -143.78814697265625, "logps/rejected": -157.35707092285156, "loss": 0.6816, "rewards/accuracies": 0.75, "rewards/chosen": 0.0071876514703035355, "rewards/margins": 0.02892189286649227, "rewards/rejected": -0.021734237670898438, "step": 558 }, { "epoch": 1.39, "learning_rate": 2.2484006462860965e-08, "logits/chosen": -0.34948503971099854, "logits/rejected": -0.3015076220035553, "logps/chosen": -148.1240234375, "logps/rejected": -154.56024169921875, "loss": 0.6847, "rewards/accuracies": 0.75, "rewards/chosen": 0.0210737232118845, "rewards/margins": 0.03984661027789116, "rewards/rejected": -0.01877288892865181, "step": 559 }, { "epoch": 1.39, "learning_rate": 2.2315868348742827e-08, "logits/chosen": -0.36555004119873047, "logits/rejected": -0.3312801718711853, "logps/chosen": -150.51422119140625, "logps/rejected": -185.03103637695312, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.0336339958012104, "rewards/margins": 0.05964794009923935, "rewards/rejected": -0.026013948023319244, "step": 560 }, { "epoch": 1.4, "learning_rate": 2.2148180486102847e-08, "logits/chosen": -0.29496997594833374, "logits/rejected": -0.29444658756256104, "logps/chosen": -178.25592041015625, "logps/rejected": -187.95321655273438, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": -0.0055866241455078125, "rewards/margins": 0.0036575309932231903, "rewards/rejected": -0.009244155138731003, "step": 561 }, { "epoch": 1.4, "learning_rate": 2.198094560219684e-08, "logits/chosen": -0.253012478351593, "logits/rejected": -0.235497385263443, "logps/chosen": -165.47535705566406, "logps/rejected": -177.52191162109375, "loss": 0.6863, "rewards/accuracies": 0.75, "rewards/chosen": -0.016741562634706497, "rewards/margins": 0.01816100999712944, "rewards/rejected": -0.03490257263183594, "step": 562 }, { "epoch": 1.4, "learning_rate": 2.181416641691337e-08, "logits/chosen": -0.28338250517845154, "logits/rejected": -0.2471190243959427, "logps/chosen": -155.67807006835938, "logps/rejected": -162.11993408203125, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.035558510571718216, "rewards/margins": 0.05164623260498047, "rewards/rejected": -0.016087723895907402, "step": 563 }, { "epoch": 1.4, "learning_rate": 2.164784564272959e-08, "logits/chosen": -0.28964123129844666, "logits/rejected": -0.2431747168302536, "logps/chosen": -134.32736206054688, "logps/rejected": -169.5690155029297, "loss": 0.6884, "rewards/accuracies": 0.5, "rewards/chosen": -0.01969757117331028, "rewards/margins": -0.009860611520707607, "rewards/rejected": -0.009836958721280098, "step": 564 }, { "epoch": 1.41, "learning_rate": 2.1481985984667128e-08, "logits/chosen": -0.38677653670310974, "logits/rejected": -0.3838343620300293, "logps/chosen": -173.7406005859375, "logps/rejected": -177.00917053222656, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.011037063784897327, "rewards/margins": 0.04855918884277344, "rewards/rejected": -0.037522125989198685, "step": 565 }, { "epoch": 1.41, "learning_rate": 2.1316590140248053e-08, "logits/chosen": -0.13168677687644958, "logits/rejected": -0.12109659612178802, "logps/chosen": -116.57725524902344, "logps/rejected": -199.44293212890625, "loss": 0.6819, "rewards/accuracies": 0.75, "rewards/chosen": 0.03944225609302521, "rewards/margins": 0.017169760540127754, "rewards/rejected": 0.022272489964962006, "step": 566 }, { "epoch": 1.41, "learning_rate": 2.115166079945101e-08, "logits/chosen": -0.5309535264968872, "logits/rejected": -0.5164185762405396, "logps/chosen": -146.93692016601562, "logps/rejected": -168.07101440429688, "loss": 0.6784, "rewards/accuracies": 0.25, "rewards/chosen": -0.0351499542593956, "rewards/margins": -0.0048715583980083466, "rewards/rejected": -0.030278397724032402, "step": 567 }, { "epoch": 1.41, "learning_rate": 2.0987200644667524e-08, "logits/chosen": -0.2227838784456253, "logits/rejected": -0.20410379767417908, "logps/chosen": -149.84768676757812, "logps/rejected": -150.41891479492188, "loss": 0.6918, "rewards/accuracies": 0.25, "rewards/chosen": -0.009550857357680798, "rewards/margins": 0.004248427227139473, "rewards/rejected": -0.013799285516142845, "step": 568 }, { "epoch": 1.42, "learning_rate": 2.0823212350658308e-08, "logits/chosen": -0.26494458317756653, "logits/rejected": -0.25391674041748047, "logps/chosen": -129.50491333007812, "logps/rejected": -153.2370147705078, "loss": 0.6897, "rewards/accuracies": 0.75, "rewards/chosen": 0.016236115247011185, "rewards/margins": 0.05553627014160156, "rewards/rejected": -0.039300158619880676, "step": 569 }, { "epoch": 1.42, "learning_rate": 2.0659698584509767e-08, "logits/chosen": -0.4022488296031952, "logits/rejected": -0.385458767414093, "logps/chosen": -146.66024780273438, "logps/rejected": -160.8321990966797, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": -0.011330796405673027, "rewards/margins": 0.030707931146025658, "rewards/rejected": -0.042038723826408386, "step": 570 }, { "epoch": 1.42, "learning_rate": 2.0496662005590692e-08, "logits/chosen": -0.3699157238006592, "logits/rejected": -0.33486080169677734, "logps/chosen": -153.2104949951172, "logps/rejected": -153.38076782226562, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 0.02201690711081028, "rewards/margins": 0.01340179517865181, "rewards/rejected": 0.00861511193215847, "step": 571 }, { "epoch": 1.42, "learning_rate": 2.0334105265508915e-08, "logits/chosen": -0.10293371975421906, "logits/rejected": -0.10345125198364258, "logps/chosen": -131.14788818359375, "logps/rejected": -141.10540771484375, "loss": 0.6795, "rewards/accuracies": 0.75, "rewards/chosen": 0.003065109020099044, "rewards/margins": 0.032406046986579895, "rewards/rejected": -0.02934093587100506, "step": 572 }, { "epoch": 1.43, "learning_rate": 2.017203100806824e-08, "logits/chosen": -0.22167451679706573, "logits/rejected": -0.2503722608089447, "logps/chosen": -146.88233947753906, "logps/rejected": -170.77688598632812, "loss": 0.6875, "rewards/accuracies": 0.75, "rewards/chosen": 0.007261657156050205, "rewards/margins": 0.0030595771968364716, "rewards/rejected": 0.004202080424875021, "step": 573 }, { "epoch": 1.43, "learning_rate": 2.0010441869225426e-08, "logits/chosen": -0.23620356619358063, "logits/rejected": -0.22609563171863556, "logps/chosen": -128.31057739257812, "logps/rejected": -184.0690155029297, "loss": 0.6813, "rewards/accuracies": 0.75, "rewards/chosen": 0.00575752230361104, "rewards/margins": 0.0283050537109375, "rewards/rejected": -0.022547531872987747, "step": 574 }, { "epoch": 1.43, "learning_rate": 1.9849340477047322e-08, "logits/chosen": -0.29816439747810364, "logits/rejected": -0.3202560544013977, "logps/chosen": -177.03799438476562, "logps/rejected": -186.3551788330078, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": 0.026165008544921875, "rewards/margins": 0.04379691928625107, "rewards/rejected": -0.017631910741329193, "step": 575 }, { "epoch": 1.43, "learning_rate": 1.968872945166811e-08, "logits/chosen": -0.29855337738990784, "logits/rejected": -0.2659136950969696, "logps/chosen": -157.22418212890625, "logps/rejected": -186.1586151123047, "loss": 0.6921, "rewards/accuracies": 0.25, "rewards/chosen": 0.0031433110125362873, "rewards/margins": -0.03370017930865288, "rewards/rejected": 0.036843493580818176, "step": 576 }, { "epoch": 1.44, "learning_rate": 1.952861140524672e-08, "logits/chosen": -0.19383995234966278, "logits/rejected": -0.17573422193527222, "logps/chosen": -144.12564086914062, "logps/rejected": -135.5634002685547, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": 0.00558319129049778, "rewards/margins": 0.04162635654211044, "rewards/rejected": -0.03604316711425781, "step": 577 }, { "epoch": 1.44, "learning_rate": 1.936898894192434e-08, "logits/chosen": -0.15823210775852203, "logits/rejected": -0.15367771685123444, "logps/chosen": -179.83047485351562, "logps/rejected": -159.25650024414062, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.0314878448843956, "rewards/margins": 0.07907485961914062, "rewards/rejected": -0.047587014734745026, "step": 578 }, { "epoch": 1.44, "learning_rate": 1.9209864657782038e-08, "logits/chosen": -0.2646196782588959, "logits/rejected": -0.25030893087387085, "logps/chosen": -146.07411193847656, "logps/rejected": -174.96621704101562, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.030485725030303, "rewards/margins": 0.04139537736773491, "rewards/rejected": -0.010909653268754482, "step": 579 }, { "epoch": 1.44, "learning_rate": 1.905124114079852e-08, "logits/chosen": -0.3410610854625702, "logits/rejected": -0.31324031949043274, "logps/chosen": -147.09657287597656, "logps/rejected": -170.41241455078125, "loss": 0.6847, "rewards/accuracies": 0.25, "rewards/chosen": -0.007642554119229317, "rewards/margins": -0.029616735875606537, "rewards/rejected": 0.02197417989373207, "step": 580 }, { "epoch": 1.45, "learning_rate": 1.8893120970808152e-08, "logits/chosen": -0.4417562484741211, "logits/rejected": -0.4278576672077179, "logps/chosen": -162.84823608398438, "logps/rejected": -162.49594116210938, "loss": 0.6788, "rewards/accuracies": 0.5, "rewards/chosen": -0.006165504455566406, "rewards/margins": 0.004237554967403412, "rewards/rejected": -0.010403061285614967, "step": 581 }, { "epoch": 1.45, "learning_rate": 1.873550671945886e-08, "logits/chosen": -0.23778235912322998, "logits/rejected": -0.22897124290466309, "logps/chosen": -150.25259399414062, "logps/rejected": -164.26329040527344, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.010075760073959827, "rewards/margins": 0.018807221204042435, "rewards/rejected": -0.008731461130082607, "step": 582 }, { "epoch": 1.45, "learning_rate": 1.8578400950170387e-08, "logits/chosen": -0.3315502405166626, "logits/rejected": -0.32032445073127747, "logps/chosen": -149.302490234375, "logps/rejected": -176.78167724609375, "loss": 0.6779, "rewards/accuracies": 0.75, "rewards/chosen": 0.0012802124256268144, "rewards/margins": 0.013499832712113857, "rewards/rejected": -0.012219619937241077, "step": 583 }, { "epoch": 1.45, "learning_rate": 1.8421806218092566e-08, "logits/chosen": -0.1554540991783142, "logits/rejected": -0.1287715584039688, "logps/chosen": -160.52102661132812, "logps/rejected": -146.36026000976562, "loss": 0.6835, "rewards/accuracies": 0.75, "rewards/chosen": -0.013485335744917393, "rewards/margins": 0.028116989880800247, "rewards/rejected": -0.041602328419685364, "step": 584 }, { "epoch": 1.46, "learning_rate": 1.826572507006383e-08, "logits/chosen": -0.15696974098682404, "logits/rejected": -0.1545967161655426, "logps/chosen": -167.72763061523438, "logps/rejected": -173.70460510253906, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": 0.004246902652084827, "rewards/margins": 0.024385835975408554, "rewards/rejected": -0.020138932392001152, "step": 585 }, { "epoch": 1.46, "learning_rate": 1.81101600445697e-08, "logits/chosen": -0.10908050835132599, "logits/rejected": -0.10260801017284393, "logps/chosen": -136.98500061035156, "logps/rejected": -151.88890075683594, "loss": 0.692, "rewards/accuracies": 0.75, "rewards/chosen": 0.03437557443976402, "rewards/margins": 0.03656063228845596, "rewards/rejected": -0.002185058780014515, "step": 586 }, { "epoch": 1.46, "learning_rate": 1.795511367170153e-08, "logits/chosen": -0.2778129279613495, "logits/rejected": -0.22868821024894714, "logps/chosen": -146.0940704345703, "logps/rejected": -142.7373046875, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": 0.0379117950797081, "rewards/margins": 0.04872589558362961, "rewards/rejected": -0.010814094915986061, "step": 587 }, { "epoch": 1.46, "learning_rate": 1.7800588473115414e-08, "logits/chosen": -0.2226821631193161, "logits/rejected": -0.20225778222084045, "logps/chosen": -137.42660522460938, "logps/rejected": -169.9799346923828, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": 0.01814899407327175, "rewards/margins": 0.019651221111416817, "rewards/rejected": -0.0015022275038063526, "step": 588 }, { "epoch": 1.47, "learning_rate": 1.764658696199109e-08, "logits/chosen": -0.2503301203250885, "logits/rejected": -0.22783653438091278, "logps/chosen": -180.85989379882812, "logps/rejected": -145.98138427734375, "loss": 0.691, "rewards/accuracies": 0.5, "rewards/chosen": 0.016205597668886185, "rewards/margins": 0.016016770154237747, "rewards/rejected": 0.0001888275146484375, "step": 589 }, { "epoch": 1.47, "learning_rate": 1.749311164299112e-08, "logits/chosen": -0.22115638852119446, "logits/rejected": -0.22723744809627533, "logps/chosen": -152.6621551513672, "logps/rejected": -182.33761596679688, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.02572632022202015, "rewards/margins": 0.028755951672792435, "rewards/rejected": -0.0030296326149255037, "step": 590 }, { "epoch": 1.47, "learning_rate": 1.7340165012220136e-08, "logits/chosen": -0.2543298304080963, "logits/rejected": -0.2691631615161896, "logps/chosen": -154.33583068847656, "logps/rejected": -160.78517150878906, "loss": 0.6862, "rewards/accuracies": 0.25, "rewards/chosen": -0.005037689581513405, "rewards/margins": -0.005853270646184683, "rewards/rejected": 0.000815581064671278, "step": 591 }, { "epoch": 1.47, "learning_rate": 1.7187749557184246e-08, "logits/chosen": -0.2181331217288971, "logits/rejected": -0.2142607420682907, "logps/chosen": -152.67178344726562, "logps/rejected": -202.7827606201172, "loss": 0.6829, "rewards/accuracies": 0.75, "rewards/chosen": 0.0040416717529296875, "rewards/margins": 0.020860671997070312, "rewards/rejected": -0.016819000244140625, "step": 592 }, { "epoch": 1.48, "learning_rate": 1.703586775675056e-08, "logits/chosen": -0.293631911277771, "logits/rejected": -0.2605924904346466, "logps/chosen": -178.06503295898438, "logps/rejected": -155.83580017089844, "loss": 0.6851, "rewards/accuracies": 0.75, "rewards/chosen": 0.010467911139130592, "rewards/margins": 0.025528335943818092, "rewards/rejected": -0.0150604248046875, "step": 593 }, { "epoch": 1.48, "learning_rate": 1.6884522081106944e-08, "logits/chosen": -0.14583951234817505, "logits/rejected": -0.1555401086807251, "logps/chosen": -152.85546875, "logps/rejected": -153.00704956054688, "loss": 0.6845, "rewards/accuracies": 0.5, "rewards/chosen": -0.003637314308434725, "rewards/margins": 0.010899354703724384, "rewards/rejected": -0.014536667615175247, "step": 594 }, { "epoch": 1.48, "learning_rate": 1.6733714991721737e-08, "logits/chosen": -0.5400964021682739, "logits/rejected": -0.5291615128517151, "logps/chosen": -145.59869384765625, "logps/rejected": -162.552001953125, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": 0.018359757959842682, "rewards/margins": 0.02890472486615181, "rewards/rejected": -0.010544967837631702, "step": 595 }, { "epoch": 1.48, "learning_rate": 1.6583448941303808e-08, "logits/chosen": -0.24793751537799835, "logits/rejected": -0.18472829461097717, "logps/chosen": -152.8089599609375, "logps/rejected": -130.7303924560547, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": -0.006607054732739925, "rewards/margins": 0.0267625842243433, "rewards/rejected": -0.03336963802576065, "step": 596 }, { "epoch": 1.49, "learning_rate": 1.6433726373762598e-08, "logits/chosen": -0.2423139363527298, "logits/rejected": -0.20911438763141632, "logps/chosen": -146.1404571533203, "logps/rejected": -163.2782745361328, "loss": 0.6832, "rewards/accuracies": 0.75, "rewards/chosen": 0.04785461723804474, "rewards/margins": 0.05092010647058487, "rewards/rejected": -0.0030654901638627052, "step": 597 }, { "epoch": 1.49, "learning_rate": 1.628454972416846e-08, "logits/chosen": -0.27341702580451965, "logits/rejected": -0.25563204288482666, "logps/chosen": -148.88458251953125, "logps/rejected": -159.73939514160156, "loss": 0.694, "rewards/accuracies": 0.25, "rewards/chosen": -0.06918449699878693, "rewards/margins": -0.052112966775894165, "rewards/rejected": -0.01707153394818306, "step": 598 }, { "epoch": 1.49, "learning_rate": 1.6135921418712954e-08, "logits/chosen": -0.36579376459121704, "logits/rejected": -0.37067800760269165, "logps/chosen": -141.43667602539062, "logps/rejected": -166.23614501953125, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.03339099884033203, "rewards/margins": 0.033805277198553085, "rewards/rejected": -0.00041427602991461754, "step": 599 }, { "epoch": 1.49, "learning_rate": 1.5987843874669432e-08, "logits/chosen": -0.23316819965839386, "logits/rejected": -0.19682303071022034, "logps/chosen": -116.18138885498047, "logps/rejected": -157.0629425048828, "loss": 0.6798, "rewards/accuracies": 0.75, "rewards/chosen": 0.02034149318933487, "rewards/margins": 0.031649399548769, "rewards/rejected": -0.011307907290756702, "step": 600 }, { "epoch": 1.5, "learning_rate": 1.584031950035378e-08, "logits/chosen": -0.28318047523498535, "logits/rejected": -0.2665945589542389, "logps/chosen": -136.60269165039062, "logps/rejected": -170.0757598876953, "loss": 0.6824, "rewards/accuracies": 0.75, "rewards/chosen": 0.024314500391483307, "rewards/margins": 0.061774447560310364, "rewards/rejected": -0.03745994716882706, "step": 601 }, { "epoch": 1.5, "learning_rate": 1.5693350695085155e-08, "logits/chosen": -0.2450769543647766, "logits/rejected": -0.22717997431755066, "logps/chosen": -143.39022827148438, "logps/rejected": -145.87013244628906, "loss": 0.6855, "rewards/accuracies": 0.25, "rewards/chosen": 0.0032644274178892374, "rewards/margins": -0.00423202570527792, "rewards/rejected": 0.0074964528903365135, "step": 602 }, { "epoch": 1.5, "learning_rate": 1.554693984914698e-08, "logits/chosen": -0.30917003750801086, "logits/rejected": -0.2936357855796814, "logps/chosen": -138.23294067382812, "logps/rejected": -166.1607666015625, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.0031730651389807463, "rewards/margins": 0.026659011840820312, "rewards/rejected": -0.023485947400331497, "step": 603 }, { "epoch": 1.5, "learning_rate": 1.5401089343748157e-08, "logits/chosen": -0.22063404321670532, "logits/rejected": -0.1825275868177414, "logps/chosen": -136.88836669921875, "logps/rejected": -169.08074951171875, "loss": 0.6884, "rewards/accuracies": 0.75, "rewards/chosen": 0.0005069728940725327, "rewards/margins": 0.010980033315718174, "rewards/rejected": -0.010473061352968216, "step": 604 }, { "epoch": 1.5, "learning_rate": 1.525580155098424e-08, "logits/chosen": -0.07498370110988617, "logits/rejected": -0.03689798712730408, "logps/chosen": -150.84963989257812, "logps/rejected": -166.9881591796875, "loss": 0.6861, "rewards/accuracies": 0.5, "rewards/chosen": -0.009874343872070312, "rewards/margins": 0.01104583591222763, "rewards/rejected": -0.020920181646943092, "step": 605 }, { "epoch": 1.51, "learning_rate": 1.5111078833798875e-08, "logits/chosen": -0.3380682170391083, "logits/rejected": -0.30118632316589355, "logps/chosen": -137.03475952148438, "logps/rejected": -161.825927734375, "loss": 0.6774, "rewards/accuracies": 0.75, "rewards/chosen": 0.009907340630888939, "rewards/margins": 0.03407230228185654, "rewards/rejected": -0.024164963513612747, "step": 606 }, { "epoch": 1.51, "learning_rate": 1.4966923545945408e-08, "logits/chosen": -0.34691107273101807, "logits/rejected": -0.3322242200374603, "logps/chosen": -171.81797790527344, "logps/rejected": -159.35366821289062, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": 0.0012834547087550163, "rewards/margins": 0.02296733856201172, "rewards/rejected": -0.021683882921934128, "step": 607 }, { "epoch": 1.51, "learning_rate": 1.4823338031948579e-08, "logits/chosen": -0.22343455255031586, "logits/rejected": -0.19029833376407623, "logps/chosen": -151.98883056640625, "logps/rejected": -164.38009643554688, "loss": 0.6813, "rewards/accuracies": 0.5, "rewards/chosen": 0.02108917385339737, "rewards/margins": 0.023837663233280182, "rewards/rejected": -0.0027484893798828125, "step": 608 }, { "epoch": 1.51, "learning_rate": 1.4680324627066381e-08, "logits/chosen": -0.2989797294139862, "logits/rejected": -0.2831348180770874, "logps/chosen": -189.84515380859375, "logps/rejected": -160.56236267089844, "loss": 0.6856, "rewards/accuracies": 0.25, "rewards/chosen": -0.022490691393613815, "rewards/margins": -0.015752410516142845, "rewards/rejected": -0.0067382813431322575, "step": 609 }, { "epoch": 1.52, "learning_rate": 1.4537885657252091e-08, "logits/chosen": -0.280917763710022, "logits/rejected": -0.24368290603160858, "logps/chosen": -144.9268035888672, "logps/rejected": -175.41668701171875, "loss": 0.6886, "rewards/accuracies": 0.25, "rewards/chosen": -0.01749381795525551, "rewards/margins": -0.015292737632989883, "rewards/rejected": -0.002201080322265625, "step": 610 }, { "epoch": 1.52, "learning_rate": 1.4396023439116477e-08, "logits/chosen": -0.2073800414800644, "logits/rejected": -0.19872711598873138, "logps/chosen": -144.14659118652344, "logps/rejected": -128.55950927734375, "loss": 0.681, "rewards/accuracies": 0.5, "rewards/chosen": -0.004068946931511164, "rewards/margins": 0.001507568173110485, "rewards/rejected": -0.005576515570282936, "step": 611 }, { "epoch": 1.52, "learning_rate": 1.4254740279890037e-08, "logits/chosen": -0.28967148065567017, "logits/rejected": -0.27359193563461304, "logps/chosen": -169.2911376953125, "logps/rejected": -186.87242126464844, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": 0.010456085205078125, "rewards/margins": -0.008076667785644531, "rewards/rejected": 0.018532754853367805, "step": 612 }, { "epoch": 1.52, "learning_rate": 1.4114038477385538e-08, "logits/chosen": -0.4295933246612549, "logits/rejected": -0.42970508337020874, "logps/chosen": -154.90725708007812, "logps/rejected": -177.3048095703125, "loss": 0.6846, "rewards/accuracies": 0.75, "rewards/chosen": 0.023922348394989967, "rewards/margins": -0.0017478950321674347, "rewards/rejected": 0.025670241564512253, "step": 613 }, { "epoch": 1.53, "learning_rate": 1.3973920319960653e-08, "logits/chosen": -0.3549492657184601, "logits/rejected": -0.30072152614593506, "logps/chosen": -166.2919464111328, "logps/rejected": -162.95223999023438, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": -0.0018278129864484072, "rewards/margins": 0.03435974195599556, "rewards/rejected": -0.0361875556409359, "step": 614 }, { "epoch": 1.53, "learning_rate": 1.3834388086480685e-08, "logits/chosen": -0.3422825336456299, "logits/rejected": -0.3047506809234619, "logps/chosen": -139.44375610351562, "logps/rejected": -182.77862548828125, "loss": 0.679, "rewards/accuracies": 0.75, "rewards/chosen": 0.019237518310546875, "rewards/margins": 0.04842948913574219, "rewards/rejected": -0.029191970825195312, "step": 615 }, { "epoch": 1.53, "learning_rate": 1.3695444046281551e-08, "logits/chosen": -0.23958246409893036, "logits/rejected": -0.24693436920642853, "logps/chosen": -136.3277130126953, "logps/rejected": -170.51026916503906, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.04582080990076065, "rewards/margins": 0.056087687611579895, "rewards/rejected": -0.01026687677949667, "step": 616 }, { "epoch": 1.53, "learning_rate": 1.3557090459132886e-08, "logits/chosen": -0.38596001267433167, "logits/rejected": -0.3638424873352051, "logps/chosen": -152.86471557617188, "logps/rejected": -170.90850830078125, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.018538856878876686, "rewards/margins": 0.024425696581602097, "rewards/rejected": -0.00588684156537056, "step": 617 }, { "epoch": 1.54, "learning_rate": 1.3419329575201233e-08, "logits/chosen": -0.3505130410194397, "logits/rejected": -0.35060790181159973, "logps/chosen": -138.21743774414062, "logps/rejected": -181.8092803955078, "loss": 0.6939, "rewards/accuracies": 0.25, "rewards/chosen": -0.03368072584271431, "rewards/margins": 0.006889726966619492, "rewards/rejected": -0.0405704528093338, "step": 618 }, { "epoch": 1.54, "learning_rate": 1.3282163635013489e-08, "logits/chosen": -0.3928719460964203, "logits/rejected": -0.36921223998069763, "logps/chosen": -155.99078369140625, "logps/rejected": -147.62779235839844, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": -0.025135422125458717, "rewards/margins": -0.00041294051334261894, "rewards/rejected": -0.02472248114645481, "step": 619 }, { "epoch": 1.54, "learning_rate": 1.314559486942044e-08, "logits/chosen": -0.40771371126174927, "logits/rejected": -0.370278924703598, "logps/chosen": -147.09548950195312, "logps/rejected": -192.5513153076172, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.009571838192641735, "rewards/margins": 0.03234558179974556, "rewards/rejected": -0.02277374267578125, "step": 620 }, { "epoch": 1.54, "learning_rate": 1.3009625499560546e-08, "logits/chosen": -0.23393546044826508, "logits/rejected": -0.190766841173172, "logps/chosen": -145.80084228515625, "logps/rejected": -167.303466796875, "loss": 0.6831, "rewards/accuracies": 0.75, "rewards/chosen": 0.018384171649813652, "rewards/margins": 0.031030654907226562, "rewards/rejected": -0.012646484188735485, "step": 621 }, { "epoch": 1.55, "learning_rate": 1.2874257736823713e-08, "logits/chosen": -0.313372939825058, "logits/rejected": -0.2930768132209778, "logps/chosen": -141.68692016601562, "logps/rejected": -142.57537841796875, "loss": 0.6777, "rewards/accuracies": 0.75, "rewards/chosen": 0.01615142822265625, "rewards/margins": 0.02666931040585041, "rewards/rejected": -0.01051788218319416, "step": 622 }, { "epoch": 1.55, "learning_rate": 1.2739493782815392e-08, "logits/chosen": -0.2809706926345825, "logits/rejected": -0.2863350212574005, "logps/chosen": -143.23287963867188, "logps/rejected": -149.8250732421875, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": 0.00675392197445035, "rewards/margins": 0.028064917773008347, "rewards/rejected": -0.021310996264219284, "step": 623 }, { "epoch": 1.55, "learning_rate": 1.2605335829320767e-08, "logits/chosen": -0.3177556097507477, "logits/rejected": -0.2880592942237854, "logps/chosen": -150.18748474121094, "logps/rejected": -149.878173828125, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.0031036371365189552, "rewards/margins": 0.029024124145507812, "rewards/rejected": -0.025920487940311432, "step": 624 }, { "epoch": 1.55, "learning_rate": 1.24717860582691e-08, "logits/chosen": -0.5403696298599243, "logits/rejected": -0.5060780048370361, "logps/chosen": -138.0675811767578, "logps/rejected": -166.14585876464844, "loss": 0.6772, "rewards/accuracies": 0.5, "rewards/chosen": 0.01043548621237278, "rewards/margins": 0.002549170982092619, "rewards/rejected": 0.007886315695941448, "step": 625 }, { "epoch": 1.56, "learning_rate": 1.2338846641698226e-08, "logits/chosen": -0.27301138639450073, "logits/rejected": -0.2636381685733795, "logps/chosen": -148.90118408203125, "logps/rejected": -168.1456756591797, "loss": 0.6877, "rewards/accuracies": 0.5, "rewards/chosen": -0.029729843139648438, "rewards/margins": -0.01692047342658043, "rewards/rejected": -0.012809371575713158, "step": 626 }, { "epoch": 1.56, "learning_rate": 1.220651974171929e-08, "logits/chosen": -0.24258148670196533, "logits/rejected": -0.24351949989795685, "logps/chosen": -159.89376831054688, "logps/rejected": -160.35531616210938, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.02786560356616974, "rewards/margins": 0.029453853145241737, "rewards/rejected": -0.0015882492298260331, "step": 627 }, { "epoch": 1.56, "learning_rate": 1.2074807510481494e-08, "logits/chosen": -0.3424605429172516, "logits/rejected": -0.30497586727142334, "logps/chosen": -141.3307647705078, "logps/rejected": -164.53025817871094, "loss": 0.6789, "rewards/accuracies": 0.5, "rewards/chosen": 0.017374420538544655, "rewards/margins": 0.025086212903261185, "rewards/rejected": -0.0077117932960391045, "step": 628 }, { "epoch": 1.56, "learning_rate": 1.1943712090137153e-08, "logits/chosen": -0.17325511574745178, "logits/rejected": -0.1250770539045334, "logps/chosen": -137.62652587890625, "logps/rejected": -154.30307006835938, "loss": 0.6786, "rewards/accuracies": 0.75, "rewards/chosen": 0.009408188052475452, "rewards/margins": 0.020849039778113365, "rewards/rejected": -0.011440850794315338, "step": 629 }, { "epoch": 1.57, "learning_rate": 1.1813235612806865e-08, "logits/chosen": -0.20835459232330322, "logits/rejected": -0.17623770236968994, "logps/chosen": -158.7451629638672, "logps/rejected": -134.85443115234375, "loss": 0.6726, "rewards/accuracies": 0.5, "rewards/chosen": 0.0211334228515625, "rewards/margins": 0.05316925048828125, "rewards/rejected": -0.03203582763671875, "step": 630 }, { "epoch": 1.57, "learning_rate": 1.168338020054478e-08, "logits/chosen": -0.3196198046207428, "logits/rejected": -0.3095609247684479, "logps/chosen": -155.195556640625, "logps/rejected": -178.34967041015625, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": -0.028489112854003906, "rewards/margins": 0.001922035589814186, "rewards/rejected": -0.030411148443818092, "step": 631 }, { "epoch": 1.57, "learning_rate": 1.1554147965304129e-08, "logits/chosen": -0.5015212893486023, "logits/rejected": -0.48790109157562256, "logps/chosen": -169.4319305419922, "logps/rejected": -193.94403076171875, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": 0.0068988800048828125, "rewards/margins": 0.024532318115234375, "rewards/rejected": -0.017633436247706413, "step": 632 }, { "epoch": 1.57, "learning_rate": 1.142554100890285e-08, "logits/chosen": -0.42921724915504456, "logits/rejected": -0.398971289396286, "logps/chosen": -137.12118530273438, "logps/rejected": -172.83413696289062, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -0.02866802178323269, "rewards/margins": 0.026400184258818626, "rewards/rejected": -0.055068209767341614, "step": 633 }, { "epoch": 1.58, "learning_rate": 1.1297561422989466e-08, "logits/chosen": -0.2462986260652542, "logits/rejected": -0.2507481276988983, "logps/chosen": -155.91998291015625, "logps/rejected": -171.51893615722656, "loss": 0.693, "rewards/accuracies": 0.0, "rewards/chosen": -0.032654572278261185, "rewards/margins": -0.05092144384980202, "rewards/rejected": 0.018266869708895683, "step": 634 }, { "epoch": 1.58, "learning_rate": 1.117021128900898e-08, "logits/chosen": -0.2858719825744629, "logits/rejected": -0.2546524107456207, "logps/chosen": -162.22952270507812, "logps/rejected": -166.89215087890625, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.025674819946289062, "rewards/margins": 0.056740954518318176, "rewards/rejected": -0.031066132709383965, "step": 635 }, { "epoch": 1.58, "learning_rate": 1.1043492678169064e-08, "logits/chosen": -0.4596944749355316, "logits/rejected": -0.4214317798614502, "logps/chosen": -123.72189331054688, "logps/rejected": -165.6112823486328, "loss": 0.6943, "rewards/accuracies": 0.5, "rewards/chosen": 0.03686828911304474, "rewards/margins": -0.005080985836684704, "rewards/rejected": 0.04194927215576172, "step": 636 }, { "epoch": 1.58, "learning_rate": 1.091740765140638e-08, "logits/chosen": -0.3177310824394226, "logits/rejected": -0.2913024425506592, "logps/chosen": -139.12274169921875, "logps/rejected": -133.01792907714844, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.015906715765595436, "rewards/margins": 0.041857339441776276, "rewards/rejected": -0.02595062367618084, "step": 637 }, { "epoch": 1.59, "learning_rate": 1.079195825935304e-08, "logits/chosen": -0.27592629194259644, "logits/rejected": -0.26236778497695923, "logps/chosen": -132.34812927246094, "logps/rejected": -182.83258056640625, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": -0.017136573791503906, "rewards/margins": -0.01633014716207981, "rewards/rejected": -0.0008064266294240952, "step": 638 }, { "epoch": 1.59, "learning_rate": 1.0667146542303306e-08, "logits/chosen": -0.12078359723091125, "logits/rejected": -0.06677890568971634, "logps/chosen": -121.54171752929688, "logps/rejected": -162.84469604492188, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 0.01712207868695259, "rewards/margins": 0.028320884332060814, "rewards/rejected": -0.011198805645108223, "step": 639 }, { "epoch": 1.59, "learning_rate": 1.0542974530180326e-08, "logits/chosen": -0.3971944749355316, "logits/rejected": -0.39515259861946106, "logps/chosen": -155.57083129882812, "logps/rejected": -147.23976135253906, "loss": 0.6855, "rewards/accuracies": 0.75, "rewards/chosen": 0.018644332885742188, "rewards/margins": 0.034172821789979935, "rewards/rejected": -0.015528487972915173, "step": 640 }, { "epoch": 1.59, "learning_rate": 1.0419444242503178e-08, "logits/chosen": -0.1276007741689682, "logits/rejected": -0.1228765994310379, "logps/chosen": -141.36099243164062, "logps/rejected": -143.3678741455078, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": 0.0012104036286473274, "rewards/margins": 0.00513954134657979, "rewards/rejected": -0.003929137717932463, "step": 641 }, { "epoch": 1.6, "learning_rate": 1.0296557688353996e-08, "logits/chosen": -0.2248343825340271, "logits/rejected": -0.20781181752681732, "logps/chosen": -137.01853942871094, "logps/rejected": -158.09915161132812, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": 0.01172943040728569, "rewards/margins": 0.023328781127929688, "rewards/rejected": -0.011599348857998848, "step": 642 }, { "epoch": 1.6, "learning_rate": 1.0174316866345339e-08, "logits/chosen": -0.35209017992019653, "logits/rejected": -0.3201100826263428, "logps/chosen": -132.21694946289062, "logps/rejected": -163.50021362304688, "loss": 0.6864, "rewards/accuracies": 0.75, "rewards/chosen": 0.022886086255311966, "rewards/margins": 0.04570064693689346, "rewards/rejected": -0.022814560681581497, "step": 643 }, { "epoch": 1.6, "learning_rate": 1.0052723764587634e-08, "logits/chosen": -0.3624230623245239, "logits/rejected": -0.3077896237373352, "logps/chosen": -154.53091430664062, "logps/rejected": -142.50296020507812, "loss": 0.6851, "rewards/accuracies": 0.25, "rewards/chosen": -0.009971046820282936, "rewards/margins": -0.03154449537396431, "rewards/rejected": 0.021573448553681374, "step": 644 }, { "epoch": 1.6, "learning_rate": 9.93178036065685e-09, "logits/chosen": -0.2298841029405594, "logits/rejected": -0.18940600752830505, "logps/chosen": -130.6974639892578, "logps/rejected": -144.34136962890625, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": -0.01557769812643528, "rewards/margins": 0.009110069833695889, "rewards/rejected": -0.024687767028808594, "step": 645 }, { "epoch": 1.61, "learning_rate": 9.811488621562347e-09, "logits/chosen": -0.24639225006103516, "logits/rejected": -0.23600156605243683, "logps/chosen": -187.217529296875, "logps/rejected": -195.23068237304688, "loss": 0.6868, "rewards/accuracies": 0.25, "rewards/chosen": 0.02271270751953125, "rewards/margins": -0.006113434210419655, "rewards/rejected": 0.028826141729950905, "step": 646 }, { "epoch": 1.61, "learning_rate": 9.691850503714926e-09, "logits/chosen": -0.18258750438690186, "logits/rejected": -0.16372627019882202, "logps/chosen": -150.43899536132812, "logps/rejected": -188.60302734375, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": -0.014594651758670807, "rewards/margins": 0.006256865803152323, "rewards/rejected": -0.02085151895880699, "step": 647 }, { "epoch": 1.61, "learning_rate": 9.572867952894925e-09, "logits/chosen": -0.4449678361415863, "logits/rejected": -0.41474229097366333, "logps/chosen": -156.16339111328125, "logps/rejected": -148.63180541992188, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014583589509129524, "rewards/margins": 0.0003244406543672085, "rewards/rejected": 0.001133918878622353, "step": 648 }, { "epoch": 1.61, "learning_rate": 9.454542904220619e-09, "logits/chosen": -0.34837889671325684, "logits/rejected": -0.34004127979278564, "logps/chosen": -170.19232177734375, "logps/rejected": -181.0757598876953, "loss": 0.6798, "rewards/accuracies": 0.25, "rewards/chosen": -0.015354728326201439, "rewards/margins": -0.04295406863093376, "rewards/rejected": 0.027599336579442024, "step": 649 }, { "epoch": 1.62, "learning_rate": 9.336877282116772e-09, "logits/chosen": -0.3935059607028961, "logits/rejected": -0.3775092363357544, "logps/chosen": -152.58230590820312, "logps/rejected": -169.49270629882812, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.049274444580078125, "rewards/margins": 0.0540439635515213, "rewards/rejected": -0.004769516177475452, "step": 650 }, { "epoch": 1.62, "learning_rate": 9.21987300028329e-09, "logits/chosen": -0.37629249691963196, "logits/rejected": -0.3505760431289673, "logps/chosen": -158.5342559814453, "logps/rejected": -142.15638732910156, "loss": 0.6917, "rewards/accuracies": 0.25, "rewards/chosen": 0.005700111389160156, "rewards/margins": -0.002135658636689186, "rewards/rejected": 0.007835770025849342, "step": 651 }, { "epoch": 1.62, "learning_rate": 9.103531961664119e-09, "logits/chosen": -0.16507147252559662, "logits/rejected": -0.12946796417236328, "logps/chosen": -127.25350952148438, "logps/rejected": -164.006591796875, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": 0.014896392822265625, "rewards/margins": 0.02301769144833088, "rewards/rejected": -0.008121299557387829, "step": 652 }, { "epoch": 1.62, "learning_rate": 8.987856058416305e-09, "logits/chosen": -0.3350844085216522, "logits/rejected": -0.3568568527698517, "logps/chosen": -145.6357879638672, "logps/rejected": -127.7461929321289, "loss": 0.6822, "rewards/accuracies": 0.75, "rewards/chosen": 0.00510177668184042, "rewards/margins": 0.027272608131170273, "rewards/rejected": -0.02217083051800728, "step": 653 }, { "epoch": 1.63, "learning_rate": 8.8728471718792e-09, "logits/chosen": -0.28328222036361694, "logits/rejected": -0.23956109583377838, "logps/chosen": -155.1990509033203, "logps/rejected": -216.09014892578125, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.05726604536175728, "rewards/margins": 0.06536273658275604, "rewards/rejected": -0.008096694946289062, "step": 654 }, { "epoch": 1.63, "learning_rate": 8.758507172543883e-09, "logits/chosen": -0.1933213770389557, "logits/rejected": -0.18073707818984985, "logps/chosen": -154.54981994628906, "logps/rejected": -143.92247009277344, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": 0.009598542004823685, "rewards/margins": -0.0021554939448833466, "rewards/rejected": 0.011754035949707031, "step": 655 }, { "epoch": 1.63, "learning_rate": 8.644837920022708e-09, "logits/chosen": -0.2601086497306824, "logits/rejected": -0.25952091813087463, "logps/chosen": -129.53106689453125, "logps/rejected": -178.482421875, "loss": 0.6868, "rewards/accuracies": 0.25, "rewards/chosen": 0.011276436038315296, "rewards/margins": -0.007050515152513981, "rewards/rejected": 0.018326949328184128, "step": 656 }, { "epoch": 1.63, "learning_rate": 8.531841263019124e-09, "logits/chosen": -0.2545204758644104, "logits/rejected": -0.2653210461139679, "logps/chosen": -155.0509490966797, "logps/rejected": -218.5908203125, "loss": 0.6825, "rewards/accuracies": 0.75, "rewards/chosen": -0.011025810614228249, "rewards/margins": 0.032938577234745026, "rewards/rejected": -0.043964385986328125, "step": 657 }, { "epoch": 1.64, "learning_rate": 8.419519039297535e-09, "logits/chosen": -0.25709861516952515, "logits/rejected": -0.23942717909812927, "logps/chosen": -147.8103485107422, "logps/rejected": -176.09661865234375, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": 0.01585559919476509, "rewards/margins": 0.014322853647172451, "rewards/rejected": 0.00153274554759264, "step": 658 }, { "epoch": 1.64, "learning_rate": 8.307873075653426e-09, "logits/chosen": -0.26760274171829224, "logits/rejected": -0.2767907381057739, "logps/chosen": -136.84970092773438, "logps/rejected": -169.89373779296875, "loss": 0.6814, "rewards/accuracies": 0.75, "rewards/chosen": -0.008061789907515049, "rewards/margins": -0.002153778448700905, "rewards/rejected": -0.005908013321459293, "step": 659 }, { "epoch": 1.64, "learning_rate": 8.196905187883713e-09, "logits/chosen": -0.15812046825885773, "logits/rejected": -0.15011446177959442, "logps/chosen": -167.2583770751953, "logps/rejected": -159.32565307617188, "loss": 0.6864, "rewards/accuracies": 0.25, "rewards/chosen": -0.027190400287508965, "rewards/margins": -0.038636017590761185, "rewards/rejected": 0.011445618234574795, "step": 660 }, { "epoch": 1.64, "learning_rate": 8.086617180757132e-09, "logits/chosen": -0.28487589955329895, "logits/rejected": -0.24784614145755768, "logps/chosen": -156.72738647460938, "logps/rejected": -137.36370849609375, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": 0.0006284713745117188, "rewards/margins": 0.017513275146484375, "rewards/rejected": -0.016884803771972656, "step": 661 }, { "epoch": 1.65, "learning_rate": 7.97701084798491e-09, "logits/chosen": -0.4110132157802582, "logits/rejected": -0.38127464056015015, "logps/chosen": -166.95529174804688, "logps/rejected": -173.4225311279297, "loss": 0.6906, "rewards/accuracies": 0.75, "rewards/chosen": 0.03038330189883709, "rewards/margins": 0.02675018459558487, "rewards/rejected": 0.0036331163719296455, "step": 662 }, { "epoch": 1.65, "learning_rate": 7.868087972191645e-09, "logits/chosen": -0.2927989363670349, "logits/rejected": -0.26053640246391296, "logps/chosen": -148.18130493164062, "logps/rejected": -195.53883361816406, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": 0.018325425684452057, "rewards/margins": 0.029327392578125, "rewards/rejected": -0.011001968756318092, "step": 663 }, { "epoch": 1.65, "learning_rate": 7.759850324886235e-09, "logits/chosen": -0.2575388550758362, "logits/rejected": -0.26225072145462036, "logps/chosen": -158.02975463867188, "logps/rejected": -211.40660095214844, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": 0.01236877404153347, "rewards/margins": 0.020952414721250534, "rewards/rejected": -0.008583641611039639, "step": 664 }, { "epoch": 1.65, "learning_rate": 7.6522996664331e-09, "logits/chosen": -0.1484163999557495, "logits/rejected": -0.1337553858757019, "logps/chosen": -152.16067504882812, "logps/rejected": -161.6768798828125, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": -0.009782027453184128, "rewards/margins": 0.0006561279296875, "rewards/rejected": -0.010438157245516777, "step": 665 }, { "epoch": 1.66, "learning_rate": 7.545437746023586e-09, "logits/chosen": -0.29485371708869934, "logits/rejected": -0.250537246465683, "logps/chosen": -156.18067932128906, "logps/rejected": -161.8492889404297, "loss": 0.6829, "rewards/accuracies": 0.75, "rewards/chosen": 0.027919577434659004, "rewards/margins": 0.013802146539092064, "rewards/rejected": 0.014117431826889515, "step": 666 }, { "epoch": 1.66, "learning_rate": 7.439266301647457e-09, "logits/chosen": -0.15990181267261505, "logits/rejected": -0.14735284447669983, "logps/chosen": -154.7827911376953, "logps/rejected": -158.27456665039062, "loss": 0.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.01941223070025444, "rewards/margins": 0.030659105628728867, "rewards/rejected": -0.011246871203184128, "step": 667 }, { "epoch": 1.66, "learning_rate": 7.333787060064661e-09, "logits/chosen": -0.41008704900741577, "logits/rejected": -0.37551116943359375, "logps/chosen": -134.82025146484375, "logps/rejected": -173.14585876464844, "loss": 0.6867, "rewards/accuracies": 0.75, "rewards/chosen": 0.01485538575798273, "rewards/margins": 0.05698452144861221, "rewards/rejected": -0.04212913662195206, "step": 668 }, { "epoch": 1.66, "learning_rate": 7.2290017367772345e-09, "logits/chosen": -0.36443018913269043, "logits/rejected": -0.3513912260532379, "logps/chosen": -139.25106811523438, "logps/rejected": -145.8203887939453, "loss": 0.6818, "rewards/accuracies": 0.5, "rewards/chosen": 0.016538430005311966, "rewards/margins": 0.0039005286525934935, "rewards/rejected": 0.012637902051210403, "step": 669 }, { "epoch": 1.67, "learning_rate": 7.124912036001429e-09, "logits/chosen": -0.23372051119804382, "logits/rejected": -0.19467739760875702, "logps/chosen": -157.76963806152344, "logps/rejected": -196.59512329101562, "loss": 0.6772, "rewards/accuracies": 0.5, "rewards/chosen": 0.019295310601592064, "rewards/margins": 0.011816596612334251, "rewards/rejected": 0.0074787139892578125, "step": 670 }, { "epoch": 1.67, "learning_rate": 7.021519650639951e-09, "logits/chosen": -0.05537058785557747, "logits/rejected": -0.027172140777111053, "logps/chosen": -132.31280517578125, "logps/rejected": -153.2760772705078, "loss": 0.6914, "rewards/accuracies": 0.25, "rewards/chosen": -0.0007678982801735401, "rewards/margins": -0.021738052368164062, "rewards/rejected": 0.02097015455365181, "step": 671 }, { "epoch": 1.67, "learning_rate": 6.9188262622544515e-09, "logits/chosen": -0.21869581937789917, "logits/rejected": -0.1788782924413681, "logps/chosen": -183.26385498046875, "logps/rejected": -200.66798400878906, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.016810797154903412, "rewards/margins": 0.022446250542998314, "rewards/rejected": -0.005635452456772327, "step": 672 }, { "epoch": 1.67, "learning_rate": 6.816833541038203e-09, "logits/chosen": -0.3740076720714569, "logits/rejected": -0.3524826169013977, "logps/chosen": -145.87539672851562, "logps/rejected": -179.04803466796875, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.012392044067382812, "rewards/margins": 0.0016159070655703545, "rewards/rejected": 0.010776137933135033, "step": 673 }, { "epoch": 1.68, "learning_rate": 6.7155431457888826e-09, "logits/chosen": -0.19469214975833893, "logits/rejected": -0.16316333413124084, "logps/chosen": -149.32379150390625, "logps/rejected": -130.66807556152344, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": 0.024932099506258965, "rewards/margins": 0.03638400882482529, "rewards/rejected": -0.01145191304385662, "step": 674 }, { "epoch": 1.68, "learning_rate": 6.614956723881615e-09, "logits/chosen": -0.2261587381362915, "logits/rejected": -0.21632154285907745, "logps/chosen": -144.59030151367188, "logps/rejected": -158.31410217285156, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": 0.025677872821688652, "rewards/margins": 0.02060680463910103, "rewards/rejected": 0.005071068182587624, "step": 675 }, { "epoch": 1.68, "learning_rate": 6.5150759112422234e-09, "logits/chosen": -0.478666216135025, "logits/rejected": -0.47704923152923584, "logps/chosen": -131.00689697265625, "logps/rejected": -165.12017822265625, "loss": 0.6812, "rewards/accuracies": 0.25, "rewards/chosen": 0.008286094292998314, "rewards/margins": -0.029364969581365585, "rewards/rejected": 0.03765106201171875, "step": 676 }, { "epoch": 1.68, "learning_rate": 6.415902332320544e-09, "logits/chosen": -0.25670936703681946, "logits/rejected": -0.23158378899097443, "logps/chosen": -152.87396240234375, "logps/rejected": -146.0147705078125, "loss": 0.6881, "rewards/accuracies": 0.75, "rewards/chosen": -0.007493019104003906, "rewards/margins": 0.008602142333984375, "rewards/rejected": -0.01609516143798828, "step": 677 }, { "epoch": 1.69, "learning_rate": 6.317437600064046e-09, "logits/chosen": -0.27737659215927124, "logits/rejected": -0.28294047713279724, "logps/chosen": -172.517822265625, "logps/rejected": -160.29592895507812, "loss": 0.6763, "rewards/accuracies": 0.75, "rewards/chosen": 0.019706344231963158, "rewards/margins": 0.025475312024354935, "rewards/rejected": -0.005768966861069202, "step": 678 }, { "epoch": 1.69, "learning_rate": 6.219683315891644e-09, "logits/chosen": -0.47840943932533264, "logits/rejected": -0.441383421421051, "logps/chosen": -192.63925170898438, "logps/rejected": -179.90408325195312, "loss": 0.6807, "rewards/accuracies": 0.75, "rewards/chosen": 0.03647727891802788, "rewards/margins": 0.03498191386461258, "rewards/rejected": 0.0014953622594475746, "step": 679 }, { "epoch": 1.69, "learning_rate": 6.122641069667567e-09, "logits/chosen": -0.30052363872528076, "logits/rejected": -0.2846235930919647, "logps/chosen": -161.01170349121094, "logps/rejected": -144.43902587890625, "loss": 0.6883, "rewards/accuracies": 0.75, "rewards/chosen": -0.00676307687535882, "rewards/margins": 0.010946082882583141, "rewards/rejected": -0.0177091583609581, "step": 680 }, { "epoch": 1.69, "learning_rate": 6.026312439675552e-09, "logits/chosen": -0.2627371847629547, "logits/rejected": -0.2578830420970917, "logps/chosen": -157.73651123046875, "logps/rejected": -180.27919006347656, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": -0.004706191830337048, "rewards/margins": -0.004056548234075308, "rewards/rejected": -0.0006496427813544869, "step": 681 }, { "epoch": 1.7, "learning_rate": 5.930698992593153e-09, "logits/chosen": -0.3602895736694336, "logits/rejected": -0.3459394872188568, "logps/chosen": -155.046875, "logps/rejected": -165.21412658691406, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": 0.013539505191147327, "rewards/margins": 0.015571975149214268, "rewards/rejected": -0.0020324711222201586, "step": 682 }, { "epoch": 1.7, "learning_rate": 5.835802283466312e-09, "logits/chosen": -0.25078678131103516, "logits/rejected": -0.250380277633667, "logps/chosen": -144.10562133789062, "logps/rejected": -158.42770385742188, "loss": 0.6766, "rewards/accuracies": 0.75, "rewards/chosen": 0.006118583492934704, "rewards/margins": 0.022781945765018463, "rewards/rejected": -0.016663361340761185, "step": 683 }, { "epoch": 1.7, "learning_rate": 5.741623855683992e-09, "logits/chosen": -0.2770085632801056, "logits/rejected": -0.2844131290912628, "logps/chosen": -144.04078674316406, "logps/rejected": -147.1527099609375, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.010652541182935238, "rewards/margins": 0.010173414833843708, "rewards/rejected": 0.00047912634909152985, "step": 684 }, { "epoch": 1.7, "learning_rate": 5.648165240953123e-09, "logits/chosen": -0.3593659996986389, "logits/rejected": -0.35899364948272705, "logps/chosen": -157.81802368164062, "logps/rejected": -187.27145385742188, "loss": 0.6754, "rewards/accuracies": 0.5, "rewards/chosen": 0.026572609320282936, "rewards/margins": 0.05437755957245827, "rewards/rejected": -0.027804944664239883, "step": 685 }, { "epoch": 1.71, "learning_rate": 5.555427959273684e-09, "logits/chosen": -0.23967808485031128, "logits/rejected": -0.22721916437149048, "logps/chosen": -132.64285278320312, "logps/rejected": -154.76904296875, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 0.0010446547530591488, "rewards/margins": 0.02147998847067356, "rewards/rejected": -0.020435333251953125, "step": 686 }, { "epoch": 1.71, "learning_rate": 5.463413518913973e-09, "logits/chosen": -0.25270596146583557, "logits/rejected": -0.21074172854423523, "logps/chosen": -164.25669860839844, "logps/rejected": -186.56361389160156, "loss": 0.6902, "rewards/accuracies": 0.25, "rewards/chosen": -0.022333526983857155, "rewards/margins": -0.013950729742646217, "rewards/rejected": -0.008382797241210938, "step": 687 }, { "epoch": 1.71, "learning_rate": 5.3721234163860655e-09, "logits/chosen": -0.42796990275382996, "logits/rejected": -0.418274462223053, "logps/chosen": -135.34219360351562, "logps/rejected": -171.4232177734375, "loss": 0.6842, "rewards/accuracies": 0.25, "rewards/chosen": -0.0027580272872000933, "rewards/margins": 0.002682493068277836, "rewards/rejected": -0.005440521519631147, "step": 688 }, { "epoch": 1.71, "learning_rate": 5.281559136421537e-09, "logits/chosen": -0.2849990129470825, "logits/rejected": -0.28000184893608093, "logps/chosen": -144.4366912841797, "logps/rejected": -157.6671142578125, "loss": 0.6785, "rewards/accuracies": 0.25, "rewards/chosen": 0.0061157215386629105, "rewards/margins": -0.0014316565357148647, "rewards/rejected": 0.0075473785400390625, "step": 689 }, { "epoch": 1.72, "learning_rate": 5.1917221519472256e-09, "logits/chosen": -0.33362722396850586, "logits/rejected": -0.3169896900653839, "logps/chosen": -153.74740600585938, "logps/rejected": -205.4685821533203, "loss": 0.6865, "rewards/accuracies": 0.25, "rewards/chosen": -0.008580016903579235, "rewards/margins": -0.010317611508071423, "rewards/rejected": 0.0017375946044921875, "step": 690 }, { "epoch": 1.72, "learning_rate": 5.102613924061339e-09, "logits/chosen": -0.37616291642189026, "logits/rejected": -0.3433045744895935, "logps/chosen": -173.50714111328125, "logps/rejected": -148.9453582763672, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": 0.034148406237363815, "rewards/margins": 0.06164589151740074, "rewards/rejected": -0.027497481554746628, "step": 691 }, { "epoch": 1.72, "learning_rate": 5.014235902009656e-09, "logits/chosen": -0.41904744505882263, "logits/rejected": -0.4170151650905609, "logps/chosen": -157.04238891601562, "logps/rejected": -159.0098419189453, "loss": 0.6789, "rewards/accuracies": 0.75, "rewards/chosen": 0.010870743542909622, "rewards/margins": 0.023893356323242188, "rewards/rejected": -0.01302261371165514, "step": 692 }, { "epoch": 1.72, "learning_rate": 4.926589523162012e-09, "logits/chosen": -0.445955365896225, "logits/rejected": -0.44672492146492004, "logps/chosen": -154.92312622070312, "logps/rejected": -167.3513946533203, "loss": 0.6854, "rewards/accuracies": 0.75, "rewards/chosen": 0.004062080290168524, "rewards/margins": 0.009718131273984909, "rewards/rejected": -0.005656050518155098, "step": 693 }, { "epoch": 1.73, "learning_rate": 4.839676212988847e-09, "logits/chosen": -0.1684308797121048, "logits/rejected": -0.12417499721050262, "logps/chosen": -175.14596557617188, "logps/rejected": -133.45706176757812, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.00287284841760993, "rewards/margins": -0.011943627148866653, "rewards/rejected": 0.014816475100815296, "step": 694 }, { "epoch": 1.73, "learning_rate": 4.753497385038057e-09, "logits/chosen": -0.3355487883090973, "logits/rejected": -0.3485339283943176, "logps/chosen": -115.72613525390625, "logps/rejected": -171.2386474609375, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.02428111992776394, "rewards/margins": -0.03484134376049042, "rewards/rejected": 0.010560226626694202, "step": 695 }, { "epoch": 1.73, "learning_rate": 4.6680544409120396e-09, "logits/chosen": -0.4636997878551483, "logits/rejected": -0.4688413441181183, "logps/chosen": -130.4059295654297, "logps/rejected": -140.6136932373047, "loss": 0.6884, "rewards/accuracies": 0.25, "rewards/chosen": -0.04168377071619034, "rewards/margins": -0.01980609819293022, "rewards/rejected": -0.021877672523260117, "step": 696 }, { "epoch": 1.73, "learning_rate": 4.583348770244833e-09, "logits/chosen": -0.24159882962703705, "logits/rejected": -0.2110578566789627, "logps/chosen": -134.30462646484375, "logps/rejected": -176.96856689453125, "loss": 0.6886, "rewards/accuracies": 0.25, "rewards/chosen": -0.017894936725497246, "rewards/margins": -0.047588542103767395, "rewards/rejected": 0.029693603515625, "step": 697 }, { "epoch": 1.74, "learning_rate": 4.4993817506795375e-09, "logits/chosen": -0.2440304309129715, "logits/rejected": -0.2238645702600479, "logps/chosen": -157.3824462890625, "logps/rejected": -181.01702880859375, "loss": 0.6841, "rewards/accuracies": 0.75, "rewards/chosen": 0.017752457410097122, "rewards/margins": 0.05183982849121094, "rewards/rejected": -0.034087371081113815, "step": 698 }, { "epoch": 1.74, "learning_rate": 4.416154747845957e-09, "logits/chosen": -0.3288722038269043, "logits/rejected": -0.3110567331314087, "logps/chosen": -139.18577575683594, "logps/rejected": -152.48626708984375, "loss": 0.6936, "rewards/accuracies": 0.25, "rewards/chosen": -0.011680412106215954, "rewards/margins": -0.01050586812198162, "rewards/rejected": -0.0011745449155569077, "step": 699 }, { "epoch": 1.74, "learning_rate": 4.333669115338323e-09, "logits/chosen": -0.5402836799621582, "logits/rejected": -0.5032609701156616, "logps/chosen": -112.0423583984375, "logps/rejected": -160.2455596923828, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.01864909939467907, "rewards/margins": 0.07087840884923935, "rewards/rejected": -0.05222930759191513, "step": 700 }, { "epoch": 1.74, "learning_rate": 4.251926194693306e-09, "logits/chosen": -0.08478525280952454, "logits/rejected": -0.07648051530122757, "logps/chosen": -104.5135269165039, "logps/rejected": -158.32232666015625, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.022081375122070312, "rewards/margins": 0.052178576588630676, "rewards/rejected": -0.030097197741270065, "step": 701 }, { "epoch": 1.75, "learning_rate": 4.1709273153682115e-09, "logits/chosen": -0.2927549183368683, "logits/rejected": -0.2982402741909027, "logps/chosen": -161.15097045898438, "logps/rejected": -162.403564453125, "loss": 0.6873, "rewards/accuracies": 0.75, "rewards/chosen": -0.02574348635971546, "rewards/margins": -0.001930808648467064, "rewards/rejected": -0.02381267584860325, "step": 702 }, { "epoch": 1.75, "learning_rate": 4.090673794719329e-09, "logits/chosen": -0.3099636435508728, "logits/rejected": -0.2954809069633484, "logps/chosen": -161.21258544921875, "logps/rejected": -149.09239196777344, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": -0.00982666015625, "rewards/margins": 0.009798622690141201, "rewards/rejected": -0.019625281915068626, "step": 703 }, { "epoch": 1.75, "learning_rate": 4.011166937980531e-09, "logits/chosen": -0.30899596214294434, "logits/rejected": -0.309733122587204, "logps/chosen": -137.05081176757812, "logps/rejected": -152.7642059326172, "loss": 0.6837, "rewards/accuracies": 0.75, "rewards/chosen": -0.010649681091308594, "rewards/margins": 0.02718639373779297, "rewards/rejected": -0.03783607855439186, "step": 704 }, { "epoch": 1.75, "learning_rate": 3.932408038242024e-09, "logits/chosen": -0.30932489037513733, "logits/rejected": -0.3119296729564667, "logps/chosen": -170.345947265625, "logps/rejected": -166.82537841796875, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.023124312981963158, "rewards/margins": 0.029848860576748848, "rewards/rejected": -0.006724547129124403, "step": 705 }, { "epoch": 1.76, "learning_rate": 3.8543983764293575e-09, "logits/chosen": -0.3169057071208954, "logits/rejected": -0.33663466572761536, "logps/chosen": -154.58053588867188, "logps/rejected": -175.82962036132812, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004049297422170639, "rewards/margins": 0.02965870127081871, "rewards/rejected": -0.030063629150390625, "step": 706 }, { "epoch": 1.76, "learning_rate": 3.7771392212825385e-09, "logits/chosen": -0.34492719173431396, "logits/rejected": -0.3436262905597687, "logps/chosen": -141.2680206298828, "logps/rejected": -137.19558715820312, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 0.012998772785067558, "rewards/margins": 0.004922295920550823, "rewards/rejected": 0.008076476864516735, "step": 707 }, { "epoch": 1.76, "learning_rate": 3.70063182933541e-09, "logits/chosen": -0.5700247883796692, "logits/rejected": -0.5731378793716431, "logps/chosen": -160.80340576171875, "logps/rejected": -147.24395751953125, "loss": 0.6848, "rewards/accuracies": 0.5, "rewards/chosen": 0.014473343268036842, "rewards/margins": 0.007581328973174095, "rewards/rejected": 0.006892014294862747, "step": 708 }, { "epoch": 1.76, "learning_rate": 3.624877444895269e-09, "logits/chosen": -0.3132096529006958, "logits/rejected": -0.2733702063560486, "logps/chosen": -162.50994873046875, "logps/rejected": -175.61439514160156, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016098023625090718, "rewards/margins": 0.017223739996552467, "rewards/rejected": -0.018833540380001068, "step": 709 }, { "epoch": 1.77, "learning_rate": 3.549877300022547e-09, "logits/chosen": -0.16802877187728882, "logits/rejected": -0.12742722034454346, "logps/chosen": -146.74273681640625, "logps/rejected": -165.57920837402344, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.019086265936493874, "rewards/margins": 0.020230483263731003, "rewards/rejected": -0.0011442184913903475, "step": 710 }, { "epoch": 1.77, "learning_rate": 3.4756326145108204e-09, "logits/chosen": -0.28521984815597534, "logits/rejected": -0.28820034861564636, "logps/chosen": -131.3827362060547, "logps/rejected": -184.465087890625, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": -0.01753387413918972, "rewards/margins": 0.017963029444217682, "rewards/rejected": -0.03549690544605255, "step": 711 }, { "epoch": 1.77, "learning_rate": 3.402144595866979e-09, "logits/chosen": -0.2475489228963852, "logits/rejected": -0.24266310036182404, "logps/chosen": -141.86697387695312, "logps/rejected": -155.89100646972656, "loss": 0.6874, "rewards/accuracies": 0.5, "rewards/chosen": -0.007604408077895641, "rewards/margins": -0.00382919330149889, "rewards/rejected": -0.0037752147763967514, "step": 712 }, { "epoch": 1.77, "learning_rate": 3.3294144392915615e-09, "logits/chosen": -0.3065216541290283, "logits/rejected": -0.3108878433704376, "logps/chosen": -137.3340301513672, "logps/rejected": -190.79360961914062, "loss": 0.6857, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005994788371026516, "rewards/margins": -0.018739700317382812, "rewards/rejected": 0.018140221014618874, "step": 713 }, { "epoch": 1.78, "learning_rate": 3.2574433276593093e-09, "logits/chosen": -0.40151113271713257, "logits/rejected": -0.3936748504638672, "logps/chosen": -142.9602508544922, "logps/rejected": -139.8641815185547, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.043836213648319244, "rewards/margins": 0.055555153638124466, "rewards/rejected": -0.01171894185245037, "step": 714 }, { "epoch": 1.78, "learning_rate": 3.1862324314999745e-09, "logits/chosen": -0.39955204725265503, "logits/rejected": -0.39771851897239685, "logps/chosen": -132.59327697753906, "logps/rejected": -141.05923461914062, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": 0.00811004638671875, "rewards/margins": 0.010672378353774548, "rewards/rejected": -0.002562332432717085, "step": 715 }, { "epoch": 1.78, "learning_rate": 3.115782908979242e-09, "logits/chosen": -0.35924550890922546, "logits/rejected": -0.3388606905937195, "logps/chosen": -139.2794189453125, "logps/rejected": -147.30055236816406, "loss": 0.6869, "rewards/accuracies": 0.75, "rewards/chosen": 0.011670876294374466, "rewards/margins": 0.04229755327105522, "rewards/rejected": -0.030626678839325905, "step": 716 }, { "epoch": 1.78, "learning_rate": 3.0460959058798976e-09, "logits/chosen": -0.2266380488872528, "logits/rejected": -0.2151523381471634, "logps/chosen": -155.42446899414062, "logps/rejected": -162.85836791992188, "loss": 0.6793, "rewards/accuracies": 0.5, "rewards/chosen": -0.01393661554902792, "rewards/margins": -0.011530876159667969, "rewards/rejected": -0.002405739389359951, "step": 717 }, { "epoch": 1.79, "learning_rate": 2.9771725555832127e-09, "logits/chosen": -0.2815777361392975, "logits/rejected": -0.2823942005634308, "logps/chosen": -155.4105224609375, "logps/rejected": -159.99044799804688, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": -0.003388976911082864, "rewards/margins": 0.017633819952607155, "rewards/rejected": -0.021022796630859375, "step": 718 }, { "epoch": 1.79, "learning_rate": 2.9090139790504844e-09, "logits/chosen": -0.2500440776348114, "logits/rejected": -0.24461126327514648, "logps/chosen": -131.95156860351562, "logps/rejected": -161.03836059570312, "loss": 0.6872, "rewards/accuracies": 0.75, "rewards/chosen": 0.031036376953125, "rewards/margins": 0.0044593820348382, "rewards/rejected": 0.026576995849609375, "step": 719 }, { "epoch": 1.79, "learning_rate": 2.8416212848048294e-09, "logits/chosen": -0.4478338956832886, "logits/rejected": -0.44027742743492126, "logps/chosen": -121.08335876464844, "logps/rejected": -172.17433166503906, "loss": 0.6784, "rewards/accuracies": 0.75, "rewards/chosen": 0.0049533843994140625, "rewards/margins": 0.042894937098026276, "rewards/rejected": -0.03794155269861221, "step": 720 }, { "epoch": 1.79, "learning_rate": 2.7749955689131355e-09, "logits/chosen": -0.25336113572120667, "logits/rejected": -0.2277272492647171, "logps/chosen": -141.25534057617188, "logps/rejected": -153.0599365234375, "loss": 0.6876, "rewards/accuracies": 0.75, "rewards/chosen": 0.010981368832290173, "rewards/margins": 0.011605262756347656, "rewards/rejected": -0.0006238939240574837, "step": 721 }, { "epoch": 1.8, "learning_rate": 2.709137914968268e-09, "logits/chosen": -0.3632303774356842, "logits/rejected": -0.3311956822872162, "logps/chosen": -156.71588134765625, "logps/rejected": -173.7198486328125, "loss": 0.679, "rewards/accuracies": 0.5, "rewards/chosen": -0.012160874903202057, "rewards/margins": -0.03779792785644531, "rewards/rejected": 0.025637054815888405, "step": 722 }, { "epoch": 1.8, "learning_rate": 2.644049394071396e-09, "logits/chosen": -0.21773380041122437, "logits/rejected": -0.18973961472511292, "logps/chosen": -150.86550903320312, "logps/rejected": -165.55300903320312, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": -0.010735701769590378, "rewards/margins": 0.025095367804169655, "rewards/rejected": -0.03583106771111488, "step": 723 }, { "epoch": 1.8, "learning_rate": 2.5797310648145997e-09, "logits/chosen": 0.017437485978007317, "logits/rejected": 0.03614863008260727, "logps/chosen": -130.5182342529297, "logps/rejected": -147.40167236328125, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.015927888453006744, "rewards/margins": 0.048543740063905716, "rewards/rejected": -0.03261585161089897, "step": 724 }, { "epoch": 1.8, "learning_rate": 2.5161839732636804e-09, "logits/chosen": -0.18191169202327728, "logits/rejected": -0.16815690696239471, "logps/chosen": -118.30210876464844, "logps/rejected": -164.16445922851562, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": -0.0021341326646506786, "rewards/margins": 0.028832437470555305, "rewards/rejected": -0.030966568738222122, "step": 725 }, { "epoch": 1.81, "learning_rate": 2.4534091529410827e-09, "logits/chosen": -0.23099862039089203, "logits/rejected": -0.2209969162940979, "logps/chosen": -155.95529174804688, "logps/rejected": -188.0478057861328, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.020334243774414062, "rewards/margins": 0.08093662559986115, "rewards/rejected": -0.06060238182544708, "step": 726 }, { "epoch": 1.81, "learning_rate": 2.3914076248091498e-09, "logits/chosen": -0.21510761976242065, "logits/rejected": -0.17674271762371063, "logps/chosen": -127.64717102050781, "logps/rejected": -173.82933044433594, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": -0.01994914934039116, "rewards/margins": -0.0074281697161495686, "rewards/rejected": -0.012520980089902878, "step": 727 }, { "epoch": 1.81, "learning_rate": 2.330180397253473e-09, "logits/chosen": -0.4625481069087982, "logits/rejected": -0.4090662896633148, "logps/chosen": -151.50889587402344, "logps/rejected": -191.02374267578125, "loss": 0.6801, "rewards/accuracies": 0.5, "rewards/chosen": 0.01114501990377903, "rewards/margins": 0.051039889454841614, "rewards/rejected": -0.039894867688417435, "step": 728 }, { "epoch": 1.81, "learning_rate": 2.2697284660665227e-09, "logits/chosen": -0.22416403889656067, "logits/rejected": -0.18390977382659912, "logps/chosen": -146.892333984375, "logps/rejected": -141.079833984375, "loss": 0.6811, "rewards/accuracies": 0.5, "rewards/chosen": 0.017294693738222122, "rewards/margins": 0.010381698608398438, "rewards/rejected": 0.006912993732839823, "step": 729 }, { "epoch": 1.82, "learning_rate": 2.21005281443144e-09, "logits/chosen": -0.3722299337387085, "logits/rejected": -0.37576237320899963, "logps/chosen": -103.73098754882812, "logps/rejected": -177.26663208007812, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 0.01306533720344305, "rewards/margins": 0.04141082614660263, "rewards/rejected": -0.028345488011837006, "step": 730 }, { "epoch": 1.82, "learning_rate": 2.151154412906031e-09, "logits/chosen": -0.46038389205932617, "logits/rejected": -0.4003458023071289, "logps/chosen": -158.3670196533203, "logps/rejected": -179.98049926757812, "loss": 0.6796, "rewards/accuracies": 0.75, "rewards/chosen": 0.024242782965302467, "rewards/margins": 0.016054153442382812, "rewards/rejected": 0.00818862859159708, "step": 731 }, { "epoch": 1.82, "learning_rate": 2.093034219407014e-09, "logits/chosen": -0.3212709426879883, "logits/rejected": -0.2849760055541992, "logps/chosen": -149.58544921875, "logps/rejected": -181.6809844970703, "loss": 0.6795, "rewards/accuracies": 0.25, "rewards/chosen": -0.001999093219637871, "rewards/margins": -0.005925368517637253, "rewards/rejected": 0.003926277160644531, "step": 732 }, { "epoch": 1.82, "learning_rate": 2.0356931791944143e-09, "logits/chosen": -0.36948665976524353, "logits/rejected": -0.31101447343826294, "logps/chosen": -185.33599853515625, "logps/rejected": -161.894775390625, "loss": 0.6867, "rewards/accuracies": 0.25, "rewards/chosen": 0.0028171539306640625, "rewards/margins": -0.02756328694522381, "rewards/rejected": 0.03038044087588787, "step": 733 }, { "epoch": 1.83, "learning_rate": 1.9791322248562003e-09, "logits/chosen": -0.2806205153465271, "logits/rejected": -0.2947535514831543, "logps/chosen": -164.91702270507812, "logps/rejected": -136.955322265625, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.026218798011541367, "rewards/margins": 0.059495165944099426, "rewards/rejected": -0.03327636793255806, "step": 734 }, { "epoch": 1.83, "learning_rate": 1.923352276293122e-09, "logits/chosen": -0.18706639111042023, "logits/rejected": -0.17402759194374084, "logps/chosen": -155.65640258789062, "logps/rejected": -173.6845245361328, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.023102952167391777, "rewards/margins": 0.038238525390625, "rewards/rejected": -0.015135575085878372, "step": 735 }, { "epoch": 1.83, "learning_rate": 1.8683542407037234e-09, "logits/chosen": -0.3595023453235626, "logits/rejected": -0.3472379446029663, "logps/chosen": -165.7363739013672, "logps/rejected": -153.3146514892578, "loss": 0.6859, "rewards/accuracies": 0.25, "rewards/chosen": -0.002639389131218195, "rewards/margins": -0.023072432726621628, "rewards/rejected": 0.02043304592370987, "step": 736 }, { "epoch": 1.83, "learning_rate": 1.8141390125696265e-09, "logits/chosen": -0.35592615604400635, "logits/rejected": -0.35711753368377686, "logps/chosen": -126.14794158935547, "logps/rejected": -150.60086059570312, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": 0.008098411373794079, "rewards/margins": 0.03435249254107475, "rewards/rejected": -0.0262540802359581, "step": 737 }, { "epoch": 1.84, "learning_rate": 1.7607074736409654e-09, "logits/chosen": -0.22285297513008118, "logits/rejected": -0.22908125817775726, "logps/chosen": -172.4499053955078, "logps/rejected": -173.73739624023438, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": -0.02055969275534153, "rewards/margins": 0.0015264519024640322, "rewards/rejected": -0.022086145356297493, "step": 738 }, { "epoch": 1.84, "learning_rate": 1.708060492922031e-09, "logits/chosen": -0.4201814532279968, "logits/rejected": -0.41861915588378906, "logps/chosen": -160.91893005371094, "logps/rejected": -176.24093627929688, "loss": 0.6798, "rewards/accuracies": 0.75, "rewards/chosen": -0.0018880846910178661, "rewards/margins": 0.03203430026769638, "rewards/rejected": -0.033922385424375534, "step": 739 }, { "epoch": 1.84, "learning_rate": 1.6561989266571652e-09, "logits/chosen": -0.4432060420513153, "logits/rejected": -0.43045860528945923, "logps/chosen": -145.23434448242188, "logps/rejected": -154.80584716796875, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": 0.02754497528076172, "rewards/margins": 0.033217430114746094, "rewards/rejected": -0.005672454833984375, "step": 740 }, { "epoch": 1.84, "learning_rate": 1.605123618316795e-09, "logits/chosen": -0.3729058504104614, "logits/rejected": -0.3403409421443939, "logps/chosen": -170.69064331054688, "logps/rejected": -188.27978515625, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.005207443609833717, "rewards/margins": 0.0026561738923192024, "rewards/rejected": 0.002551269717514515, "step": 741 }, { "epoch": 1.85, "learning_rate": 1.554835398583787e-09, "logits/chosen": -0.18766961991786957, "logits/rejected": -0.19065891206264496, "logps/chosen": -136.557861328125, "logps/rejected": -181.27975463867188, "loss": 0.6873, "rewards/accuracies": 0.25, "rewards/chosen": 0.01949920691549778, "rewards/margins": -0.021843336522579193, "rewards/rejected": 0.041342541575431824, "step": 742 }, { "epoch": 1.85, "learning_rate": 1.5053350853398428e-09, "logits/chosen": -0.25174376368522644, "logits/rejected": -0.24048955738544464, "logps/chosen": -163.76272583007812, "logps/rejected": -152.920166015625, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.017729762941598892, "rewards/margins": 0.05235176533460617, "rewards/rejected": -0.03462200239300728, "step": 743 }, { "epoch": 1.85, "learning_rate": 1.4566234836522695e-09, "logits/chosen": -0.23492830991744995, "logits/rejected": -0.21976181864738464, "logps/chosen": -159.24203491210938, "logps/rejected": -168.45303344726562, "loss": 0.6857, "rewards/accuracies": 0.25, "rewards/chosen": -0.023498153313994408, "rewards/margins": -0.02048034779727459, "rewards/rejected": -0.0030178073793649673, "step": 744 }, { "epoch": 1.85, "learning_rate": 1.4087013857608632e-09, "logits/chosen": -0.35555049777030945, "logits/rejected": -0.34326332807540894, "logps/chosen": -138.73318481445312, "logps/rejected": -145.26914978027344, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.012540055438876152, "rewards/margins": -0.011474989354610443, "rewards/rejected": 0.024015046656131744, "step": 745 }, { "epoch": 1.86, "learning_rate": 1.3615695710650189e-09, "logits/chosen": -0.25830206274986267, "logits/rejected": -0.24732112884521484, "logps/chosen": -167.9259796142578, "logps/rejected": -178.7469940185547, "loss": 0.6911, "rewards/accuracies": 0.25, "rewards/chosen": -0.007859420962631702, "rewards/margins": -0.035393714904785156, "rewards/rejected": 0.02753429114818573, "step": 746 }, { "epoch": 1.86, "learning_rate": 1.3152288061110516e-09, "logits/chosen": -0.2145419716835022, "logits/rejected": -0.15789546072483063, "logps/chosen": -167.23045349121094, "logps/rejected": -174.3115234375, "loss": 0.6887, "rewards/accuracies": 0.5, "rewards/chosen": -0.026740266010165215, "rewards/margins": 0.004968644119799137, "rewards/rejected": -0.031708911061286926, "step": 747 }, { "epoch": 1.86, "learning_rate": 1.2696798445797351e-09, "logits/chosen": -0.25413602590560913, "logits/rejected": -0.23638883233070374, "logps/chosen": -137.76348876953125, "logps/rejected": -164.36044311523438, "loss": 0.6854, "rewards/accuracies": 0.0, "rewards/chosen": 0.006549263373017311, "rewards/margins": -0.023618698120117188, "rewards/rejected": 0.03016795963048935, "step": 748 }, { "epoch": 1.86, "learning_rate": 1.2249234272740605e-09, "logits/chosen": -0.1590869128704071, "logits/rejected": -0.13725630939006805, "logps/chosen": -129.75006103515625, "logps/rejected": -195.87799072265625, "loss": 0.6904, "rewards/accuracies": 0.5, "rewards/chosen": 0.0037042638286948204, "rewards/margins": 0.0090547576546669, "rewards/rejected": -0.0053504942916333675, "step": 749 }, { "epoch": 1.87, "learning_rate": 1.1809602821071574e-09, "logits/chosen": -0.3475223183631897, "logits/rejected": -0.3649798333644867, "logps/chosen": -158.76283264160156, "logps/rejected": -140.7595977783203, "loss": 0.6861, "rewards/accuracies": 0.25, "rewards/chosen": 0.0008304589428007603, "rewards/margins": 0.005503272172063589, "rewards/rejected": -0.004672813694924116, "step": 750 }, { "epoch": 1.87, "learning_rate": 1.1377911240904758e-09, "logits/chosen": -0.2935466766357422, "logits/rejected": -0.2817026674747467, "logps/chosen": -137.2165069580078, "logps/rejected": -136.94387817382812, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": -0.006660079583525658, "rewards/margins": 0.035413362085819244, "rewards/rejected": -0.04207343980669975, "step": 751 }, { "epoch": 1.87, "learning_rate": 1.0954166553221567e-09, "logits/chosen": -0.3378526270389557, "logits/rejected": -0.3240903615951538, "logps/chosen": -150.3462677001953, "logps/rejected": -149.58242797851562, "loss": 0.6885, "rewards/accuracies": 0.25, "rewards/chosen": 0.012328528799116611, "rewards/margins": -0.005926323123276234, "rewards/rejected": 0.018254851922392845, "step": 752 }, { "epoch": 1.87, "learning_rate": 1.0538375649755905e-09, "logits/chosen": -0.14620281755924225, "logits/rejected": -0.16615572571754456, "logps/chosen": -124.23204040527344, "logps/rejected": -173.46144104003906, "loss": 0.6857, "rewards/accuracies": 0.25, "rewards/chosen": -0.014691924676299095, "rewards/margins": -0.035561371594667435, "rewards/rejected": 0.02086944505572319, "step": 753 }, { "epoch": 1.88, "learning_rate": 1.0130545292882432e-09, "logits/chosen": -0.4177802801132202, "logits/rejected": -0.4322090446949005, "logps/chosen": -162.0972137451172, "logps/rejected": -153.7075958251953, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.011498451232910156, "rewards/margins": 0.05389728397130966, "rewards/rejected": -0.042398836463689804, "step": 754 }, { "epoch": 1.88, "learning_rate": 9.730682115506428e-10, "logits/chosen": -0.2942149043083191, "logits/rejected": -0.3021039664745331, "logps/chosen": -143.35597229003906, "logps/rejected": -162.73486328125, "loss": 0.6827, "rewards/accuracies": 0.5, "rewards/chosen": -0.009059716947376728, "rewards/margins": 0.00495529267936945, "rewards/rejected": -0.014015007764101028, "step": 755 }, { "epoch": 1.88, "learning_rate": 9.338792620955772e-10, "logits/chosen": -0.2566831409931183, "logits/rejected": -0.19420400261878967, "logps/chosen": -171.09283447265625, "logps/rejected": -171.6182861328125, "loss": 0.6828, "rewards/accuracies": 0.25, "rewards/chosen": -0.031034087762236595, "rewards/margins": -0.05211944505572319, "rewards/rejected": 0.021085355430841446, "step": 756 }, { "epoch": 1.88, "learning_rate": 8.954883182875295e-10, "logits/chosen": -0.42724859714508057, "logits/rejected": -0.4075240194797516, "logps/chosen": -156.96047973632812, "logps/rejected": -173.21621704101562, "loss": 0.68, "rewards/accuracies": 0.75, "rewards/chosen": -0.015172004699707031, "rewards/margins": 0.02910137176513672, "rewards/rejected": -0.04427337646484375, "step": 757 }, { "epoch": 1.89, "learning_rate": 8.578960045123263e-10, "logits/chosen": -0.20377184450626373, "logits/rejected": -0.18595322966575623, "logps/chosen": -145.51406860351562, "logps/rejected": -145.1554412841797, "loss": 0.6856, "rewards/accuracies": 0.75, "rewards/chosen": -0.01027221605181694, "rewards/margins": 0.04730777442455292, "rewards/rejected": -0.057579994201660156, "step": 758 }, { "epoch": 1.89, "learning_rate": 8.211029321669615e-10, "logits/chosen": -0.3163500428199768, "logits/rejected": -0.3210850954055786, "logps/chosen": -141.3299102783203, "logps/rejected": -163.1626434326172, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.014751053415238857, "rewards/margins": 0.029007337987422943, "rewards/rejected": -0.014256286434829235, "step": 759 }, { "epoch": 1.89, "learning_rate": 7.851096996496498e-10, "logits/chosen": -0.25466227531433105, "logits/rejected": -0.24531960487365723, "logps/chosen": -121.60061645507812, "logps/rejected": -163.83584594726562, "loss": 0.6751, "rewards/accuracies": 0.5, "rewards/chosen": 0.0013595586642622948, "rewards/margins": 0.0022933962754905224, "rewards/rejected": -0.0009338380768895149, "step": 760 }, { "epoch": 1.89, "learning_rate": 7.49916892350122e-10, "logits/chosen": -0.21173043549060822, "logits/rejected": -0.22953207790851593, "logps/chosen": -149.0849151611328, "logps/rejected": -189.95571899414062, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": -0.026080895215272903, "rewards/margins": 0.010769844986498356, "rewards/rejected": -0.036850739270448685, "step": 761 }, { "epoch": 1.9, "learning_rate": 7.155250826400783e-10, "logits/chosen": -0.18358701467514038, "logits/rejected": -0.1800529658794403, "logps/chosen": -163.079345703125, "logps/rejected": -139.250732421875, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": -0.00039978139102458954, "rewards/margins": 0.013623427599668503, "rewards/rejected": -0.014023208990693092, "step": 762 }, { "epoch": 1.9, "learning_rate": 6.819348298638839e-10, "logits/chosen": -0.5273974537849426, "logits/rejected": -0.5110995769500732, "logps/chosen": -172.63720703125, "logps/rejected": -150.0941162109375, "loss": 0.691, "rewards/accuracies": 0.75, "rewards/chosen": 0.04293670505285263, "rewards/margins": 0.04175853729248047, "rewards/rejected": 0.001178169623017311, "step": 763 }, { "epoch": 1.9, "learning_rate": 6.49146680329482e-10, "logits/chosen": -0.293468713760376, "logits/rejected": -0.24186769127845764, "logps/chosen": -146.17098999023438, "logps/rejected": -164.78631591796875, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.04458179697394371, "rewards/margins": 0.034311484545469284, "rewards/rejected": 0.010270308703184128, "step": 764 }, { "epoch": 1.9, "learning_rate": 6.171611672994958e-10, "logits/chosen": -0.2368391901254654, "logits/rejected": -0.2220778614282608, "logps/chosen": -188.18495178222656, "logps/rejected": -202.2126922607422, "loss": 0.6878, "rewards/accuracies": 0.25, "rewards/chosen": -0.013287736102938652, "rewards/margins": -0.019134903326630592, "rewards/rejected": 0.005847167689353228, "step": 765 }, { "epoch": 1.91, "learning_rate": 5.859788109825792e-10, "logits/chosen": -0.3403434455394745, "logits/rejected": -0.2655618488788605, "logps/chosen": -200.15078735351562, "logps/rejected": -151.66783142089844, "loss": 0.6739, "rewards/accuracies": 0.5, "rewards/chosen": 0.017093658447265625, "rewards/margins": 0.03432884439826012, "rewards/rejected": -0.017235184088349342, "step": 766 }, { "epoch": 1.91, "learning_rate": 5.556001185249071e-10, "logits/chosen": -0.23768411576747894, "logits/rejected": -0.21744497120380402, "logps/chosen": -153.5445556640625, "logps/rejected": -204.71487426757812, "loss": 0.679, "rewards/accuracies": 0.25, "rewards/chosen": -0.0055374144576489925, "rewards/margins": -0.015329742804169655, "rewards/rejected": 0.009792327880859375, "step": 767 }, { "epoch": 1.91, "learning_rate": 5.260255840020045e-10, "logits/chosen": -0.2909236550331116, "logits/rejected": -0.26094570755958557, "logps/chosen": -143.08189392089844, "logps/rejected": -152.19625854492188, "loss": 0.6836, "rewards/accuracies": 0.25, "rewards/chosen": 0.004693604074418545, "rewards/margins": -0.03225555270910263, "rewards/rejected": 0.03694915771484375, "step": 768 }, { "epoch": 1.91, "learning_rate": 4.972556884106194e-10, "logits/chosen": -0.3162881135940552, "logits/rejected": -0.3397354781627655, "logps/chosen": -134.59689331054688, "logps/rejected": -167.27679443359375, "loss": 0.69, "rewards/accuracies": 0.75, "rewards/chosen": 0.0024129869416356087, "rewards/margins": 0.016449546441435814, "rewards/rejected": -0.01403656043112278, "step": 769 }, { "epoch": 1.92, "learning_rate": 4.692908996609734e-10, "logits/chosen": -0.2708783447742462, "logits/rejected": -0.2560499310493469, "logps/chosen": -108.48041534423828, "logps/rejected": -133.04739379882812, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": -0.006335067562758923, "rewards/margins": -0.028063584119081497, "rewards/rejected": 0.02172851748764515, "step": 770 }, { "epoch": 1.92, "learning_rate": 4.421316725691293e-10, "logits/chosen": -0.24003545939922333, "logits/rejected": -0.2516106367111206, "logps/chosen": -123.02095031738281, "logps/rejected": -136.91143798828125, "loss": 0.6875, "rewards/accuracies": 0.0, "rewards/chosen": -0.0038221366703510284, "rewards/margins": -0.023751068860292435, "rewards/rejected": 0.019928932189941406, "step": 771 }, { "epoch": 1.92, "learning_rate": 4.157784488495686e-10, "logits/chosen": -0.2862072288990021, "logits/rejected": -0.2702292799949646, "logps/chosen": -132.4319305419922, "logps/rejected": -163.08668518066406, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": -0.0009868619963526726, "rewards/margins": 0.004122543148696423, "rewards/rejected": -0.005109405145049095, "step": 772 }, { "epoch": 1.92, "learning_rate": 3.9023165710803664e-10, "logits/chosen": -0.1790563017129898, "logits/rejected": -0.19999904930591583, "logps/chosen": -155.51950073242188, "logps/rejected": -173.55003356933594, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": -0.006876753643155098, "rewards/margins": 0.0456794798374176, "rewards/rejected": -0.05255622789263725, "step": 773 }, { "epoch": 1.93, "learning_rate": 3.654917128345758e-10, "logits/chosen": -0.3235059380531311, "logits/rejected": -0.29000499844551086, "logps/chosen": -127.49346923828125, "logps/rejected": -163.13653564453125, "loss": 0.6761, "rewards/accuracies": 0.75, "rewards/chosen": 0.03901195526123047, "rewards/margins": 0.04585132747888565, "rewards/rejected": -0.006839370355010033, "step": 774 }, { "epoch": 1.93, "learning_rate": 3.415590183967365e-10, "logits/chosen": -0.3870020806789398, "logits/rejected": -0.39868125319480896, "logps/chosen": -136.86431884765625, "logps/rejected": -177.61077880859375, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": -0.012813568115234375, "rewards/margins": 0.008787157014012337, "rewards/rejected": -0.021600723266601562, "step": 775 }, { "epoch": 1.93, "learning_rate": 3.1843396303306567e-10, "logits/chosen": -0.24960343539714813, "logits/rejected": -0.24354517459869385, "logps/chosen": -138.7925567626953, "logps/rejected": -209.8158416748047, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": 0.0032464980613440275, "rewards/margins": 0.0010789884254336357, "rewards/rejected": 0.00216751080006361, "step": 776 }, { "epoch": 1.93, "learning_rate": 2.961169228467508e-10, "logits/chosen": -0.35012802481651306, "logits/rejected": -0.34669432044029236, "logps/chosen": -129.70993041992188, "logps/rejected": -165.93661499023438, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.03542270511388779, "rewards/margins": 0.048667337745428085, "rewards/rejected": -0.01324462890625, "step": 777 }, { "epoch": 1.94, "learning_rate": 2.7460826079953034e-10, "logits/chosen": -0.3977464437484741, "logits/rejected": -0.3871355950832367, "logps/chosen": -138.27178955078125, "logps/rejected": -161.60955810546875, "loss": 0.6908, "rewards/accuracies": 0.5, "rewards/chosen": 0.010584831237792969, "rewards/margins": 0.048471640795469284, "rewards/rejected": -0.037886809557676315, "step": 778 }, { "epoch": 1.94, "learning_rate": 2.539083267057651e-10, "logits/chosen": -0.3721188008785248, "logits/rejected": -0.378692626953125, "logps/chosen": -125.14947509765625, "logps/rejected": -144.66836547851562, "loss": 0.6786, "rewards/accuracies": 0.75, "rewards/chosen": 0.0060272216796875, "rewards/margins": 0.043579865247011185, "rewards/rejected": -0.037552643567323685, "step": 779 }, { "epoch": 1.94, "learning_rate": 2.34017457226765e-10, "logits/chosen": -0.35849979519844055, "logits/rejected": -0.37831544876098633, "logps/chosen": -148.6514129638672, "logps/rejected": -156.99722290039062, "loss": 0.6781, "rewards/accuracies": 0.75, "rewards/chosen": 0.023597147315740585, "rewards/margins": 0.0352388396859169, "rewards/rejected": -0.011641692370176315, "step": 780 }, { "epoch": 1.94, "learning_rate": 2.1493597586529355e-10, "logits/chosen": -0.2796367406845093, "logits/rejected": -0.25844764709472656, "logps/chosen": -171.32749938964844, "logps/rejected": -180.91403198242188, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.04870453104376793, "rewards/margins": 0.08585777878761292, "rewards/rejected": -0.03715324401855469, "step": 781 }, { "epoch": 1.95, "learning_rate": 1.966641929603441e-10, "logits/chosen": -0.2838178277015686, "logits/rejected": -0.2388259917497635, "logps/chosen": -162.0492401123047, "logps/rejected": -191.85140991210938, "loss": 0.6903, "rewards/accuracies": 0.25, "rewards/chosen": -0.0005756376776844263, "rewards/margins": -0.016336441040039062, "rewards/rejected": 0.01576080359518528, "step": 782 }, { "epoch": 1.95, "learning_rate": 1.7920240568204404e-10, "logits/chosen": -0.24745391309261322, "logits/rejected": -0.21306456625461578, "logps/chosen": -171.17181396484375, "logps/rejected": -152.28805541992188, "loss": 0.6827, "rewards/accuracies": 0.5, "rewards/chosen": 0.03167877346277237, "rewards/margins": 0.022266389802098274, "rewards/rejected": 0.009412385523319244, "step": 783 }, { "epoch": 1.95, "learning_rate": 1.6255089802686418e-10, "logits/chosen": -0.20550376176834106, "logits/rejected": -0.19856008887290955, "logps/chosen": -152.4654083251953, "logps/rejected": -172.22982788085938, "loss": 0.6815, "rewards/accuracies": 0.75, "rewards/chosen": -0.009408948943018913, "rewards/margins": 0.015521243214607239, "rewards/rejected": -0.024930192157626152, "step": 784 }, { "epoch": 1.95, "learning_rate": 1.4670994081297794e-10, "logits/chosen": -0.4253084361553192, "logits/rejected": -0.4475700557231903, "logps/chosen": -182.95986938476562, "logps/rejected": -143.60377502441406, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.03573188558220863, "rewards/margins": 0.07633648067712784, "rewards/rejected": -0.040604591369628906, "step": 785 }, { "epoch": 1.96, "learning_rate": 1.3167979167585942e-10, "logits/chosen": -0.35527539253234863, "logits/rejected": -0.3505282700061798, "logps/chosen": -142.94442749023438, "logps/rejected": -196.30337524414062, "loss": 0.6803, "rewards/accuracies": 0.75, "rewards/chosen": 0.00816192664206028, "rewards/margins": 0.025087356567382812, "rewards/rejected": -0.016925431787967682, "step": 786 }, { "epoch": 1.96, "learning_rate": 1.1746069506409217e-10, "logits/chosen": -0.18952374160289764, "logits/rejected": -0.16053064167499542, "logps/chosen": -122.97006225585938, "logps/rejected": -159.20419311523438, "loss": 0.6933, "rewards/accuracies": 1.0, "rewards/chosen": 0.018012428656220436, "rewards/margins": 0.02543659135699272, "rewards/rejected": -0.007424164097756147, "step": 787 }, { "epoch": 1.96, "learning_rate": 1.0405288223540587e-10, "logits/chosen": -0.22045674920082092, "logits/rejected": -0.18312689661979675, "logps/chosen": -142.34710693359375, "logps/rejected": -139.95669555664062, "loss": 0.6802, "rewards/accuracies": 0.75, "rewards/chosen": 0.013618089258670807, "rewards/margins": 0.06599196791648865, "rewards/rejected": -0.05237388610839844, "step": 788 }, { "epoch": 1.96, "learning_rate": 9.145657125289586e-11, "logits/chosen": -0.28289762139320374, "logits/rejected": -0.3045879900455475, "logps/chosen": -147.8663330078125, "logps/rejected": -178.0054473876953, "loss": 0.6875, "rewards/accuracies": 0.0, "rewards/chosen": -0.019739722833037376, "rewards/margins": -0.018899347633123398, "rewards/rejected": -0.0008403779938817024, "step": 789 }, { "epoch": 1.97, "learning_rate": 7.96719669814927e-11, "logits/chosen": -0.3200885057449341, "logits/rejected": -0.32953307032585144, "logps/chosen": -156.22726440429688, "logps/rejected": -177.82418823242188, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": 0.0010250089690089226, "rewards/margins": 0.007703018374741077, "rewards/rejected": -0.0066780089400708675, "step": 790 }, { "epoch": 1.97, "learning_rate": 6.869926108462043e-11, "logits/chosen": -0.31671130657196045, "logits/rejected": -0.3041873276233673, "logps/chosen": -149.583251953125, "logps/rejected": -166.99130249023438, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.023619461804628372, "rewards/margins": 0.03872489929199219, "rewards/rejected": -0.015105439350008965, "step": 791 }, { "epoch": 1.97, "learning_rate": 5.853863202108234e-11, "logits/chosen": -0.30131471157073975, "logits/rejected": -0.2884294390678406, "logps/chosen": -140.4972686767578, "logps/rejected": -156.35865783691406, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": 0.019428633153438568, "rewards/margins": 0.016312027350068092, "rewards/rejected": 0.0031166067346930504, "step": 792 }, { "epoch": 1.97, "learning_rate": 4.919024504215774e-11, "logits/chosen": -0.25848492980003357, "logits/rejected": -0.2534676492214203, "logps/chosen": -135.21376037597656, "logps/rejected": -159.66883850097656, "loss": 0.6864, "rewards/accuracies": 0.25, "rewards/chosen": -0.01886615715920925, "rewards/margins": -0.01613445207476616, "rewards/rejected": -0.0027317055501043797, "step": 793 }, { "epoch": 1.98, "learning_rate": 4.0654252188926374e-11, "logits/chosen": -0.26942718029022217, "logits/rejected": -0.20902927219867706, "logps/chosen": -135.72386169433594, "logps/rejected": -140.98497009277344, "loss": 0.6817, "rewards/accuracies": 0.75, "rewards/chosen": 0.002636146731674671, "rewards/margins": 0.047391124069690704, "rewards/rejected": -0.044754981994628906, "step": 794 }, { "epoch": 1.98, "learning_rate": 3.293079228977036e-11, "logits/chosen": -0.4397571384906769, "logits/rejected": -0.40951550006866455, "logps/chosen": -149.1449737548828, "logps/rejected": -178.73641967773438, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": 0.021401595324277878, "rewards/margins": -0.009366225451231003, "rewards/rejected": 0.03076782263815403, "step": 795 }, { "epoch": 1.98, "learning_rate": 2.6019990958148218e-11, "logits/chosen": -0.282534122467041, "logits/rejected": -0.29282429814338684, "logps/chosen": -164.73472595214844, "logps/rejected": -172.10565185546875, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.014454269781708717, "rewards/margins": 0.0636318176984787, "rewards/rejected": -0.04917754977941513, "step": 796 }, { "epoch": 1.98, "learning_rate": 1.992196059051321e-11, "logits/chosen": -0.2998826205730438, "logits/rejected": -0.29153597354888916, "logps/chosen": -151.49337768554688, "logps/rejected": -193.42938232421875, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": -0.017692947760224342, "rewards/margins": -0.005760574247688055, "rewards/rejected": -0.011932373046875, "step": 797 }, { "epoch": 1.99, "learning_rate": 1.4636800364520306e-11, "logits/chosen": -0.28615647554397583, "logits/rejected": -0.28325188159942627, "logps/chosen": -164.1488494873047, "logps/rejected": -163.37033081054688, "loss": 0.6845, "rewards/accuracies": 0.75, "rewards/chosen": 0.012083053588867188, "rewards/margins": 0.00469055213034153, "rewards/rejected": 0.007392501924186945, "step": 798 }, { "epoch": 1.99, "learning_rate": 1.0164596237399736e-11, "logits/chosen": -0.37696167826652527, "logits/rejected": -0.3737022876739502, "logps/chosen": -161.381103515625, "logps/rejected": -167.7950439453125, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.026195526123046875, "rewards/margins": 0.05651893466711044, "rewards/rejected": -0.030323408544063568, "step": 799 }, { "epoch": 1.99, "learning_rate": 6.505420944552531e-12, "logits/chosen": -0.1680089831352234, "logits/rejected": -0.16415239870548248, "logps/chosen": -125.23417663574219, "logps/rejected": -149.60081481933594, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": -0.009267425164580345, "rewards/margins": -0.008036231622099876, "rewards/rejected": -0.0012311943573877215, "step": 800 }, { "epoch": 1.99, "learning_rate": 3.659333998384806e-12, "logits/chosen": -0.15393847227096558, "logits/rejected": -0.16026459634304047, "logps/chosen": -109.39883422851562, "logps/rejected": -163.16354370117188, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": -0.00136566162109375, "rewards/margins": -0.005845069885253906, "rewards/rejected": 0.004479408264160156, "step": 801 }, { "epoch": 2.0, "learning_rate": 1.6263816873141044e-12, "logits/chosen": -0.306001752614975, "logits/rejected": -0.2957990765571594, "logps/chosen": -167.87338256835938, "logps/rejected": -178.76011657714844, "loss": 0.6885, "rewards/accuracies": 0.25, "rewards/chosen": -0.004488754086196423, "rewards/margins": -0.0015731817111372948, "rewards/rejected": -0.002915572840720415, "step": 802 }, { "epoch": 2.0, "learning_rate": 4.065970750422032e-13, "logits/chosen": -0.2924094498157501, "logits/rejected": -0.29542016983032227, "logps/chosen": -141.75889587402344, "logps/rejected": -153.72006225585938, "loss": 0.6832, "rewards/accuracies": 0.75, "rewards/chosen": 0.0247560515999794, "rewards/margins": 0.04653148725628853, "rewards/rejected": -0.021775439381599426, "step": 803 }, { "epoch": 2.0, "learning_rate": 0.0, "logits/chosen": -0.36169809103012085, "logits/rejected": -0.3390687108039856, "logps/chosen": -144.78567504882812, "logps/rejected": -182.76097106933594, "loss": 0.6876, "rewards/accuracies": 0.5, "rewards/chosen": 0.011655616573989391, "rewards/margins": 0.01846637949347496, "rewards/rejected": -0.006810761988162994, "step": 804 }, { "epoch": 2.0, "step": 804, "total_flos": 0.0, "train_loss": 0.6874097331394604, "train_runtime": 3270.081, "train_samples_per_second": 7.863, "train_steps_per_second": 0.246 } ], "max_steps": 804, "num_train_epochs": 2, "total_flos": 0.0, "trial_name": null, "trial_params": null }