{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0011680293675955, "eval_steps": 600, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005005840146837978, "grad_norm": 8.724861145019531, "learning_rate": 1.9994994159853163e-05, "logits/chosen": -0.5767760872840881, "logits/rejected": -0.5705638527870178, "logps/chosen": -70.2891616821289, "logps/rejected": -121.37105560302734, "loss": 0.6964, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.0051635741256177425, "rewards/margins": -0.014656447805464268, "rewards/rejected": 0.019820023328065872, "step": 3 }, { "epoch": 0.0010011680293675956, "grad_norm": 7.4223737716674805, "learning_rate": 1.9989988319706325e-05, "logits/chosen": -0.26915356516838074, "logits/rejected": -0.2142164260149002, "logps/chosen": -111.6724853515625, "logps/rejected": -51.86235046386719, "loss": 0.6929, "rewards/accuracies": 0.0, "rewards/chosen": -0.06150208041071892, "rewards/margins": -0.0522284209728241, "rewards/rejected": -0.009273656643927097, "step": 6 }, { "epoch": 0.0015017520440513933, "grad_norm": 8.638936996459961, "learning_rate": 1.9984982479559487e-05, "logits/chosen": -0.3732071816921234, "logits/rejected": -0.3898884057998657, "logps/chosen": -92.2378158569336, "logps/rejected": -120.37467193603516, "loss": 0.6855, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.0076777138747274876, "rewards/margins": 0.023975372314453125, "rewards/rejected": -0.016297658905386925, "step": 9 }, { "epoch": 0.002002336058735191, "grad_norm": 8.98553466796875, "learning_rate": 1.997997663941265e-05, "logits/chosen": -0.5769428610801697, "logits/rejected": -0.5579409003257751, "logps/chosen": -92.29843139648438, "logps/rejected": -60.21346664428711, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.005732473451644182, "rewards/margins": 0.02123413234949112, "rewards/rejected": -0.01550165843218565, "step": 12 }, { "epoch": 0.002502920073418989, "grad_norm": 7.792603015899658, "learning_rate": 1.997497079926581e-05, "logits/chosen": -0.7743476033210754, "logits/rejected": -0.7626956105232239, "logps/chosen": -63.29452133178711, "logps/rejected": -97.2860107421875, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.05664558708667755, "rewards/margins": 0.06731045991182327, "rewards/rejected": -0.01066487655043602, "step": 15 }, { "epoch": 0.0030035040881027865, "grad_norm": 6.276625633239746, "learning_rate": 1.9969964959118976e-05, "logits/chosen": -0.4556831121444702, "logits/rejected": -0.4802916347980499, "logps/chosen": -50.83781051635742, "logps/rejected": -75.61394500732422, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.027076976373791695, "rewards/margins": 0.08313179016113281, "rewards/rejected": -0.056054819375276566, "step": 18 }, { "epoch": 0.003504088102786584, "grad_norm": 9.929220199584961, "learning_rate": 1.9964959118972138e-05, "logits/chosen": -0.48960527777671814, "logits/rejected": -0.533420979976654, "logps/chosen": -76.5549087524414, "logps/rejected": -101.87915802001953, "loss": 0.69, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.0003365833836141974, "rewards/margins": 0.0035246536135673523, "rewards/rejected": -0.0038612366188317537, "step": 21 }, { "epoch": 0.004004672117470382, "grad_norm": 7.7070817947387695, "learning_rate": 1.9959953278825296e-05, "logits/chosen": -0.4721977710723877, "logits/rejected": -0.4709413945674896, "logps/chosen": -126.61627197265625, "logps/rejected": -127.38492584228516, "loss": 0.6891, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.03024037927389145, "rewards/margins": -0.04719721898436546, "rewards/rejected": 0.016956837847828865, "step": 24 }, { "epoch": 0.00450525613215418, "grad_norm": 9.879571914672852, "learning_rate": 1.995494743867846e-05, "logits/chosen": -0.4895535707473755, "logits/rejected": -0.4355856478214264, "logps/chosen": -120.6561050415039, "logps/rejected": -82.18750762939453, "loss": 0.6925, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.02864024043083191, "rewards/margins": 0.01831868477165699, "rewards/rejected": 0.01032155379652977, "step": 27 }, { "epoch": 0.005005840146837978, "grad_norm": 7.11757755279541, "learning_rate": 1.9949941598531623e-05, "logits/chosen": -0.4184379577636719, "logits/rejected": -0.43294236063957214, "logps/chosen": -51.38810729980469, "logps/rejected": -94.67711639404297, "loss": 0.7062, "rewards/accuracies": 0.0, "rewards/chosen": -0.034350842237472534, "rewards/margins": -0.0531565397977829, "rewards/rejected": 0.018805695697665215, "step": 30 }, { "epoch": 0.005506424161521775, "grad_norm": 8.231386184692383, "learning_rate": 1.9944935758384785e-05, "logits/chosen": -0.480350524187088, "logits/rejected": -0.39751124382019043, "logps/chosen": -63.107757568359375, "logps/rejected": -33.98720169067383, "loss": 0.6855, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.007241886109113693, "rewards/margins": 0.011381973512470722, "rewards/rejected": -0.01862386055290699, "step": 33 }, { "epoch": 0.006007008176205573, "grad_norm": 6.747024059295654, "learning_rate": 1.9939929918237947e-05, "logits/chosen": -0.6438906192779541, "logits/rejected": -0.6405500769615173, "logps/chosen": -46.299774169921875, "logps/rejected": -53.80692672729492, "loss": 0.6916, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.02719758450984955, "rewards/margins": 0.0702395811676979, "rewards/rejected": -0.04304199293255806, "step": 36 }, { "epoch": 0.006507592190889371, "grad_norm": 10.80404281616211, "learning_rate": 1.993492407809111e-05, "logits/chosen": -0.3269790709018707, "logits/rejected": -0.3455749750137329, "logps/chosen": -66.87875366210938, "logps/rejected": -96.90936279296875, "loss": 0.6916, "rewards/accuracies": 0.0, "rewards/chosen": 0.004446219187229872, "rewards/margins": -0.031132444739341736, "rewards/rejected": 0.035578664392232895, "step": 39 }, { "epoch": 0.007008176205573168, "grad_norm": 8.992171287536621, "learning_rate": 1.992991823794427e-05, "logits/chosen": -0.472567081451416, "logits/rejected": -0.4288417398929596, "logps/chosen": -74.72491455078125, "logps/rejected": -41.31052017211914, "loss": 0.6933, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.023735174909234047, "rewards/margins": -0.01335671916604042, "rewards/rejected": -0.010378456674516201, "step": 42 }, { "epoch": 0.007508760220256966, "grad_norm": 8.610702514648438, "learning_rate": 1.9924912397797432e-05, "logits/chosen": -0.45873355865478516, "logits/rejected": -0.49661362171173096, "logps/chosen": -74.43848419189453, "logps/rejected": -122.80913543701172, "loss": 0.6867, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.02687632106244564, "rewards/margins": 0.03258616477251053, "rewards/rejected": -0.05946248769760132, "step": 45 }, { "epoch": 0.008009344234940765, "grad_norm": 7.9503068923950195, "learning_rate": 1.9919906557650594e-05, "logits/chosen": -0.6605178713798523, "logits/rejected": -0.6988124251365662, "logps/chosen": -80.64437103271484, "logps/rejected": -114.75115203857422, "loss": 0.6938, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.04302978515625, "rewards/margins": 0.05895588919520378, "rewards/rejected": -0.01592610590159893, "step": 48 }, { "epoch": 0.008509928249624562, "grad_norm": 9.002120971679688, "learning_rate": 1.9914900717503756e-05, "logits/chosen": -0.41920971870422363, "logits/rejected": -0.4320966899394989, "logps/chosen": -77.68414306640625, "logps/rejected": -96.83153533935547, "loss": 0.6816, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.005727577488869429, "rewards/margins": 0.07025674730539322, "rewards/rejected": -0.0645291656255722, "step": 51 }, { "epoch": 0.00901051226430836, "grad_norm": 6.997349739074707, "learning_rate": 1.9909894877356917e-05, "logits/chosen": -0.5261194109916687, "logits/rejected": -0.5245893597602844, "logps/chosen": -95.17967987060547, "logps/rejected": -84.6362075805664, "loss": 0.6839, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.01773478277027607, "rewards/margins": 0.0018381749978289008, "rewards/rejected": -0.01957295835018158, "step": 54 }, { "epoch": 0.009511096278992158, "grad_norm": 10.870786666870117, "learning_rate": 1.990488903721008e-05, "logits/chosen": -0.39070579409599304, "logits/rejected": -0.45036789774894714, "logps/chosen": -54.15953063964844, "logps/rejected": -132.71754455566406, "loss": 0.6831, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.020861051976680756, "rewards/margins": 0.03821105882525444, "rewards/rejected": -0.059072110801935196, "step": 57 }, { "epoch": 0.010011680293675955, "grad_norm": 8.85783863067627, "learning_rate": 1.989988319706324e-05, "logits/chosen": -0.5614233016967773, "logits/rejected": -0.5514200329780579, "logps/chosen": -88.83231353759766, "logps/rejected": -81.86658477783203, "loss": 0.6878, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.00991032738238573, "rewards/margins": -0.026215871796011925, "rewards/rejected": 0.01630554161965847, "step": 60 }, { "epoch": 0.010512264308359753, "grad_norm": 7.02630615234375, "learning_rate": 1.9894877356916406e-05, "logits/chosen": -0.3962419033050537, "logits/rejected": -0.40417155623435974, "logps/chosen": -142.57601928710938, "logps/rejected": -116.18509674072266, "loss": 0.6819, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.05075684189796448, "rewards/margins": 0.013475286774337292, "rewards/rejected": -0.06423213332891464, "step": 63 }, { "epoch": 0.01101284832304355, "grad_norm": 6.898167610168457, "learning_rate": 1.9889871516769565e-05, "logits/chosen": -0.6134812235832214, "logits/rejected": -0.5775959491729736, "logps/chosen": -79.01615905761719, "logps/rejected": -71.99337768554688, "loss": 0.694, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.02068938873708248, "rewards/margins": 0.026929473504424095, "rewards/rejected": -0.047618865966796875, "step": 66 }, { "epoch": 0.011513432337727348, "grad_norm": 7.475701332092285, "learning_rate": 1.988486567662273e-05, "logits/chosen": -0.5061147809028625, "logits/rejected": -0.48131632804870605, "logps/chosen": -72.74146270751953, "logps/rejected": -40.173126220703125, "loss": 0.6821, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.10210024565458298, "rewards/margins": -0.07885029911994934, "rewards/rejected": -0.023249944671988487, "step": 69 }, { "epoch": 0.012014016352411146, "grad_norm": 10.804630279541016, "learning_rate": 1.987985983647589e-05, "logits/chosen": -0.4360532760620117, "logits/rejected": -0.39780744910240173, "logps/chosen": -104.65933227539062, "logps/rejected": -91.29132843017578, "loss": 0.7079, "rewards/accuracies": 0.0, "rewards/chosen": -0.06720250844955444, "rewards/margins": -0.036109670996665955, "rewards/rejected": -0.03109283559024334, "step": 72 }, { "epoch": 0.012514600367094944, "grad_norm": 11.203512191772461, "learning_rate": 1.9874853996329053e-05, "logits/chosen": -0.602271318435669, "logits/rejected": -0.6287251114845276, "logps/chosen": -89.50079345703125, "logps/rejected": -99.49365234375, "loss": 0.7273, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.11708247661590576, "rewards/margins": -0.08146286010742188, "rewards/rejected": -0.03561961278319359, "step": 75 }, { "epoch": 0.013015184381778741, "grad_norm": 9.543815612792969, "learning_rate": 1.9869848156182215e-05, "logits/chosen": -0.7026975750923157, "logits/rejected": -0.7387280464172363, "logps/chosen": -31.050430297851562, "logps/rejected": -87.76520538330078, "loss": 0.7287, "rewards/accuracies": 0.0, "rewards/chosen": -0.057484015822410583, "rewards/margins": -0.06940240412950516, "rewards/rejected": 0.01191838551312685, "step": 78 }, { "epoch": 0.013515768396462539, "grad_norm": 5.260530471801758, "learning_rate": 1.9864842316035377e-05, "logits/chosen": -0.38555240631103516, "logits/rejected": -0.44096335768699646, "logps/chosen": -51.00654220581055, "logps/rejected": -76.27032470703125, "loss": 0.6798, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.020123036578297615, "rewards/margins": 0.00040092444396577775, "rewards/rejected": -0.02052396349608898, "step": 81 }, { "epoch": 0.014016352411146337, "grad_norm": 6.855349063873291, "learning_rate": 1.985983647588854e-05, "logits/chosen": -0.3796462118625641, "logits/rejected": -0.3785543739795685, "logps/chosen": -65.74349212646484, "logps/rejected": -80.3519058227539, "loss": 0.6807, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.11893704533576965, "rewards/margins": -0.0020104709547013044, "rewards/rejected": -0.11692658066749573, "step": 84 }, { "epoch": 0.014516936425830134, "grad_norm": 8.063116073608398, "learning_rate": 1.98548306357417e-05, "logits/chosen": -0.5329327583312988, "logits/rejected": -0.5260716080665588, "logps/chosen": -104.98519134521484, "logps/rejected": -64.15169525146484, "loss": 0.6778, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.015752919018268585, "rewards/margins": -0.043999094516038895, "rewards/rejected": 0.028246179223060608, "step": 87 }, { "epoch": 0.015017520440513932, "grad_norm": 7.644782066345215, "learning_rate": 1.9849824795594862e-05, "logits/chosen": -0.4613448679447174, "logits/rejected": -0.43695321679115295, "logps/chosen": -125.09862518310547, "logps/rejected": -87.648193359375, "loss": 0.6752, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.09191843867301941, "rewards/margins": -0.026395924389362335, "rewards/rejected": -0.06552251428365707, "step": 90 }, { "epoch": 0.015518104455197732, "grad_norm": 7.416725158691406, "learning_rate": 1.9844818955448024e-05, "logits/chosen": -0.7457137107849121, "logits/rejected": -0.7860862612724304, "logps/chosen": -58.81886672973633, "logps/rejected": -89.91065216064453, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": -0.05878207087516785, "rewards/margins": 0.12405791133642197, "rewards/rejected": -0.18283997476100922, "step": 93 }, { "epoch": 0.01601868846988153, "grad_norm": 7.371373176574707, "learning_rate": 1.9839813115301186e-05, "logits/chosen": -0.45406949520111084, "logits/rejected": -0.44163426756858826, "logps/chosen": -104.90299224853516, "logps/rejected": -87.6500473022461, "loss": 0.7039, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.09899139404296875, "rewards/margins": -0.011925317347049713, "rewards/rejected": -0.08706607669591904, "step": 96 }, { "epoch": 0.016519272484565327, "grad_norm": 10.472285270690918, "learning_rate": 1.9834807275154348e-05, "logits/chosen": -0.4250528812408447, "logits/rejected": -0.4418218433856964, "logps/chosen": -84.14141082763672, "logps/rejected": -88.28448486328125, "loss": 0.6765, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.14739710092544556, "rewards/margins": -0.08755848556756973, "rewards/rejected": -0.059838611632585526, "step": 99 }, { "epoch": 0.017019856499249125, "grad_norm": 7.325054168701172, "learning_rate": 1.982980143500751e-05, "logits/chosen": -0.535942792892456, "logits/rejected": -0.5776256918907166, "logps/chosen": -86.45342254638672, "logps/rejected": -118.93306732177734, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": -0.0772879347205162, "rewards/margins": 0.06516469269990921, "rewards/rejected": -0.14245261251926422, "step": 102 }, { "epoch": 0.017520440513932922, "grad_norm": 7.388084411621094, "learning_rate": 1.9824795594860675e-05, "logits/chosen": -0.4087042510509491, "logits/rejected": -0.5295590162277222, "logps/chosen": -84.67880249023438, "logps/rejected": -140.09739685058594, "loss": 0.6679, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1033017560839653, "rewards/margins": 0.09385273605585098, "rewards/rejected": -0.1971544772386551, "step": 105 }, { "epoch": 0.01802102452861672, "grad_norm": 9.48304557800293, "learning_rate": 1.9819789754713833e-05, "logits/chosen": -0.4450256824493408, "logits/rejected": -0.431253582239151, "logps/chosen": -77.58075714111328, "logps/rejected": -45.82686233520508, "loss": 0.6738, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.06906154006719589, "rewards/margins": 0.05901208892464638, "rewards/rejected": -0.12807361781597137, "step": 108 }, { "epoch": 0.018521608543300518, "grad_norm": 8.529993057250977, "learning_rate": 1.9814783914566998e-05, "logits/chosen": -0.6409376263618469, "logits/rejected": -0.6335044503211975, "logps/chosen": -82.5493392944336, "logps/rejected": -86.7012710571289, "loss": 0.6143, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.08215434104204178, "rewards/margins": 0.13915328681468964, "rewards/rejected": -0.221307635307312, "step": 111 }, { "epoch": 0.019022192557984315, "grad_norm": 7.910213947296143, "learning_rate": 1.980977807442016e-05, "logits/chosen": -0.46422824263572693, "logits/rejected": -0.47923633456230164, "logps/chosen": -77.99495697021484, "logps/rejected": -93.66088104248047, "loss": 0.6435, "rewards/accuracies": 1.0, "rewards/chosen": -0.04695230722427368, "rewards/margins": 0.19553326070308685, "rewards/rejected": -0.24248556792736053, "step": 114 }, { "epoch": 0.019522776572668113, "grad_norm": 8.047046661376953, "learning_rate": 1.980477223427332e-05, "logits/chosen": -0.45389652252197266, "logits/rejected": -0.537562906742096, "logps/chosen": -43.10470199584961, "logps/rejected": -84.65811157226562, "loss": 0.6642, "rewards/accuracies": 1.0, "rewards/chosen": -0.03112335316836834, "rewards/margins": 0.19560331106185913, "rewards/rejected": -0.22672666609287262, "step": 117 }, { "epoch": 0.02002336058735191, "grad_norm": 7.754847049713135, "learning_rate": 1.9799766394126484e-05, "logits/chosen": -0.4942411184310913, "logits/rejected": -0.5529773831367493, "logps/chosen": -63.01723098754883, "logps/rejected": -111.05652618408203, "loss": 0.6373, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.060074806213378906, "rewards/margins": 0.11091542989015579, "rewards/rejected": -0.1709902435541153, "step": 120 }, { "epoch": 0.02052394460203571, "grad_norm": 10.253473281860352, "learning_rate": 1.9794760553979645e-05, "logits/chosen": -0.5080655813217163, "logits/rejected": -0.5250653624534607, "logps/chosen": -43.33737564086914, "logps/rejected": -64.2974624633789, "loss": 0.7192, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.08639857172966003, "rewards/margins": 0.030324937775731087, "rewards/rejected": -0.11672350764274597, "step": 123 }, { "epoch": 0.021024528616719506, "grad_norm": 7.887247562408447, "learning_rate": 1.9789754713832807e-05, "logits/chosen": -0.694227397441864, "logits/rejected": -0.7071532607078552, "logps/chosen": -57.68690490722656, "logps/rejected": -83.45989227294922, "loss": 0.689, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.032653748989105225, "rewards/margins": -0.07088795304298401, "rewards/rejected": 0.038234200328588486, "step": 126 }, { "epoch": 0.021525112631403304, "grad_norm": 9.716451644897461, "learning_rate": 1.978474887368597e-05, "logits/chosen": -0.6147742867469788, "logits/rejected": -0.6261494755744934, "logps/chosen": -68.3243179321289, "logps/rejected": -123.30370330810547, "loss": 0.707, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.07449290156364441, "rewards/margins": 0.01724993623793125, "rewards/rejected": -0.09174283593893051, "step": 129 }, { "epoch": 0.0220256966460871, "grad_norm": 7.585305690765381, "learning_rate": 1.977974303353913e-05, "logits/chosen": -0.5042986273765564, "logits/rejected": -0.5150167346000671, "logps/chosen": -74.94017028808594, "logps/rejected": -98.8851547241211, "loss": 0.6621, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.0020276394207030535, "rewards/margins": 0.11446356028318405, "rewards/rejected": -0.11243591457605362, "step": 132 }, { "epoch": 0.0225262806607709, "grad_norm": 10.208983421325684, "learning_rate": 1.9774737193392293e-05, "logits/chosen": -0.5516179203987122, "logits/rejected": -0.5476913452148438, "logps/chosen": -62.8372802734375, "logps/rejected": -70.0451431274414, "loss": 0.7299, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.09002762287855148, "rewards/margins": 0.07062741369009018, "rewards/rejected": 0.01940021477639675, "step": 135 }, { "epoch": 0.023026864675454697, "grad_norm": 8.93551254272461, "learning_rate": 1.9769731353245454e-05, "logits/chosen": -0.7511720657348633, "logits/rejected": -0.7758583426475525, "logps/chosen": -80.78060913085938, "logps/rejected": -99.394775390625, "loss": 0.6159, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.15971960127353668, "rewards/margins": 0.33390888571739197, "rewards/rejected": -0.49362850189208984, "step": 138 }, { "epoch": 0.023527448690138494, "grad_norm": 8.47469425201416, "learning_rate": 1.9764725513098616e-05, "logits/chosen": -0.31772956252098083, "logits/rejected": -0.3521935045719147, "logps/chosen": -60.21269607543945, "logps/rejected": -97.80374145507812, "loss": 0.6752, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10029392689466476, "rewards/margins": 0.0813741609454155, "rewards/rejected": -0.18166808784008026, "step": 141 }, { "epoch": 0.024028032704822292, "grad_norm": 12.632401466369629, "learning_rate": 1.9759719672951778e-05, "logits/chosen": -0.5573607087135315, "logits/rejected": -0.5147889256477356, "logps/chosen": -95.14031982421875, "logps/rejected": -62.49309158325195, "loss": 0.6567, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.35810112953186035, "rewards/margins": -0.0995432436466217, "rewards/rejected": -0.25855791568756104, "step": 144 }, { "epoch": 0.02452861671950609, "grad_norm": 7.946066379547119, "learning_rate": 1.9754713832804943e-05, "logits/chosen": -0.7316634058952332, "logits/rejected": -0.7115657925605774, "logps/chosen": -55.87677001953125, "logps/rejected": -53.2441520690918, "loss": 0.6908, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1166304275393486, "rewards/margins": 0.09244028478860855, "rewards/rejected": -0.20907072722911835, "step": 147 }, { "epoch": 0.025029200734189887, "grad_norm": 10.525147438049316, "learning_rate": 1.97497079926581e-05, "logits/chosen": -0.5675979256629944, "logits/rejected": -0.5620729327201843, "logps/chosen": -83.0022964477539, "logps/rejected": -71.6855697631836, "loss": 0.6353, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1067022979259491, "rewards/margins": 0.14874102175235748, "rewards/rejected": -0.2554433047771454, "step": 150 }, { "epoch": 0.025529784748873685, "grad_norm": 9.344196319580078, "learning_rate": 1.9744702152511263e-05, "logits/chosen": -0.4274073541164398, "logits/rejected": -0.44886159896850586, "logps/chosen": -39.803829193115234, "logps/rejected": -65.0020980834961, "loss": 0.6432, "rewards/accuracies": 1.0, "rewards/chosen": 0.05814120173454285, "rewards/margins": 0.22218959033489227, "rewards/rejected": -0.16404838860034943, "step": 153 }, { "epoch": 0.026030368763557483, "grad_norm": 14.536722183227539, "learning_rate": 1.973969631236443e-05, "logits/chosen": -0.41227230429649353, "logits/rejected": -0.4205130636692047, "logps/chosen": -70.5719985961914, "logps/rejected": -73.2717056274414, "loss": 0.7393, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.32287701964378357, "rewards/margins": -0.09020677953958511, "rewards/rejected": -0.23267023265361786, "step": 156 }, { "epoch": 0.02653095277824128, "grad_norm": 9.305561065673828, "learning_rate": 1.9734690472217587e-05, "logits/chosen": -0.4080498516559601, "logits/rejected": -0.41574668884277344, "logps/chosen": -64.10465240478516, "logps/rejected": -52.34767150878906, "loss": 0.6537, "rewards/accuracies": 0.0, "rewards/chosen": -0.22136028110980988, "rewards/margins": -0.1772560328245163, "rewards/rejected": -0.044104259461164474, "step": 159 }, { "epoch": 0.027031536792925078, "grad_norm": 11.00994873046875, "learning_rate": 1.9729684632070752e-05, "logits/chosen": -0.3760051429271698, "logits/rejected": -0.2736321687698364, "logps/chosen": -67.76935577392578, "logps/rejected": -50.0460090637207, "loss": 0.6849, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.2015673965215683, "rewards/margins": 0.008209675550460815, "rewards/rejected": -0.2097770720720291, "step": 162 }, { "epoch": 0.027532120807608876, "grad_norm": 13.943159103393555, "learning_rate": 1.9724678791923914e-05, "logits/chosen": -0.525124728679657, "logits/rejected": -0.5371784567832947, "logps/chosen": -92.21277618408203, "logps/rejected": -122.07571411132812, "loss": 0.7238, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3199380338191986, "rewards/margins": -0.049041107296943665, "rewards/rejected": -0.27089691162109375, "step": 165 }, { "epoch": 0.028032704822292673, "grad_norm": 7.4482269287109375, "learning_rate": 1.9719672951777076e-05, "logits/chosen": -0.46594977378845215, "logits/rejected": -0.5370703339576721, "logps/chosen": -63.662017822265625, "logps/rejected": -124.71795654296875, "loss": 0.6038, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.13074874877929688, "rewards/margins": 0.061211396008729935, "rewards/rejected": -0.1919601559638977, "step": 168 }, { "epoch": 0.02853328883697647, "grad_norm": 7.979672908782959, "learning_rate": 1.9714667111630238e-05, "logits/chosen": -0.5897461771965027, "logits/rejected": -0.6508184671401978, "logps/chosen": -76.46642303466797, "logps/rejected": -150.1466827392578, "loss": 0.6491, "rewards/accuracies": 1.0, "rewards/chosen": -0.09404309839010239, "rewards/margins": 0.19667373597621918, "rewards/rejected": -0.2907167971134186, "step": 171 }, { "epoch": 0.02903387285166027, "grad_norm": 8.693970680236816, "learning_rate": 1.97096612714834e-05, "logits/chosen": -0.21947409212589264, "logits/rejected": -0.22531943023204803, "logps/chosen": -76.132568359375, "logps/rejected": -93.81356811523438, "loss": 0.6445, "rewards/accuracies": 1.0, "rewards/chosen": -0.04450441896915436, "rewards/margins": 0.3367197513580322, "rewards/rejected": -0.3812241852283478, "step": 174 }, { "epoch": 0.029534456866344067, "grad_norm": 10.337450981140137, "learning_rate": 1.970465543133656e-05, "logits/chosen": -0.3332197368144989, "logits/rejected": -0.34104469418525696, "logps/chosen": -65.81095123291016, "logps/rejected": -106.2038803100586, "loss": 0.6175, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.1827189177274704, "rewards/margins": -0.017560705542564392, "rewards/rejected": -0.165158212184906, "step": 177 }, { "epoch": 0.030035040881027864, "grad_norm": 11.308184623718262, "learning_rate": 1.9699649591189723e-05, "logits/chosen": -0.35230496525764465, "logits/rejected": -0.3329446017742157, "logps/chosen": -72.62149810791016, "logps/rejected": -55.15718078613281, "loss": 0.6969, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17741191387176514, "rewards/margins": -0.05369803309440613, "rewards/rejected": -0.12371388077735901, "step": 180 }, { "epoch": 0.030535624895711662, "grad_norm": 12.588647842407227, "learning_rate": 1.9694643751042885e-05, "logits/chosen": -0.38428547978401184, "logits/rejected": -0.3544352054595947, "logps/chosen": -117.17345428466797, "logps/rejected": -72.5843505859375, "loss": 0.6407, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.47139230370521545, "rewards/margins": -0.1594575196504593, "rewards/rejected": -0.31193479895591736, "step": 183 }, { "epoch": 0.031036208910395463, "grad_norm": 9.59596061706543, "learning_rate": 1.9689637910896047e-05, "logits/chosen": -0.3964896500110626, "logits/rejected": -0.4113638401031494, "logps/chosen": -66.39368438720703, "logps/rejected": -107.64663696289062, "loss": 0.6637, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1304527372121811, "rewards/margins": 0.026724496856331825, "rewards/rejected": -0.15717722475528717, "step": 186 }, { "epoch": 0.03153679292507926, "grad_norm": 12.908454895019531, "learning_rate": 1.9684632070749212e-05, "logits/chosen": -0.45171085000038147, "logits/rejected": -0.43888726830482483, "logps/chosen": -79.93399810791016, "logps/rejected": -105.24813842773438, "loss": 0.6261, "rewards/accuracies": 1.0, "rewards/chosen": -0.096002958714962, "rewards/margins": 0.40545347332954407, "rewards/rejected": -0.5014564394950867, "step": 189 }, { "epoch": 0.03203737693976306, "grad_norm": 13.189518928527832, "learning_rate": 1.967962623060237e-05, "logits/chosen": -0.4180600345134735, "logits/rejected": -0.4506050646305084, "logps/chosen": -89.11404418945312, "logps/rejected": -85.80895233154297, "loss": 0.7296, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2641119956970215, "rewards/margins": 0.11478271335363388, "rewards/rejected": -0.3788946866989136, "step": 192 }, { "epoch": 0.03253796095444685, "grad_norm": 11.214173316955566, "learning_rate": 1.9674620390455532e-05, "logits/chosen": -0.5544005036354065, "logits/rejected": -0.5440300703048706, "logps/chosen": -61.10152053833008, "logps/rejected": -67.07144927978516, "loss": 0.5642, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.03117453120648861, "rewards/margins": 0.4261864721775055, "rewards/rejected": -0.3950119912624359, "step": 195 }, { "epoch": 0.033038544969130654, "grad_norm": 12.332818984985352, "learning_rate": 1.9669614550308697e-05, "logits/chosen": -0.3594237267971039, "logits/rejected": -0.47051095962524414, "logps/chosen": -34.85139083862305, "logps/rejected": -104.68535614013672, "loss": 0.5401, "rewards/accuracies": 1.0, "rewards/chosen": 0.1438388228416443, "rewards/margins": 0.5550634264945984, "rewards/rejected": -0.4112246334552765, "step": 198 }, { "epoch": 0.03353912898381445, "grad_norm": 12.848368644714355, "learning_rate": 1.9664608710161855e-05, "logits/chosen": -0.3462580740451813, "logits/rejected": -0.3528475761413574, "logps/chosen": -57.74058532714844, "logps/rejected": -86.5303726196289, "loss": 0.5925, "rewards/accuracies": 1.0, "rewards/chosen": -0.08438360691070557, "rewards/margins": 0.494453102350235, "rewards/rejected": -0.5788367390632629, "step": 201 }, { "epoch": 0.03403971299849825, "grad_norm": 6.802806854248047, "learning_rate": 1.965960287001502e-05, "logits/chosen": -0.5422907471656799, "logits/rejected": -0.5689066052436829, "logps/chosen": -92.66846466064453, "logps/rejected": -107.50556182861328, "loss": 0.5521, "rewards/accuracies": 1.0, "rewards/chosen": -0.13963355123996735, "rewards/margins": 0.49029651284217834, "rewards/rejected": -0.6299301385879517, "step": 204 }, { "epoch": 0.03454029701318204, "grad_norm": 10.852568626403809, "learning_rate": 1.9654597029868182e-05, "logits/chosen": -0.5871621370315552, "logits/rejected": -0.5690633654594421, "logps/chosen": -79.63545989990234, "logps/rejected": -95.1275405883789, "loss": 0.5613, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5926532745361328, "rewards/margins": 0.17889034748077393, "rewards/rejected": -0.7715436816215515, "step": 207 }, { "epoch": 0.035040881027865844, "grad_norm": 15.717996597290039, "learning_rate": 1.9649591189721344e-05, "logits/chosen": -0.6057240962982178, "logits/rejected": -0.6259468197822571, "logps/chosen": -89.01056671142578, "logps/rejected": -110.72564697265625, "loss": 0.5999, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5169289112091064, "rewards/margins": 0.5061347484588623, "rewards/rejected": -1.0230636596679688, "step": 210 }, { "epoch": 0.03554146504254964, "grad_norm": 8.376073837280273, "learning_rate": 1.9644585349574506e-05, "logits/chosen": -0.39605259895324707, "logits/rejected": -0.45041796565055847, "logps/chosen": -74.54571533203125, "logps/rejected": -82.37498474121094, "loss": 0.7277, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.23305188119411469, "rewards/margins": 0.1209806278347969, "rewards/rejected": -0.3540325164794922, "step": 213 }, { "epoch": 0.03604204905723344, "grad_norm": 9.254660606384277, "learning_rate": 1.9639579509427668e-05, "logits/chosen": -0.41463446617126465, "logits/rejected": -0.4819006025791168, "logps/chosen": -97.2774429321289, "logps/rejected": -152.22959899902344, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": -0.6402173638343811, "rewards/margins": 0.6244313716888428, "rewards/rejected": -1.2646487951278687, "step": 216 }, { "epoch": 0.036542633071917234, "grad_norm": 14.027813911437988, "learning_rate": 1.963457366928083e-05, "logits/chosen": -0.541886031627655, "logits/rejected": -0.543449342250824, "logps/chosen": -68.04785919189453, "logps/rejected": -63.294464111328125, "loss": 0.7639, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.33682799339294434, "rewards/margins": 0.07261911034584045, "rewards/rejected": -0.4094471037387848, "step": 219 }, { "epoch": 0.037043217086601035, "grad_norm": 9.763524055480957, "learning_rate": 1.962956782913399e-05, "logits/chosen": -0.2862584888935089, "logits/rejected": -0.28527528047561646, "logps/chosen": -91.2696533203125, "logps/rejected": -63.37473678588867, "loss": 0.6138, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5974193215370178, "rewards/margins": 0.058018624782562256, "rewards/rejected": -0.6554380059242249, "step": 222 }, { "epoch": 0.03754380110128483, "grad_norm": 12.48825454711914, "learning_rate": 1.9624561988987153e-05, "logits/chosen": -0.6025230884552002, "logits/rejected": -0.5152387022972107, "logps/chosen": -165.83279418945312, "logps/rejected": -129.7216339111328, "loss": 0.6266, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8321029543876648, "rewards/margins": -0.04120122268795967, "rewards/rejected": -0.7909017205238342, "step": 225 }, { "epoch": 0.03804438511596863, "grad_norm": 16.71776008605957, "learning_rate": 1.9619556148840315e-05, "logits/chosen": -0.4602840840816498, "logits/rejected": -0.4444321095943451, "logps/chosen": -97.12667083740234, "logps/rejected": -86.22513580322266, "loss": 0.6517, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9296107292175293, "rewards/margins": -0.293268620967865, "rewards/rejected": -0.6363421082496643, "step": 228 }, { "epoch": 0.038544969130652425, "grad_norm": 10.489968299865723, "learning_rate": 1.9614550308693477e-05, "logits/chosen": -0.4312724769115448, "logits/rejected": -0.41550126671791077, "logps/chosen": -67.71570587158203, "logps/rejected": -71.6884765625, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": -0.27778828144073486, "rewards/margins": 0.5308883786201477, "rewards/rejected": -0.8086767196655273, "step": 231 }, { "epoch": 0.039045553145336226, "grad_norm": 14.936903953552246, "learning_rate": 1.960954446854664e-05, "logits/chosen": -0.5941847562789917, "logits/rejected": -0.6272369623184204, "logps/chosen": -71.86568450927734, "logps/rejected": -96.02825927734375, "loss": 0.5629, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17498905956745148, "rewards/margins": 0.2637893557548523, "rewards/rejected": -0.43877843022346497, "step": 234 }, { "epoch": 0.03954613716002002, "grad_norm": 10.137005805969238, "learning_rate": 1.96045386283998e-05, "logits/chosen": -0.5868473052978516, "logits/rejected": -0.6112702488899231, "logps/chosen": -64.0637435913086, "logps/rejected": -120.45406341552734, "loss": 0.5011, "rewards/accuracies": 1.0, "rewards/chosen": -0.31652820110321045, "rewards/margins": 0.5459242463111877, "rewards/rejected": -0.862452507019043, "step": 237 }, { "epoch": 0.04004672117470382, "grad_norm": 14.199751853942871, "learning_rate": 1.9599532788252966e-05, "logits/chosen": -0.548393964767456, "logits/rejected": -0.5780861973762512, "logps/chosen": -78.53543853759766, "logps/rejected": -114.546142578125, "loss": 0.6213, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2466958612203598, "rewards/margins": 0.24570071697235107, "rewards/rejected": -0.4923965632915497, "step": 240 }, { "epoch": 0.040547305189387615, "grad_norm": 18.196678161621094, "learning_rate": 1.9594526948106124e-05, "logits/chosen": -0.5845401883125305, "logits/rejected": -0.5839157700538635, "logps/chosen": -89.89134979248047, "logps/rejected": -83.37657928466797, "loss": 0.6531, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3392186462879181, "rewards/margins": 0.14270935952663422, "rewards/rejected": -0.4819279909133911, "step": 243 }, { "epoch": 0.04104788920407142, "grad_norm": 18.638883590698242, "learning_rate": 1.958952110795929e-05, "logits/chosen": -0.4554649889469147, "logits/rejected": -0.4757753908634186, "logps/chosen": -69.35784912109375, "logps/rejected": -90.81224822998047, "loss": 0.6897, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.36071088910102844, "rewards/margins": 0.5804110169410706, "rewards/rejected": -0.9411219954490662, "step": 246 }, { "epoch": 0.04154847321875522, "grad_norm": 8.039569854736328, "learning_rate": 1.958451526781245e-05, "logits/chosen": -0.37602469325065613, "logits/rejected": -0.39006495475769043, "logps/chosen": -73.51749420166016, "logps/rejected": -93.34200286865234, "loss": 0.5477, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.09794127196073532, "rewards/margins": 0.438186913728714, "rewards/rejected": -0.5361281633377075, "step": 249 }, { "epoch": 0.04204905723343901, "grad_norm": 10.118498802185059, "learning_rate": 1.957950942766561e-05, "logits/chosen": -0.40186476707458496, "logits/rejected": -0.42522773146629333, "logps/chosen": -61.69330978393555, "logps/rejected": -79.25873565673828, "loss": 0.6769, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3455641567707062, "rewards/margins": 0.31933823227882385, "rewards/rejected": 0.026225924491882324, "step": 252 }, { "epoch": 0.04254964124812281, "grad_norm": 8.840149879455566, "learning_rate": 1.9574503587518775e-05, "logits/chosen": -0.1817980408668518, "logits/rejected": -0.20639145374298096, "logps/chosen": -96.81845092773438, "logps/rejected": -86.4477310180664, "loss": 0.4427, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8703694939613342, "rewards/margins": 0.21177101135253906, "rewards/rejected": -1.0821404457092285, "step": 255 }, { "epoch": 0.04305022526280661, "grad_norm": 18.0274715423584, "learning_rate": 1.9569497747371936e-05, "logits/chosen": -0.6512477397918701, "logits/rejected": -0.6087456345558167, "logps/chosen": -75.3637466430664, "logps/rejected": -79.34364318847656, "loss": 0.6827, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.19869942963123322, "rewards/margins": -0.07182057946920395, "rewards/rejected": -0.12687885761260986, "step": 258 }, { "epoch": 0.04355080927749041, "grad_norm": 19.43569564819336, "learning_rate": 1.9564491907225098e-05, "logits/chosen": -0.5396474003791809, "logits/rejected": -0.5395939946174622, "logps/chosen": -68.53577423095703, "logps/rejected": -88.13300323486328, "loss": 0.7191, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.39252981543540955, "rewards/margins": 0.5066749453544617, "rewards/rejected": -0.8992047309875488, "step": 261 }, { "epoch": 0.0440513932921742, "grad_norm": 12.436830520629883, "learning_rate": 1.955948606707826e-05, "logits/chosen": -0.35492584109306335, "logits/rejected": -0.4084993600845337, "logps/chosen": -85.71853637695312, "logps/rejected": -104.92510986328125, "loss": 0.6172, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.34854957461357117, "rewards/margins": 0.682887613773346, "rewards/rejected": -1.0314372777938843, "step": 264 }, { "epoch": 0.044551977306858004, "grad_norm": 11.046446800231934, "learning_rate": 1.9554480226931422e-05, "logits/chosen": -0.48581185936927795, "logits/rejected": -0.5123505592346191, "logps/chosen": -36.16456604003906, "logps/rejected": -133.84588623046875, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 0.11962338536977768, "rewards/margins": 1.053932785987854, "rewards/rejected": -0.9343094229698181, "step": 267 }, { "epoch": 0.0450525613215418, "grad_norm": 7.491408824920654, "learning_rate": 1.9549474386784584e-05, "logits/chosen": -0.4588066339492798, "logits/rejected": -0.5211063623428345, "logps/chosen": -59.07872009277344, "logps/rejected": -112.51175689697266, "loss": 0.4292, "rewards/accuracies": 1.0, "rewards/chosen": -0.24806715548038483, "rewards/margins": 0.987355649471283, "rewards/rejected": -1.2354227304458618, "step": 270 }, { "epoch": 0.0455531453362256, "grad_norm": 12.267205238342285, "learning_rate": 1.9544468546637745e-05, "logits/chosen": -0.4979308843612671, "logits/rejected": -0.6099672913551331, "logps/chosen": -83.81920623779297, "logps/rejected": -149.2686309814453, "loss": 0.5656, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6131090521812439, "rewards/margins": 1.1360609531402588, "rewards/rejected": -1.749169945716858, "step": 273 }, { "epoch": 0.04605372935090939, "grad_norm": 13.286481857299805, "learning_rate": 1.9539462706490907e-05, "logits/chosen": -0.30617600679397583, "logits/rejected": -0.3427312672138214, "logps/chosen": -67.1170425415039, "logps/rejected": -100.63431549072266, "loss": 0.4649, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4070744812488556, "rewards/margins": 0.12343445420265198, "rewards/rejected": -0.5305089354515076, "step": 276 }, { "epoch": 0.046554313365593195, "grad_norm": 13.488237380981445, "learning_rate": 1.953445686634407e-05, "logits/chosen": -0.6630814075469971, "logits/rejected": -0.6871933341026306, "logps/chosen": -85.0347671508789, "logps/rejected": -104.3840560913086, "loss": 0.7264, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1898999214172363, "rewards/margins": -0.27688053250312805, "rewards/rejected": -0.9130194187164307, "step": 279 }, { "epoch": 0.04705489738027699, "grad_norm": 9.320180892944336, "learning_rate": 1.9529451026197234e-05, "logits/chosen": -0.5500667691230774, "logits/rejected": -0.6175974011421204, "logps/chosen": -104.2516098022461, "logps/rejected": -128.98411560058594, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": -0.7016032338142395, "rewards/margins": 0.3175140619277954, "rewards/rejected": -1.0191172361373901, "step": 282 }, { "epoch": 0.04755548139496079, "grad_norm": 14.00979232788086, "learning_rate": 1.9524445186050393e-05, "logits/chosen": -0.31710222363471985, "logits/rejected": -0.29125580191612244, "logps/chosen": -115.86102294921875, "logps/rejected": -83.5594711303711, "loss": 0.6413, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7994852662086487, "rewards/margins": 0.05593669414520264, "rewards/rejected": -0.8554220199584961, "step": 285 }, { "epoch": 0.048056065409644584, "grad_norm": 21.418622970581055, "learning_rate": 1.9519439345903554e-05, "logits/chosen": -0.3074537217617035, "logits/rejected": -0.30970028042793274, "logps/chosen": -68.73403930664062, "logps/rejected": -74.95172882080078, "loss": 0.7051, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6858701705932617, "rewards/margins": -0.1988905519247055, "rewards/rejected": -0.486979603767395, "step": 288 }, { "epoch": 0.048556649424328385, "grad_norm": 9.029216766357422, "learning_rate": 1.951443350575672e-05, "logits/chosen": -0.4912242889404297, "logits/rejected": -0.5199511647224426, "logps/chosen": -70.45879364013672, "logps/rejected": -88.01766204833984, "loss": 0.5746, "rewards/accuracies": 1.0, "rewards/chosen": -0.7900221943855286, "rewards/margins": 0.8394811749458313, "rewards/rejected": -1.6295033693313599, "step": 291 }, { "epoch": 0.04905723343901218, "grad_norm": 37.71689224243164, "learning_rate": 1.9509427665609878e-05, "logits/chosen": -0.5888793468475342, "logits/rejected": -0.5497369766235352, "logps/chosen": -126.89214324951172, "logps/rejected": -98.7485122680664, "loss": 0.7682, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.931575894355774, "rewards/margins": -0.37053290009498596, "rewards/rejected": -1.5610429048538208, "step": 294 }, { "epoch": 0.04955781745369598, "grad_norm": 18.059293746948242, "learning_rate": 1.9504421825463043e-05, "logits/chosen": -0.7382469177246094, "logits/rejected": -0.7117753028869629, "logps/chosen": -134.3428192138672, "logps/rejected": -140.04344177246094, "loss": 0.5395, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5468246340751648, "rewards/margins": -0.4242187440395355, "rewards/rejected": -0.12260589748620987, "step": 297 }, { "epoch": 0.050058401468379775, "grad_norm": 16.224327087402344, "learning_rate": 1.9499415985316205e-05, "logits/chosen": -0.46657976508140564, "logits/rejected": -0.4588247239589691, "logps/chosen": -82.73014068603516, "logps/rejected": -74.47808837890625, "loss": 0.6218, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7802572250366211, "rewards/margins": 0.06556892395019531, "rewards/rejected": -0.8458261489868164, "step": 300 }, { "epoch": 0.050558985483063576, "grad_norm": 13.995024681091309, "learning_rate": 1.9494410145169367e-05, "logits/chosen": -0.5319907665252686, "logits/rejected": -0.5203548669815063, "logps/chosen": -88.18023681640625, "logps/rejected": -82.3585205078125, "loss": 0.9044, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9923644661903381, "rewards/margins": -0.642040491104126, "rewards/rejected": -0.35032400488853455, "step": 303 }, { "epoch": 0.05105956949774737, "grad_norm": 21.591949462890625, "learning_rate": 1.948940430502253e-05, "logits/chosen": -0.6060585379600525, "logits/rejected": -0.6480692028999329, "logps/chosen": -118.4862289428711, "logps/rejected": -167.11610412597656, "loss": 0.7662, "rewards/accuracies": 1.0, "rewards/chosen": -0.9089513421058655, "rewards/margins": 0.570026695728302, "rewards/rejected": -1.478978157043457, "step": 306 }, { "epoch": 0.05156015351243117, "grad_norm": 19.469181060791016, "learning_rate": 1.948439846487569e-05, "logits/chosen": -0.5211424827575684, "logits/rejected": -0.5617905259132385, "logps/chosen": -108.74056243896484, "logps/rejected": -138.95651245117188, "loss": 0.8703, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5281046628952026, "rewards/margins": -0.4637691080570221, "rewards/rejected": -1.0643354654312134, "step": 309 }, { "epoch": 0.052060737527114966, "grad_norm": 10.958456993103027, "learning_rate": 1.9479392624728852e-05, "logits/chosen": -0.5386335253715515, "logits/rejected": -0.5501885414123535, "logps/chosen": -64.52410125732422, "logps/rejected": -77.97899627685547, "loss": 0.6519, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.08214861154556274, "rewards/margins": 0.3695589601993561, "rewards/rejected": -0.4517076015472412, "step": 312 }, { "epoch": 0.05256132154179877, "grad_norm": 12.185980796813965, "learning_rate": 1.9474386784582014e-05, "logits/chosen": -0.4697903096675873, "logits/rejected": -0.445043683052063, "logps/chosen": -99.08877563476562, "logps/rejected": -91.45025634765625, "loss": 0.5547, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9654609560966492, "rewards/margins": 0.16965968906879425, "rewards/rejected": -1.1351206302642822, "step": 315 }, { "epoch": 0.05306190555648256, "grad_norm": 11.416824340820312, "learning_rate": 1.9469380944435176e-05, "logits/chosen": -0.5023326873779297, "logits/rejected": -0.48300930857658386, "logps/chosen": -83.22664642333984, "logps/rejected": -82.4186019897461, "loss": 0.7208, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4542018473148346, "rewards/margins": -0.14495915174484253, "rewards/rejected": -0.30924275517463684, "step": 318 }, { "epoch": 0.05356248957116636, "grad_norm": 21.613924026489258, "learning_rate": 1.9464375104288337e-05, "logits/chosen": -0.34895777702331543, "logits/rejected": -0.37338051199913025, "logps/chosen": -32.67759704589844, "logps/rejected": -51.9149055480957, "loss": 0.6809, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.03302103653550148, "rewards/margins": 0.31603094935417175, "rewards/rejected": -0.2830098569393158, "step": 321 }, { "epoch": 0.054063073585850156, "grad_norm": 20.074546813964844, "learning_rate": 1.9459369264141503e-05, "logits/chosen": -0.5692782402038574, "logits/rejected": -0.5425412654876709, "logps/chosen": -128.0848846435547, "logps/rejected": -106.82476043701172, "loss": 0.598, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4053192138671875, "rewards/margins": -0.1467786282300949, "rewards/rejected": -1.2585406303405762, "step": 324 }, { "epoch": 0.05456365760053396, "grad_norm": 9.60274600982666, "learning_rate": 1.945436342399466e-05, "logits/chosen": -0.6896126866340637, "logits/rejected": -0.6908554434776306, "logps/chosen": -101.54647827148438, "logps/rejected": -151.49375915527344, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": -0.36460137367248535, "rewards/margins": 1.695618987083435, "rewards/rejected": -2.06022047996521, "step": 327 }, { "epoch": 0.05506424161521775, "grad_norm": 24.479522705078125, "learning_rate": 1.9449357583847823e-05, "logits/chosen": -0.23384155333042145, "logits/rejected": -0.22403846681118011, "logps/chosen": -70.77587127685547, "logps/rejected": -79.06688690185547, "loss": 0.7609, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2424674034118652, "rewards/margins": -0.21886534988880157, "rewards/rejected": -1.02360200881958, "step": 330 }, { "epoch": 0.05556482562990155, "grad_norm": 6.808981418609619, "learning_rate": 1.9444351743700988e-05, "logits/chosen": -0.41988134384155273, "logits/rejected": -0.4272496998310089, "logps/chosen": -73.81121826171875, "logps/rejected": -112.65164184570312, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": -0.2239575833082199, "rewards/margins": 0.5720238089561462, "rewards/rejected": -0.7959813475608826, "step": 333 }, { "epoch": 0.05606540964458535, "grad_norm": 18.381925582885742, "learning_rate": 1.9439345903554146e-05, "logits/chosen": -0.5055586695671082, "logits/rejected": -0.5175905227661133, "logps/chosen": -103.24712371826172, "logps/rejected": -82.0528793334961, "loss": 0.7515, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8724918961524963, "rewards/margins": -0.40608203411102295, "rewards/rejected": -0.46640992164611816, "step": 336 }, { "epoch": 0.05656599365926915, "grad_norm": 10.178421974182129, "learning_rate": 1.943434006340731e-05, "logits/chosen": -0.4162217378616333, "logits/rejected": -0.5200716853141785, "logps/chosen": -68.16260528564453, "logps/rejected": -196.7333984375, "loss": 0.4386, "rewards/accuracies": 1.0, "rewards/chosen": -0.3780777156352997, "rewards/margins": 1.414039134979248, "rewards/rejected": -1.7921169996261597, "step": 339 }, { "epoch": 0.05706657767395294, "grad_norm": 15.12691593170166, "learning_rate": 1.9429334223260473e-05, "logits/chosen": -0.3595803678035736, "logits/rejected": -0.30523717403411865, "logps/chosen": -80.3182601928711, "logps/rejected": -78.0139389038086, "loss": 0.5076, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9482619762420654, "rewards/margins": 0.22040754556655884, "rewards/rejected": -1.168669581413269, "step": 342 }, { "epoch": 0.057567161688636743, "grad_norm": 23.711170196533203, "learning_rate": 1.9424328383113632e-05, "logits/chosen": -0.4883565902709961, "logits/rejected": -0.5044384002685547, "logps/chosen": -119.21509552001953, "logps/rejected": -137.65264892578125, "loss": 0.4958, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7564709782600403, "rewards/margins": 0.9467166066169739, "rewards/rejected": -1.7031875848770142, "step": 345 }, { "epoch": 0.05806774570332054, "grad_norm": 12.306049346923828, "learning_rate": 1.9419322542966797e-05, "logits/chosen": -0.4039939343929291, "logits/rejected": -0.40163755416870117, "logps/chosen": -72.9594497680664, "logps/rejected": -87.05890655517578, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": -0.6055414080619812, "rewards/margins": 0.688044011592865, "rewards/rejected": -1.2935854196548462, "step": 348 }, { "epoch": 0.05856832971800434, "grad_norm": 17.483091354370117, "learning_rate": 1.941431670281996e-05, "logits/chosen": -0.30970093607902527, "logits/rejected": -0.34486106038093567, "logps/chosen": -81.2982406616211, "logps/rejected": -93.0177001953125, "loss": 0.6174, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3206037282943726, "rewards/margins": 0.19255046546459198, "rewards/rejected": -1.513154149055481, "step": 351 }, { "epoch": 0.05906891373268813, "grad_norm": 19.525312423706055, "learning_rate": 1.940931086267312e-05, "logits/chosen": -0.6936759948730469, "logits/rejected": -0.6800588965415955, "logps/chosen": -130.2056884765625, "logps/rejected": -116.55221557617188, "loss": 0.774, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.058390736579895, "rewards/margins": 0.26204732060432434, "rewards/rejected": -1.320438027381897, "step": 354 }, { "epoch": 0.059569497747371934, "grad_norm": 22.18893814086914, "learning_rate": 1.9404305022526282e-05, "logits/chosen": -0.6181744933128357, "logits/rejected": -0.6770758628845215, "logps/chosen": -95.9687728881836, "logps/rejected": -162.10536193847656, "loss": 0.3905, "rewards/accuracies": 1.0, "rewards/chosen": -0.8993685841560364, "rewards/margins": 1.5037285089492798, "rewards/rejected": -2.403097152709961, "step": 357 }, { "epoch": 0.06007008176205573, "grad_norm": 16.509613037109375, "learning_rate": 1.9399299182379444e-05, "logits/chosen": -0.5682656168937683, "logits/rejected": -0.560720682144165, "logps/chosen": -61.035400390625, "logps/rejected": -62.8187255859375, "loss": 0.6051, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5935408473014832, "rewards/margins": 0.10120811313390732, "rewards/rejected": -0.6947489380836487, "step": 360 }, { "epoch": 0.06057066577673953, "grad_norm": 21.341821670532227, "learning_rate": 1.9394293342232606e-05, "logits/chosen": -0.3624834716320038, "logits/rejected": -0.3788395822048187, "logps/chosen": -84.89652252197266, "logps/rejected": -82.43135070800781, "loss": 0.6368, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9810125231742859, "rewards/margins": 0.24316497147083282, "rewards/rejected": -1.2241774797439575, "step": 363 }, { "epoch": 0.061071249791423324, "grad_norm": 24.290849685668945, "learning_rate": 1.9389287502085768e-05, "logits/chosen": -0.7508997321128845, "logits/rejected": -0.6909691691398621, "logps/chosen": -106.0478744506836, "logps/rejected": -61.63850784301758, "loss": 0.9418, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.679394245147705, "rewards/margins": -0.9545603394508362, "rewards/rejected": -0.7248337864875793, "step": 366 }, { "epoch": 0.061571833806107125, "grad_norm": 6.99367618560791, "learning_rate": 1.938428166193893e-05, "logits/chosen": -0.4183484613895416, "logits/rejected": -0.41808709502220154, "logps/chosen": -75.11186218261719, "logps/rejected": -109.17420196533203, "loss": 0.7769, "rewards/accuracies": 1.0, "rewards/chosen": -0.1898762732744217, "rewards/margins": 1.353424072265625, "rewards/rejected": -1.5433002710342407, "step": 369 }, { "epoch": 0.062072417820790926, "grad_norm": 10.945162773132324, "learning_rate": 1.937927582179209e-05, "logits/chosen": -0.4032825529575348, "logits/rejected": -0.44575896859169006, "logps/chosen": -51.2329216003418, "logps/rejected": -113.53580474853516, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": -0.09464240074157715, "rewards/margins": 1.0795127153396606, "rewards/rejected": -1.1741551160812378, "step": 372 }, { "epoch": 0.06257300183547472, "grad_norm": 12.802529335021973, "learning_rate": 1.9374269981645257e-05, "logits/chosen": -0.33507800102233887, "logits/rejected": -0.3754465579986572, "logps/chosen": -76.96139526367188, "logps/rejected": -99.40238189697266, "loss": 0.386, "rewards/accuracies": 1.0, "rewards/chosen": -0.3758811950683594, "rewards/margins": 1.7800954580307007, "rewards/rejected": -2.1559765338897705, "step": 375 }, { "epoch": 0.06307358585015851, "grad_norm": 16.311885833740234, "learning_rate": 1.9369264141498415e-05, "logits/chosen": -0.5146444439888, "logits/rejected": -0.49989381432533264, "logps/chosen": -97.84176635742188, "logps/rejected": -88.69256591796875, "loss": 0.9288, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3953733444213867, "rewards/margins": -0.28010714054107666, "rewards/rejected": -1.11526620388031, "step": 378 }, { "epoch": 0.06357416986484232, "grad_norm": 8.600621223449707, "learning_rate": 1.936425830135158e-05, "logits/chosen": -0.3877370357513428, "logits/rejected": -0.4006563425064087, "logps/chosen": -86.60824584960938, "logps/rejected": -96.58072662353516, "loss": 0.636, "rewards/accuracies": 1.0, "rewards/chosen": -0.46997925639152527, "rewards/margins": 0.41564521193504333, "rewards/rejected": -0.8856244087219238, "step": 381 }, { "epoch": 0.06407475387952612, "grad_norm": 11.824315071105957, "learning_rate": 1.9359252461204742e-05, "logits/chosen": -0.46095308661460876, "logits/rejected": -0.4372645914554596, "logps/chosen": -74.64606475830078, "logps/rejected": -46.31528091430664, "loss": 0.8601, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8951137065887451, "rewards/margins": -0.21388424932956696, "rewards/rejected": -0.6812294125556946, "step": 384 }, { "epoch": 0.06457533789420991, "grad_norm": 6.991612911224365, "learning_rate": 1.93542466210579e-05, "logits/chosen": -0.4864165782928467, "logits/rejected": -0.5203666090965271, "logps/chosen": -48.284881591796875, "logps/rejected": -80.6177978515625, "loss": 0.5039, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.136830136179924, "rewards/margins": 0.7931089401245117, "rewards/rejected": -0.9299390316009521, "step": 387 }, { "epoch": 0.0650759219088937, "grad_norm": 10.927249908447266, "learning_rate": 1.9349240780911066e-05, "logits/chosen": -0.48049402236938477, "logits/rejected": -0.4871286451816559, "logps/chosen": -97.13955688476562, "logps/rejected": -115.94568634033203, "loss": 0.5074, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.34998002648353577, "rewards/margins": 1.4437127113342285, "rewards/rejected": -1.7936925888061523, "step": 390 }, { "epoch": 0.06557650592357751, "grad_norm": 13.803153038024902, "learning_rate": 1.9344234940764227e-05, "logits/chosen": -0.46536949276924133, "logits/rejected": -0.49166157841682434, "logps/chosen": -67.97809600830078, "logps/rejected": -78.20266723632812, "loss": 0.6509, "rewards/accuracies": 1.0, "rewards/chosen": -0.07217586785554886, "rewards/margins": 1.2712159156799316, "rewards/rejected": -1.3433918952941895, "step": 393 }, { "epoch": 0.06607708993826131, "grad_norm": 20.28651237487793, "learning_rate": 1.933922910061739e-05, "logits/chosen": -0.1084669902920723, "logits/rejected": -0.21043600142002106, "logps/chosen": -65.9664306640625, "logps/rejected": -125.09674835205078, "loss": 0.5646, "rewards/accuracies": 1.0, "rewards/chosen": -0.6433246731758118, "rewards/margins": 1.2797731161117554, "rewards/rejected": -1.9230977296829224, "step": 396 }, { "epoch": 0.0665776739529451, "grad_norm": 16.613357543945312, "learning_rate": 1.933422326047055e-05, "logits/chosen": -0.5330974459648132, "logits/rejected": -0.4701884090900421, "logps/chosen": -134.2743377685547, "logps/rejected": -79.91191864013672, "loss": 0.615, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.33781126141548157, "rewards/margins": 0.07083096355199814, "rewards/rejected": -0.4086422622203827, "step": 399 }, { "epoch": 0.0670782579676289, "grad_norm": 12.048373222351074, "learning_rate": 1.9329217420323713e-05, "logits/chosen": -0.3605904281139374, "logits/rejected": -0.43198490142822266, "logps/chosen": -71.3725814819336, "logps/rejected": -124.489990234375, "loss": 0.3988, "rewards/accuracies": 1.0, "rewards/chosen": 0.10653114318847656, "rewards/margins": 1.9063607454299927, "rewards/rejected": -1.7998296022415161, "step": 402 }, { "epoch": 0.0675788419823127, "grad_norm": 13.508430480957031, "learning_rate": 1.9324211580176875e-05, "logits/chosen": -0.720395028591156, "logits/rejected": -0.6347939372062683, "logps/chosen": -132.89266967773438, "logps/rejected": -112.40463256835938, "loss": 0.5781, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.590985119342804, "rewards/margins": 1.0483018159866333, "rewards/rejected": -1.6392868757247925, "step": 405 }, { "epoch": 0.0680794259969965, "grad_norm": 12.439876556396484, "learning_rate": 1.9319205740030036e-05, "logits/chosen": -0.5360059142112732, "logits/rejected": -0.522932231426239, "logps/chosen": -127.54686737060547, "logps/rejected": -81.24031829833984, "loss": 0.4533, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7192400097846985, "rewards/margins": -0.038644928485155106, "rewards/rejected": -0.6805951595306396, "step": 408 }, { "epoch": 0.06858001001168029, "grad_norm": 20.85866928100586, "learning_rate": 1.9314199899883198e-05, "logits/chosen": -0.4129132926464081, "logits/rejected": -0.3886561393737793, "logps/chosen": -100.08414459228516, "logps/rejected": -73.39453887939453, "loss": 0.6388, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2605036497116089, "rewards/margins": -0.2981545925140381, "rewards/rejected": -0.962348997592926, "step": 411 }, { "epoch": 0.06908059402636409, "grad_norm": 20.410797119140625, "learning_rate": 1.930919405973636e-05, "logits/chosen": -0.5913076996803284, "logits/rejected": -0.5919175744056702, "logps/chosen": -89.96944427490234, "logps/rejected": -85.61437225341797, "loss": 0.7019, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5675491094589233, "rewards/margins": 0.6131324768066406, "rewards/rejected": -1.180681586265564, "step": 414 }, { "epoch": 0.0695811780410479, "grad_norm": 24.301433563232422, "learning_rate": 1.9304188219589525e-05, "logits/chosen": -0.39495396614074707, "logits/rejected": -0.4749439060688019, "logps/chosen": -51.8681526184082, "logps/rejected": -130.39332580566406, "loss": 0.7996, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29833248257637024, "rewards/margins": 1.626028060913086, "rewards/rejected": -1.9243603944778442, "step": 417 }, { "epoch": 0.07008176205573169, "grad_norm": 6.923988342285156, "learning_rate": 1.9299182379442684e-05, "logits/chosen": -0.4291091859340668, "logits/rejected": -0.43901076912879944, "logps/chosen": -79.98858642578125, "logps/rejected": -103.34085845947266, "loss": 0.3366, "rewards/accuracies": 1.0, "rewards/chosen": -0.7330780029296875, "rewards/margins": 1.209781289100647, "rewards/rejected": -1.9428592920303345, "step": 420 }, { "epoch": 0.07058234607041548, "grad_norm": 7.511977672576904, "learning_rate": 1.9294176539295845e-05, "logits/chosen": -0.5354660749435425, "logits/rejected": -0.5337448716163635, "logps/chosen": -58.98429870605469, "logps/rejected": -58.728363037109375, "loss": 0.4165, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11280619353055954, "rewards/margins": 0.14040210843086243, "rewards/rejected": -0.027595901861786842, "step": 423 }, { "epoch": 0.07108293008509928, "grad_norm": 18.419919967651367, "learning_rate": 1.928917069914901e-05, "logits/chosen": -0.23579978942871094, "logits/rejected": -0.29306069016456604, "logps/chosen": -51.14501953125, "logps/rejected": -87.55368041992188, "loss": 0.531, "rewards/accuracies": 1.0, "rewards/chosen": 0.5280992388725281, "rewards/margins": 1.6073236465454102, "rewards/rejected": -1.0792244672775269, "step": 426 }, { "epoch": 0.07158351409978309, "grad_norm": 17.01349639892578, "learning_rate": 1.928416485900217e-05, "logits/chosen": -0.6304401755332947, "logits/rejected": -0.6006234288215637, "logps/chosen": -105.0439453125, "logps/rejected": -53.6550178527832, "loss": 0.9086, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6615303158760071, "rewards/margins": -0.6957531571388245, "rewards/rejected": 0.03422285616397858, "step": 429 }, { "epoch": 0.07208409811446688, "grad_norm": 28.756332397460938, "learning_rate": 1.9279159018855334e-05, "logits/chosen": -0.4877108037471771, "logits/rejected": -0.4830784797668457, "logps/chosen": -128.0105438232422, "logps/rejected": -130.5446319580078, "loss": 0.7245, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9542155265808105, "rewards/margins": 0.31823787093162537, "rewards/rejected": -1.2724534273147583, "step": 432 }, { "epoch": 0.07258468212915067, "grad_norm": 16.144540786743164, "learning_rate": 1.9274153178708496e-05, "logits/chosen": -0.5249345302581787, "logits/rejected": -0.5921141505241394, "logps/chosen": -43.47652816772461, "logps/rejected": -79.54888153076172, "loss": 0.5698, "rewards/accuracies": 1.0, "rewards/chosen": 0.22834403812885284, "rewards/margins": 0.6281831860542297, "rewards/rejected": -0.3998391628265381, "step": 435 }, { "epoch": 0.07308526614383447, "grad_norm": 5.99837589263916, "learning_rate": 1.9269147338561658e-05, "logits/chosen": -0.38701334595680237, "logits/rejected": -0.3866714537143707, "logps/chosen": -62.922607421875, "logps/rejected": -94.61357879638672, "loss": 0.4139, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6486719846725464, "rewards/margins": 0.9611427187919617, "rewards/rejected": -1.6098147630691528, "step": 438 }, { "epoch": 0.07358585015851828, "grad_norm": 23.466320037841797, "learning_rate": 1.926414149841482e-05, "logits/chosen": -0.6976329684257507, "logits/rejected": -0.7543008923530579, "logps/chosen": -50.77499771118164, "logps/rejected": -80.46546173095703, "loss": 0.702, "rewards/accuracies": 1.0, "rewards/chosen": 0.1063184142112732, "rewards/margins": 1.3266466856002808, "rewards/rejected": -1.2203283309936523, "step": 441 }, { "epoch": 0.07408643417320207, "grad_norm": 14.530047416687012, "learning_rate": 1.925913565826798e-05, "logits/chosen": -0.24599702656269073, "logits/rejected": -0.2411126345396042, "logps/chosen": -190.8173828125, "logps/rejected": -91.77538299560547, "loss": 0.6342, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.08008447289466858, "rewards/margins": -0.0634358748793602, "rewards/rejected": -0.01664859987795353, "step": 444 }, { "epoch": 0.07458701818788586, "grad_norm": 15.28846549987793, "learning_rate": 1.9254129818121143e-05, "logits/chosen": -0.49922728538513184, "logits/rejected": -0.5933619141578674, "logps/chosen": -73.37516021728516, "logps/rejected": -129.63978576660156, "loss": 0.7068, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.21703962981700897, "rewards/margins": 0.6995819211006165, "rewards/rejected": -0.9166215062141418, "step": 447 }, { "epoch": 0.07508760220256966, "grad_norm": 21.378549575805664, "learning_rate": 1.9249123977974305e-05, "logits/chosen": -0.5841546058654785, "logits/rejected": -0.6017794013023376, "logps/chosen": -86.86368560791016, "logps/rejected": -70.40081024169922, "loss": 0.8949, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17593741416931152, "rewards/margins": -0.2396741360425949, "rewards/rejected": 0.06373673677444458, "step": 450 }, { "epoch": 0.07558818621725347, "grad_norm": 29.48621368408203, "learning_rate": 1.9244118137827467e-05, "logits/chosen": -0.5711730122566223, "logits/rejected": -0.5919149518013, "logps/chosen": -121.43365478515625, "logps/rejected": -163.1272430419922, "loss": 0.6313, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8140730857849121, "rewards/margins": 0.5582466721534729, "rewards/rejected": -1.3723196983337402, "step": 453 }, { "epoch": 0.07608877023193726, "grad_norm": 16.996654510498047, "learning_rate": 1.923911229768063e-05, "logits/chosen": -0.4857243001461029, "logits/rejected": -0.5428259968757629, "logps/chosen": -73.35305786132812, "logps/rejected": -101.27294921875, "loss": 0.6384, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.2918347418308258, "rewards/margins": 0.6622252464294434, "rewards/rejected": -0.9540600776672363, "step": 456 }, { "epoch": 0.07658935424662106, "grad_norm": 11.219143867492676, "learning_rate": 1.923410645753379e-05, "logits/chosen": -0.5952122211456299, "logits/rejected": -0.6360089182853699, "logps/chosen": -29.730314254760742, "logps/rejected": -100.8993911743164, "loss": 0.4897, "rewards/accuracies": 1.0, "rewards/chosen": 0.7887187600135803, "rewards/margins": 1.9943675994873047, "rewards/rejected": -1.2056488990783691, "step": 459 }, { "epoch": 0.07708993826130485, "grad_norm": 9.360259056091309, "learning_rate": 1.9229100617386952e-05, "logits/chosen": -0.6840530037879944, "logits/rejected": -0.6247029900550842, "logps/chosen": -84.78114318847656, "logps/rejected": -66.7667007446289, "loss": 0.7845, "rewards/accuracies": 1.0, "rewards/chosen": 0.1892140656709671, "rewards/margins": 0.9957358241081238, "rewards/rejected": -0.8065217137336731, "step": 462 }, { "epoch": 0.07759052227598866, "grad_norm": 13.258530616760254, "learning_rate": 1.9224094777240114e-05, "logits/chosen": -0.4494756758213043, "logits/rejected": -0.4211489260196686, "logps/chosen": -95.2788314819336, "logps/rejected": -81.2781753540039, "loss": 0.5282, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2640652656555176, "rewards/margins": 0.023024359717965126, "rewards/rejected": -1.2870897054672241, "step": 465 }, { "epoch": 0.07809110629067245, "grad_norm": 21.15235137939453, "learning_rate": 1.921908893709328e-05, "logits/chosen": -0.34525370597839355, "logits/rejected": -0.3335687220096588, "logps/chosen": -170.57598876953125, "logps/rejected": -123.45999908447266, "loss": 0.5913, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3111199140548706, "rewards/margins": -1.0602623224258423, "rewards/rejected": -0.25085756182670593, "step": 468 }, { "epoch": 0.07859169030535625, "grad_norm": 15.86354923248291, "learning_rate": 1.9214083096946437e-05, "logits/chosen": -0.39116907119750977, "logits/rejected": -0.41164374351501465, "logps/chosen": -125.6161117553711, "logps/rejected": -130.8726043701172, "loss": 0.5211, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7711958885192871, "rewards/margins": 0.8106459975242615, "rewards/rejected": -1.5818418264389038, "step": 471 }, { "epoch": 0.07909227432004004, "grad_norm": 18.435888290405273, "learning_rate": 1.9209077256799603e-05, "logits/chosen": -0.5404213070869446, "logits/rejected": -0.5996436476707458, "logps/chosen": -66.30769348144531, "logps/rejected": -78.6794662475586, "loss": 0.8019, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.16991615295410156, "rewards/margins": 0.02801424264907837, "rewards/rejected": -0.19793041050434113, "step": 474 }, { "epoch": 0.07959285833472385, "grad_norm": 21.53291130065918, "learning_rate": 1.9204071416652764e-05, "logits/chosen": -0.6259146332740784, "logits/rejected": -0.6170547604560852, "logps/chosen": -90.23627471923828, "logps/rejected": -68.46244049072266, "loss": 0.6512, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6799696087837219, "rewards/margins": -0.02360662817955017, "rewards/rejected": -0.6563630700111389, "step": 477 }, { "epoch": 0.08009344234940764, "grad_norm": 4.300323009490967, "learning_rate": 1.9199065576505923e-05, "logits/chosen": -0.6345335841178894, "logits/rejected": -0.6618843078613281, "logps/chosen": -58.87705612182617, "logps/rejected": -96.99911499023438, "loss": 0.3686, "rewards/accuracies": 1.0, "rewards/chosen": 0.5125829577445984, "rewards/margins": 1.4083997011184692, "rewards/rejected": -0.8958167433738708, "step": 480 }, { "epoch": 0.08059402636409144, "grad_norm": 6.85362434387207, "learning_rate": 1.9194059736359088e-05, "logits/chosen": -0.6624900698661804, "logits/rejected": -0.6812710762023926, "logps/chosen": -127.51028442382812, "logps/rejected": -128.84725952148438, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": -0.13735631108283997, "rewards/margins": 1.627339243888855, "rewards/rejected": -1.7646955251693726, "step": 483 }, { "epoch": 0.08109461037877523, "grad_norm": 9.283049583435059, "learning_rate": 1.918905389621225e-05, "logits/chosen": -0.4735625088214874, "logits/rejected": -0.4836982190608978, "logps/chosen": -59.9385871887207, "logps/rejected": -84.80692291259766, "loss": 0.324, "rewards/accuracies": 1.0, "rewards/chosen": 0.6538184285163879, "rewards/margins": 1.2246980667114258, "rewards/rejected": -0.5708796381950378, "step": 486 }, { "epoch": 0.08159519439345904, "grad_norm": 12.856939315795898, "learning_rate": 1.918404805606541e-05, "logits/chosen": -0.6847310066223145, "logits/rejected": -0.6830217242240906, "logps/chosen": -83.97434997558594, "logps/rejected": -61.36452102661133, "loss": 0.4613, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.03846893832087517, "rewards/margins": -0.17805391550064087, "rewards/rejected": 0.1395849883556366, "step": 489 }, { "epoch": 0.08209577840814283, "grad_norm": 17.56700325012207, "learning_rate": 1.9179042215918573e-05, "logits/chosen": -0.192884624004364, "logits/rejected": -0.312303751707077, "logps/chosen": -100.82024383544922, "logps/rejected": -166.4080352783203, "loss": 0.5325, "rewards/accuracies": 1.0, "rewards/chosen": -0.5417330265045166, "rewards/margins": 1.1345633268356323, "rewards/rejected": -1.6762962341308594, "step": 492 }, { "epoch": 0.08259636242282663, "grad_norm": 13.998141288757324, "learning_rate": 1.9174036375771735e-05, "logits/chosen": -0.3080444633960724, "logits/rejected": -0.3455202579498291, "logps/chosen": -52.3138313293457, "logps/rejected": -103.24068450927734, "loss": 0.3733, "rewards/accuracies": 1.0, "rewards/chosen": -0.9195217490196228, "rewards/margins": 1.6642694473266602, "rewards/rejected": -2.5837910175323486, "step": 495 }, { "epoch": 0.08309694643751044, "grad_norm": 7.56748628616333, "learning_rate": 1.9169030535624897e-05, "logits/chosen": -0.3708571493625641, "logits/rejected": -0.36839333176612854, "logps/chosen": -72.7711410522461, "logps/rejected": -79.34241485595703, "loss": 0.6864, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17721877992153168, "rewards/margins": 0.1236928328871727, "rewards/rejected": -0.3009115755558014, "step": 498 }, { "epoch": 0.08359753045219423, "grad_norm": 26.786277770996094, "learning_rate": 1.916402469547806e-05, "logits/chosen": -0.6261784434318542, "logits/rejected": -0.5722800493240356, "logps/chosen": -86.43836212158203, "logps/rejected": -69.0311050415039, "loss": 0.8627, "rewards/accuracies": 1.0, "rewards/chosen": -0.45285746455192566, "rewards/margins": 0.7427818179130554, "rewards/rejected": -1.1956392526626587, "step": 501 }, { "epoch": 0.08409811446687802, "grad_norm": 9.369241714477539, "learning_rate": 1.915901885533122e-05, "logits/chosen": -0.31430765986442566, "logits/rejected": -0.32443344593048096, "logps/chosen": -88.35706329345703, "logps/rejected": -72.69669342041016, "loss": 0.4452, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4923512637615204, "rewards/margins": 0.3894740641117096, "rewards/rejected": -0.8818252682685852, "step": 504 }, { "epoch": 0.08459869848156182, "grad_norm": 27.172536849975586, "learning_rate": 1.9154013015184382e-05, "logits/chosen": -0.2163965255022049, "logits/rejected": -0.13429105281829834, "logps/chosen": -134.7101287841797, "logps/rejected": -105.86595916748047, "loss": 0.6903, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0472428798675537, "rewards/margins": -0.08406718820333481, "rewards/rejected": -1.963175892829895, "step": 507 }, { "epoch": 0.08509928249624563, "grad_norm": 16.05742073059082, "learning_rate": 1.9149007175037548e-05, "logits/chosen": -0.27416375279426575, "logits/rejected": -0.28437915444374084, "logps/chosen": -56.02669906616211, "logps/rejected": -82.7410659790039, "loss": 0.6936, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6472126245498657, "rewards/margins": 1.031396746635437, "rewards/rejected": -0.3841841518878937, "step": 510 }, { "epoch": 0.08559986651092942, "grad_norm": 23.548728942871094, "learning_rate": 1.9144001334890706e-05, "logits/chosen": -0.47873827815055847, "logits/rejected": -0.4734811782836914, "logps/chosen": -89.3942642211914, "logps/rejected": -107.69921112060547, "loss": 0.8086, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6931015253067017, "rewards/margins": 0.1478988379240036, "rewards/rejected": -1.8410004377365112, "step": 513 }, { "epoch": 0.08610045052561321, "grad_norm": 7.3832221031188965, "learning_rate": 1.9138995494743868e-05, "logits/chosen": -0.6859163641929626, "logits/rejected": -0.687582790851593, "logps/chosen": -100.385498046875, "logps/rejected": -111.00314331054688, "loss": 0.4893, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.41171541810035706, "rewards/margins": 1.1645582914352417, "rewards/rejected": -1.5762735605239868, "step": 516 }, { "epoch": 0.08660103454029701, "grad_norm": 10.816651344299316, "learning_rate": 1.9133989654597033e-05, "logits/chosen": -0.3680359125137329, "logits/rejected": -0.4543178081512451, "logps/chosen": -74.07032775878906, "logps/rejected": -128.76170349121094, "loss": 0.6077, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.39874526858329773, "rewards/margins": 0.8818873763084412, "rewards/rejected": -0.48314204812049866, "step": 519 }, { "epoch": 0.08710161855498082, "grad_norm": 10.225590705871582, "learning_rate": 1.912898381445019e-05, "logits/chosen": -0.22851543128490448, "logits/rejected": -0.24572257697582245, "logps/chosen": -82.6640396118164, "logps/rejected": -96.03799438476562, "loss": 0.666, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9228310585021973, "rewards/margins": -0.030604163184762, "rewards/rejected": -0.8922269344329834, "step": 522 }, { "epoch": 0.08760220256966461, "grad_norm": 10.530823707580566, "learning_rate": 1.9123977974303357e-05, "logits/chosen": -0.6348093152046204, "logits/rejected": -0.6055904626846313, "logps/chosen": -54.93471145629883, "logps/rejected": -87.17385864257812, "loss": 0.4799, "rewards/accuracies": 1.0, "rewards/chosen": 0.3397029936313629, "rewards/margins": 1.0216834545135498, "rewards/rejected": -0.6819804310798645, "step": 525 }, { "epoch": 0.0881027865843484, "grad_norm": 13.723482131958008, "learning_rate": 1.911897213415652e-05, "logits/chosen": -0.3985039293766022, "logits/rejected": -0.3985374867916107, "logps/chosen": -106.5438003540039, "logps/rejected": -123.55355072021484, "loss": 0.5852, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29268863797187805, "rewards/margins": 0.5248717665672302, "rewards/rejected": -0.8175603747367859, "step": 528 }, { "epoch": 0.0886033705990322, "grad_norm": 20.7388916015625, "learning_rate": 1.911396629400968e-05, "logits/chosen": -0.51712965965271, "logits/rejected": -0.5343462824821472, "logps/chosen": -54.09477233886719, "logps/rejected": -92.33126068115234, "loss": 0.5773, "rewards/accuracies": 1.0, "rewards/chosen": 0.10204067081212997, "rewards/margins": 0.747016966342926, "rewards/rejected": -0.6449763178825378, "step": 531 }, { "epoch": 0.08910395461371601, "grad_norm": 32.5612678527832, "learning_rate": 1.9108960453862842e-05, "logits/chosen": -0.5815667510032654, "logits/rejected": -0.5824565291404724, "logps/chosen": -77.04855346679688, "logps/rejected": -111.60964965820312, "loss": 0.8812, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7286960482597351, "rewards/margins": 2.2326159477233887, "rewards/rejected": -1.5039199590682983, "step": 534 }, { "epoch": 0.0896045386283998, "grad_norm": 15.816092491149902, "learning_rate": 1.9103954613716004e-05, "logits/chosen": -0.42233988642692566, "logits/rejected": -0.4820968210697174, "logps/chosen": -29.64549446105957, "logps/rejected": -121.83172607421875, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": -0.11207377165555954, "rewards/margins": 0.9925110936164856, "rewards/rejected": -1.104584813117981, "step": 537 }, { "epoch": 0.0901051226430836, "grad_norm": 13.671829223632812, "learning_rate": 1.9098948773569165e-05, "logits/chosen": -0.4820469915866852, "logits/rejected": -0.4595530033111572, "logps/chosen": -103.44762420654297, "logps/rejected": -94.60051727294922, "loss": 0.7268, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2442530393600464, "rewards/margins": 0.417413592338562, "rewards/rejected": -0.17316055297851562, "step": 540 }, { "epoch": 0.09060570665776739, "grad_norm": 14.897601127624512, "learning_rate": 1.9093942933422327e-05, "logits/chosen": -0.6425567269325256, "logits/rejected": -0.6805591583251953, "logps/chosen": -54.865081787109375, "logps/rejected": -82.41045379638672, "loss": 0.5785, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.16434556245803833, "rewards/margins": -0.20813904702663422, "rewards/rejected": 0.37248459458351135, "step": 543 }, { "epoch": 0.0911062906724512, "grad_norm": 7.76992130279541, "learning_rate": 1.908893709327549e-05, "logits/chosen": -0.746488630771637, "logits/rejected": -0.7851570248603821, "logps/chosen": -49.95719528198242, "logps/rejected": -83.88729095458984, "loss": 0.4165, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3823684751987457, "rewards/margins": 1.7885336875915527, "rewards/rejected": -1.4061651229858398, "step": 546 }, { "epoch": 0.09160687468713499, "grad_norm": 10.624967575073242, "learning_rate": 1.908393125312865e-05, "logits/chosen": -0.4556730091571808, "logits/rejected": -0.520971953868866, "logps/chosen": -66.62755584716797, "logps/rejected": -124.29802703857422, "loss": 0.4615, "rewards/accuracies": 1.0, "rewards/chosen": 0.1762678176164627, "rewards/margins": 1.900954246520996, "rewards/rejected": -1.7246865034103394, "step": 549 }, { "epoch": 0.09210745870181879, "grad_norm": 20.630664825439453, "learning_rate": 1.9078925412981816e-05, "logits/chosen": -0.3668385446071625, "logits/rejected": -0.38090863823890686, "logps/chosen": -64.38400268554688, "logps/rejected": -80.05499267578125, "loss": 0.7062, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.17646551132202148, "rewards/margins": 1.0627906322479248, "rewards/rejected": -0.8863250613212585, "step": 552 }, { "epoch": 0.09260804271650258, "grad_norm": 14.368407249450684, "learning_rate": 1.9073919572834974e-05, "logits/chosen": -0.49921903014183044, "logits/rejected": -0.46519407629966736, "logps/chosen": -133.1221466064453, "logps/rejected": -144.03953552246094, "loss": 0.5577, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.04133021831512451, "rewards/margins": 0.29794541001319885, "rewards/rejected": -0.339275598526001, "step": 555 }, { "epoch": 0.09310862673118639, "grad_norm": 10.988116264343262, "learning_rate": 1.9068913732688136e-05, "logits/chosen": -0.7207644581794739, "logits/rejected": -0.6945379376411438, "logps/chosen": -66.8647689819336, "logps/rejected": -85.4770278930664, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 0.4064488708972931, "rewards/margins": 1.3132855892181396, "rewards/rejected": -0.9068366885185242, "step": 558 }, { "epoch": 0.09360921074587018, "grad_norm": 15.131922721862793, "learning_rate": 1.90639078925413e-05, "logits/chosen": -0.6015565991401672, "logits/rejected": -0.6085624098777771, "logps/chosen": -67.3998794555664, "logps/rejected": -108.89411163330078, "loss": 0.6933, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4958629608154297, "rewards/margins": 1.1885753870010376, "rewards/rejected": -0.6927124857902527, "step": 561 }, { "epoch": 0.09410979476055398, "grad_norm": 22.63860511779785, "learning_rate": 1.905890205239446e-05, "logits/chosen": -0.4400366246700287, "logits/rejected": -0.4492012560367584, "logps/chosen": -46.565372467041016, "logps/rejected": -74.08740997314453, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 0.615999162197113, "rewards/margins": 0.8117950558662415, "rewards/rejected": -0.1957959532737732, "step": 564 }, { "epoch": 0.09461037877523777, "grad_norm": 14.312948226928711, "learning_rate": 1.9053896212247625e-05, "logits/chosen": -0.45205703377723694, "logits/rejected": -0.45111632347106934, "logps/chosen": -105.37650299072266, "logps/rejected": -84.43905639648438, "loss": 0.655, "rewards/accuracies": 1.0, "rewards/chosen": 0.4092610776424408, "rewards/margins": 1.9298709630966187, "rewards/rejected": -1.5206098556518555, "step": 567 }, { "epoch": 0.09511096278992158, "grad_norm": 13.169382095336914, "learning_rate": 1.9048890372100787e-05, "logits/chosen": -0.15758875012397766, "logits/rejected": -0.13324515521526337, "logps/chosen": -102.90341186523438, "logps/rejected": -56.536197662353516, "loss": 0.59, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.47127053141593933, "rewards/margins": -0.3990355432033539, "rewards/rejected": -0.07223498076200485, "step": 570 }, { "epoch": 0.09561154680460537, "grad_norm": 16.354663848876953, "learning_rate": 1.904388453195395e-05, "logits/chosen": -0.618902325630188, "logits/rejected": -0.6013855338096619, "logps/chosen": -109.029052734375, "logps/rejected": -85.6430892944336, "loss": 0.5354, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.13483555614948273, "rewards/margins": 0.4436996281147003, "rewards/rejected": -0.5785351991653442, "step": 573 }, { "epoch": 0.09611213081928917, "grad_norm": 20.150434494018555, "learning_rate": 1.903887869180711e-05, "logits/chosen": -0.6230969429016113, "logits/rejected": -0.5939390659332275, "logps/chosen": -97.90276336669922, "logps/rejected": -62.14243698120117, "loss": 0.5392, "rewards/accuracies": 1.0, "rewards/chosen": -0.03299000859260559, "rewards/margins": 0.6229171752929688, "rewards/rejected": -0.6559072136878967, "step": 576 }, { "epoch": 0.09661271483397296, "grad_norm": 10.155540466308594, "learning_rate": 1.9033872851660272e-05, "logits/chosen": -0.46280112862586975, "logits/rejected": -0.4682457447052002, "logps/chosen": -75.18807220458984, "logps/rejected": -90.9746322631836, "loss": 0.47, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5349571108818054, "rewards/margins": 0.8560082912445068, "rewards/rejected": -1.390965461730957, "step": 579 }, { "epoch": 0.09711329884865677, "grad_norm": 19.8143310546875, "learning_rate": 1.9028867011513434e-05, "logits/chosen": -0.43509912490844727, "logits/rejected": -0.4177820682525635, "logps/chosen": -135.5380096435547, "logps/rejected": -147.48284912109375, "loss": 0.5804, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8600100874900818, "rewards/margins": -0.18292365968227386, "rewards/rejected": -0.677086353302002, "step": 582 }, { "epoch": 0.09761388286334056, "grad_norm": 17.808568954467773, "learning_rate": 1.9023861171366596e-05, "logits/chosen": -0.6682588458061218, "logits/rejected": -0.6932258605957031, "logps/chosen": -50.2974739074707, "logps/rejected": -94.1041488647461, "loss": 0.6729, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.01781412772834301, "rewards/margins": 0.02990507148206234, "rewards/rejected": -0.04771919548511505, "step": 585 }, { "epoch": 0.09811446687802436, "grad_norm": 33.57632827758789, "learning_rate": 1.9018855331219758e-05, "logits/chosen": -0.5799828171730042, "logits/rejected": -0.5986420512199402, "logps/chosen": -56.76956558227539, "logps/rejected": -114.27787017822266, "loss": 0.8672, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29812756180763245, "rewards/margins": 0.5654115080833435, "rewards/rejected": -0.8635391592979431, "step": 588 }, { "epoch": 0.09861505089270815, "grad_norm": 22.4406681060791, "learning_rate": 1.901384949107292e-05, "logits/chosen": -0.320192813873291, "logits/rejected": -0.3228585422039032, "logps/chosen": -161.1801300048828, "logps/rejected": -93.7433090209961, "loss": 0.7287, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.637468695640564, "rewards/margins": -0.06912104040384293, "rewards/rejected": -0.568347692489624, "step": 591 }, { "epoch": 0.09911563490739196, "grad_norm": 22.248811721801758, "learning_rate": 1.900884365092608e-05, "logits/chosen": -0.6477208137512207, "logits/rejected": -0.6806497573852539, "logps/chosen": -56.27652359008789, "logps/rejected": -78.3902816772461, "loss": 0.6419, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3759583532810211, "rewards/margins": -0.1897265911102295, "rewards/rejected": -0.18623173236846924, "step": 594 }, { "epoch": 0.09961621892207576, "grad_norm": 26.00592613220215, "learning_rate": 1.9003837810779243e-05, "logits/chosen": -0.48588502407073975, "logits/rejected": -0.4866816997528076, "logps/chosen": -85.93338012695312, "logps/rejected": -75.7076187133789, "loss": 0.8485, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.05585697293281555, "rewards/margins": -0.22079746425151825, "rewards/rejected": 0.16494052112102509, "step": 597 }, { "epoch": 0.10011680293675955, "grad_norm": 25.12641143798828, "learning_rate": 1.8998831970632405e-05, "logits/chosen": -0.5101989507675171, "logits/rejected": -0.48954543471336365, "logps/chosen": -87.08856201171875, "logps/rejected": -63.53908920288086, "loss": 0.7643, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4939727783203125, "rewards/margins": 0.16732282936573029, "rewards/rejected": -0.6612955927848816, "step": 600 }, { "epoch": 0.10011680293675955, "eval_logits/chosen": -0.49837321043014526, "eval_logits/rejected": -0.512079119682312, "eval_logps/chosen": -81.19551086425781, "eval_logps/rejected": -100.21288299560547, "eval_loss": 0.6268848180770874, "eval_rewards/accuracies": 0.6651651859283447, "eval_rewards/chosen": -0.09377840161323547, "eval_rewards/margins": 0.6146910190582275, "eval_rewards/rejected": -0.7084694504737854, "eval_runtime": 346.9158, "eval_samples_per_second": 7.679, "eval_steps_per_second": 1.92, "step": 600 }, { "epoch": 0.10061738695144334, "grad_norm": 14.540258407592773, "learning_rate": 1.899382613048557e-05, "logits/chosen": -0.4517531096935272, "logits/rejected": -0.4389139711856842, "logps/chosen": -138.48326110839844, "logps/rejected": -88.333251953125, "loss": 0.4703, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1434888392686844, "rewards/margins": 0.3652603328227997, "rewards/rejected": -0.5087491273880005, "step": 603 }, { "epoch": 0.10111797096612715, "grad_norm": 17.40253448486328, "learning_rate": 1.898882029033873e-05, "logits/chosen": -0.5876478552818298, "logits/rejected": -0.575262725353241, "logps/chosen": -92.91494750976562, "logps/rejected": -77.96021270751953, "loss": 0.6343, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7459259629249573, "rewards/margins": 0.23069043457508087, "rewards/rejected": -0.976616382598877, "step": 606 }, { "epoch": 0.10161855498081095, "grad_norm": 15.371357917785645, "learning_rate": 1.8983814450191894e-05, "logits/chosen": -0.5033895373344421, "logits/rejected": -0.45847105979919434, "logps/chosen": -105.06391143798828, "logps/rejected": -97.90018463134766, "loss": 0.6706, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.531080424785614, "rewards/margins": 0.1878809779882431, "rewards/rejected": -0.7189612984657288, "step": 609 }, { "epoch": 0.10211913899549474, "grad_norm": 17.56338882446289, "learning_rate": 1.8978808610045055e-05, "logits/chosen": -0.35767707228660583, "logits/rejected": -0.34649547934532166, "logps/chosen": -57.294864654541016, "logps/rejected": -51.16297912597656, "loss": 0.5101, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.4008772373199463, "rewards/margins": 0.1320032775402069, "rewards/rejected": 0.2688739597797394, "step": 612 }, { "epoch": 0.10261972301017855, "grad_norm": 11.303997039794922, "learning_rate": 1.8973802769898214e-05, "logits/chosen": -0.542989194393158, "logits/rejected": -0.5407366156578064, "logps/chosen": -75.39936065673828, "logps/rejected": -85.2082290649414, "loss": 0.4449, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6209197640419006, "rewards/margins": -0.3711714744567871, "rewards/rejected": -0.24974822998046875, "step": 615 }, { "epoch": 0.10312030702486234, "grad_norm": 11.056468963623047, "learning_rate": 1.896879692975138e-05, "logits/chosen": -0.4548330008983612, "logits/rejected": -0.46429285407066345, "logps/chosen": -80.33203887939453, "logps/rejected": -81.7595443725586, "loss": 0.9706, "rewards/accuracies": 1.0, "rewards/chosen": 0.7018416523933411, "rewards/margins": 0.9307842254638672, "rewards/rejected": -0.2289426326751709, "step": 618 }, { "epoch": 0.10362089103954614, "grad_norm": 10.195731163024902, "learning_rate": 1.896379108960454e-05, "logits/chosen": -0.38214921951293945, "logits/rejected": -0.4111761152744293, "logps/chosen": -70.1272964477539, "logps/rejected": -150.79554748535156, "loss": 0.4821, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.15206654369831085, "rewards/margins": 0.7430660724639893, "rewards/rejected": -0.8951326012611389, "step": 621 }, { "epoch": 0.10412147505422993, "grad_norm": 17.643850326538086, "learning_rate": 1.8958785249457703e-05, "logits/chosen": -0.5300541520118713, "logits/rejected": -0.5238021016120911, "logps/chosen": -84.51563262939453, "logps/rejected": -120.17327880859375, "loss": 0.6253, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5578721761703491, "rewards/margins": 1.4028035402297974, "rewards/rejected": -1.960675835609436, "step": 624 }, { "epoch": 0.10462205906891374, "grad_norm": 10.787839889526367, "learning_rate": 1.8953779409310864e-05, "logits/chosen": -0.47638139128685, "logits/rejected": -0.4904251992702484, "logps/chosen": -66.2474136352539, "logps/rejected": -78.051025390625, "loss": 0.458, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.44273272156715393, "rewards/margins": 0.2698267698287964, "rewards/rejected": -0.7125595211982727, "step": 627 }, { "epoch": 0.10512264308359753, "grad_norm": 16.385814666748047, "learning_rate": 1.8948773569164026e-05, "logits/chosen": -0.4239307940006256, "logits/rejected": -0.45467790961265564, "logps/chosen": -60.61561965942383, "logps/rejected": -96.32140350341797, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 0.28840065002441406, "rewards/margins": 2.4299323558807373, "rewards/rejected": -2.1415317058563232, "step": 630 }, { "epoch": 0.10562322709828133, "grad_norm": 14.9242582321167, "learning_rate": 1.8943767729017188e-05, "logits/chosen": -0.36888745427131653, "logits/rejected": -0.3845241963863373, "logps/chosen": -53.692935943603516, "logps/rejected": -91.45345306396484, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": -0.5193760991096497, "rewards/margins": -0.7049927115440369, "rewards/rejected": 0.18561656773090363, "step": 633 }, { "epoch": 0.10612381111296512, "grad_norm": 21.892501831054688, "learning_rate": 1.893876188887035e-05, "logits/chosen": -0.4325160086154938, "logits/rejected": -0.5333804488182068, "logps/chosen": -42.129241943359375, "logps/rejected": -124.4920883178711, "loss": 0.3838, "rewards/accuracies": 1.0, "rewards/chosen": 0.2845504581928253, "rewards/margins": 1.028039813041687, "rewards/rejected": -0.7434893250465393, "step": 636 }, { "epoch": 0.10662439512764893, "grad_norm": 11.676685333251953, "learning_rate": 1.893375604872351e-05, "logits/chosen": -0.6130569577217102, "logits/rejected": -0.6205433011054993, "logps/chosen": -114.83296966552734, "logps/rejected": -106.45601654052734, "loss": 0.71, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5767303109169006, "rewards/margins": -0.15580682456493378, "rewards/rejected": -0.42092347145080566, "step": 639 }, { "epoch": 0.10712497914233272, "grad_norm": 25.814252853393555, "learning_rate": 1.8928750208576673e-05, "logits/chosen": -0.6238194108009338, "logits/rejected": -0.5944227576255798, "logps/chosen": -112.97637176513672, "logps/rejected": -92.75333404541016, "loss": 0.622, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9483585357666016, "rewards/margins": -0.058697741478681564, "rewards/rejected": -0.8896607756614685, "step": 642 }, { "epoch": 0.10762556315701652, "grad_norm": 23.232511520385742, "learning_rate": 1.892374436842984e-05, "logits/chosen": -0.6769110560417175, "logits/rejected": -0.6837050318717957, "logps/chosen": -65.14563751220703, "logps/rejected": -71.30071258544922, "loss": 0.5548, "rewards/accuracies": 1.0, "rewards/chosen": 0.5391027331352234, "rewards/margins": 0.703230082988739, "rewards/rejected": -0.16412736475467682, "step": 645 }, { "epoch": 0.10812614717170031, "grad_norm": 19.26404571533203, "learning_rate": 1.8918738528282997e-05, "logits/chosen": -0.600271463394165, "logits/rejected": -0.5944409370422363, "logps/chosen": -49.07125473022461, "logps/rejected": -40.83254623413086, "loss": 0.5744, "rewards/accuracies": 0.0, "rewards/chosen": 0.23898263275623322, "rewards/margins": -0.49768149852752686, "rewards/rejected": 0.7366641163825989, "step": 648 }, { "epoch": 0.10862673118638412, "grad_norm": 9.12114143371582, "learning_rate": 1.891373268813616e-05, "logits/chosen": -0.56306391954422, "logits/rejected": -0.6150545477867126, "logps/chosen": -66.56282806396484, "logps/rejected": -108.70563507080078, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": -0.14290234446525574, "rewards/margins": 1.0228627920150757, "rewards/rejected": -1.1657651662826538, "step": 651 }, { "epoch": 0.10912731520106791, "grad_norm": 21.80184555053711, "learning_rate": 1.8908726847989324e-05, "logits/chosen": -0.5150684118270874, "logits/rejected": -0.468707799911499, "logps/chosen": -113.18859100341797, "logps/rejected": -127.10428619384766, "loss": 0.7548, "rewards/accuracies": 0.0, "rewards/chosen": -1.1306525468826294, "rewards/margins": -0.9406844973564148, "rewards/rejected": -0.18996810913085938, "step": 654 }, { "epoch": 0.10962789921575171, "grad_norm": 24.77217674255371, "learning_rate": 1.8903721007842482e-05, "logits/chosen": -0.3286081850528717, "logits/rejected": -0.39723050594329834, "logps/chosen": -63.773983001708984, "logps/rejected": -180.7831268310547, "loss": 0.7857, "rewards/accuracies": 1.0, "rewards/chosen": 0.4344474971294403, "rewards/margins": 3.3792362213134766, "rewards/rejected": -2.944788694381714, "step": 657 }, { "epoch": 0.1101284832304355, "grad_norm": 35.077125549316406, "learning_rate": 1.8898715167695647e-05, "logits/chosen": -0.6610751152038574, "logits/rejected": -0.6171354055404663, "logps/chosen": -107.7786636352539, "logps/rejected": -79.0728988647461, "loss": 0.7842, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2858022451400757, "rewards/margins": 0.30018866062164307, "rewards/rejected": -1.5859909057617188, "step": 660 }, { "epoch": 0.11062906724511931, "grad_norm": 23.758853912353516, "learning_rate": 1.889370932754881e-05, "logits/chosen": -0.5730631351470947, "logits/rejected": -0.5064703822135925, "logps/chosen": -120.2728500366211, "logps/rejected": -102.1275863647461, "loss": 0.5525, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.33452704548835754, "rewards/margins": 0.3983805179595947, "rewards/rejected": -0.06385345757007599, "step": 663 }, { "epoch": 0.1111296512598031, "grad_norm": 14.008994102478027, "learning_rate": 1.888870348740197e-05, "logits/chosen": -0.6654678583145142, "logits/rejected": -0.6963537335395813, "logps/chosen": -113.46001434326172, "logps/rejected": -103.4555892944336, "loss": 0.6776, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.279384970664978, "rewards/margins": -0.23010246455669403, "rewards/rejected": -1.049282431602478, "step": 666 }, { "epoch": 0.1116302352744869, "grad_norm": 17.57147789001465, "learning_rate": 1.8883697647255133e-05, "logits/chosen": -0.27743998169898987, "logits/rejected": -0.3307076692581177, "logps/chosen": -30.180877685546875, "logps/rejected": -103.73648834228516, "loss": 0.7691, "rewards/accuracies": 1.0, "rewards/chosen": 0.7491443753242493, "rewards/margins": 1.744162678718567, "rewards/rejected": -0.9950183033943176, "step": 669 }, { "epoch": 0.1121308192891707, "grad_norm": 16.87297248840332, "learning_rate": 1.8878691807108295e-05, "logits/chosen": -0.5360866189002991, "logits/rejected": -0.4647587239742279, "logps/chosen": -143.08494567871094, "logps/rejected": -111.04397583007812, "loss": 0.751, "rewards/accuracies": 1.0, "rewards/chosen": -0.37797048687934875, "rewards/margins": 1.0791672468185425, "rewards/rejected": -1.4571377038955688, "step": 672 }, { "epoch": 0.1126314033038545, "grad_norm": 22.321529388427734, "learning_rate": 1.8873685966961456e-05, "logits/chosen": -0.4223831593990326, "logits/rejected": -0.483290433883667, "logps/chosen": -44.243408203125, "logps/rejected": -97.61200714111328, "loss": 0.6399, "rewards/accuracies": 1.0, "rewards/chosen": -0.040927570313215256, "rewards/margins": 1.4126309156417847, "rewards/rejected": -1.4535584449768066, "step": 675 }, { "epoch": 0.1131319873185383, "grad_norm": 29.320951461791992, "learning_rate": 1.8868680126814618e-05, "logits/chosen": -0.37256908416748047, "logits/rejected": -0.36957406997680664, "logps/chosen": -109.53466796875, "logps/rejected": -91.56647491455078, "loss": 0.6173, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.1971263885498047, "rewards/margins": 1.0518029928207397, "rewards/rejected": -1.2489293813705444, "step": 678 }, { "epoch": 0.11363257133322209, "grad_norm": 15.37810230255127, "learning_rate": 1.886367428666778e-05, "logits/chosen": -0.6804950833320618, "logits/rejected": -0.646766185760498, "logps/chosen": -99.68496704101562, "logps/rejected": -73.69520568847656, "loss": 0.5058, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5416730046272278, "rewards/margins": 0.2645528018474579, "rewards/rejected": -0.8062257766723633, "step": 681 }, { "epoch": 0.11413315534790588, "grad_norm": 25.711824417114258, "learning_rate": 1.8858668446520942e-05, "logits/chosen": -0.5172246098518372, "logits/rejected": -0.48563846945762634, "logps/chosen": -101.94820404052734, "logps/rejected": -90.2385025024414, "loss": 0.8943, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.02036283351480961, "rewards/margins": 1.0310004949569702, "rewards/rejected": -1.0513633489608765, "step": 684 }, { "epoch": 0.11463373936258969, "grad_norm": 5.8026204109191895, "learning_rate": 1.8853662606374107e-05, "logits/chosen": -0.6122124195098877, "logits/rejected": -0.6407455801963806, "logps/chosen": -27.88566780090332, "logps/rejected": -84.46724700927734, "loss": 0.3757, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3012407720088959, "rewards/margins": 1.8242281675338745, "rewards/rejected": -1.5229873657226562, "step": 687 }, { "epoch": 0.11513432337727349, "grad_norm": 16.757064819335938, "learning_rate": 1.8848656766227265e-05, "logits/chosen": -0.6797816753387451, "logits/rejected": -0.676175057888031, "logps/chosen": -92.97174072265625, "logps/rejected": -76.48322296142578, "loss": 0.4639, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2044517546892166, "rewards/margins": 0.5634739995002747, "rewards/rejected": -0.35902225971221924, "step": 690 }, { "epoch": 0.11563490739195728, "grad_norm": 25.67753791809082, "learning_rate": 1.8843650926080427e-05, "logits/chosen": -0.47079721093177795, "logits/rejected": -0.4591124355792999, "logps/chosen": -186.6246795654297, "logps/rejected": -141.90496826171875, "loss": 0.6532, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.46344196796417236, "rewards/margins": 0.2892057001590729, "rewards/rejected": -0.7526476979255676, "step": 693 }, { "epoch": 0.11613549140664108, "grad_norm": 11.32342529296875, "learning_rate": 1.8838645085933592e-05, "logits/chosen": -0.4787558615207672, "logits/rejected": -0.4605957567691803, "logps/chosen": -104.2638168334961, "logps/rejected": -110.91304779052734, "loss": 0.6562, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.24632619321346283, "rewards/margins": 0.9706711173057556, "rewards/rejected": -0.7243449091911316, "step": 696 }, { "epoch": 0.11663607542132488, "grad_norm": 19.042165756225586, "learning_rate": 1.883363924578675e-05, "logits/chosen": -0.34223827719688416, "logits/rejected": -0.35646024346351624, "logps/chosen": -72.27471160888672, "logps/rejected": -79.89682006835938, "loss": 0.7923, "rewards/accuracies": 1.0, "rewards/chosen": 0.6282352805137634, "rewards/margins": 0.9175863265991211, "rewards/rejected": -0.2893510162830353, "step": 699 }, { "epoch": 0.11713665943600868, "grad_norm": 22.609420776367188, "learning_rate": 1.8828633405639916e-05, "logits/chosen": -0.4153958261013031, "logits/rejected": -0.4764162600040436, "logps/chosen": -60.9111328125, "logps/rejected": -82.95389556884766, "loss": 0.7288, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.330389142036438, "rewards/margins": 0.5675027370452881, "rewards/rejected": -1.8978919982910156, "step": 702 }, { "epoch": 0.11763724345069247, "grad_norm": 18.427234649658203, "learning_rate": 1.8823627565493078e-05, "logits/chosen": -0.6184000968933105, "logits/rejected": -0.624178409576416, "logps/chosen": -82.18538665771484, "logps/rejected": -75.47867584228516, "loss": 0.6336, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2074354887008667, "rewards/margins": 0.3414168357849121, "rewards/rejected": -0.5488523244857788, "step": 705 }, { "epoch": 0.11813782746537627, "grad_norm": 34.81830596923828, "learning_rate": 1.8818621725346236e-05, "logits/chosen": -0.6011881232261658, "logits/rejected": -0.6437613368034363, "logps/chosen": -82.37380981445312, "logps/rejected": -104.9653091430664, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.872718334197998, "rewards/margins": 1.3242006301879883, "rewards/rejected": -0.45148229598999023, "step": 708 }, { "epoch": 0.11863841148006007, "grad_norm": 19.70418930053711, "learning_rate": 1.88136158851994e-05, "logits/chosen": -0.5473688244819641, "logits/rejected": -0.5200638771057129, "logps/chosen": -97.05130767822266, "logps/rejected": -66.58451843261719, "loss": 0.7398, "rewards/accuracies": 1.0, "rewards/chosen": 0.37968015670776367, "rewards/margins": 1.3451212644577026, "rewards/rejected": -0.9654411673545837, "step": 711 }, { "epoch": 0.11913899549474387, "grad_norm": 14.31709098815918, "learning_rate": 1.8808610045052563e-05, "logits/chosen": -0.4812484681606293, "logits/rejected": -0.5871054530143738, "logps/chosen": -90.74539947509766, "logps/rejected": -131.30662536621094, "loss": 0.5848, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.045516591519117355, "rewards/margins": 0.11442112922668457, "rewards/rejected": -0.06890455633401871, "step": 714 }, { "epoch": 0.11963957950942766, "grad_norm": 16.909603118896484, "learning_rate": 1.8803604204905725e-05, "logits/chosen": -0.4349532127380371, "logits/rejected": -0.48690342903137207, "logps/chosen": -91.98128509521484, "logps/rejected": -116.98430633544922, "loss": 0.5295, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.10643819719552994, "rewards/margins": 1.1254172325134277, "rewards/rejected": -1.0189789533615112, "step": 717 }, { "epoch": 0.12014016352411146, "grad_norm": 8.336536407470703, "learning_rate": 1.8798598364758887e-05, "logits/chosen": -0.5612412095069885, "logits/rejected": -0.5890812277793884, "logps/chosen": -57.37919235229492, "logps/rejected": -103.3601303100586, "loss": 0.5566, "rewards/accuracies": 1.0, "rewards/chosen": 0.39263948798179626, "rewards/margins": 2.3004140853881836, "rewards/rejected": -1.907774567604065, "step": 720 }, { "epoch": 0.12064074753879526, "grad_norm": 20.06405258178711, "learning_rate": 1.879359252461205e-05, "logits/chosen": -0.2395641952753067, "logits/rejected": -0.2929794490337372, "logps/chosen": -67.70130920410156, "logps/rejected": -94.44605255126953, "loss": 0.6131, "rewards/accuracies": 1.0, "rewards/chosen": -1.0863772630691528, "rewards/margins": 0.8537542223930359, "rewards/rejected": -1.9401315450668335, "step": 723 }, { "epoch": 0.12114133155347906, "grad_norm": 10.544294357299805, "learning_rate": 1.878858668446521e-05, "logits/chosen": -0.2931189239025116, "logits/rejected": -0.3297370970249176, "logps/chosen": -80.29792785644531, "logps/rejected": -120.59407806396484, "loss": 0.5914, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8119540214538574, "rewards/margins": 0.08243969082832336, "rewards/rejected": -0.8943936824798584, "step": 726 }, { "epoch": 0.12164191556816285, "grad_norm": 12.624857902526855, "learning_rate": 1.8783580844318372e-05, "logits/chosen": -0.37726712226867676, "logits/rejected": -0.43502482771873474, "logps/chosen": -61.9097900390625, "logps/rejected": -103.0181655883789, "loss": 0.6013, "rewards/accuracies": 1.0, "rewards/chosen": 0.5838953852653503, "rewards/margins": 1.4186969995498657, "rewards/rejected": -0.8348016142845154, "step": 729 }, { "epoch": 0.12214249958284665, "grad_norm": 7.9448442459106445, "learning_rate": 1.8778575004171534e-05, "logits/chosen": -0.6319306492805481, "logits/rejected": -0.6416140794754028, "logps/chosen": -54.45139694213867, "logps/rejected": -91.27648162841797, "loss": 0.7046, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.22403423488140106, "rewards/margins": 0.21906942129135132, "rewards/rejected": 0.004964818712323904, "step": 732 }, { "epoch": 0.12264308359753046, "grad_norm": 11.335164070129395, "learning_rate": 1.8773569164024696e-05, "logits/chosen": -0.40222564339637756, "logits/rejected": -0.41539087891578674, "logps/chosen": -86.27828216552734, "logps/rejected": -94.38814544677734, "loss": 0.5014, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3804335594177246, "rewards/margins": 0.1254851073026657, "rewards/rejected": -0.5059186220169067, "step": 735 }, { "epoch": 0.12314366761221425, "grad_norm": 11.768908500671387, "learning_rate": 1.876856332387786e-05, "logits/chosen": -0.4232079088687897, "logits/rejected": -0.4373345375061035, "logps/chosen": -72.97079467773438, "logps/rejected": -49.39860153198242, "loss": 0.6271, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2810254395008087, "rewards/margins": -0.4533773362636566, "rewards/rejected": 0.17235183715820312, "step": 738 }, { "epoch": 0.12364425162689804, "grad_norm": 18.056135177612305, "learning_rate": 1.876355748373102e-05, "logits/chosen": -0.4108428955078125, "logits/rejected": -0.41424572467803955, "logps/chosen": -107.02629852294922, "logps/rejected": -76.04642486572266, "loss": 0.5784, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.38874077796936035, "rewards/margins": -0.35451555252075195, "rewards/rejected": -0.034225206822156906, "step": 741 }, { "epoch": 0.12414483564158185, "grad_norm": 20.87856674194336, "learning_rate": 1.8758551643584185e-05, "logits/chosen": -0.5007167458534241, "logits/rejected": -0.5148741602897644, "logps/chosen": -87.54584503173828, "logps/rejected": -72.95510864257812, "loss": 0.6195, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.2568168640136719, "rewards/margins": -0.29381123185157776, "rewards/rejected": 0.03699437901377678, "step": 744 }, { "epoch": 0.12464541965626565, "grad_norm": 21.65856170654297, "learning_rate": 1.8753545803437346e-05, "logits/chosen": -0.5718379616737366, "logits/rejected": -0.5632105469703674, "logps/chosen": -84.9299545288086, "logps/rejected": -88.18753814697266, "loss": 0.7191, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.04705098271369934, "rewards/margins": 1.5617021322250366, "rewards/rejected": -1.6087532043457031, "step": 747 }, { "epoch": 0.12514600367094944, "grad_norm": 14.03543758392334, "learning_rate": 1.8748539963290505e-05, "logits/chosen": -0.5257697105407715, "logits/rejected": -0.5115090608596802, "logps/chosen": -69.64279174804688, "logps/rejected": -78.79129791259766, "loss": 0.5677, "rewards/accuracies": 1.0, "rewards/chosen": 0.17533405125141144, "rewards/margins": 1.454505443572998, "rewards/rejected": -1.279171347618103, "step": 750 }, { "epoch": 0.12564658768563325, "grad_norm": 8.228617668151855, "learning_rate": 1.874353412314367e-05, "logits/chosen": -0.6290264129638672, "logits/rejected": -0.5807158350944519, "logps/chosen": -136.25619506835938, "logps/rejected": -81.48441314697266, "loss": 0.5689, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8243346810340881, "rewards/margins": -0.13106484711170197, "rewards/rejected": -0.6932699084281921, "step": 753 }, { "epoch": 0.12614717170031703, "grad_norm": 21.74030113220215, "learning_rate": 1.8738528282996832e-05, "logits/chosen": -0.44006696343421936, "logits/rejected": -0.500983715057373, "logps/chosen": -62.70356369018555, "logps/rejected": -134.52989196777344, "loss": 0.7731, "rewards/accuracies": 0.0, "rewards/chosen": -0.8165143132209778, "rewards/margins": -0.39403200149536133, "rewards/rejected": -0.42248234152793884, "step": 756 }, { "epoch": 0.12664775571500084, "grad_norm": 27.5025634765625, "learning_rate": 1.8733522442849994e-05, "logits/chosen": -0.2517527937889099, "logits/rejected": -0.27915462851524353, "logps/chosen": -62.45344543457031, "logps/rejected": -81.22643280029297, "loss": 0.7337, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9022436738014221, "rewards/margins": 0.35038283467292786, "rewards/rejected": -1.2526265382766724, "step": 759 }, { "epoch": 0.12714833972968465, "grad_norm": 26.607784271240234, "learning_rate": 1.8728516602703155e-05, "logits/chosen": -0.51470547914505, "logits/rejected": -0.5252942442893982, "logps/chosen": -92.90225219726562, "logps/rejected": -107.36666107177734, "loss": 0.7528, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.18842411041259766, "rewards/margins": 0.9556513428688049, "rewards/rejected": -1.1440753936767578, "step": 762 }, { "epoch": 0.12764892374436843, "grad_norm": 22.021034240722656, "learning_rate": 1.8723510762556317e-05, "logits/chosen": -0.2683875262737274, "logits/rejected": -0.38781416416168213, "logps/chosen": -88.5538558959961, "logps/rejected": -123.869140625, "loss": 0.4841, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.47608694434165955, "rewards/margins": 0.2713302671909332, "rewards/rejected": -0.7474172115325928, "step": 765 }, { "epoch": 0.12814950775905223, "grad_norm": 6.771920680999756, "learning_rate": 1.871850492240948e-05, "logits/chosen": -0.6497290134429932, "logits/rejected": -0.6226914525032043, "logps/chosen": -63.80961608886719, "logps/rejected": -58.70027542114258, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": 0.5977115631103516, "rewards/margins": 1.348021149635315, "rewards/rejected": -0.7503094673156738, "step": 768 }, { "epoch": 0.12865009177373601, "grad_norm": 10.980789184570312, "learning_rate": 1.871349908226264e-05, "logits/chosen": -0.49115753173828125, "logits/rejected": -0.5377857089042664, "logps/chosen": -81.74817657470703, "logps/rejected": -161.2404022216797, "loss": 0.725, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.3114008605480194, "rewards/margins": 0.40167292952537537, "rewards/rejected": -0.71307373046875, "step": 771 }, { "epoch": 0.12915067578841982, "grad_norm": 26.00565528869629, "learning_rate": 1.8708493242115802e-05, "logits/chosen": -0.17765061557292938, "logits/rejected": -0.32802531123161316, "logps/chosen": -77.52609252929688, "logps/rejected": -155.7930450439453, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 0.25434061884880066, "rewards/margins": 1.452053427696228, "rewards/rejected": -1.197712779045105, "step": 774 }, { "epoch": 0.12965125980310363, "grad_norm": 5.796546459197998, "learning_rate": 1.8703487401968964e-05, "logits/chosen": -0.3125587999820709, "logits/rejected": -0.30522891879081726, "logps/chosen": -69.61959075927734, "logps/rejected": -91.59154510498047, "loss": 0.6017, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.555505096912384, "rewards/margins": -0.12974344193935394, "rewards/rejected": -0.42576169967651367, "step": 777 }, { "epoch": 0.1301518438177874, "grad_norm": 16.645755767822266, "learning_rate": 1.869848156182213e-05, "logits/chosen": -0.35112640261650085, "logits/rejected": -0.4280388355255127, "logps/chosen": -47.74639892578125, "logps/rejected": -109.87158203125, "loss": 0.4375, "rewards/accuracies": 1.0, "rewards/chosen": -0.1617245227098465, "rewards/margins": 1.0952426195144653, "rewards/rejected": -1.2569671869277954, "step": 780 }, { "epoch": 0.13065242783247122, "grad_norm": 9.75275993347168, "learning_rate": 1.8693475721675288e-05, "logits/chosen": -0.3398604393005371, "logits/rejected": -0.39073920249938965, "logps/chosen": -22.82122802734375, "logps/rejected": -86.8133316040039, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 0.31413018703460693, "rewards/margins": 0.6548935770988464, "rewards/rejected": -0.3407633602619171, "step": 783 }, { "epoch": 0.13115301184715503, "grad_norm": 21.30471420288086, "learning_rate": 1.868846988152845e-05, "logits/chosen": -0.33316436409950256, "logits/rejected": -0.33811697363853455, "logps/chosen": -76.48616790771484, "logps/rejected": -71.9807357788086, "loss": 0.6171, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6756222248077393, "rewards/margins": 0.4008607864379883, "rewards/rejected": -1.076482892036438, "step": 786 }, { "epoch": 0.1316535958618388, "grad_norm": 17.462339401245117, "learning_rate": 1.8683464041381615e-05, "logits/chosen": -0.39947831630706787, "logits/rejected": -0.3788091242313385, "logps/chosen": -89.89527130126953, "logps/rejected": -78.93167877197266, "loss": 0.4404, "rewards/accuracies": 1.0, "rewards/chosen": -0.22326432168483734, "rewards/margins": 1.0019598007202148, "rewards/rejected": -1.2252241373062134, "step": 789 }, { "epoch": 0.13215417987652262, "grad_norm": 12.629739761352539, "learning_rate": 1.8678458201234773e-05, "logits/chosen": -0.6178362965583801, "logits/rejected": -0.6121547818183899, "logps/chosen": -86.032470703125, "logps/rejected": -111.13970947265625, "loss": 0.523, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7869542241096497, "rewards/margins": 0.12202310562133789, "rewards/rejected": -0.9089773297309875, "step": 792 }, { "epoch": 0.1326547638912064, "grad_norm": 17.738601684570312, "learning_rate": 1.867345236108794e-05, "logits/chosen": -0.7131072878837585, "logits/rejected": -0.698513925075531, "logps/chosen": -73.04058837890625, "logps/rejected": -77.99017333984375, "loss": 0.6239, "rewards/accuracies": 1.0, "rewards/chosen": 0.05140380561351776, "rewards/margins": 0.8834379315376282, "rewards/rejected": -0.832034170627594, "step": 795 }, { "epoch": 0.1331553479058902, "grad_norm": 18.265419006347656, "learning_rate": 1.86684465209411e-05, "logits/chosen": -0.3725244998931885, "logits/rejected": -0.448500394821167, "logps/chosen": -54.46723556518555, "logps/rejected": -127.08043670654297, "loss": 0.3903, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7118756175041199, "rewards/margins": 1.9870151281356812, "rewards/rejected": -2.6988906860351562, "step": 798 }, { "epoch": 0.133655931920574, "grad_norm": 9.856668472290039, "learning_rate": 1.8663440680794262e-05, "logits/chosen": -0.5587300658226013, "logits/rejected": -0.5860275030136108, "logps/chosen": -69.88325500488281, "logps/rejected": -116.93888092041016, "loss": 0.4709, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4562489986419678, "rewards/margins": 1.9587591886520386, "rewards/rejected": -2.415008306503296, "step": 801 }, { "epoch": 0.1341565159352578, "grad_norm": 9.796996116638184, "learning_rate": 1.8658434840647424e-05, "logits/chosen": -0.7318238615989685, "logits/rejected": -0.7031087875366211, "logps/chosen": -52.58591842651367, "logps/rejected": -82.06627655029297, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": -0.06500611454248428, "rewards/margins": 1.4131988286972046, "rewards/rejected": -1.4782048463821411, "step": 804 }, { "epoch": 0.1346570999499416, "grad_norm": 18.80247688293457, "learning_rate": 1.8653429000500586e-05, "logits/chosen": -0.48360368609428406, "logits/rejected": -0.5174424052238464, "logps/chosen": -87.7242202758789, "logps/rejected": -141.6905059814453, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": 0.23018644750118256, "rewards/margins": 2.444551706314087, "rewards/rejected": -2.214365243911743, "step": 807 }, { "epoch": 0.1351576839646254, "grad_norm": 17.693037033081055, "learning_rate": 1.8648423160353747e-05, "logits/chosen": -0.47083258628845215, "logits/rejected": -0.49725380539894104, "logps/chosen": -116.91394805908203, "logps/rejected": -127.21715545654297, "loss": 0.754, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6143321394920349, "rewards/margins": 1.2171534299850464, "rewards/rejected": -1.831485390663147, "step": 810 }, { "epoch": 0.1356582679793092, "grad_norm": 18.493255615234375, "learning_rate": 1.864341732020691e-05, "logits/chosen": -0.6701855659484863, "logits/rejected": -0.6641104221343994, "logps/chosen": -83.72863006591797, "logps/rejected": -101.017822265625, "loss": 0.7266, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6935867667198181, "rewards/margins": -0.2700332701206207, "rewards/rejected": -0.423553466796875, "step": 813 }, { "epoch": 0.136158851993993, "grad_norm": 8.883771896362305, "learning_rate": 1.863841148006007e-05, "logits/chosen": -0.7101079821586609, "logits/rejected": -0.6699175834655762, "logps/chosen": -109.95149993896484, "logps/rejected": -73.68907928466797, "loss": 0.4411, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.26763445138931274, "rewards/margins": 0.36565184593200684, "rewards/rejected": -0.6332862973213196, "step": 816 }, { "epoch": 0.13665943600867678, "grad_norm": 21.88258934020996, "learning_rate": 1.8633405639913233e-05, "logits/chosen": -0.5239258408546448, "logits/rejected": -0.5526931881904602, "logps/chosen": -55.51579666137695, "logps/rejected": -111.79915618896484, "loss": 0.8558, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.06721071153879166, "rewards/margins": 1.8055092096328735, "rewards/rejected": -1.8727201223373413, "step": 819 }, { "epoch": 0.13716002002336058, "grad_norm": 26.1844539642334, "learning_rate": 1.8628399799766395e-05, "logits/chosen": -0.48460760712623596, "logits/rejected": -0.5832151174545288, "logps/chosen": -54.04063415527344, "logps/rejected": -136.1400604248047, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 0.6246482729911804, "rewards/margins": 2.457366943359375, "rewards/rejected": -1.8327187299728394, "step": 822 }, { "epoch": 0.1376606040380444, "grad_norm": 9.291252136230469, "learning_rate": 1.8623393959619556e-05, "logits/chosen": -0.5160666704177856, "logits/rejected": -0.5272957682609558, "logps/chosen": -162.46405029296875, "logps/rejected": -130.86920166015625, "loss": 0.5967, "rewards/accuracies": 1.0, "rewards/chosen": -0.19641445577144623, "rewards/margins": 1.9482213258743286, "rewards/rejected": -2.1446359157562256, "step": 825 }, { "epoch": 0.13816118805272817, "grad_norm": 11.66380500793457, "learning_rate": 1.8618388119472718e-05, "logits/chosen": -0.7259472012519836, "logits/rejected": -0.7333776950836182, "logps/chosen": -58.57403564453125, "logps/rejected": -77.18624114990234, "loss": 0.4032, "rewards/accuracies": 1.0, "rewards/chosen": 0.022792277857661247, "rewards/margins": 1.2343014478683472, "rewards/rejected": -1.2115092277526855, "step": 828 }, { "epoch": 0.13866177206741198, "grad_norm": 8.00262451171875, "learning_rate": 1.8613382279325883e-05, "logits/chosen": -0.27613794803619385, "logits/rejected": -0.26665446162223816, "logps/chosen": -61.490474700927734, "logps/rejected": -44.50395584106445, "loss": 0.5273, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.857166051864624, "rewards/margins": -0.3388482630252838, "rewards/rejected": -0.518317699432373, "step": 831 }, { "epoch": 0.1391623560820958, "grad_norm": 27.637847900390625, "learning_rate": 1.8608376439179042e-05, "logits/chosen": -0.6381475329399109, "logits/rejected": -0.5827834606170654, "logps/chosen": -128.20603942871094, "logps/rejected": -102.74443817138672, "loss": 0.989, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5836301445960999, "rewards/margins": 0.6874679923057556, "rewards/rejected": -1.271098256111145, "step": 834 }, { "epoch": 0.13966294009677957, "grad_norm": 16.665096282958984, "learning_rate": 1.8603370599032207e-05, "logits/chosen": -0.581046462059021, "logits/rejected": -0.5875861048698425, "logps/chosen": -75.22843170166016, "logps/rejected": -65.35602569580078, "loss": 0.7622, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.40258684754371643, "rewards/margins": 0.6500387191772461, "rewards/rejected": -1.0526256561279297, "step": 837 }, { "epoch": 0.14016352411146338, "grad_norm": 14.777203559875488, "learning_rate": 1.859836475888537e-05, "logits/chosen": -0.5791838765144348, "logits/rejected": -0.6041818261146545, "logps/chosen": -54.23915100097656, "logps/rejected": -86.52245330810547, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 0.14581897854804993, "rewards/margins": 0.5446891188621521, "rewards/rejected": -0.3988700807094574, "step": 840 }, { "epoch": 0.14066410812614716, "grad_norm": 12.242483139038086, "learning_rate": 1.8593358918738527e-05, "logits/chosen": -0.642367422580719, "logits/rejected": -0.6282162070274353, "logps/chosen": -90.19892120361328, "logps/rejected": -57.82357406616211, "loss": 0.9107, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2715650796890259, "rewards/margins": -1.5237077474594116, "rewards/rejected": 0.25214266777038574, "step": 843 }, { "epoch": 0.14116469214083097, "grad_norm": 22.594552993774414, "learning_rate": 1.8588353078591692e-05, "logits/chosen": -0.6452459692955017, "logits/rejected": -0.625758707523346, "logps/chosen": -59.54782485961914, "logps/rejected": -65.7889175415039, "loss": 0.8166, "rewards/accuracies": 1.0, "rewards/chosen": 0.572325587272644, "rewards/margins": 1.05320405960083, "rewards/rejected": -0.48087847232818604, "step": 846 }, { "epoch": 0.14166527615551477, "grad_norm": 20.265073776245117, "learning_rate": 1.8583347238444854e-05, "logits/chosen": -0.5109583735466003, "logits/rejected": -0.6134198307991028, "logps/chosen": -81.52173614501953, "logps/rejected": -110.8116226196289, "loss": 0.7432, "rewards/accuracies": 1.0, "rewards/chosen": -0.389065146446228, "rewards/margins": 0.9058201313018799, "rewards/rejected": -1.294885277748108, "step": 849 }, { "epoch": 0.14216586017019855, "grad_norm": 12.319670677185059, "learning_rate": 1.8578341398298016e-05, "logits/chosen": -0.38460811972618103, "logits/rejected": -0.37740007042884827, "logps/chosen": -77.98944854736328, "logps/rejected": -72.29666900634766, "loss": 0.5668, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5448551177978516, "rewards/margins": -0.03360303118824959, "rewards/rejected": -0.5112521052360535, "step": 852 }, { "epoch": 0.14266644418488236, "grad_norm": 18.84042739868164, "learning_rate": 1.8573335558151178e-05, "logits/chosen": -0.5016024112701416, "logits/rejected": -0.4781442880630493, "logps/chosen": -108.31281280517578, "logps/rejected": -93.91124725341797, "loss": 0.6218, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3097817897796631, "rewards/margins": 0.10319530963897705, "rewards/rejected": -0.41297709941864014, "step": 855 }, { "epoch": 0.14316702819956617, "grad_norm": 17.1103572845459, "learning_rate": 1.856832971800434e-05, "logits/chosen": -0.5056658387184143, "logits/rejected": -0.49928244948387146, "logps/chosen": -82.92904663085938, "logps/rejected": -101.8731460571289, "loss": 0.402, "rewards/accuracies": 1.0, "rewards/chosen": -0.42308685183525085, "rewards/margins": 0.5874548554420471, "rewards/rejected": -1.0105417966842651, "step": 858 }, { "epoch": 0.14366761221424995, "grad_norm": 12.19447135925293, "learning_rate": 1.85633238778575e-05, "logits/chosen": -0.6734912395477295, "logits/rejected": -0.7351929545402527, "logps/chosen": -57.33343505859375, "logps/rejected": -107.8826904296875, "loss": 0.5049, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.07309819012880325, "rewards/margins": 1.2277153730392456, "rewards/rejected": -1.154617190361023, "step": 861 }, { "epoch": 0.14416819622893376, "grad_norm": 19.979320526123047, "learning_rate": 1.8558318037710663e-05, "logits/chosen": -0.3740997314453125, "logits/rejected": -0.2941635549068451, "logps/chosen": -134.1952362060547, "logps/rejected": -72.84314727783203, "loss": 0.6182, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1527305841445923, "rewards/margins": -0.1569884568452835, "rewards/rejected": -0.9957420825958252, "step": 864 }, { "epoch": 0.14466878024361757, "grad_norm": 24.6162052154541, "learning_rate": 1.8553312197563825e-05, "logits/chosen": -0.5161963105201721, "logits/rejected": -0.5004764199256897, "logps/chosen": -66.65927124023438, "logps/rejected": -45.8681755065918, "loss": 0.5269, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.5432066917419434, "rewards/margins": 0.646484375, "rewards/rejected": -0.10327772051095963, "step": 867 }, { "epoch": 0.14516936425830135, "grad_norm": 11.302977561950684, "learning_rate": 1.8548306357416987e-05, "logits/chosen": -0.7764948010444641, "logits/rejected": -0.7912716865539551, "logps/chosen": -93.79891967773438, "logps/rejected": -110.34454345703125, "loss": 0.4909, "rewards/accuracies": 1.0, "rewards/chosen": -0.5839260220527649, "rewards/margins": 1.6878995895385742, "rewards/rejected": -2.2718255519866943, "step": 870 }, { "epoch": 0.14566994827298516, "grad_norm": 8.052953720092773, "learning_rate": 1.8543300517270152e-05, "logits/chosen": -0.5747771263122559, "logits/rejected": -0.5629491209983826, "logps/chosen": -88.48332977294922, "logps/rejected": -76.55240631103516, "loss": 0.3736, "rewards/accuracies": 1.0, "rewards/chosen": -0.5483777523040771, "rewards/margins": 0.4590069353580475, "rewards/rejected": -1.0073846578598022, "step": 873 }, { "epoch": 0.14617053228766894, "grad_norm": 40.696022033691406, "learning_rate": 1.853829467712331e-05, "logits/chosen": -0.43258795142173767, "logits/rejected": -0.45977982878685, "logps/chosen": -69.79147338867188, "logps/rejected": -74.83924102783203, "loss": 0.8351, "rewards/accuracies": 1.0, "rewards/chosen": -0.25082334876060486, "rewards/margins": 0.978928804397583, "rewards/rejected": -1.2297521829605103, "step": 876 }, { "epoch": 0.14667111630235274, "grad_norm": 25.45183563232422, "learning_rate": 1.8533288836976472e-05, "logits/chosen": -0.5202443599700928, "logits/rejected": -0.5362161993980408, "logps/chosen": -102.99857330322266, "logps/rejected": -123.65129852294922, "loss": 0.4711, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.02571690082550049, "rewards/margins": 1.463557243347168, "rewards/rejected": -1.4892741441726685, "step": 879 }, { "epoch": 0.14717170031703655, "grad_norm": 17.05387306213379, "learning_rate": 1.8528282996829637e-05, "logits/chosen": -0.5953119397163391, "logits/rejected": -0.6110121607780457, "logps/chosen": -97.16352081298828, "logps/rejected": -115.71337890625, "loss": 0.5472, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11631635576486588, "rewards/margins": 1.6225537061691284, "rewards/rejected": -1.5062373876571655, "step": 882 }, { "epoch": 0.14767228433172033, "grad_norm": 16.395530700683594, "learning_rate": 1.8523277156682796e-05, "logits/chosen": -0.3840501308441162, "logits/rejected": -0.39859893918037415, "logps/chosen": -74.48568725585938, "logps/rejected": -125.0333023071289, "loss": 0.4989, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2707250118255615, "rewards/margins": 0.6388404965400696, "rewards/rejected": -0.9095655083656311, "step": 885 }, { "epoch": 0.14817286834640414, "grad_norm": 10.325078964233398, "learning_rate": 1.851827131653596e-05, "logits/chosen": -0.5494861602783203, "logits/rejected": -0.5588715672492981, "logps/chosen": -109.18121337890625, "logps/rejected": -113.62027740478516, "loss": 0.3935, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2911328375339508, "rewards/margins": 0.5441632270812988, "rewards/rejected": -0.2530304193496704, "step": 888 }, { "epoch": 0.14867345236108795, "grad_norm": 18.65891456604004, "learning_rate": 1.8513265476389123e-05, "logits/chosen": -0.5554218292236328, "logits/rejected": -0.573684573173523, "logps/chosen": -137.2322998046875, "logps/rejected": -148.29299926757812, "loss": 0.4066, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.12981553375720978, "rewards/margins": 0.9834840297698975, "rewards/rejected": -1.1132994890213013, "step": 891 }, { "epoch": 0.14917403637577173, "grad_norm": 22.986228942871094, "learning_rate": 1.8508259636242284e-05, "logits/chosen": -0.5905902981758118, "logits/rejected": -0.6067942380905151, "logps/chosen": -77.8730697631836, "logps/rejected": -121.87833404541016, "loss": 0.7781, "rewards/accuracies": 1.0, "rewards/chosen": 0.4034688472747803, "rewards/margins": 1.4927419424057007, "rewards/rejected": -1.0892730951309204, "step": 894 }, { "epoch": 0.14967462039045554, "grad_norm": 16.000568389892578, "learning_rate": 1.8503253796095446e-05, "logits/chosen": -0.6043725609779358, "logits/rejected": -0.5772908329963684, "logps/chosen": -85.65276336669922, "logps/rejected": -119.2361068725586, "loss": 0.459, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.33445408940315247, "rewards/margins": 0.3575131893157959, "rewards/rejected": -0.6919673085212708, "step": 897 }, { "epoch": 0.15017520440513932, "grad_norm": 18.02143669128418, "learning_rate": 1.8498247955948608e-05, "logits/chosen": -0.6253533959388733, "logits/rejected": -0.6850559115409851, "logps/chosen": -78.16963195800781, "logps/rejected": -117.75096893310547, "loss": 0.8449, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.04616749286651611, "rewards/margins": 1.2298551797866821, "rewards/rejected": -1.1836875677108765, "step": 900 }, { "epoch": 0.15067578841982313, "grad_norm": 21.411449432373047, "learning_rate": 1.849324211580177e-05, "logits/chosen": -0.8193187117576599, "logits/rejected": -0.8252790570259094, "logps/chosen": -77.61990356445312, "logps/rejected": -93.69246673583984, "loss": 0.459, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7339780330657959, "rewards/margins": 2.6374399662017822, "rewards/rejected": -3.371417999267578, "step": 903 }, { "epoch": 0.15117637243450693, "grad_norm": 9.623788833618164, "learning_rate": 1.848823627565493e-05, "logits/chosen": -0.5803210139274597, "logits/rejected": -0.5651042461395264, "logps/chosen": -28.624948501586914, "logps/rejected": -35.66592025756836, "loss": 0.4333, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.0012823393335565925, "rewards/margins": 0.4312172830104828, "rewards/rejected": -0.43249955773353577, "step": 906 }, { "epoch": 0.15167695644919071, "grad_norm": 12.239215850830078, "learning_rate": 1.8483230435508093e-05, "logits/chosen": -0.5875744223594666, "logits/rejected": -0.678648054599762, "logps/chosen": -49.14045715332031, "logps/rejected": -149.74734497070312, "loss": 0.3713, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.02985350228846073, "rewards/margins": 1.7399048805236816, "rewards/rejected": -1.710051417350769, "step": 909 }, { "epoch": 0.15217754046387452, "grad_norm": 19.949045181274414, "learning_rate": 1.8478224595361255e-05, "logits/chosen": -0.6492641568183899, "logits/rejected": -0.5677857398986816, "logps/chosen": -149.86016845703125, "logps/rejected": -74.92477416992188, "loss": 0.5786, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7111973762512207, "rewards/margins": 1.0124095678329468, "rewards/rejected": -1.723607063293457, "step": 912 }, { "epoch": 0.15267812447855833, "grad_norm": 15.344461441040039, "learning_rate": 1.847321875521442e-05, "logits/chosen": -0.8030144572257996, "logits/rejected": -0.8000724911689758, "logps/chosen": -83.2477798461914, "logps/rejected": -61.447967529296875, "loss": 0.4221, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7260280251502991, "rewards/margins": 0.38681891560554504, "rewards/rejected": -1.1128469705581665, "step": 915 }, { "epoch": 0.1531787084932421, "grad_norm": 12.63650894165039, "learning_rate": 1.846821291506758e-05, "logits/chosen": -0.6802496314048767, "logits/rejected": -0.7509645819664001, "logps/chosen": -60.22509765625, "logps/rejected": -119.02740478515625, "loss": 0.5162, "rewards/accuracies": 1.0, "rewards/chosen": 0.557718813419342, "rewards/margins": 1.042564868927002, "rewards/rejected": -0.4848460257053375, "step": 918 }, { "epoch": 0.15367929250792592, "grad_norm": 7.7192583084106445, "learning_rate": 1.846320707492074e-05, "logits/chosen": -0.5745155811309814, "logits/rejected": -0.6913867592811584, "logps/chosen": -23.78076171875, "logps/rejected": -87.18936920166016, "loss": 0.3134, "rewards/accuracies": 1.0, "rewards/chosen": -0.22108954191207886, "rewards/margins": 1.4683104753494263, "rewards/rejected": -1.68940007686615, "step": 921 }, { "epoch": 0.1541798765226097, "grad_norm": 19.444374084472656, "learning_rate": 1.8458201234773906e-05, "logits/chosen": -0.4305586814880371, "logits/rejected": -0.39207854866981506, "logps/chosen": -162.6547393798828, "logps/rejected": -125.006591796875, "loss": 0.7265, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.19637930393219, "rewards/margins": 1.6606122255325317, "rewards/rejected": -2.856991767883301, "step": 924 }, { "epoch": 0.1546804605372935, "grad_norm": 27.28119659423828, "learning_rate": 1.8453195394627064e-05, "logits/chosen": -0.2727828323841095, "logits/rejected": -0.2797076404094696, "logps/chosen": -91.0331039428711, "logps/rejected": -119.9698486328125, "loss": 0.5678, "rewards/accuracies": 1.0, "rewards/chosen": -0.6407540440559387, "rewards/margins": 0.9714901447296143, "rewards/rejected": -1.6122442483901978, "step": 927 }, { "epoch": 0.15518104455197732, "grad_norm": 21.418869018554688, "learning_rate": 1.844818955448023e-05, "logits/chosen": -0.4847460687160492, "logits/rejected": -0.5419628620147705, "logps/chosen": -38.17502212524414, "logps/rejected": -104.96025848388672, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 0.186837837100029, "rewards/margins": 2.169434070587158, "rewards/rejected": -1.9825962781906128, "step": 930 }, { "epoch": 0.1556816285666611, "grad_norm": 39.06606674194336, "learning_rate": 1.844318371433339e-05, "logits/chosen": -0.6044460535049438, "logits/rejected": -0.580995500087738, "logps/chosen": -129.7698211669922, "logps/rejected": -73.92018127441406, "loss": 1.0945, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.235051155090332, "rewards/margins": -1.604443907737732, "rewards/rejected": -0.6306073069572449, "step": 933 }, { "epoch": 0.1561822125813449, "grad_norm": 11.453155517578125, "learning_rate": 1.8438177874186553e-05, "logits/chosen": -0.4967546761035919, "logits/rejected": -0.5186527371406555, "logps/chosen": -77.42998504638672, "logps/rejected": -100.61599731445312, "loss": 0.4796, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3400949239730835, "rewards/margins": 1.1145302057266235, "rewards/rejected": -2.454624891281128, "step": 936 }, { "epoch": 0.1566827965960287, "grad_norm": 19.22016143798828, "learning_rate": 1.8433172034039715e-05, "logits/chosen": -0.7787268161773682, "logits/rejected": -0.7881965637207031, "logps/chosen": -69.58435821533203, "logps/rejected": -80.95064544677734, "loss": 0.3415, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5361213684082031, "rewards/margins": 0.40341904759407043, "rewards/rejected": -1.9395403861999512, "step": 939 }, { "epoch": 0.1571833806107125, "grad_norm": 18.27324867248535, "learning_rate": 1.8428166193892877e-05, "logits/chosen": -0.4081279933452606, "logits/rejected": -0.44421741366386414, "logps/chosen": -75.0099105834961, "logps/rejected": -134.86090087890625, "loss": 0.5936, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1161236763000488, "rewards/margins": 0.9802024960517883, "rewards/rejected": -2.0963263511657715, "step": 942 }, { "epoch": 0.1576839646253963, "grad_norm": 2.531312942504883, "learning_rate": 1.842316035374604e-05, "logits/chosen": -0.8002307415008545, "logits/rejected": -0.7673618793487549, "logps/chosen": -104.0634994506836, "logps/rejected": -78.54243469238281, "loss": 0.5036, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.832170307636261, "rewards/margins": 1.2525324821472168, "rewards/rejected": -2.084702730178833, "step": 945 }, { "epoch": 0.15818454864008008, "grad_norm": 24.650983810424805, "learning_rate": 1.84181545135992e-05, "logits/chosen": -0.4437934458255768, "logits/rejected": -0.5519049763679504, "logps/chosen": -35.196537017822266, "logps/rejected": -151.7914276123047, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.14445368945598602, "rewards/margins": 3.963486909866333, "rewards/rejected": -3.81903338432312, "step": 948 }, { "epoch": 0.1586851326547639, "grad_norm": 10.92353630065918, "learning_rate": 1.8413148673452362e-05, "logits/chosen": -0.41743743419647217, "logits/rejected": -0.4281028211116791, "logps/chosen": -85.31327819824219, "logps/rejected": -70.24401092529297, "loss": 0.6621, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.149283766746521, "rewards/margins": 0.6903818249702454, "rewards/rejected": -1.8396657705307007, "step": 951 }, { "epoch": 0.1591857166694477, "grad_norm": 18.412004470825195, "learning_rate": 1.8408142833305524e-05, "logits/chosen": -0.5967370867729187, "logits/rejected": -0.609666645526886, "logps/chosen": -74.93822479248047, "logps/rejected": -70.3968276977539, "loss": 0.649, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.21402639150619507, "rewards/margins": -0.3991447389125824, "rewards/rejected": 0.18511836230754852, "step": 954 }, { "epoch": 0.15968630068413148, "grad_norm": 14.084173202514648, "learning_rate": 1.8403136993158686e-05, "logits/chosen": -0.7584827542304993, "logits/rejected": -0.8331267833709717, "logps/chosen": -64.03678131103516, "logps/rejected": -164.2843780517578, "loss": 0.4641, "rewards/accuracies": 1.0, "rewards/chosen": -0.1444798856973648, "rewards/margins": 2.2938945293426514, "rewards/rejected": -2.4383742809295654, "step": 957 }, { "epoch": 0.16018688469881529, "grad_norm": 20.348499298095703, "learning_rate": 1.8398131153011847e-05, "logits/chosen": -0.38075244426727295, "logits/rejected": -0.4193410873413086, "logps/chosen": -47.66176986694336, "logps/rejected": -110.99578857421875, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 0.4732166826725006, "rewards/margins": 2.340535879135132, "rewards/rejected": -1.8673192262649536, "step": 960 }, { "epoch": 0.1606874687134991, "grad_norm": 26.773164749145508, "learning_rate": 1.839312531286501e-05, "logits/chosen": -0.47932544350624084, "logits/rejected": -0.6666694283485413, "logps/chosen": -64.4451675415039, "logps/rejected": -200.7410888671875, "loss": 0.5039, "rewards/accuracies": 1.0, "rewards/chosen": -0.16542915999889374, "rewards/margins": 2.968043327331543, "rewards/rejected": -3.1334726810455322, "step": 963 }, { "epoch": 0.16118805272818287, "grad_norm": 35.138465881347656, "learning_rate": 1.8388119472718174e-05, "logits/chosen": -0.5722485184669495, "logits/rejected": -0.6349905133247375, "logps/chosen": -102.3006820678711, "logps/rejected": -119.4212646484375, "loss": 0.6438, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.4405931532382965, "rewards/margins": 0.7016227841377258, "rewards/rejected": -1.1422158479690552, "step": 966 }, { "epoch": 0.16168863674286668, "grad_norm": 14.568436622619629, "learning_rate": 1.8383113632571333e-05, "logits/chosen": -0.6611059308052063, "logits/rejected": -0.6379392743110657, "logps/chosen": -73.93926239013672, "logps/rejected": -81.56415557861328, "loss": 0.4867, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.11506498605012894, "rewards/margins": 0.4357953369617462, "rewards/rejected": -0.5508603453636169, "step": 969 }, { "epoch": 0.16218922075755046, "grad_norm": 22.177349090576172, "learning_rate": 1.8378107792424498e-05, "logits/chosen": -0.7223671078681946, "logits/rejected": -0.6432243585586548, "logps/chosen": -109.27977752685547, "logps/rejected": -48.99754333496094, "loss": 0.5948, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.0645039901137352, "rewards/margins": -0.2540607154369354, "rewards/rejected": 0.3185647428035736, "step": 972 }, { "epoch": 0.16268980477223427, "grad_norm": 14.075732231140137, "learning_rate": 1.837310195227766e-05, "logits/chosen": -0.732415497303009, "logits/rejected": -0.6974007487297058, "logps/chosen": -88.18155670166016, "logps/rejected": -90.04342651367188, "loss": 0.5132, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.436480164527893, "rewards/margins": -0.22904466092586517, "rewards/rejected": -1.2074356079101562, "step": 975 }, { "epoch": 0.16319038878691808, "grad_norm": 11.67020034790039, "learning_rate": 1.836809611213082e-05, "logits/chosen": -0.8368883728981018, "logits/rejected": -0.8785104751586914, "logps/chosen": -61.39664077758789, "logps/rejected": -92.4822006225586, "loss": 0.3566, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.21131211519241333, "rewards/margins": 1.387419581413269, "rewards/rejected": -1.1761075258255005, "step": 978 }, { "epoch": 0.16369097280160186, "grad_norm": 25.54633140563965, "learning_rate": 1.8363090271983983e-05, "logits/chosen": -0.413632869720459, "logits/rejected": -0.4316672086715698, "logps/chosen": -93.1711654663086, "logps/rejected": -122.660888671875, "loss": 0.3191, "rewards/accuracies": 1.0, "rewards/chosen": -1.0226998329162598, "rewards/margins": 1.2442787885665894, "rewards/rejected": -2.2669785022735596, "step": 981 }, { "epoch": 0.16419155681628567, "grad_norm": 14.84162425994873, "learning_rate": 1.8358084431837145e-05, "logits/chosen": -0.6176236271858215, "logits/rejected": -0.6469376683235168, "logps/chosen": -39.8067741394043, "logps/rejected": -80.91305541992188, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": -0.5394758582115173, "rewards/margins": 1.7812018394470215, "rewards/rejected": -2.3206777572631836, "step": 984 }, { "epoch": 0.16469214083096947, "grad_norm": 33.56821060180664, "learning_rate": 1.8353078591690307e-05, "logits/chosen": -0.6999426484107971, "logits/rejected": -0.6406018137931824, "logps/chosen": -91.0126953125, "logps/rejected": -85.3644790649414, "loss": 0.5745, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8072282671928406, "rewards/margins": 0.4827673137187958, "rewards/rejected": -1.289995551109314, "step": 987 }, { "epoch": 0.16519272484565325, "grad_norm": 25.49745750427246, "learning_rate": 1.834807275154347e-05, "logits/chosen": -0.6162946224212646, "logits/rejected": -0.5693221688270569, "logps/chosen": -121.32282257080078, "logps/rejected": -88.72616577148438, "loss": 0.4722, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2535117864608765, "rewards/margins": 0.2938050329685211, "rewards/rejected": -1.5473169088363647, "step": 990 }, { "epoch": 0.16569330886033706, "grad_norm": 41.73869323730469, "learning_rate": 1.834306691139663e-05, "logits/chosen": -0.6475600600242615, "logits/rejected": -0.5384101271629333, "logps/chosen": -113.3487777709961, "logps/rejected": -61.90293502807617, "loss": 0.7987, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0157884359359741, "rewards/margins": -0.46589648723602295, "rewards/rejected": -0.5498918890953064, "step": 993 }, { "epoch": 0.16619389287502087, "grad_norm": 40.28272247314453, "learning_rate": 1.8338061071249792e-05, "logits/chosen": -0.7284862399101257, "logits/rejected": -0.7189204692840576, "logps/chosen": -91.74370574951172, "logps/rejected": -69.01103210449219, "loss": 0.8751, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2501630783081055, "rewards/margins": -0.9681406021118164, "rewards/rejected": -0.28202247619628906, "step": 996 }, { "epoch": 0.16669447688970465, "grad_norm": 36.06787109375, "learning_rate": 1.8333055231102954e-05, "logits/chosen": -0.7475762367248535, "logits/rejected": -0.7694829106330872, "logps/chosen": -88.926025390625, "logps/rejected": -141.2747039794922, "loss": 1.2555, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0322859287261963, "rewards/margins": 0.9262965321540833, "rewards/rejected": -2.958582639694214, "step": 999 }, { "epoch": 0.16719506090438846, "grad_norm": 16.22271728515625, "learning_rate": 1.8328049390956116e-05, "logits/chosen": -0.7747845649719238, "logits/rejected": -0.7664408087730408, "logps/chosen": -67.52413177490234, "logps/rejected": -78.78546142578125, "loss": 0.4795, "rewards/accuracies": 1.0, "rewards/chosen": 0.22858543694019318, "rewards/margins": 1.2895129919052124, "rewards/rejected": -1.060927391052246, "step": 1002 }, { "epoch": 0.16769564491907224, "grad_norm": 10.14065933227539, "learning_rate": 1.8323043550809278e-05, "logits/chosen": -0.4327339231967926, "logits/rejected": -0.5297791957855225, "logps/chosen": -44.46305847167969, "logps/rejected": -139.2282257080078, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": -0.026851078495383263, "rewards/margins": 1.1057547330856323, "rewards/rejected": -1.132605791091919, "step": 1005 }, { "epoch": 0.16819622893375605, "grad_norm": 20.959867477416992, "learning_rate": 1.8318037710662443e-05, "logits/chosen": -0.7304897904396057, "logits/rejected": -0.6970915794372559, "logps/chosen": -61.9694938659668, "logps/rejected": -48.400146484375, "loss": 0.7416, "rewards/accuracies": 1.0, "rewards/chosen": -0.1398056000471115, "rewards/margins": 0.9049678444862366, "rewards/rejected": -1.0447734594345093, "step": 1008 }, { "epoch": 0.16869681294843986, "grad_norm": 25.155454635620117, "learning_rate": 1.83130318705156e-05, "logits/chosen": -0.408810019493103, "logits/rejected": -0.46040335297584534, "logps/chosen": -67.90068817138672, "logps/rejected": -153.55859375, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": -0.9229760766029358, "rewards/margins": 1.8746744394302368, "rewards/rejected": -2.7976505756378174, "step": 1011 }, { "epoch": 0.16919739696312364, "grad_norm": 28.153121948242188, "learning_rate": 1.8308026030368763e-05, "logits/chosen": -0.5432897210121155, "logits/rejected": -0.6080502867698669, "logps/chosen": -62.17134475708008, "logps/rejected": -131.4306182861328, "loss": 0.5713, "rewards/accuracies": 1.0, "rewards/chosen": 0.41759952902793884, "rewards/margins": 1.177918791770935, "rewards/rejected": -0.7603194117546082, "step": 1014 }, { "epoch": 0.16969798097780744, "grad_norm": 7.53228759765625, "learning_rate": 1.8303020190221928e-05, "logits/chosen": -0.6018062233924866, "logits/rejected": -0.6345563530921936, "logps/chosen": -47.840396881103516, "logps/rejected": -105.61771392822266, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 0.6635826230049133, "rewards/margins": 2.1010966300964355, "rewards/rejected": -1.4375139474868774, "step": 1017 }, { "epoch": 0.17019856499249125, "grad_norm": 17.56317710876465, "learning_rate": 1.829801435007509e-05, "logits/chosen": -0.6263250708580017, "logits/rejected": -0.6391803026199341, "logps/chosen": -58.85734176635742, "logps/rejected": -113.19467163085938, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": 0.4292604625225067, "rewards/margins": 3.089975118637085, "rewards/rejected": -2.6607143878936768, "step": 1020 }, { "epoch": 0.17069914900717503, "grad_norm": 16.408187866210938, "learning_rate": 1.8293008509928252e-05, "logits/chosen": -0.7024360299110413, "logits/rejected": -0.746423065662384, "logps/chosen": -83.9012222290039, "logps/rejected": -130.48423767089844, "loss": 0.4899, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.07109832763671875, "rewards/margins": 1.0408287048339844, "rewards/rejected": -1.1119270324707031, "step": 1023 }, { "epoch": 0.17119973302185884, "grad_norm": 15.82673168182373, "learning_rate": 1.8288002669781414e-05, "logits/chosen": -0.7492650151252747, "logits/rejected": -0.8140811920166016, "logps/chosen": -91.9413070678711, "logps/rejected": -108.7629165649414, "loss": 0.7865, "rewards/accuracies": 1.0, "rewards/chosen": -0.5826147198677063, "rewards/margins": 2.2128512859344482, "rewards/rejected": -2.7954657077789307, "step": 1026 }, { "epoch": 0.17170031703654262, "grad_norm": 14.871737480163574, "learning_rate": 1.8282996829634575e-05, "logits/chosen": -0.6494803428649902, "logits/rejected": -0.718670666217804, "logps/chosen": -56.15293502807617, "logps/rejected": -128.30596923828125, "loss": 0.4122, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.0875420868396759, "rewards/margins": 2.1504673957824707, "rewards/rejected": -2.238009452819824, "step": 1029 }, { "epoch": 0.17220090105122643, "grad_norm": 19.379680633544922, "learning_rate": 1.8277990989487737e-05, "logits/chosen": -0.6263663172721863, "logits/rejected": -0.6916250586509705, "logps/chosen": -99.00801849365234, "logps/rejected": -116.71892547607422, "loss": 0.4429, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.15574900805950165, "rewards/margins": 0.029515227302908897, "rewards/rejected": 0.126233771443367, "step": 1032 }, { "epoch": 0.17270148506591024, "grad_norm": 15.581350326538086, "learning_rate": 1.82729851493409e-05, "logits/chosen": -0.6231219172477722, "logits/rejected": -0.6571192741394043, "logps/chosen": -109.8952865600586, "logps/rejected": -135.90843200683594, "loss": 0.5544, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11848881840705872, "rewards/margins": 1.7939977645874023, "rewards/rejected": -1.675508975982666, "step": 1035 }, { "epoch": 0.17320206908059402, "grad_norm": 31.64156150817871, "learning_rate": 1.826797930919406e-05, "logits/chosen": -0.6547476649284363, "logits/rejected": -0.696573793888092, "logps/chosen": -118.42037200927734, "logps/rejected": -116.89282989501953, "loss": 0.7877, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.0162715911865234, "rewards/margins": -0.1138143539428711, "rewards/rejected": -2.9024569988250732, "step": 1038 }, { "epoch": 0.17370265309527783, "grad_norm": 12.158845901489258, "learning_rate": 1.8262973469047223e-05, "logits/chosen": -0.5443341732025146, "logits/rejected": -0.5757256746292114, "logps/chosen": -43.502166748046875, "logps/rejected": -105.8546142578125, "loss": 0.518, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.20896339416503906, "rewards/margins": 0.38248229026794434, "rewards/rejected": -0.17351888120174408, "step": 1041 }, { "epoch": 0.17420323710996163, "grad_norm": 22.49630355834961, "learning_rate": 1.8257967628900384e-05, "logits/chosen": -0.6288720965385437, "logits/rejected": -0.6457843780517578, "logps/chosen": -127.28862762451172, "logps/rejected": -147.52027893066406, "loss": 0.88, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.468598484992981, "rewards/margins": -0.6025804281234741, "rewards/rejected": -0.8660181164741516, "step": 1044 }, { "epoch": 0.17470382112464541, "grad_norm": 18.36517906188965, "learning_rate": 1.8252961788753546e-05, "logits/chosen": -0.48018965125083923, "logits/rejected": -0.46122559905052185, "logps/chosen": -97.61758422851562, "logps/rejected": -58.42717361450195, "loss": 0.6965, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.34069254994392395, "rewards/margins": 0.334674596786499, "rewards/rejected": 0.0060179331339895725, "step": 1047 }, { "epoch": 0.17520440513932922, "grad_norm": 31.86559295654297, "learning_rate": 1.824795594860671e-05, "logits/chosen": -0.7222628593444824, "logits/rejected": -0.7267997860908508, "logps/chosen": -40.35491943359375, "logps/rejected": -59.78610610961914, "loss": 0.7873, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.4257930815219879, "rewards/margins": -0.14462535083293915, "rewards/rejected": -0.2811677157878876, "step": 1050 }, { "epoch": 0.175704989154013, "grad_norm": 17.828622817993164, "learning_rate": 1.824295010845987e-05, "logits/chosen": -0.8970478177070618, "logits/rejected": -0.915001392364502, "logps/chosen": -97.9358901977539, "logps/rejected": -108.8412857055664, "loss": 0.5786, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4714144468307495, "rewards/margins": -0.0029149551410228014, "rewards/rejected": 0.4743293821811676, "step": 1053 }, { "epoch": 0.1762055731686968, "grad_norm": 13.345202445983887, "learning_rate": 1.823794426831303e-05, "logits/chosen": -0.5422232747077942, "logits/rejected": -0.48982226848602295, "logps/chosen": -163.60470581054688, "logps/rejected": -93.46611785888672, "loss": 0.6604, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.4274490177631378, "rewards/margins": 0.43371570110321045, "rewards/rejected": -0.006266653537750244, "step": 1056 }, { "epoch": 0.17670615718338062, "grad_norm": 20.941986083984375, "learning_rate": 1.8232938428166197e-05, "logits/chosen": -0.8421046733856201, "logits/rejected": -0.833854615688324, "logps/chosen": -68.8692626953125, "logps/rejected": -82.19425201416016, "loss": 0.4826, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.01581752300262451, "rewards/margins": 2.0025510787963867, "rewards/rejected": -2.0183684825897217, "step": 1059 }, { "epoch": 0.1772067411980644, "grad_norm": 28.123241424560547, "learning_rate": 1.822793258801936e-05, "logits/chosen": -0.7229909896850586, "logits/rejected": -0.7449116706848145, "logps/chosen": -90.8486099243164, "logps/rejected": -116.0763931274414, "loss": 0.6133, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.03151804208755493, "rewards/margins": 0.6764137148857117, "rewards/rejected": -0.7079318165779114, "step": 1062 }, { "epoch": 0.1777073252127482, "grad_norm": 10.131790161132812, "learning_rate": 1.822292674787252e-05, "logits/chosen": -0.5064796209335327, "logits/rejected": -0.46294352412223816, "logps/chosen": -72.79815673828125, "logps/rejected": -73.46514892578125, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 0.030208779498934746, "rewards/margins": 0.6770113110542297, "rewards/rejected": -0.6468024849891663, "step": 1065 }, { "epoch": 0.17820790922743202, "grad_norm": 15.191394805908203, "learning_rate": 1.8217920907725682e-05, "logits/chosen": -0.6545515656471252, "logits/rejected": -0.667466938495636, "logps/chosen": -138.87255859375, "logps/rejected": -130.1216583251953, "loss": 0.6054, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.3650926351547241, "rewards/margins": -0.043426673859357834, "rewards/rejected": -0.3216659426689148, "step": 1068 }, { "epoch": 0.1787084932421158, "grad_norm": 12.39291000366211, "learning_rate": 1.8212915067578844e-05, "logits/chosen": -0.5922884345054626, "logits/rejected": -0.6093013882637024, "logps/chosen": -84.27598571777344, "logps/rejected": -109.35474395751953, "loss": 0.5327, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4683783948421478, "rewards/margins": 0.9174950122833252, "rewards/rejected": -1.3858733177185059, "step": 1071 }, { "epoch": 0.1792090772567996, "grad_norm": 19.330625534057617, "learning_rate": 1.8207909227432006e-05, "logits/chosen": -0.6073471903800964, "logits/rejected": -0.6025769114494324, "logps/chosen": -86.7308349609375, "logps/rejected": -87.94800567626953, "loss": 0.6538, "rewards/accuracies": 0.0, "rewards/chosen": -1.041127324104309, "rewards/margins": -0.4863840639591217, "rewards/rejected": -0.5547432899475098, "step": 1074 }, { "epoch": 0.17970966127148338, "grad_norm": 2.617833375930786, "learning_rate": 1.8202903387285168e-05, "logits/chosen": -0.6443076133728027, "logits/rejected": -0.703256368637085, "logps/chosen": -62.44948196411133, "logps/rejected": -136.97789001464844, "loss": 0.406, "rewards/accuracies": 1.0, "rewards/chosen": -0.6570183634757996, "rewards/margins": 2.145983934402466, "rewards/rejected": -2.8030025959014893, "step": 1077 }, { "epoch": 0.1802102452861672, "grad_norm": 20.577425003051758, "learning_rate": 1.819789754713833e-05, "logits/chosen": -0.5139809250831604, "logits/rejected": -0.6056511998176575, "logps/chosen": -63.05489730834961, "logps/rejected": -148.4107208251953, "loss": 0.7659, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.14573778212070465, "rewards/margins": 0.5204558968544006, "rewards/rejected": -0.3747180998325348, "step": 1080 }, { "epoch": 0.180710829300851, "grad_norm": 17.893903732299805, "learning_rate": 1.819289170699149e-05, "logits/chosen": -0.5062335133552551, "logits/rejected": -0.5597944259643555, "logps/chosen": -72.68590545654297, "logps/rejected": -120.7841567993164, "loss": 0.713, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.32348179817199707, "rewards/margins": 1.4111891984939575, "rewards/rejected": -1.7346711158752441, "step": 1083 }, { "epoch": 0.18121141331553478, "grad_norm": 25.393632888793945, "learning_rate": 1.8187885866844653e-05, "logits/chosen": -0.6907656192779541, "logits/rejected": -0.8013394474983215, "logps/chosen": -46.77080154418945, "logps/rejected": -130.2031707763672, "loss": 0.559, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29233601689338684, "rewards/margins": 1.1973711252212524, "rewards/rejected": -1.489707350730896, "step": 1086 }, { "epoch": 0.1817119973302186, "grad_norm": 37.938209533691406, "learning_rate": 1.8182880026697815e-05, "logits/chosen": -0.5545749068260193, "logits/rejected": -0.5623749494552612, "logps/chosen": -101.27822875976562, "logps/rejected": -169.02252197265625, "loss": 0.7535, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3057712316513062, "rewards/margins": 1.197665810585022, "rewards/rejected": -2.503437042236328, "step": 1089 }, { "epoch": 0.1822125813449024, "grad_norm": 19.368070602416992, "learning_rate": 1.8177874186550977e-05, "logits/chosen": -0.6146214604377747, "logits/rejected": -0.6516785025596619, "logps/chosen": -60.529388427734375, "logps/rejected": -81.0186767578125, "loss": 0.3228, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9082586169242859, "rewards/margins": 0.5778992772102356, "rewards/rejected": -1.486157774925232, "step": 1092 }, { "epoch": 0.18271316535958618, "grad_norm": 57.04454803466797, "learning_rate": 1.817286834640414e-05, "logits/chosen": -0.8413894772529602, "logits/rejected": -0.7864217162132263, "logps/chosen": -191.5496368408203, "logps/rejected": -101.21390533447266, "loss": 1.1368, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.050032138824463, "rewards/margins": -0.16911065578460693, "rewards/rejected": -0.8809214234352112, "step": 1095 }, { "epoch": 0.18321374937426999, "grad_norm": 8.408284187316895, "learning_rate": 1.81678625062573e-05, "logits/chosen": -0.6218413710594177, "logits/rejected": -0.6113179326057434, "logps/chosen": -82.52692413330078, "logps/rejected": -81.01628875732422, "loss": 0.6658, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31268247961997986, "rewards/margins": 0.1201881542801857, "rewards/rejected": -0.43287062644958496, "step": 1098 }, { "epoch": 0.1837143333889538, "grad_norm": 32.0211181640625, "learning_rate": 1.8162856666110465e-05, "logits/chosen": -0.7364760041236877, "logits/rejected": -0.719905436038971, "logps/chosen": -67.49429321289062, "logps/rejected": -62.21870040893555, "loss": 0.8845, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.06798756122589111, "rewards/margins": 0.22613990306854248, "rewards/rejected": -0.15815234184265137, "step": 1101 }, { "epoch": 0.18421491740363757, "grad_norm": 12.06209945678711, "learning_rate": 1.8157850825963627e-05, "logits/chosen": -0.6752015948295593, "logits/rejected": -0.7042739391326904, "logps/chosen": -87.3218002319336, "logps/rejected": -76.4205093383789, "loss": 0.6632, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.13420690596103668, "rewards/margins": -0.12067413330078125, "rewards/rejected": -0.013532768003642559, "step": 1104 }, { "epoch": 0.18471550141832138, "grad_norm": 13.096672058105469, "learning_rate": 1.815284498581679e-05, "logits/chosen": -0.47341910004615784, "logits/rejected": -0.4567759931087494, "logps/chosen": -103.96540069580078, "logps/rejected": -79.0676040649414, "loss": 0.4492, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1223106384277344, "rewards/margins": 0.4268936812877655, "rewards/rejected": -1.5492042303085327, "step": 1107 }, { "epoch": 0.18521608543300516, "grad_norm": 19.43445587158203, "learning_rate": 1.814783914566995e-05, "logits/chosen": -0.6646769046783447, "logits/rejected": -0.7687476277351379, "logps/chosen": -114.912109375, "logps/rejected": -153.61819458007812, "loss": 0.7018, "rewards/accuracies": 1.0, "rewards/chosen": -0.14410851895809174, "rewards/margins": 1.264255404472351, "rewards/rejected": -1.4083638191223145, "step": 1110 }, { "epoch": 0.18571666944768897, "grad_norm": 23.784526824951172, "learning_rate": 1.8142833305523112e-05, "logits/chosen": -0.47224247455596924, "logits/rejected": -0.5543248057365417, "logps/chosen": -58.11577224731445, "logps/rejected": -170.09066772460938, "loss": 0.386, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.5010046362876892, "rewards/margins": 1.7095603942871094, "rewards/rejected": -1.208555817604065, "step": 1113 }, { "epoch": 0.18621725346237278, "grad_norm": 13.998215675354004, "learning_rate": 1.8137827465376274e-05, "logits/chosen": -0.688877284526825, "logits/rejected": -0.7753574252128601, "logps/chosen": -65.1966781616211, "logps/rejected": -122.7828140258789, "loss": 0.5706, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3785953223705292, "rewards/margins": 0.2204064130783081, "rewards/rejected": 0.15818889439105988, "step": 1116 }, { "epoch": 0.18671783747705656, "grad_norm": 12.818315505981445, "learning_rate": 1.8132821625229436e-05, "logits/chosen": -0.6703548431396484, "logits/rejected": -0.6981260776519775, "logps/chosen": -82.31500244140625, "logps/rejected": -113.60945892333984, "loss": 0.4911, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4918552339076996, "rewards/margins": 1.4127670526504517, "rewards/rejected": -1.9046224355697632, "step": 1119 }, { "epoch": 0.18721842149174037, "grad_norm": 19.5346622467041, "learning_rate": 1.8127815785082598e-05, "logits/chosen": -0.5964506268501282, "logits/rejected": -0.6170119643211365, "logps/chosen": -67.83063507080078, "logps/rejected": -98.03973388671875, "loss": 0.663, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10182812809944153, "rewards/margins": 1.89584219455719, "rewards/rejected": -1.9976705312728882, "step": 1122 }, { "epoch": 0.18771900550642417, "grad_norm": 20.108531951904297, "learning_rate": 1.812280994493576e-05, "logits/chosen": -0.8086018562316895, "logits/rejected": -0.6963886618614197, "logps/chosen": -158.40420532226562, "logps/rejected": -69.27342224121094, "loss": 0.6555, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1539947986602783, "rewards/margins": -0.9722700119018555, "rewards/rejected": -0.18172474205493927, "step": 1125 }, { "epoch": 0.18821958952110796, "grad_norm": 10.33030891418457, "learning_rate": 1.811780410478892e-05, "logits/chosen": -0.5998716950416565, "logits/rejected": -0.6165620684623718, "logps/chosen": -82.88990020751953, "logps/rejected": -164.01382446289062, "loss": 0.2967, "rewards/accuracies": 1.0, "rewards/chosen": 0.4427380859851837, "rewards/margins": 1.5721458196640015, "rewards/rejected": -1.1294077634811401, "step": 1128 }, { "epoch": 0.18872017353579176, "grad_norm": 28.780920028686523, "learning_rate": 1.8112798264642083e-05, "logits/chosen": -0.7932894229888916, "logits/rejected": -0.8003626465797424, "logps/chosen": -150.07716369628906, "logps/rejected": -137.67759704589844, "loss": 0.7323, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3984209299087524, "rewards/margins": -0.2881934940814972, "rewards/rejected": -1.1102274656295776, "step": 1131 }, { "epoch": 0.18922075755047554, "grad_norm": 31.667205810546875, "learning_rate": 1.8107792424495245e-05, "logits/chosen": -0.6008508801460266, "logits/rejected": -0.6272829174995422, "logps/chosen": -83.94636535644531, "logps/rejected": -87.45916748046875, "loss": 0.7321, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.12411066144704819, "rewards/margins": 0.22575481235980988, "rewards/rejected": -0.34986546635627747, "step": 1134 }, { "epoch": 0.18972134156515935, "grad_norm": 21.08437728881836, "learning_rate": 1.8102786584348407e-05, "logits/chosen": -0.7058134078979492, "logits/rejected": -0.6582134366035461, "logps/chosen": -90.01748657226562, "logps/rejected": -107.15963745117188, "loss": 0.4483, "rewards/accuracies": 1.0, "rewards/chosen": 0.3633337914943695, "rewards/margins": 1.8193858861923218, "rewards/rejected": -1.4560521841049194, "step": 1137 }, { "epoch": 0.19022192557984316, "grad_norm": 27.1783504486084, "learning_rate": 1.809778074420157e-05, "logits/chosen": -0.7070128321647644, "logits/rejected": -0.7029461860656738, "logps/chosen": -60.435638427734375, "logps/rejected": -54.40789794921875, "loss": 0.5986, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.08885803073644638, "rewards/margins": 0.4942653179168701, "rewards/rejected": -0.40540727972984314, "step": 1140 }, { "epoch": 0.19072250959452694, "grad_norm": 14.269248008728027, "learning_rate": 1.8092774904054734e-05, "logits/chosen": -0.5246663689613342, "logits/rejected": -0.5427135825157166, "logps/chosen": -77.36762237548828, "logps/rejected": -87.9792251586914, "loss": 0.8722, "rewards/accuracies": 1.0, "rewards/chosen": -0.4982461631298065, "rewards/margins": 0.7202866077423096, "rewards/rejected": -1.218532681465149, "step": 1143 }, { "epoch": 0.19122309360921075, "grad_norm": 35.6895751953125, "learning_rate": 1.8087769063907896e-05, "logits/chosen": -0.6950699687004089, "logits/rejected": -0.7695247530937195, "logps/chosen": -73.9492416381836, "logps/rejected": -152.4287872314453, "loss": 1.0051, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.19278739392757416, "rewards/margins": 2.0983667373657227, "rewards/rejected": -2.291154146194458, "step": 1146 }, { "epoch": 0.19172367762389456, "grad_norm": 6.757198810577393, "learning_rate": 1.8082763223761054e-05, "logits/chosen": -0.5658649802207947, "logits/rejected": -0.5783953070640564, "logps/chosen": -42.9971809387207, "logps/rejected": -85.7350845336914, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 1.1222573518753052, "rewards/margins": 1.9026533365249634, "rewards/rejected": -0.780396044254303, "step": 1149 }, { "epoch": 0.19222426163857834, "grad_norm": 24.703285217285156, "learning_rate": 1.807775738361422e-05, "logits/chosen": -0.592670738697052, "logits/rejected": -0.6960000395774841, "logps/chosen": -92.9126968383789, "logps/rejected": -125.53141021728516, "loss": 0.3546, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5270706415176392, "rewards/margins": 0.4925251007080078, "rewards/rejected": -2.0195958614349365, "step": 1152 }, { "epoch": 0.19272484565326214, "grad_norm": 10.556418418884277, "learning_rate": 1.807275154346738e-05, "logits/chosen": -0.5443143844604492, "logits/rejected": -0.5903344750404358, "logps/chosen": -40.70964431762695, "logps/rejected": -100.3470687866211, "loss": 0.5519, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4720342457294464, "rewards/margins": 2.6069624423980713, "rewards/rejected": -2.1349282264709473, "step": 1155 }, { "epoch": 0.19322542966794592, "grad_norm": 19.15245819091797, "learning_rate": 1.8067745703320543e-05, "logits/chosen": -0.7545328140258789, "logits/rejected": -0.7768728137016296, "logps/chosen": -84.46829223632812, "logps/rejected": -111.16226959228516, "loss": 0.7642, "rewards/accuracies": 1.0, "rewards/chosen": 0.2677476108074188, "rewards/margins": 2.686488389968872, "rewards/rejected": -2.41874098777771, "step": 1158 }, { "epoch": 0.19372601368262973, "grad_norm": 25.70322608947754, "learning_rate": 1.8062739863173705e-05, "logits/chosen": -0.6302701234817505, "logits/rejected": -0.657745361328125, "logps/chosen": -69.1265640258789, "logps/rejected": -97.7176284790039, "loss": 0.7102, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2013181895017624, "rewards/margins": 1.2901971340179443, "rewards/rejected": -1.4915152788162231, "step": 1161 }, { "epoch": 0.19422659769731354, "grad_norm": 15.785233497619629, "learning_rate": 1.8057734023026866e-05, "logits/chosen": -0.7527437210083008, "logits/rejected": -0.6909255981445312, "logps/chosen": -103.04582977294922, "logps/rejected": -56.058414459228516, "loss": 0.381, "rewards/accuracies": 1.0, "rewards/chosen": -0.2693725526332855, "rewards/margins": 0.8941535949707031, "rewards/rejected": -1.163526177406311, "step": 1164 }, { "epoch": 0.19472718171199732, "grad_norm": 29.527111053466797, "learning_rate": 1.8052728182880028e-05, "logits/chosen": -0.6238663792610168, "logits/rejected": -0.649927020072937, "logps/chosen": -98.78958892822266, "logps/rejected": -106.8717041015625, "loss": 0.7665, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5962991714477539, "rewards/margins": 1.000242829322815, "rewards/rejected": -1.5965420007705688, "step": 1167 }, { "epoch": 0.19522776572668113, "grad_norm": 21.783918380737305, "learning_rate": 1.804772234273319e-05, "logits/chosen": -0.6887848377227783, "logits/rejected": -0.7307208180427551, "logps/chosen": -70.92481231689453, "logps/rejected": -137.81787109375, "loss": 0.5779, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4597243070602417, "rewards/margins": 0.7855483889579773, "rewards/rejected": -1.2452727556228638, "step": 1170 }, { "epoch": 0.19572834974136494, "grad_norm": 37.185699462890625, "learning_rate": 1.8042716502586352e-05, "logits/chosen": -0.4816320836544037, "logits/rejected": -0.5618330836296082, "logps/chosen": -47.24982833862305, "logps/rejected": -76.57693481445312, "loss": 0.7386, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5171414017677307, "rewards/margins": 0.7838654518127441, "rewards/rejected": -1.30100679397583, "step": 1173 }, { "epoch": 0.19622893375604872, "grad_norm": 30.385395050048828, "learning_rate": 1.8037710662439514e-05, "logits/chosen": -0.8408731818199158, "logits/rejected": -0.8573296666145325, "logps/chosen": -80.63996124267578, "logps/rejected": -130.178466796875, "loss": 0.5255, "rewards/accuracies": 1.0, "rewards/chosen": -0.6076984405517578, "rewards/margins": 1.1513596773147583, "rewards/rejected": -1.7590581178665161, "step": 1176 }, { "epoch": 0.19672951777073253, "grad_norm": 15.298443794250488, "learning_rate": 1.8032704822292675e-05, "logits/chosen": -0.6579909920692444, "logits/rejected": -0.7032784819602966, "logps/chosen": -46.64663314819336, "logps/rejected": -113.72833251953125, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": -0.7001999020576477, "rewards/margins": 1.087867259979248, "rewards/rejected": -1.7880672216415405, "step": 1179 }, { "epoch": 0.1972301017854163, "grad_norm": 22.221525192260742, "learning_rate": 1.8027698982145837e-05, "logits/chosen": -0.5399391055107117, "logits/rejected": -0.5882784724235535, "logps/chosen": -77.4721450805664, "logps/rejected": -108.31134033203125, "loss": 0.4727, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.1335955411195755, "rewards/margins": 0.3325187563896179, "rewards/rejected": -0.1989232301712036, "step": 1182 }, { "epoch": 0.19773068580010011, "grad_norm": 30.643896102905273, "learning_rate": 1.8022693141999e-05, "logits/chosen": -0.7923319339752197, "logits/rejected": -0.8548566699028015, "logps/chosen": -81.75917053222656, "logps/rejected": -146.2770233154297, "loss": 1.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.17340902984142303, "rewards/margins": 1.4315208196640015, "rewards/rejected": -1.258111596107483, "step": 1185 }, { "epoch": 0.19823126981478392, "grad_norm": 18.05436134338379, "learning_rate": 1.8017687301852164e-05, "logits/chosen": -0.6256530284881592, "logits/rejected": -0.6008245944976807, "logps/chosen": -128.94039916992188, "logps/rejected": -104.86064910888672, "loss": 0.8764, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.013603836297988892, "rewards/margins": -0.29054680466651917, "rewards/rejected": 0.2769429683685303, "step": 1188 }, { "epoch": 0.1987318538294677, "grad_norm": 11.027203559875488, "learning_rate": 1.8012681461705323e-05, "logits/chosen": -0.515414297580719, "logits/rejected": -0.5098969340324402, "logps/chosen": -114.0708236694336, "logps/rejected": -98.2279052734375, "loss": 0.4668, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1237228512763977, "rewards/margins": -0.5595430731773376, "rewards/rejected": 0.43582025170326233, "step": 1191 }, { "epoch": 0.1992324378441515, "grad_norm": 26.06267738342285, "learning_rate": 1.8007675621558488e-05, "logits/chosen": -0.7082893252372742, "logits/rejected": -0.7663147449493408, "logps/chosen": -72.36519622802734, "logps/rejected": -103.80895233154297, "loss": 0.6919, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.06717202812433243, "rewards/margins": 1.9289183616638184, "rewards/rejected": -1.8617464303970337, "step": 1194 }, { "epoch": 0.19973302185883532, "grad_norm": 10.306700706481934, "learning_rate": 1.800266978141165e-05, "logits/chosen": -0.9373574256896973, "logits/rejected": -0.9219964146614075, "logps/chosen": -72.90534210205078, "logps/rejected": -102.95861053466797, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 1.1416363716125488, "rewards/margins": 2.1294336318969727, "rewards/rejected": -0.9877972602844238, "step": 1197 }, { "epoch": 0.2002336058735191, "grad_norm": 16.615209579467773, "learning_rate": 1.799766394126481e-05, "logits/chosen": -0.7610917091369629, "logits/rejected": -0.7360236048698425, "logps/chosen": -89.5280990600586, "logps/rejected": -73.5097885131836, "loss": 0.6533, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7296028733253479, "rewards/margins": 0.6354528069496155, "rewards/rejected": -1.3650559186935425, "step": 1200 }, { "epoch": 0.2002336058735191, "eval_logits/chosen": -0.692094087600708, "eval_logits/rejected": -0.705393373966217, "eval_logps/chosen": -79.67239379882812, "eval_logps/rejected": -100.8046875, "eval_loss": 0.5988917946815491, "eval_rewards/accuracies": 0.7012012004852295, "eval_rewards/chosen": 0.05853283032774925, "eval_rewards/margins": 0.8261826634407043, "eval_rewards/rejected": -0.76764976978302, "eval_runtime": 345.5458, "eval_samples_per_second": 7.71, "eval_steps_per_second": 1.927, "step": 1200 }, { "epoch": 0.2007341898882029, "grad_norm": 20.65858268737793, "learning_rate": 1.7992658101117973e-05, "logits/chosen": -0.7424077987670898, "logits/rejected": -0.7396953105926514, "logps/chosen": -63.55167770385742, "logps/rejected": -59.400421142578125, "loss": 0.6157, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.21893565356731415, "rewards/margins": -0.3223860263824463, "rewards/rejected": 0.10345041751861572, "step": 1203 }, { "epoch": 0.2012347739028867, "grad_norm": 32.11043167114258, "learning_rate": 1.7987652260971135e-05, "logits/chosen": -0.7978861927986145, "logits/rejected": -0.8288702964782715, "logps/chosen": -90.2911376953125, "logps/rejected": -114.14360809326172, "loss": 0.7794, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.08745473623275757, "rewards/margins": 1.0356769561767578, "rewards/rejected": -1.1231317520141602, "step": 1206 }, { "epoch": 0.2017353579175705, "grad_norm": 16.839157104492188, "learning_rate": 1.7982646420824297e-05, "logits/chosen": -0.7740273475646973, "logits/rejected": -0.8109803199768066, "logps/chosen": -77.49887084960938, "logps/rejected": -107.26538848876953, "loss": 0.4591, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9890101552009583, "rewards/margins": 0.24078939855098724, "rewards/rejected": -1.229799509048462, "step": 1209 }, { "epoch": 0.2022359419322543, "grad_norm": 13.76618480682373, "learning_rate": 1.797764058067746e-05, "logits/chosen": -0.555410623550415, "logits/rejected": -0.5211437344551086, "logps/chosen": -79.55378723144531, "logps/rejected": -67.08892059326172, "loss": 0.4511, "rewards/accuracies": 1.0, "rewards/chosen": 0.4736614525318146, "rewards/margins": 2.4502077102661133, "rewards/rejected": -1.976546287536621, "step": 1212 }, { "epoch": 0.20273652594693808, "grad_norm": 35.88215637207031, "learning_rate": 1.797263474053062e-05, "logits/chosen": -0.556317150592804, "logits/rejected": -0.6364912986755371, "logps/chosen": -55.205291748046875, "logps/rejected": -109.41454315185547, "loss": 0.602, "rewards/accuracies": 1.0, "rewards/chosen": 0.4362363815307617, "rewards/margins": 0.7473461031913757, "rewards/rejected": -0.3111096918582916, "step": 1215 }, { "epoch": 0.2032371099616219, "grad_norm": 31.569955825805664, "learning_rate": 1.7967628900383782e-05, "logits/chosen": -0.7093944549560547, "logits/rejected": -0.6940339207649231, "logps/chosen": -55.5921630859375, "logps/rejected": -60.37961196899414, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 1.003533959388733, "rewards/margins": 1.3123459815979004, "rewards/rejected": -0.30881205201148987, "step": 1218 }, { "epoch": 0.2037376939763057, "grad_norm": 37.38691329956055, "learning_rate": 1.7962623060236944e-05, "logits/chosen": -0.6206009387969971, "logits/rejected": -0.6377667784690857, "logps/chosen": -87.35247039794922, "logps/rejected": -143.07191467285156, "loss": 0.6069, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.108123779296875, "rewards/margins": -0.16796672344207764, "rewards/rejected": -0.9401569962501526, "step": 1221 }, { "epoch": 0.20423827799098948, "grad_norm": 12.506324768066406, "learning_rate": 1.7957617220090106e-05, "logits/chosen": -0.4944934844970703, "logits/rejected": -0.4821246862411499, "logps/chosen": -64.51868438720703, "logps/rejected": -70.90249633789062, "loss": 0.482, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.11733093112707138, "rewards/margins": -0.24431218206882477, "rewards/rejected": 0.36164310574531555, "step": 1224 }, { "epoch": 0.2047388620056733, "grad_norm": 19.032594680786133, "learning_rate": 1.7952611379943267e-05, "logits/chosen": -0.9611468315124512, "logits/rejected": -0.9671390652656555, "logps/chosen": -80.5359115600586, "logps/rejected": -95.93124389648438, "loss": 0.5813, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.21545231342315674, "rewards/margins": 1.368625283241272, "rewards/rejected": -1.5840777158737183, "step": 1227 }, { "epoch": 0.2052394460203571, "grad_norm": 10.958185195922852, "learning_rate": 1.7947605539796433e-05, "logits/chosen": -0.47366538643836975, "logits/rejected": -0.4961899220943451, "logps/chosen": -30.077810287475586, "logps/rejected": -70.03571319580078, "loss": 0.3266, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6024187207221985, "rewards/margins": 1.109424114227295, "rewards/rejected": -0.5070053935050964, "step": 1230 }, { "epoch": 0.20574003003504088, "grad_norm": 12.15113639831543, "learning_rate": 1.794259969964959e-05, "logits/chosen": -0.4919814169406891, "logits/rejected": -0.639107882976532, "logps/chosen": -35.2580680847168, "logps/rejected": -110.31815338134766, "loss": 0.3844, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.8583670258522034, "rewards/margins": 2.475770950317383, "rewards/rejected": -1.6174039840698242, "step": 1233 }, { "epoch": 0.20624061404972469, "grad_norm": 16.946041107177734, "learning_rate": 1.7937593859502756e-05, "logits/chosen": -0.8060691952705383, "logits/rejected": -0.8226811289787292, "logps/chosen": -51.11186599731445, "logps/rejected": -67.83405303955078, "loss": 0.539, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.9806830883026123, "rewards/margins": 2.029604911804199, "rewards/rejected": -1.0489219427108765, "step": 1236 }, { "epoch": 0.20674119806440847, "grad_norm": 12.875670433044434, "learning_rate": 1.7932588019355918e-05, "logits/chosen": -0.5077283978462219, "logits/rejected": -0.5178724527359009, "logps/chosen": -51.80998611450195, "logps/rejected": -89.4110107421875, "loss": 0.5629, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22531522810459137, "rewards/margins": 0.6713624596595764, "rewards/rejected": -0.8966776728630066, "step": 1239 }, { "epoch": 0.20724178207909227, "grad_norm": 14.443327903747559, "learning_rate": 1.7927582179209076e-05, "logits/chosen": -0.5915122628211975, "logits/rejected": -0.6035146117210388, "logps/chosen": -81.54136657714844, "logps/rejected": -92.2308120727539, "loss": 0.7063, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3688298463821411, "rewards/margins": -0.7475493550300598, "rewards/rejected": -0.6212806105613708, "step": 1242 }, { "epoch": 0.20774236609377608, "grad_norm": 11.449729919433594, "learning_rate": 1.792257633906224e-05, "logits/chosen": -0.7078880667686462, "logits/rejected": -0.6361680030822754, "logps/chosen": -76.31856536865234, "logps/rejected": -100.33255004882812, "loss": 0.3151, "rewards/accuracies": 1.0, "rewards/chosen": -0.8721542358398438, "rewards/margins": 0.8056859374046326, "rewards/rejected": -1.677840232849121, "step": 1245 }, { "epoch": 0.20824295010845986, "grad_norm": 6.188257217407227, "learning_rate": 1.7917570498915403e-05, "logits/chosen": -0.7782987952232361, "logits/rejected": -0.8149599432945251, "logps/chosen": -83.13260650634766, "logps/rejected": -99.4970474243164, "loss": 0.6545, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31499621272087097, "rewards/margins": 0.3382020890712738, "rewards/rejected": -0.6531983017921448, "step": 1248 }, { "epoch": 0.20874353412314367, "grad_norm": 12.373640060424805, "learning_rate": 1.7912564658768565e-05, "logits/chosen": -0.45192959904670715, "logits/rejected": -0.5194546580314636, "logps/chosen": -68.30915832519531, "logps/rejected": -149.07583618164062, "loss": 0.5105, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8391969203948975, "rewards/margins": 0.5876145958900452, "rewards/rejected": -1.4268115758895874, "step": 1251 }, { "epoch": 0.20924411813782748, "grad_norm": 38.251712799072266, "learning_rate": 1.7907558818621727e-05, "logits/chosen": -0.6315311789512634, "logits/rejected": -0.7211523056030273, "logps/chosen": -57.64826965332031, "logps/rejected": -108.03863525390625, "loss": 0.8061, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.180059552192688, "rewards/margins": 1.452985405921936, "rewards/rejected": -1.272925853729248, "step": 1254 }, { "epoch": 0.20974470215251126, "grad_norm": 31.479894638061523, "learning_rate": 1.790255297847489e-05, "logits/chosen": -0.6047012805938721, "logits/rejected": -0.5812085866928101, "logps/chosen": -54.3844108581543, "logps/rejected": -70.04906463623047, "loss": 0.7523, "rewards/accuracies": 1.0, "rewards/chosen": 0.675595760345459, "rewards/margins": 0.7448461055755615, "rewards/rejected": -0.06925036758184433, "step": 1257 }, { "epoch": 0.21024528616719507, "grad_norm": 17.837059020996094, "learning_rate": 1.789754713832805e-05, "logits/chosen": -0.8173485398292542, "logits/rejected": -0.8477612137794495, "logps/chosen": -52.012027740478516, "logps/rejected": -123.10446166992188, "loss": 0.5203, "rewards/accuracies": 1.0, "rewards/chosen": 0.1838027387857437, "rewards/margins": 1.4114872217178345, "rewards/rejected": -1.227684497833252, "step": 1260 }, { "epoch": 0.21074587018187885, "grad_norm": 14.990022659301758, "learning_rate": 1.7892541298181212e-05, "logits/chosen": -0.6558148860931396, "logits/rejected": -0.7030786871910095, "logps/chosen": -63.36274719238281, "logps/rejected": -125.3333511352539, "loss": 0.4002, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6815361380577087, "rewards/margins": 2.2806925773620605, "rewards/rejected": -1.5991564989089966, "step": 1263 }, { "epoch": 0.21124645419656266, "grad_norm": 13.71129035949707, "learning_rate": 1.7887535458034374e-05, "logits/chosen": -0.8065879344940186, "logits/rejected": -0.8334904313087463, "logps/chosen": -47.55266189575195, "logps/rejected": -98.65511322021484, "loss": 0.4062, "rewards/accuracies": 1.0, "rewards/chosen": -0.03998788818717003, "rewards/margins": 2.3552372455596924, "rewards/rejected": -2.3952252864837646, "step": 1266 }, { "epoch": 0.21174703821124646, "grad_norm": 23.319059371948242, "learning_rate": 1.7882529617887536e-05, "logits/chosen": -0.5892873406410217, "logits/rejected": -0.6009683012962341, "logps/chosen": -66.0489273071289, "logps/rejected": -68.84061431884766, "loss": 0.7764, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.37583431601524353, "rewards/margins": -0.5666119456291199, "rewards/rejected": 0.19077767431735992, "step": 1269 }, { "epoch": 0.21224762222593024, "grad_norm": 19.689390182495117, "learning_rate": 1.78775237777407e-05, "logits/chosen": -0.579804003238678, "logits/rejected": -0.6066977381706238, "logps/chosen": -67.6319580078125, "logps/rejected": -83.6774673461914, "loss": 0.6272, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22528140246868134, "rewards/margins": 1.2390671968460083, "rewards/rejected": -1.4643486738204956, "step": 1272 }, { "epoch": 0.21274820624061405, "grad_norm": 29.038150787353516, "learning_rate": 1.787251793759386e-05, "logits/chosen": -0.5062251687049866, "logits/rejected": -0.5790446400642395, "logps/chosen": -57.53759765625, "logps/rejected": -118.3148422241211, "loss": 0.4638, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.42747578024864197, "rewards/margins": 0.6941129565238953, "rewards/rejected": -1.1215885877609253, "step": 1275 }, { "epoch": 0.21324879025529786, "grad_norm": 15.607301712036133, "learning_rate": 1.7867512097447025e-05, "logits/chosen": -0.9830458760261536, "logits/rejected": -0.9526227116584778, "logps/chosen": -84.22323608398438, "logps/rejected": -57.822662353515625, "loss": 0.3844, "rewards/accuracies": 1.0, "rewards/chosen": -0.2278076410293579, "rewards/margins": 0.5057051777839661, "rewards/rejected": -0.733512818813324, "step": 1278 }, { "epoch": 0.21374937426998164, "grad_norm": 11.410201072692871, "learning_rate": 1.7862506257300187e-05, "logits/chosen": -0.39403238892555237, "logits/rejected": -0.5901203751564026, "logps/chosen": -52.36875915527344, "logps/rejected": -159.67787170410156, "loss": 0.2497, "rewards/accuracies": 1.0, "rewards/chosen": -0.3534422814846039, "rewards/margins": 2.64308762550354, "rewards/rejected": -2.9965295791625977, "step": 1281 }, { "epoch": 0.21424995828466545, "grad_norm": 19.87769889831543, "learning_rate": 1.7857500417153345e-05, "logits/chosen": -0.8107660412788391, "logits/rejected": -0.8570258021354675, "logps/chosen": -72.5135498046875, "logps/rejected": -108.66845703125, "loss": 0.7023, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8156195282936096, "rewards/margins": -0.5330908298492432, "rewards/rejected": -0.28252866864204407, "step": 1284 }, { "epoch": 0.21475054229934923, "grad_norm": 23.798141479492188, "learning_rate": 1.785249457700651e-05, "logits/chosen": -0.7123122811317444, "logits/rejected": -0.7334511876106262, "logps/chosen": -90.46569061279297, "logps/rejected": -103.3952865600586, "loss": 0.5142, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.12764830887317657, "rewards/margins": 1.4662774801254272, "rewards/rejected": -1.5939258337020874, "step": 1287 }, { "epoch": 0.21525112631403304, "grad_norm": 7.784563064575195, "learning_rate": 1.7847488736859672e-05, "logits/chosen": -0.7311463952064514, "logits/rejected": -0.7475035786628723, "logps/chosen": -89.1267318725586, "logps/rejected": -109.48462677001953, "loss": 0.7173, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9941152930259705, "rewards/margins": 0.6552081108093262, "rewards/rejected": -1.6493234634399414, "step": 1290 }, { "epoch": 0.21575171032871684, "grad_norm": 22.549747467041016, "learning_rate": 1.7842482896712834e-05, "logits/chosen": -0.7078463435173035, "logits/rejected": -0.665543794631958, "logps/chosen": -82.4439468383789, "logps/rejected": -57.07575607299805, "loss": 0.7646, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7142128944396973, "rewards/margins": -0.4141404330730438, "rewards/rejected": -0.3000723421573639, "step": 1293 }, { "epoch": 0.21625229434340063, "grad_norm": 9.644754409790039, "learning_rate": 1.7837477056565996e-05, "logits/chosen": -0.8240029215812683, "logits/rejected": -0.9179061055183411, "logps/chosen": -79.74503326416016, "logps/rejected": -180.5644989013672, "loss": 0.4171, "rewards/accuracies": 1.0, "rewards/chosen": -0.3787703216075897, "rewards/margins": 1.6320475339889526, "rewards/rejected": -2.010817766189575, "step": 1296 }, { "epoch": 0.21675287835808443, "grad_norm": 23.351299285888672, "learning_rate": 1.7832471216419157e-05, "logits/chosen": -0.8017775416374207, "logits/rejected": -0.7933629155158997, "logps/chosen": -79.5746841430664, "logps/rejected": -98.56529998779297, "loss": 0.4308, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.32598841190338135, "rewards/margins": 1.2543689012527466, "rewards/rejected": -0.9283804893493652, "step": 1299 }, { "epoch": 0.21725346237276824, "grad_norm": 28.877723693847656, "learning_rate": 1.782746537627232e-05, "logits/chosen": -0.6967372298240662, "logits/rejected": -0.7530484199523926, "logps/chosen": -87.92386627197266, "logps/rejected": -129.48643493652344, "loss": 0.771, "rewards/accuracies": 1.0, "rewards/chosen": 0.630944550037384, "rewards/margins": 1.8121484518051147, "rewards/rejected": -1.181203842163086, "step": 1302 }, { "epoch": 0.21775404638745202, "grad_norm": 32.01348114013672, "learning_rate": 1.782245953612548e-05, "logits/chosen": -0.5767238736152649, "logits/rejected": -0.46568921208381653, "logps/chosen": -128.09861755371094, "logps/rejected": -74.80575561523438, "loss": 0.7102, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6257261633872986, "rewards/margins": -0.3946298062801361, "rewards/rejected": -0.23109643161296844, "step": 1305 }, { "epoch": 0.21825463040213583, "grad_norm": 10.453418731689453, "learning_rate": 1.7817453695978643e-05, "logits/chosen": -0.7957475781440735, "logits/rejected": -0.780756950378418, "logps/chosen": -78.02984619140625, "logps/rejected": -52.608707427978516, "loss": 0.483, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1188836097717285, "rewards/margins": -0.49490442872047424, "rewards/rejected": -0.6239792108535767, "step": 1308 }, { "epoch": 0.2187552144168196, "grad_norm": 30.04639434814453, "learning_rate": 1.7812447855831805e-05, "logits/chosen": -0.8703813552856445, "logits/rejected": -0.8706970810890198, "logps/chosen": -95.7827377319336, "logps/rejected": -96.06744384765625, "loss": 0.7311, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.2655482292175293, "rewards/margins": -0.6645256876945496, "rewards/rejected": -1.6010226011276245, "step": 1311 }, { "epoch": 0.21925579843150342, "grad_norm": 6.9862542152404785, "learning_rate": 1.780744201568497e-05, "logits/chosen": -0.7726666331291199, "logits/rejected": -0.8147712349891663, "logps/chosen": -48.26976013183594, "logps/rejected": -127.7807388305664, "loss": 0.3107, "rewards/accuracies": 1.0, "rewards/chosen": 1.0549583435058594, "rewards/margins": 4.124457836151123, "rewards/rejected": -3.0694990158081055, "step": 1314 }, { "epoch": 0.21975638244618723, "grad_norm": 38.155555725097656, "learning_rate": 1.7802436175538128e-05, "logits/chosen": -0.6550207734107971, "logits/rejected": -0.6665410399436951, "logps/chosen": -97.4012451171875, "logps/rejected": -128.8595733642578, "loss": 0.7937, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9972569942474365, "rewards/margins": 0.3025552034378052, "rewards/rejected": -1.2998121976852417, "step": 1317 }, { "epoch": 0.220256966460871, "grad_norm": 26.806196212768555, "learning_rate": 1.779743033539129e-05, "logits/chosen": -0.8109634518623352, "logits/rejected": -0.7641869187355042, "logps/chosen": -99.9028549194336, "logps/rejected": -92.30518341064453, "loss": 0.5562, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10964623838663101, "rewards/margins": 0.8453440070152283, "rewards/rejected": -0.9549901485443115, "step": 1320 }, { "epoch": 0.22075755047555481, "grad_norm": 15.787264823913574, "learning_rate": 1.7792424495244455e-05, "logits/chosen": -0.6926500797271729, "logits/rejected": -0.7459931969642639, "logps/chosen": -65.6156005859375, "logps/rejected": -134.88185119628906, "loss": 0.7287, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.0018281340599060059, "rewards/margins": 1.2536824941635132, "rewards/rejected": -1.251854419708252, "step": 1323 }, { "epoch": 0.22125813449023862, "grad_norm": 15.30100154876709, "learning_rate": 1.7787418655097614e-05, "logits/chosen": -0.6205049753189087, "logits/rejected": -0.6718721389770508, "logps/chosen": -78.50716400146484, "logps/rejected": -103.19713592529297, "loss": 0.2772, "rewards/accuracies": 1.0, "rewards/chosen": 0.606548547744751, "rewards/margins": 2.5677242279052734, "rewards/rejected": -1.961175560951233, "step": 1326 }, { "epoch": 0.2217587185049224, "grad_norm": 14.213190078735352, "learning_rate": 1.778241281495078e-05, "logits/chosen": -0.7646705508232117, "logits/rejected": -0.8412689566612244, "logps/chosen": -51.23065185546875, "logps/rejected": -143.0082550048828, "loss": 0.4384, "rewards/accuracies": 1.0, "rewards/chosen": -0.20561085641384125, "rewards/margins": 2.653628349304199, "rewards/rejected": -2.859239339828491, "step": 1329 }, { "epoch": 0.2222593025196062, "grad_norm": 12.8325834274292, "learning_rate": 1.777740697480394e-05, "logits/chosen": -0.8216665387153625, "logits/rejected": -0.791779100894928, "logps/chosen": -76.92017364501953, "logps/rejected": -78.15902709960938, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.6131117343902588, "rewards/margins": 1.522528052330017, "rewards/rejected": -0.9094161987304688, "step": 1332 }, { "epoch": 0.22275988653429, "grad_norm": 8.591293334960938, "learning_rate": 1.7772401134657102e-05, "logits/chosen": -0.7031553387641907, "logits/rejected": -0.8226421475410461, "logps/chosen": -54.11043930053711, "logps/rejected": -149.28138732910156, "loss": 0.3781, "rewards/accuracies": 1.0, "rewards/chosen": -0.20807063579559326, "rewards/margins": 2.6797635555267334, "rewards/rejected": -2.887834310531616, "step": 1335 }, { "epoch": 0.2232604705489738, "grad_norm": 20.678064346313477, "learning_rate": 1.7767395294510264e-05, "logits/chosen": -0.5931413173675537, "logits/rejected": -0.5732429027557373, "logps/chosen": -109.65322875976562, "logps/rejected": -128.69549560546875, "loss": 0.8066, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.176990270614624, "rewards/margins": -0.5059077143669128, "rewards/rejected": -0.6710825562477112, "step": 1338 }, { "epoch": 0.2237610545636576, "grad_norm": 20.988710403442383, "learning_rate": 1.7762389454363426e-05, "logits/chosen": -0.6826449036598206, "logits/rejected": -0.7734613418579102, "logps/chosen": -77.55672454833984, "logps/rejected": -92.06436157226562, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": -0.04376094043254852, "rewards/margins": 1.235319972038269, "rewards/rejected": -1.279080867767334, "step": 1341 }, { "epoch": 0.2242616385783414, "grad_norm": 16.715742111206055, "learning_rate": 1.7757383614216588e-05, "logits/chosen": -0.7173048853874207, "logits/rejected": -0.7841911911964417, "logps/chosen": -103.31787109375, "logps/rejected": -104.74857330322266, "loss": 1.0406, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.04751090332865715, "rewards/margins": 1.5311447381973267, "rewards/rejected": -1.5786558389663696, "step": 1344 }, { "epoch": 0.2247622225930252, "grad_norm": 7.8017072677612305, "learning_rate": 1.775237777406975e-05, "logits/chosen": -0.6085734367370605, "logits/rejected": -0.6347225904464722, "logps/chosen": -83.7490463256836, "logps/rejected": -92.1376953125, "loss": 0.4489, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7683258056640625, "rewards/margins": 1.4852389097213745, "rewards/rejected": -2.2535645961761475, "step": 1347 }, { "epoch": 0.225262806607709, "grad_norm": 38.25666046142578, "learning_rate": 1.774737193392291e-05, "logits/chosen": -0.6901364922523499, "logits/rejected": -0.6966801285743713, "logps/chosen": -116.9964828491211, "logps/rejected": -164.8548583984375, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": -0.3987678587436676, "rewards/margins": 1.4676305055618286, "rewards/rejected": -1.8663983345031738, "step": 1350 }, { "epoch": 0.22576339062239278, "grad_norm": 17.992109298706055, "learning_rate": 1.7742366093776073e-05, "logits/chosen": -0.651923656463623, "logits/rejected": -0.7349541187286377, "logps/chosen": -97.0821762084961, "logps/rejected": -192.24786376953125, "loss": 0.3453, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.0758311077952385, "rewards/margins": 1.2717585563659668, "rewards/rejected": -1.195927381515503, "step": 1353 }, { "epoch": 0.2262639746370766, "grad_norm": 29.784423828125, "learning_rate": 1.7737360253629238e-05, "logits/chosen": -0.7356597781181335, "logits/rejected": -0.7531207203865051, "logps/chosen": -68.32013702392578, "logps/rejected": -96.6578598022461, "loss": 0.4947, "rewards/accuracies": 1.0, "rewards/chosen": 0.23024316132068634, "rewards/margins": 1.8027173280715942, "rewards/rejected": -1.5724741220474243, "step": 1356 }, { "epoch": 0.2267645586517604, "grad_norm": 13.144864082336426, "learning_rate": 1.7732354413482397e-05, "logits/chosen": -0.8079113364219666, "logits/rejected": -0.8146321177482605, "logps/chosen": -61.30057144165039, "logps/rejected": -82.98992156982422, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": 0.45958229899406433, "rewards/margins": 0.8283320069313049, "rewards/rejected": -0.368749737739563, "step": 1359 }, { "epoch": 0.22726514266644418, "grad_norm": 10.819211959838867, "learning_rate": 1.772734857333556e-05, "logits/chosen": -0.9670699238777161, "logits/rejected": -0.9940981864929199, "logps/chosen": -62.937076568603516, "logps/rejected": -96.19183349609375, "loss": 0.4666, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.12726186215877533, "rewards/margins": 1.16641104221344, "rewards/rejected": -1.2936729192733765, "step": 1362 }, { "epoch": 0.227765726681128, "grad_norm": 23.445781707763672, "learning_rate": 1.7722342733188724e-05, "logits/chosen": -0.7840655446052551, "logits/rejected": -0.7706950306892395, "logps/chosen": -98.40030670166016, "logps/rejected": -86.58108520507812, "loss": 0.5294, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.10445862263441086, "rewards/margins": 1.6248455047607422, "rewards/rejected": -1.520386815071106, "step": 1365 }, { "epoch": 0.22826631069581177, "grad_norm": 9.377874374389648, "learning_rate": 1.7717336893041882e-05, "logits/chosen": -0.6279318928718567, "logits/rejected": -0.6187726855278015, "logps/chosen": -56.68959045410156, "logps/rejected": -81.8797378540039, "loss": 0.4727, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2716014087200165, "rewards/margins": 0.9200548529624939, "rewards/rejected": -1.191656231880188, "step": 1368 }, { "epoch": 0.22876689471049558, "grad_norm": 6.546853065490723, "learning_rate": 1.7712331052895047e-05, "logits/chosen": -0.7659444212913513, "logits/rejected": -0.807716429233551, "logps/chosen": -87.50211334228516, "logps/rejected": -122.16722869873047, "loss": 0.49, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7815987467765808, "rewards/margins": 1.843509316444397, "rewards/rejected": -2.625108003616333, "step": 1371 }, { "epoch": 0.22926747872517939, "grad_norm": 18.30091667175293, "learning_rate": 1.770732521274821e-05, "logits/chosen": -0.7881311774253845, "logits/rejected": -0.7708616256713867, "logps/chosen": -59.63606643676758, "logps/rejected": -63.12385177612305, "loss": 0.663, "rewards/accuracies": 1.0, "rewards/chosen": 0.0786433219909668, "rewards/margins": 0.6440518498420715, "rewards/rejected": -0.5654085278511047, "step": 1374 }, { "epoch": 0.22976806273986317, "grad_norm": 13.315500259399414, "learning_rate": 1.7702319372601367e-05, "logits/chosen": -0.806903064250946, "logits/rejected": -0.8225806355476379, "logps/chosen": -68.8484878540039, "logps/rejected": -64.2791976928711, "loss": 0.2925, "rewards/accuracies": 1.0, "rewards/chosen": 0.17948685586452484, "rewards/margins": 0.3518526554107666, "rewards/rejected": -0.17236584424972534, "step": 1377 }, { "epoch": 0.23026864675454697, "grad_norm": 14.011713981628418, "learning_rate": 1.7697313532454533e-05, "logits/chosen": -0.8131189346313477, "logits/rejected": -0.8570130467414856, "logps/chosen": -69.59114837646484, "logps/rejected": -113.80135345458984, "loss": 0.4523, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.1324175000190735, "rewards/margins": 0.12465167045593262, "rewards/rejected": -0.2570691406726837, "step": 1380 }, { "epoch": 0.23076923076923078, "grad_norm": 18.853267669677734, "learning_rate": 1.7692307692307694e-05, "logits/chosen": -0.8043251037597656, "logits/rejected": -0.8289003968238831, "logps/chosen": -98.2921142578125, "logps/rejected": -134.06593322753906, "loss": 0.6793, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4613296687602997, "rewards/margins": 0.8301844596862793, "rewards/rejected": -1.2915140390396118, "step": 1383 }, { "epoch": 0.23126981478391456, "grad_norm": 14.263971328735352, "learning_rate": 1.7687301852160856e-05, "logits/chosen": -0.6331673264503479, "logits/rejected": -0.6999477744102478, "logps/chosen": -100.403564453125, "logps/rejected": -179.97979736328125, "loss": 0.7086, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6582878232002258, "rewards/margins": 0.23307369649410248, "rewards/rejected": -0.8913615345954895, "step": 1386 }, { "epoch": 0.23177039879859837, "grad_norm": 47.35725021362305, "learning_rate": 1.7682296012014018e-05, "logits/chosen": -0.8273677825927734, "logits/rejected": -0.8200691342353821, "logps/chosen": -102.72188568115234, "logps/rejected": -103.321533203125, "loss": 1.1426, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3003467321395874, "rewards/margins": 0.13219620287418365, "rewards/rejected": -1.4325429201126099, "step": 1389 }, { "epoch": 0.23227098281328215, "grad_norm": 27.03546714782715, "learning_rate": 1.767729017186718e-05, "logits/chosen": -0.7975885272026062, "logits/rejected": -0.7724003791809082, "logps/chosen": -108.09062957763672, "logps/rejected": -104.8475570678711, "loss": 0.5872, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.126677468419075, "rewards/margins": -0.34892508387565613, "rewards/rejected": 0.22224760055541992, "step": 1392 }, { "epoch": 0.23277156682796596, "grad_norm": 10.313899040222168, "learning_rate": 1.767228433172034e-05, "logits/chosen": -0.7358344197273254, "logits/rejected": -0.7775822281837463, "logps/chosen": -46.78559494018555, "logps/rejected": -108.4854736328125, "loss": 0.7376, "rewards/accuracies": 1.0, "rewards/chosen": -0.04030152037739754, "rewards/margins": 2.3168210983276367, "rewards/rejected": -2.3571226596832275, "step": 1395 }, { "epoch": 0.23327215084264977, "grad_norm": 31.918949127197266, "learning_rate": 1.7667278491573503e-05, "logits/chosen": -0.6332667469978333, "logits/rejected": -0.6215240359306335, "logps/chosen": -113.93869018554688, "logps/rejected": -74.81441497802734, "loss": 0.4223, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29547157883644104, "rewards/margins": -0.2509455680847168, "rewards/rejected": -0.04452604055404663, "step": 1398 }, { "epoch": 0.23377273485733355, "grad_norm": 5.692457675933838, "learning_rate": 1.7662272651426665e-05, "logits/chosen": -0.7266442775726318, "logits/rejected": -0.7543144226074219, "logps/chosen": -28.30662727355957, "logps/rejected": -53.690372467041016, "loss": 0.3793, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.43875351548194885, "rewards/margins": 1.0050987005233765, "rewards/rejected": -0.5663452744483948, "step": 1401 }, { "epoch": 0.23427331887201736, "grad_norm": 35.28190994262695, "learning_rate": 1.7657266811279827e-05, "logits/chosen": -0.7061331272125244, "logits/rejected": -0.7011720538139343, "logps/chosen": -71.84046936035156, "logps/rejected": -95.5965347290039, "loss": 0.4892, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6983246803283691, "rewards/margins": 1.748438835144043, "rewards/rejected": -1.0501141548156738, "step": 1404 }, { "epoch": 0.23477390288670116, "grad_norm": 51.749664306640625, "learning_rate": 1.7652260971132992e-05, "logits/chosen": -0.6743314862251282, "logits/rejected": -0.7093523144721985, "logps/chosen": -43.24635696411133, "logps/rejected": -70.21375274658203, "loss": 0.7157, "rewards/accuracies": 1.0, "rewards/chosen": 1.208142876625061, "rewards/margins": 1.0644317865371704, "rewards/rejected": 0.14371109008789062, "step": 1407 }, { "epoch": 0.23527448690138494, "grad_norm": 19.817522048950195, "learning_rate": 1.764725513098615e-05, "logits/chosen": -0.7556647658348083, "logits/rejected": -0.7682566046714783, "logps/chosen": -44.1040153503418, "logps/rejected": -59.850032806396484, "loss": 0.5916, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.481560617685318, "rewards/margins": 0.36492279171943665, "rewards/rejected": 0.11663781851530075, "step": 1410 }, { "epoch": 0.23577507091606875, "grad_norm": 14.324979782104492, "learning_rate": 1.7642249290839316e-05, "logits/chosen": -0.8554025292396545, "logits/rejected": -0.847601592540741, "logps/chosen": -62.60888671875, "logps/rejected": -79.03056335449219, "loss": 0.4892, "rewards/accuracies": 1.0, "rewards/chosen": 0.0844518169760704, "rewards/margins": 1.9274548292160034, "rewards/rejected": -1.8430029153823853, "step": 1413 }, { "epoch": 0.23627565493075253, "grad_norm": 9.577752113342285, "learning_rate": 1.7637243450692478e-05, "logits/chosen": -0.9367254376411438, "logits/rejected": -0.8601172566413879, "logps/chosen": -119.400390625, "logps/rejected": -85.54755401611328, "loss": 0.6035, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.14853185415267944, "rewards/margins": 1.2564245462417603, "rewards/rejected": -1.404956340789795, "step": 1416 }, { "epoch": 0.23677623894543634, "grad_norm": 30.4339656829834, "learning_rate": 1.7632237610545636e-05, "logits/chosen": -0.6098413467407227, "logits/rejected": -0.6872169375419617, "logps/chosen": -64.5599594116211, "logps/rejected": -134.79310607910156, "loss": 0.3772, "rewards/accuracies": 1.0, "rewards/chosen": 0.6417327523231506, "rewards/margins": 1.5708751678466797, "rewards/rejected": -0.9291422963142395, "step": 1419 }, { "epoch": 0.23727682296012015, "grad_norm": 41.206233978271484, "learning_rate": 1.76272317703988e-05, "logits/chosen": -0.7449240684509277, "logits/rejected": -0.7689297199249268, "logps/chosen": -90.85519409179688, "logps/rejected": -102.02532958984375, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.1466519832611084, "rewards/margins": 1.2216957807540894, "rewards/rejected": -1.075043797492981, "step": 1422 }, { "epoch": 0.23777740697480393, "grad_norm": 51.36943435668945, "learning_rate": 1.7622225930251963e-05, "logits/chosen": -0.7477259635925293, "logits/rejected": -0.7634846568107605, "logps/chosen": -66.478271484375, "logps/rejected": -102.00786590576172, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 0.39562925696372986, "rewards/margins": 2.4412269592285156, "rewards/rejected": -2.045597553253174, "step": 1425 }, { "epoch": 0.23827799098948774, "grad_norm": 56.17255401611328, "learning_rate": 1.7617220090105125e-05, "logits/chosen": -0.8206016421318054, "logits/rejected": -0.805898129940033, "logps/chosen": -75.90736389160156, "logps/rejected": -89.11279296875, "loss": 1.1424, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.08256606757640839, "rewards/margins": 0.2558804750442505, "rewards/rejected": -0.3384465277194977, "step": 1428 }, { "epoch": 0.23877857500417154, "grad_norm": 11.697542190551758, "learning_rate": 1.7612214249958287e-05, "logits/chosen": -0.6793883442878723, "logits/rejected": -0.6752644181251526, "logps/chosen": -96.16512298583984, "logps/rejected": -112.2759017944336, "loss": 0.4028, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7507777810096741, "rewards/margins": 0.7695716023445129, "rewards/rejected": -1.5203495025634766, "step": 1431 }, { "epoch": 0.23927915901885533, "grad_norm": 14.310315132141113, "learning_rate": 1.760720840981145e-05, "logits/chosen": -0.8019158840179443, "logits/rejected": -0.8295445442199707, "logps/chosen": -72.4179458618164, "logps/rejected": -118.28177642822266, "loss": 0.2527, "rewards/accuracies": 1.0, "rewards/chosen": -0.7214563488960266, "rewards/margins": 1.5153851509094238, "rewards/rejected": -2.2368414402008057, "step": 1434 }, { "epoch": 0.23977974303353913, "grad_norm": 37.32477951049805, "learning_rate": 1.760220256966461e-05, "logits/chosen": -0.6509774327278137, "logits/rejected": -0.6607632637023926, "logps/chosen": -120.68499755859375, "logps/rejected": -135.61973571777344, "loss": 0.7456, "rewards/accuracies": 1.0, "rewards/chosen": -1.136370301246643, "rewards/margins": 1.0396116971969604, "rewards/rejected": -2.1759822368621826, "step": 1437 }, { "epoch": 0.2402803270482229, "grad_norm": 28.463510513305664, "learning_rate": 1.7597196729517772e-05, "logits/chosen": -0.7703172564506531, "logits/rejected": -0.7924044728279114, "logps/chosen": -118.0400390625, "logps/rejected": -145.87059020996094, "loss": 0.8972, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.734275460243225, "rewards/margins": -0.14244723320007324, "rewards/rejected": -1.5918283462524414, "step": 1440 }, { "epoch": 0.24078091106290672, "grad_norm": 36.895694732666016, "learning_rate": 1.7592190889370934e-05, "logits/chosen": -0.8227425217628479, "logits/rejected": -0.8308501243591309, "logps/chosen": -95.5701904296875, "logps/rejected": -95.3317642211914, "loss": 0.5158, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.03454658389091492, "rewards/margins": 3.1941394805908203, "rewards/rejected": -3.2286856174468994, "step": 1443 }, { "epoch": 0.24128149507759053, "grad_norm": 28.56867790222168, "learning_rate": 1.7587185049224096e-05, "logits/chosen": -0.8287420868873596, "logits/rejected": -0.788926899433136, "logps/chosen": -104.302001953125, "logps/rejected": -83.07005310058594, "loss": 1.1008, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1683982610702515, "rewards/margins": -0.16246914863586426, "rewards/rejected": -1.0059291124343872, "step": 1446 }, { "epoch": 0.2417820790922743, "grad_norm": 31.277441024780273, "learning_rate": 1.758217920907726e-05, "logits/chosen": -0.6867057681083679, "logits/rejected": -0.7220490574836731, "logps/chosen": -58.133819580078125, "logps/rejected": -115.756103515625, "loss": 0.421, "rewards/accuracies": 1.0, "rewards/chosen": -0.03099656105041504, "rewards/margins": 3.880460500717163, "rewards/rejected": -3.911457061767578, "step": 1449 }, { "epoch": 0.24228266310695812, "grad_norm": 9.329375267028809, "learning_rate": 1.757717336893042e-05, "logits/chosen": -0.881709098815918, "logits/rejected": -0.9192050099372864, "logps/chosen": -36.24677658081055, "logps/rejected": -119.36473846435547, "loss": 0.4399, "rewards/accuracies": 1.0, "rewards/chosen": 0.08789438009262085, "rewards/margins": 1.9775303602218628, "rewards/rejected": -1.8896359205245972, "step": 1452 }, { "epoch": 0.24278324712164193, "grad_norm": 7.12981653213501, "learning_rate": 1.757216752878358e-05, "logits/chosen": -0.770003616809845, "logits/rejected": -0.8276870846748352, "logps/chosen": -83.25164794921875, "logps/rejected": -120.70491790771484, "loss": 0.4465, "rewards/accuracies": 1.0, "rewards/chosen": -0.047561511397361755, "rewards/margins": 1.4652212858200073, "rewards/rejected": -1.5127826929092407, "step": 1455 }, { "epoch": 0.2432838311363257, "grad_norm": 48.6910514831543, "learning_rate": 1.7567161688636746e-05, "logits/chosen": -0.6477674841880798, "logits/rejected": -0.6621039509773254, "logps/chosen": -78.39610290527344, "logps/rejected": -100.01605987548828, "loss": 0.4772, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5154547691345215, "rewards/margins": 1.3011741638183594, "rewards/rejected": -1.8166288137435913, "step": 1458 }, { "epoch": 0.24378441515100951, "grad_norm": 40.154090881347656, "learning_rate": 1.7562155848489904e-05, "logits/chosen": -0.7550749778747559, "logits/rejected": -0.8547286987304688, "logps/chosen": -34.12419509887695, "logps/rejected": -129.38218688964844, "loss": 0.6613, "rewards/accuracies": 1.0, "rewards/chosen": 0.1465616673231125, "rewards/margins": 4.491122722625732, "rewards/rejected": -4.344560623168945, "step": 1461 }, { "epoch": 0.2442849991656933, "grad_norm": 20.574403762817383, "learning_rate": 1.755715000834307e-05, "logits/chosen": -0.6945971846580505, "logits/rejected": -0.7386805415153503, "logps/chosen": -60.177974700927734, "logps/rejected": -78.3365249633789, "loss": 0.7981, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7286598086357117, "rewards/margins": 0.5562655329704285, "rewards/rejected": -1.2849254608154297, "step": 1464 }, { "epoch": 0.2447855831803771, "grad_norm": 15.24002742767334, "learning_rate": 1.755214416819623e-05, "logits/chosen": -0.7810025215148926, "logits/rejected": -0.7904656529426575, "logps/chosen": -34.06245803833008, "logps/rejected": -77.57085418701172, "loss": 0.3614, "rewards/accuracies": 1.0, "rewards/chosen": -0.1332852840423584, "rewards/margins": 1.6731348037719727, "rewards/rejected": -1.8064203262329102, "step": 1467 }, { "epoch": 0.2452861671950609, "grad_norm": 39.83440017700195, "learning_rate": 1.7547138328049393e-05, "logits/chosen": -0.8897663950920105, "logits/rejected": -0.946861743927002, "logps/chosen": -48.519527435302734, "logps/rejected": -144.25721740722656, "loss": 0.4407, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.37582048773765564, "rewards/margins": 1.8595455884933472, "rewards/rejected": -1.4837251901626587, "step": 1470 }, { "epoch": 0.2457867512097447, "grad_norm": 16.207117080688477, "learning_rate": 1.7542132487902555e-05, "logits/chosen": -0.8994324803352356, "logits/rejected": -0.8697109818458557, "logps/chosen": -96.18958282470703, "logps/rejected": -70.45266723632812, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 0.4761834442615509, "rewards/margins": 1.7963165044784546, "rewards/rejected": -1.3201329708099365, "step": 1473 }, { "epoch": 0.2462873352244285, "grad_norm": 18.52305030822754, "learning_rate": 1.7537126647755717e-05, "logits/chosen": -0.8411853909492493, "logits/rejected": -0.8341304659843445, "logps/chosen": -121.9775390625, "logps/rejected": -121.64443969726562, "loss": 0.2171, "rewards/accuracies": 1.0, "rewards/chosen": -1.3144768476486206, "rewards/margins": 1.866662621498108, "rewards/rejected": -3.1811392307281494, "step": 1476 }, { "epoch": 0.2467879192391123, "grad_norm": 17.121623992919922, "learning_rate": 1.753212080760888e-05, "logits/chosen": -0.823836624622345, "logits/rejected": -0.8957588076591492, "logps/chosen": -97.48722076416016, "logps/rejected": -170.1110382080078, "loss": 0.8384, "rewards/accuracies": 1.0, "rewards/chosen": -0.02984221838414669, "rewards/margins": 2.649538278579712, "rewards/rejected": -2.679380416870117, "step": 1479 }, { "epoch": 0.2472885032537961, "grad_norm": 17.61093521118164, "learning_rate": 1.752711496746204e-05, "logits/chosen": -0.7211164832115173, "logits/rejected": -0.6842034459114075, "logps/chosen": -94.8259048461914, "logps/rejected": -86.7606201171875, "loss": 0.5358, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3892179727554321, "rewards/margins": 0.923000156879425, "rewards/rejected": -2.312218189239502, "step": 1482 }, { "epoch": 0.2477890872684799, "grad_norm": 12.669596672058105, "learning_rate": 1.7522109127315202e-05, "logits/chosen": -0.7500690817832947, "logits/rejected": -0.7706665992736816, "logps/chosen": -99.50726318359375, "logps/rejected": -108.69615936279297, "loss": 0.5045, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9414228200912476, "rewards/margins": 1.1212420463562012, "rewards/rejected": -3.0626649856567383, "step": 1485 }, { "epoch": 0.2482896712831637, "grad_norm": 13.275140762329102, "learning_rate": 1.7517103287168364e-05, "logits/chosen": -0.7871794104576111, "logits/rejected": -0.8302497863769531, "logps/chosen": -62.67306137084961, "logps/rejected": -128.0399169921875, "loss": 0.2755, "rewards/accuracies": 1.0, "rewards/chosen": -0.12812583148479462, "rewards/margins": 2.436614990234375, "rewards/rejected": -2.5647408962249756, "step": 1488 }, { "epoch": 0.24879025529784748, "grad_norm": 21.497112274169922, "learning_rate": 1.7512097447021526e-05, "logits/chosen": -0.6801228523254395, "logits/rejected": -0.8645017147064209, "logps/chosen": -35.23634719848633, "logps/rejected": -138.1218719482422, "loss": 0.627, "rewards/accuracies": 1.0, "rewards/chosen": -0.6164463758468628, "rewards/margins": 2.163191080093384, "rewards/rejected": -2.779637575149536, "step": 1491 }, { "epoch": 0.2492908393125313, "grad_norm": 40.60688400268555, "learning_rate": 1.7507091606874688e-05, "logits/chosen": -0.8473336100578308, "logits/rejected": -0.9088197350502014, "logps/chosen": -84.75588989257812, "logps/rejected": -149.41697692871094, "loss": 0.6967, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3758773803710938, "rewards/margins": 0.6975984573364258, "rewards/rejected": -2.0734760761260986, "step": 1494 }, { "epoch": 0.24979142332721507, "grad_norm": 10.837547302246094, "learning_rate": 1.750208576672785e-05, "logits/chosen": -0.7999324202537537, "logits/rejected": -0.7340612411499023, "logps/chosen": -68.33758544921875, "logps/rejected": -86.5186538696289, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 0.28356435894966125, "rewards/margins": 1.5882428884506226, "rewards/rejected": -1.3046784400939941, "step": 1497 }, { "epoch": 0.2502920073418989, "grad_norm": 47.146690368652344, "learning_rate": 1.7497079926581015e-05, "logits/chosen": -0.7711662650108337, "logits/rejected": -0.810645341873169, "logps/chosen": -46.86945724487305, "logps/rejected": -67.49018096923828, "loss": 0.6458, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5919141173362732, "rewards/margins": -0.21474480628967285, "rewards/rejected": -0.37716928124427795, "step": 1500 }, { "epoch": 0.25079259135658266, "grad_norm": 20.72958755493164, "learning_rate": 1.7492074086434173e-05, "logits/chosen": -0.7257005572319031, "logits/rejected": -0.7532842755317688, "logps/chosen": -77.52152252197266, "logps/rejected": -110.96075439453125, "loss": 0.5857, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5361270904541016, "rewards/margins": 2.4470205307006836, "rewards/rejected": -2.9831478595733643, "step": 1503 }, { "epoch": 0.2512931753712665, "grad_norm": 10.748506546020508, "learning_rate": 1.7487068246287338e-05, "logits/chosen": -0.6230623722076416, "logits/rejected": -0.7158913612365723, "logps/chosen": -44.343048095703125, "logps/rejected": -142.25372314453125, "loss": 0.8549, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.13883429765701294, "rewards/margins": 0.9785154461860657, "rewards/rejected": -1.1173497438430786, "step": 1506 }, { "epoch": 0.2517937593859503, "grad_norm": 8.697507858276367, "learning_rate": 1.74820624061405e-05, "logits/chosen": -0.7266430258750916, "logits/rejected": -0.7198823094367981, "logps/chosen": -87.67279815673828, "logps/rejected": -85.40668487548828, "loss": 0.3762, "rewards/accuracies": 1.0, "rewards/chosen": -0.29984810948371887, "rewards/margins": 1.2723244428634644, "rewards/rejected": -1.5721725225448608, "step": 1509 }, { "epoch": 0.25229434340063406, "grad_norm": 16.960391998291016, "learning_rate": 1.747705656599366e-05, "logits/chosen": -0.7253556251525879, "logits/rejected": -0.7122480869293213, "logps/chosen": -89.77933502197266, "logps/rejected": -82.9645767211914, "loss": 0.4532, "rewards/accuracies": 1.0, "rewards/chosen": -0.37853333353996277, "rewards/margins": 0.7904589772224426, "rewards/rejected": -1.1689924001693726, "step": 1512 }, { "epoch": 0.2527949274153179, "grad_norm": 38.24057388305664, "learning_rate": 1.7472050725846824e-05, "logits/chosen": -0.654330849647522, "logits/rejected": -0.5706191062927246, "logps/chosen": -83.58966827392578, "logps/rejected": -64.5629653930664, "loss": 0.7203, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9190184473991394, "rewards/margins": 0.4042421281337738, "rewards/rejected": -1.3232604265213013, "step": 1515 }, { "epoch": 0.2532955114300017, "grad_norm": 22.83722496032715, "learning_rate": 1.7467044885699985e-05, "logits/chosen": -0.8415301442146301, "logits/rejected": -0.8033193945884705, "logps/chosen": -96.40127563476562, "logps/rejected": -65.68797302246094, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": -1.6172046661376953, "rewards/margins": 1.445643424987793, "rewards/rejected": -3.0628480911254883, "step": 1518 }, { "epoch": 0.25379609544468545, "grad_norm": 5.468020915985107, "learning_rate": 1.7462039045553147e-05, "logits/chosen": -0.5557944178581238, "logits/rejected": -0.6879435181617737, "logps/chosen": -38.01266098022461, "logps/rejected": -178.6471405029297, "loss": 0.3052, "rewards/accuracies": 1.0, "rewards/chosen": 0.46982526779174805, "rewards/margins": 5.13545560836792, "rewards/rejected": -4.665629863739014, "step": 1521 }, { "epoch": 0.2542966794593693, "grad_norm": 14.856036186218262, "learning_rate": 1.745703320540631e-05, "logits/chosen": -0.9544763565063477, "logits/rejected": -0.9982609748840332, "logps/chosen": -90.5261001586914, "logps/rejected": -160.6277313232422, "loss": 0.2467, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.875128448009491, "rewards/margins": 1.7362514734268188, "rewards/rejected": -2.611379861831665, "step": 1524 }, { "epoch": 0.25479726347405307, "grad_norm": 25.440282821655273, "learning_rate": 1.745202736525947e-05, "logits/chosen": -0.8870159983634949, "logits/rejected": -0.8841062188148499, "logps/chosen": -135.13719177246094, "logps/rejected": -144.12091064453125, "loss": 0.8734, "rewards/accuracies": 1.0, "rewards/chosen": -0.09892349690198898, "rewards/margins": 1.5804907083511353, "rewards/rejected": -1.6794142723083496, "step": 1527 }, { "epoch": 0.25529784748873685, "grad_norm": 9.707795143127441, "learning_rate": 1.7447021525112633e-05, "logits/chosen": -0.7767565846443176, "logits/rejected": -0.821166455745697, "logps/chosen": -73.0248794555664, "logps/rejected": -91.90091705322266, "loss": 0.3837, "rewards/accuracies": 1.0, "rewards/chosen": -1.1815463304519653, "rewards/margins": 1.5588802099227905, "rewards/rejected": -2.7404263019561768, "step": 1530 }, { "epoch": 0.25579843150342063, "grad_norm": 23.02920150756836, "learning_rate": 1.7442015684965794e-05, "logits/chosen": -0.6152187585830688, "logits/rejected": -0.7467724680900574, "logps/chosen": -79.85611724853516, "logps/rejected": -154.79931640625, "loss": 0.7406, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8978349566459656, "rewards/margins": 1.2559797763824463, "rewards/rejected": -2.1538145542144775, "step": 1533 }, { "epoch": 0.25629901551810447, "grad_norm": 16.245603561401367, "learning_rate": 1.7437009844818956e-05, "logits/chosen": -0.8565535545349121, "logits/rejected": -0.8619733452796936, "logps/chosen": -110.52774047851562, "logps/rejected": -125.87125396728516, "loss": 0.2313, "rewards/accuracies": 1.0, "rewards/chosen": -1.6302088499069214, "rewards/margins": 2.303239583969116, "rewards/rejected": -3.933448076248169, "step": 1536 }, { "epoch": 0.25679959953278825, "grad_norm": 21.43558120727539, "learning_rate": 1.7432004004672118e-05, "logits/chosen": -0.8540325164794922, "logits/rejected": -0.793900191783905, "logps/chosen": -82.48239135742188, "logps/rejected": -89.07328033447266, "loss": 0.4717, "rewards/accuracies": 1.0, "rewards/chosen": -0.051090795546770096, "rewards/margins": 2.0532963275909424, "rewards/rejected": -2.104387044906616, "step": 1539 }, { "epoch": 0.25730018354747203, "grad_norm": 75.13682556152344, "learning_rate": 1.7426998164525283e-05, "logits/chosen": -0.6298723816871643, "logits/rejected": -0.7395370006561279, "logps/chosen": -59.30070877075195, "logps/rejected": -159.38426208496094, "loss": 0.9165, "rewards/accuracies": 1.0, "rewards/chosen": -0.2221725583076477, "rewards/margins": 4.010432243347168, "rewards/rejected": -4.232604503631592, "step": 1542 }, { "epoch": 0.25780076756215586, "grad_norm": 59.30804443359375, "learning_rate": 1.742199232437844e-05, "logits/chosen": -0.7429110407829285, "logits/rejected": -0.7550830841064453, "logps/chosen": -88.9181900024414, "logps/rejected": -108.38247680664062, "loss": 0.8399, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8155872821807861, "rewards/margins": 1.9190106391906738, "rewards/rejected": -2.73459792137146, "step": 1545 }, { "epoch": 0.25830135157683964, "grad_norm": 28.513534545898438, "learning_rate": 1.7416986484231603e-05, "logits/chosen": -0.7025863528251648, "logits/rejected": -0.7731030583381653, "logps/chosen": -60.505863189697266, "logps/rejected": -148.65402221679688, "loss": 0.7793, "rewards/accuracies": 1.0, "rewards/chosen": 0.13918010890483856, "rewards/margins": 3.339585304260254, "rewards/rejected": -3.2004051208496094, "step": 1548 }, { "epoch": 0.2588019355915234, "grad_norm": 9.769996643066406, "learning_rate": 1.741198064408477e-05, "logits/chosen": -0.5356557369232178, "logits/rejected": -0.5419954657554626, "logps/chosen": -110.07025146484375, "logps/rejected": -148.5365447998047, "loss": 1.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.3590278625488281, "rewards/margins": 1.6320348978042603, "rewards/rejected": -2.991062879562378, "step": 1551 }, { "epoch": 0.25930251960620726, "grad_norm": 43.5840950012207, "learning_rate": 1.7406974803937927e-05, "logits/chosen": -0.7282806038856506, "logits/rejected": -0.7261146903038025, "logps/chosen": -75.18692016601562, "logps/rejected": -83.74568939208984, "loss": 0.7513, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.338687539100647, "rewards/margins": -0.27491387724876404, "rewards/rejected": -1.06377375125885, "step": 1554 }, { "epoch": 0.25980310362089104, "grad_norm": 20.0660400390625, "learning_rate": 1.7401968963791092e-05, "logits/chosen": -0.7798879742622375, "logits/rejected": -0.7718772888183594, "logps/chosen": -53.23771667480469, "logps/rejected": -97.246826171875, "loss": 0.4255, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3943718373775482, "rewards/margins": 3.105365037918091, "rewards/rejected": -2.7109930515289307, "step": 1557 }, { "epoch": 0.2603036876355748, "grad_norm": 22.317264556884766, "learning_rate": 1.7396963123644254e-05, "logits/chosen": -0.7420122623443604, "logits/rejected": -0.7415937781333923, "logps/chosen": -101.01725006103516, "logps/rejected": -119.80431365966797, "loss": 1.2232, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7491142153739929, "rewards/margins": 1.4718661308288574, "rewards/rejected": -2.220980405807495, "step": 1560 }, { "epoch": 0.26080427165025866, "grad_norm": 20.10280990600586, "learning_rate": 1.7391957283497416e-05, "logits/chosen": -0.8760104775428772, "logits/rejected": -0.752991259098053, "logps/chosen": -157.6231231689453, "logps/rejected": -75.84233856201172, "loss": 0.5417, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.1277354508638382, "rewards/margins": 0.2442615032196045, "rewards/rejected": -0.37199699878692627, "step": 1563 }, { "epoch": 0.26130485566494244, "grad_norm": 10.580503463745117, "learning_rate": 1.7386951443350577e-05, "logits/chosen": -0.8489119410514832, "logits/rejected": -0.8350103497505188, "logps/chosen": -55.763797760009766, "logps/rejected": -70.94808197021484, "loss": 0.3668, "rewards/accuracies": 1.0, "rewards/chosen": 0.2232709676027298, "rewards/margins": 0.8023713231086731, "rewards/rejected": -0.5791003704071045, "step": 1566 }, { "epoch": 0.2618054396796262, "grad_norm": 13.829645156860352, "learning_rate": 1.738194560320374e-05, "logits/chosen": -0.5399713516235352, "logits/rejected": -0.5648065209388733, "logps/chosen": -88.2224349975586, "logps/rejected": -102.67049407958984, "loss": 0.6586, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7603347897529602, "rewards/margins": 0.608917236328125, "rewards/rejected": -1.36925208568573, "step": 1569 }, { "epoch": 0.26230602369431005, "grad_norm": 18.937602996826172, "learning_rate": 1.73769397630569e-05, "logits/chosen": -0.7541562914848328, "logits/rejected": -0.7310267090797424, "logps/chosen": -70.5789566040039, "logps/rejected": -54.518733978271484, "loss": 0.6784, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.27875658869743347, "rewards/margins": -0.7970037460327148, "rewards/rejected": 0.5182471871376038, "step": 1572 }, { "epoch": 0.26280660770899383, "grad_norm": 17.400375366210938, "learning_rate": 1.7371933922910063e-05, "logits/chosen": -0.7049347758293152, "logits/rejected": -0.7126789689064026, "logps/chosen": -55.23308181762695, "logps/rejected": -63.874691009521484, "loss": 0.9007, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.15706326067447662, "rewards/margins": -0.580818772315979, "rewards/rejected": 0.4237555265426636, "step": 1575 }, { "epoch": 0.2633071917236776, "grad_norm": 20.045791625976562, "learning_rate": 1.7366928082763225e-05, "logits/chosen": -0.6482511162757874, "logits/rejected": -0.7186039090156555, "logps/chosen": -118.578857421875, "logps/rejected": -157.86598205566406, "loss": 0.394, "rewards/accuracies": 1.0, "rewards/chosen": -1.2469855546951294, "rewards/margins": 0.8091050982475281, "rewards/rejected": -2.0560905933380127, "step": 1578 }, { "epoch": 0.2638077757383614, "grad_norm": 28.225191116333008, "learning_rate": 1.7361922242616386e-05, "logits/chosen": -0.7609429359436035, "logits/rejected": -0.8776403069496155, "logps/chosen": -133.27305603027344, "logps/rejected": -135.79725646972656, "loss": 0.5724, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.22927361726760864, "rewards/margins": -0.08592110872268677, "rewards/rejected": -0.14335250854492188, "step": 1581 }, { "epoch": 0.26430835975304523, "grad_norm": 22.05215072631836, "learning_rate": 1.735691640246955e-05, "logits/chosen": -0.6886124610900879, "logits/rejected": -0.684091329574585, "logps/chosen": -76.74842071533203, "logps/rejected": -74.58596801757812, "loss": 0.8956, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.06359553337097168, "rewards/margins": 0.5674341320991516, "rewards/rejected": -0.6310296654701233, "step": 1584 }, { "epoch": 0.264808943767729, "grad_norm": 12.17245864868164, "learning_rate": 1.735191056232271e-05, "logits/chosen": -0.8348298668861389, "logits/rejected": -0.8450069427490234, "logps/chosen": -67.15042877197266, "logps/rejected": -137.73892211914062, "loss": 0.6111, "rewards/accuracies": 1.0, "rewards/chosen": -0.4168325364589691, "rewards/margins": 2.58937931060791, "rewards/rejected": -3.006211996078491, "step": 1587 }, { "epoch": 0.2653095277824128, "grad_norm": 17.419849395751953, "learning_rate": 1.7346904722175872e-05, "logits/chosen": -0.7663359642028809, "logits/rejected": -0.7844097018241882, "logps/chosen": -83.47576904296875, "logps/rejected": -116.09075927734375, "loss": 0.5099, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.012486271560192108, "rewards/margins": 0.8768434524536133, "rewards/rejected": -0.8893296718597412, "step": 1590 }, { "epoch": 0.2658101117970966, "grad_norm": 11.213963508605957, "learning_rate": 1.7341898882029037e-05, "logits/chosen": -0.9095010757446289, "logits/rejected": -0.9200587272644043, "logps/chosen": -94.76532745361328, "logps/rejected": -114.0147476196289, "loss": 0.2658, "rewards/accuracies": 1.0, "rewards/chosen": -0.4573674499988556, "rewards/margins": 0.9167154431343079, "rewards/rejected": -1.3740829229354858, "step": 1593 }, { "epoch": 0.2663106958117804, "grad_norm": 21.37502670288086, "learning_rate": 1.7336893041882195e-05, "logits/chosen": -0.5708044171333313, "logits/rejected": -0.6352770328521729, "logps/chosen": -73.86487579345703, "logps/rejected": -113.61283111572266, "loss": 0.6977, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.08514251559972763, "rewards/margins": 0.7218427658081055, "rewards/rejected": -0.8069851994514465, "step": 1596 }, { "epoch": 0.2668112798264642, "grad_norm": 23.17128562927246, "learning_rate": 1.733188720173536e-05, "logits/chosen": -0.826915979385376, "logits/rejected": -0.8326261043548584, "logps/chosen": -64.92417907714844, "logps/rejected": -61.87002182006836, "loss": 0.659, "rewards/accuracies": 1.0, "rewards/chosen": 0.04281453415751457, "rewards/margins": 0.5457473993301392, "rewards/rejected": -0.5029328465461731, "step": 1599 }, { "epoch": 0.267311863841148, "grad_norm": 27.08679962158203, "learning_rate": 1.7326881361588522e-05, "logits/chosen": -0.7738103270530701, "logits/rejected": -0.8127413392066956, "logps/chosen": -110.26284790039062, "logps/rejected": -220.3611602783203, "loss": 0.2953, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2020416259765625, "rewards/margins": 0.7787678837776184, "rewards/rejected": -1.9808095693588257, "step": 1602 }, { "epoch": 0.2678124478558318, "grad_norm": 8.778404235839844, "learning_rate": 1.732187552144168e-05, "logits/chosen": -0.7314310073852539, "logits/rejected": -0.7271873950958252, "logps/chosen": -125.71856689453125, "logps/rejected": -130.81268310546875, "loss": 0.6879, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.45438385009765625, "rewards/margins": 1.3999017477035522, "rewards/rejected": -1.8542855978012085, "step": 1605 }, { "epoch": 0.2683130318705156, "grad_norm": 8.26904582977295, "learning_rate": 1.7316869681294846e-05, "logits/chosen": -0.7125261425971985, "logits/rejected": -0.6944610476493835, "logps/chosen": -81.46514892578125, "logps/rejected": -131.6710968017578, "loss": 0.3751, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.9207829833030701, "rewards/margins": 2.4236643314361572, "rewards/rejected": -1.502881407737732, "step": 1608 }, { "epoch": 0.2688136158851994, "grad_norm": 41.432464599609375, "learning_rate": 1.7311863841148008e-05, "logits/chosen": -0.554027795791626, "logits/rejected": -0.6244040131568909, "logps/chosen": -149.6868133544922, "logps/rejected": -184.76544189453125, "loss": 0.6517, "rewards/accuracies": 1.0, "rewards/chosen": -0.6806973814964294, "rewards/margins": 0.6340447068214417, "rewards/rejected": -1.314742088317871, "step": 1611 }, { "epoch": 0.2693141998998832, "grad_norm": 30.762765884399414, "learning_rate": 1.730685800100117e-05, "logits/chosen": -0.8240781426429749, "logits/rejected": -0.9054374694824219, "logps/chosen": -54.8974494934082, "logps/rejected": -135.66058349609375, "loss": 0.6804, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.47747182846069336, "rewards/margins": 1.9437488317489624, "rewards/rejected": -1.4662771224975586, "step": 1614 }, { "epoch": 0.269814783914567, "grad_norm": 70.69098663330078, "learning_rate": 1.730185216085433e-05, "logits/chosen": -0.5922824740409851, "logits/rejected": -0.6459053158760071, "logps/chosen": -87.03034210205078, "logps/rejected": -201.86415100097656, "loss": 0.6119, "rewards/accuracies": 1.0, "rewards/chosen": -0.4464748799800873, "rewards/margins": 2.353452205657959, "rewards/rejected": -2.799926996231079, "step": 1617 }, { "epoch": 0.2703153679292508, "grad_norm": 11.054139137268066, "learning_rate": 1.7296846320707493e-05, "logits/chosen": -0.5543078780174255, "logits/rejected": -0.6298578381538391, "logps/chosen": -32.62651062011719, "logps/rejected": -105.8588638305664, "loss": 0.2673, "rewards/accuracies": 1.0, "rewards/chosen": 0.8348062038421631, "rewards/margins": 2.8378307819366455, "rewards/rejected": -2.0030248165130615, "step": 1620 }, { "epoch": 0.2708159519439346, "grad_norm": 17.971515655517578, "learning_rate": 1.7291840480560655e-05, "logits/chosen": -0.6702277660369873, "logits/rejected": -0.7247174382209778, "logps/chosen": -62.837162017822266, "logps/rejected": -134.0081329345703, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": -0.18883629143238068, "rewards/margins": 2.7589271068573, "rewards/rejected": -2.947763204574585, "step": 1623 }, { "epoch": 0.2713165359586184, "grad_norm": 23.098461151123047, "learning_rate": 1.7286834640413817e-05, "logits/chosen": -0.6671536564826965, "logits/rejected": -0.6668103337287903, "logps/chosen": -92.82192993164062, "logps/rejected": -94.6788101196289, "loss": 0.3271, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7521801590919495, "rewards/margins": 0.9626132845878601, "rewards/rejected": -1.7147935628890991, "step": 1626 }, { "epoch": 0.2718171199733022, "grad_norm": 20.76407814025879, "learning_rate": 1.728182880026698e-05, "logits/chosen": -0.8728942275047302, "logits/rejected": -0.8838784694671631, "logps/chosen": -75.11857604980469, "logps/rejected": -109.9517593383789, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 0.6668737530708313, "rewards/margins": 1.6738532781600952, "rewards/rejected": -1.0069794654846191, "step": 1629 }, { "epoch": 0.272317703987986, "grad_norm": 23.992868423461914, "learning_rate": 1.727682296012014e-05, "logits/chosen": -0.732765257358551, "logits/rejected": -0.7685745358467102, "logps/chosen": -78.36827850341797, "logps/rejected": -81.07926940917969, "loss": 0.5524, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0243092775344849, "rewards/margins": 0.6505404114723206, "rewards/rejected": -1.6748496294021606, "step": 1632 }, { "epoch": 0.2728182880026698, "grad_norm": 28.56195068359375, "learning_rate": 1.7271817119973306e-05, "logits/chosen": -0.7361524105072021, "logits/rejected": -0.6984057426452637, "logps/chosen": -133.7798614501953, "logps/rejected": -90.79785919189453, "loss": 0.846, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.08090133219957352, "rewards/margins": 0.041181694716215134, "rewards/rejected": 0.03971964120864868, "step": 1635 }, { "epoch": 0.27331887201735355, "grad_norm": 26.51972007751465, "learning_rate": 1.7266811279826464e-05, "logits/chosen": -0.743392288684845, "logits/rejected": -0.737536609172821, "logps/chosen": -83.0455093383789, "logps/rejected": -113.20673370361328, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": -0.8601956367492676, "rewards/margins": 1.2086080312728882, "rewards/rejected": -2.068803548812866, "step": 1638 }, { "epoch": 0.2738194560320374, "grad_norm": 11.435738563537598, "learning_rate": 1.726180543967963e-05, "logits/chosen": -0.6721405386924744, "logits/rejected": -0.7212746739387512, "logps/chosen": -68.05096435546875, "logps/rejected": -104.79843139648438, "loss": 0.7442, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10533968359231949, "rewards/margins": 1.5912500619888306, "rewards/rejected": -1.6965898275375366, "step": 1641 }, { "epoch": 0.27432004004672117, "grad_norm": 12.021963119506836, "learning_rate": 1.725679959953279e-05, "logits/chosen": -0.7821572422981262, "logits/rejected": -0.7650159001350403, "logps/chosen": -94.94991302490234, "logps/rejected": -88.9824447631836, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": -0.5803468823432922, "rewards/margins": 1.2248553037643433, "rewards/rejected": -1.8052021265029907, "step": 1644 }, { "epoch": 0.27482062406140495, "grad_norm": 35.55992889404297, "learning_rate": 1.725179375938595e-05, "logits/chosen": -0.568465530872345, "logits/rejected": -0.7174608707427979, "logps/chosen": -29.460657119750977, "logps/rejected": -159.21749877929688, "loss": 0.888, "rewards/accuracies": 1.0, "rewards/chosen": 0.6502434611320496, "rewards/margins": 5.674160480499268, "rewards/rejected": -5.0239176750183105, "step": 1647 }, { "epoch": 0.2753212080760888, "grad_norm": 42.49506378173828, "learning_rate": 1.7246787919239115e-05, "logits/chosen": -0.6902018189430237, "logits/rejected": -0.7007099986076355, "logps/chosen": -71.82865142822266, "logps/rejected": -96.08142852783203, "loss": 0.9536, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9449464678764343, "rewards/margins": -0.04366741701960564, "rewards/rejected": -0.9012791514396667, "step": 1650 }, { "epoch": 0.27582179209077257, "grad_norm": 31.737131118774414, "learning_rate": 1.7241782079092276e-05, "logits/chosen": -0.6922162175178528, "logits/rejected": -0.7036904692649841, "logps/chosen": -88.77165985107422, "logps/rejected": -114.58465576171875, "loss": 0.471, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6662191152572632, "rewards/margins": 1.5242137908935547, "rewards/rejected": -3.1904327869415283, "step": 1653 }, { "epoch": 0.27632237610545635, "grad_norm": 44.461246490478516, "learning_rate": 1.7236776238945438e-05, "logits/chosen": -1.0522154569625854, "logits/rejected": -0.9496886134147644, "logps/chosen": -124.5841293334961, "logps/rejected": -66.6957778930664, "loss": 1.7988, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.132148504257202, "rewards/margins": -0.4332611560821533, "rewards/rejected": -1.6988873481750488, "step": 1656 }, { "epoch": 0.2768229601201402, "grad_norm": 43.862144470214844, "learning_rate": 1.72317703987986e-05, "logits/chosen": -0.8428245186805725, "logits/rejected": -0.9132785201072693, "logps/chosen": -62.26719284057617, "logps/rejected": -128.34783935546875, "loss": 0.4862, "rewards/accuracies": 1.0, "rewards/chosen": 0.5459632873535156, "rewards/margins": 2.5860559940338135, "rewards/rejected": -2.040092706680298, "step": 1659 }, { "epoch": 0.27732354413482396, "grad_norm": 32.792118072509766, "learning_rate": 1.7226764558651762e-05, "logits/chosen": -0.8664596676826477, "logits/rejected": -0.8912467956542969, "logps/chosen": -93.76749420166016, "logps/rejected": -91.32147216796875, "loss": 0.7906, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.784235954284668, "rewards/margins": -0.8539404273033142, "rewards/rejected": -1.9302955865859985, "step": 1662 }, { "epoch": 0.27782412814950774, "grad_norm": 35.95106887817383, "learning_rate": 1.7221758718504924e-05, "logits/chosen": -0.7394234538078308, "logits/rejected": -0.836131751537323, "logps/chosen": -77.71109771728516, "logps/rejected": -145.4647674560547, "loss": 0.4994, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.689075231552124, "rewards/margins": 1.4057111740112305, "rewards/rejected": -2.0947864055633545, "step": 1665 }, { "epoch": 0.2783247121641916, "grad_norm": 17.0372314453125, "learning_rate": 1.7216752878358085e-05, "logits/chosen": -0.7098132967948914, "logits/rejected": -0.74226313829422, "logps/chosen": -61.72281265258789, "logps/rejected": -124.68465423583984, "loss": 0.6714, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.16173897683620453, "rewards/margins": 1.8911900520324707, "rewards/rejected": -2.052928924560547, "step": 1668 }, { "epoch": 0.27882529617887536, "grad_norm": 12.01474666595459, "learning_rate": 1.7211747038211247e-05, "logits/chosen": -0.7967488169670105, "logits/rejected": -0.7848665714263916, "logps/chosen": -92.90335083007812, "logps/rejected": -145.28759765625, "loss": 0.438, "rewards/accuracies": 1.0, "rewards/chosen": -0.032555896788835526, "rewards/margins": 1.4794459342956543, "rewards/rejected": -1.512001872062683, "step": 1671 }, { "epoch": 0.27932588019355914, "grad_norm": 35.06546401977539, "learning_rate": 1.720674119806441e-05, "logits/chosen": -0.8742832541465759, "logits/rejected": -0.8622459769248962, "logps/chosen": -92.93621063232422, "logps/rejected": -100.69153594970703, "loss": 0.4402, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6234008073806763, "rewards/margins": 1.2564325332641602, "rewards/rejected": -2.879833221435547, "step": 1674 }, { "epoch": 0.279826464208243, "grad_norm": 13.154982566833496, "learning_rate": 1.7201735357917574e-05, "logits/chosen": -0.6998047232627869, "logits/rejected": -0.6940675377845764, "logps/chosen": -75.4240493774414, "logps/rejected": -93.81302642822266, "loss": 0.4748, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2406032532453537, "rewards/margins": 1.189996600151062, "rewards/rejected": -1.4305996894836426, "step": 1677 }, { "epoch": 0.28032704822292676, "grad_norm": 27.585073471069336, "learning_rate": 1.7196729517770732e-05, "logits/chosen": -0.7308316230773926, "logits/rejected": -0.7724862098693848, "logps/chosen": -50.71378707885742, "logps/rejected": -106.6922378540039, "loss": 0.6354, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.43474695086479187, "rewards/margins": 1.4150093793869019, "rewards/rejected": -1.8497562408447266, "step": 1680 }, { "epoch": 0.28082763223761054, "grad_norm": 32.18365478515625, "learning_rate": 1.7191723677623894e-05, "logits/chosen": -0.6686344742774963, "logits/rejected": -0.6547979712486267, "logps/chosen": -97.53714752197266, "logps/rejected": -98.21270751953125, "loss": 0.5325, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.47805070877075195, "rewards/margins": 0.7573623061180115, "rewards/rejected": -1.2354129552841187, "step": 1683 }, { "epoch": 0.2813282162522943, "grad_norm": 29.261310577392578, "learning_rate": 1.718671783747706e-05, "logits/chosen": -0.8029695153236389, "logits/rejected": -0.8077278137207031, "logps/chosen": -86.84896087646484, "logps/rejected": -130.828125, "loss": 0.4878, "rewards/accuracies": 1.0, "rewards/chosen": -0.7108723521232605, "rewards/margins": 1.9284992218017578, "rewards/rejected": -2.639371633529663, "step": 1686 }, { "epoch": 0.28182880026697815, "grad_norm": 32.11146545410156, "learning_rate": 1.7181711997330218e-05, "logits/chosen": -0.7155942320823669, "logits/rejected": -0.6671925187110901, "logps/chosen": -66.58102416992188, "logps/rejected": -58.271728515625, "loss": 0.6234, "rewards/accuracies": 1.0, "rewards/chosen": 0.9622493386268616, "rewards/margins": 1.7692266702651978, "rewards/rejected": -0.8069774508476257, "step": 1689 }, { "epoch": 0.28232938428166193, "grad_norm": 31.55167007446289, "learning_rate": 1.7176706157183383e-05, "logits/chosen": -0.7751696109771729, "logits/rejected": -0.7716215252876282, "logps/chosen": -49.49278259277344, "logps/rejected": -74.70862579345703, "loss": 0.611, "rewards/accuracies": 1.0, "rewards/chosen": 0.31650373339653015, "rewards/margins": 1.7994041442871094, "rewards/rejected": -1.4829002618789673, "step": 1692 }, { "epoch": 0.2828299682963457, "grad_norm": 26.242279052734375, "learning_rate": 1.7171700317036545e-05, "logits/chosen": -0.5893658995628357, "logits/rejected": -0.60469651222229, "logps/chosen": -120.57485961914062, "logps/rejected": -114.05731201171875, "loss": 0.3736, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2483561038970947, "rewards/margins": 0.2780371606349945, "rewards/rejected": -1.5263932943344116, "step": 1695 }, { "epoch": 0.28333055231102955, "grad_norm": 17.669336318969727, "learning_rate": 1.7166694476889707e-05, "logits/chosen": -0.6798678040504456, "logits/rejected": -0.6949685215950012, "logps/chosen": -150.56309509277344, "logps/rejected": -145.85545349121094, "loss": 0.5496, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7466349601745605, "rewards/margins": 0.2299044281244278, "rewards/rejected": -0.9765393733978271, "step": 1698 }, { "epoch": 0.28383113632571333, "grad_norm": 17.36517333984375, "learning_rate": 1.716168863674287e-05, "logits/chosen": -0.9986286759376526, "logits/rejected": -1.0741010904312134, "logps/chosen": -101.26192474365234, "logps/rejected": -164.20169067382812, "loss": 0.4019, "rewards/accuracies": 1.0, "rewards/chosen": -0.059922557324171066, "rewards/margins": 0.9739441275596619, "rewards/rejected": -1.0338666439056396, "step": 1701 }, { "epoch": 0.2843317203403971, "grad_norm": 26.58785629272461, "learning_rate": 1.715668279659603e-05, "logits/chosen": -0.42573627829551697, "logits/rejected": -0.39021411538124084, "logps/chosen": -67.80960845947266, "logps/rejected": -43.71307373046875, "loss": 0.923, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.19674162566661835, "rewards/margins": 0.22673232853412628, "rewards/rejected": -0.42347392439842224, "step": 1704 }, { "epoch": 0.28483230435508095, "grad_norm": 2.8769478797912598, "learning_rate": 1.7151676956449192e-05, "logits/chosen": -0.7961005568504333, "logits/rejected": -0.7799324989318848, "logps/chosen": -56.75706100463867, "logps/rejected": -68.28055572509766, "loss": 0.2337, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4456087648868561, "rewards/margins": 1.3319885730743408, "rewards/rejected": -0.8863797783851624, "step": 1707 }, { "epoch": 0.2853328883697647, "grad_norm": 15.881799697875977, "learning_rate": 1.7146671116302354e-05, "logits/chosen": -0.8413300514221191, "logits/rejected": -0.9249407649040222, "logps/chosen": -47.6681022644043, "logps/rejected": -131.33740234375, "loss": 0.4206, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.569930374622345, "rewards/margins": 2.227341890335083, "rewards/rejected": -1.6574115753173828, "step": 1710 }, { "epoch": 0.2858334723844485, "grad_norm": 42.12688064575195, "learning_rate": 1.7141665276155516e-05, "logits/chosen": -0.7561938166618347, "logits/rejected": -0.7354585528373718, "logps/chosen": -58.4573860168457, "logps/rejected": -97.6532211303711, "loss": 0.7617, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.201723575592041, "rewards/margins": 1.6414426565170288, "rewards/rejected": -0.4397190511226654, "step": 1713 }, { "epoch": 0.28633405639913234, "grad_norm": 24.480487823486328, "learning_rate": 1.7136659436008677e-05, "logits/chosen": -0.7760427594184875, "logits/rejected": -0.7643373608589172, "logps/chosen": -88.91009521484375, "logps/rejected": -105.58828735351562, "loss": 0.3207, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6077135801315308, "rewards/margins": 1.6501637697219849, "rewards/rejected": -2.2578773498535156, "step": 1716 }, { "epoch": 0.2868346404138161, "grad_norm": 14.395140647888184, "learning_rate": 1.7131653595861843e-05, "logits/chosen": -0.7007812857627869, "logits/rejected": -0.7576733231544495, "logps/chosen": -52.387176513671875, "logps/rejected": -161.13775634765625, "loss": 0.524, "rewards/accuracies": 1.0, "rewards/chosen": 0.7705500721931458, "rewards/margins": 4.117392063140869, "rewards/rejected": -3.3468425273895264, "step": 1719 }, { "epoch": 0.2873352244284999, "grad_norm": 13.410903930664062, "learning_rate": 1.7126647755715e-05, "logits/chosen": -0.855047881603241, "logits/rejected": -0.8347625732421875, "logps/chosen": -64.68383026123047, "logps/rejected": -68.61650848388672, "loss": 0.5293, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2696394622325897, "rewards/margins": 0.6621000170707703, "rewards/rejected": -0.9317395091056824, "step": 1722 }, { "epoch": 0.28783580844318374, "grad_norm": 36.9836540222168, "learning_rate": 1.7121641915568163e-05, "logits/chosen": -0.8489211201667786, "logits/rejected": -0.855940580368042, "logps/chosen": -118.35428619384766, "logps/rejected": -80.89128875732422, "loss": 0.8581, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3735989332199097, "rewards/margins": -0.42222389578819275, "rewards/rejected": -0.9513749480247498, "step": 1725 }, { "epoch": 0.2883363924578675, "grad_norm": 8.43781566619873, "learning_rate": 1.7116636075421328e-05, "logits/chosen": -0.8538916707038879, "logits/rejected": -0.8538044095039368, "logps/chosen": -30.745147705078125, "logps/rejected": -60.81761169433594, "loss": 0.4113, "rewards/accuracies": 1.0, "rewards/chosen": 0.46179887652397156, "rewards/margins": 2.2473323345184326, "rewards/rejected": -1.7855334281921387, "step": 1728 }, { "epoch": 0.2888369764725513, "grad_norm": 20.791366577148438, "learning_rate": 1.7111630235274486e-05, "logits/chosen": -0.775013267993927, "logits/rejected": -0.7807397842407227, "logps/chosen": -70.54534149169922, "logps/rejected": -100.30819702148438, "loss": 0.7289, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.09986651688814163, "rewards/margins": 0.719066858291626, "rewards/rejected": -0.8189334273338318, "step": 1731 }, { "epoch": 0.28933756048723513, "grad_norm": 12.939569473266602, "learning_rate": 1.710662439512765e-05, "logits/chosen": -0.7159623503684998, "logits/rejected": -0.759615957736969, "logps/chosen": -78.48694610595703, "logps/rejected": -81.27099609375, "loss": 0.5579, "rewards/accuracies": 1.0, "rewards/chosen": 1.5428671836853027, "rewards/margins": 2.5679798126220703, "rewards/rejected": -1.0251127481460571, "step": 1734 }, { "epoch": 0.2898381445019189, "grad_norm": 11.494287490844727, "learning_rate": 1.7101618554980813e-05, "logits/chosen": -0.7006254196166992, "logits/rejected": -0.7338852286338806, "logps/chosen": -68.4680404663086, "logps/rejected": -93.84988403320312, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": -0.08477604389190674, "rewards/margins": 1.3957067728042603, "rewards/rejected": -1.4804826974868774, "step": 1737 }, { "epoch": 0.2903387285166027, "grad_norm": 23.2462215423584, "learning_rate": 1.7096612714833972e-05, "logits/chosen": -0.6043983101844788, "logits/rejected": -0.644166886806488, "logps/chosen": -28.823348999023438, "logps/rejected": -79.52639770507812, "loss": 0.4473, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.18207292258739471, "rewards/margins": 1.6755738258361816, "rewards/rejected": -1.493500828742981, "step": 1740 }, { "epoch": 0.2908393125312865, "grad_norm": 28.081907272338867, "learning_rate": 1.7091606874687137e-05, "logits/chosen": -0.47693073749542236, "logits/rejected": -0.5314216613769531, "logps/chosen": -116.31392669677734, "logps/rejected": -125.69596099853516, "loss": 0.5577, "rewards/accuracies": 0.0, "rewards/chosen": -1.202191710472107, "rewards/margins": -0.3780042231082916, "rewards/rejected": -0.8241875171661377, "step": 1743 }, { "epoch": 0.2913398965459703, "grad_norm": 17.320417404174805, "learning_rate": 1.70866010345403e-05, "logits/chosen": -0.6594040989875793, "logits/rejected": -0.6981696486473083, "logps/chosen": -43.804473876953125, "logps/rejected": -99.02754974365234, "loss": 0.4453, "rewards/accuracies": 1.0, "rewards/chosen": -0.18169361352920532, "rewards/margins": 2.12463641166687, "rewards/rejected": -2.3063299655914307, "step": 1746 }, { "epoch": 0.2918404805606541, "grad_norm": 15.877540588378906, "learning_rate": 1.708159519439346e-05, "logits/chosen": -0.8607742190361023, "logits/rejected": -0.8652995228767395, "logps/chosen": -88.83744049072266, "logps/rejected": -138.12208557128906, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": -0.08598706871271133, "rewards/margins": 2.621800184249878, "rewards/rejected": -2.707787275314331, "step": 1749 }, { "epoch": 0.2923410645753379, "grad_norm": 8.909708976745605, "learning_rate": 1.7076589354246622e-05, "logits/chosen": -0.6341436505317688, "logits/rejected": -0.6685865521430969, "logps/chosen": -88.51494598388672, "logps/rejected": -98.49060821533203, "loss": 0.4676, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6371994614601135, "rewards/margins": -0.12009537220001221, "rewards/rejected": -0.5171040892601013, "step": 1752 }, { "epoch": 0.2928416485900217, "grad_norm": 12.404638290405273, "learning_rate": 1.7071583514099784e-05, "logits/chosen": -0.5070740580558777, "logits/rejected": -0.6104663014411926, "logps/chosen": -67.49967193603516, "logps/rejected": -120.30277252197266, "loss": 0.391, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2672025263309479, "rewards/margins": 1.8648592233657837, "rewards/rejected": -1.5976567268371582, "step": 1755 }, { "epoch": 0.2933422326047055, "grad_norm": 26.567602157592773, "learning_rate": 1.7066577673952946e-05, "logits/chosen": -0.7101178765296936, "logits/rejected": -0.7820910811424255, "logps/chosen": -46.12689208984375, "logps/rejected": -189.4584503173828, "loss": 0.736, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2838940918445587, "rewards/margins": 4.325104713439941, "rewards/rejected": -4.608998775482178, "step": 1758 }, { "epoch": 0.29384281661938927, "grad_norm": 25.101106643676758, "learning_rate": 1.7061571833806108e-05, "logits/chosen": -0.5447719693183899, "logits/rejected": -0.5668811202049255, "logps/chosen": -76.54730224609375, "logps/rejected": -153.3422393798828, "loss": 0.4574, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.34483858942985535, "rewards/margins": 3.6155803203582764, "rewards/rejected": -3.2707417011260986, "step": 1761 }, { "epoch": 0.2943434006340731, "grad_norm": 8.748393058776855, "learning_rate": 1.705656599365927e-05, "logits/chosen": -0.6618757843971252, "logits/rejected": -0.6687838435173035, "logps/chosen": -76.08802032470703, "logps/rejected": -98.31289672851562, "loss": 0.7009, "rewards/accuracies": 1.0, "rewards/chosen": -0.4202931821346283, "rewards/margins": 1.258129596710205, "rewards/rejected": -1.6784225702285767, "step": 1764 }, { "epoch": 0.2948439846487569, "grad_norm": 16.43433380126953, "learning_rate": 1.705156015351243e-05, "logits/chosen": -0.6767668724060059, "logits/rejected": -0.7566723823547363, "logps/chosen": -70.13359832763672, "logps/rejected": -143.3151397705078, "loss": 0.6371, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22670356929302216, "rewards/margins": 2.3923332691192627, "rewards/rejected": -2.619036912918091, "step": 1767 }, { "epoch": 0.29534456866344067, "grad_norm": 15.914045333862305, "learning_rate": 1.7046554313365597e-05, "logits/chosen": -0.7208352088928223, "logits/rejected": -0.7600555419921875, "logps/chosen": -58.36613845825195, "logps/rejected": -81.55960083007812, "loss": 0.5571, "rewards/accuracies": 1.0, "rewards/chosen": 0.5338577628135681, "rewards/margins": 1.7263545989990234, "rewards/rejected": -1.1924968957901, "step": 1770 }, { "epoch": 0.2958451526781245, "grad_norm": 19.69272232055664, "learning_rate": 1.7041548473218755e-05, "logits/chosen": -0.687532901763916, "logits/rejected": -0.6754403114318848, "logps/chosen": -75.11873626708984, "logps/rejected": -106.4769287109375, "loss": 0.9251, "rewards/accuracies": 1.0, "rewards/chosen": 0.1211269274353981, "rewards/margins": 2.400880813598633, "rewards/rejected": -2.2797539234161377, "step": 1773 }, { "epoch": 0.2963457366928083, "grad_norm": 11.43566608428955, "learning_rate": 1.703654263307192e-05, "logits/chosen": -0.6963183283805847, "logits/rejected": -0.7149564623832703, "logps/chosen": -57.86086654663086, "logps/rejected": -76.47442626953125, "loss": 0.5481, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6585421562194824, "rewards/margins": -0.16981177031993866, "rewards/rejected": -0.48873040080070496, "step": 1776 }, { "epoch": 0.29684632070749206, "grad_norm": 16.39366912841797, "learning_rate": 1.7031536792925082e-05, "logits/chosen": -0.7116819024085999, "logits/rejected": -0.7449557185173035, "logps/chosen": -51.7790412902832, "logps/rejected": -119.38716888427734, "loss": 0.59, "rewards/accuracies": 1.0, "rewards/chosen": 0.019392773509025574, "rewards/margins": 1.2489228248596191, "rewards/rejected": -1.2295302152633667, "step": 1779 }, { "epoch": 0.2973469047221759, "grad_norm": 8.830911636352539, "learning_rate": 1.702653095277824e-05, "logits/chosen": -0.6663742065429688, "logits/rejected": -0.6872708201408386, "logps/chosen": -87.92638397216797, "logps/rejected": -192.65020751953125, "loss": 0.2567, "rewards/accuracies": 1.0, "rewards/chosen": 0.38000667095184326, "rewards/margins": 2.38455867767334, "rewards/rejected": -2.004552125930786, "step": 1782 }, { "epoch": 0.2978474887368597, "grad_norm": 15.484322547912598, "learning_rate": 1.7021525112631406e-05, "logits/chosen": -0.7821173071861267, "logits/rejected": -0.691139280796051, "logps/chosen": -116.76331329345703, "logps/rejected": -25.98309898376465, "loss": 0.5457, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0011340379714966, "rewards/margins": -0.4865717887878418, "rewards/rejected": -0.5145623087882996, "step": 1785 }, { "epoch": 0.29834807275154346, "grad_norm": 12.521993637084961, "learning_rate": 1.7016519272484567e-05, "logits/chosen": -0.7079005241394043, "logits/rejected": -0.6756348609924316, "logps/chosen": -88.11825561523438, "logps/rejected": -57.86753463745117, "loss": 0.2693, "rewards/accuracies": 1.0, "rewards/chosen": 0.44468584656715393, "rewards/margins": 1.377968668937683, "rewards/rejected": -0.9332828521728516, "step": 1788 }, { "epoch": 0.29884865676622724, "grad_norm": 38.957374572753906, "learning_rate": 1.701151343233773e-05, "logits/chosen": -0.5728759169578552, "logits/rejected": -0.6347112059593201, "logps/chosen": -105.76593017578125, "logps/rejected": -163.98025512695312, "loss": 0.4636, "rewards/accuracies": 1.0, "rewards/chosen": -0.08056221157312393, "rewards/margins": 2.219797372817993, "rewards/rejected": -2.3003594875335693, "step": 1791 }, { "epoch": 0.2993492407809111, "grad_norm": 7.356330871582031, "learning_rate": 1.700650759219089e-05, "logits/chosen": -0.6092327237129211, "logits/rejected": -0.6410971879959106, "logps/chosen": -51.63804244995117, "logps/rejected": -95.34973907470703, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 0.7428706288337708, "rewards/margins": 2.4167652130126953, "rewards/rejected": -1.6738942861557007, "step": 1794 }, { "epoch": 0.29984982479559485, "grad_norm": 25.52002716064453, "learning_rate": 1.7001501752044053e-05, "logits/chosen": -0.8666794300079346, "logits/rejected": -0.8062429428100586, "logps/chosen": -135.00132751464844, "logps/rejected": -140.8594207763672, "loss": 0.6082, "rewards/accuracies": 1.0, "rewards/chosen": 0.6136342883110046, "rewards/margins": 2.157395839691162, "rewards/rejected": -1.5437617301940918, "step": 1797 }, { "epoch": 0.30035040881027864, "grad_norm": 18.407915115356445, "learning_rate": 1.6996495911897214e-05, "logits/chosen": -0.8496769070625305, "logits/rejected": -0.7718207240104675, "logps/chosen": -123.29801177978516, "logps/rejected": -91.22445678710938, "loss": 0.4587, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.4771326780319214, "rewards/margins": -0.1280355304479599, "rewards/rejected": -0.3490971624851227, "step": 1800 }, { "epoch": 0.30035040881027864, "eval_logits/chosen": -0.7274773716926575, "eval_logits/rejected": -0.7412143349647522, "eval_logps/chosen": -83.58704376220703, "eval_logps/rejected": -107.24825286865234, "eval_loss": 0.5914470553398132, "eval_rewards/accuracies": 0.7192192077636719, "eval_rewards/chosen": -0.33293092250823975, "eval_rewards/margins": 1.079075574874878, "eval_rewards/rejected": -1.4120064973831177, "eval_runtime": 345.6365, "eval_samples_per_second": 7.708, "eval_steps_per_second": 1.927, "step": 1800 }, { "epoch": 0.30085099282496247, "grad_norm": 19.730575561523438, "learning_rate": 1.6991490071750376e-05, "logits/chosen": -0.6917944550514221, "logits/rejected": -0.7429867386817932, "logps/chosen": -124.9297866821289, "logps/rejected": -163.4666748046875, "loss": 0.5537, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5272108912467957, "rewards/margins": 0.03374164178967476, "rewards/rejected": -0.5609524846076965, "step": 1803 }, { "epoch": 0.30135157683964625, "grad_norm": 33.188316345214844, "learning_rate": 1.6986484231603538e-05, "logits/chosen": -0.7722838521003723, "logits/rejected": -0.765946090221405, "logps/chosen": -117.52294158935547, "logps/rejected": -95.91046142578125, "loss": 0.5659, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3732643127441406, "rewards/margins": -0.6095991134643555, "rewards/rejected": -0.7636652588844299, "step": 1806 }, { "epoch": 0.30185216085433003, "grad_norm": 26.502750396728516, "learning_rate": 1.69814783914567e-05, "logits/chosen": -0.6812806129455566, "logits/rejected": -0.6861699223518372, "logps/chosen": -89.66818237304688, "logps/rejected": -82.83386993408203, "loss": 0.6621, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7652156949043274, "rewards/margins": 0.07732632011175156, "rewards/rejected": -0.8425419926643372, "step": 1809 }, { "epoch": 0.30235274486901387, "grad_norm": 25.747962951660156, "learning_rate": 1.6976472551309865e-05, "logits/chosen": -0.7575272917747498, "logits/rejected": -0.7555331587791443, "logps/chosen": -50.89219284057617, "logps/rejected": -54.519161224365234, "loss": 0.7116, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5112714171409607, "rewards/margins": 0.45764124393463135, "rewards/rejected": -0.968912661075592, "step": 1812 }, { "epoch": 0.30285332888369765, "grad_norm": 11.924506187438965, "learning_rate": 1.6971466711163023e-05, "logits/chosen": -0.8400651812553406, "logits/rejected": -0.7778652310371399, "logps/chosen": -103.35455322265625, "logps/rejected": -81.16043853759766, "loss": 0.341, "rewards/accuracies": 1.0, "rewards/chosen": -0.2664789855480194, "rewards/margins": 1.6065093278884888, "rewards/rejected": -1.8729883432388306, "step": 1815 }, { "epoch": 0.30335391289838143, "grad_norm": 25.673246383666992, "learning_rate": 1.6966460871016185e-05, "logits/chosen": -0.6644123196601868, "logits/rejected": -0.6411871314048767, "logps/chosen": -64.0475082397461, "logps/rejected": -95.1968002319336, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": -0.3462606370449066, "rewards/margins": 1.5005706548690796, "rewards/rejected": -1.8468313217163086, "step": 1818 }, { "epoch": 0.30385449691306526, "grad_norm": 28.298311233520508, "learning_rate": 1.696145503086935e-05, "logits/chosen": -0.7933860421180725, "logits/rejected": -0.7761983871459961, "logps/chosen": -117.70807647705078, "logps/rejected": -91.6739273071289, "loss": 1.1238, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7781139016151428, "rewards/margins": -0.14091677963733673, "rewards/rejected": -0.6371971368789673, "step": 1821 }, { "epoch": 0.30435508092774904, "grad_norm": 21.485698699951172, "learning_rate": 1.695644919072251e-05, "logits/chosen": -0.8956348896026611, "logits/rejected": -0.9088940620422363, "logps/chosen": -96.7318344116211, "logps/rejected": -106.10587310791016, "loss": 0.6297, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.45634356141090393, "rewards/margins": 1.0437082052230835, "rewards/rejected": -0.5873644948005676, "step": 1824 }, { "epoch": 0.3048556649424328, "grad_norm": 20.976301193237305, "learning_rate": 1.6951443350575674e-05, "logits/chosen": -0.7238051295280457, "logits/rejected": -0.6813049912452698, "logps/chosen": -145.951171875, "logps/rejected": -102.60595703125, "loss": 0.7383, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.29824256896972656, "rewards/margins": 1.0274490118026733, "rewards/rejected": -0.7292063236236572, "step": 1827 }, { "epoch": 0.30535624895711666, "grad_norm": 27.305686950683594, "learning_rate": 1.6946437510428836e-05, "logits/chosen": -0.6099315285682678, "logits/rejected": -0.7275886535644531, "logps/chosen": -66.108154296875, "logps/rejected": -127.602294921875, "loss": 0.642, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.39606907963752747, "rewards/margins": 1.8704895973205566, "rewards/rejected": -1.4744205474853516, "step": 1830 }, { "epoch": 0.30585683297180044, "grad_norm": 27.504676818847656, "learning_rate": 1.6941431670281998e-05, "logits/chosen": -0.6578494906425476, "logits/rejected": -0.6058751940727234, "logps/chosen": -109.54793548583984, "logps/rejected": -78.2166748046875, "loss": 0.5976, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.308789610862732, "rewards/margins": 0.6444241404533386, "rewards/rejected": -1.9532136917114258, "step": 1833 }, { "epoch": 0.3063574169864842, "grad_norm": 22.827285766601562, "learning_rate": 1.693642583013516e-05, "logits/chosen": -0.8995265960693359, "logits/rejected": -0.8752580285072327, "logps/chosen": -100.03070831298828, "logps/rejected": -66.76534271240234, "loss": 0.7394, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.985234260559082, "rewards/margins": -0.7278752326965332, "rewards/rejected": -1.2573589086532593, "step": 1836 }, { "epoch": 0.30685800100116806, "grad_norm": 25.543176651000977, "learning_rate": 1.693141998998832e-05, "logits/chosen": -1.0527774095535278, "logits/rejected": -1.0701106786727905, "logps/chosen": -84.59093475341797, "logps/rejected": -112.37603759765625, "loss": 0.498, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.3144153356552124, "rewards/margins": 0.3711979389190674, "rewards/rejected": -0.6856133341789246, "step": 1839 }, { "epoch": 0.30735858501585184, "grad_norm": 33.022361755371094, "learning_rate": 1.6926414149841483e-05, "logits/chosen": -0.7236862182617188, "logits/rejected": -0.7285628318786621, "logps/chosen": -56.29166030883789, "logps/rejected": -103.14669036865234, "loss": 0.3564, "rewards/accuracies": 1.0, "rewards/chosen": -0.7480056881904602, "rewards/margins": 2.570225954055786, "rewards/rejected": -3.3182315826416016, "step": 1842 }, { "epoch": 0.3078591690305356, "grad_norm": 31.58062171936035, "learning_rate": 1.6921408309694645e-05, "logits/chosen": -0.6937692761421204, "logits/rejected": -0.7545182108879089, "logps/chosen": -53.8294563293457, "logps/rejected": -175.0450439453125, "loss": 0.686, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.186482235789299, "rewards/margins": 1.8652653694152832, "rewards/rejected": -2.0517475605010986, "step": 1845 }, { "epoch": 0.3083597530452194, "grad_norm": 27.35738182067871, "learning_rate": 1.6916402469547807e-05, "logits/chosen": -0.6609358787536621, "logits/rejected": -0.7129762768745422, "logps/chosen": -69.69298553466797, "logps/rejected": -125.90276336669922, "loss": 0.4886, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.0501781702041626, "rewards/margins": 1.9663949012756348, "rewards/rejected": -0.9162167906761169, "step": 1848 }, { "epoch": 0.30886033705990323, "grad_norm": 27.179807662963867, "learning_rate": 1.691139662940097e-05, "logits/chosen": -0.6876239776611328, "logits/rejected": -0.6747037768363953, "logps/chosen": -90.93123626708984, "logps/rejected": -115.2554702758789, "loss": 0.6994, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5053678154945374, "rewards/margins": 0.8151409029960632, "rewards/rejected": -1.320508599281311, "step": 1851 }, { "epoch": 0.309360921074587, "grad_norm": 11.01917839050293, "learning_rate": 1.690639078925413e-05, "logits/chosen": -0.9514169096946716, "logits/rejected": -0.940392255783081, "logps/chosen": -56.92195510864258, "logps/rejected": -70.18463134765625, "loss": 0.4272, "rewards/accuracies": 1.0, "rewards/chosen": -0.17178364098072052, "rewards/margins": 1.719319462776184, "rewards/rejected": -1.8911031484603882, "step": 1854 }, { "epoch": 0.3098615050892708, "grad_norm": 30.233585357666016, "learning_rate": 1.6901384949107292e-05, "logits/chosen": -0.6104156970977783, "logits/rejected": -0.6190745830535889, "logps/chosen": -77.14183807373047, "logps/rejected": -108.37926483154297, "loss": 0.7167, "rewards/accuracies": 1.0, "rewards/chosen": 0.565288782119751, "rewards/margins": 1.4546865224838257, "rewards/rejected": -0.8893976807594299, "step": 1857 }, { "epoch": 0.31036208910395463, "grad_norm": 8.775229454040527, "learning_rate": 1.6896379108960454e-05, "logits/chosen": -0.5896092057228088, "logits/rejected": -0.5768322944641113, "logps/chosen": -51.953521728515625, "logps/rejected": -73.48786163330078, "loss": 0.572, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.28730693459510803, "rewards/margins": 0.5053200721740723, "rewards/rejected": -0.21801316738128662, "step": 1860 }, { "epoch": 0.3108626731186384, "grad_norm": 24.235013961791992, "learning_rate": 1.689137326881362e-05, "logits/chosen": -0.6547455191612244, "logits/rejected": -0.6551836133003235, "logps/chosen": -69.86236572265625, "logps/rejected": -110.34770965576172, "loss": 0.5782, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.936776340007782, "rewards/margins": 0.5731938481330872, "rewards/rejected": -1.5099701881408691, "step": 1863 }, { "epoch": 0.3113632571333222, "grad_norm": 14.011931419372559, "learning_rate": 1.6886367428666777e-05, "logits/chosen": -0.8232201933860779, "logits/rejected": -0.7806755900382996, "logps/chosen": -134.69483947753906, "logps/rejected": -134.94691467285156, "loss": 0.775, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6229838728904724, "rewards/margins": 1.0649840831756592, "rewards/rejected": -1.6879678964614868, "step": 1866 }, { "epoch": 0.311863841148006, "grad_norm": 21.148090362548828, "learning_rate": 1.6881361588519943e-05, "logits/chosen": -0.7533500790596008, "logits/rejected": -0.7616627216339111, "logps/chosen": -55.89933395385742, "logps/rejected": -92.780517578125, "loss": 0.3441, "rewards/accuracies": 1.0, "rewards/chosen": -0.45905056595802307, "rewards/margins": 2.667555570602417, "rewards/rejected": -3.126605987548828, "step": 1869 }, { "epoch": 0.3123644251626898, "grad_norm": 5.957732200622559, "learning_rate": 1.6876355748373104e-05, "logits/chosen": -0.7090754508972168, "logits/rejected": -0.7182371020317078, "logps/chosen": -100.71792602539062, "logps/rejected": -73.8635482788086, "loss": 0.6573, "rewards/accuracies": 1.0, "rewards/chosen": 0.30231261253356934, "rewards/margins": 0.4319356381893158, "rewards/rejected": -0.12962304055690765, "step": 1872 }, { "epoch": 0.3128650091773736, "grad_norm": 13.247547149658203, "learning_rate": 1.6871349908226263e-05, "logits/chosen": -0.9529953002929688, "logits/rejected": -0.9357520937919617, "logps/chosen": -87.5675048828125, "logps/rejected": -94.39031982421875, "loss": 0.462, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6527639031410217, "rewards/margins": 0.21050284802913666, "rewards/rejected": -0.8632667064666748, "step": 1875 }, { "epoch": 0.3133655931920574, "grad_norm": 14.913676261901855, "learning_rate": 1.6866344068079428e-05, "logits/chosen": -0.6976696848869324, "logits/rejected": -0.8210912346839905, "logps/chosen": -76.59062194824219, "logps/rejected": -120.58252716064453, "loss": 0.3543, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.5400545001029968, "rewards/margins": 2.077033281326294, "rewards/rejected": -1.5369787216186523, "step": 1878 }, { "epoch": 0.3138661772067412, "grad_norm": 21.237693786621094, "learning_rate": 1.686133822793259e-05, "logits/chosen": -0.5670031905174255, "logits/rejected": -0.6736325621604919, "logps/chosen": -65.02843475341797, "logps/rejected": -131.2970428466797, "loss": 0.6989, "rewards/accuracies": 1.0, "rewards/chosen": -0.09130095690488815, "rewards/margins": 2.1903223991394043, "rewards/rejected": -2.281623363494873, "step": 1881 }, { "epoch": 0.314366761221425, "grad_norm": 24.56658363342285, "learning_rate": 1.685633238778575e-05, "logits/chosen": -0.6872568130493164, "logits/rejected": -0.6658517718315125, "logps/chosen": -61.82890701293945, "logps/rejected": -58.44282913208008, "loss": 0.7515, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.05894633010029793, "rewards/margins": 0.05692747235298157, "rewards/rejected": 0.0020188589114695787, "step": 1884 }, { "epoch": 0.3148673452361088, "grad_norm": 11.306321144104004, "learning_rate": 1.6851326547638913e-05, "logits/chosen": -0.8583152294158936, "logits/rejected": -0.8320664763450623, "logps/chosen": -104.48355102539062, "logps/rejected": -84.30516052246094, "loss": 0.4111, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.016144057735800743, "rewards/margins": 0.16395558416843414, "rewards/rejected": -0.18009965121746063, "step": 1887 }, { "epoch": 0.3153679292507926, "grad_norm": 13.92123794555664, "learning_rate": 1.6846320707492075e-05, "logits/chosen": -0.7297344207763672, "logits/rejected": -0.7632315158843994, "logps/chosen": -54.76420211791992, "logps/rejected": -107.53701782226562, "loss": 0.489, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.06948293000459671, "rewards/margins": 0.7809441685676575, "rewards/rejected": -0.7114613056182861, "step": 1890 }, { "epoch": 0.3158685132654764, "grad_norm": 30.78082847595215, "learning_rate": 1.6841314867345237e-05, "logits/chosen": -0.7980168461799622, "logits/rejected": -0.8444302082061768, "logps/chosen": -159.71612548828125, "logps/rejected": -146.0645294189453, "loss": 0.5194, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9139992594718933, "rewards/margins": 0.8630710244178772, "rewards/rejected": -1.777070164680481, "step": 1893 }, { "epoch": 0.31636909728016016, "grad_norm": 26.5939998626709, "learning_rate": 1.68363090271984e-05, "logits/chosen": -0.8142144680023193, "logits/rejected": -0.7560893893241882, "logps/chosen": -113.83154296875, "logps/rejected": -83.61249542236328, "loss": 0.5934, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4154007136821747, "rewards/margins": 2.719888687133789, "rewards/rejected": -3.135289192199707, "step": 1896 }, { "epoch": 0.316869681294844, "grad_norm": 14.465415000915527, "learning_rate": 1.683130318705156e-05, "logits/chosen": -0.7745420336723328, "logits/rejected": -0.8163614273071289, "logps/chosen": -70.3541030883789, "logps/rejected": -142.8548126220703, "loss": 0.7516, "rewards/accuracies": 1.0, "rewards/chosen": -0.8258912563323975, "rewards/margins": 1.9233412742614746, "rewards/rejected": -2.749232530593872, "step": 1899 }, { "epoch": 0.3173702653095278, "grad_norm": 22.243770599365234, "learning_rate": 1.6826297346904722e-05, "logits/chosen": -0.8829059600830078, "logits/rejected": -0.8629239201545715, "logps/chosen": -122.3258285522461, "logps/rejected": -68.691650390625, "loss": 0.7845, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.235435128211975, "rewards/margins": 0.1111738309264183, "rewards/rejected": -1.3466089963912964, "step": 1902 }, { "epoch": 0.31787084932421156, "grad_norm": 18.126188278198242, "learning_rate": 1.6821291506757887e-05, "logits/chosen": -0.8369978070259094, "logits/rejected": -0.8057553172111511, "logps/chosen": -115.87052154541016, "logps/rejected": -155.21315002441406, "loss": 0.607, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.836108386516571, "rewards/margins": 0.6320238709449768, "rewards/rejected": -1.4681323766708374, "step": 1905 }, { "epoch": 0.3183714333388954, "grad_norm": 17.44388198852539, "learning_rate": 1.6816285666611046e-05, "logits/chosen": -0.5937266945838928, "logits/rejected": -0.698998749256134, "logps/chosen": -86.552490234375, "logps/rejected": -142.30499267578125, "loss": 0.5821, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.004449218511581421, "rewards/margins": 0.9018142819404602, "rewards/rejected": -0.906263530254364, "step": 1908 }, { "epoch": 0.3188720173535792, "grad_norm": 30.7576847076416, "learning_rate": 1.6811279826464208e-05, "logits/chosen": -0.689990222454071, "logits/rejected": -0.8023679852485657, "logps/chosen": -77.6781005859375, "logps/rejected": -165.2975311279297, "loss": 0.8986, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1282590627670288, "rewards/margins": -0.1329144984483719, "rewards/rejected": -0.9953445792198181, "step": 1911 }, { "epoch": 0.31937260136826295, "grad_norm": 19.45651626586914, "learning_rate": 1.6806273986317373e-05, "logits/chosen": -0.7402329444885254, "logits/rejected": -0.72761470079422, "logps/chosen": -82.39055633544922, "logps/rejected": -64.0690689086914, "loss": 0.6377, "rewards/accuracies": 0.0, "rewards/chosen": -1.3309879302978516, "rewards/margins": -0.7451136708259583, "rewards/rejected": -0.5858742594718933, "step": 1914 }, { "epoch": 0.3198731853829468, "grad_norm": 36.95547103881836, "learning_rate": 1.680126814617053e-05, "logits/chosen": -0.6937759518623352, "logits/rejected": -0.7109912037849426, "logps/chosen": -75.64360046386719, "logps/rejected": -101.4083480834961, "loss": 0.7033, "rewards/accuracies": 1.0, "rewards/chosen": -0.7274091243743896, "rewards/margins": 1.724196434020996, "rewards/rejected": -2.4516055583953857, "step": 1917 }, { "epoch": 0.32037376939763057, "grad_norm": 13.586204528808594, "learning_rate": 1.6796262306023696e-05, "logits/chosen": -0.7721939086914062, "logits/rejected": -0.8295038342475891, "logps/chosen": -81.13458251953125, "logps/rejected": -165.7418670654297, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": -0.47325897216796875, "rewards/margins": 2.7351577281951904, "rewards/rejected": -3.208416700363159, "step": 1920 }, { "epoch": 0.32087435341231435, "grad_norm": 13.4139404296875, "learning_rate": 1.6791256465876858e-05, "logits/chosen": -0.6841358542442322, "logits/rejected": -0.7224209904670715, "logps/chosen": -50.51792526245117, "logps/rejected": -133.5126495361328, "loss": 0.5786, "rewards/accuracies": 1.0, "rewards/chosen": -0.3782011568546295, "rewards/margins": 1.2624058723449707, "rewards/rejected": -1.6406068801879883, "step": 1923 }, { "epoch": 0.3213749374269982, "grad_norm": 13.25943374633789, "learning_rate": 1.678625062573002e-05, "logits/chosen": -0.6666104197502136, "logits/rejected": -0.8090373873710632, "logps/chosen": -38.2485237121582, "logps/rejected": -124.18463134765625, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 0.9077754616737366, "rewards/margins": 3.3651371002197266, "rewards/rejected": -2.4573614597320557, "step": 1926 }, { "epoch": 0.32187552144168197, "grad_norm": 8.887785911560059, "learning_rate": 1.6781244785583182e-05, "logits/chosen": -0.6483407020568848, "logits/rejected": -0.6886523365974426, "logps/chosen": -75.05508422851562, "logps/rejected": -127.94829559326172, "loss": 0.3757, "rewards/accuracies": 1.0, "rewards/chosen": 0.6369571089744568, "rewards/margins": 4.233068466186523, "rewards/rejected": -3.596111536026001, "step": 1929 }, { "epoch": 0.32237610545636575, "grad_norm": 27.390148162841797, "learning_rate": 1.6776238945436344e-05, "logits/chosen": -0.9464519619941711, "logits/rejected": -0.9500895142555237, "logps/chosen": -106.40132904052734, "logps/rejected": -120.59795379638672, "loss": 0.8395, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5808077454566956, "rewards/margins": 0.5122418999671936, "rewards/rejected": -1.0930496454238892, "step": 1932 }, { "epoch": 0.3228766894710496, "grad_norm": 39.53611373901367, "learning_rate": 1.6771233105289505e-05, "logits/chosen": -0.8142228126525879, "logits/rejected": -0.7692120671272278, "logps/chosen": -178.06040954589844, "logps/rejected": -126.56073760986328, "loss": 0.7242, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.4601666927337646, "rewards/margins": -0.81587153673172, "rewards/rejected": -1.6442952156066895, "step": 1935 }, { "epoch": 0.32337727348573336, "grad_norm": 23.49749755859375, "learning_rate": 1.6766227265142667e-05, "logits/chosen": -0.6916241645812988, "logits/rejected": -0.7944245934486389, "logps/chosen": -68.16060638427734, "logps/rejected": -121.36035919189453, "loss": 0.3402, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3145326375961304, "rewards/margins": 0.6849560737609863, "rewards/rejected": -1.9994887113571167, "step": 1938 }, { "epoch": 0.32387785750041714, "grad_norm": 27.397829055786133, "learning_rate": 1.676122142499583e-05, "logits/chosen": -0.7917422652244568, "logits/rejected": -0.7832877039909363, "logps/chosen": -74.38282775878906, "logps/rejected": -85.78392791748047, "loss": 0.6236, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6419230103492737, "rewards/margins": 0.5921158194541931, "rewards/rejected": -1.2340388298034668, "step": 1941 }, { "epoch": 0.3243784415151009, "grad_norm": 26.26380157470703, "learning_rate": 1.675621558484899e-05, "logits/chosen": -0.8349164128303528, "logits/rejected": -0.7627144455909729, "logps/chosen": -125.20433807373047, "logps/rejected": -85.91089630126953, "loss": 0.3504, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4081207811832428, "rewards/margins": 0.8509969115257263, "rewards/rejected": -1.259117603302002, "step": 1944 }, { "epoch": 0.32487902552978476, "grad_norm": 9.956957817077637, "learning_rate": 1.6751209744702156e-05, "logits/chosen": -0.6331689357757568, "logits/rejected": -0.6796969771385193, "logps/chosen": -85.757568359375, "logps/rejected": -117.01797485351562, "loss": 0.4433, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0899569988250732, "rewards/margins": 0.8256890773773193, "rewards/rejected": -1.9156460762023926, "step": 1947 }, { "epoch": 0.32537960954446854, "grad_norm": 16.302587509155273, "learning_rate": 1.6746203904555314e-05, "logits/chosen": -0.7056145071983337, "logits/rejected": -0.7817551493644714, "logps/chosen": -105.25634765625, "logps/rejected": -117.24181365966797, "loss": 0.8563, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.6582697629928589, "rewards/margins": -0.7923746705055237, "rewards/rejected": -0.8658950924873352, "step": 1950 }, { "epoch": 0.3258801935591523, "grad_norm": 16.776540756225586, "learning_rate": 1.6741198064408476e-05, "logits/chosen": -0.810614824295044, "logits/rejected": -0.7951495051383972, "logps/chosen": -153.87890625, "logps/rejected": -183.0011444091797, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": -0.9514010548591614, "rewards/margins": 1.3484021425247192, "rewards/rejected": -2.2998032569885254, "step": 1953 }, { "epoch": 0.32638077757383616, "grad_norm": 18.145200729370117, "learning_rate": 1.673619222426164e-05, "logits/chosen": -0.7611098289489746, "logits/rejected": -0.7645800709724426, "logps/chosen": -78.08358001708984, "logps/rejected": -112.6084213256836, "loss": 0.4639, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.32311174273490906, "rewards/margins": 1.4203964471817017, "rewards/rejected": -1.743508219718933, "step": 1956 }, { "epoch": 0.32688136158851994, "grad_norm": 16.425949096679688, "learning_rate": 1.67311863841148e-05, "logits/chosen": -0.6401432156562805, "logits/rejected": -0.6565421223640442, "logps/chosen": -96.07733917236328, "logps/rejected": -115.87850952148438, "loss": 0.6533, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2798049449920654, "rewards/margins": -0.5523682832717896, "rewards/rejected": -0.7274366021156311, "step": 1959 }, { "epoch": 0.3273819456032037, "grad_norm": 31.492586135864258, "learning_rate": 1.6726180543967965e-05, "logits/chosen": -0.6280815005302429, "logits/rejected": -0.5880460143089294, "logps/chosen": -101.67559051513672, "logps/rejected": -97.15001678466797, "loss": 0.3782, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.45606693625450134, "rewards/margins": 1.4105550050735474, "rewards/rejected": -1.866621971130371, "step": 1962 }, { "epoch": 0.32788252961788755, "grad_norm": 6.935392379760742, "learning_rate": 1.6721174703821127e-05, "logits/chosen": -0.8784529566764832, "logits/rejected": -0.941998302936554, "logps/chosen": -49.67327880859375, "logps/rejected": -142.43919372558594, "loss": 0.3786, "rewards/accuracies": 1.0, "rewards/chosen": 0.4864584505558014, "rewards/margins": 4.967612266540527, "rewards/rejected": -4.481153964996338, "step": 1965 }, { "epoch": 0.32838311363257133, "grad_norm": 11.17880916595459, "learning_rate": 1.671616886367429e-05, "logits/chosen": -0.5762531161308289, "logits/rejected": -0.597471296787262, "logps/chosen": -79.9166488647461, "logps/rejected": -162.8763427734375, "loss": 0.6156, "rewards/accuracies": 1.0, "rewards/chosen": -0.18337862193584442, "rewards/margins": 5.270055770874023, "rewards/rejected": -5.453433990478516, "step": 1968 }, { "epoch": 0.3288836976472551, "grad_norm": 11.003763198852539, "learning_rate": 1.671116302352745e-05, "logits/chosen": -0.8912760615348816, "logits/rejected": -0.9142957329750061, "logps/chosen": -69.23259735107422, "logps/rejected": -76.43972778320312, "loss": 0.6428, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.643023669719696, "rewards/margins": 0.19759805500507355, "rewards/rejected": -0.8406217694282532, "step": 1971 }, { "epoch": 0.32938428166193895, "grad_norm": 12.046690940856934, "learning_rate": 1.6706157183380612e-05, "logits/chosen": -0.6882603168487549, "logits/rejected": -0.7894819378852844, "logps/chosen": -39.21347427368164, "logps/rejected": -146.0172882080078, "loss": 0.4618, "rewards/accuracies": 1.0, "rewards/chosen": -0.07198558002710342, "rewards/margins": 4.239290714263916, "rewards/rejected": -4.311276435852051, "step": 1974 }, { "epoch": 0.32988486567662273, "grad_norm": 53.84632873535156, "learning_rate": 1.6701151343233774e-05, "logits/chosen": -0.7972219586372375, "logits/rejected": -0.7061479091644287, "logps/chosen": -121.66168212890625, "logps/rejected": -81.57160186767578, "loss": 0.6364, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1801338195800781, "rewards/margins": 0.476961612701416, "rewards/rejected": -1.6570955514907837, "step": 1977 }, { "epoch": 0.3303854496913065, "grad_norm": 21.829090118408203, "learning_rate": 1.6696145503086936e-05, "logits/chosen": -0.5869014263153076, "logits/rejected": -0.6632862091064453, "logps/chosen": -44.90907669067383, "logps/rejected": -134.5117950439453, "loss": 0.4981, "rewards/accuracies": 1.0, "rewards/chosen": 0.362953782081604, "rewards/margins": 2.8371810913085938, "rewards/rejected": -2.4742274284362793, "step": 1980 }, { "epoch": 0.33088603370599035, "grad_norm": 16.16994285583496, "learning_rate": 1.6691139662940098e-05, "logits/chosen": -0.7777422070503235, "logits/rejected": -0.7861501574516296, "logps/chosen": -69.766845703125, "logps/rejected": -121.30926513671875, "loss": 0.3496, "rewards/accuracies": 1.0, "rewards/chosen": -0.9012213349342346, "rewards/margins": 3.098912000656128, "rewards/rejected": -4.000133514404297, "step": 1983 }, { "epoch": 0.3313866177206741, "grad_norm": 22.030302047729492, "learning_rate": 1.668613382279326e-05, "logits/chosen": -0.8195145130157471, "logits/rejected": -0.8163827061653137, "logps/chosen": -115.8097915649414, "logps/rejected": -113.88636016845703, "loss": 0.8974, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.35341215133667, "rewards/margins": 0.7005359530448914, "rewards/rejected": -2.053948163986206, "step": 1986 }, { "epoch": 0.3318872017353579, "grad_norm": 6.4493584632873535, "learning_rate": 1.668112798264642e-05, "logits/chosen": -0.6379549503326416, "logits/rejected": -0.6307434439659119, "logps/chosen": -72.83394622802734, "logps/rejected": -75.89950561523438, "loss": 0.3273, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5920694470405579, "rewards/margins": 0.7064102292060852, "rewards/rejected": -1.298479676246643, "step": 1989 }, { "epoch": 0.33238778575004174, "grad_norm": 19.317018508911133, "learning_rate": 1.6676122142499583e-05, "logits/chosen": -0.9171629548072815, "logits/rejected": -0.9521691203117371, "logps/chosen": -45.0306396484375, "logps/rejected": -62.05099105834961, "loss": 0.3831, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.14211419224739075, "rewards/margins": 0.3082280457019806, "rewards/rejected": -0.4503422677516937, "step": 1992 }, { "epoch": 0.3328883697647255, "grad_norm": 15.63606071472168, "learning_rate": 1.6671116302352745e-05, "logits/chosen": -0.8150641322135925, "logits/rejected": -0.7678658366203308, "logps/chosen": -55.26142883300781, "logps/rejected": -83.25630950927734, "loss": 0.5929, "rewards/accuracies": 1.0, "rewards/chosen": 0.07826101034879684, "rewards/margins": 2.4756720066070557, "rewards/rejected": -2.3974111080169678, "step": 1995 }, { "epoch": 0.3333889537794093, "grad_norm": 47.521236419677734, "learning_rate": 1.666611046220591e-05, "logits/chosen": -0.820943295955658, "logits/rejected": -0.8385365605354309, "logps/chosen": -59.44832229614258, "logps/rejected": -98.5613784790039, "loss": 0.7562, "rewards/accuracies": 1.0, "rewards/chosen": 0.19992388784885406, "rewards/margins": 1.5560835599899292, "rewards/rejected": -1.3561595678329468, "step": 1998 }, { "epoch": 0.3338895377940931, "grad_norm": 32.50735855102539, "learning_rate": 1.666110462205907e-05, "logits/chosen": -0.7532026171684265, "logits/rejected": -0.7200968861579895, "logps/chosen": -75.4580307006836, "logps/rejected": -66.06098175048828, "loss": 0.8788, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31302544474601746, "rewards/margins": 1.3158268928527832, "rewards/rejected": -1.628852367401123, "step": 2001 }, { "epoch": 0.3343901218087769, "grad_norm": 7.811986923217773, "learning_rate": 1.6656098781912234e-05, "logits/chosen": -0.7236436009407043, "logits/rejected": -0.8179299235343933, "logps/chosen": -54.142826080322266, "logps/rejected": -133.61363220214844, "loss": 0.3815, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.24275349080562592, "rewards/margins": 1.7822996377944946, "rewards/rejected": -2.0250532627105713, "step": 2004 }, { "epoch": 0.3348907058234607, "grad_norm": 10.961118698120117, "learning_rate": 1.6651092941765395e-05, "logits/chosen": -0.8861842751502991, "logits/rejected": -0.9012430310249329, "logps/chosen": -68.12358093261719, "logps/rejected": -111.10480499267578, "loss": 0.3913, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.662878394126892, "rewards/margins": 2.0880026817321777, "rewards/rejected": -3.7508811950683594, "step": 2007 }, { "epoch": 0.3353912898381445, "grad_norm": 15.019346237182617, "learning_rate": 1.6646087101618557e-05, "logits/chosen": -0.7483189105987549, "logits/rejected": -0.7378368377685547, "logps/chosen": -101.37845611572266, "logps/rejected": -94.62372589111328, "loss": 0.7727, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8958110809326172, "rewards/margins": 1.665998101234436, "rewards/rejected": -2.5618093013763428, "step": 2010 }, { "epoch": 0.3358918738528283, "grad_norm": 4.9430742263793945, "learning_rate": 1.664108126147172e-05, "logits/chosen": -0.9103769659996033, "logits/rejected": -0.8992440104484558, "logps/chosen": -113.4769287109375, "logps/rejected": -96.68302154541016, "loss": 0.5254, "rewards/accuracies": 1.0, "rewards/chosen": -0.7326151728630066, "rewards/margins": 1.979343056678772, "rewards/rejected": -2.711958169937134, "step": 2013 }, { "epoch": 0.3363924578675121, "grad_norm": 18.416231155395508, "learning_rate": 1.663607542132488e-05, "logits/chosen": -0.9269912838935852, "logits/rejected": -0.9183341860771179, "logps/chosen": -72.55060577392578, "logps/rejected": -92.08817291259766, "loss": 0.3941, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5481008887290955, "rewards/margins": 0.7997296452522278, "rewards/rejected": -1.3478306531906128, "step": 2016 }, { "epoch": 0.3368930418821959, "grad_norm": 22.963897705078125, "learning_rate": 1.6631069581178042e-05, "logits/chosen": -0.5881931781768799, "logits/rejected": -0.5923774242401123, "logps/chosen": -82.7655258178711, "logps/rejected": -97.36270904541016, "loss": 0.6587, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7651354670524597, "rewards/margins": 0.9778665900230408, "rewards/rejected": -1.7430020570755005, "step": 2019 }, { "epoch": 0.3373936258968797, "grad_norm": 10.647331237792969, "learning_rate": 1.6626063741031204e-05, "logits/chosen": -0.8615872859954834, "logits/rejected": -0.8693227171897888, "logps/chosen": -18.322078704833984, "logps/rejected": -65.16804504394531, "loss": 0.6175, "rewards/accuracies": 1.0, "rewards/chosen": 0.7035861015319824, "rewards/margins": 2.131267547607422, "rewards/rejected": -1.42768132686615, "step": 2022 }, { "epoch": 0.3378942099115635, "grad_norm": 10.566824913024902, "learning_rate": 1.6621057900884366e-05, "logits/chosen": -0.9065432548522949, "logits/rejected": -0.9166151881217957, "logps/chosen": -103.8121337890625, "logps/rejected": -119.8609848022461, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 0.43922117352485657, "rewards/margins": 2.3799471855163574, "rewards/rejected": -1.9407259225845337, "step": 2025 }, { "epoch": 0.3383947939262473, "grad_norm": 24.925079345703125, "learning_rate": 1.6616052060737528e-05, "logits/chosen": -0.7756584286689758, "logits/rejected": -0.7260213494300842, "logps/chosen": -138.6807098388672, "logps/rejected": -70.94709014892578, "loss": 0.7796, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0446395874023438, "rewards/margins": -0.001776019693352282, "rewards/rejected": -1.0428636074066162, "step": 2028 }, { "epoch": 0.3388953779409311, "grad_norm": 22.66495132446289, "learning_rate": 1.661104622059069e-05, "logits/chosen": -0.8359119892120361, "logits/rejected": -0.8136897683143616, "logps/chosen": -85.85106658935547, "logps/rejected": -67.95391845703125, "loss": 0.551, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7089510560035706, "rewards/margins": -0.18304520845413208, "rewards/rejected": -0.5259058475494385, "step": 2031 }, { "epoch": 0.3393959619556149, "grad_norm": 19.152301788330078, "learning_rate": 1.660604038044385e-05, "logits/chosen": -0.880767285823822, "logits/rejected": -0.8355833888053894, "logps/chosen": -82.07743072509766, "logps/rejected": -89.95601654052734, "loss": 0.343, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3014066219329834, "rewards/margins": 0.3787335455417633, "rewards/rejected": -0.6801401972770691, "step": 2034 }, { "epoch": 0.33989654597029867, "grad_norm": 30.32927894592285, "learning_rate": 1.6601034540297013e-05, "logits/chosen": -0.7493662238121033, "logits/rejected": -0.7999820709228516, "logps/chosen": -110.4018783569336, "logps/rejected": -140.8137969970703, "loss": 0.5229, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6039861440658569, "rewards/margins": -0.1273033171892166, "rewards/rejected": -0.4766828119754791, "step": 2037 }, { "epoch": 0.3403971299849825, "grad_norm": 21.114118576049805, "learning_rate": 1.659602870015018e-05, "logits/chosen": -0.8933882713317871, "logits/rejected": -0.9203779697418213, "logps/chosen": -63.6519660949707, "logps/rejected": -118.0184097290039, "loss": 0.4965, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2377194166183472, "rewards/margins": 2.2955713272094727, "rewards/rejected": -3.5332908630371094, "step": 2040 }, { "epoch": 0.3408977139996663, "grad_norm": 27.631683349609375, "learning_rate": 1.6591022860003337e-05, "logits/chosen": -0.6832491755485535, "logits/rejected": -0.7176468968391418, "logps/chosen": -66.48033905029297, "logps/rejected": -99.02676391601562, "loss": 0.5629, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7086096405982971, "rewards/margins": 1.2688430547714233, "rewards/rejected": -1.9774527549743652, "step": 2043 }, { "epoch": 0.34139829801435007, "grad_norm": 32.188720703125, "learning_rate": 1.65860170198565e-05, "logits/chosen": -0.7068149447441101, "logits/rejected": -0.6749356389045715, "logps/chosen": -111.50948333740234, "logps/rejected": -86.96810150146484, "loss": 0.8657, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.784501552581787, "rewards/margins": -1.470751404762268, "rewards/rejected": -0.3137500584125519, "step": 2046 }, { "epoch": 0.34189888202903385, "grad_norm": 16.3426570892334, "learning_rate": 1.6581011179709664e-05, "logits/chosen": -0.807063102722168, "logits/rejected": -0.8244335651397705, "logps/chosen": -69.38396453857422, "logps/rejected": -213.2432403564453, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 0.38884785771369934, "rewards/margins": 4.605329990386963, "rewards/rejected": -4.216482162475586, "step": 2049 }, { "epoch": 0.3423994660437177, "grad_norm": 9.43206787109375, "learning_rate": 1.6576005339562826e-05, "logits/chosen": -0.8431730270385742, "logits/rejected": -0.844815731048584, "logps/chosen": -82.13445281982422, "logps/rejected": -86.57595825195312, "loss": 0.586, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6938490867614746, "rewards/margins": 0.5781807899475098, "rewards/rejected": -1.2720298767089844, "step": 2052 }, { "epoch": 0.34290005005840146, "grad_norm": 16.16309928894043, "learning_rate": 1.6570999499415987e-05, "logits/chosen": -0.8209899067878723, "logits/rejected": -0.8514609336853027, "logps/chosen": -87.6590347290039, "logps/rejected": -148.10047912597656, "loss": 0.3106, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7816688418388367, "rewards/margins": 1.5797666311264038, "rewards/rejected": -2.361435651779175, "step": 2055 }, { "epoch": 0.34340063407308524, "grad_norm": 22.64757537841797, "learning_rate": 1.656599365926915e-05, "logits/chosen": -0.5348387360572815, "logits/rejected": -0.577364981174469, "logps/chosen": -72.83464813232422, "logps/rejected": -117.62020111083984, "loss": 0.3226, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6878353357315063, "rewards/margins": 1.8443132638931274, "rewards/rejected": -3.532148599624634, "step": 2058 }, { "epoch": 0.3439012180877691, "grad_norm": 26.841915130615234, "learning_rate": 1.656098781912231e-05, "logits/chosen": -0.8014845252037048, "logits/rejected": -0.7879384160041809, "logps/chosen": -68.204833984375, "logps/rejected": -56.58640670776367, "loss": 1.0323, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7517883777618408, "rewards/margins": -0.7700719237327576, "rewards/rejected": 0.01828359253704548, "step": 2061 }, { "epoch": 0.34440180210245286, "grad_norm": 28.60386085510254, "learning_rate": 1.6555981978975473e-05, "logits/chosen": -0.6877243518829346, "logits/rejected": -0.7411143779754639, "logps/chosen": -117.5536880493164, "logps/rejected": -207.5010986328125, "loss": 0.4458, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0942869186401367, "rewards/margins": 1.460645079612732, "rewards/rejected": -3.554931879043579, "step": 2064 }, { "epoch": 0.34490238611713664, "grad_norm": 16.363182067871094, "learning_rate": 1.6550976138828635e-05, "logits/chosen": -0.7822746634483337, "logits/rejected": -0.8321852087974548, "logps/chosen": -50.47869873046875, "logps/rejected": -114.24762725830078, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": -0.30603453516960144, "rewards/margins": 2.9701411724090576, "rewards/rejected": -3.2761754989624023, "step": 2067 }, { "epoch": 0.3454029701318205, "grad_norm": 12.558526039123535, "learning_rate": 1.6545970298681796e-05, "logits/chosen": -0.7682178616523743, "logits/rejected": -0.7126315236091614, "logps/chosen": -94.69617462158203, "logps/rejected": -76.39035034179688, "loss": 0.998, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5385459661483765, "rewards/margins": 1.0520734786987305, "rewards/rejected": -2.5906193256378174, "step": 2070 }, { "epoch": 0.34590355414650426, "grad_norm": 22.421499252319336, "learning_rate": 1.6540964458534958e-05, "logits/chosen": -0.8328455090522766, "logits/rejected": -0.868632972240448, "logps/chosen": -67.89054107666016, "logps/rejected": -133.37281799316406, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": -0.6097949147224426, "rewards/margins": 1.0950974225997925, "rewards/rejected": -1.7048922777175903, "step": 2073 }, { "epoch": 0.34640413816118804, "grad_norm": 61.088871002197266, "learning_rate": 1.653595861838812e-05, "logits/chosen": -0.9632417559623718, "logits/rejected": -0.8957064151763916, "logps/chosen": -106.7155990600586, "logps/rejected": -91.86962127685547, "loss": 0.8013, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.8002798557281494, "rewards/margins": 0.24323804676532745, "rewards/rejected": -3.04351806640625, "step": 2076 }, { "epoch": 0.34690472217587187, "grad_norm": 32.03681182861328, "learning_rate": 1.6530952778241282e-05, "logits/chosen": -0.8429970741271973, "logits/rejected": -0.8817481994628906, "logps/chosen": -67.2114486694336, "logps/rejected": -119.7996597290039, "loss": 0.5018, "rewards/accuracies": 1.0, "rewards/chosen": -0.2205609679222107, "rewards/margins": 2.491692066192627, "rewards/rejected": -2.7122528553009033, "step": 2079 }, { "epoch": 0.34740530619055565, "grad_norm": 24.944704055786133, "learning_rate": 1.6525946938094447e-05, "logits/chosen": -0.7534313797950745, "logits/rejected": -0.8165841102600098, "logps/chosen": -72.76679229736328, "logps/rejected": -204.2996826171875, "loss": 0.5497, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3308751583099365, "rewards/margins": 3.3363163471221924, "rewards/rejected": -3.667191743850708, "step": 2082 }, { "epoch": 0.34790589020523943, "grad_norm": 12.608721733093262, "learning_rate": 1.6520941097947605e-05, "logits/chosen": -0.7712128162384033, "logits/rejected": -0.7778420448303223, "logps/chosen": -73.9840087890625, "logps/rejected": -91.83361053466797, "loss": 0.6028, "rewards/accuracies": 1.0, "rewards/chosen": -0.6604027152061462, "rewards/margins": 0.372701495885849, "rewards/rejected": -1.0331043004989624, "step": 2085 }, { "epoch": 0.34840647421992327, "grad_norm": 20.59442710876465, "learning_rate": 1.6515935257800767e-05, "logits/chosen": -0.9894130229949951, "logits/rejected": -1.0265454053878784, "logps/chosen": -86.27606201171875, "logps/rejected": -131.45042419433594, "loss": 0.458, "rewards/accuracies": 1.0, "rewards/chosen": -1.4818342924118042, "rewards/margins": 2.1296451091766357, "rewards/rejected": -3.6114795207977295, "step": 2088 }, { "epoch": 0.34890705823460705, "grad_norm": 9.582266807556152, "learning_rate": 1.6510929417653932e-05, "logits/chosen": -0.8007538914680481, "logits/rejected": -0.832940399646759, "logps/chosen": -87.7345199584961, "logps/rejected": -169.1168975830078, "loss": 0.6677, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9837530851364136, "rewards/margins": 0.2853505611419678, "rewards/rejected": -2.269103527069092, "step": 2091 }, { "epoch": 0.34940764224929083, "grad_norm": 25.493051528930664, "learning_rate": 1.6505923577507094e-05, "logits/chosen": -0.6240516304969788, "logits/rejected": -0.6051943898200989, "logps/chosen": -135.6118927001953, "logps/rejected": -106.67214965820312, "loss": 0.5915, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4740310907363892, "rewards/margins": 0.012304465286433697, "rewards/rejected": -1.4863353967666626, "step": 2094 }, { "epoch": 0.34990822626397466, "grad_norm": 6.453088760375977, "learning_rate": 1.6500917737360256e-05, "logits/chosen": -0.6252234578132629, "logits/rejected": -0.6584280133247375, "logps/chosen": -101.41378784179688, "logps/rejected": -156.0856170654297, "loss": 0.5322, "rewards/accuracies": 1.0, "rewards/chosen": -0.6460933685302734, "rewards/margins": 2.7102391719818115, "rewards/rejected": -3.3563320636749268, "step": 2097 }, { "epoch": 0.35040881027865844, "grad_norm": 58.821075439453125, "learning_rate": 1.6495911897213418e-05, "logits/chosen": -0.812552273273468, "logits/rejected": -0.8842489123344421, "logps/chosen": -86.15837860107422, "logps/rejected": -132.14329528808594, "loss": 0.7466, "rewards/accuracies": 1.0, "rewards/chosen": 0.5212927460670471, "rewards/margins": 1.7760391235351562, "rewards/rejected": -1.254746437072754, "step": 2100 }, { "epoch": 0.3509093942933422, "grad_norm": 10.11136531829834, "learning_rate": 1.649090605706658e-05, "logits/chosen": -0.8503201603889465, "logits/rejected": -0.8788855671882629, "logps/chosen": -53.5626335144043, "logps/rejected": -105.93225860595703, "loss": 0.5584, "rewards/accuracies": 1.0, "rewards/chosen": -0.04921875521540642, "rewards/margins": 2.9397003650665283, "rewards/rejected": -2.988919258117676, "step": 2103 }, { "epoch": 0.351409978308026, "grad_norm": 18.903959274291992, "learning_rate": 1.648590021691974e-05, "logits/chosen": -0.7323679327964783, "logits/rejected": -0.8395978808403015, "logps/chosen": -53.27783203125, "logps/rejected": -122.15087890625, "loss": 0.458, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.1766643524169922, "rewards/margins": 0.8639118671417236, "rewards/rejected": -0.6872475147247314, "step": 2106 }, { "epoch": 0.35191056232270984, "grad_norm": 19.551406860351562, "learning_rate": 1.6480894376772903e-05, "logits/chosen": -0.8580182194709778, "logits/rejected": -0.8558862805366516, "logps/chosen": -83.90103912353516, "logps/rejected": -117.88057708740234, "loss": 0.3644, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2547903060913086, "rewards/margins": 0.5802426338195801, "rewards/rejected": -1.8350329399108887, "step": 2109 }, { "epoch": 0.3524111463373936, "grad_norm": 12.276021003723145, "learning_rate": 1.6475888536626065e-05, "logits/chosen": -0.826632022857666, "logits/rejected": -0.8312123417854309, "logps/chosen": -56.46901321411133, "logps/rejected": -74.4023208618164, "loss": 0.6597, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7301092147827148, "rewards/margins": 0.6544631123542786, "rewards/rejected": -1.3845723867416382, "step": 2112 }, { "epoch": 0.3529117303520774, "grad_norm": 23.363183975219727, "learning_rate": 1.6470882696479227e-05, "logits/chosen": -0.9181692004203796, "logits/rejected": -0.9416297078132629, "logps/chosen": -56.611785888671875, "logps/rejected": -79.71460723876953, "loss": 0.5894, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0045779943466187, "rewards/margins": -0.32863175868988037, "rewards/rejected": -0.6759462952613831, "step": 2115 }, { "epoch": 0.35341231436676124, "grad_norm": 33.696617126464844, "learning_rate": 1.646587685633239e-05, "logits/chosen": -0.9762522578239441, "logits/rejected": -0.9955361485481262, "logps/chosen": -101.2260971069336, "logps/rejected": -143.3799285888672, "loss": 0.3117, "rewards/accuracies": 1.0, "rewards/chosen": -0.06414056569337845, "rewards/margins": 3.4045631885528564, "rewards/rejected": -3.4687042236328125, "step": 2118 }, { "epoch": 0.353912898381445, "grad_norm": 15.950493812561035, "learning_rate": 1.646087101618555e-05, "logits/chosen": -0.8323886394500732, "logits/rejected": -0.8321631550788879, "logps/chosen": -87.5099868774414, "logps/rejected": -112.79874420166016, "loss": 0.8537, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5970349311828613, "rewards/margins": 1.4771610498428345, "rewards/rejected": -3.0741958618164062, "step": 2121 }, { "epoch": 0.3544134823961288, "grad_norm": 8.593965530395508, "learning_rate": 1.6455865176038712e-05, "logits/chosen": -0.7900614738464355, "logits/rejected": -0.9141923785209656, "logps/chosen": -55.534427642822266, "logps/rejected": -123.75174713134766, "loss": 0.3327, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2748278081417084, "rewards/margins": 2.546945810317993, "rewards/rejected": -2.8217735290527344, "step": 2124 }, { "epoch": 0.35491406641081263, "grad_norm": 39.603271484375, "learning_rate": 1.6450859335891874e-05, "logits/chosen": -0.908890426158905, "logits/rejected": -0.9066804051399231, "logps/chosen": -64.94762420654297, "logps/rejected": -74.49031829833984, "loss": 0.8369, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5546485781669617, "rewards/margins": 1.2183655500411987, "rewards/rejected": -1.7730141878128052, "step": 2127 }, { "epoch": 0.3554146504254964, "grad_norm": 30.947120666503906, "learning_rate": 1.6445853495745036e-05, "logits/chosen": -0.7850301861763, "logits/rejected": -0.8167354464530945, "logps/chosen": -53.015350341796875, "logps/rejected": -108.40740966796875, "loss": 0.5663, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9643965363502502, "rewards/margins": 2.014066696166992, "rewards/rejected": -2.9784631729125977, "step": 2130 }, { "epoch": 0.3559152344401802, "grad_norm": 28.228214263916016, "learning_rate": 1.64408476555982e-05, "logits/chosen": -0.7297599911689758, "logits/rejected": -0.6680200695991516, "logps/chosen": -133.25167846679688, "logps/rejected": -79.2611083984375, "loss": 0.6444, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.2697222232818604, "rewards/margins": -0.5260334610939026, "rewards/rejected": -1.743688702583313, "step": 2133 }, { "epoch": 0.35641581845486403, "grad_norm": 37.459171295166016, "learning_rate": 1.6435841815451363e-05, "logits/chosen": -0.5482661724090576, "logits/rejected": -0.5980406403541565, "logps/chosen": -72.32159423828125, "logps/rejected": -116.79447174072266, "loss": 0.8047, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0584169626235962, "rewards/margins": 0.04254315420985222, "rewards/rejected": -1.1009602546691895, "step": 2136 }, { "epoch": 0.3569164024695478, "grad_norm": 42.13191604614258, "learning_rate": 1.6430835975304524e-05, "logits/chosen": -0.7169283032417297, "logits/rejected": -0.7204186916351318, "logps/chosen": -62.9406623840332, "logps/rejected": -73.07183074951172, "loss": 1.2341, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.805933952331543, "rewards/margins": 0.5182885527610779, "rewards/rejected": -1.3242225646972656, "step": 2139 }, { "epoch": 0.3574169864842316, "grad_norm": 20.62352180480957, "learning_rate": 1.6425830135157686e-05, "logits/chosen": -0.8892529606819153, "logits/rejected": -0.8708431124687195, "logps/chosen": -86.00960540771484, "logps/rejected": -146.7954559326172, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 0.40674224495887756, "rewards/margins": 2.669334650039673, "rewards/rejected": -2.2625925540924072, "step": 2142 }, { "epoch": 0.3579175704989154, "grad_norm": 46.93566131591797, "learning_rate": 1.6420824295010848e-05, "logits/chosen": -0.9546138644218445, "logits/rejected": -0.9376944899559021, "logps/chosen": -92.22943878173828, "logps/rejected": -105.41759490966797, "loss": 0.6181, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6139705777168274, "rewards/margins": 2.568160057067871, "rewards/rejected": -3.1821305751800537, "step": 2145 }, { "epoch": 0.3584181545135992, "grad_norm": 68.0331039428711, "learning_rate": 1.641581845486401e-05, "logits/chosen": -0.9112908244132996, "logits/rejected": -0.8859713673591614, "logps/chosen": -168.97853088378906, "logps/rejected": -112.30164337158203, "loss": 1.1443, "rewards/accuracies": 0.0, "rewards/chosen": -2.6768620014190674, "rewards/margins": -2.316800355911255, "rewards/rejected": -0.3600616455078125, "step": 2148 }, { "epoch": 0.358918738528283, "grad_norm": 20.64935302734375, "learning_rate": 1.641081261471717e-05, "logits/chosen": -0.844137966632843, "logits/rejected": -0.8658124804496765, "logps/chosen": -99.27115631103516, "logps/rejected": -129.43211364746094, "loss": 0.301, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2864123582839966, "rewards/margins": 1.9726656675338745, "rewards/rejected": -3.259078025817871, "step": 2151 }, { "epoch": 0.35941932254296677, "grad_norm": 16.77361297607422, "learning_rate": 1.6405806774570333e-05, "logits/chosen": -0.7948789596557617, "logits/rejected": -0.7702309489250183, "logps/chosen": -116.5586929321289, "logps/rejected": -125.59391021728516, "loss": 0.5863, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.496213912963867, "rewards/margins": 0.6238419413566589, "rewards/rejected": -4.120055675506592, "step": 2154 }, { "epoch": 0.3599199065576506, "grad_norm": 41.14187240600586, "learning_rate": 1.6400800934423495e-05, "logits/chosen": -0.8331630229949951, "logits/rejected": -0.8491995334625244, "logps/chosen": -82.41492462158203, "logps/rejected": -142.23577880859375, "loss": 0.4181, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.05325325205922127, "rewards/margins": 5.46169900894165, "rewards/rejected": -5.514952182769775, "step": 2157 }, { "epoch": 0.3604204905723344, "grad_norm": 28.84272003173828, "learning_rate": 1.6395795094276657e-05, "logits/chosen": -0.8342573046684265, "logits/rejected": -0.8376586437225342, "logps/chosen": -62.396610260009766, "logps/rejected": -84.70171356201172, "loss": 0.5153, "rewards/accuracies": 1.0, "rewards/chosen": -0.3590141236782074, "rewards/margins": 1.3873132467269897, "rewards/rejected": -1.74632728099823, "step": 2160 }, { "epoch": 0.36092107458701816, "grad_norm": 16.02791404724121, "learning_rate": 1.639078925412982e-05, "logits/chosen": -0.63853520154953, "logits/rejected": -0.6161568760871887, "logps/chosen": -55.617862701416016, "logps/rejected": -76.13221740722656, "loss": 0.4257, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8471291065216064, "rewards/margins": 2.1659977436065674, "rewards/rejected": -3.013127088546753, "step": 2163 }, { "epoch": 0.361421658601702, "grad_norm": 59.47908401489258, "learning_rate": 1.638578341398298e-05, "logits/chosen": -0.5908772945404053, "logits/rejected": -0.6666173934936523, "logps/chosen": -58.51907730102539, "logps/rejected": -136.2406463623047, "loss": 0.926, "rewards/accuracies": 1.0, "rewards/chosen": -0.03268265724182129, "rewards/margins": 2.621220827102661, "rewards/rejected": -2.6539034843444824, "step": 2166 }, { "epoch": 0.3619222426163858, "grad_norm": 22.353700637817383, "learning_rate": 1.6380777573836142e-05, "logits/chosen": -0.7848102450370789, "logits/rejected": -0.7517692446708679, "logps/chosen": -128.67242431640625, "logps/rejected": -120.9066162109375, "loss": 0.6486, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9951061606407166, "rewards/margins": 0.4342184066772461, "rewards/rejected": -1.4293246269226074, "step": 2169 }, { "epoch": 0.36242282663106956, "grad_norm": 24.42130470275879, "learning_rate": 1.6375771733689304e-05, "logits/chosen": -0.889998733997345, "logits/rejected": -0.8694841265678406, "logps/chosen": -90.92525482177734, "logps/rejected": -77.00154113769531, "loss": 0.6781, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9541841745376587, "rewards/margins": 0.6739937663078308, "rewards/rejected": -2.6281778812408447, "step": 2172 }, { "epoch": 0.3629234106457534, "grad_norm": 37.994991302490234, "learning_rate": 1.637076589354247e-05, "logits/chosen": -0.6555395722389221, "logits/rejected": -0.7438516020774841, "logps/chosen": -78.89826202392578, "logps/rejected": -163.6137237548828, "loss": 0.9023, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9054848551750183, "rewards/margins": 0.6894596219062805, "rewards/rejected": -1.5949445962905884, "step": 2175 }, { "epoch": 0.3634239946604372, "grad_norm": 44.968936920166016, "learning_rate": 1.636576005339563e-05, "logits/chosen": -0.8957555294036865, "logits/rejected": -0.9194667935371399, "logps/chosen": -122.93426513671875, "logps/rejected": -133.4359893798828, "loss": 0.9649, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.549631953239441, "rewards/margins": 0.1627775877714157, "rewards/rejected": -1.712409496307373, "step": 2178 }, { "epoch": 0.36392457867512096, "grad_norm": 15.419439315795898, "learning_rate": 1.636075421324879e-05, "logits/chosen": -0.7498825192451477, "logits/rejected": -0.7919406890869141, "logps/chosen": -76.37872314453125, "logps/rejected": -124.248779296875, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": -0.18801359832286835, "rewards/margins": 1.1490410566329956, "rewards/rejected": -1.3370546102523804, "step": 2181 }, { "epoch": 0.3644251626898048, "grad_norm": 27.882328033447266, "learning_rate": 1.6355748373101955e-05, "logits/chosen": -0.8810141086578369, "logits/rejected": -0.8363118171691895, "logps/chosen": -80.88321685791016, "logps/rejected": -72.58783721923828, "loss": 0.2945, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7166738510131836, "rewards/margins": 1.1070996522903442, "rewards/rejected": -1.8237735033035278, "step": 2184 }, { "epoch": 0.3649257467044886, "grad_norm": 37.688350677490234, "learning_rate": 1.6350742532955117e-05, "logits/chosen": -0.6408339142799377, "logits/rejected": -0.6592617034912109, "logps/chosen": -49.5195426940918, "logps/rejected": -57.10142135620117, "loss": 0.9287, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7350843548774719, "rewards/margins": 0.7578137516975403, "rewards/rejected": -0.022729357704520226, "step": 2187 }, { "epoch": 0.36542633071917235, "grad_norm": 23.314775466918945, "learning_rate": 1.634573669280828e-05, "logits/chosen": -0.8489620089530945, "logits/rejected": -0.8116611838340759, "logps/chosen": -123.36996459960938, "logps/rejected": -69.59640502929688, "loss": 0.5763, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6652191281318665, "rewards/margins": -0.2685526907444, "rewards/rejected": -0.3966665267944336, "step": 2190 }, { "epoch": 0.3659269147338562, "grad_norm": 5.969237804412842, "learning_rate": 1.634073085266144e-05, "logits/chosen": -0.8056034445762634, "logits/rejected": -0.8490055203437805, "logps/chosen": -72.61412811279297, "logps/rejected": -193.435791015625, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 0.2491029053926468, "rewards/margins": 2.8269221782684326, "rewards/rejected": -2.577819347381592, "step": 2193 }, { "epoch": 0.36642749874853997, "grad_norm": 25.936683654785156, "learning_rate": 1.6335725012514602e-05, "logits/chosen": -0.7376623749732971, "logits/rejected": -0.6873061060905457, "logps/chosen": -138.9318389892578, "logps/rejected": -67.8882064819336, "loss": 0.4777, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7082479000091553, "rewards/margins": 1.2767080068588257, "rewards/rejected": -1.9849557876586914, "step": 2196 }, { "epoch": 0.36692808276322375, "grad_norm": 21.00391960144043, "learning_rate": 1.6330719172367764e-05, "logits/chosen": -0.6871139407157898, "logits/rejected": -0.7590663433074951, "logps/chosen": -66.02434539794922, "logps/rejected": -107.37943267822266, "loss": 0.4037, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.829852819442749, "rewards/margins": 2.7593040466308594, "rewards/rejected": -1.9294509887695312, "step": 2199 }, { "epoch": 0.3674286667779076, "grad_norm": 10.56385326385498, "learning_rate": 1.6325713332220926e-05, "logits/chosen": -0.9264715313911438, "logits/rejected": -0.9571776390075684, "logps/chosen": -27.904844284057617, "logps/rejected": -113.0328598022461, "loss": 0.3786, "rewards/accuracies": 1.0, "rewards/chosen": 0.4172248840332031, "rewards/margins": 2.3753297328948975, "rewards/rejected": -1.9581049680709839, "step": 2202 }, { "epoch": 0.36792925079259137, "grad_norm": 29.252016067504883, "learning_rate": 1.6320707492074087e-05, "logits/chosen": -0.9105876088142395, "logits/rejected": -0.9094643592834473, "logps/chosen": -68.12511444091797, "logps/rejected": -80.08325958251953, "loss": 0.6027, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8616896271705627, "rewards/margins": 0.4995304048061371, "rewards/rejected": -1.3612200021743774, "step": 2205 }, { "epoch": 0.36842983480727515, "grad_norm": 17.367523193359375, "learning_rate": 1.631570165192725e-05, "logits/chosen": -0.8824341893196106, "logits/rejected": -0.8834490776062012, "logps/chosen": -82.27393341064453, "logps/rejected": -100.2283706665039, "loss": 0.4072, "rewards/accuracies": 1.0, "rewards/chosen": -1.4244275093078613, "rewards/margins": 1.105317234992981, "rewards/rejected": -2.529744863510132, "step": 2208 }, { "epoch": 0.3689304188219589, "grad_norm": 10.76339054107666, "learning_rate": 1.631069581178041e-05, "logits/chosen": -0.9398791193962097, "logits/rejected": -0.9690764546394348, "logps/chosen": -106.1508560180664, "logps/rejected": -110.79874420166016, "loss": 0.368, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8229090571403503, "rewards/margins": 1.101993203163147, "rewards/rejected": -1.924902319908142, "step": 2211 }, { "epoch": 0.36943100283664276, "grad_norm": 21.15926742553711, "learning_rate": 1.6305689971633573e-05, "logits/chosen": -0.7922101020812988, "logits/rejected": -0.7993461489677429, "logps/chosen": -122.79630279541016, "logps/rejected": -118.9022216796875, "loss": 0.5259, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.348660707473755, "rewards/margins": -0.2893599271774292, "rewards/rejected": -3.059300661087036, "step": 2214 }, { "epoch": 0.36993158685132654, "grad_norm": 19.099634170532227, "learning_rate": 1.6300684131486735e-05, "logits/chosen": -0.6526718139648438, "logits/rejected": -0.715366780757904, "logps/chosen": -88.34752655029297, "logps/rejected": -135.1314239501953, "loss": 0.625, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.006811022758483887, "rewards/margins": 2.7728888988494873, "rewards/rejected": -2.7797000408172607, "step": 2217 }, { "epoch": 0.3704321708660103, "grad_norm": 8.381346702575684, "learning_rate": 1.62956782913399e-05, "logits/chosen": -0.7939942479133606, "logits/rejected": -0.7663717269897461, "logps/chosen": -67.5701904296875, "logps/rejected": -93.75192260742188, "loss": 0.2899, "rewards/accuracies": 1.0, "rewards/chosen": 0.5043137669563293, "rewards/margins": 1.2745964527130127, "rewards/rejected": -0.7702826857566833, "step": 2220 }, { "epoch": 0.37093275488069416, "grad_norm": 48.31520462036133, "learning_rate": 1.6290672451193058e-05, "logits/chosen": -0.7161192893981934, "logits/rejected": -0.7608781456947327, "logps/chosen": -51.962005615234375, "logps/rejected": -113.89112091064453, "loss": 0.6921, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.207025170326233, "rewards/margins": 0.2397523671388626, "rewards/rejected": -1.4467777013778687, "step": 2223 }, { "epoch": 0.37143333889537794, "grad_norm": 36.738914489746094, "learning_rate": 1.6285666611046223e-05, "logits/chosen": -0.865804135799408, "logits/rejected": -0.8672282695770264, "logps/chosen": -90.79280853271484, "logps/rejected": -133.44085693359375, "loss": 0.7002, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7700722217559814, "rewards/margins": 3.183727264404297, "rewards/rejected": -3.953799247741699, "step": 2226 }, { "epoch": 0.3719339229100617, "grad_norm": 10.011866569519043, "learning_rate": 1.6280660770899385e-05, "logits/chosen": -0.78994220495224, "logits/rejected": -0.8475337028503418, "logps/chosen": -77.7845230102539, "logps/rejected": -133.83514404296875, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 0.3280933201313019, "rewards/margins": 2.714069366455078, "rewards/rejected": -2.3859760761260986, "step": 2229 }, { "epoch": 0.37243450692474556, "grad_norm": 22.679073333740234, "learning_rate": 1.6275654930752547e-05, "logits/chosen": -0.9158630967140198, "logits/rejected": -0.9049233794212341, "logps/chosen": -82.95755004882812, "logps/rejected": -89.85407257080078, "loss": 0.5159, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.425310492515564, "rewards/margins": 0.8257400989532471, "rewards/rejected": -2.2510502338409424, "step": 2232 }, { "epoch": 0.37293509093942934, "grad_norm": 18.181970596313477, "learning_rate": 1.627064909060571e-05, "logits/chosen": -0.9611427187919617, "logits/rejected": -0.977762758731842, "logps/chosen": -81.7042236328125, "logps/rejected": -100.78287506103516, "loss": 0.5592, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0662869215011597, "rewards/margins": 1.329996943473816, "rewards/rejected": -2.3962838649749756, "step": 2235 }, { "epoch": 0.3734356749541131, "grad_norm": 42.350467681884766, "learning_rate": 1.626564325045887e-05, "logits/chosen": -0.797013521194458, "logits/rejected": -0.8377386927604675, "logps/chosen": -116.02169799804688, "logps/rejected": -139.88426208496094, "loss": 0.5351, "rewards/accuracies": 1.0, "rewards/chosen": -0.1439775973558426, "rewards/margins": 1.5131264925003052, "rewards/rejected": -1.6571041345596313, "step": 2238 }, { "epoch": 0.37393625896879695, "grad_norm": 21.58661460876465, "learning_rate": 1.6260637410312032e-05, "logits/chosen": -0.7577773928642273, "logits/rejected": -0.8088934421539307, "logps/chosen": -74.63603210449219, "logps/rejected": -130.21205139160156, "loss": 0.4011, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3192455470561981, "rewards/margins": 1.473406195640564, "rewards/rejected": -1.792651653289795, "step": 2241 }, { "epoch": 0.37443684298348073, "grad_norm": 27.122068405151367, "learning_rate": 1.6255631570165194e-05, "logits/chosen": -0.8631088137626648, "logits/rejected": -0.8900110721588135, "logps/chosen": -62.506221771240234, "logps/rejected": -87.2449722290039, "loss": 0.3692, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3801192343235016, "rewards/margins": 0.8371691703796387, "rewards/rejected": -1.2172883749008179, "step": 2244 }, { "epoch": 0.3749374269981645, "grad_norm": 16.75638198852539, "learning_rate": 1.6250625730018356e-05, "logits/chosen": -0.8838591575622559, "logits/rejected": -0.9081608653068542, "logps/chosen": -68.60306549072266, "logps/rejected": -120.23389434814453, "loss": 0.6499, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7492437362670898, "rewards/margins": 1.936159610748291, "rewards/rejected": -2.6854031085968018, "step": 2247 }, { "epoch": 0.37543801101284835, "grad_norm": 28.04286003112793, "learning_rate": 1.6245619889871518e-05, "logits/chosen": -1.015913486480713, "logits/rejected": -1.011716365814209, "logps/chosen": -127.52141571044922, "logps/rejected": -154.6398162841797, "loss": 0.3818, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5890926122665405, "rewards/margins": 0.022756418213248253, "rewards/rejected": -1.6118489503860474, "step": 2250 }, { "epoch": 0.37593859502753213, "grad_norm": 35.36320114135742, "learning_rate": 1.624061404972468e-05, "logits/chosen": -0.7592543959617615, "logits/rejected": -0.8344838619232178, "logps/chosen": -46.85835647583008, "logps/rejected": -161.08154296875, "loss": 0.1843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9008072018623352, "rewards/margins": 6.277851104736328, "rewards/rejected": -5.3770432472229, "step": 2253 }, { "epoch": 0.3764391790422159, "grad_norm": 22.03831672668457, "learning_rate": 1.623560820957784e-05, "logits/chosen": -0.8500257134437561, "logits/rejected": -0.8784549236297607, "logps/chosen": -89.49637603759766, "logps/rejected": -124.50241088867188, "loss": 0.4721, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.594088077545166, "rewards/margins": 1.8346534967422485, "rewards/rejected": -2.428741693496704, "step": 2256 }, { "epoch": 0.3769397630568997, "grad_norm": 26.07710075378418, "learning_rate": 1.6230602369431003e-05, "logits/chosen": -0.8939083218574524, "logits/rejected": -0.9362091422080994, "logps/chosen": -81.57280731201172, "logps/rejected": -136.4521942138672, "loss": 0.4934, "rewards/accuracies": 1.0, "rewards/chosen": -1.7194339036941528, "rewards/margins": 1.7809566259384155, "rewards/rejected": -3.5003902912139893, "step": 2259 }, { "epoch": 0.3774403470715835, "grad_norm": 43.807491302490234, "learning_rate": 1.6225596529284168e-05, "logits/chosen": -0.920520007610321, "logits/rejected": -0.8748698234558105, "logps/chosen": -120.1619873046875, "logps/rejected": -78.48072814941406, "loss": 0.9391, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0828592777252197, "rewards/margins": -0.062138717621564865, "rewards/rejected": -2.0207207202911377, "step": 2262 }, { "epoch": 0.3779409310862673, "grad_norm": 19.41044044494629, "learning_rate": 1.6220590689137327e-05, "logits/chosen": -0.8439032435417175, "logits/rejected": -0.9121478199958801, "logps/chosen": -95.51850128173828, "logps/rejected": -127.97274780273438, "loss": 0.5046, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.847339153289795, "rewards/margins": 1.0569995641708374, "rewards/rejected": -2.9043385982513428, "step": 2265 }, { "epoch": 0.3784415151009511, "grad_norm": 34.96827697753906, "learning_rate": 1.6215584848990492e-05, "logits/chosen": -0.7963159680366516, "logits/rejected": -0.7597125172615051, "logps/chosen": -88.3986587524414, "logps/rejected": -126.42499542236328, "loss": 0.9156, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6494574546813965, "rewards/margins": 1.322034478187561, "rewards/rejected": -1.971491813659668, "step": 2268 }, { "epoch": 0.3789420991156349, "grad_norm": 24.030729293823242, "learning_rate": 1.6210579008843654e-05, "logits/chosen": -0.7124750018119812, "logits/rejected": -0.7353551387786865, "logps/chosen": -79.39144134521484, "logps/rejected": -104.13739776611328, "loss": 0.5002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7402891516685486, "rewards/margins": 2.00335431098938, "rewards/rejected": -2.743643045425415, "step": 2271 }, { "epoch": 0.3794426831303187, "grad_norm": 16.2674617767334, "learning_rate": 1.6205573168696812e-05, "logits/chosen": -0.6893038749694824, "logits/rejected": -0.7213742136955261, "logps/chosen": -63.99765396118164, "logps/rejected": -117.95105743408203, "loss": 0.2748, "rewards/accuracies": 1.0, "rewards/chosen": -0.3687860071659088, "rewards/margins": 3.0160207748413086, "rewards/rejected": -3.3848066329956055, "step": 2274 }, { "epoch": 0.3799432671450025, "grad_norm": 26.85379409790039, "learning_rate": 1.6200567328549977e-05, "logits/chosen": -0.9807183146476746, "logits/rejected": -0.946748673915863, "logps/chosen": -65.78633880615234, "logps/rejected": -69.76624298095703, "loss": 0.5357, "rewards/accuracies": 1.0, "rewards/chosen": -0.393475204706192, "rewards/margins": 2.6510026454925537, "rewards/rejected": -3.044477701187134, "step": 2277 }, { "epoch": 0.3804438511596863, "grad_norm": 14.67674732208252, "learning_rate": 1.619556148840314e-05, "logits/chosen": -0.9863710403442383, "logits/rejected": -0.9676780104637146, "logps/chosen": -83.86272430419922, "logps/rejected": -89.7818832397461, "loss": 0.2332, "rewards/accuracies": 1.0, "rewards/chosen": -0.2574293315410614, "rewards/margins": 2.227043628692627, "rewards/rejected": -2.4844729900360107, "step": 2280 }, { "epoch": 0.3809444351743701, "grad_norm": 28.04673957824707, "learning_rate": 1.61905556482563e-05, "logits/chosen": -0.8798709511756897, "logits/rejected": -0.8524606227874756, "logps/chosen": -131.53782653808594, "logps/rejected": -77.93405151367188, "loss": 0.4809, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3650627136230469, "rewards/margins": -0.22442297637462616, "rewards/rejected": -1.1406397819519043, "step": 2283 }, { "epoch": 0.3814450191890539, "grad_norm": 25.275287628173828, "learning_rate": 1.6185549808109463e-05, "logits/chosen": -0.9346728324890137, "logits/rejected": -0.9179765582084656, "logps/chosen": -124.71666717529297, "logps/rejected": -128.88531494140625, "loss": 0.7615, "rewards/accuracies": 1.0, "rewards/chosen": -0.6535224914550781, "rewards/margins": 1.7309184074401855, "rewards/rejected": -2.3844406604766846, "step": 2286 }, { "epoch": 0.3819456032037377, "grad_norm": 29.113187789916992, "learning_rate": 1.6180543967962624e-05, "logits/chosen": -0.8680453896522522, "logits/rejected": -0.8083016276359558, "logps/chosen": -102.1539077758789, "logps/rejected": -52.51600646972656, "loss": 0.7301, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8823110461235046, "rewards/margins": -0.13747958838939667, "rewards/rejected": -0.7448315024375916, "step": 2289 }, { "epoch": 0.3824461872184215, "grad_norm": 29.677730560302734, "learning_rate": 1.6175538127815786e-05, "logits/chosen": -0.7594053745269775, "logits/rejected": -0.6995253562927246, "logps/chosen": -98.4602279663086, "logps/rejected": -86.49285888671875, "loss": 0.5753, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9805116653442383, "rewards/margins": 1.0335360765457153, "rewards/rejected": -2.014047861099243, "step": 2292 }, { "epoch": 0.3829467712331053, "grad_norm": 32.3576545715332, "learning_rate": 1.6170532287668948e-05, "logits/chosen": -0.9695053100585938, "logits/rejected": -0.9699729084968567, "logps/chosen": -107.31464385986328, "logps/rejected": -146.34996032714844, "loss": 0.4164, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8804829716682434, "rewards/margins": -0.00847144890576601, "rewards/rejected": -0.8720116019248962, "step": 2295 }, { "epoch": 0.3834473552477891, "grad_norm": 3.0264194011688232, "learning_rate": 1.616552644752211e-05, "logits/chosen": -0.858531653881073, "logits/rejected": -0.8837566375732422, "logps/chosen": -103.78356170654297, "logps/rejected": -117.90555572509766, "loss": 0.1187, "rewards/accuracies": 1.0, "rewards/chosen": -0.13669587671756744, "rewards/margins": 2.8266372680664062, "rewards/rejected": -2.9633331298828125, "step": 2298 }, { "epoch": 0.3839479392624729, "grad_norm": 35.388916015625, "learning_rate": 1.616052060737527e-05, "logits/chosen": -0.9464123845100403, "logits/rejected": -0.9314901232719421, "logps/chosen": -80.11772918701172, "logps/rejected": -56.10218811035156, "loss": 1.1786, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5230838060379028, "rewards/margins": -0.3791567385196686, "rewards/rejected": -1.1439270973205566, "step": 2301 }, { "epoch": 0.3844485232771567, "grad_norm": 8.771753311157227, "learning_rate": 1.6155514767228437e-05, "logits/chosen": -0.8980367183685303, "logits/rejected": -0.8873017430305481, "logps/chosen": -74.67292022705078, "logps/rejected": -104.20220184326172, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": -0.07768527418375015, "rewards/margins": 3.5667455196380615, "rewards/rejected": -3.6444308757781982, "step": 2304 }, { "epoch": 0.38494910729184045, "grad_norm": 14.480551719665527, "learning_rate": 1.6150508927081595e-05, "logits/chosen": -0.9492427706718445, "logits/rejected": -0.9342164993286133, "logps/chosen": -63.67842483520508, "logps/rejected": -101.3064956665039, "loss": 0.6439, "rewards/accuracies": 1.0, "rewards/chosen": -0.01306694746017456, "rewards/margins": 2.880176305770874, "rewards/rejected": -2.8932430744171143, "step": 2307 }, { "epoch": 0.3854496913065243, "grad_norm": 16.240253448486328, "learning_rate": 1.614550308693476e-05, "logits/chosen": -0.7907989025115967, "logits/rejected": -0.7511375546455383, "logps/chosen": -94.4263916015625, "logps/rejected": -74.24269104003906, "loss": 0.4872, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2826067209243774, "rewards/margins": 0.4983506202697754, "rewards/rejected": -1.7809573411941528, "step": 2310 }, { "epoch": 0.38595027532120807, "grad_norm": 29.932113647460938, "learning_rate": 1.6140497246787922e-05, "logits/chosen": -0.9051384925842285, "logits/rejected": -0.9428765177726746, "logps/chosen": -59.95785140991211, "logps/rejected": -153.6707305908203, "loss": 0.7265, "rewards/accuracies": 1.0, "rewards/chosen": 0.2745637595653534, "rewards/margins": 2.693690061569214, "rewards/rejected": -2.419126272201538, "step": 2313 }, { "epoch": 0.38645085933589185, "grad_norm": 18.51011085510254, "learning_rate": 1.613549140664108e-05, "logits/chosen": -0.8567144870758057, "logits/rejected": -0.849149763584137, "logps/chosen": -67.7549819946289, "logps/rejected": -85.29119110107422, "loss": 0.3597, "rewards/accuracies": 1.0, "rewards/chosen": -0.5167239308357239, "rewards/margins": 1.9021700620651245, "rewards/rejected": -2.418894052505493, "step": 2316 }, { "epoch": 0.3869514433505757, "grad_norm": 18.050792694091797, "learning_rate": 1.6130485566494246e-05, "logits/chosen": -0.8928427696228027, "logits/rejected": -0.954613983631134, "logps/chosen": -64.25492095947266, "logps/rejected": -97.70122528076172, "loss": 0.6147, "rewards/accuracies": 1.0, "rewards/chosen": -0.5719193816184998, "rewards/margins": 1.3876272439956665, "rewards/rejected": -1.959546685218811, "step": 2319 }, { "epoch": 0.38745202736525947, "grad_norm": 21.707530975341797, "learning_rate": 1.6125479726347408e-05, "logits/chosen": -0.8869357705116272, "logits/rejected": -0.8691923022270203, "logps/chosen": -96.8823013305664, "logps/rejected": -92.46541595458984, "loss": 0.9054, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7444486021995544, "rewards/margins": 1.133705973625183, "rewards/rejected": -1.8781546354293823, "step": 2322 }, { "epoch": 0.38795261137994325, "grad_norm": 5.52480411529541, "learning_rate": 1.612047388620057e-05, "logits/chosen": -0.8926132321357727, "logits/rejected": -0.9264482855796814, "logps/chosen": -81.1999282836914, "logps/rejected": -142.36441040039062, "loss": 0.3729, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.03353893756866455, "rewards/margins": 1.8314628601074219, "rewards/rejected": -1.7979240417480469, "step": 2325 }, { "epoch": 0.3884531953946271, "grad_norm": 37.36903762817383, "learning_rate": 1.611546804605373e-05, "logits/chosen": -1.0036386251449585, "logits/rejected": -0.9352741837501526, "logps/chosen": -89.01490020751953, "logps/rejected": -68.15046691894531, "loss": 0.6273, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5421218276023865, "rewards/margins": 1.7288841009140015, "rewards/rejected": -2.271005868911743, "step": 2328 }, { "epoch": 0.38895377940931086, "grad_norm": 11.003807067871094, "learning_rate": 1.6110462205906893e-05, "logits/chosen": -0.8634300827980042, "logits/rejected": -0.9210906028747559, "logps/chosen": -46.81085205078125, "logps/rejected": -144.45960998535156, "loss": 0.2437, "rewards/accuracies": 1.0, "rewards/chosen": -0.12815900146961212, "rewards/margins": 2.3313100337982178, "rewards/rejected": -2.4594688415527344, "step": 2331 }, { "epoch": 0.38945436342399464, "grad_norm": 19.97978401184082, "learning_rate": 1.6105456365760055e-05, "logits/chosen": -0.9961013197898865, "logits/rejected": -0.9794971346855164, "logps/chosen": -89.9499282836914, "logps/rejected": -104.36124420166016, "loss": 0.5275, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6007186770439148, "rewards/margins": 1.5024715662002563, "rewards/rejected": -2.1031901836395264, "step": 2334 }, { "epoch": 0.3899549474386785, "grad_norm": 38.58561325073242, "learning_rate": 1.6100450525613217e-05, "logits/chosen": -1.13329017162323, "logits/rejected": -1.1127456426620483, "logps/chosen": -115.97308349609375, "logps/rejected": -118.71192169189453, "loss": 0.1626, "rewards/accuracies": 1.0, "rewards/chosen": -0.16123580932617188, "rewards/margins": 4.106659412384033, "rewards/rejected": -4.267895221710205, "step": 2337 }, { "epoch": 0.39045553145336226, "grad_norm": 10.627931594848633, "learning_rate": 1.609544468546638e-05, "logits/chosen": -0.9185426831245422, "logits/rejected": -0.9241724014282227, "logps/chosen": -110.9693603515625, "logps/rejected": -116.07061004638672, "loss": 0.8236, "rewards/accuracies": 1.0, "rewards/chosen": -0.9404798150062561, "rewards/margins": 0.6936755180358887, "rewards/rejected": -1.6341552734375, "step": 2340 }, { "epoch": 0.39095611546804604, "grad_norm": 32.02168273925781, "learning_rate": 1.609043884531954e-05, "logits/chosen": -0.8076589107513428, "logits/rejected": -0.8537599444389343, "logps/chosen": -80.46737670898438, "logps/rejected": -63.29072189331055, "loss": 0.781, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.21301740407943726, "rewards/margins": -0.07159817963838577, "rewards/rejected": -0.14141923189163208, "step": 2343 }, { "epoch": 0.3914566994827299, "grad_norm": 17.498146057128906, "learning_rate": 1.6085433005172705e-05, "logits/chosen": -1.0767146348953247, "logits/rejected": -1.0814415216445923, "logps/chosen": -72.08026885986328, "logps/rejected": -134.97698974609375, "loss": 0.4197, "rewards/accuracies": 1.0, "rewards/chosen": 0.3816610276699066, "rewards/margins": 4.773003101348877, "rewards/rejected": -4.3913421630859375, "step": 2346 }, { "epoch": 0.39195728349741366, "grad_norm": 7.201789855957031, "learning_rate": 1.6080427165025864e-05, "logits/chosen": -0.8060832619667053, "logits/rejected": -0.8766726851463318, "logps/chosen": -55.71575927734375, "logps/rejected": -179.8358917236328, "loss": 0.5686, "rewards/accuracies": 1.0, "rewards/chosen": 0.37292060256004333, "rewards/margins": 2.6971356868743896, "rewards/rejected": -2.3242149353027344, "step": 2349 }, { "epoch": 0.39245786751209744, "grad_norm": 14.335562705993652, "learning_rate": 1.6075421324879026e-05, "logits/chosen": -0.9569587111473083, "logits/rejected": -0.945416271686554, "logps/chosen": -103.40193939208984, "logps/rejected": -106.56793212890625, "loss": 0.3755, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.09506423026323318, "rewards/margins": 1.6289724111557007, "rewards/rejected": -1.5339080095291138, "step": 2352 }, { "epoch": 0.39295845152678127, "grad_norm": 20.901914596557617, "learning_rate": 1.607041548473219e-05, "logits/chosen": -1.012883186340332, "logits/rejected": -1.0364574193954468, "logps/chosen": -114.77276611328125, "logps/rejected": -141.2860565185547, "loss": 0.5954, "rewards/accuracies": 1.0, "rewards/chosen": -0.77560955286026, "rewards/margins": 0.8531992435455322, "rewards/rejected": -1.628808856010437, "step": 2355 }, { "epoch": 0.39345903554146505, "grad_norm": 22.259910583496094, "learning_rate": 1.606540964458535e-05, "logits/chosen": -0.7772979736328125, "logits/rejected": -0.8203818202018738, "logps/chosen": -83.0315170288086, "logps/rejected": -139.5029296875, "loss": 0.4132, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11161074042320251, "rewards/margins": 1.5583323240280151, "rewards/rejected": -1.4467216730117798, "step": 2358 }, { "epoch": 0.39395961955614883, "grad_norm": 35.568565368652344, "learning_rate": 1.6060403804438514e-05, "logits/chosen": -0.8894297480583191, "logits/rejected": -0.88352370262146, "logps/chosen": -52.08034896850586, "logps/rejected": -97.8401870727539, "loss": 0.7695, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.05203397944569588, "rewards/margins": 3.1445388793945312, "rewards/rejected": -3.0925045013427734, "step": 2361 }, { "epoch": 0.3944602035708326, "grad_norm": 18.923185348510742, "learning_rate": 1.6055397964291676e-05, "logits/chosen": -0.9565348029136658, "logits/rejected": -0.9696239829063416, "logps/chosen": -66.02782440185547, "logps/rejected": -98.9853744506836, "loss": 0.8217, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8492022156715393, "rewards/margins": 0.5466365814208984, "rewards/rejected": -1.395838737487793, "step": 2364 }, { "epoch": 0.39496078758551645, "grad_norm": 21.005170822143555, "learning_rate": 1.6050392124144838e-05, "logits/chosen": -0.7097098231315613, "logits/rejected": -0.7384300231933594, "logps/chosen": -86.81897735595703, "logps/rejected": -92.37397003173828, "loss": 0.5868, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.15047593414783478, "rewards/margins": 0.6331038475036621, "rewards/rejected": -0.48262789845466614, "step": 2367 }, { "epoch": 0.39546137160020023, "grad_norm": 57.268917083740234, "learning_rate": 1.6045386283998e-05, "logits/chosen": -0.9029326438903809, "logits/rejected": -0.9524850249290466, "logps/chosen": -81.23889923095703, "logps/rejected": -121.82942962646484, "loss": 0.8018, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.8028087615966797, "rewards/margins": -1.4215985536575317, "rewards/rejected": -1.381210207939148, "step": 2370 }, { "epoch": 0.395961955614884, "grad_norm": 7.318064212799072, "learning_rate": 1.604038044385116e-05, "logits/chosen": -1.0223146677017212, "logits/rejected": -1.0316392183303833, "logps/chosen": -39.39822769165039, "logps/rejected": -75.5404281616211, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": -0.4883086681365967, "rewards/margins": 0.7237846851348877, "rewards/rejected": -1.2120933532714844, "step": 2373 }, { "epoch": 0.39646253962956785, "grad_norm": 19.23173713684082, "learning_rate": 1.6035374603704323e-05, "logits/chosen": -0.8904151916503906, "logits/rejected": -0.9141344428062439, "logps/chosen": -58.766353607177734, "logps/rejected": -107.97498321533203, "loss": 0.7117, "rewards/accuracies": 1.0, "rewards/chosen": 0.03956069424748421, "rewards/margins": 2.4346423149108887, "rewards/rejected": -2.3950815200805664, "step": 2376 }, { "epoch": 0.3969631236442516, "grad_norm": 6.346599102020264, "learning_rate": 1.6030368763557485e-05, "logits/chosen": -1.002332329750061, "logits/rejected": -0.9914229512214661, "logps/chosen": -102.1301498413086, "logps/rejected": -89.3426513671875, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": -0.5099052786827087, "rewards/margins": 1.16130530834198, "rewards/rejected": -1.6712106466293335, "step": 2379 }, { "epoch": 0.3974637076589354, "grad_norm": 38.05545425415039, "learning_rate": 1.6025362923410647e-05, "logits/chosen": -0.9416358470916748, "logits/rejected": -0.9547324180603027, "logps/chosen": -93.06554412841797, "logps/rejected": -93.63714599609375, "loss": 0.6776, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9911460280418396, "rewards/margins": 0.5340887904167175, "rewards/rejected": -1.5252348184585571, "step": 2382 }, { "epoch": 0.39796429167361924, "grad_norm": 20.998512268066406, "learning_rate": 1.602035708326381e-05, "logits/chosen": -0.8736494183540344, "logits/rejected": -0.8885116577148438, "logps/chosen": -98.46875762939453, "logps/rejected": -96.64083862304688, "loss": 0.6291, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3092361688613892, "rewards/margins": 1.0822032690048218, "rewards/rejected": -2.391439437866211, "step": 2385 }, { "epoch": 0.398464875688303, "grad_norm": 22.99417495727539, "learning_rate": 1.6015351243116974e-05, "logits/chosen": -0.6994736194610596, "logits/rejected": -0.7304642796516418, "logps/chosen": -103.42587280273438, "logps/rejected": -135.5136260986328, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": -0.8132964968681335, "rewards/margins": 0.5707325339317322, "rewards/rejected": -1.3840290307998657, "step": 2388 }, { "epoch": 0.3989654597029868, "grad_norm": 11.226778030395508, "learning_rate": 1.6010345402970132e-05, "logits/chosen": -0.8278894424438477, "logits/rejected": -0.8179739117622375, "logps/chosen": -44.607818603515625, "logps/rejected": -87.3814468383789, "loss": 0.3407, "rewards/accuracies": 1.0, "rewards/chosen": 0.524578869342804, "rewards/margins": 1.8901195526123047, "rewards/rejected": -1.3655405044555664, "step": 2391 }, { "epoch": 0.39946604371767064, "grad_norm": 14.198925018310547, "learning_rate": 1.6005339562823294e-05, "logits/chosen": -1.065095067024231, "logits/rejected": -1.0807393789291382, "logps/chosen": -113.5374984741211, "logps/rejected": -144.99658203125, "loss": 0.9773, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.8165056705474854, "rewards/margins": -0.21290898323059082, "rewards/rejected": -2.6035966873168945, "step": 2394 }, { "epoch": 0.3999666277323544, "grad_norm": 33.85491180419922, "learning_rate": 1.600033372267646e-05, "logits/chosen": -0.9181995987892151, "logits/rejected": -0.9315548539161682, "logps/chosen": -57.5315055847168, "logps/rejected": -129.68077087402344, "loss": 0.3532, "rewards/accuracies": 1.0, "rewards/chosen": 0.22404785454273224, "rewards/margins": 6.054731845855713, "rewards/rejected": -5.830684661865234, "step": 2397 }, { "epoch": 0.4004672117470382, "grad_norm": 28.619762420654297, "learning_rate": 1.5995327882529618e-05, "logits/chosen": -0.8107498288154602, "logits/rejected": -0.8710036277770996, "logps/chosen": -54.26198959350586, "logps/rejected": -127.57638549804688, "loss": 0.4961, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1491549164056778, "rewards/margins": 2.5944952964782715, "rewards/rejected": -2.743650197982788, "step": 2400 }, { "epoch": 0.4004672117470382, "eval_logits/chosen": -0.9022203087806702, "eval_logits/rejected": -0.9140791893005371, "eval_logps/chosen": -88.33490753173828, "eval_logps/rejected": -114.15959167480469, "eval_loss": 0.6084445118904114, "eval_rewards/accuracies": 0.7192192077636719, "eval_rewards/chosen": -0.8077185750007629, "eval_rewards/margins": 1.2954226732254028, "eval_rewards/rejected": -2.1031410694122314, "eval_runtime": 346.6642, "eval_samples_per_second": 7.685, "eval_steps_per_second": 1.921, "step": 2400 }, { "epoch": 0.40096779576172203, "grad_norm": 34.48155975341797, "learning_rate": 1.5990322042382783e-05, "logits/chosen": -0.9636203646659851, "logits/rejected": -0.9693118929862976, "logps/chosen": -101.74222564697266, "logps/rejected": -92.20540618896484, "loss": 1.0649, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.7921026945114136, "rewards/margins": -0.5939450263977051, "rewards/rejected": -1.1981576681137085, "step": 2403 }, { "epoch": 0.4014683797764058, "grad_norm": 7.6715826988220215, "learning_rate": 1.5985316202235945e-05, "logits/chosen": -0.7863264679908752, "logits/rejected": -0.7565281391143799, "logps/chosen": -151.65074157714844, "logps/rejected": -161.01284790039062, "loss": 0.6802, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2492262125015259, "rewards/margins": 1.2883329391479492, "rewards/rejected": -2.5375592708587646, "step": 2406 }, { "epoch": 0.4019689637910896, "grad_norm": 27.885587692260742, "learning_rate": 1.5980310362089103e-05, "logits/chosen": -0.7726184725761414, "logits/rejected": -0.7420617938041687, "logps/chosen": -82.61673736572266, "logps/rejected": -50.00605392456055, "loss": 0.9332, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.9181932210922241, "rewards/margins": -0.15239493548870087, "rewards/rejected": -1.7657982110977173, "step": 2409 }, { "epoch": 0.4024695478057734, "grad_norm": 13.769123077392578, "learning_rate": 1.5975304521942268e-05, "logits/chosen": -0.877970278263092, "logits/rejected": -0.8839214444160461, "logps/chosen": -104.8001708984375, "logps/rejected": -179.463134765625, "loss": 0.3341, "rewards/accuracies": 1.0, "rewards/chosen": 0.22617049515247345, "rewards/margins": 2.831695556640625, "rewards/rejected": -2.605525016784668, "step": 2412 }, { "epoch": 0.4029701318204572, "grad_norm": 20.460203170776367, "learning_rate": 1.597029868179543e-05, "logits/chosen": -0.8149098753929138, "logits/rejected": -0.7946133613586426, "logps/chosen": -101.0431900024414, "logps/rejected": -143.33413696289062, "loss": 0.3643, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5844844579696655, "rewards/margins": 1.811120629310608, "rewards/rejected": -3.3956050872802734, "step": 2415 }, { "epoch": 0.403470715835141, "grad_norm": 35.96871566772461, "learning_rate": 1.5965292841648592e-05, "logits/chosen": -0.965871274471283, "logits/rejected": -0.9707831740379333, "logps/chosen": -122.81179809570312, "logps/rejected": -115.99273681640625, "loss": 0.7058, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.9645849466323853, "rewards/margins": -0.7991015315055847, "rewards/rejected": -1.1654834747314453, "step": 2418 }, { "epoch": 0.40397129984982477, "grad_norm": 41.651065826416016, "learning_rate": 1.5960287001501754e-05, "logits/chosen": -0.9447624087333679, "logits/rejected": -0.9515500664710999, "logps/chosen": -90.63079071044922, "logps/rejected": -116.22037506103516, "loss": 0.6157, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.383753776550293, "rewards/margins": 0.7675432562828064, "rewards/rejected": -2.151296854019165, "step": 2421 }, { "epoch": 0.4044718838645086, "grad_norm": 7.199325084686279, "learning_rate": 1.5955281161354915e-05, "logits/chosen": -0.8867314457893372, "logits/rejected": -0.8780179023742676, "logps/chosen": -66.54988861083984, "logps/rejected": -93.18121337890625, "loss": 0.2038, "rewards/accuracies": 1.0, "rewards/chosen": -0.03358091786503792, "rewards/margins": 1.5329079627990723, "rewards/rejected": -1.5664888620376587, "step": 2424 }, { "epoch": 0.4049724678791924, "grad_norm": 41.7384033203125, "learning_rate": 1.5950275321208077e-05, "logits/chosen": -0.8750856518745422, "logits/rejected": -0.8682069778442383, "logps/chosen": -70.197998046875, "logps/rejected": -82.86649322509766, "loss": 0.8698, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.301797479391098, "rewards/margins": -0.3857386112213135, "rewards/rejected": 0.08394112437963486, "step": 2427 }, { "epoch": 0.40547305189387617, "grad_norm": 39.48567199707031, "learning_rate": 1.594526948106124e-05, "logits/chosen": -1.0078703165054321, "logits/rejected": -0.9764707088470459, "logps/chosen": -112.8956069946289, "logps/rejected": -115.65755462646484, "loss": 1.0215, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8171034455299377, "rewards/margins": -0.26116427779197693, "rewards/rejected": -0.5559391975402832, "step": 2430 }, { "epoch": 0.40597363590856, "grad_norm": 5.659927845001221, "learning_rate": 1.59402636409144e-05, "logits/chosen": -1.0147759914398193, "logits/rejected": -1.015501618385315, "logps/chosen": -97.79791259765625, "logps/rejected": -112.01436614990234, "loss": 0.5452, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.210279703140259, "rewards/margins": 1.625813603401184, "rewards/rejected": -3.8360931873321533, "step": 2433 }, { "epoch": 0.4064742199232438, "grad_norm": 7.46884822845459, "learning_rate": 1.5935257800767563e-05, "logits/chosen": -0.8980162739753723, "logits/rejected": -0.9641782641410828, "logps/chosen": -66.35132598876953, "logps/rejected": -134.6433868408203, "loss": 0.2633, "rewards/accuracies": 1.0, "rewards/chosen": -1.0923300981521606, "rewards/margins": 1.854963779449463, "rewards/rejected": -2.947293996810913, "step": 2436 }, { "epoch": 0.40697480393792757, "grad_norm": 28.936113357543945, "learning_rate": 1.5930251960620728e-05, "logits/chosen": -0.8991326689720154, "logits/rejected": -0.9368875622749329, "logps/chosen": -71.78022003173828, "logps/rejected": -140.17138671875, "loss": 0.614, "rewards/accuracies": 1.0, "rewards/chosen": -0.576801598072052, "rewards/margins": 3.8592395782470703, "rewards/rejected": -4.436041355133057, "step": 2439 }, { "epoch": 0.4074753879526114, "grad_norm": 29.083988189697266, "learning_rate": 1.5925246120473886e-05, "logits/chosen": -0.9111723303794861, "logits/rejected": -0.8830860257148743, "logps/chosen": -86.5368881225586, "logps/rejected": -89.81732177734375, "loss": 0.8552, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.03197314217686653, "rewards/margins": 1.5555377006530762, "rewards/rejected": -1.5235646963119507, "step": 2442 }, { "epoch": 0.4079759719672952, "grad_norm": 17.231481552124023, "learning_rate": 1.592024028032705e-05, "logits/chosen": -0.9751819968223572, "logits/rejected": -0.9254141449928284, "logps/chosen": -100.02886962890625, "logps/rejected": -69.00000762939453, "loss": 0.6133, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0572781562805176, "rewards/margins": -0.5884373784065247, "rewards/rejected": -0.4688408672809601, "step": 2445 }, { "epoch": 0.40847655598197896, "grad_norm": 30.41059112548828, "learning_rate": 1.5915234440180213e-05, "logits/chosen": -0.8128368854522705, "logits/rejected": -0.8496702313423157, "logps/chosen": -81.60843658447266, "logps/rejected": -122.4454116821289, "loss": 0.7873, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.883151650428772, "rewards/margins": -0.7455920577049255, "rewards/rejected": -1.1375595331192017, "step": 2448 }, { "epoch": 0.4089771399966628, "grad_norm": 32.60798645019531, "learning_rate": 1.591022860003337e-05, "logits/chosen": -0.8137383460998535, "logits/rejected": -0.8490200638771057, "logps/chosen": -120.4005126953125, "logps/rejected": -165.193603515625, "loss": 0.6838, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1585876941680908, "rewards/margins": 0.5389354228973389, "rewards/rejected": -1.6975231170654297, "step": 2451 }, { "epoch": 0.4094777240113466, "grad_norm": 23.40165901184082, "learning_rate": 1.5905222759886537e-05, "logits/chosen": -0.829035758972168, "logits/rejected": -0.8430650234222412, "logps/chosen": -136.9753875732422, "logps/rejected": -99.29703521728516, "loss": 0.6101, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6469959020614624, "rewards/margins": -0.34127259254455566, "rewards/rejected": -1.3057231903076172, "step": 2454 }, { "epoch": 0.40997830802603036, "grad_norm": 35.41733169555664, "learning_rate": 1.59002169197397e-05, "logits/chosen": -0.8237959742546082, "logits/rejected": -0.9021279215812683, "logps/chosen": -70.0140609741211, "logps/rejected": -210.04148864746094, "loss": 0.5933, "rewards/accuracies": 1.0, "rewards/chosen": -0.7382137775421143, "rewards/margins": 1.1235990524291992, "rewards/rejected": -1.861812710762024, "step": 2457 }, { "epoch": 0.4104788920407142, "grad_norm": 19.049205780029297, "learning_rate": 1.589521107959286e-05, "logits/chosen": -0.9459130167961121, "logits/rejected": -0.8750713467597961, "logps/chosen": -119.64310455322266, "logps/rejected": -94.90081787109375, "loss": 0.5337, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6324053406715393, "rewards/margins": 0.6133268475532532, "rewards/rejected": -1.2457321882247925, "step": 2460 }, { "epoch": 0.410979476055398, "grad_norm": 11.744474411010742, "learning_rate": 1.5890205239446022e-05, "logits/chosen": -0.8832728266716003, "logits/rejected": -0.9159252047538757, "logps/chosen": -56.827606201171875, "logps/rejected": -89.59341430664062, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 0.9062042236328125, "rewards/margins": 2.660551071166992, "rewards/rejected": -1.7543468475341797, "step": 2463 }, { "epoch": 0.41148006007008175, "grad_norm": 21.763813018798828, "learning_rate": 1.5885199399299184e-05, "logits/chosen": -0.8445947766304016, "logits/rejected": -0.7541365027427673, "logps/chosen": -131.2175750732422, "logps/rejected": -70.6383285522461, "loss": 0.4861, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4694618284702301, "rewards/margins": 0.2898547947406769, "rewards/rejected": 0.17960698902606964, "step": 2466 }, { "epoch": 0.41198064408476553, "grad_norm": 33.35871124267578, "learning_rate": 1.5880193559152346e-05, "logits/chosen": -0.9532966017723083, "logits/rejected": -0.9846823215484619, "logps/chosen": -42.8210334777832, "logps/rejected": -136.21641540527344, "loss": 0.6658, "rewards/accuracies": 1.0, "rewards/chosen": 0.6724328398704529, "rewards/margins": 2.460055112838745, "rewards/rejected": -1.787622094154358, "step": 2469 }, { "epoch": 0.41248122809944937, "grad_norm": 27.213085174560547, "learning_rate": 1.5875187719005507e-05, "logits/chosen": -0.8647055625915527, "logits/rejected": -0.9344687461853027, "logps/chosen": -98.37395477294922, "logps/rejected": -110.64124298095703, "loss": 0.7927, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8780470490455627, "rewards/margins": -0.897189199924469, "rewards/rejected": 0.01914215087890625, "step": 2472 }, { "epoch": 0.41298181211413315, "grad_norm": 16.303138732910156, "learning_rate": 1.587018187885867e-05, "logits/chosen": -0.6529162526130676, "logits/rejected": -0.7108669281005859, "logps/chosen": -74.11264038085938, "logps/rejected": -136.37362670898438, "loss": 0.6897, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.10415369272232056, "rewards/margins": 0.509943425655365, "rewards/rejected": -0.40578970313072205, "step": 2475 }, { "epoch": 0.41348239612881693, "grad_norm": 14.990106582641602, "learning_rate": 1.586517603871183e-05, "logits/chosen": -0.8598313331604004, "logits/rejected": -0.8198433518409729, "logps/chosen": -140.85987854003906, "logps/rejected": -147.47064208984375, "loss": 0.5959, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2159166932106018, "rewards/margins": 0.30132296681404114, "rewards/rejected": -0.5172396302223206, "step": 2478 }, { "epoch": 0.41398298014350077, "grad_norm": 19.545345306396484, "learning_rate": 1.5860170198564996e-05, "logits/chosen": -0.9971917271614075, "logits/rejected": -0.9753754734992981, "logps/chosen": -125.51639556884766, "logps/rejected": -99.38512420654297, "loss": 0.8043, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3584631383419037, "rewards/margins": 0.23603226244449615, "rewards/rejected": -0.5944953560829163, "step": 2481 }, { "epoch": 0.41448356415818455, "grad_norm": 22.509023666381836, "learning_rate": 1.5855164358418155e-05, "logits/chosen": -0.713033139705658, "logits/rejected": -0.6799766421318054, "logps/chosen": -105.04756927490234, "logps/rejected": -84.25860595703125, "loss": 0.4819, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.16569609940052032, "rewards/margins": 1.3027855157852173, "rewards/rejected": -1.1370893716812134, "step": 2484 }, { "epoch": 0.41498414817286833, "grad_norm": 34.130985260009766, "learning_rate": 1.5850158518271316e-05, "logits/chosen": -0.9407970309257507, "logits/rejected": -0.96746826171875, "logps/chosen": -85.97101593017578, "logps/rejected": -129.1863250732422, "loss": 0.7155, "rewards/accuracies": 1.0, "rewards/chosen": -0.10135575383901596, "rewards/margins": 4.1105170249938965, "rewards/rejected": -4.2118730545043945, "step": 2487 }, { "epoch": 0.41548473218755216, "grad_norm": 22.583168029785156, "learning_rate": 1.584515267812448e-05, "logits/chosen": -0.9717308878898621, "logits/rejected": -0.9243203997612, "logps/chosen": -67.22967529296875, "logps/rejected": -68.7101821899414, "loss": 0.676, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.012182255275547504, "rewards/margins": 1.4525762796401978, "rewards/rejected": -1.4403940439224243, "step": 2490 }, { "epoch": 0.41598531620223594, "grad_norm": 19.1274471282959, "learning_rate": 1.584014683797764e-05, "logits/chosen": -0.6752076148986816, "logits/rejected": -0.7207834720611572, "logps/chosen": -98.84893798828125, "logps/rejected": -101.09661102294922, "loss": 0.4936, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.40846049785614014, "rewards/margins": 0.006625274661928415, "rewards/rejected": -0.4150857925415039, "step": 2493 }, { "epoch": 0.4164859002169197, "grad_norm": 17.941730499267578, "learning_rate": 1.5835140997830805e-05, "logits/chosen": -0.9266887307167053, "logits/rejected": -1.01303231716156, "logps/chosen": -61.1004524230957, "logps/rejected": -140.20399475097656, "loss": 0.6102, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2651865780353546, "rewards/margins": 1.047357201576233, "rewards/rejected": -0.7821707129478455, "step": 2496 }, { "epoch": 0.41698648423160356, "grad_norm": 27.370187759399414, "learning_rate": 1.5830135157683967e-05, "logits/chosen": -0.6649432182312012, "logits/rejected": -0.623727560043335, "logps/chosen": -112.99514770507812, "logps/rejected": -107.63333892822266, "loss": 0.8216, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.4802247285842896, "rewards/margins": -0.4840981960296631, "rewards/rejected": -0.9961265921592712, "step": 2499 }, { "epoch": 0.41748706824628734, "grad_norm": 16.453088760375977, "learning_rate": 1.582512931753713e-05, "logits/chosen": -0.8359879851341248, "logits/rejected": -0.8191804885864258, "logps/chosen": -76.99199676513672, "logps/rejected": -72.63713836669922, "loss": 0.6666, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2127081155776978, "rewards/margins": 0.2529841661453247, "rewards/rejected": -1.465692162513733, "step": 2502 }, { "epoch": 0.4179876522609711, "grad_norm": 24.714372634887695, "learning_rate": 1.582012347739029e-05, "logits/chosen": -0.9221298098564148, "logits/rejected": -0.9612224698066711, "logps/chosen": -46.23458480834961, "logps/rejected": -97.42171478271484, "loss": 0.5252, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.06285353749990463, "rewards/margins": 1.1192950010299683, "rewards/rejected": -1.1821485757827759, "step": 2505 }, { "epoch": 0.41848823627565496, "grad_norm": 14.969747543334961, "learning_rate": 1.5815117637243452e-05, "logits/chosen": -0.9372038245201111, "logits/rejected": -0.937982976436615, "logps/chosen": -111.27742767333984, "logps/rejected": -115.11966705322266, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 0.059073638170957565, "rewards/margins": 2.1457135677337646, "rewards/rejected": -2.086639881134033, "step": 2508 }, { "epoch": 0.41898882029033874, "grad_norm": 10.557933807373047, "learning_rate": 1.5810111797096614e-05, "logits/chosen": -0.8560435771942139, "logits/rejected": -0.9028708934783936, "logps/chosen": -91.45471954345703, "logps/rejected": -142.88475036621094, "loss": 0.5297, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8050772547721863, "rewards/margins": 0.24668645858764648, "rewards/rejected": -1.051763653755188, "step": 2511 }, { "epoch": 0.4194894043050225, "grad_norm": 26.15439224243164, "learning_rate": 1.5805105956949776e-05, "logits/chosen": -0.7263135313987732, "logits/rejected": -0.7009028792381287, "logps/chosen": -122.21783447265625, "logps/rejected": -82.14472198486328, "loss": 0.4615, "rewards/accuracies": 1.0, "rewards/chosen": 0.24931158125400543, "rewards/margins": 1.8842288255691528, "rewards/rejected": -1.6349172592163086, "step": 2514 }, { "epoch": 0.4199899883197063, "grad_norm": 26.22974967956543, "learning_rate": 1.5800100116802938e-05, "logits/chosen": -0.841898500919342, "logits/rejected": -0.8319675922393799, "logps/chosen": -72.18728637695312, "logps/rejected": -96.68119049072266, "loss": 0.4866, "rewards/accuracies": 1.0, "rewards/chosen": 0.915014922618866, "rewards/margins": 1.2983490228652954, "rewards/rejected": -0.38333407044410706, "step": 2517 }, { "epoch": 0.42049057233439013, "grad_norm": 7.638154983520508, "learning_rate": 1.57950942766561e-05, "logits/chosen": -0.8177909851074219, "logits/rejected": -0.792499840259552, "logps/chosen": -105.70958709716797, "logps/rejected": -83.7660140991211, "loss": 0.6083, "rewards/accuracies": 1.0, "rewards/chosen": -0.5629073977470398, "rewards/margins": 0.5664963126182556, "rewards/rejected": -1.1294037103652954, "step": 2520 }, { "epoch": 0.4209911563490739, "grad_norm": 19.903932571411133, "learning_rate": 1.579008843650926e-05, "logits/chosen": -0.932206928730011, "logits/rejected": -0.957622230052948, "logps/chosen": -78.53726959228516, "logps/rejected": -134.02381896972656, "loss": 0.4257, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1795095205307007, "rewards/margins": 2.5765111446380615, "rewards/rejected": -3.7560207843780518, "step": 2523 }, { "epoch": 0.4214917403637577, "grad_norm": 31.974573135375977, "learning_rate": 1.5785082596362423e-05, "logits/chosen": -0.8981712460517883, "logits/rejected": -0.9068310856819153, "logps/chosen": -86.96844482421875, "logps/rejected": -93.12097930908203, "loss": 0.617, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.348736971616745, "rewards/margins": 0.7348403930664062, "rewards/rejected": -1.0835773944854736, "step": 2526 }, { "epoch": 0.42199232437844153, "grad_norm": 10.603660583496094, "learning_rate": 1.5780076756215585e-05, "logits/chosen": -0.863275945186615, "logits/rejected": -0.8687975406646729, "logps/chosen": -59.04764938354492, "logps/rejected": -79.09471893310547, "loss": 0.6284, "rewards/accuracies": 1.0, "rewards/chosen": 0.251324862241745, "rewards/margins": 0.4709831476211548, "rewards/rejected": -0.21965830028057098, "step": 2529 }, { "epoch": 0.4224929083931253, "grad_norm": 8.30844497680664, "learning_rate": 1.577507091606875e-05, "logits/chosen": -0.9528944492340088, "logits/rejected": -0.9689652323722839, "logps/chosen": -106.2360610961914, "logps/rejected": -118.91544342041016, "loss": 0.5955, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.17313627898693085, "rewards/margins": 2.904095411300659, "rewards/rejected": -2.730958938598633, "step": 2532 }, { "epoch": 0.4229934924078091, "grad_norm": 37.120697021484375, "learning_rate": 1.577006507592191e-05, "logits/chosen": -0.8208120465278625, "logits/rejected": -0.832114040851593, "logps/chosen": -77.83612823486328, "logps/rejected": -76.42160034179688, "loss": 0.9574, "rewards/accuracies": 1.0, "rewards/chosen": -0.06355756521224976, "rewards/margins": 0.17525608837604523, "rewards/rejected": -0.23881365358829498, "step": 2535 }, { "epoch": 0.4234940764224929, "grad_norm": 31.335208892822266, "learning_rate": 1.5765059235775074e-05, "logits/chosen": -0.9888593554496765, "logits/rejected": -1.0157380104064941, "logps/chosen": -97.37158966064453, "logps/rejected": -98.68560791015625, "loss": 0.634, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.049842577427625656, "rewards/margins": 0.8013200759887695, "rewards/rejected": -0.7514775395393372, "step": 2538 }, { "epoch": 0.4239946604371767, "grad_norm": 17.061939239501953, "learning_rate": 1.5760053395628236e-05, "logits/chosen": -0.8241052627563477, "logits/rejected": -0.818021297454834, "logps/chosen": -48.1319580078125, "logps/rejected": -79.66951751708984, "loss": 0.7348, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.704349935054779, "rewards/margins": 0.5697352290153503, "rewards/rejected": 0.13461469113826752, "step": 2541 }, { "epoch": 0.4244952444518605, "grad_norm": 7.175799369812012, "learning_rate": 1.5755047555481394e-05, "logits/chosen": -0.7800089716911316, "logits/rejected": -0.7934136390686035, "logps/chosen": -85.2086410522461, "logps/rejected": -153.0386962890625, "loss": 0.2736, "rewards/accuracies": 1.0, "rewards/chosen": 0.5824400782585144, "rewards/margins": 2.156456708908081, "rewards/rejected": -1.5740165710449219, "step": 2544 }, { "epoch": 0.4249958284665443, "grad_norm": 11.474605560302734, "learning_rate": 1.575004171533456e-05, "logits/chosen": -0.9359360337257385, "logits/rejected": -0.9023539423942566, "logps/chosen": -97.71569061279297, "logps/rejected": -100.6985855102539, "loss": 0.3525, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3806014060974121, "rewards/margins": 1.3792715072631836, "rewards/rejected": -1.7598730325698853, "step": 2547 }, { "epoch": 0.4254964124812281, "grad_norm": 25.62955093383789, "learning_rate": 1.574503587518772e-05, "logits/chosen": -0.8026896119117737, "logits/rejected": -0.748546838760376, "logps/chosen": -79.130859375, "logps/rejected": -53.28306198120117, "loss": 0.6161, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4982971251010895, "rewards/margins": 1.6245551109313965, "rewards/rejected": -1.1262578964233398, "step": 2550 }, { "epoch": 0.4259969964959119, "grad_norm": 17.211286544799805, "learning_rate": 1.5740030035040883e-05, "logits/chosen": -0.9336841702461243, "logits/rejected": -0.9516246318817139, "logps/chosen": -68.52273559570312, "logps/rejected": -99.41912078857422, "loss": 0.525, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4917691648006439, "rewards/margins": 2.754384756088257, "rewards/rejected": -3.2461538314819336, "step": 2553 }, { "epoch": 0.4264975805105957, "grad_norm": 10.678394317626953, "learning_rate": 1.5735024194894045e-05, "logits/chosen": -1.0099529027938843, "logits/rejected": -0.9809265732765198, "logps/chosen": -123.9093017578125, "logps/rejected": -70.61091613769531, "loss": 0.5869, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.09095561504364014, "rewards/margins": 0.4822748005390167, "rewards/rejected": -0.5732304453849792, "step": 2556 }, { "epoch": 0.4269981645252795, "grad_norm": 15.48426628112793, "learning_rate": 1.5730018354747206e-05, "logits/chosen": -0.8559388518333435, "logits/rejected": -0.8789599537849426, "logps/chosen": -71.9674301147461, "logps/rejected": -95.541259765625, "loss": 0.3649, "rewards/accuracies": 1.0, "rewards/chosen": 0.2286585122346878, "rewards/margins": 1.3236740827560425, "rewards/rejected": -1.0950156450271606, "step": 2559 }, { "epoch": 0.4274987485399633, "grad_norm": 34.19529724121094, "learning_rate": 1.5725012514600368e-05, "logits/chosen": -0.7406672835350037, "logits/rejected": -0.6943380832672119, "logps/chosen": -123.47821807861328, "logps/rejected": -81.36618041992188, "loss": 0.7438, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9054689407348633, "rewards/margins": 0.0026396114844828844, "rewards/rejected": -0.9081085324287415, "step": 2562 }, { "epoch": 0.42799933255464706, "grad_norm": 17.251880645751953, "learning_rate": 1.572000667445353e-05, "logits/chosen": -1.0427744388580322, "logits/rejected": -1.016582727432251, "logps/chosen": -103.13765716552734, "logps/rejected": -102.51351928710938, "loss": 0.4732, "rewards/accuracies": 1.0, "rewards/chosen": 0.9671729207038879, "rewards/margins": 2.2258007526397705, "rewards/rejected": -1.258628010749817, "step": 2565 }, { "epoch": 0.4284999165693309, "grad_norm": 12.606436729431152, "learning_rate": 1.5715000834306692e-05, "logits/chosen": -0.7797305583953857, "logits/rejected": -0.8293656706809998, "logps/chosen": -93.97440338134766, "logps/rejected": -133.305908203125, "loss": 0.7473, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.19853568077087402, "rewards/margins": 0.03947677090764046, "rewards/rejected": -0.23801244795322418, "step": 2568 }, { "epoch": 0.4290005005840147, "grad_norm": 35.26978302001953, "learning_rate": 1.5709994994159854e-05, "logits/chosen": -0.8645050525665283, "logits/rejected": -0.8909902572631836, "logps/chosen": -72.44849395751953, "logps/rejected": -63.058197021484375, "loss": 0.6914, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.0859324112534523, "rewards/margins": -0.0737895742058754, "rewards/rejected": 0.1597220003604889, "step": 2571 }, { "epoch": 0.42950108459869846, "grad_norm": 37.96416091918945, "learning_rate": 1.570498915401302e-05, "logits/chosen": -0.7532165050506592, "logits/rejected": -0.8157045841217041, "logps/chosen": -55.34608459472656, "logps/rejected": -83.9797592163086, "loss": 0.6055, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.2625800371170044, "rewards/margins": -0.046797752380371094, "rewards/rejected": 0.3093777894973755, "step": 2574 }, { "epoch": 0.4300016686133823, "grad_norm": 18.642837524414062, "learning_rate": 1.5699983313866177e-05, "logits/chosen": -0.8646928668022156, "logits/rejected": -0.8479979634284973, "logps/chosen": -92.2366943359375, "logps/rejected": -54.91123962402344, "loss": 1.0008, "rewards/accuracies": 0.0, "rewards/chosen": -0.4317483901977539, "rewards/margins": -1.0096769332885742, "rewards/rejected": 0.5779285430908203, "step": 2577 }, { "epoch": 0.4305022526280661, "grad_norm": 17.503435134887695, "learning_rate": 1.569497747371934e-05, "logits/chosen": -0.7569417953491211, "logits/rejected": -0.7473174929618835, "logps/chosen": -175.2999267578125, "logps/rejected": -129.56434631347656, "loss": 0.7699, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.313552588224411, "rewards/margins": -0.6096531748771667, "rewards/rejected": 0.2961006164550781, "step": 2580 }, { "epoch": 0.43100283664274985, "grad_norm": 36.146209716796875, "learning_rate": 1.5689971633572504e-05, "logits/chosen": -0.8240843415260315, "logits/rejected": -0.8210293650627136, "logps/chosen": -105.410888671875, "logps/rejected": -99.54840850830078, "loss": 0.6986, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.18156510591506958, "rewards/margins": -0.09770134836435318, "rewards/rejected": -0.0838637575507164, "step": 2583 }, { "epoch": 0.4315034206574337, "grad_norm": 16.584064483642578, "learning_rate": 1.5684965793425663e-05, "logits/chosen": -0.8871267437934875, "logits/rejected": -0.8218555450439453, "logps/chosen": -134.0646209716797, "logps/rejected": -94.11710357666016, "loss": 0.7792, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.542536735534668, "rewards/margins": -0.516995370388031, "rewards/rejected": -0.025541314855217934, "step": 2586 }, { "epoch": 0.43200400467211747, "grad_norm": 13.3280668258667, "learning_rate": 1.5679959953278828e-05, "logits/chosen": -0.7978420257568359, "logits/rejected": -0.8203480243682861, "logps/chosen": -29.610071182250977, "logps/rejected": -139.97360229492188, "loss": 0.604, "rewards/accuracies": 1.0, "rewards/chosen": 0.35522034764289856, "rewards/margins": 1.2289149761199951, "rewards/rejected": -0.873694658279419, "step": 2589 }, { "epoch": 0.43250458868680125, "grad_norm": 22.2131290435791, "learning_rate": 1.567495411313199e-05, "logits/chosen": -0.7471596598625183, "logits/rejected": -0.81316739320755, "logps/chosen": -27.042465209960938, "logps/rejected": -92.66314697265625, "loss": 0.5337, "rewards/accuracies": 1.0, "rewards/chosen": 0.5271643996238708, "rewards/margins": 1.413723349571228, "rewards/rejected": -0.886559009552002, "step": 2592 }, { "epoch": 0.4330051727014851, "grad_norm": 25.524763107299805, "learning_rate": 1.566994827298515e-05, "logits/chosen": -0.8879702687263489, "logits/rejected": -0.8891997337341309, "logps/chosen": -75.53290557861328, "logps/rejected": -77.55054473876953, "loss": 0.8659, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.5726760029792786, "rewards/margins": -0.04353739693760872, "rewards/rejected": 0.6162133812904358, "step": 2595 }, { "epoch": 0.43350575671616887, "grad_norm": 18.335418701171875, "learning_rate": 1.5664942432838313e-05, "logits/chosen": -0.7659950256347656, "logits/rejected": -0.7810766696929932, "logps/chosen": -66.5325927734375, "logps/rejected": -145.18017578125, "loss": 0.1957, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.18488581478595734, "rewards/margins": 1.9425519704818726, "rewards/rejected": -2.1274378299713135, "step": 2598 }, { "epoch": 0.43400634073085265, "grad_norm": 17.990825653076172, "learning_rate": 1.5659936592691475e-05, "logits/chosen": -0.8652997016906738, "logits/rejected": -0.9195011258125305, "logps/chosen": -89.12593841552734, "logps/rejected": -165.0581817626953, "loss": 0.6594, "rewards/accuracies": 1.0, "rewards/chosen": -0.5615589022636414, "rewards/margins": 1.51456880569458, "rewards/rejected": -2.076127767562866, "step": 2601 }, { "epoch": 0.4345069247455365, "grad_norm": 62.146095275878906, "learning_rate": 1.5654930752544637e-05, "logits/chosen": -0.8032026886940002, "logits/rejected": -0.8616532683372498, "logps/chosen": -57.5766487121582, "logps/rejected": -114.00402069091797, "loss": 0.4601, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.39209964871406555, "rewards/margins": 2.8584330081939697, "rewards/rejected": -2.4663331508636475, "step": 2604 }, { "epoch": 0.43500750876022026, "grad_norm": 12.408262252807617, "learning_rate": 1.56499249123978e-05, "logits/chosen": -0.589514434337616, "logits/rejected": -0.5348185300827026, "logps/chosen": -141.17893981933594, "logps/rejected": -78.85376739501953, "loss": 0.7119, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.24067358672618866, "rewards/margins": 0.2531207799911499, "rewards/rejected": -0.012447237968444824, "step": 2607 }, { "epoch": 0.43550809277490404, "grad_norm": 15.97327709197998, "learning_rate": 1.564491907225096e-05, "logits/chosen": -0.6462165713310242, "logits/rejected": -0.6602483987808228, "logps/chosen": -125.54170989990234, "logps/rejected": -138.50379943847656, "loss": 0.6041, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9787664413452148, "rewards/margins": 0.15561790764331818, "rewards/rejected": -2.1343843936920166, "step": 2610 }, { "epoch": 0.4360086767895879, "grad_norm": 14.814096450805664, "learning_rate": 1.5639913232104122e-05, "logits/chosen": -0.8637590408325195, "logits/rejected": -0.8701186180114746, "logps/chosen": -67.10729217529297, "logps/rejected": -75.82355499267578, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 0.08101890236139297, "rewards/margins": 0.8121479153633118, "rewards/rejected": -0.7311291098594666, "step": 2613 }, { "epoch": 0.43650926080427166, "grad_norm": 13.5531644821167, "learning_rate": 1.5634907391957287e-05, "logits/chosen": -0.8231509327888489, "logits/rejected": -0.8187994956970215, "logps/chosen": -103.05347442626953, "logps/rejected": -95.10457611083984, "loss": 0.6569, "rewards/accuracies": 1.0, "rewards/chosen": 0.04776344820857048, "rewards/margins": 0.738386869430542, "rewards/rejected": -0.6906234622001648, "step": 2616 }, { "epoch": 0.43700984481895544, "grad_norm": 12.066028594970703, "learning_rate": 1.5629901551810446e-05, "logits/chosen": -0.8899424076080322, "logits/rejected": -0.8786775469779968, "logps/chosen": -102.46405029296875, "logps/rejected": -91.57833099365234, "loss": 0.554, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.26608264446258545, "rewards/margins": 0.07504946738481522, "rewards/rejected": 0.19103318452835083, "step": 2619 }, { "epoch": 0.4375104288336392, "grad_norm": 32.38545608520508, "learning_rate": 1.5624895711663607e-05, "logits/chosen": -0.8429990410804749, "logits/rejected": -0.7902650237083435, "logps/chosen": -65.86750793457031, "logps/rejected": -89.06511688232422, "loss": 0.6314, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.11595077067613602, "rewards/margins": -0.16649965941905975, "rewards/rejected": 0.050548870116472244, "step": 2622 }, { "epoch": 0.43801101284832306, "grad_norm": 19.271881103515625, "learning_rate": 1.5619889871516773e-05, "logits/chosen": -0.5899143815040588, "logits/rejected": -0.6576213240623474, "logps/chosen": -105.21295166015625, "logps/rejected": -108.1371078491211, "loss": 0.9573, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.966977596282959, "rewards/margins": -0.1998692899942398, "rewards/rejected": -0.7671082615852356, "step": 2625 }, { "epoch": 0.43851159686300684, "grad_norm": 35.42887878417969, "learning_rate": 1.561488403136993e-05, "logits/chosen": -0.7908062934875488, "logits/rejected": -0.8142819404602051, "logps/chosen": -78.08441162109375, "logps/rejected": -123.6993408203125, "loss": 1.0198, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.33276811242103577, "rewards/margins": 0.945310652256012, "rewards/rejected": -0.6125425100326538, "step": 2628 }, { "epoch": 0.4390121808776906, "grad_norm": 17.968196868896484, "learning_rate": 1.5609878191223096e-05, "logits/chosen": -0.9283221364021301, "logits/rejected": -0.9243566393852234, "logps/chosen": -63.98971939086914, "logps/rejected": -95.1068344116211, "loss": 0.4407, "rewards/accuracies": 1.0, "rewards/chosen": 0.3771653473377228, "rewards/margins": 1.9866701364517212, "rewards/rejected": -1.6095050573349, "step": 2631 }, { "epoch": 0.43951276489237445, "grad_norm": 15.043238639831543, "learning_rate": 1.5604872351076258e-05, "logits/chosen": -0.8809712529182434, "logits/rejected": -0.8822525143623352, "logps/chosen": -67.60355377197266, "logps/rejected": -78.82465362548828, "loss": 0.4294, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3475026786327362, "rewards/margins": 0.2627885043621063, "rewards/rejected": -0.6102911829948425, "step": 2634 }, { "epoch": 0.44001334890705823, "grad_norm": 16.67151641845703, "learning_rate": 1.5599866510929416e-05, "logits/chosen": -0.7640002369880676, "logits/rejected": -0.7827153205871582, "logps/chosen": -51.48123550415039, "logps/rejected": -59.429107666015625, "loss": 0.717, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.29896190762519836, "rewards/margins": 0.9997522234916687, "rewards/rejected": -1.2987140417099, "step": 2637 }, { "epoch": 0.440513932921742, "grad_norm": 10.88797378540039, "learning_rate": 1.559486067078258e-05, "logits/chosen": -0.7718318104743958, "logits/rejected": -0.7535855174064636, "logps/chosen": -81.20984649658203, "logps/rejected": -85.2577896118164, "loss": 0.3465, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3394155502319336, "rewards/margins": 1.6683826446533203, "rewards/rejected": -1.3289669752120972, "step": 2640 }, { "epoch": 0.44101451693642585, "grad_norm": 26.294469833374023, "learning_rate": 1.5589854830635743e-05, "logits/chosen": -0.7802169919013977, "logits/rejected": -0.8170993328094482, "logps/chosen": -81.1797103881836, "logps/rejected": -116.0797348022461, "loss": 0.7272, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2526949644088745, "rewards/margins": 0.10707321017980576, "rewards/rejected": -0.35976818203926086, "step": 2643 }, { "epoch": 0.44151510095110963, "grad_norm": 4.633907794952393, "learning_rate": 1.5584848990488905e-05, "logits/chosen": -0.8728008270263672, "logits/rejected": -0.8871369361877441, "logps/chosen": -109.58202362060547, "logps/rejected": -151.24024963378906, "loss": 0.3731, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7454119324684143, "rewards/margins": 1.9498423337936401, "rewards/rejected": -2.695254325866699, "step": 2646 }, { "epoch": 0.4420156849657934, "grad_norm": 5.782660484313965, "learning_rate": 1.5579843150342067e-05, "logits/chosen": -0.917989194393158, "logits/rejected": -0.8882122039794922, "logps/chosen": -137.49063110351562, "logps/rejected": -112.3174819946289, "loss": 0.3088, "rewards/accuracies": 1.0, "rewards/chosen": -0.6383494138717651, "rewards/margins": 1.060721755027771, "rewards/rejected": -1.6990712881088257, "step": 2649 }, { "epoch": 0.44251626898047725, "grad_norm": 21.045082092285156, "learning_rate": 1.557483731019523e-05, "logits/chosen": -0.8319317698478699, "logits/rejected": -0.8400439620018005, "logps/chosen": -91.39055633544922, "logps/rejected": -86.19011688232422, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": -0.5483419895172119, "rewards/margins": 1.2656004428863525, "rewards/rejected": -1.8139424324035645, "step": 2652 }, { "epoch": 0.443016852995161, "grad_norm": 17.642004013061523, "learning_rate": 1.556983147004839e-05, "logits/chosen": -0.7579970359802246, "logits/rejected": -0.8350677490234375, "logps/chosen": -52.21464920043945, "logps/rejected": -138.7144012451172, "loss": 0.4579, "rewards/accuracies": 1.0, "rewards/chosen": -0.03218142315745354, "rewards/margins": 1.8745266199111938, "rewards/rejected": -1.9067081212997437, "step": 2655 }, { "epoch": 0.4435174370098448, "grad_norm": 30.055131912231445, "learning_rate": 1.5564825629901552e-05, "logits/chosen": -0.8357399106025696, "logits/rejected": -0.8590754866600037, "logps/chosen": -84.18415832519531, "logps/rejected": -115.01140594482422, "loss": 0.4201, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7560372352600098, "rewards/margins": 0.27935707569122314, "rewards/rejected": -1.035394310951233, "step": 2658 }, { "epoch": 0.44401802102452864, "grad_norm": 4.835415840148926, "learning_rate": 1.5559819789754714e-05, "logits/chosen": -1.0083293914794922, "logits/rejected": -1.034510612487793, "logps/chosen": -78.9821548461914, "logps/rejected": -117.04583740234375, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9914626479148865, "rewards/margins": 3.611077070236206, "rewards/rejected": -2.6196141242980957, "step": 2661 }, { "epoch": 0.4445186050392124, "grad_norm": 18.964872360229492, "learning_rate": 1.5554813949607876e-05, "logits/chosen": -1.0057822465896606, "logits/rejected": -0.9493237137794495, "logps/chosen": -110.3271255493164, "logps/rejected": -62.72929000854492, "loss": 0.4937, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.33400049805641174, "rewards/margins": -0.0001850724220275879, "rewards/rejected": -0.33381542563438416, "step": 2664 }, { "epoch": 0.4450191890538962, "grad_norm": 22.257814407348633, "learning_rate": 1.554980810946104e-05, "logits/chosen": -0.8612465262413025, "logits/rejected": -0.8031750321388245, "logps/chosen": -146.17649841308594, "logps/rejected": -129.80743408203125, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": -1.0904150009155273, "rewards/margins": 1.3295844793319702, "rewards/rejected": -2.419999361038208, "step": 2667 }, { "epoch": 0.44551977306858, "grad_norm": 69.70230102539062, "learning_rate": 1.55448022693142e-05, "logits/chosen": -0.9536139369010925, "logits/rejected": -1.0053473711013794, "logps/chosen": -80.30331420898438, "logps/rejected": -111.49140167236328, "loss": 0.6446, "rewards/accuracies": 1.0, "rewards/chosen": -0.10924430936574936, "rewards/margins": 1.4938513040542603, "rewards/rejected": -1.6030956506729126, "step": 2670 }, { "epoch": 0.4460203570832638, "grad_norm": 13.248903274536133, "learning_rate": 1.5539796429167365e-05, "logits/chosen": -0.9109594225883484, "logits/rejected": -0.8695123195648193, "logps/chosen": -36.847896575927734, "logps/rejected": -36.58045196533203, "loss": 0.6565, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.128719449043274, "rewards/margins": 1.4474958181381226, "rewards/rejected": -0.31877630949020386, "step": 2673 }, { "epoch": 0.4465209410979476, "grad_norm": 22.585235595703125, "learning_rate": 1.5534790589020527e-05, "logits/chosen": -0.9298221468925476, "logits/rejected": -0.9176281094551086, "logps/chosen": -90.7500228881836, "logps/rejected": -76.5709457397461, "loss": 0.6838, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7768514156341553, "rewards/margins": -0.18701402842998505, "rewards/rejected": -0.589837372303009, "step": 2676 }, { "epoch": 0.4470215251126314, "grad_norm": 13.658461570739746, "learning_rate": 1.5529784748873685e-05, "logits/chosen": -1.0201129913330078, "logits/rejected": -1.032610297203064, "logps/chosen": -111.39257049560547, "logps/rejected": -171.9765625, "loss": 0.28, "rewards/accuracies": 1.0, "rewards/chosen": 0.28605780005455017, "rewards/margins": 2.9744441509246826, "rewards/rejected": -2.6883862018585205, "step": 2679 }, { "epoch": 0.4475221091273152, "grad_norm": 13.603950500488281, "learning_rate": 1.552477890872685e-05, "logits/chosen": -1.0301352739334106, "logits/rejected": -1.0278849601745605, "logps/chosen": -70.28995513916016, "logps/rejected": -109.12761688232422, "loss": 0.5932, "rewards/accuracies": 1.0, "rewards/chosen": 0.7210721969604492, "rewards/margins": 3.8000218868255615, "rewards/rejected": -3.0789496898651123, "step": 2682 }, { "epoch": 0.448022693141999, "grad_norm": 28.61229133605957, "learning_rate": 1.5519773068580012e-05, "logits/chosen": -0.8298432230949402, "logits/rejected": -0.785841166973114, "logps/chosen": -130.4880828857422, "logps/rejected": -81.19384765625, "loss": 0.5983, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3279199600219727, "rewards/margins": -0.08115174621343613, "rewards/rejected": -1.2467682361602783, "step": 2685 }, { "epoch": 0.4485232771566828, "grad_norm": 7.306514739990234, "learning_rate": 1.5514767228433174e-05, "logits/chosen": -1.025795340538025, "logits/rejected": -1.0089010000228882, "logps/chosen": -105.3459701538086, "logps/rejected": -66.03986358642578, "loss": 0.809, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.112889528274536, "rewards/margins": -1.7943576574325562, "rewards/rejected": -0.31853187084198, "step": 2688 }, { "epoch": 0.4490238611713666, "grad_norm": 21.724571228027344, "learning_rate": 1.5509761388286336e-05, "logits/chosen": -0.7145792841911316, "logits/rejected": -0.7722517848014832, "logps/chosen": -55.59507369995117, "logps/rejected": -130.618896484375, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 0.18904916942119598, "rewards/margins": 1.7984586954116821, "rewards/rejected": -1.6094094514846802, "step": 2691 }, { "epoch": 0.4495244451860504, "grad_norm": 25.965240478515625, "learning_rate": 1.5504755548139497e-05, "logits/chosen": -1.1344965696334839, "logits/rejected": -1.1325541734695435, "logps/chosen": -105.3973388671875, "logps/rejected": -121.49560546875, "loss": 0.4197, "rewards/accuracies": 1.0, "rewards/chosen": -1.3502012491226196, "rewards/margins": 2.8773391246795654, "rewards/rejected": -4.227540493011475, "step": 2694 }, { "epoch": 0.4500250292007342, "grad_norm": 11.863935470581055, "learning_rate": 1.549974970799266e-05, "logits/chosen": -0.8873701095581055, "logits/rejected": -0.9031927585601807, "logps/chosen": -58.88435363769531, "logps/rejected": -60.30419921875, "loss": 0.4791, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.15867562592029572, "rewards/margins": -0.35401713848114014, "rewards/rejected": 0.19534148275852203, "step": 2697 }, { "epoch": 0.450525613215418, "grad_norm": 38.12928771972656, "learning_rate": 1.549474386784582e-05, "logits/chosen": -0.8516242504119873, "logits/rejected": -0.87827467918396, "logps/chosen": -65.79276275634766, "logps/rejected": -87.94429779052734, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.4826989471912384, "rewards/margins": 1.2887580394744873, "rewards/rejected": -0.8060590624809265, "step": 2700 }, { "epoch": 0.4510261972301018, "grad_norm": 10.90114974975586, "learning_rate": 1.5489738027698983e-05, "logits/chosen": -0.8851545453071594, "logits/rejected": -0.8596999049186707, "logps/chosen": -118.52509307861328, "logps/rejected": -87.2333984375, "loss": 0.7109, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.3661104738712311, "rewards/margins": -0.6075895428657532, "rewards/rejected": 0.24147911369800568, "step": 2703 }, { "epoch": 0.45152678124478557, "grad_norm": 7.5008721351623535, "learning_rate": 1.5484732187552144e-05, "logits/chosen": -0.8179451823234558, "logits/rejected": -0.7910704016685486, "logps/chosen": -118.86856842041016, "logps/rejected": -154.46290588378906, "loss": 0.4434, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8992016911506653, "rewards/margins": -0.1842595338821411, "rewards/rejected": -0.714942216873169, "step": 2706 }, { "epoch": 0.4520273652594694, "grad_norm": 16.224952697753906, "learning_rate": 1.547972634740531e-05, "logits/chosen": -1.115905523300171, "logits/rejected": -1.1237846612930298, "logps/chosen": -56.22621536254883, "logps/rejected": -66.5630874633789, "loss": 0.9078, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.6702455282211304, "rewards/margins": -0.7832254767417908, "rewards/rejected": -0.8870198726654053, "step": 2709 }, { "epoch": 0.4525279492741532, "grad_norm": 13.710493087768555, "learning_rate": 1.5474720507258468e-05, "logits/chosen": -0.9109975695610046, "logits/rejected": -0.9278648495674133, "logps/chosen": -55.93330001831055, "logps/rejected": -96.92938232421875, "loss": 0.4487, "rewards/accuracies": 1.0, "rewards/chosen": 0.3478947877883911, "rewards/margins": 2.776380777359009, "rewards/rejected": -2.428485870361328, "step": 2712 }, { "epoch": 0.45302853328883697, "grad_norm": 17.64592933654785, "learning_rate": 1.546971466711163e-05, "logits/chosen": -0.8403076529502869, "logits/rejected": -0.8002796769142151, "logps/chosen": -68.3135986328125, "logps/rejected": -62.81180953979492, "loss": 0.5888, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.3197970390319824, "rewards/margins": 0.010849733836948872, "rewards/rejected": -0.3306467533111572, "step": 2715 }, { "epoch": 0.4535291173035208, "grad_norm": 13.00331974029541, "learning_rate": 1.5464708826964795e-05, "logits/chosen": -0.9248568415641785, "logits/rejected": -0.8945505023002625, "logps/chosen": -106.3543472290039, "logps/rejected": -112.223388671875, "loss": 0.4797, "rewards/accuracies": 1.0, "rewards/chosen": 0.715613067150116, "rewards/margins": 3.8889379501342773, "rewards/rejected": -3.1733248233795166, "step": 2718 }, { "epoch": 0.4540297013182046, "grad_norm": 44.69001007080078, "learning_rate": 1.5459702986817953e-05, "logits/chosen": -0.9280686974525452, "logits/rejected": -0.9860032200813293, "logps/chosen": -110.23790740966797, "logps/rejected": -122.42902374267578, "loss": 0.8693, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.2861584722995758, "rewards/margins": 0.24041646718978882, "rewards/rejected": 0.04574204608798027, "step": 2721 }, { "epoch": 0.45453028533288836, "grad_norm": 29.60573387145996, "learning_rate": 1.545469714667112e-05, "logits/chosen": -0.9750514030456543, "logits/rejected": -0.9564560055732727, "logps/chosen": -70.68753814697266, "logps/rejected": -95.04901885986328, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 0.017068862915039062, "rewards/margins": 3.487541913986206, "rewards/rejected": -3.470473051071167, "step": 2724 }, { "epoch": 0.45503086934757214, "grad_norm": 11.273609161376953, "learning_rate": 1.544969130652428e-05, "logits/chosen": -0.9992694854736328, "logits/rejected": -0.9814703464508057, "logps/chosen": -86.0129623413086, "logps/rejected": -85.24907684326172, "loss": 0.4022, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1583462953567505, "rewards/margins": 0.37468695640563965, "rewards/rejected": -0.5330332517623901, "step": 2727 }, { "epoch": 0.455531453362256, "grad_norm": 57.65880584716797, "learning_rate": 1.5444685466377442e-05, "logits/chosen": -0.9996755719184875, "logits/rejected": -1.0128251314163208, "logps/chosen": -46.3322868347168, "logps/rejected": -83.569091796875, "loss": 0.5416, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.542761504650116, "rewards/margins": 1.1127113103866577, "rewards/rejected": -0.5699498057365417, "step": 2730 }, { "epoch": 0.45603203737693976, "grad_norm": 29.23391342163086, "learning_rate": 1.5439679626230604e-05, "logits/chosen": -0.8801547884941101, "logits/rejected": -0.8320611119270325, "logps/chosen": -105.2344741821289, "logps/rejected": -81.49056243896484, "loss": 0.5654, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3392884433269501, "rewards/margins": 0.9368969798088074, "rewards/rejected": -1.276185393333435, "step": 2733 }, { "epoch": 0.45653262139162354, "grad_norm": 31.78993797302246, "learning_rate": 1.5434673786083766e-05, "logits/chosen": -1.053416132926941, "logits/rejected": -1.0061149597167969, "logps/chosen": -129.0124053955078, "logps/rejected": -114.4145278930664, "loss": 0.6698, "rewards/accuracies": 0.0, "rewards/chosen": -1.0460896492004395, "rewards/margins": -0.6267326474189758, "rewards/rejected": -0.4193570613861084, "step": 2736 }, { "epoch": 0.4570332054063074, "grad_norm": 5.162641525268555, "learning_rate": 1.5429667945936928e-05, "logits/chosen": -0.8470843434333801, "logits/rejected": -0.8430399894714355, "logps/chosen": -75.74807739257812, "logps/rejected": -98.36813354492188, "loss": 0.2001, "rewards/accuracies": 1.0, "rewards/chosen": -0.33916720747947693, "rewards/margins": 1.7552140951156616, "rewards/rejected": -2.094381332397461, "step": 2739 }, { "epoch": 0.45753378942099115, "grad_norm": 35.97929763793945, "learning_rate": 1.542466210579009e-05, "logits/chosen": -0.9942426681518555, "logits/rejected": -0.9953842163085938, "logps/chosen": -60.492431640625, "logps/rejected": -101.3521728515625, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 0.3589186668395996, "rewards/margins": 3.4024531841278076, "rewards/rejected": -3.043534278869629, "step": 2742 }, { "epoch": 0.45803437343567494, "grad_norm": 25.03738021850586, "learning_rate": 1.541965626564325e-05, "logits/chosen": -0.7535529732704163, "logits/rejected": -0.8297074437141418, "logps/chosen": -63.41708755493164, "logps/rejected": -145.475341796875, "loss": 0.3749, "rewards/accuracies": 1.0, "rewards/chosen": 0.20275075733661652, "rewards/margins": 2.643404722213745, "rewards/rejected": -2.4406538009643555, "step": 2745 }, { "epoch": 0.45853495745035877, "grad_norm": 13.522001266479492, "learning_rate": 1.5414650425496413e-05, "logits/chosen": -0.7243416905403137, "logits/rejected": -0.7548344135284424, "logps/chosen": -53.129573822021484, "logps/rejected": -121.86641693115234, "loss": 0.2094, "rewards/accuracies": 1.0, "rewards/chosen": 1.8248144388198853, "rewards/margins": 4.451168060302734, "rewards/rejected": -2.6263535022735596, "step": 2748 }, { "epoch": 0.45903554146504255, "grad_norm": 121.03126525878906, "learning_rate": 1.5409644585349578e-05, "logits/chosen": -0.8089798092842102, "logits/rejected": -0.8015322685241699, "logps/chosen": -106.95235443115234, "logps/rejected": -102.18460083007812, "loss": 1.1701, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8205853700637817, "rewards/margins": 0.703392505645752, "rewards/rejected": -2.5239779949188232, "step": 2751 }, { "epoch": 0.45953612547972633, "grad_norm": 21.108978271484375, "learning_rate": 1.5404638745202737e-05, "logits/chosen": -0.8989630341529846, "logits/rejected": -0.8542147278785706, "logps/chosen": -101.46387481689453, "logps/rejected": -105.1738052368164, "loss": 0.4274, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2046908140182495, "rewards/margins": -0.3629548251628876, "rewards/rejected": -0.8417360186576843, "step": 2754 }, { "epoch": 0.46003670949441017, "grad_norm": 12.134005546569824, "learning_rate": 1.53996329050559e-05, "logits/chosen": -1.0136847496032715, "logits/rejected": -1.06464421749115, "logps/chosen": -131.1823272705078, "logps/rejected": -123.36639404296875, "loss": 0.7795, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.0497806072235107, "rewards/margins": 0.44696173071861267, "rewards/rejected": -3.4967422485351562, "step": 2757 }, { "epoch": 0.46053729350909395, "grad_norm": 15.797374725341797, "learning_rate": 1.5394627064909064e-05, "logits/chosen": -0.8956641554832458, "logits/rejected": -0.9170253872871399, "logps/chosen": -61.752742767333984, "logps/rejected": -115.68189239501953, "loss": 0.2477, "rewards/accuracies": 1.0, "rewards/chosen": -0.12306671589612961, "rewards/margins": 3.0963878631591797, "rewards/rejected": -3.219454526901245, "step": 2760 }, { "epoch": 0.46103787752377773, "grad_norm": 18.450912475585938, "learning_rate": 1.5389621224762222e-05, "logits/chosen": -1.0226354598999023, "logits/rejected": -1.0331090688705444, "logps/chosen": -59.23676681518555, "logps/rejected": -61.72544479370117, "loss": 0.484, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.1744476556777954, "rewards/margins": 0.2735438346862793, "rewards/rejected": -0.09909617900848389, "step": 2763 }, { "epoch": 0.46153846153846156, "grad_norm": 24.550588607788086, "learning_rate": 1.5384615384615387e-05, "logits/chosen": -1.0506535768508911, "logits/rejected": -1.1026932001113892, "logps/chosen": -52.91519546508789, "logps/rejected": -132.9488067626953, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": -1.1157170534133911, "rewards/margins": 2.692591667175293, "rewards/rejected": -3.8083088397979736, "step": 2766 }, { "epoch": 0.46203904555314534, "grad_norm": 15.17773723602295, "learning_rate": 1.537960954446855e-05, "logits/chosen": -0.8960391879081726, "logits/rejected": -0.9823899269104004, "logps/chosen": -41.141571044921875, "logps/rejected": -102.7132797241211, "loss": 0.4287, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.0009306594729423523, "rewards/margins": 0.7243377566337585, "rewards/rejected": -0.7252683639526367, "step": 2769 }, { "epoch": 0.4625396295678291, "grad_norm": 32.96342849731445, "learning_rate": 1.5374603704321707e-05, "logits/chosen": -0.8313634991645813, "logits/rejected": -0.9516006112098694, "logps/chosen": -50.842777252197266, "logps/rejected": -135.8730010986328, "loss": 0.6983, "rewards/accuracies": 1.0, "rewards/chosen": -0.5772029757499695, "rewards/margins": 1.4100284576416016, "rewards/rejected": -1.9872316122055054, "step": 2772 }, { "epoch": 0.4630402135825129, "grad_norm": 15.390054702758789, "learning_rate": 1.5369597864174873e-05, "logits/chosen": -0.9633365273475647, "logits/rejected": -0.8960407376289368, "logps/chosen": -141.82493591308594, "logps/rejected": -152.73255920410156, "loss": 0.4197, "rewards/accuracies": 1.0, "rewards/chosen": -0.5678589344024658, "rewards/margins": 1.8997668027877808, "rewards/rejected": -2.467625617980957, "step": 2775 }, { "epoch": 0.46354079759719674, "grad_norm": 55.243892669677734, "learning_rate": 1.5364592024028034e-05, "logits/chosen": -1.0670366287231445, "logits/rejected": -1.050766110420227, "logps/chosen": -74.60759735107422, "logps/rejected": -65.88704681396484, "loss": 0.8035, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.22502505779266357, "rewards/margins": 0.8538805842399597, "rewards/rejected": -0.6288554668426514, "step": 2778 }, { "epoch": 0.4640413816118805, "grad_norm": 20.64870262145996, "learning_rate": 1.5359586183881196e-05, "logits/chosen": -0.9819891452789307, "logits/rejected": -0.9696943163871765, "logps/chosen": -75.63115692138672, "logps/rejected": -53.70207595825195, "loss": 0.6823, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6424339413642883, "rewards/margins": -0.053421419113874435, "rewards/rejected": -0.5890125632286072, "step": 2781 }, { "epoch": 0.4645419656265643, "grad_norm": 6.162203311920166, "learning_rate": 1.5354580343734358e-05, "logits/chosen": -0.8368738293647766, "logits/rejected": -0.8132190108299255, "logps/chosen": -153.017333984375, "logps/rejected": -149.7686004638672, "loss": 0.3299, "rewards/accuracies": 1.0, "rewards/chosen": -0.7485291361808777, "rewards/margins": 2.703348398208618, "rewards/rejected": -3.4518775939941406, "step": 2784 }, { "epoch": 0.46504254964124814, "grad_norm": 8.520406723022461, "learning_rate": 1.534957450358752e-05, "logits/chosen": -0.7861604690551758, "logits/rejected": -0.8729328513145447, "logps/chosen": -60.53480911254883, "logps/rejected": -162.57337951660156, "loss": 0.5843, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.30728110671043396, "rewards/margins": 3.398714303970337, "rewards/rejected": -3.705995559692383, "step": 2787 }, { "epoch": 0.4655431336559319, "grad_norm": 7.407135009765625, "learning_rate": 1.534456866344068e-05, "logits/chosen": -0.6618885397911072, "logits/rejected": -0.7423527836799622, "logps/chosen": -56.41795349121094, "logps/rejected": -117.4680404663086, "loss": 0.1885, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.29690271615982056, "rewards/margins": 1.37933349609375, "rewards/rejected": -1.6762362718582153, "step": 2790 }, { "epoch": 0.4660437176706157, "grad_norm": 32.99002456665039, "learning_rate": 1.5339562823293843e-05, "logits/chosen": -0.6943624019622803, "logits/rejected": -0.8186514973640442, "logps/chosen": -59.92888259887695, "logps/rejected": -129.1001739501953, "loss": 0.4758, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.25981998443603516, "rewards/margins": 1.689623475074768, "rewards/rejected": -1.429803490638733, "step": 2793 }, { "epoch": 0.46654430168529953, "grad_norm": 18.042673110961914, "learning_rate": 1.5334556983147005e-05, "logits/chosen": -0.9614951610565186, "logits/rejected": -0.9996600151062012, "logps/chosen": -81.55805206298828, "logps/rejected": -134.24363708496094, "loss": 0.4716, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7834828495979309, "rewards/margins": 3.376457452774048, "rewards/rejected": -4.159940719604492, "step": 2796 }, { "epoch": 0.4670448856999833, "grad_norm": 14.123021125793457, "learning_rate": 1.5329551143000167e-05, "logits/chosen": -0.8079227805137634, "logits/rejected": -0.8819928169250488, "logps/chosen": -60.4421501159668, "logps/rejected": -117.76751708984375, "loss": 0.4029, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6196959018707275, "rewards/margins": 1.331222653388977, "rewards/rejected": -1.9509185552597046, "step": 2799 }, { "epoch": 0.4675454697146671, "grad_norm": 30.426767349243164, "learning_rate": 1.5324545302853332e-05, "logits/chosen": -0.9848613142967224, "logits/rejected": -0.9907034039497375, "logps/chosen": -91.39713287353516, "logps/rejected": -124.66586303710938, "loss": 0.7777, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0392249822616577, "rewards/margins": 1.3228899240493774, "rewards/rejected": -2.362114906311035, "step": 2802 }, { "epoch": 0.46804605372935093, "grad_norm": 46.7172966003418, "learning_rate": 1.531953946270649e-05, "logits/chosen": -0.7407522201538086, "logits/rejected": -0.7146231532096863, "logps/chosen": -131.90708923339844, "logps/rejected": -94.13180541992188, "loss": 0.5318, "rewards/accuracies": 1.0, "rewards/chosen": -1.2325859069824219, "rewards/margins": 1.14985990524292, "rewards/rejected": -2.3824455738067627, "step": 2805 }, { "epoch": 0.4685466377440347, "grad_norm": 44.20579147338867, "learning_rate": 1.5314533622559656e-05, "logits/chosen": -0.8621533513069153, "logits/rejected": -0.8346676826477051, "logps/chosen": -122.87994384765625, "logps/rejected": -91.09337615966797, "loss": 0.8504, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.584081768989563, "rewards/margins": 0.1560804396867752, "rewards/rejected": -1.7401622533798218, "step": 2808 }, { "epoch": 0.4690472217587185, "grad_norm": 17.08576202392578, "learning_rate": 1.5309527782412817e-05, "logits/chosen": -1.0176094770431519, "logits/rejected": -1.0157395601272583, "logps/chosen": -137.55381774902344, "logps/rejected": -153.26486206054688, "loss": 0.4265, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5282028913497925, "rewards/margins": 2.637101411819458, "rewards/rejected": -4.165304183959961, "step": 2811 }, { "epoch": 0.4695478057734023, "grad_norm": 26.923738479614258, "learning_rate": 1.5304521942265976e-05, "logits/chosen": -0.8179129958152771, "logits/rejected": -0.8716065883636475, "logps/chosen": -68.6585922241211, "logps/rejected": -117.943603515625, "loss": 0.4428, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6445358395576477, "rewards/margins": 2.2255032062530518, "rewards/rejected": -2.8700389862060547, "step": 2814 }, { "epoch": 0.4700483897880861, "grad_norm": 17.411893844604492, "learning_rate": 1.529951610211914e-05, "logits/chosen": -0.853400707244873, "logits/rejected": -0.840101957321167, "logps/chosen": -131.54954528808594, "logps/rejected": -114.68450164794922, "loss": 0.4366, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9230725765228271, "rewards/margins": -0.3063301146030426, "rewards/rejected": -0.6167424321174622, "step": 2817 }, { "epoch": 0.4705489738027699, "grad_norm": 22.47517967224121, "learning_rate": 1.5294510261972303e-05, "logits/chosen": -0.9845194816589355, "logits/rejected": -0.929854154586792, "logps/chosen": -113.1235122680664, "logps/rejected": -60.01223373413086, "loss": 0.5212, "rewards/accuracies": 0.0, "rewards/chosen": -1.5720701217651367, "rewards/margins": -0.7265108227729797, "rewards/rejected": -0.8455593585968018, "step": 2820 }, { "epoch": 0.4710495578174537, "grad_norm": 10.676916122436523, "learning_rate": 1.5289504421825465e-05, "logits/chosen": -0.9119308590888977, "logits/rejected": -0.9396551251411438, "logps/chosen": -45.55869674682617, "logps/rejected": -102.40088653564453, "loss": 0.3753, "rewards/accuracies": 1.0, "rewards/chosen": -0.11051195859909058, "rewards/margins": 2.2571475505828857, "rewards/rejected": -2.367659330368042, "step": 2823 }, { "epoch": 0.4715501418321375, "grad_norm": 38.28904724121094, "learning_rate": 1.5284498581678626e-05, "logits/chosen": -0.9000161290168762, "logits/rejected": -0.8905869126319885, "logps/chosen": -101.72689819335938, "logps/rejected": -105.7230453491211, "loss": 1.135, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5083905458450317, "rewards/margins": 1.4592691659927368, "rewards/rejected": -2.9676599502563477, "step": 2826 }, { "epoch": 0.4720507258468213, "grad_norm": 24.977645874023438, "learning_rate": 1.5279492741531788e-05, "logits/chosen": -0.8215740323066711, "logits/rejected": -0.7751345038414001, "logps/chosen": -92.73865509033203, "logps/rejected": -90.84346771240234, "loss": 0.7335, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4865245819091797, "rewards/margins": 0.8949972987174988, "rewards/rejected": -1.3815218210220337, "step": 2829 }, { "epoch": 0.47255130986150506, "grad_norm": 22.733640670776367, "learning_rate": 1.527448690138495e-05, "logits/chosen": -0.8131146430969238, "logits/rejected": -0.8778360486030579, "logps/chosen": -79.7572250366211, "logps/rejected": -116.25711822509766, "loss": 0.4862, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6540980339050293, "rewards/margins": 0.3340488374233246, "rewards/rejected": -0.9881468415260315, "step": 2832 }, { "epoch": 0.4730518938761889, "grad_norm": 14.733213424682617, "learning_rate": 1.5269481061238112e-05, "logits/chosen": -0.8248886466026306, "logits/rejected": -0.8555593490600586, "logps/chosen": -47.0556755065918, "logps/rejected": -106.15726470947266, "loss": 0.4594, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.1663466840982437, "rewards/margins": 2.771787643432617, "rewards/rejected": -2.605440855026245, "step": 2835 }, { "epoch": 0.4735524778908727, "grad_norm": 10.713881492614746, "learning_rate": 1.5264475221091274e-05, "logits/chosen": -0.7050336003303528, "logits/rejected": -0.732171356678009, "logps/chosen": -96.72473907470703, "logps/rejected": -100.1773910522461, "loss": 0.7646, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9729945063591003, "rewards/margins": 0.11522960662841797, "rewards/rejected": -1.0882240533828735, "step": 2838 }, { "epoch": 0.47405306190555646, "grad_norm": 60.69954299926758, "learning_rate": 1.5259469380944435e-05, "logits/chosen": -0.814887285232544, "logits/rejected": -0.8611237406730652, "logps/chosen": -52.693851470947266, "logps/rejected": -99.150390625, "loss": 1.1613, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6268907189369202, "rewards/margins": 2.018500566482544, "rewards/rejected": -2.6453912258148193, "step": 2841 }, { "epoch": 0.4745536459202403, "grad_norm": 24.81887435913086, "learning_rate": 1.5254463540797599e-05, "logits/chosen": -0.7694956660270691, "logits/rejected": -0.8014599680900574, "logps/chosen": -56.507415771484375, "logps/rejected": -89.6295394897461, "loss": 0.6308, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.19319170713424683, "rewards/margins": 1.9732199907302856, "rewards/rejected": -1.780027985572815, "step": 2844 }, { "epoch": 0.4750542299349241, "grad_norm": 28.31639289855957, "learning_rate": 1.524945770065076e-05, "logits/chosen": -0.9465222358703613, "logits/rejected": -0.9075846076011658, "logps/chosen": -112.9752197265625, "logps/rejected": -61.11532974243164, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": -0.25815555453300476, "rewards/margins": 0.885540246963501, "rewards/rejected": -1.1436958312988281, "step": 2847 }, { "epoch": 0.47555481394960786, "grad_norm": 9.015952110290527, "learning_rate": 1.524445186050392e-05, "logits/chosen": -0.8359777927398682, "logits/rejected": -0.8185155987739563, "logps/chosen": -114.7021484375, "logps/rejected": -105.44661712646484, "loss": 0.4584, "rewards/accuracies": 1.0, "rewards/chosen": 0.32395949959754944, "rewards/margins": 2.52081561088562, "rewards/rejected": -2.1968562602996826, "step": 2850 }, { "epoch": 0.4760553979642917, "grad_norm": 45.461185455322266, "learning_rate": 1.5239446020357084e-05, "logits/chosen": -0.7380741238594055, "logits/rejected": -0.7823125720024109, "logps/chosen": -53.38467788696289, "logps/rejected": -97.00223541259766, "loss": 1.435, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.24704571068286896, "rewards/margins": 1.8112907409667969, "rewards/rejected": -2.0583364963531494, "step": 2853 }, { "epoch": 0.4765559819789755, "grad_norm": 64.72633361816406, "learning_rate": 1.5234440180210246e-05, "logits/chosen": -0.7964492440223694, "logits/rejected": -0.8128883242607117, "logps/chosen": -145.82957458496094, "logps/rejected": -99.86678314208984, "loss": 0.8704, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9317934513092041, "rewards/margins": 0.5723904967308044, "rewards/rejected": -1.5041841268539429, "step": 2856 }, { "epoch": 0.47705656599365925, "grad_norm": 10.444846153259277, "learning_rate": 1.522943434006341e-05, "logits/chosen": -0.8872056007385254, "logits/rejected": -0.9095584750175476, "logps/chosen": -53.87640380859375, "logps/rejected": -91.04058074951172, "loss": 0.5261, "rewards/accuracies": 1.0, "rewards/chosen": -0.16311442852020264, "rewards/margins": 3.054417848587036, "rewards/rejected": -3.2175323963165283, "step": 2859 }, { "epoch": 0.4775571500083431, "grad_norm": 17.305620193481445, "learning_rate": 1.522442849991657e-05, "logits/chosen": -1.0720452070236206, "logits/rejected": -1.0142022371292114, "logps/chosen": -90.26273345947266, "logps/rejected": -103.4888687133789, "loss": 0.5828, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5650642514228821, "rewards/margins": 1.564898133277893, "rewards/rejected": -2.12996244430542, "step": 2862 }, { "epoch": 0.47805773402302687, "grad_norm": 20.099515914916992, "learning_rate": 1.5219422659769733e-05, "logits/chosen": -0.7027052044868469, "logits/rejected": -0.8296303153038025, "logps/chosen": -67.40289306640625, "logps/rejected": -205.0473175048828, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": 0.9209911227226257, "rewards/margins": 2.9386227130889893, "rewards/rejected": -2.0176315307617188, "step": 2865 }, { "epoch": 0.47855831803771065, "grad_norm": 20.7567138671875, "learning_rate": 1.5214416819622895e-05, "logits/chosen": -0.8833966255187988, "logits/rejected": -0.9616103172302246, "logps/chosen": -40.330963134765625, "logps/rejected": -126.9012680053711, "loss": 0.8015, "rewards/accuracies": 1.0, "rewards/chosen": -0.5288882851600647, "rewards/margins": 1.6002473831176758, "rewards/rejected": -2.129135847091675, "step": 2868 }, { "epoch": 0.4790589020523945, "grad_norm": 14.647896766662598, "learning_rate": 1.5209410979476055e-05, "logits/chosen": -0.7078452110290527, "logits/rejected": -0.7153603434562683, "logps/chosen": -64.86322021484375, "logps/rejected": -106.90758514404297, "loss": 0.7257, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.13802720606327057, "rewards/margins": -0.03125894069671631, "rewards/rejected": 0.1692860722541809, "step": 2871 }, { "epoch": 0.47955948606707827, "grad_norm": 11.919926643371582, "learning_rate": 1.5204405139329219e-05, "logits/chosen": -0.7056212425231934, "logits/rejected": -0.8363202214241028, "logps/chosen": -40.04875183105469, "logps/rejected": -146.1359405517578, "loss": 0.3148, "rewards/accuracies": 1.0, "rewards/chosen": 0.1080823540687561, "rewards/margins": 5.351667881011963, "rewards/rejected": -5.243585586547852, "step": 2874 }, { "epoch": 0.48006007008176205, "grad_norm": 26.554750442504883, "learning_rate": 1.519939929918238e-05, "logits/chosen": -0.8493413925170898, "logits/rejected": -0.8554914593696594, "logps/chosen": -89.48990631103516, "logps/rejected": -104.38404083251953, "loss": 0.7825, "rewards/accuracies": 1.0, "rewards/chosen": -0.5146347284317017, "rewards/margins": 2.0507776737213135, "rewards/rejected": -2.5654125213623047, "step": 2877 }, { "epoch": 0.4805606540964458, "grad_norm": 28.412368774414062, "learning_rate": 1.5194393459035544e-05, "logits/chosen": -0.8295009732246399, "logits/rejected": -0.8375417590141296, "logps/chosen": -80.36273193359375, "logps/rejected": -99.91790008544922, "loss": 0.441, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1615632325410843, "rewards/margins": 1.735880970954895, "rewards/rejected": -1.8974441289901733, "step": 2880 }, { "epoch": 0.48106123811112966, "grad_norm": 19.714513778686523, "learning_rate": 1.5189387618888704e-05, "logits/chosen": -0.9094235301017761, "logits/rejected": -0.9382633566856384, "logps/chosen": -72.4025650024414, "logps/rejected": -114.11788940429688, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 0.13533218204975128, "rewards/margins": 1.095385193824768, "rewards/rejected": -0.9600531458854675, "step": 2883 }, { "epoch": 0.48156182212581344, "grad_norm": 14.122790336608887, "learning_rate": 1.5184381778741866e-05, "logits/chosen": -0.7958166003227234, "logits/rejected": -0.8104568123817444, "logps/chosen": -61.7823486328125, "logps/rejected": -67.67630004882812, "loss": 0.6871, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31414470076560974, "rewards/margins": 0.6355572938919067, "rewards/rejected": -0.9497020244598389, "step": 2886 }, { "epoch": 0.4820624061404972, "grad_norm": 24.82139015197754, "learning_rate": 1.517937593859503e-05, "logits/chosen": -0.7619636654853821, "logits/rejected": -0.8128989338874817, "logps/chosen": -115.90191650390625, "logps/rejected": -127.03702545166016, "loss": 0.4992, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31225472688674927, "rewards/margins": 0.9937641620635986, "rewards/rejected": -1.3060188293457031, "step": 2889 }, { "epoch": 0.48256299015518106, "grad_norm": 41.006439208984375, "learning_rate": 1.517437009844819e-05, "logits/chosen": -0.6751227378845215, "logits/rejected": -0.7286558151245117, "logps/chosen": -118.77066802978516, "logps/rejected": -172.72547912597656, "loss": 0.5813, "rewards/accuracies": 1.0, "rewards/chosen": -0.247308611869812, "rewards/margins": 1.9309724569320679, "rewards/rejected": -2.17828106880188, "step": 2892 }, { "epoch": 0.48306357416986484, "grad_norm": 8.318706512451172, "learning_rate": 1.5169364258301353e-05, "logits/chosen": -0.6265076994895935, "logits/rejected": -0.7332935929298401, "logps/chosen": -50.53575134277344, "logps/rejected": -130.7578582763672, "loss": 0.352, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.24994878470897675, "rewards/margins": 1.670906662940979, "rewards/rejected": -1.4209580421447754, "step": 2895 }, { "epoch": 0.4835641581845486, "grad_norm": 26.94571876525879, "learning_rate": 1.5164358418154515e-05, "logits/chosen": -0.7919173240661621, "logits/rejected": -0.8485990166664124, "logps/chosen": -110.8254623413086, "logps/rejected": -120.401123046875, "loss": 1.3059, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.8013449907302856, "rewards/margins": -0.8911492228507996, "rewards/rejected": -0.9101958870887756, "step": 2898 }, { "epoch": 0.48406474219923246, "grad_norm": 10.85075569152832, "learning_rate": 1.5159352578007678e-05, "logits/chosen": -0.767122745513916, "logits/rejected": -0.7885522246360779, "logps/chosen": -66.2265853881836, "logps/rejected": -91.9435806274414, "loss": 0.2673, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2499944418668747, "rewards/margins": 1.2275081872940063, "rewards/rejected": -1.477502703666687, "step": 2901 }, { "epoch": 0.48456532621391624, "grad_norm": 10.33926010131836, "learning_rate": 1.5154346737860838e-05, "logits/chosen": -0.8606125712394714, "logits/rejected": -0.8697444796562195, "logps/chosen": -66.96014404296875, "logps/rejected": -102.86761474609375, "loss": 0.199, "rewards/accuracies": 1.0, "rewards/chosen": 0.06471991539001465, "rewards/margins": 3.0270063877105713, "rewards/rejected": -2.9622862339019775, "step": 2904 }, { "epoch": 0.4850659102286, "grad_norm": 36.05219268798828, "learning_rate": 1.5149340897714e-05, "logits/chosen": -0.8699439167976379, "logits/rejected": -0.8546541333198547, "logps/chosen": -81.18201446533203, "logps/rejected": -107.57858276367188, "loss": 0.4361, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.8111305236816406, "rewards/margins": 2.0751771926879883, "rewards/rejected": -1.2640467882156372, "step": 2907 }, { "epoch": 0.48556649424328385, "grad_norm": 16.68947982788086, "learning_rate": 1.5144335057567164e-05, "logits/chosen": -0.7990671992301941, "logits/rejected": -0.7905685901641846, "logps/chosen": -146.1196746826172, "logps/rejected": -94.92340087890625, "loss": 0.618, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2802815437316895, "rewards/margins": -1.4153934717178345, "rewards/rejected": 0.1351117491722107, "step": 2910 }, { "epoch": 0.48606707825796763, "grad_norm": 34.45243835449219, "learning_rate": 1.5139329217420324e-05, "logits/chosen": -0.7929339408874512, "logits/rejected": -0.7624664306640625, "logps/chosen": -122.90634155273438, "logps/rejected": -94.20844268798828, "loss": 1.0657, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0697435140609741, "rewards/margins": -1.347399353981018, "rewards/rejected": 0.2776559293270111, "step": 2913 }, { "epoch": 0.4865676622726514, "grad_norm": 28.153684616088867, "learning_rate": 1.5134323377273487e-05, "logits/chosen": -0.9490035176277161, "logits/rejected": -0.9700357913970947, "logps/chosen": -45.13168716430664, "logps/rejected": -102.9761734008789, "loss": 0.8196, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0604171752929688, "rewards/margins": -0.26869869232177734, "rewards/rejected": -0.7917184233665466, "step": 2916 }, { "epoch": 0.48706824628733525, "grad_norm": 7.835396766662598, "learning_rate": 1.5129317537126649e-05, "logits/chosen": -0.7143896222114563, "logits/rejected": -0.7249460816383362, "logps/chosen": -70.34468078613281, "logps/rejected": -116.59355926513672, "loss": 0.2539, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.33507704734802246, "rewards/margins": 0.8457788825035095, "rewards/rejected": -0.5107017755508423, "step": 2919 }, { "epoch": 0.48756883030201903, "grad_norm": 30.212295532226562, "learning_rate": 1.5124311696979812e-05, "logits/chosen": -0.8620805144309998, "logits/rejected": -0.9041265845298767, "logps/chosen": -125.20287322998047, "logps/rejected": -124.71990203857422, "loss": 0.3886, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8404842019081116, "rewards/margins": 0.14771921932697296, "rewards/rejected": -0.9882034659385681, "step": 2922 }, { "epoch": 0.4880694143167028, "grad_norm": 25.702714920043945, "learning_rate": 1.5119305856832973e-05, "logits/chosen": -0.7266817688941956, "logits/rejected": -0.7873652577400208, "logps/chosen": -62.9418830871582, "logps/rejected": -140.81336975097656, "loss": 0.4177, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.22064249217510223, "rewards/margins": 1.4382553100585938, "rewards/rejected": -1.2176127433776855, "step": 2925 }, { "epoch": 0.4885699983313866, "grad_norm": 46.09981918334961, "learning_rate": 1.5114300016686134e-05, "logits/chosen": -0.8380014300346375, "logits/rejected": -0.8461852073669434, "logps/chosen": -79.42833709716797, "logps/rejected": -133.5630645751953, "loss": 0.8507, "rewards/accuracies": 1.0, "rewards/chosen": -0.5153858661651611, "rewards/margins": 2.9575138092041016, "rewards/rejected": -3.4728996753692627, "step": 2928 }, { "epoch": 0.4890705823460704, "grad_norm": 31.6950626373291, "learning_rate": 1.5109294176539298e-05, "logits/chosen": -0.925179660320282, "logits/rejected": -0.8934626579284668, "logps/chosen": -82.91791534423828, "logps/rejected": -76.66759490966797, "loss": 0.7044, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5295631885528564, "rewards/margins": 0.204776331782341, "rewards/rejected": -0.7343394756317139, "step": 2931 }, { "epoch": 0.4895711663607542, "grad_norm": 26.28089141845703, "learning_rate": 1.5104288336392458e-05, "logits/chosen": -0.6862001419067383, "logits/rejected": -0.6757583618164062, "logps/chosen": -69.26172637939453, "logps/rejected": -68.7400131225586, "loss": 0.6018, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.04061758518219, "rewards/margins": 0.49155306816101074, "rewards/rejected": -1.5321704149246216, "step": 2934 }, { "epoch": 0.490071750375438, "grad_norm": 30.112545013427734, "learning_rate": 1.5099282496245621e-05, "logits/chosen": -0.8488226532936096, "logits/rejected": -0.8929168581962585, "logps/chosen": -66.80816650390625, "logps/rejected": -87.72855377197266, "loss": 0.6263, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.13291041553020477, "rewards/margins": -0.01855696178972721, "rewards/rejected": -0.11435344815254211, "step": 2937 }, { "epoch": 0.4905723343901218, "grad_norm": 9.31851577758789, "learning_rate": 1.5094276656098783e-05, "logits/chosen": -0.841620147228241, "logits/rejected": -0.8447019457817078, "logps/chosen": -75.01099395751953, "logps/rejected": -99.29256439208984, "loss": 0.3968, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.39573606848716736, "rewards/margins": 1.4825488328933716, "rewards/rejected": -1.8782848119735718, "step": 2940 }, { "epoch": 0.4910729184048056, "grad_norm": 29.5719051361084, "learning_rate": 1.5089270815951943e-05, "logits/chosen": -0.9034383893013, "logits/rejected": -0.8394926190376282, "logps/chosen": -86.93695831298828, "logps/rejected": -60.45747756958008, "loss": 0.8575, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7970803380012512, "rewards/margins": 0.16733472049236298, "rewards/rejected": -0.9644150733947754, "step": 2943 }, { "epoch": 0.4915735024194894, "grad_norm": 37.84877014160156, "learning_rate": 1.5084264975805107e-05, "logits/chosen": -0.754921019077301, "logits/rejected": -0.7352590560913086, "logps/chosen": -137.07301330566406, "logps/rejected": -147.2743682861328, "loss": 0.5403, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.012794737704098225, "rewards/margins": 1.9003796577453613, "rewards/rejected": -1.9131742715835571, "step": 2946 }, { "epoch": 0.4920740864341732, "grad_norm": 24.509626388549805, "learning_rate": 1.5079259135658269e-05, "logits/chosen": -0.7234253883361816, "logits/rejected": -0.7259066104888916, "logps/chosen": -44.48202896118164, "logps/rejected": -101.02044677734375, "loss": 0.5027, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4571392834186554, "rewards/margins": 2.676412343978882, "rewards/rejected": -2.219273090362549, "step": 2949 }, { "epoch": 0.492574670448857, "grad_norm": 21.621755599975586, "learning_rate": 1.5074253295511432e-05, "logits/chosen": -0.8658826351165771, "logits/rejected": -0.908913791179657, "logps/chosen": -70.8340072631836, "logps/rejected": -102.4978256225586, "loss": 0.3762, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11502508074045181, "rewards/margins": 1.1110228300094604, "rewards/rejected": -0.9959977269172668, "step": 2952 }, { "epoch": 0.4930752544635408, "grad_norm": 16.73298454284668, "learning_rate": 1.5069247455364592e-05, "logits/chosen": -0.7970528602600098, "logits/rejected": -0.8023661971092224, "logps/chosen": -83.6917953491211, "logps/rejected": -98.18112182617188, "loss": 0.5745, "rewards/accuracies": 1.0, "rewards/chosen": 0.04831337928771973, "rewards/margins": 2.548224687576294, "rewards/rejected": -2.499911308288574, "step": 2955 }, { "epoch": 0.4935758384782246, "grad_norm": 13.979156494140625, "learning_rate": 1.5064241615217756e-05, "logits/chosen": -0.9165036082267761, "logits/rejected": -0.8557359576225281, "logps/chosen": -119.45539093017578, "logps/rejected": -107.8179931640625, "loss": 0.5237, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5956390500068665, "rewards/margins": 0.2859601080417633, "rewards/rejected": -0.8815991878509521, "step": 2958 }, { "epoch": 0.4940764224929084, "grad_norm": 35.66896438598633, "learning_rate": 1.5059235775070917e-05, "logits/chosen": -0.8801578879356384, "logits/rejected": -0.8805058598518372, "logps/chosen": -37.94972610473633, "logps/rejected": -56.199981689453125, "loss": 0.7888, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.45632901787757874, "rewards/margins": 1.9109283685684204, "rewards/rejected": -1.454599380493164, "step": 2961 }, { "epoch": 0.4945770065075922, "grad_norm": 30.488100051879883, "learning_rate": 1.5054229934924078e-05, "logits/chosen": -0.7019815444946289, "logits/rejected": -0.7186416983604431, "logps/chosen": -109.42037963867188, "logps/rejected": -88.5889892578125, "loss": 1.3113, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.457872152328491, "rewards/margins": -1.3493131399154663, "rewards/rejected": -1.108559012413025, "step": 2964 }, { "epoch": 0.495077590522276, "grad_norm": 8.647849082946777, "learning_rate": 1.5049224094777241e-05, "logits/chosen": -0.8939349055290222, "logits/rejected": -0.9377396702766418, "logps/chosen": -71.20759582519531, "logps/rejected": -92.42699432373047, "loss": 0.4006, "rewards/accuracies": 1.0, "rewards/chosen": -0.3354068696498871, "rewards/margins": 1.151017189025879, "rewards/rejected": -1.4864240884780884, "step": 2967 }, { "epoch": 0.4955781745369598, "grad_norm": 4.125166893005371, "learning_rate": 1.5044218254630403e-05, "logits/chosen": -0.774041473865509, "logits/rejected": -0.8216684460639954, "logps/chosen": -50.25335693359375, "logps/rejected": -99.05181884765625, "loss": 0.6094, "rewards/accuracies": 1.0, "rewards/chosen": 0.7888862490653992, "rewards/margins": 0.6531074643135071, "rewards/rejected": 0.13577881455421448, "step": 2970 }, { "epoch": 0.4960787585516436, "grad_norm": 9.228715896606445, "learning_rate": 1.5039212414483566e-05, "logits/chosen": -0.8218787312507629, "logits/rejected": -0.7926279902458191, "logps/chosen": -90.25737762451172, "logps/rejected": -122.61212158203125, "loss": 0.7567, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.1362021118402481, "rewards/margins": 1.5630303621292114, "rewards/rejected": -1.4268282651901245, "step": 2973 }, { "epoch": 0.4965793425663274, "grad_norm": 16.982580184936523, "learning_rate": 1.5034206574336726e-05, "logits/chosen": -0.725121796131134, "logits/rejected": -0.7738978266716003, "logps/chosen": -34.422119140625, "logps/rejected": -111.14885711669922, "loss": 0.3398, "rewards/accuracies": 1.0, "rewards/chosen": 0.3434987962245941, "rewards/margins": 3.8765735626220703, "rewards/rejected": -3.5330746173858643, "step": 2976 }, { "epoch": 0.4970799265810112, "grad_norm": 23.69173240661621, "learning_rate": 1.502920073418989e-05, "logits/chosen": -0.6826236844062805, "logits/rejected": -0.6583579182624817, "logps/chosen": -103.95430755615234, "logps/rejected": -108.03692626953125, "loss": 1.1219, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4929055869579315, "rewards/margins": 1.2548460960388184, "rewards/rejected": -1.7477515935897827, "step": 2979 }, { "epoch": 0.49758051059569497, "grad_norm": 12.641278266906738, "learning_rate": 1.5024194894043052e-05, "logits/chosen": -0.7268473505973816, "logits/rejected": -0.7658601403236389, "logps/chosen": -92.58040618896484, "logps/rejected": -148.1041259765625, "loss": 0.3216, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7494567036628723, "rewards/margins": 2.482123613357544, "rewards/rejected": -1.7326668500900269, "step": 2982 }, { "epoch": 0.49808109461037875, "grad_norm": 51.53213882446289, "learning_rate": 1.5019189053896212e-05, "logits/chosen": -0.7181272506713867, "logits/rejected": -0.710218608379364, "logps/chosen": -93.97189331054688, "logps/rejected": -74.33004760742188, "loss": 0.7556, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2727947235107422, "rewards/margins": 0.4743661880493164, "rewards/rejected": -0.2015714794397354, "step": 2985 }, { "epoch": 0.4985816786250626, "grad_norm": 5.322873592376709, "learning_rate": 1.5014183213749375e-05, "logits/chosen": -0.5468127131462097, "logits/rejected": -0.5354074835777283, "logps/chosen": -111.9712142944336, "logps/rejected": -130.7187957763672, "loss": 0.5476, "rewards/accuracies": 1.0, "rewards/chosen": -1.1850318908691406, "rewards/margins": 1.6530238389968872, "rewards/rejected": -2.8380558490753174, "step": 2988 }, { "epoch": 0.49908226263974637, "grad_norm": 63.694847106933594, "learning_rate": 1.5009177373602537e-05, "logits/chosen": -0.7925235629081726, "logits/rejected": -0.7918193340301514, "logps/chosen": -55.19487380981445, "logps/rejected": -146.57403564453125, "loss": 0.5027, "rewards/accuracies": 1.0, "rewards/chosen": 0.20608697831630707, "rewards/margins": 4.8327460289001465, "rewards/rejected": -4.626659393310547, "step": 2991 }, { "epoch": 0.49958284665443015, "grad_norm": 8.761918067932129, "learning_rate": 1.50041715334557e-05, "logits/chosen": -0.6284201145172119, "logits/rejected": -0.6818550229072571, "logps/chosen": -50.14271545410156, "logps/rejected": -98.94503021240234, "loss": 0.3222, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.1630744934082031, "rewards/margins": 1.7878013849258423, "rewards/rejected": -0.6247269511222839, "step": 2994 }, { "epoch": 0.500083430669114, "grad_norm": 18.53858184814453, "learning_rate": 1.499916569330886e-05, "logits/chosen": -0.7857670783996582, "logits/rejected": -0.7779921889305115, "logps/chosen": -64.87152099609375, "logps/rejected": -68.7968978881836, "loss": 0.4732, "rewards/accuracies": 1.0, "rewards/chosen": 0.18253906071186066, "rewards/margins": 1.2159250974655151, "rewards/rejected": -1.033385992050171, "step": 2997 }, { "epoch": 0.5005840146837978, "grad_norm": 13.3838472366333, "learning_rate": 1.4994159853162022e-05, "logits/chosen": -0.6625843644142151, "logits/rejected": -0.6785734295845032, "logps/chosen": -59.50835037231445, "logps/rejected": -99.86527252197266, "loss": 0.3584, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.06372509151697159, "rewards/margins": 2.144791603088379, "rewards/rejected": -2.081066370010376, "step": 3000 }, { "epoch": 0.5005840146837978, "eval_logits/chosen": -0.7768681049346924, "eval_logits/rejected": -0.7892848253250122, "eval_logps/chosen": -83.7724838256836, "eval_logps/rejected": -109.46464538574219, "eval_loss": 0.5952873826026917, "eval_rewards/accuracies": 0.717717707157135, "eval_rewards/chosen": -0.3514752686023712, "eval_rewards/margins": 1.2821705341339111, "eval_rewards/rejected": -1.6336455345153809, "eval_runtime": 356.3393, "eval_samples_per_second": 7.476, "eval_steps_per_second": 1.869, "step": 3000 }, { "epoch": 0.5010845986984815, "grad_norm": 29.523157119750977, "learning_rate": 1.4989154013015186e-05, "logits/chosen": -0.8264413475990295, "logits/rejected": -0.8549191355705261, "logps/chosen": -73.0362319946289, "logps/rejected": -122.3296890258789, "loss": 0.6497, "rewards/accuracies": 1.0, "rewards/chosen": 0.23366482555866241, "rewards/margins": 1.4929901361465454, "rewards/rejected": -1.259325385093689, "step": 3003 }, { "epoch": 0.5015851827131653, "grad_norm": 34.1034049987793, "learning_rate": 1.4984148172868346e-05, "logits/chosen": -0.7501159310340881, "logits/rejected": -0.7864617705345154, "logps/chosen": -86.53108978271484, "logps/rejected": -100.97711181640625, "loss": 0.6149, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.23684047162532806, "rewards/margins": 2.6586368083953857, "rewards/rejected": -2.895477056503296, "step": 3006 }, { "epoch": 0.5020857667278491, "grad_norm": 4.041179656982422, "learning_rate": 1.497914233272151e-05, "logits/chosen": -0.9666715264320374, "logits/rejected": -0.9467248916625977, "logps/chosen": -99.4131851196289, "logps/rejected": -123.0580062866211, "loss": 0.3417, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3169695138931274, "rewards/margins": 1.6037545204162598, "rewards/rejected": -2.9207241535186768, "step": 3009 }, { "epoch": 0.502586350742533, "grad_norm": 4.475608825683594, "learning_rate": 1.4974136492574671e-05, "logits/chosen": -0.8213872313499451, "logits/rejected": -0.7435408234596252, "logps/chosen": -125.1511001586914, "logps/rejected": -89.6834487915039, "loss": 0.2294, "rewards/accuracies": 1.0, "rewards/chosen": 0.23158125579357147, "rewards/margins": 1.0385853052139282, "rewards/rejected": -0.8070039749145508, "step": 3012 }, { "epoch": 0.5030869347572168, "grad_norm": 16.609994888305664, "learning_rate": 1.4969130652427835e-05, "logits/chosen": -0.6731836795806885, "logits/rejected": -0.724564254283905, "logps/chosen": -57.64155197143555, "logps/rejected": -100.23099517822266, "loss": 0.363, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31521162390708923, "rewards/margins": 1.0124822854995728, "rewards/rejected": -1.3276939392089844, "step": 3015 }, { "epoch": 0.5035875187719006, "grad_norm": 22.04433822631836, "learning_rate": 1.4964124812280995e-05, "logits/chosen": -0.821040153503418, "logits/rejected": -0.8526749610900879, "logps/chosen": -64.13628387451172, "logps/rejected": -124.0272216796875, "loss": 0.6266, "rewards/accuracies": 1.0, "rewards/chosen": -0.7504796385765076, "rewards/margins": 1.4622901678085327, "rewards/rejected": -2.2127699851989746, "step": 3018 }, { "epoch": 0.5040881027865843, "grad_norm": 9.184896469116211, "learning_rate": 1.4959118972134157e-05, "logits/chosen": -0.892423152923584, "logits/rejected": -0.910060703754425, "logps/chosen": -77.1310043334961, "logps/rejected": -139.88323974609375, "loss": 0.8375, "rewards/accuracies": 1.0, "rewards/chosen": -0.23682798445224762, "rewards/margins": 0.9679047465324402, "rewards/rejected": -1.2047327756881714, "step": 3021 }, { "epoch": 0.5045886868012681, "grad_norm": 23.634803771972656, "learning_rate": 1.495411313198732e-05, "logits/chosen": -0.6802919507026672, "logits/rejected": -0.6966525912284851, "logps/chosen": -69.66754150390625, "logps/rejected": -91.34522247314453, "loss": 0.7755, "rewards/accuracies": 1.0, "rewards/chosen": -0.3590194880962372, "rewards/margins": 1.437912106513977, "rewards/rejected": -1.7969317436218262, "step": 3024 }, { "epoch": 0.5050892708159519, "grad_norm": 14.135655403137207, "learning_rate": 1.494910729184048e-05, "logits/chosen": -0.7265394330024719, "logits/rejected": -0.7542855739593506, "logps/chosen": -121.1778793334961, "logps/rejected": -96.1335220336914, "loss": 0.2826, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.36360371112823486, "rewards/margins": 0.8179718852043152, "rewards/rejected": -1.1815756559371948, "step": 3027 }, { "epoch": 0.5055898548306358, "grad_norm": 17.949268341064453, "learning_rate": 1.4944101451693644e-05, "logits/chosen": -0.7595747113227844, "logits/rejected": -0.7338398098945618, "logps/chosen": -104.89896392822266, "logps/rejected": -94.16445922851562, "loss": 0.439, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6282821297645569, "rewards/margins": 0.17316298186779022, "rewards/rejected": -0.8014451861381531, "step": 3030 }, { "epoch": 0.5060904388453196, "grad_norm": 32.11861038208008, "learning_rate": 1.4939095611546806e-05, "logits/chosen": -0.6729629635810852, "logits/rejected": -0.7345001697540283, "logps/chosen": -79.91930389404297, "logps/rejected": -110.69539642333984, "loss": 0.6071, "rewards/accuracies": 1.0, "rewards/chosen": 0.20798440277576447, "rewards/margins": 1.5176162719726562, "rewards/rejected": -1.3096319437026978, "step": 3033 }, { "epoch": 0.5065910228600033, "grad_norm": 10.196094512939453, "learning_rate": 1.4934089771399969e-05, "logits/chosen": -0.7081282734870911, "logits/rejected": -0.7349131107330322, "logps/chosen": -74.09227752685547, "logps/rejected": -172.3841552734375, "loss": 0.6306, "rewards/accuracies": 1.0, "rewards/chosen": -1.2477744817733765, "rewards/margins": 2.9719479084014893, "rewards/rejected": -4.219722747802734, "step": 3036 }, { "epoch": 0.5070916068746871, "grad_norm": 19.41449546813965, "learning_rate": 1.492908393125313e-05, "logits/chosen": -0.7061858177185059, "logits/rejected": -0.7585189342498779, "logps/chosen": -43.11826705932617, "logps/rejected": -103.46875762939453, "loss": 0.3165, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.022109827026724815, "rewards/margins": 1.3355776071548462, "rewards/rejected": -1.3576873540878296, "step": 3039 }, { "epoch": 0.5075921908893709, "grad_norm": 20.513233184814453, "learning_rate": 1.4924078091106291e-05, "logits/chosen": -0.5743568539619446, "logits/rejected": -0.6198858022689819, "logps/chosen": -60.20096969604492, "logps/rejected": -145.0818328857422, "loss": 0.5972, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.09798828512430191, "rewards/margins": 2.213329315185547, "rewards/rejected": -2.1153409481048584, "step": 3042 }, { "epoch": 0.5080927749040547, "grad_norm": 23.612152099609375, "learning_rate": 1.4919072250959454e-05, "logits/chosen": -0.8122243881225586, "logits/rejected": -0.8136575818061829, "logps/chosen": -73.4026107788086, "logps/rejected": -104.020751953125, "loss": 0.8906, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4526686370372772, "rewards/margins": 1.8211561441421509, "rewards/rejected": -2.27382493019104, "step": 3045 }, { "epoch": 0.5085933589187386, "grad_norm": 7.944555282592773, "learning_rate": 1.4914066410812615e-05, "logits/chosen": -0.9626657366752625, "logits/rejected": -0.9736630916595459, "logps/chosen": -67.77690124511719, "logps/rejected": -92.57103729248047, "loss": 0.3475, "rewards/accuracies": 1.0, "rewards/chosen": -0.3198021650314331, "rewards/margins": 1.3214935064315796, "rewards/rejected": -1.6412957906723022, "step": 3048 }, { "epoch": 0.5090939429334224, "grad_norm": 30.99298095703125, "learning_rate": 1.4909060570665778e-05, "logits/chosen": -0.901219367980957, "logits/rejected": -0.8833796381950378, "logps/chosen": -69.03845977783203, "logps/rejected": -63.71663284301758, "loss": 1.0012, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 4.627804082701914e-05, "rewards/margins": 0.4493512213230133, "rewards/rejected": -0.4493049383163452, "step": 3051 }, { "epoch": 0.5095945269481061, "grad_norm": 18.53537368774414, "learning_rate": 1.490405473051894e-05, "logits/chosen": -0.8573608994483948, "logits/rejected": -0.8595223426818848, "logps/chosen": -67.8155746459961, "logps/rejected": -100.1515884399414, "loss": 0.6855, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4390697777271271, "rewards/margins": 2.650223970413208, "rewards/rejected": -3.0892937183380127, "step": 3054 }, { "epoch": 0.5100951109627899, "grad_norm": 14.800320625305176, "learning_rate": 1.4899048890372103e-05, "logits/chosen": -0.7180169224739075, "logits/rejected": -0.800508975982666, "logps/chosen": -39.38485336303711, "logps/rejected": -99.91046905517578, "loss": 0.3511, "rewards/accuracies": 1.0, "rewards/chosen": 0.24518917500972748, "rewards/margins": 1.1085700988769531, "rewards/rejected": -0.8633809089660645, "step": 3057 }, { "epoch": 0.5105956949774737, "grad_norm": 39.690093994140625, "learning_rate": 1.4894043050225263e-05, "logits/chosen": -0.8565661311149597, "logits/rejected": -0.8130610585212708, "logps/chosen": -98.89647674560547, "logps/rejected": -86.37361907958984, "loss": 0.5287, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7348141074180603, "rewards/margins": 0.12433993816375732, "rewards/rejected": -0.8591540455818176, "step": 3060 }, { "epoch": 0.5110962789921575, "grad_norm": 7.014929294586182, "learning_rate": 1.4889037210078425e-05, "logits/chosen": -0.7259573936462402, "logits/rejected": -0.7860493063926697, "logps/chosen": -57.7655143737793, "logps/rejected": -138.21913146972656, "loss": 0.2222, "rewards/accuracies": 1.0, "rewards/chosen": -0.14774726331233978, "rewards/margins": 3.463996648788452, "rewards/rejected": -3.611743688583374, "step": 3063 }, { "epoch": 0.5115968630068413, "grad_norm": 9.853633880615234, "learning_rate": 1.4884031369931589e-05, "logits/chosen": -0.8596248030662537, "logits/rejected": -0.8468735814094543, "logps/chosen": -79.73009490966797, "logps/rejected": -78.58705139160156, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": -1.0723432302474976, "rewards/margins": 1.7438079118728638, "rewards/rejected": -2.8161513805389404, "step": 3066 }, { "epoch": 0.5120974470215252, "grad_norm": 32.57115936279297, "learning_rate": 1.4879025529784749e-05, "logits/chosen": -0.8591715693473816, "logits/rejected": -0.8143830895423889, "logps/chosen": -81.67694854736328, "logps/rejected": -97.81034088134766, "loss": 0.7366, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.36336827278137207, "rewards/margins": 1.3810038566589355, "rewards/rejected": -1.744372010231018, "step": 3069 }, { "epoch": 0.5125980310362089, "grad_norm": 12.614112854003906, "learning_rate": 1.4874019689637912e-05, "logits/chosen": -0.8624733090400696, "logits/rejected": -0.885750949382782, "logps/chosen": -67.99788665771484, "logps/rejected": -96.68588256835938, "loss": 0.3018, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7814987301826477, "rewards/margins": 1.0334302186965942, "rewards/rejected": -1.8149290084838867, "step": 3072 }, { "epoch": 0.5130986150508927, "grad_norm": 38.927207946777344, "learning_rate": 1.4869013849491074e-05, "logits/chosen": -0.7910595536231995, "logits/rejected": -0.7657594084739685, "logps/chosen": -99.8828353881836, "logps/rejected": -71.14226531982422, "loss": 1.0293, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2201353311538696, "rewards/margins": -0.6872128844261169, "rewards/rejected": -0.5329225063323975, "step": 3075 }, { "epoch": 0.5135991990655765, "grad_norm": 31.786840438842773, "learning_rate": 1.4864008009344236e-05, "logits/chosen": -0.6748080253601074, "logits/rejected": -0.6987131237983704, "logps/chosen": -107.67208099365234, "logps/rejected": -153.12362670898438, "loss": 0.6258, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.681719183921814, "rewards/margins": 2.203324794769287, "rewards/rejected": -3.8850440979003906, "step": 3078 }, { "epoch": 0.5140997830802603, "grad_norm": 40.356868743896484, "learning_rate": 1.4859002169197398e-05, "logits/chosen": -0.8646979928016663, "logits/rejected": -0.8672283291816711, "logps/chosen": -116.90027618408203, "logps/rejected": -137.59178161621094, "loss": 0.4194, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.331730604171753, "rewards/margins": 2.7958881855010986, "rewards/rejected": -5.127618789672852, "step": 3081 }, { "epoch": 0.5146003670949441, "grad_norm": 26.086589813232422, "learning_rate": 1.485399632905056e-05, "logits/chosen": -0.9351145625114441, "logits/rejected": -0.964604914188385, "logps/chosen": -83.65088653564453, "logps/rejected": -154.7938995361328, "loss": 0.673, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1260650157928467, "rewards/margins": 0.40245750546455383, "rewards/rejected": -1.5285224914550781, "step": 3084 }, { "epoch": 0.515100951109628, "grad_norm": 13.664995193481445, "learning_rate": 1.4848990488903723e-05, "logits/chosen": -0.8030877113342285, "logits/rejected": -0.7844834923744202, "logps/chosen": -96.58386993408203, "logps/rejected": -83.82711791992188, "loss": 0.4337, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5319676995277405, "rewards/margins": 0.8565422892570496, "rewards/rejected": -1.3885098695755005, "step": 3087 }, { "epoch": 0.5156015351243117, "grad_norm": 24.245941162109375, "learning_rate": 1.4843984648756883e-05, "logits/chosen": -0.795964241027832, "logits/rejected": -0.8144209384918213, "logps/chosen": -65.11994171142578, "logps/rejected": -84.34357452392578, "loss": 0.4149, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.362298607826233, "rewards/margins": 0.07917197793722153, "rewards/rejected": -1.4414706230163574, "step": 3090 }, { "epoch": 0.5161021191389955, "grad_norm": 14.902860641479492, "learning_rate": 1.4838978808610047e-05, "logits/chosen": -0.7166889309883118, "logits/rejected": -0.7345682978630066, "logps/chosen": -70.41544342041016, "logps/rejected": -105.4947280883789, "loss": 0.3393, "rewards/accuracies": 1.0, "rewards/chosen": -0.2487993985414505, "rewards/margins": 1.517221450805664, "rewards/rejected": -1.7660207748413086, "step": 3093 }, { "epoch": 0.5166027031536793, "grad_norm": 17.803180694580078, "learning_rate": 1.4833972968463208e-05, "logits/chosen": -0.9900873303413391, "logits/rejected": -0.9457724094390869, "logps/chosen": -128.92108154296875, "logps/rejected": -108.39252471923828, "loss": 0.8139, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.8983917236328125, "rewards/margins": 0.9857928156852722, "rewards/rejected": -2.8841845989227295, "step": 3096 }, { "epoch": 0.5171032871683631, "grad_norm": 22.998367309570312, "learning_rate": 1.482896712831637e-05, "logits/chosen": -0.8178451061248779, "logits/rejected": -0.937497615814209, "logps/chosen": -97.37606048583984, "logps/rejected": -212.55955505371094, "loss": 0.6165, "rewards/accuracies": 1.0, "rewards/chosen": -0.32304802536964417, "rewards/margins": 1.705104947090149, "rewards/rejected": -2.02815318107605, "step": 3099 }, { "epoch": 0.5176038711830468, "grad_norm": 31.58111000061035, "learning_rate": 1.4823961288169532e-05, "logits/chosen": -0.6684864163398743, "logits/rejected": -0.7630548477172852, "logps/chosen": -81.69884490966797, "logps/rejected": -139.29428100585938, "loss": 0.6229, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.038661241531372, "rewards/margins": 1.6773700714111328, "rewards/rejected": -3.716031312942505, "step": 3102 }, { "epoch": 0.5181044551977307, "grad_norm": 8.631143569946289, "learning_rate": 1.4818955448022694e-05, "logits/chosen": -0.7445980906486511, "logits/rejected": -0.7623069882392883, "logps/chosen": -98.88903045654297, "logps/rejected": -150.6060333251953, "loss": 0.4637, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3795889616012573, "rewards/margins": 0.4093039929866791, "rewards/rejected": -1.7888928651809692, "step": 3105 }, { "epoch": 0.5186050392124145, "grad_norm": 9.912506103515625, "learning_rate": 1.4813949607875857e-05, "logits/chosen": -0.6770620346069336, "logits/rejected": -0.7297813892364502, "logps/chosen": -69.76258087158203, "logps/rejected": -90.83287811279297, "loss": 0.276, "rewards/accuracies": 1.0, "rewards/chosen": 0.05102347210049629, "rewards/margins": 0.40725621581077576, "rewards/rejected": -0.35623273253440857, "step": 3108 }, { "epoch": 0.5191056232270983, "grad_norm": 13.837552070617676, "learning_rate": 1.4808943767729017e-05, "logits/chosen": -0.7313243746757507, "logits/rejected": -0.7204907536506653, "logps/chosen": -57.847259521484375, "logps/rejected": -73.60916900634766, "loss": 0.5895, "rewards/accuracies": 1.0, "rewards/chosen": 0.3657896816730499, "rewards/margins": 1.6066423654556274, "rewards/rejected": -1.2408527135849, "step": 3111 }, { "epoch": 0.5196062072417821, "grad_norm": 26.114126205444336, "learning_rate": 1.4803937927582181e-05, "logits/chosen": -0.761305034160614, "logits/rejected": -0.8554099202156067, "logps/chosen": -68.40536499023438, "logps/rejected": -123.33304595947266, "loss": 0.5377, "rewards/accuracies": 1.0, "rewards/chosen": -2.0342118740081787, "rewards/margins": 2.7785491943359375, "rewards/rejected": -4.812760829925537, "step": 3114 }, { "epoch": 0.5201067912564659, "grad_norm": 28.501220703125, "learning_rate": 1.4798932087435343e-05, "logits/chosen": -0.8040450215339661, "logits/rejected": -0.8203606605529785, "logps/chosen": -88.6622543334961, "logps/rejected": -90.05509185791016, "loss": 0.764, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.5361708402633667, "rewards/margins": 1.0375477075576782, "rewards/rejected": -0.5013768672943115, "step": 3117 }, { "epoch": 0.5206073752711496, "grad_norm": 36.36269760131836, "learning_rate": 1.4793926247288504e-05, "logits/chosen": -0.6472165584564209, "logits/rejected": -0.6333358287811279, "logps/chosen": -128.9477081298828, "logps/rejected": -157.25010681152344, "loss": 0.3221, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1474655866622925, "rewards/margins": 0.3479050099849701, "rewards/rejected": -1.4953705072402954, "step": 3120 }, { "epoch": 0.5211079592858334, "grad_norm": 30.2298641204834, "learning_rate": 1.4788920407141666e-05, "logits/chosen": -0.8707091212272644, "logits/rejected": -0.8807253837585449, "logps/chosen": -82.25154113769531, "logps/rejected": -89.14557647705078, "loss": 0.6622, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0733895301818848, "rewards/margins": 1.249947190284729, "rewards/rejected": -2.3233368396759033, "step": 3123 }, { "epoch": 0.5216085433005173, "grad_norm": 36.86031723022461, "learning_rate": 1.4783914566994828e-05, "logits/chosen": -0.8833990097045898, "logits/rejected": -0.8569948077201843, "logps/chosen": -94.1294937133789, "logps/rejected": -110.0931167602539, "loss": 0.4219, "rewards/accuracies": 1.0, "rewards/chosen": -0.32600274682044983, "rewards/margins": 2.1320817470550537, "rewards/rejected": -2.4580843448638916, "step": 3126 }, { "epoch": 0.5221091273152011, "grad_norm": 23.241519927978516, "learning_rate": 1.4778908726847992e-05, "logits/chosen": -0.6827948093414307, "logits/rejected": -0.729672372341156, "logps/chosen": -101.14517211914062, "logps/rejected": -115.90192413330078, "loss": 0.5086, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.20848824083805084, "rewards/margins": 1.0037589073181152, "rewards/rejected": -0.7952706813812256, "step": 3129 }, { "epoch": 0.5226097113298849, "grad_norm": 14.830224990844727, "learning_rate": 1.4773902886701152e-05, "logits/chosen": -0.7665613293647766, "logits/rejected": -0.7767810821533203, "logps/chosen": -82.41666412353516, "logps/rejected": -96.50164031982422, "loss": 0.5594, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.31192803382873535, "rewards/margins": 1.1792720556259155, "rewards/rejected": -1.4912000894546509, "step": 3132 }, { "epoch": 0.5231102953445687, "grad_norm": 11.988255500793457, "learning_rate": 1.4768897046554313e-05, "logits/chosen": -0.7454667091369629, "logits/rejected": -0.7171964645385742, "logps/chosen": -95.586181640625, "logps/rejected": -130.8968963623047, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": -0.3101907968521118, "rewards/margins": 2.4980804920196533, "rewards/rejected": -2.8082714080810547, "step": 3135 }, { "epoch": 0.5236108793592524, "grad_norm": 13.836894989013672, "learning_rate": 1.4763891206407477e-05, "logits/chosen": -0.8182246088981628, "logits/rejected": -0.8796844482421875, "logps/chosen": -75.51154327392578, "logps/rejected": -136.2978057861328, "loss": 0.4683, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7540420889854431, "rewards/margins": 2.5276057720184326, "rewards/rejected": -3.2816476821899414, "step": 3138 }, { "epoch": 0.5241114633739362, "grad_norm": 12.847635269165039, "learning_rate": 1.4758885366260639e-05, "logits/chosen": -0.7824644446372986, "logits/rejected": -0.7616406083106995, "logps/chosen": -93.13504791259766, "logps/rejected": -114.3350601196289, "loss": 0.4817, "rewards/accuracies": 1.0, "rewards/chosen": -1.7510771751403809, "rewards/margins": 1.5540199279785156, "rewards/rejected": -3.3050973415374756, "step": 3141 }, { "epoch": 0.5246120473886201, "grad_norm": 12.881179809570312, "learning_rate": 1.47538795261138e-05, "logits/chosen": -1.0210881233215332, "logits/rejected": -0.9896255135536194, "logps/chosen": -71.73892974853516, "logps/rejected": -96.8170166015625, "loss": 0.3055, "rewards/accuracies": 1.0, "rewards/chosen": -0.1597510129213333, "rewards/margins": 1.814528465270996, "rewards/rejected": -1.974279522895813, "step": 3144 }, { "epoch": 0.5251126314033039, "grad_norm": 7.553984642028809, "learning_rate": 1.4748873685966962e-05, "logits/chosen": -0.8608855605125427, "logits/rejected": -0.8822317719459534, "logps/chosen": -136.47303771972656, "logps/rejected": -176.33331298828125, "loss": 0.6079, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.39323353767395, "rewards/margins": 0.4070257246494293, "rewards/rejected": -3.8002593517303467, "step": 3147 }, { "epoch": 0.5256132154179877, "grad_norm": 4.123138427734375, "learning_rate": 1.4743867845820126e-05, "logits/chosen": -0.754559338092804, "logits/rejected": -0.804770290851593, "logps/chosen": -57.96575927734375, "logps/rejected": -128.4701385498047, "loss": 0.422, "rewards/accuracies": 1.0, "rewards/chosen": 0.31988295912742615, "rewards/margins": 3.3947553634643555, "rewards/rejected": -3.0748722553253174, "step": 3150 }, { "epoch": 0.5261137994326714, "grad_norm": 23.902780532836914, "learning_rate": 1.4738862005673286e-05, "logits/chosen": -0.7206723093986511, "logits/rejected": -0.7218577861785889, "logps/chosen": -127.77734375, "logps/rejected": -121.89579010009766, "loss": 0.4921, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.46109792590141296, "rewards/margins": 1.4769293069839478, "rewards/rejected": -1.938027262687683, "step": 3153 }, { "epoch": 0.5266143834473552, "grad_norm": 9.370648384094238, "learning_rate": 1.4733856165526448e-05, "logits/chosen": -0.7007098197937012, "logits/rejected": -0.7294866442680359, "logps/chosen": -85.28124237060547, "logps/rejected": -109.07991790771484, "loss": 0.1984, "rewards/accuracies": 1.0, "rewards/chosen": -0.141581192612648, "rewards/margins": 2.03961443901062, "rewards/rejected": -2.1811954975128174, "step": 3156 }, { "epoch": 0.527114967462039, "grad_norm": 8.776870727539062, "learning_rate": 1.4728850325379611e-05, "logits/chosen": -0.7201007008552551, "logits/rejected": -0.8632190227508545, "logps/chosen": -34.58652877807617, "logps/rejected": -137.3767547607422, "loss": 0.1795, "rewards/accuracies": 1.0, "rewards/chosen": -0.7540562748908997, "rewards/margins": 2.879206895828247, "rewards/rejected": -3.633263349533081, "step": 3159 }, { "epoch": 0.5276155514767228, "grad_norm": 25.56012535095215, "learning_rate": 1.4723844485232773e-05, "logits/chosen": -0.9395363330841064, "logits/rejected": -0.9575562477111816, "logps/chosen": -104.5757064819336, "logps/rejected": -108.35994720458984, "loss": 0.9323, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.919922113418579, "rewards/margins": -1.9651994705200195, "rewards/rejected": -0.9547227025032043, "step": 3162 }, { "epoch": 0.5281161354914067, "grad_norm": 9.666987419128418, "learning_rate": 1.4718838645085935e-05, "logits/chosen": -0.846923828125, "logits/rejected": -0.9006827473640442, "logps/chosen": -93.711669921875, "logps/rejected": -181.817138671875, "loss": 0.3583, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4508826434612274, "rewards/margins": 3.3591768741607666, "rewards/rejected": -3.8100595474243164, "step": 3165 }, { "epoch": 0.5286167195060905, "grad_norm": 13.635276794433594, "learning_rate": 1.4713832804939097e-05, "logits/chosen": -0.8988495469093323, "logits/rejected": -0.9118955731391907, "logps/chosen": -47.99263381958008, "logps/rejected": -63.2769889831543, "loss": 0.6047, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.18440063297748566, "rewards/margins": 0.9092404246330261, "rewards/rejected": -0.7248398661613464, "step": 3168 }, { "epoch": 0.5291173035207742, "grad_norm": 16.593854904174805, "learning_rate": 1.470882696479226e-05, "logits/chosen": -0.8380258679389954, "logits/rejected": -0.8841469287872314, "logps/chosen": -83.54776763916016, "logps/rejected": -129.80093383789062, "loss": 0.7879, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.319077491760254, "rewards/margins": 1.1344963312149048, "rewards/rejected": -4.453573703765869, "step": 3171 }, { "epoch": 0.529617887535458, "grad_norm": 19.364423751831055, "learning_rate": 1.470382112464542e-05, "logits/chosen": -0.9135633111000061, "logits/rejected": -0.8741409778594971, "logps/chosen": -141.86697387695312, "logps/rejected": -104.9626693725586, "loss": 0.8596, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9939451217651367, "rewards/margins": -0.5267700552940369, "rewards/rejected": -1.467175006866455, "step": 3174 }, { "epoch": 0.5301184715501418, "grad_norm": 62.77091598510742, "learning_rate": 1.4698815284498582e-05, "logits/chosen": -0.9385445713996887, "logits/rejected": -0.9710743427276611, "logps/chosen": -92.21146392822266, "logps/rejected": -170.91966247558594, "loss": 0.7602, "rewards/accuracies": 1.0, "rewards/chosen": -1.9773736000061035, "rewards/margins": 4.205937385559082, "rewards/rejected": -6.183311462402344, "step": 3177 }, { "epoch": 0.5306190555648256, "grad_norm": 25.1397705078125, "learning_rate": 1.4693809444351745e-05, "logits/chosen": -0.7604941725730896, "logits/rejected": -0.7489831447601318, "logps/chosen": -105.50244140625, "logps/rejected": -125.79793548583984, "loss": 0.4177, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.258744478225708, "rewards/margins": 2.1637380123138428, "rewards/rejected": -4.422482490539551, "step": 3180 }, { "epoch": 0.5311196395795095, "grad_norm": 13.460365295410156, "learning_rate": 1.4688803604204907e-05, "logits/chosen": -0.9213724136352539, "logits/rejected": -0.962491512298584, "logps/chosen": -45.54319381713867, "logps/rejected": -90.6131820678711, "loss": 0.279, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.05983360484242439, "rewards/margins": 0.3906426429748535, "rewards/rejected": -0.3308090269565582, "step": 3183 }, { "epoch": 0.5316202235941933, "grad_norm": 17.107406616210938, "learning_rate": 1.4683797764058069e-05, "logits/chosen": -0.7764462828636169, "logits/rejected": -0.8204761147499084, "logps/chosen": -44.03559875488281, "logps/rejected": -70.12059783935547, "loss": 0.66, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7761226296424866, "rewards/margins": 0.41920244693756104, "rewards/rejected": -1.1953250169754028, "step": 3186 }, { "epoch": 0.532120807608877, "grad_norm": 32.107810974121094, "learning_rate": 1.467879192391123e-05, "logits/chosen": -0.9648329615592957, "logits/rejected": -0.9633706212043762, "logps/chosen": -60.06460952758789, "logps/rejected": -63.35361099243164, "loss": 0.6169, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22184360027313232, "rewards/margins": -0.1789560168981552, "rewards/rejected": -0.0428876094520092, "step": 3189 }, { "epoch": 0.5326213916235608, "grad_norm": 19.474308013916016, "learning_rate": 1.4673786083764393e-05, "logits/chosen": -0.6133667826652527, "logits/rejected": -0.6359360814094543, "logps/chosen": -50.8448371887207, "logps/rejected": -117.21453094482422, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 0.541985034942627, "rewards/margins": 2.6385891437530518, "rewards/rejected": -2.096604108810425, "step": 3192 }, { "epoch": 0.5331219756382446, "grad_norm": 5.317022323608398, "learning_rate": 1.4668780243617554e-05, "logits/chosen": -0.9144188761711121, "logits/rejected": -0.9211263656616211, "logps/chosen": -98.35918426513672, "logps/rejected": -142.37803649902344, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": -0.44823455810546875, "rewards/margins": 6.271139144897461, "rewards/rejected": -6.71937370300293, "step": 3195 }, { "epoch": 0.5336225596529284, "grad_norm": 5.840473651885986, "learning_rate": 1.4663774403470716e-05, "logits/chosen": -0.7016770243644714, "logits/rejected": -0.7276394963264465, "logps/chosen": -74.6239242553711, "logps/rejected": -108.6460952758789, "loss": 0.3421, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1215473413467407, "rewards/margins": 2.707653760910034, "rewards/rejected": -3.8292009830474854, "step": 3198 }, { "epoch": 0.5341231436676123, "grad_norm": 27.48122787475586, "learning_rate": 1.465876856332388e-05, "logits/chosen": -0.7634568214416504, "logits/rejected": -0.7370760440826416, "logps/chosen": -136.12620544433594, "logps/rejected": -80.17362213134766, "loss": 0.9458, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7455987334251404, "rewards/margins": 0.3041188716888428, "rewards/rejected": -1.0497175455093384, "step": 3201 }, { "epoch": 0.534623727682296, "grad_norm": 32.2386589050293, "learning_rate": 1.4653762723177042e-05, "logits/chosen": -0.6898152232170105, "logits/rejected": -0.6276140809059143, "logps/chosen": -74.71526336669922, "logps/rejected": -67.73965454101562, "loss": 1.0419, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4451045989990234, "rewards/margins": -0.6099228262901306, "rewards/rejected": -0.8351817727088928, "step": 3204 }, { "epoch": 0.5351243116969798, "grad_norm": 12.319762229919434, "learning_rate": 1.4648756883030203e-05, "logits/chosen": -0.8048855662345886, "logits/rejected": -0.8528990745544434, "logps/chosen": -30.639312744140625, "logps/rejected": -130.59547424316406, "loss": 0.3243, "rewards/accuracies": 1.0, "rewards/chosen": 1.293155550956726, "rewards/margins": 3.477123260498047, "rewards/rejected": -2.1839675903320312, "step": 3207 }, { "epoch": 0.5356248957116636, "grad_norm": 4.752992153167725, "learning_rate": 1.4643751042883365e-05, "logits/chosen": -0.6399478316307068, "logits/rejected": -0.6736993789672852, "logps/chosen": -86.00081634521484, "logps/rejected": -119.72052001953125, "loss": 0.1955, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3989523649215698, "rewards/margins": 3.5910847187042236, "rewards/rejected": -4.990037441253662, "step": 3210 }, { "epoch": 0.5361254797263474, "grad_norm": 25.116037368774414, "learning_rate": 1.4638745202736527e-05, "logits/chosen": -0.8821315765380859, "logits/rejected": -0.8547492027282715, "logps/chosen": -109.13153839111328, "logps/rejected": -139.3797149658203, "loss": 0.3569, "rewards/accuracies": 1.0, "rewards/chosen": -2.409801483154297, "rewards/margins": 4.043330669403076, "rewards/rejected": -6.453132629394531, "step": 3213 }, { "epoch": 0.5366260637410312, "grad_norm": 6.487946510314941, "learning_rate": 1.4633739362589689e-05, "logits/chosen": -0.8457792401313782, "logits/rejected": -0.8345134854316711, "logps/chosen": -37.595787048339844, "logps/rejected": -44.683597564697266, "loss": 0.2261, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.4743673503398895, "rewards/margins": 1.3712502717971802, "rewards/rejected": -0.8968830108642578, "step": 3216 }, { "epoch": 0.537126647755715, "grad_norm": 5.093118667602539, "learning_rate": 1.462873352244285e-05, "logits/chosen": -0.7809565663337708, "logits/rejected": -0.7486067414283752, "logps/chosen": -87.52420806884766, "logps/rejected": -131.9034881591797, "loss": 0.8168, "rewards/accuracies": 1.0, "rewards/chosen": -0.6149711012840271, "rewards/margins": 2.6027097702026367, "rewards/rejected": -3.2176806926727295, "step": 3219 }, { "epoch": 0.5376272317703988, "grad_norm": 26.1980037689209, "learning_rate": 1.4623727682296014e-05, "logits/chosen": -0.7997323870658875, "logits/rejected": -0.8015573620796204, "logps/chosen": -57.80316162109375, "logps/rejected": -69.57965850830078, "loss": 0.4454, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31567254662513733, "rewards/margins": 1.2885595560073853, "rewards/rejected": -1.6042323112487793, "step": 3222 }, { "epoch": 0.5381278157850826, "grad_norm": 11.024362564086914, "learning_rate": 1.4618721842149176e-05, "logits/chosen": -0.9304378628730774, "logits/rejected": -0.9236631393432617, "logps/chosen": -95.06905364990234, "logps/rejected": -142.2586212158203, "loss": 0.2029, "rewards/accuracies": 1.0, "rewards/chosen": -0.7724886536598206, "rewards/margins": 5.338181018829346, "rewards/rejected": -6.11067008972168, "step": 3225 }, { "epoch": 0.5386283997997664, "grad_norm": 25.089923858642578, "learning_rate": 1.4613716002002338e-05, "logits/chosen": -0.9256951212882996, "logits/rejected": -0.8915255665779114, "logps/chosen": -98.08065032958984, "logps/rejected": -116.712646484375, "loss": 0.5964, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6040761470794678, "rewards/margins": 1.1658731698989868, "rewards/rejected": -1.7699493169784546, "step": 3228 }, { "epoch": 0.5391289838144502, "grad_norm": 37.69767379760742, "learning_rate": 1.46087101618555e-05, "logits/chosen": -0.7726389765739441, "logits/rejected": -0.7910354137420654, "logps/chosen": -84.58999633789062, "logps/rejected": -160.03619384765625, "loss": 0.516, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.07630489021539688, "rewards/margins": 2.868330955505371, "rewards/rejected": -2.7920262813568115, "step": 3231 }, { "epoch": 0.539629567829134, "grad_norm": 29.539953231811523, "learning_rate": 1.4603704321708661e-05, "logits/chosen": -0.8459076881408691, "logits/rejected": -0.8426669239997864, "logps/chosen": -104.59590911865234, "logps/rejected": -150.90870666503906, "loss": 0.4571, "rewards/accuracies": 1.0, "rewards/chosen": -0.26662901043891907, "rewards/margins": 2.406120777130127, "rewards/rejected": -2.6727497577667236, "step": 3234 }, { "epoch": 0.5401301518438177, "grad_norm": 15.438255310058594, "learning_rate": 1.4598698481561823e-05, "logits/chosen": -0.8126518130302429, "logits/rejected": -0.8379890322685242, "logps/chosen": -107.53173828125, "logps/rejected": -193.115234375, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": -0.8687819838523865, "rewards/margins": 3.3143997192382812, "rewards/rejected": -4.1831817626953125, "step": 3237 }, { "epoch": 0.5406307358585016, "grad_norm": 45.10163879394531, "learning_rate": 1.4593692641414985e-05, "logits/chosen": -0.5824782848358154, "logits/rejected": -0.7006898522377014, "logps/chosen": -56.821773529052734, "logps/rejected": -189.42515563964844, "loss": 0.7624, "rewards/accuracies": 1.0, "rewards/chosen": 0.11003431677818298, "rewards/margins": 3.1281402111053467, "rewards/rejected": -3.0181057453155518, "step": 3240 }, { "epoch": 0.5411313198731854, "grad_norm": 2.220844030380249, "learning_rate": 1.4588686801268148e-05, "logits/chosen": -0.765690803527832, "logits/rejected": -0.753459632396698, "logps/chosen": -89.89563751220703, "logps/rejected": -87.8810806274414, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": -0.4539321959018707, "rewards/margins": 2.345275402069092, "rewards/rejected": -2.7992076873779297, "step": 3243 }, { "epoch": 0.5416319038878692, "grad_norm": 15.707186698913574, "learning_rate": 1.458368096112131e-05, "logits/chosen": -0.8816020488739014, "logits/rejected": -0.8428109288215637, "logps/chosen": -91.40079498291016, "logps/rejected": -134.76438903808594, "loss": 0.2512, "rewards/accuracies": 1.0, "rewards/chosen": -0.38301682472229004, "rewards/margins": 3.4029481410980225, "rewards/rejected": -3.7859649658203125, "step": 3246 }, { "epoch": 0.542132487902553, "grad_norm": 44.39283752441406, "learning_rate": 1.457867512097447e-05, "logits/chosen": -0.8546252250671387, "logits/rejected": -0.8378713726997375, "logps/chosen": -132.5321502685547, "logps/rejected": -148.2784423828125, "loss": 0.6987, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.8114166259765625, "rewards/margins": -0.3834007680416107, "rewards/rejected": -3.428015947341919, "step": 3249 }, { "epoch": 0.5426330719172368, "grad_norm": 72.9501953125, "learning_rate": 1.4573669280827634e-05, "logits/chosen": -0.9957437515258789, "logits/rejected": -1.0111273527145386, "logps/chosen": -113.2271728515625, "logps/rejected": -159.419677734375, "loss": 0.4645, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.269171118736267, "rewards/margins": 1.2097954750061035, "rewards/rejected": -2.478966474533081, "step": 3252 }, { "epoch": 0.5431336559319205, "grad_norm": 45.294044494628906, "learning_rate": 1.4568663440680795e-05, "logits/chosen": -0.8024635314941406, "logits/rejected": -0.792358934879303, "logps/chosen": -114.3133544921875, "logps/rejected": -164.1097412109375, "loss": 0.5766, "rewards/accuracies": 1.0, "rewards/chosen": -0.36157122254371643, "rewards/margins": 5.309224605560303, "rewards/rejected": -5.670795917510986, "step": 3255 }, { "epoch": 0.5436342399466044, "grad_norm": 15.447421073913574, "learning_rate": 1.4563657600533957e-05, "logits/chosen": -0.7510054111480713, "logits/rejected": -0.8470858931541443, "logps/chosen": -71.5806884765625, "logps/rejected": -146.4622802734375, "loss": 0.8332, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.208434820175171, "rewards/margins": 1.5337613821029663, "rewards/rejected": -3.7421963214874268, "step": 3258 }, { "epoch": 0.5441348239612882, "grad_norm": 6.192575454711914, "learning_rate": 1.4558651760387119e-05, "logits/chosen": -0.9188331961631775, "logits/rejected": -0.9003584980964661, "logps/chosen": -119.34268951416016, "logps/rejected": -153.15426635742188, "loss": 0.1382, "rewards/accuracies": 1.0, "rewards/chosen": -0.05342304706573486, "rewards/margins": 4.422242641448975, "rewards/rejected": -4.475666046142578, "step": 3261 }, { "epoch": 0.544635407975972, "grad_norm": 19.86811065673828, "learning_rate": 1.4553645920240283e-05, "logits/chosen": -0.701748788356781, "logits/rejected": -0.8419464230537415, "logps/chosen": -76.71002960205078, "logps/rejected": -143.86444091796875, "loss": 0.5463, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5559945106506348, "rewards/margins": 1.5656191110610962, "rewards/rejected": -3.1216137409210205, "step": 3264 }, { "epoch": 0.5451359919906558, "grad_norm": 60.17543029785156, "learning_rate": 1.4548640080093444e-05, "logits/chosen": -0.7544784545898438, "logits/rejected": -0.7472065091133118, "logps/chosen": -176.7740020751953, "logps/rejected": -113.6961669921875, "loss": 1.452, "rewards/accuracies": 0.0, "rewards/chosen": -1.603152871131897, "rewards/margins": -1.8233190774917603, "rewards/rejected": 0.22016626596450806, "step": 3267 }, { "epoch": 0.5456365760053395, "grad_norm": 33.33154296875, "learning_rate": 1.4543634239946604e-05, "logits/chosen": -0.7231485843658447, "logits/rejected": -0.8554626107215881, "logps/chosen": -36.31722640991211, "logps/rejected": -118.98296356201172, "loss": 0.3191, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7581450939178467, "rewards/margins": 3.0046892166137695, "rewards/rejected": -2.246544122695923, "step": 3270 }, { "epoch": 0.5461371600200233, "grad_norm": 22.462984085083008, "learning_rate": 1.4538628399799768e-05, "logits/chosen": -0.8356782793998718, "logits/rejected": -0.8309642672538757, "logps/chosen": -61.63692092895508, "logps/rejected": -47.16562271118164, "loss": 0.4162, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6970239281654358, "rewards/margins": 0.45488619804382324, "rewards/rejected": -1.1519100666046143, "step": 3273 }, { "epoch": 0.5466377440347071, "grad_norm": 64.30328369140625, "learning_rate": 1.453362255965293e-05, "logits/chosen": -0.9366379380226135, "logits/rejected": -0.8442243933677673, "logps/chosen": -129.8893585205078, "logps/rejected": -85.66128540039062, "loss": 0.89, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0964949131011963, "rewards/margins": -1.121315360069275, "rewards/rejected": -0.9751794338226318, "step": 3276 }, { "epoch": 0.547138328049391, "grad_norm": 2.8073766231536865, "learning_rate": 1.4528616719506091e-05, "logits/chosen": -0.6872976422309875, "logits/rejected": -0.6996240615844727, "logps/chosen": -46.885711669921875, "logps/rejected": -131.8373565673828, "loss": 0.3946, "rewards/accuracies": 1.0, "rewards/chosen": 0.6350273489952087, "rewards/margins": 6.07602071762085, "rewards/rejected": -5.440993785858154, "step": 3279 }, { "epoch": 0.5476389120640748, "grad_norm": 25.868356704711914, "learning_rate": 1.4523610879359253e-05, "logits/chosen": -0.8452339768409729, "logits/rejected": -0.8154101371765137, "logps/chosen": -128.44850158691406, "logps/rejected": -87.2777099609375, "loss": 0.8242, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.4703686237335205, "rewards/margins": -0.9197916984558105, "rewards/rejected": -1.5505770444869995, "step": 3282 }, { "epoch": 0.5481394960787586, "grad_norm": 29.61796760559082, "learning_rate": 1.4518605039212417e-05, "logits/chosen": -0.7462944984436035, "logits/rejected": -0.7028608918190002, "logps/chosen": -54.91093826293945, "logps/rejected": -71.56305694580078, "loss": 0.7354, "rewards/accuracies": 1.0, "rewards/chosen": 0.330620139837265, "rewards/margins": 2.381864547729492, "rewards/rejected": -2.0512444972991943, "step": 3285 }, { "epoch": 0.5486400800934423, "grad_norm": 28.47063446044922, "learning_rate": 1.4513599199065579e-05, "logits/chosen": -0.951429545879364, "logits/rejected": -0.9942045211791992, "logps/chosen": -70.63468170166016, "logps/rejected": -110.4645767211914, "loss": 0.7102, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.2467339038848877, "rewards/margins": -0.5749547481536865, "rewards/rejected": -1.6717795133590698, "step": 3288 }, { "epoch": 0.5491406641081261, "grad_norm": 27.004680633544922, "learning_rate": 1.4508593358918739e-05, "logits/chosen": -0.9382972717285156, "logits/rejected": -0.859349250793457, "logps/chosen": -134.53746032714844, "logps/rejected": -90.82833099365234, "loss": 0.763, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.4085925817489624, "rewards/margins": -0.06661184877157211, "rewards/rejected": -1.3419805765151978, "step": 3291 }, { "epoch": 0.5496412481228099, "grad_norm": 11.340299606323242, "learning_rate": 1.4503587518771902e-05, "logits/chosen": -0.8394286632537842, "logits/rejected": -0.8570270538330078, "logps/chosen": -76.3030014038086, "logps/rejected": -151.36912536621094, "loss": 0.6071, "rewards/accuracies": 1.0, "rewards/chosen": -0.5753533840179443, "rewards/margins": 3.5705063343048096, "rewards/rejected": -4.145859241485596, "step": 3294 }, { "epoch": 0.5501418321374938, "grad_norm": 29.91447639465332, "learning_rate": 1.4498581678625064e-05, "logits/chosen": -0.996602475643158, "logits/rejected": -0.9511082172393799, "logps/chosen": -92.3954849243164, "logps/rejected": -113.9028091430664, "loss": 0.4768, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8657386302947998, "rewards/margins": -0.060471415519714355, "rewards/rejected": -0.8052671551704407, "step": 3297 }, { "epoch": 0.5506424161521776, "grad_norm": 16.172847747802734, "learning_rate": 1.4493575838478226e-05, "logits/chosen": -0.6789801716804504, "logits/rejected": -0.6170856356620789, "logps/chosen": -107.49148559570312, "logps/rejected": -94.4600830078125, "loss": 0.8021, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9614169597625732, "rewards/margins": 0.8338108062744141, "rewards/rejected": -1.7952276468276978, "step": 3300 }, { "epoch": 0.5511430001668614, "grad_norm": 25.55877685546875, "learning_rate": 1.4488569998331388e-05, "logits/chosen": -0.7949660420417786, "logits/rejected": -0.8334235548973083, "logps/chosen": -113.6360855102539, "logps/rejected": -126.7994384765625, "loss": 0.5314, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9961317181587219, "rewards/margins": 0.28306254744529724, "rewards/rejected": -1.2791942358016968, "step": 3303 }, { "epoch": 0.5516435841815451, "grad_norm": 41.524688720703125, "learning_rate": 1.448356415818455e-05, "logits/chosen": -0.7778257727622986, "logits/rejected": -0.8176283836364746, "logps/chosen": -53.26631546020508, "logps/rejected": -123.49832916259766, "loss": 0.5758, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3599964380264282, "rewards/margins": 2.7972240447998047, "rewards/rejected": -4.157220363616943, "step": 3306 }, { "epoch": 0.5521441681962289, "grad_norm": 19.814865112304688, "learning_rate": 1.4478558318037713e-05, "logits/chosen": -0.8578882217407227, "logits/rejected": -0.8884719014167786, "logps/chosen": -85.86187744140625, "logps/rejected": -113.81160736083984, "loss": 0.5049, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.6184403896331787, "rewards/margins": -0.026981910690665245, "rewards/rejected": -2.591458559036255, "step": 3309 }, { "epoch": 0.5526447522109127, "grad_norm": 37.285919189453125, "learning_rate": 1.4473552477890873e-05, "logits/chosen": -0.7836824059486389, "logits/rejected": -0.8325238227844238, "logps/chosen": -94.19353485107422, "logps/rejected": -133.65716552734375, "loss": 0.5504, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3356852531433105, "rewards/margins": 1.0090607404708862, "rewards/rejected": -2.3447458744049072, "step": 3312 }, { "epoch": 0.5531453362255966, "grad_norm": 41.578216552734375, "learning_rate": 1.4468546637744036e-05, "logits/chosen": -0.8446981310844421, "logits/rejected": -0.9303563237190247, "logps/chosen": -72.1467056274414, "logps/rejected": -161.4206085205078, "loss": 0.678, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4478498697280884, "rewards/margins": 1.3123103380203247, "rewards/rejected": -2.760160207748413, "step": 3315 }, { "epoch": 0.5536459202402804, "grad_norm": 7.753451347351074, "learning_rate": 1.4463540797597198e-05, "logits/chosen": -0.9597213864326477, "logits/rejected": -1.026318073272705, "logps/chosen": -86.81623077392578, "logps/rejected": -107.47689056396484, "loss": 0.5403, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0067102909088135, "rewards/margins": -0.9458749890327454, "rewards/rejected": -1.0608352422714233, "step": 3318 }, { "epoch": 0.5541465042549641, "grad_norm": 11.890542030334473, "learning_rate": 1.445853495745036e-05, "logits/chosen": -0.6895675659179688, "logits/rejected": -0.7286807894706726, "logps/chosen": -69.65084075927734, "logps/rejected": -126.77587127685547, "loss": 0.3488, "rewards/accuracies": 1.0, "rewards/chosen": -0.489994615316391, "rewards/margins": 3.0033342838287354, "rewards/rejected": -3.4933290481567383, "step": 3321 }, { "epoch": 0.5546470882696479, "grad_norm": 15.458808898925781, "learning_rate": 1.4453529117303522e-05, "logits/chosen": -0.6996471881866455, "logits/rejected": -0.7683482766151428, "logps/chosen": -101.9533462524414, "logps/rejected": -149.02490234375, "loss": 0.3509, "rewards/accuracies": 1.0, "rewards/chosen": -0.8872731328010559, "rewards/margins": 2.182211399078369, "rewards/rejected": -3.0694844722747803, "step": 3324 }, { "epoch": 0.5551476722843317, "grad_norm": 3.0613622665405273, "learning_rate": 1.4448523277156684e-05, "logits/chosen": -0.6165554523468018, "logits/rejected": -0.6955440044403076, "logps/chosen": -93.35393524169922, "logps/rejected": -131.64222717285156, "loss": 0.7887, "rewards/accuracies": 1.0, "rewards/chosen": -2.415329694747925, "rewards/margins": 2.870386838912964, "rewards/rejected": -5.285716533660889, "step": 3327 }, { "epoch": 0.5556482562990155, "grad_norm": 10.585517883300781, "learning_rate": 1.4443517437009847e-05, "logits/chosen": -0.8395348191261292, "logits/rejected": -0.8418259620666504, "logps/chosen": -106.21014404296875, "logps/rejected": -93.94357299804688, "loss": 0.2929, "rewards/accuracies": 1.0, "rewards/chosen": -1.9430341720581055, "rewards/margins": 0.8841971755027771, "rewards/rejected": -2.8272314071655273, "step": 3330 }, { "epoch": 0.5561488403136993, "grad_norm": 33.8094367980957, "learning_rate": 1.4438511596863007e-05, "logits/chosen": -0.8424112200737, "logits/rejected": -0.8580255508422852, "logps/chosen": -94.32913208007812, "logps/rejected": -84.4994888305664, "loss": 1.2249, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3252426385879517, "rewards/margins": 0.13524119555950165, "rewards/rejected": -1.4604839086532593, "step": 3333 }, { "epoch": 0.5566494243283832, "grad_norm": 4.090928554534912, "learning_rate": 1.443350575671617e-05, "logits/chosen": -0.6907103061676025, "logits/rejected": -0.7184467315673828, "logps/chosen": -74.1178207397461, "logps/rejected": -158.0332489013672, "loss": 0.1927, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9158831238746643, "rewards/margins": 3.0379445552825928, "rewards/rejected": -3.9538278579711914, "step": 3336 }, { "epoch": 0.5571500083430669, "grad_norm": 33.43541717529297, "learning_rate": 1.4428499916569332e-05, "logits/chosen": -0.8518306612968445, "logits/rejected": -0.8498656153678894, "logps/chosen": -85.05380249023438, "logps/rejected": -111.99495697021484, "loss": 0.2616, "rewards/accuracies": 1.0, "rewards/chosen": -0.38286614418029785, "rewards/margins": 1.1768289804458618, "rewards/rejected": -1.5596951246261597, "step": 3339 }, { "epoch": 0.5576505923577507, "grad_norm": 44.782615661621094, "learning_rate": 1.4423494076422494e-05, "logits/chosen": -0.8669743537902832, "logits/rejected": -0.829876720905304, "logps/chosen": -55.34025955200195, "logps/rejected": -51.96072006225586, "loss": 0.7607, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5848902463912964, "rewards/margins": -0.9747748374938965, "rewards/rejected": -0.6101152896881104, "step": 3342 }, { "epoch": 0.5581511763724345, "grad_norm": 36.122432708740234, "learning_rate": 1.4418488236275656e-05, "logits/chosen": -0.7531997561454773, "logits/rejected": -0.8107728958129883, "logps/chosen": -100.2334976196289, "logps/rejected": -160.13316345214844, "loss": 0.5541, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.34778347611427307, "rewards/margins": 4.63079833984375, "rewards/rejected": -4.97858190536499, "step": 3345 }, { "epoch": 0.5586517603871183, "grad_norm": 23.05372428894043, "learning_rate": 1.4413482396128818e-05, "logits/chosen": -0.7273311614990234, "logits/rejected": -0.7396740913391113, "logps/chosen": -45.48541259765625, "logps/rejected": -73.14318084716797, "loss": 0.3533, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.300881952047348, "rewards/margins": 1.7557401657104492, "rewards/rejected": -1.4548581838607788, "step": 3348 }, { "epoch": 0.5591523444018021, "grad_norm": 2.612668991088867, "learning_rate": 1.4408476555981981e-05, "logits/chosen": -0.7343069911003113, "logits/rejected": -0.6845331788063049, "logps/chosen": -105.81781005859375, "logps/rejected": -103.0767822265625, "loss": 0.3428, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4821118116378784, "rewards/margins": 2.4778528213500977, "rewards/rejected": -3.9599647521972656, "step": 3351 }, { "epoch": 0.559652928416486, "grad_norm": 44.14592361450195, "learning_rate": 1.4403470715835141e-05, "logits/chosen": -0.8999471664428711, "logits/rejected": -0.9180552363395691, "logps/chosen": -66.98112487792969, "logps/rejected": -53.232818603515625, "loss": 1.0376, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.36435627937316895, "rewards/margins": 1.4458948373794556, "rewards/rejected": -1.8102511167526245, "step": 3354 }, { "epoch": 0.5601535124311697, "grad_norm": 22.56341552734375, "learning_rate": 1.4398464875688305e-05, "logits/chosen": -0.8224168419837952, "logits/rejected": -0.8196706771850586, "logps/chosen": -123.48202514648438, "logps/rejected": -112.86993408203125, "loss": 0.373, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3137611150741577, "rewards/margins": 1.2101060152053833, "rewards/rejected": -2.523867130279541, "step": 3357 }, { "epoch": 0.5606540964458535, "grad_norm": 9.128254890441895, "learning_rate": 1.4393459035541467e-05, "logits/chosen": -0.7506359219551086, "logits/rejected": -0.789170503616333, "logps/chosen": -83.40166473388672, "logps/rejected": -123.03497314453125, "loss": 0.2535, "rewards/accuracies": 1.0, "rewards/chosen": -1.301272988319397, "rewards/margins": 2.294116258621216, "rewards/rejected": -3.5953893661499023, "step": 3360 }, { "epoch": 0.5611546804605373, "grad_norm": 11.934659004211426, "learning_rate": 1.4388453195394627e-05, "logits/chosen": -0.8225695490837097, "logits/rejected": -0.8270816206932068, "logps/chosen": -44.01133728027344, "logps/rejected": -87.87747955322266, "loss": 0.2689, "rewards/accuracies": 1.0, "rewards/chosen": -0.47815242409706116, "rewards/margins": 2.8933401107788086, "rewards/rejected": -3.371492385864258, "step": 3363 }, { "epoch": 0.5616552644752211, "grad_norm": 22.975948333740234, "learning_rate": 1.438344735524779e-05, "logits/chosen": -0.8001260757446289, "logits/rejected": -0.8658556938171387, "logps/chosen": -66.61457061767578, "logps/rejected": -138.23619079589844, "loss": 0.8065, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8180950284004211, "rewards/margins": -0.21134810149669647, "rewards/rejected": -0.6067469716072083, "step": 3366 }, { "epoch": 0.5621558484899049, "grad_norm": 11.70832633972168, "learning_rate": 1.4378441515100952e-05, "logits/chosen": -0.9424428939819336, "logits/rejected": -1.0116496086120605, "logps/chosen": -53.16798782348633, "logps/rejected": -188.43968200683594, "loss": 0.7091, "rewards/accuracies": 1.0, "rewards/chosen": 0.14834143221378326, "rewards/margins": 4.318070888519287, "rewards/rejected": -4.169729232788086, "step": 3369 }, { "epoch": 0.5626564325045886, "grad_norm": 11.546310424804688, "learning_rate": 1.4373435674954116e-05, "logits/chosen": -0.5466092824935913, "logits/rejected": -0.6622653007507324, "logps/chosen": -42.39693832397461, "logps/rejected": -168.7804412841797, "loss": 0.5574, "rewards/accuracies": 1.0, "rewards/chosen": 0.3137461543083191, "rewards/margins": 4.104312419891357, "rewards/rejected": -3.7905664443969727, "step": 3372 }, { "epoch": 0.5631570165192725, "grad_norm": 17.988122940063477, "learning_rate": 1.4368429834807276e-05, "logits/chosen": -0.840681254863739, "logits/rejected": -0.9154661297798157, "logps/chosen": -44.23630905151367, "logps/rejected": -94.575927734375, "loss": 0.7104, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.47156915068626404, "rewards/margins": 2.0970499515533447, "rewards/rejected": -2.5686192512512207, "step": 3375 }, { "epoch": 0.5636576005339563, "grad_norm": 54.76520538330078, "learning_rate": 1.436342399466044e-05, "logits/chosen": -0.9399277567863464, "logits/rejected": -0.9520690441131592, "logps/chosen": -62.59706497192383, "logps/rejected": -122.6613540649414, "loss": 0.3415, "rewards/accuracies": 1.0, "rewards/chosen": -0.021244054660201073, "rewards/margins": 1.1989991664886475, "rewards/rejected": -1.220243215560913, "step": 3378 }, { "epoch": 0.5641581845486401, "grad_norm": 6.994503021240234, "learning_rate": 1.4358418154513601e-05, "logits/chosen": -0.7624897360801697, "logits/rejected": -0.7566424012184143, "logps/chosen": -124.6402587890625, "logps/rejected": -158.26036071777344, "loss": 0.3352, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9276989102363586, "rewards/margins": 1.6529645919799805, "rewards/rejected": -2.5806634426116943, "step": 3381 }, { "epoch": 0.5646587685633239, "grad_norm": 63.537235260009766, "learning_rate": 1.4353412314366761e-05, "logits/chosen": -0.7990696430206299, "logits/rejected": -0.765660285949707, "logps/chosen": -102.77054595947266, "logps/rejected": -123.60259246826172, "loss": 1.1128, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0522959232330322, "rewards/margins": 0.14540307223796844, "rewards/rejected": -2.1976988315582275, "step": 3384 }, { "epoch": 0.5651593525780076, "grad_norm": 9.486126899719238, "learning_rate": 1.4348406474219925e-05, "logits/chosen": -0.8309006690979004, "logits/rejected": -0.8112863898277283, "logps/chosen": -111.29928588867188, "logps/rejected": -130.88987731933594, "loss": 0.3626, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3530778884887695, "rewards/margins": 2.0787017345428467, "rewards/rejected": -3.4317798614501953, "step": 3387 }, { "epoch": 0.5656599365926914, "grad_norm": 14.383316993713379, "learning_rate": 1.4343400634073086e-05, "logits/chosen": -0.8610150218009949, "logits/rejected": -0.7925763130187988, "logps/chosen": -78.37831115722656, "logps/rejected": -76.00445556640625, "loss": 0.2494, "rewards/accuracies": 1.0, "rewards/chosen": -0.7461915612220764, "rewards/margins": 1.9881739616394043, "rewards/rejected": -2.734365463256836, "step": 3390 }, { "epoch": 0.5661605206073753, "grad_norm": 22.72266387939453, "learning_rate": 1.433839479392625e-05, "logits/chosen": -0.7676177620887756, "logits/rejected": -0.8056671023368835, "logps/chosen": -91.16785430908203, "logps/rejected": -149.9309539794922, "loss": 0.8379, "rewards/accuracies": 1.0, "rewards/chosen": -1.1070014238357544, "rewards/margins": 3.467254638671875, "rewards/rejected": -4.574256420135498, "step": 3393 }, { "epoch": 0.5666611046220591, "grad_norm": 26.61357879638672, "learning_rate": 1.433338895377941e-05, "logits/chosen": -0.7460296750068665, "logits/rejected": -0.8614251613616943, "logps/chosen": -53.62736892700195, "logps/rejected": -188.298095703125, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": -0.3984832763671875, "rewards/margins": 6.443162441253662, "rewards/rejected": -6.84164571762085, "step": 3396 }, { "epoch": 0.5671616886367429, "grad_norm": 71.9278335571289, "learning_rate": 1.4328383113632573e-05, "logits/chosen": -0.6328577399253845, "logits/rejected": -0.7234833836555481, "logps/chosen": -43.95926284790039, "logps/rejected": -116.2076416015625, "loss": 0.7639, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4060591161251068, "rewards/margins": 3.915440797805786, "rewards/rejected": -4.321500301361084, "step": 3399 }, { "epoch": 0.5676622726514267, "grad_norm": 30.38459014892578, "learning_rate": 1.4323377273485735e-05, "logits/chosen": -0.9162030220031738, "logits/rejected": -0.8314566016197205, "logps/chosen": -183.2169647216797, "logps/rejected": -158.01010131835938, "loss": 1.2008, "rewards/accuracies": 1.0, "rewards/chosen": -0.4195668399333954, "rewards/margins": 1.2973073720932007, "rewards/rejected": -1.7168742418289185, "step": 3402 }, { "epoch": 0.5681628566661104, "grad_norm": 27.721487045288086, "learning_rate": 1.4318371433338895e-05, "logits/chosen": -0.6725993156433105, "logits/rejected": -0.6770529747009277, "logps/chosen": -82.50968170166016, "logps/rejected": -120.2412109375, "loss": 0.3397, "rewards/accuracies": 1.0, "rewards/chosen": -0.4538404941558838, "rewards/margins": 2.647212266921997, "rewards/rejected": -3.1010525226593018, "step": 3405 }, { "epoch": 0.5686634406807942, "grad_norm": 10.897114753723145, "learning_rate": 1.4313365593192059e-05, "logits/chosen": -0.7803511619567871, "logits/rejected": -0.8410642147064209, "logps/chosen": -76.8484115600586, "logps/rejected": -125.09020233154297, "loss": 0.9986, "rewards/accuracies": 1.0, "rewards/chosen": -0.6968514919281006, "rewards/margins": 2.6451616287231445, "rewards/rejected": -3.342013120651245, "step": 3408 }, { "epoch": 0.5691640246954781, "grad_norm": 63.66329574584961, "learning_rate": 1.430835975304522e-05, "logits/chosen": -0.8276445865631104, "logits/rejected": -0.8826491236686707, "logps/chosen": -171.3146209716797, "logps/rejected": -166.26976013183594, "loss": 1.0414, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.8736984729766846, "rewards/margins": -1.4678579568862915, "rewards/rejected": -1.405840516090393, "step": 3411 }, { "epoch": 0.5696646087101619, "grad_norm": 45.4658203125, "learning_rate": 1.4303353912898384e-05, "logits/chosen": -0.8331819176673889, "logits/rejected": -0.8634157180786133, "logps/chosen": -116.20071411132812, "logps/rejected": -152.06163024902344, "loss": 0.8957, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0087573528289795, "rewards/margins": 0.32409167289733887, "rewards/rejected": -2.3328492641448975, "step": 3414 }, { "epoch": 0.5701651927248457, "grad_norm": 12.356539726257324, "learning_rate": 1.4298348072751544e-05, "logits/chosen": -0.7242338061332703, "logits/rejected": -0.7149155139923096, "logps/chosen": -91.9443130493164, "logps/rejected": -79.28492736816406, "loss": 0.6242, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.246734619140625, "rewards/margins": -0.7049195170402527, "rewards/rejected": -1.541815161705017, "step": 3417 }, { "epoch": 0.5706657767395295, "grad_norm": 8.543658256530762, "learning_rate": 1.4293342232604708e-05, "logits/chosen": -0.8079535365104675, "logits/rejected": -0.8311246037483215, "logps/chosen": -85.79324340820312, "logps/rejected": -222.37852478027344, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": -0.48720836639404297, "rewards/margins": 4.917234897613525, "rewards/rejected": -5.404443264007568, "step": 3420 }, { "epoch": 0.5711663607542132, "grad_norm": 58.3459358215332, "learning_rate": 1.428833639245787e-05, "logits/chosen": -0.7807908058166504, "logits/rejected": -0.8492327332496643, "logps/chosen": -80.88275909423828, "logps/rejected": -147.1184844970703, "loss": 1.1773, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.034450054168701, "rewards/margins": 3.2014663219451904, "rewards/rejected": -5.2359161376953125, "step": 3423 }, { "epoch": 0.571666944768897, "grad_norm": 56.829132080078125, "learning_rate": 1.428333055231103e-05, "logits/chosen": -0.45543399453163147, "logits/rejected": -0.4855481684207916, "logps/chosen": -131.577880859375, "logps/rejected": -128.6697235107422, "loss": 0.9765, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2930793762207031, "rewards/margins": 0.9711551666259766, "rewards/rejected": -2.264234781265259, "step": 3426 }, { "epoch": 0.5721675287835808, "grad_norm": 28.029052734375, "learning_rate": 1.4278324712164193e-05, "logits/chosen": -0.6104671359062195, "logits/rejected": -0.6150109171867371, "logps/chosen": -79.84766387939453, "logps/rejected": -81.9403076171875, "loss": 0.2935, "rewards/accuracies": 1.0, "rewards/chosen": -0.5536365509033203, "rewards/margins": 1.3102822303771973, "rewards/rejected": -1.8639187812805176, "step": 3429 }, { "epoch": 0.5726681127982647, "grad_norm": 26.92110824584961, "learning_rate": 1.4273318872017355e-05, "logits/chosen": -0.890214741230011, "logits/rejected": -0.8648335933685303, "logps/chosen": -59.62945556640625, "logps/rejected": -56.96498107910156, "loss": 1.0455, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.511358380317688, "rewards/margins": 0.16783654689788818, "rewards/rejected": -1.6791949272155762, "step": 3432 }, { "epoch": 0.5731686968129485, "grad_norm": 33.66423034667969, "learning_rate": 1.4268313031870518e-05, "logits/chosen": -0.6886434555053711, "logits/rejected": -0.6572912335395813, "logps/chosen": -122.0753402709961, "logps/rejected": -126.4303970336914, "loss": 0.5972, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4113918542861938, "rewards/margins": 2.20990252494812, "rewards/rejected": -3.6212942600250244, "step": 3435 }, { "epoch": 0.5736692808276322, "grad_norm": 14.948907852172852, "learning_rate": 1.4263307191723679e-05, "logits/chosen": -0.8202323317527771, "logits/rejected": -0.8141417503356934, "logps/chosen": -122.3082275390625, "logps/rejected": -80.15323638916016, "loss": 0.8593, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.6247271299362183, "rewards/margins": -0.598021924495697, "rewards/rejected": -1.0267051458358765, "step": 3438 }, { "epoch": 0.574169864842316, "grad_norm": 62.03424072265625, "learning_rate": 1.425830135157684e-05, "logits/chosen": -0.9128299355506897, "logits/rejected": -0.9055284857749939, "logps/chosen": -127.77031707763672, "logps/rejected": -105.32596588134766, "loss": 0.4847, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9990429878234863, "rewards/margins": 1.2717186212539673, "rewards/rejected": -3.270761489868164, "step": 3441 }, { "epoch": 0.5746704488569998, "grad_norm": 12.465239524841309, "learning_rate": 1.4253295511430004e-05, "logits/chosen": -0.6791560053825378, "logits/rejected": -0.656463623046875, "logps/chosen": -90.25289154052734, "logps/rejected": -117.99967193603516, "loss": 0.2577, "rewards/accuracies": 1.0, "rewards/chosen": -0.4145493507385254, "rewards/margins": 2.273664712905884, "rewards/rejected": -2.6882143020629883, "step": 3444 }, { "epoch": 0.5751710328716836, "grad_norm": 15.758909225463867, "learning_rate": 1.4248289671283164e-05, "logits/chosen": -0.911928653717041, "logits/rejected": -0.8800825476646423, "logps/chosen": -60.89809036254883, "logps/rejected": -78.43573760986328, "loss": 0.3299, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.006561795715242624, "rewards/margins": 1.0173325538635254, "rewards/rejected": -1.0238943099975586, "step": 3447 }, { "epoch": 0.5756716168863675, "grad_norm": 15.337299346923828, "learning_rate": 1.4243283831136327e-05, "logits/chosen": -0.738529622554779, "logits/rejected": -0.8219614624977112, "logps/chosen": -83.5794677734375, "logps/rejected": -161.52169799804688, "loss": 0.4633, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7179142236709595, "rewards/margins": 2.6564669609069824, "rewards/rejected": -4.3743815422058105, "step": 3450 }, { "epoch": 0.5761722009010513, "grad_norm": 16.396312713623047, "learning_rate": 1.423827799098949e-05, "logits/chosen": -0.826678454875946, "logits/rejected": -0.8441851139068604, "logps/chosen": -91.52678680419922, "logps/rejected": -136.7149200439453, "loss": 0.5685, "rewards/accuracies": 1.0, "rewards/chosen": -1.457835078239441, "rewards/margins": 4.280117988586426, "rewards/rejected": -5.737953186035156, "step": 3453 }, { "epoch": 0.576672784915735, "grad_norm": 35.943519592285156, "learning_rate": 1.4233272150842653e-05, "logits/chosen": -0.878603994846344, "logits/rejected": -0.8913518786430359, "logps/chosen": -107.83089447021484, "logps/rejected": -112.89897918701172, "loss": 0.7259, "rewards/accuracies": 1.0, "rewards/chosen": -1.4451055526733398, "rewards/margins": 1.631489872932434, "rewards/rejected": -3.0765955448150635, "step": 3456 }, { "epoch": 0.5771733689304188, "grad_norm": 34.63325119018555, "learning_rate": 1.4228266310695813e-05, "logits/chosen": -0.7667537331581116, "logits/rejected": -0.8011539578437805, "logps/chosen": -90.9216079711914, "logps/rejected": -128.1140899658203, "loss": 0.5269, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7462020516395569, "rewards/margins": 0.09983796626329422, "rewards/rejected": -0.8460400700569153, "step": 3459 }, { "epoch": 0.5776739529451026, "grad_norm": 6.638933181762695, "learning_rate": 1.4223260470548975e-05, "logits/chosen": -0.8202331066131592, "logits/rejected": -0.7872343063354492, "logps/chosen": -107.52983856201172, "logps/rejected": -87.21076202392578, "loss": 0.5186, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.25802943110466003, "rewards/margins": 1.2437583208084106, "rewards/rejected": -1.501787781715393, "step": 3462 }, { "epoch": 0.5781745369597864, "grad_norm": 45.21228790283203, "learning_rate": 1.4218254630402138e-05, "logits/chosen": -0.9518505930900574, "logits/rejected": -0.9442874789237976, "logps/chosen": -82.59795379638672, "logps/rejected": -95.67740631103516, "loss": 0.5249, "rewards/accuracies": 1.0, "rewards/chosen": 0.33799681067466736, "rewards/margins": 2.3564717769622803, "rewards/rejected": -2.018474817276001, "step": 3465 }, { "epoch": 0.5786751209744703, "grad_norm": 27.793285369873047, "learning_rate": 1.4213248790255298e-05, "logits/chosen": -0.803204357624054, "logits/rejected": -0.8392122387886047, "logps/chosen": -61.41938781738281, "logps/rejected": -154.20028686523438, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": -1.1872514486312866, "rewards/margins": 4.6651482582092285, "rewards/rejected": -5.8523993492126465, "step": 3468 }, { "epoch": 0.579175704989154, "grad_norm": 9.34034538269043, "learning_rate": 1.4208242950108462e-05, "logits/chosen": -0.737126350402832, "logits/rejected": -0.8071343898773193, "logps/chosen": -58.402347564697266, "logps/rejected": -112.29977416992188, "loss": 0.39, "rewards/accuracies": 1.0, "rewards/chosen": -1.3014472723007202, "rewards/margins": 0.7557175755500793, "rewards/rejected": -2.0571646690368652, "step": 3471 }, { "epoch": 0.5796762890038378, "grad_norm": 31.560745239257812, "learning_rate": 1.4203237109961623e-05, "logits/chosen": -0.833400547504425, "logits/rejected": -0.7833700776100159, "logps/chosen": -135.4097442626953, "logps/rejected": -86.5533447265625, "loss": 0.4452, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.4724934101104736, "rewards/margins": -0.28202417492866516, "rewards/rejected": -2.1904690265655518, "step": 3474 }, { "epoch": 0.5801768730185216, "grad_norm": 25.773305892944336, "learning_rate": 1.4198231269814787e-05, "logits/chosen": -0.7425613403320312, "logits/rejected": -0.7728658318519592, "logps/chosen": -62.56705093383789, "logps/rejected": -98.07254028320312, "loss": 0.5893, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7914767265319824, "rewards/margins": 1.8371890783309937, "rewards/rejected": -3.6286659240722656, "step": 3477 }, { "epoch": 0.5806774570332054, "grad_norm": 42.058895111083984, "learning_rate": 1.4193225429667947e-05, "logits/chosen": -0.8728896975517273, "logits/rejected": -0.8739879131317139, "logps/chosen": -150.5768280029297, "logps/rejected": -178.3914337158203, "loss": 0.6334, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.558199644088745, "rewards/margins": -1.1398756504058838, "rewards/rejected": -1.4183238744735718, "step": 3480 }, { "epoch": 0.5811780410478892, "grad_norm": 66.91062927246094, "learning_rate": 1.4188219589521109e-05, "logits/chosen": -0.9125685691833496, "logits/rejected": -0.9164236187934875, "logps/chosen": -71.52906799316406, "logps/rejected": -106.42904663085938, "loss": 1.929, "rewards/accuracies": 1.0, "rewards/chosen": -0.08012083917856216, "rewards/margins": 4.1412506103515625, "rewards/rejected": -4.221371173858643, "step": 3483 }, { "epoch": 0.581678625062573, "grad_norm": 23.253835678100586, "learning_rate": 1.4183213749374272e-05, "logits/chosen": -0.790778398513794, "logits/rejected": -0.825751781463623, "logps/chosen": -76.44105529785156, "logps/rejected": -111.03656768798828, "loss": 0.6484, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4220946133136749, "rewards/margins": 1.7937191724777222, "rewards/rejected": -2.2158138751983643, "step": 3486 }, { "epoch": 0.5821792090772568, "grad_norm": 38.50440216064453, "learning_rate": 1.4178207909227432e-05, "logits/chosen": -0.4434567987918854, "logits/rejected": -0.5019421577453613, "logps/chosen": -49.21342468261719, "logps/rejected": -157.6486358642578, "loss": 0.6154, "rewards/accuracies": 1.0, "rewards/chosen": 0.03048858977854252, "rewards/margins": 3.435661554336548, "rewards/rejected": -3.4051730632781982, "step": 3489 }, { "epoch": 0.5826797930919406, "grad_norm": 59.55226135253906, "learning_rate": 1.4173202069080596e-05, "logits/chosen": -0.7066240310668945, "logits/rejected": -0.8280045390129089, "logps/chosen": -62.124332427978516, "logps/rejected": -117.61971282958984, "loss": 0.7816, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4735957682132721, "rewards/margins": 3.0008182525634766, "rewards/rejected": -3.4744138717651367, "step": 3492 }, { "epoch": 0.5831803771066244, "grad_norm": 17.606372833251953, "learning_rate": 1.4168196228933758e-05, "logits/chosen": -0.7864354252815247, "logits/rejected": -0.8536383509635925, "logps/chosen": -42.49188232421875, "logps/rejected": -137.85498046875, "loss": 0.3241, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663440585136414, "rewards/margins": 4.691804885864258, "rewards/rejected": -3.725461006164551, "step": 3495 }, { "epoch": 0.5836809611213082, "grad_norm": 32.46034240722656, "learning_rate": 1.4163190388786918e-05, "logits/chosen": -0.7386612892150879, "logits/rejected": -0.7514044642448425, "logps/chosen": -62.22111892700195, "logps/rejected": -82.8328857421875, "loss": 0.7479, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9225761890411377, "rewards/margins": 0.01020220946520567, "rewards/rejected": -0.9327784180641174, "step": 3498 }, { "epoch": 0.584181545135992, "grad_norm": 43.128211975097656, "learning_rate": 1.4158184548640081e-05, "logits/chosen": -0.6784330010414124, "logits/rejected": -0.6197180151939392, "logps/chosen": -84.72607421875, "logps/rejected": -112.776611328125, "loss": 0.5056, "rewards/accuracies": 1.0, "rewards/chosen": -1.6267799139022827, "rewards/margins": 2.897592782974243, "rewards/rejected": -4.524372577667236, "step": 3501 }, { "epoch": 0.5846821291506757, "grad_norm": 21.491836547851562, "learning_rate": 1.4153178708493243e-05, "logits/chosen": -0.8875438570976257, "logits/rejected": -0.8309444785118103, "logps/chosen": -76.04889678955078, "logps/rejected": -97.6457290649414, "loss": 0.2116, "rewards/accuracies": 1.0, "rewards/chosen": 0.5895190834999084, "rewards/margins": 2.4036080837249756, "rewards/rejected": -1.8140889406204224, "step": 3504 }, { "epoch": 0.5851827131653596, "grad_norm": 14.957215309143066, "learning_rate": 1.4148172868346407e-05, "logits/chosen": -0.7214382290840149, "logits/rejected": -0.6825425028800964, "logps/chosen": -101.6053695678711, "logps/rejected": -75.6812515258789, "loss": 0.37, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1659945398569107, "rewards/margins": 0.9273595809936523, "rewards/rejected": -1.0933541059494019, "step": 3507 }, { "epoch": 0.5856832971800434, "grad_norm": 14.590738296508789, "learning_rate": 1.4143167028199567e-05, "logits/chosen": -0.7555655837059021, "logits/rejected": -0.7984258532524109, "logps/chosen": -57.28656005859375, "logps/rejected": -106.47489166259766, "loss": 0.3698, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.43056151270866394, "rewards/margins": 1.725481629371643, "rewards/rejected": -2.156043291091919, "step": 3510 }, { "epoch": 0.5861838811947272, "grad_norm": 16.730789184570312, "learning_rate": 1.413816118805273e-05, "logits/chosen": -0.6584935784339905, "logits/rejected": -0.6222479939460754, "logps/chosen": -106.32308197021484, "logps/rejected": -83.21640014648438, "loss": 0.4903, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1249816417694092, "rewards/margins": -0.13961882889270782, "rewards/rejected": -0.9853628277778625, "step": 3513 }, { "epoch": 0.586684465209411, "grad_norm": 41.521026611328125, "learning_rate": 1.4133155347905892e-05, "logits/chosen": -0.5821637511253357, "logits/rejected": -0.6208257079124451, "logps/chosen": -114.9783706665039, "logps/rejected": -102.22782135009766, "loss": 0.5488, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.7521092891693115, "rewards/margins": -0.5575278997421265, "rewards/rejected": -2.1945812702178955, "step": 3516 }, { "epoch": 0.5871850492240948, "grad_norm": 20.9393310546875, "learning_rate": 1.4128149507759052e-05, "logits/chosen": -0.7055883407592773, "logits/rejected": -0.7584803700447083, "logps/chosen": -47.78515625, "logps/rejected": -128.92417907714844, "loss": 0.2688, "rewards/accuracies": 1.0, "rewards/chosen": -0.13114117085933685, "rewards/margins": 3.8233299255371094, "rewards/rejected": -3.9544708728790283, "step": 3519 }, { "epoch": 0.5876856332387785, "grad_norm": 8.55068588256836, "learning_rate": 1.4123143667612216e-05, "logits/chosen": -0.7631969451904297, "logits/rejected": -0.7342309951782227, "logps/chosen": -88.86871337890625, "logps/rejected": -122.88451385498047, "loss": 0.1495, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7045154571533203, "rewards/margins": 1.6219091415405273, "rewards/rejected": -3.3264245986938477, "step": 3522 }, { "epoch": 0.5881862172534623, "grad_norm": 19.399755477905273, "learning_rate": 1.4118137827465377e-05, "logits/chosen": -0.8407564163208008, "logits/rejected": -0.862248420715332, "logps/chosen": -77.62836456298828, "logps/rejected": -146.985107421875, "loss": 0.4783, "rewards/accuracies": 1.0, "rewards/chosen": -0.5992755889892578, "rewards/margins": 2.694045305252075, "rewards/rejected": -3.293320894241333, "step": 3525 }, { "epoch": 0.5886868012681462, "grad_norm": 3.2195394039154053, "learning_rate": 1.411313198731854e-05, "logits/chosen": -0.6265742182731628, "logits/rejected": -0.6146816611289978, "logps/chosen": -58.64850997924805, "logps/rejected": -128.85203552246094, "loss": 0.4598, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1880491971969604, "rewards/margins": 3.364128828048706, "rewards/rejected": -4.552177906036377, "step": 3528 }, { "epoch": 0.58918738528283, "grad_norm": 4.760010719299316, "learning_rate": 1.4108126147171701e-05, "logits/chosen": -0.8572168350219727, "logits/rejected": -0.9094099998474121, "logps/chosen": -80.5494613647461, "logps/rejected": -140.0154266357422, "loss": 0.2903, "rewards/accuracies": 1.0, "rewards/chosen": -0.9010024070739746, "rewards/margins": 2.1650049686431885, "rewards/rejected": -3.066007375717163, "step": 3531 }, { "epoch": 0.5896879692975138, "grad_norm": 13.516860961914062, "learning_rate": 1.4103120307024864e-05, "logits/chosen": -0.691510021686554, "logits/rejected": -0.6611020565032959, "logps/chosen": -65.97217559814453, "logps/rejected": -85.12279510498047, "loss": 0.3536, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6656686067581177, "rewards/margins": 1.3017057180404663, "rewards/rejected": -1.9673742055892944, "step": 3534 }, { "epoch": 0.5901885533121976, "grad_norm": 49.24116134643555, "learning_rate": 1.4098114466878026e-05, "logits/chosen": -0.6580949425697327, "logits/rejected": -0.7440762519836426, "logps/chosen": -82.69710540771484, "logps/rejected": -199.5308837890625, "loss": 0.4813, "rewards/accuracies": 1.0, "rewards/chosen": -1.5613585710525513, "rewards/margins": 1.8711856603622437, "rewards/rejected": -3.432543992996216, "step": 3537 }, { "epoch": 0.5906891373268813, "grad_norm": 37.63359832763672, "learning_rate": 1.4093108626731186e-05, "logits/chosen": -0.8275192379951477, "logits/rejected": -0.7078810334205627, "logps/chosen": -151.72425842285156, "logps/rejected": -104.71283721923828, "loss": 0.8397, "rewards/accuracies": 1.0, "rewards/chosen": -1.579197883605957, "rewards/margins": 1.5868964195251465, "rewards/rejected": -3.1660945415496826, "step": 3540 }, { "epoch": 0.5911897213415651, "grad_norm": 42.22419357299805, "learning_rate": 1.408810278658435e-05, "logits/chosen": -0.8575544357299805, "logits/rejected": -0.7944929599761963, "logps/chosen": -115.50023651123047, "logps/rejected": -55.6161994934082, "loss": 1.1073, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.801324486732483, "rewards/margins": -1.310764193534851, "rewards/rejected": -0.49056026339530945, "step": 3543 }, { "epoch": 0.591690305356249, "grad_norm": 13.763031005859375, "learning_rate": 1.4083096946437512e-05, "logits/chosen": -0.8774316906929016, "logits/rejected": -0.8745291829109192, "logps/chosen": -107.55099487304688, "logps/rejected": -118.0762939453125, "loss": 0.317, "rewards/accuracies": 1.0, "rewards/chosen": 0.1639905422925949, "rewards/margins": 3.239858627319336, "rewards/rejected": -3.0758678913116455, "step": 3546 }, { "epoch": 0.5921908893709328, "grad_norm": 28.476917266845703, "learning_rate": 1.4078091106290675e-05, "logits/chosen": -0.7841197848320007, "logits/rejected": -0.7890224456787109, "logps/chosen": -85.384033203125, "logps/rejected": -107.0032730102539, "loss": 0.521, "rewards/accuracies": 1.0, "rewards/chosen": -0.5031945705413818, "rewards/margins": 2.938067674636841, "rewards/rejected": -3.4412624835968018, "step": 3549 }, { "epoch": 0.5926914733856166, "grad_norm": 6.385905742645264, "learning_rate": 1.4073085266143835e-05, "logits/chosen": -0.7803711891174316, "logits/rejected": -0.7408550381660461, "logps/chosen": -68.2137680053711, "logps/rejected": -79.75023651123047, "loss": 0.1736, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.23192165791988373, "rewards/margins": 2.808757781982422, "rewards/rejected": -3.0406792163848877, "step": 3552 }, { "epoch": 0.5931920574003003, "grad_norm": 22.997854232788086, "learning_rate": 1.4068079425996997e-05, "logits/chosen": -0.6020599007606506, "logits/rejected": -0.6486456394195557, "logps/chosen": -107.04003143310547, "logps/rejected": -125.2051773071289, "loss": 0.5955, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1301189661026, "rewards/margins": 1.3706809282302856, "rewards/rejected": -2.5007998943328857, "step": 3555 }, { "epoch": 0.5936926414149841, "grad_norm": 8.950775146484375, "learning_rate": 1.406307358585016e-05, "logits/chosen": -0.6922720074653625, "logits/rejected": -0.6944890022277832, "logps/chosen": -90.75311279296875, "logps/rejected": -60.39781188964844, "loss": 0.1925, "rewards/accuracies": 1.0, "rewards/chosen": -0.022573232650756836, "rewards/margins": 1.2450132369995117, "rewards/rejected": -1.267586350440979, "step": 3558 }, { "epoch": 0.5941932254296679, "grad_norm": 10.0150146484375, "learning_rate": 1.405806774570332e-05, "logits/chosen": -0.7254498600959778, "logits/rejected": -0.7816786170005798, "logps/chosen": -101.50286865234375, "logps/rejected": -100.16890716552734, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": -0.7050344347953796, "rewards/margins": 2.1446704864501953, "rewards/rejected": -2.8497047424316406, "step": 3561 }, { "epoch": 0.5946938094443518, "grad_norm": 39.757022857666016, "learning_rate": 1.4053061905556484e-05, "logits/chosen": -0.8297621607780457, "logits/rejected": -0.8092527985572815, "logps/chosen": -86.57372283935547, "logps/rejected": -96.1981201171875, "loss": 0.4481, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7371745705604553, "rewards/margins": 1.3985055685043335, "rewards/rejected": -2.1356801986694336, "step": 3564 }, { "epoch": 0.5951943934590356, "grad_norm": 16.466703414916992, "learning_rate": 1.4048056065409646e-05, "logits/chosen": -0.6750979423522949, "logits/rejected": -0.708655059337616, "logps/chosen": -120.39261627197266, "logps/rejected": -140.53623962402344, "loss": 0.3631, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3681701421737671, "rewards/margins": 1.900381088256836, "rewards/rejected": -2.2685511112213135, "step": 3567 }, { "epoch": 0.5956949774737194, "grad_norm": 5.179401874542236, "learning_rate": 1.404305022526281e-05, "logits/chosen": -0.8885486125946045, "logits/rejected": -0.8709966540336609, "logps/chosen": -131.8573760986328, "logps/rejected": -171.6038055419922, "loss": 0.3276, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.5769221782684326, "rewards/margins": 0.538474977016449, "rewards/rejected": -3.1153972148895264, "step": 3570 }, { "epoch": 0.5961955614884031, "grad_norm": 18.77992820739746, "learning_rate": 1.403804438511597e-05, "logits/chosen": -0.8575289845466614, "logits/rejected": -0.7962117791175842, "logps/chosen": -120.1377182006836, "logps/rejected": -76.78935241699219, "loss": 0.4683, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6587031483650208, "rewards/margins": 0.8786017298698425, "rewards/rejected": -1.5373048782348633, "step": 3573 }, { "epoch": 0.5966961455030869, "grad_norm": 37.040592193603516, "learning_rate": 1.4033038544969131e-05, "logits/chosen": -0.6073973774909973, "logits/rejected": -0.6097031235694885, "logps/chosen": -76.85245513916016, "logps/rejected": -70.10204315185547, "loss": 0.8339, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4979132413864136, "rewards/margins": -0.1613425463438034, "rewards/rejected": -1.3365706205368042, "step": 3576 }, { "epoch": 0.5971967295177707, "grad_norm": 23.311368942260742, "learning_rate": 1.4028032704822295e-05, "logits/chosen": -0.47433722019195557, "logits/rejected": -0.553240180015564, "logps/chosen": -83.47762298583984, "logps/rejected": -124.47382354736328, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": -1.6362171173095703, "rewards/margins": 1.1231154203414917, "rewards/rejected": -2.7593324184417725, "step": 3579 }, { "epoch": 0.5976973135324545, "grad_norm": 22.502317428588867, "learning_rate": 1.4023026864675455e-05, "logits/chosen": -0.7691695690155029, "logits/rejected": -0.7958977222442627, "logps/chosen": -126.8828353881836, "logps/rejected": -118.77569580078125, "loss": 0.4293, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.6635783910751343, "rewards/margins": -0.10760700702667236, "rewards/rejected": -1.5559712648391724, "step": 3582 }, { "epoch": 0.5981978975471384, "grad_norm": 27.347593307495117, "learning_rate": 1.4018021024528618e-05, "logits/chosen": -0.8576633334159851, "logits/rejected": -0.8619591593742371, "logps/chosen": -48.9525146484375, "logps/rejected": -97.69403839111328, "loss": 0.4293, "rewards/accuracies": 1.0, "rewards/chosen": -0.05422784015536308, "rewards/margins": 3.745774507522583, "rewards/rejected": -3.8000028133392334, "step": 3585 }, { "epoch": 0.5986984815618221, "grad_norm": 9.068231582641602, "learning_rate": 1.401301518438178e-05, "logits/chosen": -0.899941623210907, "logits/rejected": -0.9216710925102234, "logps/chosen": -37.180259704589844, "logps/rejected": -74.72737884521484, "loss": 0.2049, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4900785684585571, "rewards/margins": 1.9344631433486938, "rewards/rejected": -3.424541711807251, "step": 3588 }, { "epoch": 0.5991990655765059, "grad_norm": 18.578990936279297, "learning_rate": 1.4008009344234944e-05, "logits/chosen": -0.8307339549064636, "logits/rejected": -0.8862643837928772, "logps/chosen": -58.8658447265625, "logps/rejected": -119.9117660522461, "loss": 0.6142, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6805469393730164, "rewards/margins": 1.7856144905090332, "rewards/rejected": -2.4661614894866943, "step": 3591 }, { "epoch": 0.5996996495911897, "grad_norm": 60.612606048583984, "learning_rate": 1.4003003504088104e-05, "logits/chosen": -0.8396165370941162, "logits/rejected": -0.8054821491241455, "logps/chosen": -126.1369857788086, "logps/rejected": -105.86101531982422, "loss": 0.7142, "rewards/accuracies": 1.0, "rewards/chosen": -1.9524644613265991, "rewards/margins": 0.5498924851417542, "rewards/rejected": -2.502357006072998, "step": 3594 }, { "epoch": 0.6002002336058735, "grad_norm": 27.029462814331055, "learning_rate": 1.3997997663941266e-05, "logits/chosen": -0.9626409411430359, "logits/rejected": -0.9379916191101074, "logps/chosen": -115.92578125, "logps/rejected": -115.0790023803711, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": -3.2969534397125244, "rewards/margins": 2.454153299331665, "rewards/rejected": -5.7511067390441895, "step": 3597 }, { "epoch": 0.6007008176205573, "grad_norm": 18.672399520874023, "learning_rate": 1.3992991823794429e-05, "logits/chosen": -0.8882007002830505, "logits/rejected": -0.761101245880127, "logps/chosen": -179.113525390625, "logps/rejected": -125.71673583984375, "loss": 0.8313, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9163711667060852, "rewards/margins": 1.2126659154891968, "rewards/rejected": -2.1290371417999268, "step": 3600 }, { "epoch": 0.6007008176205573, "eval_logits/chosen": -0.7481955289840698, "eval_logits/rejected": -0.7624042630195618, "eval_logps/chosen": -91.17363739013672, "eval_logps/rejected": -120.8929214477539, "eval_loss": 0.6150566935539246, "eval_rewards/accuracies": 0.7342342138290405, "eval_rewards/chosen": -1.0915907621383667, "eval_rewards/margins": 1.6848829984664917, "eval_rewards/rejected": -2.7764737606048584, "eval_runtime": 346.4536, "eval_samples_per_second": 7.689, "eval_steps_per_second": 1.922, "step": 3600 }, { "epoch": 0.6012014016352412, "grad_norm": 3.5154809951782227, "learning_rate": 1.3987985983647589e-05, "logits/chosen": -0.7724606990814209, "logits/rejected": -0.7787497639656067, "logps/chosen": -73.99282836914062, "logps/rejected": -110.84262084960938, "loss": 0.191, "rewards/accuracies": 1.0, "rewards/chosen": -0.7497537136077881, "rewards/margins": 3.0448970794677734, "rewards/rejected": -3.7946510314941406, "step": 3603 }, { "epoch": 0.6017019856499249, "grad_norm": 38.705265045166016, "learning_rate": 1.3982980143500753e-05, "logits/chosen": -0.7147386074066162, "logits/rejected": -0.8090341687202454, "logps/chosen": -68.66194915771484, "logps/rejected": -165.635986328125, "loss": 0.4132, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.665524959564209, "rewards/margins": 1.6099042892456055, "rewards/rejected": -3.2754290103912354, "step": 3606 }, { "epoch": 0.6022025696646087, "grad_norm": 37.87376403808594, "learning_rate": 1.3977974303353914e-05, "logits/chosen": -0.6444140672683716, "logits/rejected": -0.5914640426635742, "logps/chosen": -130.88157653808594, "logps/rejected": -74.04218292236328, "loss": 0.5667, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.7556371688842773, "rewards/margins": -0.4519789516925812, "rewards/rejected": -2.3036582469940186, "step": 3609 }, { "epoch": 0.6027031536792925, "grad_norm": 29.658838272094727, "learning_rate": 1.3972968463207075e-05, "logits/chosen": -0.9642267227172852, "logits/rejected": -0.9770674705505371, "logps/chosen": -94.60054779052734, "logps/rejected": -125.85721588134766, "loss": 0.2947, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.23276782035827637, "rewards/margins": 3.009251356124878, "rewards/rejected": -3.2420194149017334, "step": 3612 }, { "epoch": 0.6032037376939763, "grad_norm": 30.407407760620117, "learning_rate": 1.3967962623060238e-05, "logits/chosen": -0.7482969760894775, "logits/rejected": -0.6921799778938293, "logps/chosen": -145.03468322753906, "logps/rejected": -89.21546173095703, "loss": 0.9485, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.722100019454956, "rewards/margins": -1.6929435729980469, "rewards/rejected": -1.0291563272476196, "step": 3615 }, { "epoch": 0.6037043217086601, "grad_norm": 24.302146911621094, "learning_rate": 1.39629567829134e-05, "logits/chosen": -0.7972914576530457, "logits/rejected": -0.8242341876029968, "logps/chosen": -61.53010940551758, "logps/rejected": -120.7718734741211, "loss": 0.2839, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7182943224906921, "rewards/margins": 2.4765875339508057, "rewards/rejected": -3.1948821544647217, "step": 3618 }, { "epoch": 0.604204905723344, "grad_norm": 29.755901336669922, "learning_rate": 1.3957950942766563e-05, "logits/chosen": -0.7629032135009766, "logits/rejected": -0.8182432055473328, "logps/chosen": -87.44329071044922, "logps/rejected": -114.10138702392578, "loss": 0.8134, "rewards/accuracies": 1.0, "rewards/chosen": 0.4634365141391754, "rewards/margins": 2.529228925704956, "rewards/rejected": -2.0657923221588135, "step": 3621 }, { "epoch": 0.6047054897380277, "grad_norm": 15.802294731140137, "learning_rate": 1.3952945102619723e-05, "logits/chosen": -0.7391359210014343, "logits/rejected": -0.8312755227088928, "logps/chosen": -64.44116973876953, "logps/rejected": -146.29489135742188, "loss": 0.6089, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.803268313407898, "rewards/margins": 0.6862626671791077, "rewards/rejected": -2.4895308017730713, "step": 3624 }, { "epoch": 0.6052060737527115, "grad_norm": 33.15327453613281, "learning_rate": 1.3947939262472887e-05, "logits/chosen": -0.6984127163887024, "logits/rejected": -0.7123863101005554, "logps/chosen": -42.6080436706543, "logps/rejected": -54.33607864379883, "loss": 0.4114, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6214134693145752, "rewards/margins": -0.44662606716156006, "rewards/rejected": -0.17478741705417633, "step": 3627 }, { "epoch": 0.6057066577673953, "grad_norm": 44.0360221862793, "learning_rate": 1.3942933422326049e-05, "logits/chosen": -0.6052780747413635, "logits/rejected": -0.5985878109931946, "logps/chosen": -122.97552490234375, "logps/rejected": -157.6183319091797, "loss": 0.5321, "rewards/accuracies": 1.0, "rewards/chosen": -1.2700392007827759, "rewards/margins": 2.2562673091888428, "rewards/rejected": -3.526306390762329, "step": 3630 }, { "epoch": 0.6062072417820791, "grad_norm": 19.494482040405273, "learning_rate": 1.3937927582179209e-05, "logits/chosen": -0.6652117371559143, "logits/rejected": -0.7461144924163818, "logps/chosen": -26.75390625, "logps/rejected": -107.68221282958984, "loss": 0.4038, "rewards/accuracies": 1.0, "rewards/chosen": 0.5926342010498047, "rewards/margins": 3.4611995220184326, "rewards/rejected": -2.868565559387207, "step": 3633 }, { "epoch": 0.6067078257967629, "grad_norm": 28.393049240112305, "learning_rate": 1.3932921742032372e-05, "logits/chosen": -0.8216468691825867, "logits/rejected": -0.848490297794342, "logps/chosen": -59.57539367675781, "logps/rejected": -105.21442413330078, "loss": 0.3812, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.687013566493988, "rewards/margins": 1.3018114566802979, "rewards/rejected": -1.9888249635696411, "step": 3636 }, { "epoch": 0.6072084098114466, "grad_norm": 25.290115356445312, "learning_rate": 1.3927915901885534e-05, "logits/chosen": -0.7208300232887268, "logits/rejected": -0.7914465069770813, "logps/chosen": -41.38661575317383, "logps/rejected": -136.83795166015625, "loss": 1.2841, "rewards/accuracies": 1.0, "rewards/chosen": 0.22628509998321533, "rewards/margins": 5.588831424713135, "rewards/rejected": -5.362546920776367, "step": 3639 }, { "epoch": 0.6077089938261305, "grad_norm": 28.92360496520996, "learning_rate": 1.3922910061738698e-05, "logits/chosen": -0.7738639712333679, "logits/rejected": -0.7615019679069519, "logps/chosen": -94.44613647460938, "logps/rejected": -120.4881820678711, "loss": 0.6084, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.24429988861084, "rewards/margins": 0.15000446140766144, "rewards/rejected": -2.3943045139312744, "step": 3642 }, { "epoch": 0.6082095778408143, "grad_norm": 8.614665985107422, "learning_rate": 1.3917904221591858e-05, "logits/chosen": -0.6014029383659363, "logits/rejected": -0.5905436873435974, "logps/chosen": -80.52936553955078, "logps/rejected": -117.17362213134766, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": -0.8327329158782959, "rewards/margins": 2.009211778640747, "rewards/rejected": -2.841944694519043, "step": 3645 }, { "epoch": 0.6087101618554981, "grad_norm": 11.681370735168457, "learning_rate": 1.3912898381445021e-05, "logits/chosen": -0.6770000457763672, "logits/rejected": -0.7461182475090027, "logps/chosen": -69.92349243164062, "logps/rejected": -93.73300170898438, "loss": 0.6267, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2274490594863892, "rewards/margins": 0.6675510406494141, "rewards/rejected": -1.8950001001358032, "step": 3648 }, { "epoch": 0.6092107458701819, "grad_norm": 18.97062873840332, "learning_rate": 1.3907892541298183e-05, "logits/chosen": -0.8313279151916504, "logits/rejected": -0.8376786112785339, "logps/chosen": -83.58838653564453, "logps/rejected": -126.9832763671875, "loss": 0.4174, "rewards/accuracies": 1.0, "rewards/chosen": 0.21332065761089325, "rewards/margins": 4.613327980041504, "rewards/rejected": -4.4000067710876465, "step": 3651 }, { "epoch": 0.6097113298848656, "grad_norm": 21.94280433654785, "learning_rate": 1.3902886701151343e-05, "logits/chosen": -0.7527866363525391, "logits/rejected": -0.7496345043182373, "logps/chosen": -124.33883666992188, "logps/rejected": -104.8307876586914, "loss": 0.755, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.6816539764404297, "rewards/margins": -0.8257349133491516, "rewards/rejected": -2.8559188842773438, "step": 3654 }, { "epoch": 0.6102119138995494, "grad_norm": 21.338939666748047, "learning_rate": 1.3897880861004507e-05, "logits/chosen": -0.7578803896903992, "logits/rejected": -0.7144894599914551, "logps/chosen": -93.46894073486328, "logps/rejected": -93.66101837158203, "loss": 0.6932, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.43783149123191833, "rewards/margins": 0.5947220921516418, "rewards/rejected": -1.0325536727905273, "step": 3657 }, { "epoch": 0.6107124979142333, "grad_norm": 15.8025541305542, "learning_rate": 1.3892875020857668e-05, "logits/chosen": -0.8512821793556213, "logits/rejected": -0.7591829299926758, "logps/chosen": -105.13702392578125, "logps/rejected": -90.95342254638672, "loss": 0.5162, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9165321588516235, "rewards/margins": 1.4773601293563843, "rewards/rejected": -3.393892526626587, "step": 3660 }, { "epoch": 0.6112130819289171, "grad_norm": 2.2321701049804688, "learning_rate": 1.3887869180710832e-05, "logits/chosen": -0.7396469712257385, "logits/rejected": -0.7285152077674866, "logps/chosen": -102.79268646240234, "logps/rejected": -120.74103546142578, "loss": 0.2297, "rewards/accuracies": 1.0, "rewards/chosen": -0.582722008228302, "rewards/margins": 2.971389055252075, "rewards/rejected": -3.5541107654571533, "step": 3663 }, { "epoch": 0.6117136659436009, "grad_norm": 24.335405349731445, "learning_rate": 1.3882863340563992e-05, "logits/chosen": -0.7478136420249939, "logits/rejected": -0.7447915077209473, "logps/chosen": -102.99639892578125, "logps/rejected": -109.468017578125, "loss": 0.3662, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1539770364761353, "rewards/margins": 0.24310939013957977, "rewards/rejected": -1.397086501121521, "step": 3666 }, { "epoch": 0.6122142499582847, "grad_norm": 39.54582977294922, "learning_rate": 1.3877857500417154e-05, "logits/chosen": -0.8116670250892639, "logits/rejected": -0.7824513912200928, "logps/chosen": -103.57772064208984, "logps/rejected": -121.87345123291016, "loss": 0.5575, "rewards/accuracies": 1.0, "rewards/chosen": -0.4725029170513153, "rewards/margins": 2.9759762287139893, "rewards/rejected": -3.448479413986206, "step": 3669 }, { "epoch": 0.6127148339729684, "grad_norm": 12.359100341796875, "learning_rate": 1.3872851660270317e-05, "logits/chosen": -0.7957825660705566, "logits/rejected": -0.8194196820259094, "logps/chosen": -66.8180923461914, "logps/rejected": -125.02147674560547, "loss": 0.302, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6087946891784668, "rewards/margins": 0.8819169998168945, "rewards/rejected": -2.4907119274139404, "step": 3672 }, { "epoch": 0.6132154179876522, "grad_norm": 40.8309440612793, "learning_rate": 1.3867845820123477e-05, "logits/chosen": -0.7508284449577332, "logits/rejected": -0.7493165135383606, "logps/chosen": -105.4869613647461, "logps/rejected": -99.10530853271484, "loss": 0.7123, "rewards/accuracies": 1.0, "rewards/chosen": 0.06099460646510124, "rewards/margins": 1.4480210542678833, "rewards/rejected": -1.3870264291763306, "step": 3675 }, { "epoch": 0.6137160020023361, "grad_norm": 47.733131408691406, "learning_rate": 1.386283997997664e-05, "logits/chosen": -0.5892549157142639, "logits/rejected": -0.6664145588874817, "logps/chosen": -130.31434631347656, "logps/rejected": -155.1026153564453, "loss": 0.3347, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -5.08524751663208, "rewards/margins": 0.8926536440849304, "rewards/rejected": -5.977901458740234, "step": 3678 }, { "epoch": 0.6142165860170199, "grad_norm": 109.57313537597656, "learning_rate": 1.3857834139829803e-05, "logits/chosen": -0.8755441308021545, "logits/rejected": -0.8578584790229797, "logps/chosen": -130.0764617919922, "logps/rejected": -130.41546630859375, "loss": 0.7059, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8223564028739929, "rewards/margins": 3.1376726627349854, "rewards/rejected": -3.960028886795044, "step": 3681 }, { "epoch": 0.6147171700317037, "grad_norm": 33.96443557739258, "learning_rate": 1.3852828299682966e-05, "logits/chosen": -0.7477982640266418, "logits/rejected": -0.7891273498535156, "logps/chosen": -68.1647720336914, "logps/rejected": -86.10457611083984, "loss": 0.7559, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5083991885185242, "rewards/margins": 0.03687046095728874, "rewards/rejected": -0.5452696681022644, "step": 3684 }, { "epoch": 0.6152177540463875, "grad_norm": 39.95389175415039, "learning_rate": 1.3847822459536126e-05, "logits/chosen": -0.7492842674255371, "logits/rejected": -0.7390108108520508, "logps/chosen": -159.64566040039062, "logps/rejected": -133.974853515625, "loss": 0.871, "rewards/accuracies": 1.0, "rewards/chosen": -1.6169310808181763, "rewards/margins": 2.6061859130859375, "rewards/rejected": -4.223117351531982, "step": 3687 }, { "epoch": 0.6157183380610712, "grad_norm": 46.309024810791016, "learning_rate": 1.3842816619389288e-05, "logits/chosen": -0.7857804298400879, "logits/rejected": -0.8018128275871277, "logps/chosen": -116.8685073852539, "logps/rejected": -156.08775329589844, "loss": 0.873, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.3313119411468506, "rewards/margins": 3.1066510677337646, "rewards/rejected": -6.437963008880615, "step": 3690 }, { "epoch": 0.616218922075755, "grad_norm": 53.40496063232422, "learning_rate": 1.3837810779242451e-05, "logits/chosen": -0.7906641960144043, "logits/rejected": -0.7808987498283386, "logps/chosen": -81.82819366455078, "logps/rejected": -76.7664566040039, "loss": 0.9117, "rewards/accuracies": 0.0, "rewards/chosen": 0.35169294476509094, "rewards/margins": -0.7054157853126526, "rewards/rejected": 1.0571088790893555, "step": 3693 }, { "epoch": 0.6167195060904388, "grad_norm": 47.29351043701172, "learning_rate": 1.3832804939095612e-05, "logits/chosen": -0.9717433452606201, "logits/rejected": -0.9663674235343933, "logps/chosen": -67.94757080078125, "logps/rejected": -76.93327331542969, "loss": 0.5075, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2139297723770142, "rewards/margins": 1.5878266096115112, "rewards/rejected": -2.8017566204071045, "step": 3696 }, { "epoch": 0.6172200901051227, "grad_norm": 3.820040702819824, "learning_rate": 1.3827799098948775e-05, "logits/chosen": -0.7498636245727539, "logits/rejected": -0.7565471529960632, "logps/chosen": -66.78011322021484, "logps/rejected": -87.36504364013672, "loss": 0.4582, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9415845274925232, "rewards/margins": 1.193339228630066, "rewards/rejected": -2.1349236965179443, "step": 3699 }, { "epoch": 0.6177206741198065, "grad_norm": 14.793567657470703, "learning_rate": 1.3822793258801937e-05, "logits/chosen": -0.6088420152664185, "logits/rejected": -0.6726567149162292, "logps/chosen": -96.39456939697266, "logps/rejected": -126.9915542602539, "loss": 0.4863, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7624933123588562, "rewards/margins": 1.2789812088012695, "rewards/rejected": -2.0414743423461914, "step": 3702 }, { "epoch": 0.6182212581344902, "grad_norm": 13.278603553771973, "learning_rate": 1.38177874186551e-05, "logits/chosen": -0.8168366551399231, "logits/rejected": -0.8322046399116516, "logps/chosen": -48.6261100769043, "logps/rejected": -103.17626953125, "loss": 0.3072, "rewards/accuracies": 1.0, "rewards/chosen": -0.8174729943275452, "rewards/margins": 2.5800278186798096, "rewards/rejected": -3.39750075340271, "step": 3705 }, { "epoch": 0.618721842149174, "grad_norm": 5.879955768585205, "learning_rate": 1.381278157850826e-05, "logits/chosen": -0.7100064754486084, "logits/rejected": -0.7846580147743225, "logps/chosen": -99.4874496459961, "logps/rejected": -116.0735092163086, "loss": 0.6498, "rewards/accuracies": 1.0, "rewards/chosen": 0.1368710845708847, "rewards/margins": 2.0305709838867188, "rewards/rejected": -1.8936997652053833, "step": 3708 }, { "epoch": 0.6192224261638578, "grad_norm": 13.138276100158691, "learning_rate": 1.3807775738361422e-05, "logits/chosen": -0.9094951748847961, "logits/rejected": -0.8674309253692627, "logps/chosen": -128.0220489501953, "logps/rejected": -93.94905853271484, "loss": 0.4143, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6473612785339355, "rewards/margins": -0.09231161326169968, "rewards/rejected": -0.5550497174263, "step": 3711 }, { "epoch": 0.6197230101785416, "grad_norm": 42.11091613769531, "learning_rate": 1.3802769898214586e-05, "logits/chosen": -0.6136326193809509, "logits/rejected": -0.6436396241188049, "logps/chosen": -89.55062103271484, "logps/rejected": -75.4009017944336, "loss": 1.2092, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.3890419006347656, "rewards/margins": -2.2916274070739746, "rewards/rejected": -0.0974145159125328, "step": 3714 }, { "epoch": 0.6202235941932255, "grad_norm": 36.80853271484375, "learning_rate": 1.3797764058067746e-05, "logits/chosen": -0.8769809603691101, "logits/rejected": -0.8690071105957031, "logps/chosen": -78.18293762207031, "logps/rejected": -126.38638305664062, "loss": 0.929, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9333747625350952, "rewards/margins": 3.1141433715820312, "rewards/rejected": -5.047518253326416, "step": 3717 }, { "epoch": 0.6207241782079093, "grad_norm": 61.981048583984375, "learning_rate": 1.379275821792091e-05, "logits/chosen": -0.7201111912727356, "logits/rejected": -0.7847552299499512, "logps/chosen": -85.0939712524414, "logps/rejected": -130.20826721191406, "loss": 0.2236, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6509179472923279, "rewards/margins": 3.2916030883789062, "rewards/rejected": -3.942521095275879, "step": 3720 }, { "epoch": 0.621224762222593, "grad_norm": 46.624568939208984, "learning_rate": 1.3787752377774071e-05, "logits/chosen": -0.5472332239151001, "logits/rejected": -0.558699369430542, "logps/chosen": -84.78141021728516, "logps/rejected": -107.15966796875, "loss": 1.8159, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2649954557418823, "rewards/margins": 2.235461711883545, "rewards/rejected": -3.5004570484161377, "step": 3723 }, { "epoch": 0.6217253462372768, "grad_norm": 49.773162841796875, "learning_rate": 1.3782746537627231e-05, "logits/chosen": -0.8778378963470459, "logits/rejected": -0.8604547381401062, "logps/chosen": -116.30074310302734, "logps/rejected": -96.184326171875, "loss": 1.1115, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.372889757156372, "rewards/margins": 0.7829100489616394, "rewards/rejected": -3.1558001041412354, "step": 3726 }, { "epoch": 0.6222259302519606, "grad_norm": 16.706769943237305, "learning_rate": 1.3777740697480395e-05, "logits/chosen": -0.5505726933479309, "logits/rejected": -0.640671968460083, "logps/chosen": -75.23977661132812, "logps/rejected": -217.3577117919922, "loss": 0.5256, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6123656034469604, "rewards/margins": 2.297312021255493, "rewards/rejected": -3.909677505493164, "step": 3729 }, { "epoch": 0.6227265142666444, "grad_norm": 13.901041030883789, "learning_rate": 1.3772734857333556e-05, "logits/chosen": -0.5953988432884216, "logits/rejected": -0.6203976273536682, "logps/chosen": -95.35739135742188, "logps/rejected": -173.21205139160156, "loss": 0.4087, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1482704877853394, "rewards/margins": 1.059718370437622, "rewards/rejected": -2.207988739013672, "step": 3732 }, { "epoch": 0.6232270982813282, "grad_norm": 8.114547729492188, "learning_rate": 1.376772901718672e-05, "logits/chosen": -0.7372401356697083, "logits/rejected": -0.740159809589386, "logps/chosen": -72.47979736328125, "logps/rejected": -78.80126190185547, "loss": 0.341, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.07109350711107254, "rewards/margins": 0.8361225128173828, "rewards/rejected": -0.7650289535522461, "step": 3735 }, { "epoch": 0.623727682296012, "grad_norm": 16.410921096801758, "learning_rate": 1.376272317703988e-05, "logits/chosen": -0.5186137557029724, "logits/rejected": -0.6103330850601196, "logps/chosen": -46.41812515258789, "logps/rejected": -111.31404876708984, "loss": 0.6701, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6587492823600769, "rewards/margins": 2.1259605884552, "rewards/rejected": -2.784709930419922, "step": 3738 }, { "epoch": 0.6242282663106958, "grad_norm": 27.886117935180664, "learning_rate": 1.3757717336893044e-05, "logits/chosen": -0.7138764262199402, "logits/rejected": -0.6994056105613708, "logps/chosen": -90.36181640625, "logps/rejected": -89.26642608642578, "loss": 0.6279, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7984156608581543, "rewards/margins": 0.7091162800788879, "rewards/rejected": -2.5075318813323975, "step": 3741 }, { "epoch": 0.6247288503253796, "grad_norm": 27.06134605407715, "learning_rate": 1.3752711496746205e-05, "logits/chosen": -0.7159523367881775, "logits/rejected": -0.7168111205101013, "logps/chosen": -46.394779205322266, "logps/rejected": -67.39791107177734, "loss": 0.6294, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.0020607709884644, "rewards/margins": 1.8040552139282227, "rewards/rejected": -0.8019943833351135, "step": 3744 }, { "epoch": 0.6252294343400634, "grad_norm": 10.85600471496582, "learning_rate": 1.3747705656599365e-05, "logits/chosen": -0.663423478603363, "logits/rejected": -0.7271997928619385, "logps/chosen": -74.6313705444336, "logps/rejected": -140.96249389648438, "loss": 0.6617, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3405705392360687, "rewards/margins": 3.424703598022461, "rewards/rejected": -3.7652742862701416, "step": 3747 }, { "epoch": 0.6257300183547472, "grad_norm": 16.562654495239258, "learning_rate": 1.3742699816452529e-05, "logits/chosen": -0.7775673866271973, "logits/rejected": -0.7948353886604309, "logps/chosen": -167.22808837890625, "logps/rejected": -133.7283477783203, "loss": 1.069, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.03489963710308075, "rewards/margins": 1.2311304807662964, "rewards/rejected": -1.1962308883666992, "step": 3750 }, { "epoch": 0.626230602369431, "grad_norm": 8.501498222351074, "learning_rate": 1.373769397630569e-05, "logits/chosen": -0.7155146598815918, "logits/rejected": -0.7248470783233643, "logps/chosen": -107.825439453125, "logps/rejected": -160.5462646484375, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": -0.8394746780395508, "rewards/margins": 1.7606045007705688, "rewards/rejected": -2.600078821182251, "step": 3753 }, { "epoch": 0.6267311863841148, "grad_norm": 27.067869186401367, "learning_rate": 1.3732688136158854e-05, "logits/chosen": -0.47502195835113525, "logits/rejected": -0.49517807364463806, "logps/chosen": -120.14276885986328, "logps/rejected": -95.82235717773438, "loss": 0.2671, "rewards/accuracies": 1.0, "rewards/chosen": 0.4151419699192047, "rewards/margins": 1.9944463968276978, "rewards/rejected": -1.5793043375015259, "step": 3756 }, { "epoch": 0.6272317703987986, "grad_norm": 7.960994243621826, "learning_rate": 1.3727682296012014e-05, "logits/chosen": -0.7066046595573425, "logits/rejected": -0.6987916827201843, "logps/chosen": -84.2215805053711, "logps/rejected": -98.91452026367188, "loss": 0.2294, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1708321571350098, "rewards/margins": 0.7822666168212891, "rewards/rejected": -1.9530987739562988, "step": 3759 }, { "epoch": 0.6277323544134824, "grad_norm": 6.447353363037109, "learning_rate": 1.3722676455865178e-05, "logits/chosen": -0.6007698178291321, "logits/rejected": -0.6507043242454529, "logps/chosen": -51.31180191040039, "logps/rejected": -109.141845703125, "loss": 0.235, "rewards/accuracies": 1.0, "rewards/chosen": 0.9182875156402588, "rewards/margins": 1.616389274597168, "rewards/rejected": -0.6981016993522644, "step": 3762 }, { "epoch": 0.6282329384281662, "grad_norm": 12.723027229309082, "learning_rate": 1.371767061571834e-05, "logits/chosen": -0.6532497406005859, "logits/rejected": -0.6956233382225037, "logps/chosen": -44.658782958984375, "logps/rejected": -112.6042251586914, "loss": 0.5658, "rewards/accuracies": 1.0, "rewards/chosen": -0.21179288625717163, "rewards/margins": 2.9915754795074463, "rewards/rejected": -3.2033684253692627, "step": 3765 }, { "epoch": 0.62873352244285, "grad_norm": 13.072351455688477, "learning_rate": 1.37126647755715e-05, "logits/chosen": -0.8124515414237976, "logits/rejected": -0.8170760273933411, "logps/chosen": -77.63211822509766, "logps/rejected": -114.17452239990234, "loss": 0.3269, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.1615702360868454, "rewards/margins": 2.7111403942108154, "rewards/rejected": -2.549570322036743, "step": 3768 }, { "epoch": 0.6292341064575337, "grad_norm": 14.760712623596191, "learning_rate": 1.3707658935424663e-05, "logits/chosen": -0.6426565051078796, "logits/rejected": -0.7123411297798157, "logps/chosen": -54.20527267456055, "logps/rejected": -108.14238739013672, "loss": 0.6401, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2031671404838562, "rewards/margins": 0.7108917832374573, "rewards/rejected": -0.9140589237213135, "step": 3771 }, { "epoch": 0.6297346904722176, "grad_norm": 9.423932075500488, "learning_rate": 1.3702653095277825e-05, "logits/chosen": -0.6698747277259827, "logits/rejected": -0.6617714762687683, "logps/chosen": -100.78662872314453, "logps/rejected": -90.35222625732422, "loss": 0.3029, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6297032833099365, "rewards/margins": 0.9217438101768494, "rewards/rejected": -1.5514470338821411, "step": 3774 }, { "epoch": 0.6302352744869014, "grad_norm": 6.32108211517334, "learning_rate": 1.3697647255130989e-05, "logits/chosen": -0.8324887156486511, "logits/rejected": -0.8393046855926514, "logps/chosen": -97.81795501708984, "logps/rejected": -106.54029083251953, "loss": 0.4452, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8227876424789429, "rewards/margins": 0.9368588328361511, "rewards/rejected": -2.759646415710449, "step": 3777 }, { "epoch": 0.6307358585015852, "grad_norm": 20.20669937133789, "learning_rate": 1.3692641414984149e-05, "logits/chosen": -0.4365043640136719, "logits/rejected": -0.409647136926651, "logps/chosen": -72.19441986083984, "logps/rejected": -82.43185424804688, "loss": 0.4337, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8974690437316895, "rewards/margins": 0.3636782169342041, "rewards/rejected": -1.261147141456604, "step": 3780 }, { "epoch": 0.631236442516269, "grad_norm": 7.988563060760498, "learning_rate": 1.3687635574837312e-05, "logits/chosen": -0.48577094078063965, "logits/rejected": -0.4605521261692047, "logps/chosen": -98.21798706054688, "logps/rejected": -56.905029296875, "loss": 0.5717, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5275498628616333, "rewards/margins": -1.0959244966506958, "rewards/rejected": -0.431625097990036, "step": 3783 }, { "epoch": 0.6317370265309528, "grad_norm": 14.245180130004883, "learning_rate": 1.3682629734690474e-05, "logits/chosen": -0.812363862991333, "logits/rejected": -0.8088476657867432, "logps/chosen": -98.21813201904297, "logps/rejected": -95.8743896484375, "loss": 0.6148, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2963978052139282, "rewards/margins": -0.1097927913069725, "rewards/rejected": -1.1866050958633423, "step": 3786 }, { "epoch": 0.6322376105456365, "grad_norm": 24.327730178833008, "learning_rate": 1.3677623894543634e-05, "logits/chosen": -0.8506801724433899, "logits/rejected": -0.8079530596733093, "logps/chosen": -88.10332489013672, "logps/rejected": -83.4821548461914, "loss": 0.7929, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.878675639629364, "rewards/margins": 1.0798922777175903, "rewards/rejected": -1.9585679769515991, "step": 3789 }, { "epoch": 0.6327381945603203, "grad_norm": 13.01898193359375, "learning_rate": 1.3672618054396797e-05, "logits/chosen": -0.7081955075263977, "logits/rejected": -0.6876566410064697, "logps/chosen": -71.6161880493164, "logps/rejected": -65.15621185302734, "loss": 0.4963, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.426789253950119, "rewards/margins": 0.730303943157196, "rewards/rejected": -1.1570931673049927, "step": 3792 }, { "epoch": 0.6332387785750042, "grad_norm": 46.0858154296875, "learning_rate": 1.366761221424996e-05, "logits/chosen": -0.6563850045204163, "logits/rejected": -0.6887515187263489, "logps/chosen": -101.53833770751953, "logps/rejected": -96.63510131835938, "loss": 0.8948, "rewards/accuracies": 0.0, "rewards/chosen": -1.1590808629989624, "rewards/margins": -1.088503360748291, "rewards/rejected": -0.07057736068964005, "step": 3795 }, { "epoch": 0.633739362589688, "grad_norm": 27.29522132873535, "learning_rate": 1.3662606374103123e-05, "logits/chosen": -0.8152391314506531, "logits/rejected": -0.8300299048423767, "logps/chosen": -92.16927337646484, "logps/rejected": -90.92361450195312, "loss": 0.3985, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0861626863479614, "rewards/margins": 0.8594054579734802, "rewards/rejected": -1.9455682039260864, "step": 3798 }, { "epoch": 0.6342399466043718, "grad_norm": 37.18444061279297, "learning_rate": 1.3657600533956283e-05, "logits/chosen": -0.7584750652313232, "logits/rejected": -0.6859747767448425, "logps/chosen": -98.11197662353516, "logps/rejected": -73.15481567382812, "loss": 0.6173, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.8443485498428345, "rewards/margins": 0.7447602152824402, "rewards/rejected": -2.58910870552063, "step": 3801 }, { "epoch": 0.6347405306190556, "grad_norm": 33.65853500366211, "learning_rate": 1.3652594693809445e-05, "logits/chosen": -0.9394801259040833, "logits/rejected": -0.9261832237243652, "logps/chosen": -77.71497344970703, "logps/rejected": -81.36565399169922, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": -0.9529868960380554, "rewards/margins": 0.8527737259864807, "rewards/rejected": -1.8057607412338257, "step": 3804 }, { "epoch": 0.6352411146337393, "grad_norm": 47.441917419433594, "learning_rate": 1.3647588853662608e-05, "logits/chosen": -0.5658226013183594, "logits/rejected": -0.5615108609199524, "logps/chosen": -58.78451156616211, "logps/rejected": -78.6031265258789, "loss": 0.5876, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6181034445762634, "rewards/margins": 0.8910704255104065, "rewards/rejected": -1.5091739892959595, "step": 3807 }, { "epoch": 0.6357416986484231, "grad_norm": 16.31043815612793, "learning_rate": 1.3642583013515768e-05, "logits/chosen": -0.8851048946380615, "logits/rejected": -0.8727499842643738, "logps/chosen": -84.61016845703125, "logps/rejected": -87.3841323852539, "loss": 0.2897, "rewards/accuracies": 1.0, "rewards/chosen": 1.1293359994888306, "rewards/margins": 3.204922914505005, "rewards/rejected": -2.075587272644043, "step": 3810 }, { "epoch": 0.636242282663107, "grad_norm": 38.454524993896484, "learning_rate": 1.3637577173368932e-05, "logits/chosen": -0.6404187083244324, "logits/rejected": -0.7345861792564392, "logps/chosen": -74.73320770263672, "logps/rejected": -121.07879638671875, "loss": 0.8108, "rewards/accuracies": 1.0, "rewards/chosen": 0.21763570606708527, "rewards/margins": 1.1016393899917603, "rewards/rejected": -0.8840036988258362, "step": 3813 }, { "epoch": 0.6367428666777908, "grad_norm": 44.74995040893555, "learning_rate": 1.3632571333222094e-05, "logits/chosen": -0.6271951198577881, "logits/rejected": -0.6496250033378601, "logps/chosen": -60.395538330078125, "logps/rejected": -113.47967529296875, "loss": 0.4449, "rewards/accuracies": 1.0, "rewards/chosen": -0.6711323261260986, "rewards/margins": 1.6976553201675415, "rewards/rejected": -2.3687875270843506, "step": 3816 }, { "epoch": 0.6372434506924746, "grad_norm": 32.266990661621094, "learning_rate": 1.3627565493075257e-05, "logits/chosen": -0.7139607071876526, "logits/rejected": -0.729050874710083, "logps/chosen": -92.1675796508789, "logps/rejected": -99.39786529541016, "loss": 0.9494, "rewards/accuracies": 1.0, "rewards/chosen": -0.15504249930381775, "rewards/margins": 2.4290168285369873, "rewards/rejected": -2.584059476852417, "step": 3819 }, { "epoch": 0.6377440347071583, "grad_norm": 31.20060920715332, "learning_rate": 1.3622559652928417e-05, "logits/chosen": -0.6755185723304749, "logits/rejected": -0.6812178492546082, "logps/chosen": -81.6805648803711, "logps/rejected": -88.74105072021484, "loss": 0.7012, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9771537184715271, "rewards/margins": 0.4002401828765869, "rewards/rejected": -1.3773938417434692, "step": 3822 }, { "epoch": 0.6382446187218421, "grad_norm": 9.718733787536621, "learning_rate": 1.3617553812781579e-05, "logits/chosen": -0.7081522941589355, "logits/rejected": -0.7315641045570374, "logps/chosen": -57.72343063354492, "logps/rejected": -119.27365112304688, "loss": 0.3068, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.21628709137439728, "rewards/margins": 1.4401253461837769, "rewards/rejected": -1.6564124822616577, "step": 3825 }, { "epoch": 0.6387452027365259, "grad_norm": 31.460193634033203, "learning_rate": 1.3612547972634742e-05, "logits/chosen": -0.7298641800880432, "logits/rejected": -0.7313587069511414, "logps/chosen": -108.60237884521484, "logps/rejected": -178.33067321777344, "loss": 0.8166, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 1.0032023191452026, "rewards/margins": 1.8505891561508179, "rewards/rejected": -0.8473868370056152, "step": 3828 }, { "epoch": 0.6392457867512098, "grad_norm": 41.952144622802734, "learning_rate": 1.3607542132487903e-05, "logits/chosen": -0.6410872936248779, "logits/rejected": -0.660472571849823, "logps/chosen": -87.92645263671875, "logps/rejected": -130.56787109375, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 0.24651272594928741, "rewards/margins": 3.1829922199249268, "rewards/rejected": -2.9364795684814453, "step": 3831 }, { "epoch": 0.6397463707658936, "grad_norm": 8.638508796691895, "learning_rate": 1.3602536292341066e-05, "logits/chosen": -0.49920201301574707, "logits/rejected": -0.5067345499992371, "logps/chosen": -57.17620849609375, "logps/rejected": -99.40709686279297, "loss": 1.0576, "rewards/accuracies": 1.0, "rewards/chosen": 0.045172374695539474, "rewards/margins": 0.7288658022880554, "rewards/rejected": -0.6836934089660645, "step": 3834 }, { "epoch": 0.6402469547805774, "grad_norm": 9.318795204162598, "learning_rate": 1.3597530452194228e-05, "logits/chosen": -0.6588925719261169, "logits/rejected": -0.6685873866081238, "logps/chosen": -47.83244323730469, "logps/rejected": -95.95272827148438, "loss": 0.2231, "rewards/accuracies": 1.0, "rewards/chosen": -0.11689478904008865, "rewards/margins": 2.0034284591674805, "rewards/rejected": -2.1203231811523438, "step": 3837 }, { "epoch": 0.6407475387952611, "grad_norm": 8.664215087890625, "learning_rate": 1.3592524612047391e-05, "logits/chosen": -0.768385648727417, "logits/rejected": -0.7260382771492004, "logps/chosen": -111.7181396484375, "logps/rejected": -99.30963134765625, "loss": 0.1142, "rewards/accuracies": 1.0, "rewards/chosen": 0.9107217788696289, "rewards/margins": 3.356198310852051, "rewards/rejected": -2.445476531982422, "step": 3840 }, { "epoch": 0.6412481228099449, "grad_norm": 35.222686767578125, "learning_rate": 1.3587518771900551e-05, "logits/chosen": -0.8491737246513367, "logits/rejected": -0.84529048204422, "logps/chosen": -36.26823806762695, "logps/rejected": -88.55216217041016, "loss": 0.416, "rewards/accuracies": 1.0, "rewards/chosen": 0.7790720462799072, "rewards/margins": 3.532578706741333, "rewards/rejected": -2.753506660461426, "step": 3843 }, { "epoch": 0.6417487068246287, "grad_norm": 36.827999114990234, "learning_rate": 1.3582512931753713e-05, "logits/chosen": -0.6554626822471619, "logits/rejected": -0.6430365443229675, "logps/chosen": -164.58566284179688, "logps/rejected": -113.80087280273438, "loss": 1.2092, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -4.36636209487915, "rewards/margins": -2.760089159011841, "rewards/rejected": -1.6062730550765991, "step": 3846 }, { "epoch": 0.6422492908393125, "grad_norm": 7.526715278625488, "learning_rate": 1.3577507091606877e-05, "logits/chosen": -0.6078572273254395, "logits/rejected": -0.6724982857704163, "logps/chosen": -94.12779998779297, "logps/rejected": -148.0711669921875, "loss": 0.2226, "rewards/accuracies": 1.0, "rewards/chosen": 0.9246876835823059, "rewards/margins": 1.145473599433899, "rewards/rejected": -0.22078602015972137, "step": 3849 }, { "epoch": 0.6427498748539964, "grad_norm": 34.45322036743164, "learning_rate": 1.3572501251460037e-05, "logits/chosen": -0.7943975329399109, "logits/rejected": -0.749057948589325, "logps/chosen": -88.52823638916016, "logps/rejected": -129.46156311035156, "loss": 0.3678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9036082625389099, "rewards/margins": 3.1048624515533447, "rewards/rejected": -2.201254367828369, "step": 3852 }, { "epoch": 0.6432504588686802, "grad_norm": 29.768844604492188, "learning_rate": 1.35674954113132e-05, "logits/chosen": -0.6525542140007019, "logits/rejected": -0.7059590220451355, "logps/chosen": -65.2030258178711, "logps/rejected": -123.77587890625, "loss": 0.5004, "rewards/accuracies": 1.0, "rewards/chosen": -0.04997545853257179, "rewards/margins": 0.8516402244567871, "rewards/rejected": -0.9016156196594238, "step": 3855 }, { "epoch": 0.6437510428833639, "grad_norm": 33.88640213012695, "learning_rate": 1.3562489571166362e-05, "logits/chosen": -0.5788738131523132, "logits/rejected": -0.6893495917320251, "logps/chosen": -109.27765655517578, "logps/rejected": -183.4861602783203, "loss": 0.4615, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5147284269332886, "rewards/margins": 2.112368583679199, "rewards/rejected": -3.6270971298217773, "step": 3858 }, { "epoch": 0.6442516268980477, "grad_norm": 19.384227752685547, "learning_rate": 1.3557483731019522e-05, "logits/chosen": -0.5060186386108398, "logits/rejected": -0.7601985931396484, "logps/chosen": -32.124210357666016, "logps/rejected": -143.7655487060547, "loss": 0.2671, "rewards/accuracies": 1.0, "rewards/chosen": 0.6714784502983093, "rewards/margins": 4.662501811981201, "rewards/rejected": -3.991023302078247, "step": 3861 }, { "epoch": 0.6447522109127315, "grad_norm": 17.216156005859375, "learning_rate": 1.3552477890872686e-05, "logits/chosen": -0.7339315414428711, "logits/rejected": -0.7668974995613098, "logps/chosen": -66.41708374023438, "logps/rejected": -95.42474365234375, "loss": 0.4986, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.49084386229515076, "rewards/margins": 0.4178995192050934, "rewards/rejected": -0.9087433218955994, "step": 3864 }, { "epoch": 0.6452527949274153, "grad_norm": 88.28113555908203, "learning_rate": 1.3547472050725847e-05, "logits/chosen": -0.8672325611114502, "logits/rejected": -0.9337356090545654, "logps/chosen": -109.99019622802734, "logps/rejected": -132.08876037597656, "loss": 0.9455, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.4150702953338623, "rewards/margins": -0.7525394558906555, "rewards/rejected": -2.6625306606292725, "step": 3867 }, { "epoch": 0.6457533789420992, "grad_norm": 28.962730407714844, "learning_rate": 1.3542466210579011e-05, "logits/chosen": -0.7636897563934326, "logits/rejected": -0.8306653499603271, "logps/chosen": -50.18764114379883, "logps/rejected": -136.07843017578125, "loss": 0.4793, "rewards/accuracies": 1.0, "rewards/chosen": 0.3808589279651642, "rewards/margins": 5.197657585144043, "rewards/rejected": -4.816798686981201, "step": 3870 }, { "epoch": 0.646253962956783, "grad_norm": 48.00382614135742, "learning_rate": 1.3537460370432171e-05, "logits/chosen": -0.7396354675292969, "logits/rejected": -0.7209523320198059, "logps/chosen": -119.92147064208984, "logps/rejected": -88.8917236328125, "loss": 0.8662, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0903226137161255, "rewards/margins": -1.4712632894515991, "rewards/rejected": 0.3809405565261841, "step": 3873 }, { "epoch": 0.6467545469714667, "grad_norm": 36.18041229248047, "learning_rate": 1.3532454530285335e-05, "logits/chosen": -0.8032824993133545, "logits/rejected": -0.7642245292663574, "logps/chosen": -160.96148681640625, "logps/rejected": -100.4691162109375, "loss": 0.7024, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.790785551071167, "rewards/margins": -1.1303075551986694, "rewards/rejected": -1.660477638244629, "step": 3876 }, { "epoch": 0.6472551309861505, "grad_norm": 2.435509443283081, "learning_rate": 1.3527448690138496e-05, "logits/chosen": -0.781780481338501, "logits/rejected": -0.7927579879760742, "logps/chosen": -69.72189331054688, "logps/rejected": -119.79419708251953, "loss": 0.5157, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2330870628356934, "rewards/margins": 1.8348253965377808, "rewards/rejected": -3.0679123401641846, "step": 3879 }, { "epoch": 0.6477557150008343, "grad_norm": 13.029290199279785, "learning_rate": 1.3522442849991656e-05, "logits/chosen": -0.7255236506462097, "logits/rejected": -0.729985237121582, "logps/chosen": -56.38688278198242, "logps/rejected": -96.11298370361328, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": -0.4379267394542694, "rewards/margins": 1.602614402770996, "rewards/rejected": -2.040541172027588, "step": 3882 }, { "epoch": 0.6482562990155181, "grad_norm": 64.90553283691406, "learning_rate": 1.351743700984482e-05, "logits/chosen": -0.42768922448158264, "logits/rejected": -0.4697777330875397, "logps/chosen": -95.52923583984375, "logps/rejected": -149.9443359375, "loss": 0.6508, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8152791857719421, "rewards/margins": 1.9922733306884766, "rewards/rejected": -2.8075523376464844, "step": 3885 }, { "epoch": 0.6487568830302018, "grad_norm": 22.101192474365234, "learning_rate": 1.3512431169697982e-05, "logits/chosen": -0.692406952381134, "logits/rejected": -0.7148633003234863, "logps/chosen": -56.57781982421875, "logps/rejected": -124.55696868896484, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 0.753600537776947, "rewards/margins": 3.5339832305908203, "rewards/rejected": -2.7803828716278076, "step": 3888 }, { "epoch": 0.6492574670448857, "grad_norm": 26.601491928100586, "learning_rate": 1.3507425329551145e-05, "logits/chosen": -0.757631778717041, "logits/rejected": -0.7828976511955261, "logps/chosen": -54.59071731567383, "logps/rejected": -110.0931396484375, "loss": 0.6565, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.28293076157569885, "rewards/margins": 3.761087417602539, "rewards/rejected": -3.478156805038452, "step": 3891 }, { "epoch": 0.6497580510595695, "grad_norm": 56.077674865722656, "learning_rate": 1.3502419489404305e-05, "logits/chosen": -0.8331792950630188, "logits/rejected": -0.783585250377655, "logps/chosen": -94.79953002929688, "logps/rejected": -62.62641906738281, "loss": 1.3101, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.139519214630127, "rewards/margins": -1.1620640754699707, "rewards/rejected": -0.9774551391601562, "step": 3894 }, { "epoch": 0.6502586350742533, "grad_norm": 8.940820693969727, "learning_rate": 1.3497413649257469e-05, "logits/chosen": -0.7475094199180603, "logits/rejected": -0.7408905625343323, "logps/chosen": -155.342041015625, "logps/rejected": -143.28541564941406, "loss": 0.4616, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.413692831993103, "rewards/margins": 1.1963220834732056, "rewards/rejected": -2.6100149154663086, "step": 3897 }, { "epoch": 0.6507592190889371, "grad_norm": 24.264488220214844, "learning_rate": 1.349240780911063e-05, "logits/chosen": -0.5283150672912598, "logits/rejected": -0.5612251162528992, "logps/chosen": -74.57353973388672, "logps/rejected": -151.0901336669922, "loss": 0.7392, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1820648908615112, "rewards/margins": 0.32091793417930603, "rewards/rejected": -1.50298273563385, "step": 3900 }, { "epoch": 0.6512598031036209, "grad_norm": 15.878546714782715, "learning_rate": 1.348740196896379e-05, "logits/chosen": -0.6685652136802673, "logits/rejected": -0.6636133790016174, "logps/chosen": -61.86570358276367, "logps/rejected": -85.7435073852539, "loss": 0.3519, "rewards/accuracies": 1.0, "rewards/chosen": -0.6360189318656921, "rewards/margins": 1.5043396949768066, "rewards/rejected": -2.1403586864471436, "step": 3903 }, { "epoch": 0.6517603871183046, "grad_norm": 17.058446884155273, "learning_rate": 1.3482396128816954e-05, "logits/chosen": -0.8016048073768616, "logits/rejected": -0.794766366481781, "logps/chosen": -104.6098861694336, "logps/rejected": -148.922607421875, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": -1.2611011266708374, "rewards/margins": 2.774805784225464, "rewards/rejected": -4.0359063148498535, "step": 3906 }, { "epoch": 0.6522609711329885, "grad_norm": 24.31184196472168, "learning_rate": 1.3477390288670116e-05, "logits/chosen": -0.76017165184021, "logits/rejected": -0.7855933308601379, "logps/chosen": -55.536956787109375, "logps/rejected": -111.75881958007812, "loss": 0.4934, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0935871601104736, "rewards/margins": 1.3240406513214111, "rewards/rejected": -2.4176278114318848, "step": 3909 }, { "epoch": 0.6527615551476723, "grad_norm": 16.0672664642334, "learning_rate": 1.347238444852328e-05, "logits/chosen": -0.7531471252441406, "logits/rejected": -0.7061173319816589, "logps/chosen": -145.75262451171875, "logps/rejected": -92.12137603759766, "loss": 0.5018, "rewards/accuracies": 0.0, "rewards/chosen": -1.4308892488479614, "rewards/margins": -0.6239004731178284, "rewards/rejected": -0.8069887757301331, "step": 3912 }, { "epoch": 0.6532621391623561, "grad_norm": 104.06474304199219, "learning_rate": 1.346737860837644e-05, "logits/chosen": -0.7779243588447571, "logits/rejected": -0.798250138759613, "logps/chosen": -92.47241973876953, "logps/rejected": -109.6016616821289, "loss": 0.6093, "rewards/accuracies": 1.0, "rewards/chosen": -0.23026223480701447, "rewards/margins": 2.638169050216675, "rewards/rejected": -2.868431329727173, "step": 3915 }, { "epoch": 0.6537627231770399, "grad_norm": 36.183921813964844, "learning_rate": 1.3462372768229601e-05, "logits/chosen": -0.5497646927833557, "logits/rejected": -0.6149067878723145, "logps/chosen": -69.34217071533203, "logps/rejected": -139.15061950683594, "loss": 0.7672, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6083363890647888, "rewards/margins": 3.5181338787078857, "rewards/rejected": -4.12647008895874, "step": 3918 }, { "epoch": 0.6542633071917237, "grad_norm": 11.344193458557129, "learning_rate": 1.3457366928082765e-05, "logits/chosen": -0.729245662689209, "logits/rejected": -0.7040767669677734, "logps/chosen": -112.27376556396484, "logps/rejected": -92.31305694580078, "loss": 0.6602, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7925992012023926, "rewards/margins": 0.5153965353965759, "rewards/rejected": -1.3079957962036133, "step": 3921 }, { "epoch": 0.6547638912064074, "grad_norm": 25.947956085205078, "learning_rate": 1.3452361087935925e-05, "logits/chosen": -0.767037570476532, "logits/rejected": -0.7303327918052673, "logps/chosen": -134.0308380126953, "logps/rejected": -79.17821502685547, "loss": 0.8593, "rewards/accuracies": 0.0, "rewards/chosen": -2.7371740341186523, "rewards/margins": -1.1049976348876953, "rewards/rejected": -1.6321762800216675, "step": 3924 }, { "epoch": 0.6552644752210913, "grad_norm": 24.68844985961914, "learning_rate": 1.3447355247789088e-05, "logits/chosen": -0.6827986836433411, "logits/rejected": -0.6817951798439026, "logps/chosen": -90.9841537475586, "logps/rejected": -88.28705596923828, "loss": 1.1641, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2825089693069458, "rewards/margins": 0.5448451042175293, "rewards/rejected": -1.8273543119430542, "step": 3927 }, { "epoch": 0.6557650592357751, "grad_norm": 31.635133743286133, "learning_rate": 1.344234940764225e-05, "logits/chosen": -0.6064911484718323, "logits/rejected": -0.6022898554801941, "logps/chosen": -127.52466583251953, "logps/rejected": -117.99420928955078, "loss": 0.7519, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2813776731491089, "rewards/margins": -0.6023181080818176, "rewards/rejected": -0.6790595650672913, "step": 3930 }, { "epoch": 0.6562656432504589, "grad_norm": 36.366268157958984, "learning_rate": 1.3437343567495414e-05, "logits/chosen": -0.7303915619850159, "logits/rejected": -0.7961094975471497, "logps/chosen": -80.45352172851562, "logps/rejected": -149.43141174316406, "loss": 0.666, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.2985305786132812, "rewards/margins": 2.445749044418335, "rewards/rejected": -5.744279384613037, "step": 3933 }, { "epoch": 0.6567662272651427, "grad_norm": 34.498130798339844, "learning_rate": 1.3432337727348574e-05, "logits/chosen": -0.7957926392555237, "logits/rejected": -0.8644525408744812, "logps/chosen": -134.0364990234375, "logps/rejected": -164.44093322753906, "loss": 0.7793, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.489223599433899, "rewards/margins": 0.9500706791877747, "rewards/rejected": -2.4392945766448975, "step": 3936 }, { "epoch": 0.6572668112798264, "grad_norm": 13.601644515991211, "learning_rate": 1.3427331887201736e-05, "logits/chosen": -0.8011698722839355, "logits/rejected": -0.8228297233581543, "logps/chosen": -81.63623809814453, "logps/rejected": -105.90546417236328, "loss": 0.6501, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7111364603042603, "rewards/margins": 1.7349032163619995, "rewards/rejected": -3.4460394382476807, "step": 3939 }, { "epoch": 0.6577673952945102, "grad_norm": 38.338172912597656, "learning_rate": 1.3422326047054899e-05, "logits/chosen": -0.7373456954956055, "logits/rejected": -0.6960366368293762, "logps/chosen": -142.19241333007812, "logps/rejected": -108.38082885742188, "loss": 0.5926, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0073916912078857, "rewards/margins": -0.9645892977714539, "rewards/rejected": -1.0428024530410767, "step": 3942 }, { "epoch": 0.658267979309194, "grad_norm": 31.34670639038086, "learning_rate": 1.341732020690806e-05, "logits/chosen": -0.7228819727897644, "logits/rejected": -0.7304533123970032, "logps/chosen": -81.3130111694336, "logps/rejected": -118.47322845458984, "loss": 0.2701, "rewards/accuracies": 1.0, "rewards/chosen": -1.2330089807510376, "rewards/margins": 1.7708088159561157, "rewards/rejected": -3.0038177967071533, "step": 3945 }, { "epoch": 0.6587685633238779, "grad_norm": 5.1499552726745605, "learning_rate": 1.3412314366761223e-05, "logits/chosen": -0.7146205902099609, "logits/rejected": -0.7063226103782654, "logps/chosen": -77.42945098876953, "logps/rejected": -116.43199920654297, "loss": 0.1399, "rewards/accuracies": 1.0, "rewards/chosen": -0.27250292897224426, "rewards/margins": 2.376340627670288, "rewards/rejected": -2.64884352684021, "step": 3948 }, { "epoch": 0.6592691473385617, "grad_norm": 12.325276374816895, "learning_rate": 1.3407308526614385e-05, "logits/chosen": -0.580766499042511, "logits/rejected": -0.6357859969139099, "logps/chosen": -57.97957229614258, "logps/rejected": -135.6875762939453, "loss": 0.3696, "rewards/accuracies": 1.0, "rewards/chosen": -0.0726168155670166, "rewards/margins": 4.882521152496338, "rewards/rejected": -4.955137729644775, "step": 3951 }, { "epoch": 0.6597697313532455, "grad_norm": 19.596105575561523, "learning_rate": 1.3402302686467548e-05, "logits/chosen": -0.6405529975891113, "logits/rejected": -0.6646207571029663, "logps/chosen": -86.43350219726562, "logps/rejected": -122.89115142822266, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": -1.0948482751846313, "rewards/margins": 2.5840044021606445, "rewards/rejected": -3.6788527965545654, "step": 3954 }, { "epoch": 0.6602703153679292, "grad_norm": 47.48630142211914, "learning_rate": 1.3397296846320708e-05, "logits/chosen": -0.4238930642604828, "logits/rejected": -0.45892414450645447, "logps/chosen": -67.3498764038086, "logps/rejected": -122.8472900390625, "loss": 1.2041, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3900695741176605, "rewards/margins": 0.6601713299751282, "rewards/rejected": -1.0502408742904663, "step": 3957 }, { "epoch": 0.660770899382613, "grad_norm": 13.88912296295166, "learning_rate": 1.339229100617387e-05, "logits/chosen": -0.8103920817375183, "logits/rejected": -0.8107448220252991, "logps/chosen": -80.60140991210938, "logps/rejected": -120.84369659423828, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": -0.5194917321205139, "rewards/margins": 1.9162297248840332, "rewards/rejected": -2.4357213973999023, "step": 3960 }, { "epoch": 0.6612714833972968, "grad_norm": 12.568327903747559, "learning_rate": 1.3387285166027033e-05, "logits/chosen": -0.6923286318778992, "logits/rejected": -0.7248234748840332, "logps/chosen": -45.76041793823242, "logps/rejected": -104.2751235961914, "loss": 0.799, "rewards/accuracies": 1.0, "rewards/chosen": 0.22558479011058807, "rewards/margins": 2.2603681087493896, "rewards/rejected": -2.034783363342285, "step": 3963 }, { "epoch": 0.6617720674119807, "grad_norm": 25.1223087310791, "learning_rate": 1.3382279325880193e-05, "logits/chosen": -0.776827335357666, "logits/rejected": -0.781287431716919, "logps/chosen": -115.375, "logps/rejected": -84.04248809814453, "loss": 0.7232, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.142118453979492, "rewards/margins": -0.12808187305927277, "rewards/rejected": -3.0140364170074463, "step": 3966 }, { "epoch": 0.6622726514266645, "grad_norm": 11.222503662109375, "learning_rate": 1.3377273485733357e-05, "logits/chosen": -0.6840665936470032, "logits/rejected": -0.626481294631958, "logps/chosen": -105.82262420654297, "logps/rejected": -100.81501007080078, "loss": 0.6897, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.082260847091675, "rewards/margins": 0.7482894062995911, "rewards/rejected": -3.830549955368042, "step": 3969 }, { "epoch": 0.6627732354413483, "grad_norm": 61.30501937866211, "learning_rate": 1.3372267645586519e-05, "logits/chosen": -0.5348632335662842, "logits/rejected": -0.5272796154022217, "logps/chosen": -58.31698226928711, "logps/rejected": -51.40556335449219, "loss": 0.7421, "rewards/accuracies": 0.0, "rewards/chosen": -0.3353695571422577, "rewards/margins": -0.5697693228721619, "rewards/rejected": 0.23439975082874298, "step": 3972 }, { "epoch": 0.663273819456032, "grad_norm": 11.218260765075684, "learning_rate": 1.3367261805439679e-05, "logits/chosen": -0.6524367928504944, "logits/rejected": -0.6781823635101318, "logps/chosen": -82.16259765625, "logps/rejected": -125.23394012451172, "loss": 0.4718, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6063655614852905, "rewards/margins": 1.2227791547775269, "rewards/rejected": -2.8291447162628174, "step": 3975 }, { "epoch": 0.6637744034707158, "grad_norm": 17.327651977539062, "learning_rate": 1.3362255965292842e-05, "logits/chosen": -0.7775492668151855, "logits/rejected": -0.7590801119804382, "logps/chosen": -120.8515396118164, "logps/rejected": -104.36660766601562, "loss": 0.3666, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8724279403686523, "rewards/margins": 0.9839197993278503, "rewards/rejected": -1.856347680091858, "step": 3978 }, { "epoch": 0.6642749874853996, "grad_norm": 16.704837799072266, "learning_rate": 1.3357250125146004e-05, "logits/chosen": -0.682022750377655, "logits/rejected": -0.7185208201408386, "logps/chosen": -70.08126831054688, "logps/rejected": -157.84800720214844, "loss": 0.4156, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6593943238258362, "rewards/margins": 1.6513592004776, "rewards/rejected": -2.310753583908081, "step": 3981 }, { "epoch": 0.6647755715000835, "grad_norm": 16.375452041625977, "learning_rate": 1.3352244284999168e-05, "logits/chosen": -0.758526086807251, "logits/rejected": -0.6814786791801453, "logps/chosen": -104.50057220458984, "logps/rejected": -95.10027313232422, "loss": 0.6571, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8283523917198181, "rewards/margins": 1.0476378202438354, "rewards/rejected": -1.8759900331497192, "step": 3984 }, { "epoch": 0.6652761555147673, "grad_norm": 37.48934555053711, "learning_rate": 1.3347238444852328e-05, "logits/chosen": -0.6955100893974304, "logits/rejected": -0.6172997951507568, "logps/chosen": -129.0363006591797, "logps/rejected": -108.4576187133789, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": -0.7501309514045715, "rewards/margins": 1.4299135208129883, "rewards/rejected": -2.180044412612915, "step": 3987 }, { "epoch": 0.665776739529451, "grad_norm": 51.73783874511719, "learning_rate": 1.3342232604705491e-05, "logits/chosen": -0.8606105446815491, "logits/rejected": -0.8278452754020691, "logps/chosen": -86.9264144897461, "logps/rejected": -62.719451904296875, "loss": 0.9432, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5610911250114441, "rewards/margins": 0.18750770390033722, "rewards/rejected": -0.7485988140106201, "step": 3990 }, { "epoch": 0.6662773235441348, "grad_norm": 40.366966247558594, "learning_rate": 1.3337226764558653e-05, "logits/chosen": -0.7477318644523621, "logits/rejected": -0.741949737071991, "logps/chosen": -133.4613800048828, "logps/rejected": -106.0312728881836, "loss": 0.7858, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.658228874206543, "rewards/margins": 0.3939630687236786, "rewards/rejected": -1.0521918535232544, "step": 3993 }, { "epoch": 0.6667779075588186, "grad_norm": 13.085230827331543, "learning_rate": 1.3332220924411813e-05, "logits/chosen": -0.6384044289588928, "logits/rejected": -0.7303054928779602, "logps/chosen": -81.52479553222656, "logps/rejected": -179.07325744628906, "loss": 0.322, "rewards/accuracies": 1.0, "rewards/chosen": 0.7633218169212341, "rewards/margins": 2.9890973567962646, "rewards/rejected": -2.2257754802703857, "step": 3996 }, { "epoch": 0.6672784915735024, "grad_norm": 36.0842170715332, "learning_rate": 1.3327215084264977e-05, "logits/chosen": -0.722144365310669, "logits/rejected": -0.754108726978302, "logps/chosen": -119.11243438720703, "logps/rejected": -121.5584487915039, "loss": 0.5193, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0004465579986572, "rewards/margins": 0.12307461351156235, "rewards/rejected": -2.123521089553833, "step": 3999 }, { "epoch": 0.6677790755881862, "grad_norm": 10.948722839355469, "learning_rate": 1.3322209244118138e-05, "logits/chosen": -0.6823599934577942, "logits/rejected": -0.5970838069915771, "logps/chosen": -163.9893341064453, "logps/rejected": -102.69609832763672, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 0.31852099299430847, "rewards/margins": 1.5406198501586914, "rewards/rejected": -1.2220988273620605, "step": 4002 }, { "epoch": 0.6682796596028701, "grad_norm": 10.250656127929688, "learning_rate": 1.3317203403971302e-05, "logits/chosen": -0.8275238871574402, "logits/rejected": -0.8320279121398926, "logps/chosen": -71.04459381103516, "logps/rejected": -91.74784088134766, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 0.7290098071098328, "rewards/margins": 2.236778497695923, "rewards/rejected": -1.5077687501907349, "step": 4005 }, { "epoch": 0.6687802436175538, "grad_norm": 29.622024536132812, "learning_rate": 1.3312197563824462e-05, "logits/chosen": -0.9008142352104187, "logits/rejected": -0.8855012059211731, "logps/chosen": -137.27098083496094, "logps/rejected": -95.96045684814453, "loss": 0.2801, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.31927490234375, "rewards/margins": 1.296436071395874, "rewards/rejected": -3.615711212158203, "step": 4008 }, { "epoch": 0.6692808276322376, "grad_norm": 49.036903381347656, "learning_rate": 1.3307191723677625e-05, "logits/chosen": -0.7510854601860046, "logits/rejected": -0.7822802662849426, "logps/chosen": -84.72176361083984, "logps/rejected": -88.4293212890625, "loss": 0.7623, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.13334478437900543, "rewards/margins": 0.13295429944992065, "rewards/rejected": -0.2662990987300873, "step": 4011 }, { "epoch": 0.6697814116469214, "grad_norm": 19.511764526367188, "learning_rate": 1.3302185883530787e-05, "logits/chosen": -0.6244472861289978, "logits/rejected": -0.6119830012321472, "logps/chosen": -48.55497360229492, "logps/rejected": -57.17740249633789, "loss": 0.4267, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.48298177123069763, "rewards/margins": 0.3251546621322632, "rewards/rejected": -0.8081364035606384, "step": 4014 }, { "epoch": 0.6702819956616052, "grad_norm": 17.803342819213867, "learning_rate": 1.3297180043383947e-05, "logits/chosen": -0.861199676990509, "logits/rejected": -0.8481821417808533, "logps/chosen": -104.01102447509766, "logps/rejected": -122.33026123046875, "loss": 0.296, "rewards/accuracies": 1.0, "rewards/chosen": -0.9021146893501282, "rewards/margins": 0.9207606315612793, "rewards/rejected": -1.8228753805160522, "step": 4017 }, { "epoch": 0.670782579676289, "grad_norm": 23.744739532470703, "learning_rate": 1.3292174203237111e-05, "logits/chosen": -0.7223451137542725, "logits/rejected": -0.6389719843864441, "logps/chosen": -104.72042846679688, "logps/rejected": -120.74384307861328, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": -1.2715346813201904, "rewards/margins": 2.0733859539031982, "rewards/rejected": -3.3449203968048096, "step": 4020 }, { "epoch": 0.6712831636909729, "grad_norm": 30.077831268310547, "learning_rate": 1.3287168363090273e-05, "logits/chosen": -0.8953114151954651, "logits/rejected": -0.827219545841217, "logps/chosen": -102.72437286376953, "logps/rejected": -42.725067138671875, "loss": 0.6425, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.4502524137496948, "rewards/margins": -1.049667239189148, "rewards/rejected": -0.40058526396751404, "step": 4023 }, { "epoch": 0.6717837477056566, "grad_norm": 25.843048095703125, "learning_rate": 1.3282162522943436e-05, "logits/chosen": -0.8167133927345276, "logits/rejected": -0.7543242573738098, "logps/chosen": -97.84212493896484, "logps/rejected": -105.080322265625, "loss": 0.6722, "rewards/accuracies": 1.0, "rewards/chosen": -0.28238818049430847, "rewards/margins": 2.2869532108306885, "rewards/rejected": -2.5693414211273193, "step": 4026 }, { "epoch": 0.6722843317203404, "grad_norm": 22.904600143432617, "learning_rate": 1.3277156682796596e-05, "logits/chosen": -0.479285329580307, "logits/rejected": -0.5263115167617798, "logps/chosen": -95.05187225341797, "logps/rejected": -157.5618133544922, "loss": 0.4472, "rewards/accuracies": 1.0, "rewards/chosen": -0.2975740134716034, "rewards/margins": 3.267747163772583, "rewards/rejected": -3.5653209686279297, "step": 4029 }, { "epoch": 0.6727849157350242, "grad_norm": 29.131120681762695, "learning_rate": 1.3272150842649758e-05, "logits/chosen": -0.786415159702301, "logits/rejected": -0.7333488464355469, "logps/chosen": -75.07395935058594, "logps/rejected": -101.9687271118164, "loss": 0.7903, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.16564039885997772, "rewards/margins": 1.8672512769699097, "rewards/rejected": -2.0328915119171143, "step": 4032 }, { "epoch": 0.673285499749708, "grad_norm": 7.622950553894043, "learning_rate": 1.3267145002502922e-05, "logits/chosen": -0.6047490239143372, "logits/rejected": -0.6490761637687683, "logps/chosen": -14.091168403625488, "logps/rejected": -90.6706314086914, "loss": 0.3256, "rewards/accuracies": 1.0, "rewards/chosen": 0.8420754075050354, "rewards/margins": 3.034456968307495, "rewards/rejected": -2.1923816204071045, "step": 4035 }, { "epoch": 0.6737860837643918, "grad_norm": 13.223518371582031, "learning_rate": 1.3262139162356082e-05, "logits/chosen": -0.7547058463096619, "logits/rejected": -0.787652313709259, "logps/chosen": -72.83670043945312, "logps/rejected": -100.26944732666016, "loss": 0.5301, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.1338982582092285, "rewards/margins": -0.4363175332546234, "rewards/rejected": -1.6975808143615723, "step": 4038 }, { "epoch": 0.6742866677790756, "grad_norm": 10.637162208557129, "learning_rate": 1.3257133322209245e-05, "logits/chosen": -0.8083270192146301, "logits/rejected": -0.8358518481254578, "logps/chosen": -96.01319122314453, "logps/rejected": -129.0875244140625, "loss": 0.2911, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.32883453369140625, "rewards/margins": 3.6140291690826416, "rewards/rejected": -3.942864179611206, "step": 4041 }, { "epoch": 0.6747872517937594, "grad_norm": 4.276526927947998, "learning_rate": 1.3252127482062407e-05, "logits/chosen": -0.639216959476471, "logits/rejected": -0.6848061680793762, "logps/chosen": -68.8005142211914, "logps/rejected": -110.83283233642578, "loss": 0.1942, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3440669775009155, "rewards/margins": 2.5086894035339355, "rewards/rejected": -2.8527565002441406, "step": 4044 }, { "epoch": 0.6752878358084432, "grad_norm": 21.359716415405273, "learning_rate": 1.324712164191557e-05, "logits/chosen": -0.6516497731208801, "logits/rejected": -0.6646464467048645, "logps/chosen": -109.5464096069336, "logps/rejected": -160.41224670410156, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": -0.040205877274274826, "rewards/margins": 2.992725372314453, "rewards/rejected": -3.032931327819824, "step": 4047 }, { "epoch": 0.675788419823127, "grad_norm": 18.724620819091797, "learning_rate": 1.324211580176873e-05, "logits/chosen": -0.6851260662078857, "logits/rejected": -0.6894317269325256, "logps/chosen": -190.6334228515625, "logps/rejected": -167.87367248535156, "loss": 0.5857, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3049474060535431, "rewards/margins": 0.6736724972724915, "rewards/rejected": -0.9786199927330017, "step": 4050 }, { "epoch": 0.6762890038378108, "grad_norm": 14.361430168151855, "learning_rate": 1.3237109961621892e-05, "logits/chosen": -0.7279791235923767, "logits/rejected": -0.7130541205406189, "logps/chosen": -103.1748046875, "logps/rejected": -111.5789566040039, "loss": 0.4674, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0088560581207275, "rewards/margins": 0.14531366527080536, "rewards/rejected": -2.154169797897339, "step": 4053 }, { "epoch": 0.6767895878524945, "grad_norm": 32.10148620605469, "learning_rate": 1.3232104121475056e-05, "logits/chosen": -0.8881149888038635, "logits/rejected": -0.8523340225219727, "logps/chosen": -111.51627349853516, "logps/rejected": -96.66967010498047, "loss": 1.2994, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.325904130935669, "rewards/margins": 0.7907991409301758, "rewards/rejected": -2.1167032718658447, "step": 4056 }, { "epoch": 0.6772901718671783, "grad_norm": 22.113662719726562, "learning_rate": 1.3227098281328216e-05, "logits/chosen": -0.6196330189704895, "logits/rejected": -0.6321048736572266, "logps/chosen": -74.94139099121094, "logps/rejected": -78.94254302978516, "loss": 0.5245, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1058427095413208, "rewards/margins": 0.9000017046928406, "rewards/rejected": -2.0058443546295166, "step": 4059 }, { "epoch": 0.6777907558818622, "grad_norm": 34.773536682128906, "learning_rate": 1.322209244118138e-05, "logits/chosen": -0.7117645144462585, "logits/rejected": -0.7141217589378357, "logps/chosen": -78.12388610839844, "logps/rejected": -107.9468765258789, "loss": 0.5602, "rewards/accuracies": 1.0, "rewards/chosen": -0.16753971576690674, "rewards/margins": 3.304072618484497, "rewards/rejected": -3.4716126918792725, "step": 4062 }, { "epoch": 0.678291339896546, "grad_norm": 26.573949813842773, "learning_rate": 1.3217086601034541e-05, "logits/chosen": -0.7608804702758789, "logits/rejected": -0.7974146008491516, "logps/chosen": -75.3434066772461, "logps/rejected": -117.58251190185547, "loss": 0.4657, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2717608213424683, "rewards/margins": 1.30050790309906, "rewards/rejected": -2.5722687244415283, "step": 4065 }, { "epoch": 0.6787919239112298, "grad_norm": 37.468849182128906, "learning_rate": 1.3212080760887705e-05, "logits/chosen": -0.9149613976478577, "logits/rejected": -0.9015749096870422, "logps/chosen": -81.04008483886719, "logps/rejected": -56.453617095947266, "loss": 0.8626, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8080217242240906, "rewards/margins": 0.017058968544006348, "rewards/rejected": -0.8250806331634521, "step": 4068 }, { "epoch": 0.6792925079259136, "grad_norm": 33.785404205322266, "learning_rate": 1.3207074920740865e-05, "logits/chosen": -0.639411985874176, "logits/rejected": -0.7076315879821777, "logps/chosen": -78.72119903564453, "logps/rejected": -153.0905303955078, "loss": 0.8671, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8135643005371094, "rewards/margins": 0.8735482096672058, "rewards/rejected": -1.6871124505996704, "step": 4071 }, { "epoch": 0.6797930919405973, "grad_norm": 20.330631256103516, "learning_rate": 1.3202069080594027e-05, "logits/chosen": -0.7906506061553955, "logits/rejected": -0.8157560229301453, "logps/chosen": -65.56924438476562, "logps/rejected": -93.27501678466797, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": -0.17939288914203644, "rewards/margins": 1.566169261932373, "rewards/rejected": -1.7455620765686035, "step": 4074 }, { "epoch": 0.6802936759552811, "grad_norm": 18.16389274597168, "learning_rate": 1.319706324044719e-05, "logits/chosen": -0.7262225151062012, "logits/rejected": -0.775576114654541, "logps/chosen": -53.509307861328125, "logps/rejected": -127.38111114501953, "loss": 0.6467, "rewards/accuracies": 1.0, "rewards/chosen": 0.3220219910144806, "rewards/margins": 4.449613571166992, "rewards/rejected": -4.127592086791992, "step": 4077 }, { "epoch": 0.680794259969965, "grad_norm": 10.643037796020508, "learning_rate": 1.319205740030035e-05, "logits/chosen": -0.5905379056930542, "logits/rejected": -0.6007940769195557, "logps/chosen": -120.32511138916016, "logps/rejected": -109.65741729736328, "loss": 0.3927, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.4350162744522095, "rewards/margins": -0.16303932666778564, "rewards/rejected": -1.2719770669937134, "step": 4080 }, { "epoch": 0.6812948439846488, "grad_norm": 27.194311141967773, "learning_rate": 1.3187051560153514e-05, "logits/chosen": -0.8219745755195618, "logits/rejected": -0.852735698223114, "logps/chosen": -90.75341033935547, "logps/rejected": -105.12383270263672, "loss": 1.0171, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.385114073753357, "rewards/margins": -0.603331983089447, "rewards/rejected": -0.7817819714546204, "step": 4083 }, { "epoch": 0.6817954279993326, "grad_norm": 29.033287048339844, "learning_rate": 1.3182045720006675e-05, "logits/chosen": -0.6946499943733215, "logits/rejected": -0.6433963179588318, "logps/chosen": -107.54949188232422, "logps/rejected": -96.58792877197266, "loss": 0.4829, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3932191133499146, "rewards/margins": 1.2199572324752808, "rewards/rejected": -2.6131763458251953, "step": 4086 }, { "epoch": 0.6822960120140164, "grad_norm": 39.315277099609375, "learning_rate": 1.3177039879859837e-05, "logits/chosen": -0.8026351928710938, "logits/rejected": -0.827268123626709, "logps/chosen": -68.9968490600586, "logps/rejected": -151.2874755859375, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": -1.2392133474349976, "rewards/margins": 3.72963547706604, "rewards/rejected": -4.968849182128906, "step": 4089 }, { "epoch": 0.6827965960287001, "grad_norm": 23.874059677124023, "learning_rate": 1.3172034039712999e-05, "logits/chosen": -0.7570104002952576, "logits/rejected": -0.7855124473571777, "logps/chosen": -52.34234619140625, "logps/rejected": -97.31806182861328, "loss": 0.6489, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2057335525751114, "rewards/margins": 2.6053056716918945, "rewards/rejected": -2.8110389709472656, "step": 4092 }, { "epoch": 0.6832971800433839, "grad_norm": 25.905471801757812, "learning_rate": 1.3167028199566161e-05, "logits/chosen": -0.6912005543708801, "logits/rejected": -0.6930296421051025, "logps/chosen": -123.83218383789062, "logps/rejected": -114.92000579833984, "loss": 0.3471, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.5918140411376953, "rewards/margins": 0.9746665358543396, "rewards/rejected": -4.5664801597595215, "step": 4095 }, { "epoch": 0.6837977640580677, "grad_norm": 25.127798080444336, "learning_rate": 1.3162022359419324e-05, "logits/chosen": -0.8851302266120911, "logits/rejected": -0.9168610572814941, "logps/chosen": -103.6762924194336, "logps/rejected": -81.79068756103516, "loss": 0.6757, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.400429606437683, "rewards/margins": -0.41333580017089844, "rewards/rejected": -0.9870937466621399, "step": 4098 }, { "epoch": 0.6842983480727516, "grad_norm": 5.32313871383667, "learning_rate": 1.3157016519272484e-05, "logits/chosen": -0.7725067138671875, "logits/rejected": -0.7092949748039246, "logps/chosen": -144.424072265625, "logps/rejected": -115.00154876708984, "loss": 0.3744, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.8488833904266357, "rewards/margins": 2.506289482116699, "rewards/rejected": -5.355173110961914, "step": 4101 }, { "epoch": 0.6847989320874354, "grad_norm": 49.48516845703125, "learning_rate": 1.3152010679125648e-05, "logits/chosen": -0.7800965905189514, "logits/rejected": -0.7874537110328674, "logps/chosen": -124.34366607666016, "logps/rejected": -95.42679595947266, "loss": 0.6335, "rewards/accuracies": 1.0, "rewards/chosen": -0.4074404239654541, "rewards/margins": 0.9280927777290344, "rewards/rejected": -1.3355332612991333, "step": 4104 }, { "epoch": 0.6852995161021191, "grad_norm": 35.22151184082031, "learning_rate": 1.314700483897881e-05, "logits/chosen": -0.740649402141571, "logits/rejected": -0.69949871301651, "logps/chosen": -104.2469711303711, "logps/rejected": -84.43439483642578, "loss": 0.354, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1588892936706543, "rewards/margins": 1.894297480583191, "rewards/rejected": -3.0531866550445557, "step": 4107 }, { "epoch": 0.6858001001168029, "grad_norm": 63.4177360534668, "learning_rate": 1.3141998998831972e-05, "logits/chosen": -0.8644252419471741, "logits/rejected": -0.8290359973907471, "logps/chosen": -65.04253387451172, "logps/rejected": -79.26758575439453, "loss": 0.9683, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3687658309936523, "rewards/margins": 0.10620498657226562, "rewards/rejected": -1.474970817565918, "step": 4110 }, { "epoch": 0.6863006841314867, "grad_norm": 8.285385131835938, "learning_rate": 1.3136993158685133e-05, "logits/chosen": -0.5126095414161682, "logits/rejected": -0.5580695867538452, "logps/chosen": -102.38253784179688, "logps/rejected": -148.63888549804688, "loss": 0.4464, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.263000726699829, "rewards/margins": 0.024878105148673058, "rewards/rejected": -3.28787899017334, "step": 4113 }, { "epoch": 0.6868012681461705, "grad_norm": 26.898181915283203, "learning_rate": 1.3131987318538295e-05, "logits/chosen": -0.5830051898956299, "logits/rejected": -0.5448043346405029, "logps/chosen": -116.24169921875, "logps/rejected": -85.20352172851562, "loss": 0.4937, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.288551688194275, "rewards/margins": -0.06025338172912598, "rewards/rejected": -1.228298306465149, "step": 4116 }, { "epoch": 0.6873018521608544, "grad_norm": 23.191631317138672, "learning_rate": 1.3126981478391459e-05, "logits/chosen": -0.5875154137611389, "logits/rejected": -0.5281327366828918, "logps/chosen": -128.7234344482422, "logps/rejected": -110.00820922851562, "loss": 1.0242, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5967613458633423, "rewards/margins": -0.1307249665260315, "rewards/rejected": -1.4660364389419556, "step": 4119 }, { "epoch": 0.6878024361755382, "grad_norm": 19.15566062927246, "learning_rate": 1.3121975638244619e-05, "logits/chosen": -0.7090029120445251, "logits/rejected": -0.7070165276527405, "logps/chosen": -104.99442291259766, "logps/rejected": -112.63529205322266, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": -2.007702589035034, "rewards/margins": 0.6304454207420349, "rewards/rejected": -2.638148069381714, "step": 4122 }, { "epoch": 0.6883030201902219, "grad_norm": 22.21516227722168, "learning_rate": 1.3116969798097782e-05, "logits/chosen": -0.47530555725097656, "logits/rejected": -0.4714620113372803, "logps/chosen": -99.09209442138672, "logps/rejected": -118.6974105834961, "loss": 0.4505, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.05728554725646973, "rewards/margins": 0.19344520568847656, "rewards/rejected": -0.13615965843200684, "step": 4125 }, { "epoch": 0.6888036042049057, "grad_norm": 39.83102035522461, "learning_rate": 1.3111963957950944e-05, "logits/chosen": -0.6614124774932861, "logits/rejected": -0.6476359367370605, "logps/chosen": -70.62841033935547, "logps/rejected": -93.68423461914062, "loss": 0.7589, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1505743265151978, "rewards/margins": 0.280575156211853, "rewards/rejected": -1.4311494827270508, "step": 4128 }, { "epoch": 0.6893041882195895, "grad_norm": 36.10932540893555, "learning_rate": 1.3106958117804106e-05, "logits/chosen": -0.7147604823112488, "logits/rejected": -0.7583310008049011, "logps/chosen": -144.3695526123047, "logps/rejected": -124.73377227783203, "loss": 0.6438, "rewards/accuracies": 1.0, "rewards/chosen": -0.10074818134307861, "rewards/margins": 1.8394445180892944, "rewards/rejected": -1.940192699432373, "step": 4131 }, { "epoch": 0.6898047722342733, "grad_norm": 23.96690559387207, "learning_rate": 1.3101952277657268e-05, "logits/chosen": -0.7415075898170471, "logits/rejected": -0.656974732875824, "logps/chosen": -114.5361099243164, "logps/rejected": -120.09354400634766, "loss": 0.4999, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.4014962911605835, "rewards/margins": 1.3753184080123901, "rewards/rejected": -2.7768146991729736, "step": 4134 }, { "epoch": 0.6903053562489572, "grad_norm": 8.547968864440918, "learning_rate": 1.309694643751043e-05, "logits/chosen": -0.7612707614898682, "logits/rejected": -0.7843294739723206, "logps/chosen": -82.73439025878906, "logps/rejected": -164.40550231933594, "loss": 0.2696, "rewards/accuracies": 1.0, "rewards/chosen": -0.5417888760566711, "rewards/margins": 2.3200795650482178, "rewards/rejected": -2.861868143081665, "step": 4137 }, { "epoch": 0.690805940263641, "grad_norm": 2.541699171066284, "learning_rate": 1.3091940597363593e-05, "logits/chosen": -0.5433447957038879, "logits/rejected": -0.5610923171043396, "logps/chosen": -108.60462188720703, "logps/rejected": -134.47247314453125, "loss": 0.2785, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1038638353347778, "rewards/margins": 1.644754409790039, "rewards/rejected": -2.7486183643341064, "step": 4140 }, { "epoch": 0.6913065242783247, "grad_norm": 32.2723503112793, "learning_rate": 1.3086934757216753e-05, "logits/chosen": -0.7138122916221619, "logits/rejected": -0.7390369772911072, "logps/chosen": -48.87726974487305, "logps/rejected": -93.6257095336914, "loss": 0.7608, "rewards/accuracies": 1.0, "rewards/chosen": 0.34691938757896423, "rewards/margins": 2.8882265090942383, "rewards/rejected": -2.541307210922241, "step": 4143 }, { "epoch": 0.6918071082930085, "grad_norm": 13.194050788879395, "learning_rate": 1.3081928917069916e-05, "logits/chosen": -0.7040454745292664, "logits/rejected": -0.6940622925758362, "logps/chosen": -119.61629486083984, "logps/rejected": -71.9545669555664, "loss": 0.6164, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3162830173969269, "rewards/margins": 0.6609680652618408, "rewards/rejected": -0.9772510528564453, "step": 4146 }, { "epoch": 0.6923076923076923, "grad_norm": 13.452981948852539, "learning_rate": 1.3076923076923078e-05, "logits/chosen": -0.7225074768066406, "logits/rejected": -0.7249640822410583, "logps/chosen": -94.59282684326172, "logps/rejected": -97.78743743896484, "loss": 0.5316, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9290554523468018, "rewards/margins": 1.5883969068527222, "rewards/rejected": -2.5174522399902344, "step": 4149 }, { "epoch": 0.6928082763223761, "grad_norm": 88.73143005371094, "learning_rate": 1.307191723677624e-05, "logits/chosen": -0.6786101460456848, "logits/rejected": -0.7299501299858093, "logps/chosen": -72.53557586669922, "logps/rejected": -100.53096771240234, "loss": 0.7504, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7385504841804504, "rewards/margins": 0.790687620639801, "rewards/rejected": -1.5292381048202515, "step": 4152 }, { "epoch": 0.6933088603370599, "grad_norm": 4.992213249206543, "learning_rate": 1.3066911396629402e-05, "logits/chosen": -0.8460788130760193, "logits/rejected": -0.8904350399971008, "logps/chosen": -116.33319854736328, "logps/rejected": -162.8090057373047, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": -0.6691372990608215, "rewards/margins": 1.375091552734375, "rewards/rejected": -2.0442287921905518, "step": 4155 }, { "epoch": 0.6938094443517437, "grad_norm": 4.332929611206055, "learning_rate": 1.3061905556482564e-05, "logits/chosen": -0.6288282871246338, "logits/rejected": -0.6276267170906067, "logps/chosen": -70.3140869140625, "logps/rejected": -75.66309356689453, "loss": 0.7217, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7640514373779297, "rewards/margins": 0.18396472930908203, "rewards/rejected": -1.9480161666870117, "step": 4158 }, { "epoch": 0.6943100283664275, "grad_norm": 5.517010688781738, "learning_rate": 1.3056899716335727e-05, "logits/chosen": -0.6233707070350647, "logits/rejected": -0.6710206866264343, "logps/chosen": -48.309722900390625, "logps/rejected": -128.75982666015625, "loss": 0.8062, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.677789032459259, "rewards/margins": 2.257136106491089, "rewards/rejected": -1.579346776008606, "step": 4161 }, { "epoch": 0.6948106123811113, "grad_norm": 8.672340393066406, "learning_rate": 1.3051893876188887e-05, "logits/chosen": -0.7010939121246338, "logits/rejected": -0.7583759427070618, "logps/chosen": -92.23186492919922, "logps/rejected": -139.0619659423828, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 0.20983785390853882, "rewards/margins": 2.8869478702545166, "rewards/rejected": -2.677109956741333, "step": 4164 }, { "epoch": 0.6953111963957951, "grad_norm": 17.182260513305664, "learning_rate": 1.3046888036042049e-05, "logits/chosen": -0.6703614592552185, "logits/rejected": -0.5734580159187317, "logps/chosen": -143.1990203857422, "logps/rejected": -100.86273193359375, "loss": 0.4693, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9926689267158508, "rewards/margins": 0.6431913375854492, "rewards/rejected": -1.6358604431152344, "step": 4167 }, { "epoch": 0.6958117804104789, "grad_norm": 18.217660903930664, "learning_rate": 1.3041882195895213e-05, "logits/chosen": -0.5236876606941223, "logits/rejected": -0.5593996047973633, "logps/chosen": -64.2469253540039, "logps/rejected": -85.23284149169922, "loss": 0.5647, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5110567808151245, "rewards/margins": 0.2778034210205078, "rewards/rejected": -0.7888602614402771, "step": 4170 }, { "epoch": 0.6963123644251626, "grad_norm": 17.89088249206543, "learning_rate": 1.3036876355748374e-05, "logits/chosen": -0.6233820915222168, "logits/rejected": -0.673151433467865, "logps/chosen": -101.9154052734375, "logps/rejected": -158.2528533935547, "loss": 0.4658, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4499155282974243, "rewards/margins": 1.1644386053085327, "rewards/rejected": -2.614354133605957, "step": 4173 }, { "epoch": 0.6968129484398465, "grad_norm": 10.25602912902832, "learning_rate": 1.3031870515601536e-05, "logits/chosen": -0.7132657170295715, "logits/rejected": -0.8156477808952332, "logps/chosen": -84.58837127685547, "logps/rejected": -126.6491470336914, "loss": 0.3536, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4101206362247467, "rewards/margins": 0.9848352074623108, "rewards/rejected": -1.3949557542800903, "step": 4176 }, { "epoch": 0.6973135324545303, "grad_norm": 22.86087989807129, "learning_rate": 1.3026864675454698e-05, "logits/chosen": -0.6242478489875793, "logits/rejected": -0.6209363341331482, "logps/chosen": -78.4537582397461, "logps/rejected": -76.08795928955078, "loss": 0.3884, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2611512541770935, "rewards/margins": 1.1882606744766235, "rewards/rejected": -1.4494119882583618, "step": 4179 }, { "epoch": 0.6978141164692141, "grad_norm": 45.351253509521484, "learning_rate": 1.3021858835307861e-05, "logits/chosen": -0.7042780518531799, "logits/rejected": -0.702216625213623, "logps/chosen": -132.87533569335938, "logps/rejected": -120.27518463134766, "loss": 0.579, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7881431579589844, "rewards/margins": 0.1340593844652176, "rewards/rejected": -0.9222025871276855, "step": 4182 }, { "epoch": 0.6983147004838979, "grad_norm": 10.489998817443848, "learning_rate": 1.3016852995161021e-05, "logits/chosen": -0.8214099407196045, "logits/rejected": -0.8205602765083313, "logps/chosen": -55.73667526245117, "logps/rejected": -69.1108627319336, "loss": 0.3996, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.26663628220558167, "rewards/margins": 0.2177560180425644, "rewards/rejected": -0.48439228534698486, "step": 4185 }, { "epoch": 0.6988152844985817, "grad_norm": 5.139481544494629, "learning_rate": 1.3011847155014183e-05, "logits/chosen": -0.4132101535797119, "logits/rejected": -0.38681283593177795, "logps/chosen": -82.3825454711914, "logps/rejected": -96.90360260009766, "loss": 0.9925, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.356878399848938, "rewards/margins": 1.0432294607162476, "rewards/rejected": -2.4001080989837646, "step": 4188 }, { "epoch": 0.6993158685132654, "grad_norm": 19.68210220336914, "learning_rate": 1.3006841314867347e-05, "logits/chosen": -0.716230571269989, "logits/rejected": -0.7057791352272034, "logps/chosen": -78.97817993164062, "logps/rejected": -137.16976928710938, "loss": 0.5103, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": 0.14457683265209198, "rewards/margins": 1.2763131856918335, "rewards/rejected": -1.1317362785339355, "step": 4191 }, { "epoch": 0.6998164525279493, "grad_norm": 11.761366844177246, "learning_rate": 1.3001835474720509e-05, "logits/chosen": -0.6190654039382935, "logits/rejected": -0.5254107117652893, "logps/chosen": -140.50538635253906, "logps/rejected": -71.7873306274414, "loss": 0.9371, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.132352590560913, "rewards/margins": -1.4632792472839355, "rewards/rejected": -1.669073462486267, "step": 4194 }, { "epoch": 0.7003170365426331, "grad_norm": 52.13713836669922, "learning_rate": 1.299682963457367e-05, "logits/chosen": -0.7996177673339844, "logits/rejected": -0.8022871613502502, "logps/chosen": -88.7218246459961, "logps/rejected": -148.20191955566406, "loss": 0.7527, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.0302155017852783, "rewards/margins": 0.6495972275733948, "rewards/rejected": -3.6798126697540283, "step": 4197 }, { "epoch": 0.7008176205573169, "grad_norm": 18.315244674682617, "learning_rate": 1.2991823794426832e-05, "logits/chosen": -0.6297062039375305, "logits/rejected": -0.6839177012443542, "logps/chosen": -79.35740661621094, "logps/rejected": -123.29143524169922, "loss": 0.3585, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.287829875946045, "rewards/margins": 1.6487184762954712, "rewards/rejected": -2.9365482330322266, "step": 4200 }, { "epoch": 0.7008176205573169, "eval_logits/chosen": -0.6767749190330505, "eval_logits/rejected": -0.6895807981491089, "eval_logps/chosen": -84.0328598022461, "eval_logps/rejected": -110.4189453125, "eval_loss": 0.580735445022583, "eval_rewards/accuracies": 0.7432432174682617, "eval_rewards/chosen": -0.3775129020214081, "eval_rewards/margins": 1.351562738418579, "eval_rewards/rejected": -1.72907555103302, "eval_runtime": 347.7863, "eval_samples_per_second": 7.66, "eval_steps_per_second": 1.915, "step": 4200 }, { "epoch": 0.7013182045720007, "grad_norm": 19.453893661499023, "learning_rate": 1.2986817954279996e-05, "logits/chosen": -0.6835184097290039, "logits/rejected": -0.7749967575073242, "logps/chosen": -50.21223449707031, "logps/rejected": -166.7753448486328, "loss": 0.4811, "rewards/accuracies": 1.0, "rewards/chosen": -0.37950313091278076, "rewards/margins": 2.762402296066284, "rewards/rejected": -3.1419050693511963, "step": 4203 }, { "epoch": 0.7018187885866844, "grad_norm": 20.074445724487305, "learning_rate": 1.2981812114133156e-05, "logits/chosen": -0.5963509678840637, "logits/rejected": -0.6046924591064453, "logps/chosen": -102.3235855102539, "logps/rejected": -129.5829315185547, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": -0.49768853187561035, "rewards/margins": 3.053652763366699, "rewards/rejected": -3.5513412952423096, "step": 4206 }, { "epoch": 0.7023193726013682, "grad_norm": 5.2012505531311035, "learning_rate": 1.2976806273986318e-05, "logits/chosen": -0.8064215779304504, "logits/rejected": -0.7947691082954407, "logps/chosen": -100.92715454101562, "logps/rejected": -155.472900390625, "loss": 0.2119, "rewards/accuracies": 1.0, "rewards/chosen": 0.7654867172241211, "rewards/margins": 2.3642079830169678, "rewards/rejected": -1.5987211465835571, "step": 4209 }, { "epoch": 0.702819956616052, "grad_norm": 21.93019676208496, "learning_rate": 1.2971800433839481e-05, "logits/chosen": -0.585054337978363, "logits/rejected": -0.619134783744812, "logps/chosen": -137.19964599609375, "logps/rejected": -172.45281982421875, "loss": 0.5776, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0992563962936401, "rewards/margins": 0.04725567623972893, "rewards/rejected": -1.1465120315551758, "step": 4212 }, { "epoch": 0.7033205406307359, "grad_norm": 28.748193740844727, "learning_rate": 1.2966794593692643e-05, "logits/chosen": -0.6594114303588867, "logits/rejected": -0.7243450284004211, "logps/chosen": -70.09942626953125, "logps/rejected": -128.97254943847656, "loss": 0.6105, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.26997441053390503, "rewards/margins": -0.0387035496532917, "rewards/rejected": -0.23127084970474243, "step": 4215 }, { "epoch": 0.7038211246454197, "grad_norm": 32.35670471191406, "learning_rate": 1.2961788753545805e-05, "logits/chosen": -0.6314785480499268, "logits/rejected": -0.6032707095146179, "logps/chosen": -138.67507934570312, "logps/rejected": -112.36214447021484, "loss": 0.989, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5862111449241638, "rewards/margins": -1.199464201927185, "rewards/rejected": 0.6132529973983765, "step": 4218 }, { "epoch": 0.7043217086601035, "grad_norm": 42.75863265991211, "learning_rate": 1.2956782913398966e-05, "logits/chosen": -0.6203052997589111, "logits/rejected": -0.584015429019928, "logps/chosen": -73.6378402709961, "logps/rejected": -116.2127685546875, "loss": 0.6977, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.16451124846935272, "rewards/margins": 1.9241236448287964, "rewards/rejected": -1.7596122026443481, "step": 4221 }, { "epoch": 0.7048222926747872, "grad_norm": 21.75798797607422, "learning_rate": 1.2951777073252128e-05, "logits/chosen": -0.6487126350402832, "logits/rejected": -0.7548627853393555, "logps/chosen": -60.911102294921875, "logps/rejected": -145.60105895996094, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 0.47658514976501465, "rewards/margins": 3.0102627277374268, "rewards/rejected": -2.533677816390991, "step": 4224 }, { "epoch": 0.705322876689471, "grad_norm": 36.09555435180664, "learning_rate": 1.294677123310529e-05, "logits/chosen": -0.7087020874023438, "logits/rejected": -0.7103520035743713, "logps/chosen": -101.09814453125, "logps/rejected": -77.30374145507812, "loss": 0.9066, "rewards/accuracies": 0.0, "rewards/chosen": -0.7458758354187012, "rewards/margins": -1.0326179265975952, "rewards/rejected": 0.28674203157424927, "step": 4227 }, { "epoch": 0.7058234607041548, "grad_norm": 10.013288497924805, "learning_rate": 1.2941765392958452e-05, "logits/chosen": -0.6101894974708557, "logits/rejected": -0.6821048855781555, "logps/chosen": -43.67107009887695, "logps/rejected": -117.04381561279297, "loss": 0.2576, "rewards/accuracies": 1.0, "rewards/chosen": 0.20873694121837616, "rewards/margins": 1.2898238897323608, "rewards/rejected": -1.0810869932174683, "step": 4230 }, { "epoch": 0.7063240447188387, "grad_norm": 18.158658981323242, "learning_rate": 1.2936759552811615e-05, "logits/chosen": -0.436161607503891, "logits/rejected": -0.5337914824485779, "logps/chosen": -52.6252326965332, "logps/rejected": -146.5034637451172, "loss": 0.4447, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.007382263895124197, "rewards/margins": 1.0639688968658447, "rewards/rejected": -1.0565866231918335, "step": 4233 }, { "epoch": 0.7068246287335225, "grad_norm": 42.65302658081055, "learning_rate": 1.2931753712664777e-05, "logits/chosen": -0.5253350734710693, "logits/rejected": -0.40937328338623047, "logps/chosen": -150.86561584472656, "logps/rejected": -137.52017211914062, "loss": 0.3447, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7664826512336731, "rewards/margins": 0.16664977371692657, "rewards/rejected": -0.9331324696540833, "step": 4236 }, { "epoch": 0.7073252127482063, "grad_norm": 27.434633255004883, "learning_rate": 1.2926747872517939e-05, "logits/chosen": -0.8661327362060547, "logits/rejected": -0.8543829917907715, "logps/chosen": -132.24305725097656, "logps/rejected": -137.6879425048828, "loss": 0.3909, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1781086921691895, "rewards/margins": 2.2215218544006348, "rewards/rejected": -3.399630546569824, "step": 4239 }, { "epoch": 0.70782579676289, "grad_norm": 16.168996810913086, "learning_rate": 1.29217420323711e-05, "logits/chosen": -0.5710635781288147, "logits/rejected": -0.6380203366279602, "logps/chosen": -87.16104888916016, "logps/rejected": -102.02642822265625, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": -0.9316202998161316, "rewards/margins": 0.9617822766304016, "rewards/rejected": -1.8934025764465332, "step": 4242 }, { "epoch": 0.7083263807775738, "grad_norm": 46.73720932006836, "learning_rate": 1.2916736192224262e-05, "logits/chosen": -0.8334917426109314, "logits/rejected": -0.8544745445251465, "logps/chosen": -53.9365234375, "logps/rejected": -94.86058807373047, "loss": 0.5153, "rewards/accuracies": 1.0, "rewards/chosen": -0.19949249923229218, "rewards/margins": 2.10599684715271, "rewards/rejected": -2.3054890632629395, "step": 4245 }, { "epoch": 0.7088269647922576, "grad_norm": 36.59136962890625, "learning_rate": 1.2911730352077424e-05, "logits/chosen": -0.4701344966888428, "logits/rejected": -0.5147207379341125, "logps/chosen": -117.8906021118164, "logps/rejected": -122.4515380859375, "loss": 0.5338, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4660831689834595, "rewards/margins": 0.542919397354126, "rewards/rejected": -2.009002685546875, "step": 4248 }, { "epoch": 0.7093275488069414, "grad_norm": 11.157320022583008, "learning_rate": 1.2906724511930586e-05, "logits/chosen": -0.7131481766700745, "logits/rejected": -0.732991635799408, "logps/chosen": -102.3985824584961, "logps/rejected": -84.28585815429688, "loss": 0.5334, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.0339556448161602, "rewards/margins": 1.2574468851089478, "rewards/rejected": -1.2914024591445923, "step": 4251 }, { "epoch": 0.7098281328216253, "grad_norm": 24.65143394470215, "learning_rate": 1.290171867178375e-05, "logits/chosen": -0.7083411812782288, "logits/rejected": -0.6409273743629456, "logps/chosen": -209.4530487060547, "logps/rejected": -146.9711151123047, "loss": 0.348, "rewards/accuracies": 1.0, "rewards/chosen": -0.9813024401664734, "rewards/margins": 0.7773026823997498, "rewards/rejected": -1.7586050033569336, "step": 4254 }, { "epoch": 0.710328716836309, "grad_norm": 9.755152702331543, "learning_rate": 1.2896712831636911e-05, "logits/chosen": -0.7566984295845032, "logits/rejected": -0.7776628136634827, "logps/chosen": -79.36544036865234, "logps/rejected": -138.3981170654297, "loss": 0.4755, "rewards/accuracies": 1.0, "rewards/chosen": 0.061383556574583054, "rewards/margins": 1.518083930015564, "rewards/rejected": -1.4567004442214966, "step": 4257 }, { "epoch": 0.7108293008509928, "grad_norm": 33.016326904296875, "learning_rate": 1.2891706991490073e-05, "logits/chosen": -0.6989695429801941, "logits/rejected": -0.6524408459663391, "logps/chosen": -87.15084838867188, "logps/rejected": -91.91259002685547, "loss": 0.9404, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4324399530887604, "rewards/margins": 0.7102729678153992, "rewards/rejected": -1.142712950706482, "step": 4260 }, { "epoch": 0.7113298848656766, "grad_norm": 20.173076629638672, "learning_rate": 1.2886701151343235e-05, "logits/chosen": -0.7551072239875793, "logits/rejected": -0.7467982172966003, "logps/chosen": -76.49320220947266, "logps/rejected": -100.41265106201172, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": -0.04167620465159416, "rewards/margins": 1.6484222412109375, "rewards/rejected": -1.6900984048843384, "step": 4263 }, { "epoch": 0.7118304688803604, "grad_norm": 31.380748748779297, "learning_rate": 1.2881695311196397e-05, "logits/chosen": -0.6076470613479614, "logits/rejected": -0.5331776142120361, "logps/chosen": -117.87348175048828, "logps/rejected": -129.46630859375, "loss": 0.8055, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4294630289077759, "rewards/margins": 0.8369728922843933, "rewards/rejected": -2.2664358615875244, "step": 4266 }, { "epoch": 0.7123310528950442, "grad_norm": 22.074872970581055, "learning_rate": 1.2876689471049559e-05, "logits/chosen": -0.5830984115600586, "logits/rejected": -0.7218479514122009, "logps/chosen": -53.216251373291016, "logps/rejected": -147.97950744628906, "loss": 0.1752, "rewards/accuracies": 1.0, "rewards/chosen": -0.3622216284275055, "rewards/margins": 2.145610809326172, "rewards/rejected": -2.5078325271606445, "step": 4269 }, { "epoch": 0.7128316369097281, "grad_norm": 20.84162712097168, "learning_rate": 1.287168363090272e-05, "logits/chosen": -0.6950793266296387, "logits/rejected": -0.6726858615875244, "logps/chosen": -90.67838287353516, "logps/rejected": -99.76242065429688, "loss": 0.8668, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8015492558479309, "rewards/margins": 0.770545482635498, "rewards/rejected": -1.5720947980880737, "step": 4272 }, { "epoch": 0.7133322209244118, "grad_norm": 31.883975982666016, "learning_rate": 1.2866677790755884e-05, "logits/chosen": -0.7682785391807556, "logits/rejected": -0.7754952311515808, "logps/chosen": -76.20110321044922, "logps/rejected": -132.5687713623047, "loss": 0.4194, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2214449644088745, "rewards/margins": 2.7046797275543213, "rewards/rejected": -3.9261245727539062, "step": 4275 }, { "epoch": 0.7138328049390956, "grad_norm": 7.846800804138184, "learning_rate": 1.2861671950609046e-05, "logits/chosen": -0.8090169429779053, "logits/rejected": -0.7743819355964661, "logps/chosen": -159.4853973388672, "logps/rejected": -134.17738342285156, "loss": 0.4228, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5389195680618286, "rewards/margins": 0.4925135374069214, "rewards/rejected": -2.03143310546875, "step": 4278 }, { "epoch": 0.7143333889537794, "grad_norm": 13.470894813537598, "learning_rate": 1.2856666110462206e-05, "logits/chosen": -0.6082584857940674, "logits/rejected": -0.587029218673706, "logps/chosen": -61.56624221801758, "logps/rejected": -117.38680267333984, "loss": 0.5139, "rewards/accuracies": 1.0, "rewards/chosen": 0.9993836283683777, "rewards/margins": 4.9597086906433105, "rewards/rejected": -3.960325241088867, "step": 4281 }, { "epoch": 0.7148339729684632, "grad_norm": 58.48093795776367, "learning_rate": 1.285166027031537e-05, "logits/chosen": -0.5229816436767578, "logits/rejected": -0.5911995768547058, "logps/chosen": -73.32161712646484, "logps/rejected": -145.9752197265625, "loss": 0.2452, "rewards/accuracies": 1.0, "rewards/chosen": -0.41228556632995605, "rewards/margins": 2.1273341178894043, "rewards/rejected": -2.5396196842193604, "step": 4284 }, { "epoch": 0.715334556983147, "grad_norm": 33.56798553466797, "learning_rate": 1.2846654430168531e-05, "logits/chosen": -0.6772140860557556, "logits/rejected": -0.6746332049369812, "logps/chosen": -176.51658630371094, "logps/rejected": -159.9447479248047, "loss": 0.9154, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.1889514923095703, "rewards/margins": 1.2469040155410767, "rewards/rejected": -3.4358556270599365, "step": 4287 }, { "epoch": 0.7158351409978309, "grad_norm": 26.245176315307617, "learning_rate": 1.2841648590021693e-05, "logits/chosen": -0.8119291663169861, "logits/rejected": -0.7538647651672363, "logps/chosen": -94.7765121459961, "logps/rejected": -71.79965209960938, "loss": 0.4029, "rewards/accuracies": 1.0, "rewards/chosen": -1.5740342140197754, "rewards/margins": 0.8533340096473694, "rewards/rejected": -2.4273681640625, "step": 4290 }, { "epoch": 0.7163357250125146, "grad_norm": 20.863710403442383, "learning_rate": 1.2836642749874855e-05, "logits/chosen": -0.6772401928901672, "logits/rejected": -0.7326886057853699, "logps/chosen": -43.75217819213867, "logps/rejected": -75.11382293701172, "loss": 0.4715, "rewards/accuracies": 1.0, "rewards/chosen": -0.5583624243736267, "rewards/margins": 1.8168894052505493, "rewards/rejected": -2.3752520084381104, "step": 4293 }, { "epoch": 0.7168363090271984, "grad_norm": 12.996377944946289, "learning_rate": 1.2831636909728018e-05, "logits/chosen": -0.6286055445671082, "logits/rejected": -0.6577004790306091, "logps/chosen": -48.0484504699707, "logps/rejected": -123.22110748291016, "loss": 0.4251, "rewards/accuracies": 1.0, "rewards/chosen": 0.625381350517273, "rewards/margins": 4.486084461212158, "rewards/rejected": -3.860703229904175, "step": 4296 }, { "epoch": 0.7173368930418822, "grad_norm": 34.93965530395508, "learning_rate": 1.282663106958118e-05, "logits/chosen": -0.6083522439002991, "logits/rejected": -0.6416916847229004, "logps/chosen": -77.52440643310547, "logps/rejected": -105.9062728881836, "loss": 0.5608, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0738154649734497, "rewards/margins": 1.4043492078781128, "rewards/rejected": -2.4781646728515625, "step": 4299 }, { "epoch": 0.717837477056566, "grad_norm": 73.9717025756836, "learning_rate": 1.282162522943434e-05, "logits/chosen": -0.743804395198822, "logits/rejected": -0.7708801627159119, "logps/chosen": -76.08893585205078, "logps/rejected": -114.35028076171875, "loss": 0.6538, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2376674860715866, "rewards/margins": 2.2324798107147217, "rewards/rejected": -2.470147132873535, "step": 4302 }, { "epoch": 0.7183380610712498, "grad_norm": 11.891569137573242, "learning_rate": 1.2816619389287503e-05, "logits/chosen": -0.7852217555046082, "logits/rejected": -0.799632728099823, "logps/chosen": -80.16353607177734, "logps/rejected": -87.52677154541016, "loss": 0.3265, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7827478647232056, "rewards/margins": 0.16894036531448364, "rewards/rejected": -1.9516881704330444, "step": 4305 }, { "epoch": 0.7188386450859335, "grad_norm": 27.511791229248047, "learning_rate": 1.2811613549140665e-05, "logits/chosen": -0.5267985463142395, "logits/rejected": -0.6051878929138184, "logps/chosen": -95.4614028930664, "logps/rejected": -144.16819763183594, "loss": 0.5659, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7535989284515381, "rewards/margins": 2.0455191135406494, "rewards/rejected": -2.7991182804107666, "step": 4308 }, { "epoch": 0.7193392291006174, "grad_norm": 4.3651862144470215, "learning_rate": 1.2806607708993827e-05, "logits/chosen": -0.6909630298614502, "logits/rejected": -0.6659498810768127, "logps/chosen": -113.62596893310547, "logps/rejected": -77.74821472167969, "loss": 0.5707, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0315942764282227, "rewards/margins": 1.4257675409317017, "rewards/rejected": -3.457361936569214, "step": 4311 }, { "epoch": 0.7198398131153012, "grad_norm": 35.03156661987305, "learning_rate": 1.2801601868846989e-05, "logits/chosen": -0.753870964050293, "logits/rejected": -0.7235298752784729, "logps/chosen": -124.81034088134766, "logps/rejected": -113.69193267822266, "loss": 0.6047, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.728595733642578, "rewards/margins": 0.3131169378757477, "rewards/rejected": -3.041712760925293, "step": 4314 }, { "epoch": 0.720340397129985, "grad_norm": 18.995027542114258, "learning_rate": 1.2796596028700152e-05, "logits/chosen": -0.506681501865387, "logits/rejected": -0.5694817900657654, "logps/chosen": -57.70632553100586, "logps/rejected": -124.2264633178711, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 0.4979076683521271, "rewards/margins": 2.7522847652435303, "rewards/rejected": -2.2543766498565674, "step": 4317 }, { "epoch": 0.7208409811446688, "grad_norm": 22.858963012695312, "learning_rate": 1.2791590188553314e-05, "logits/chosen": -0.47557327151298523, "logits/rejected": -0.5070332288742065, "logps/chosen": -142.41831970214844, "logps/rejected": -172.3167724609375, "loss": 0.6457, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5377591252326965, "rewards/margins": 2.285426378250122, "rewards/rejected": -2.823185682296753, "step": 4320 }, { "epoch": 0.7213415651593525, "grad_norm": 4.3317060470581055, "learning_rate": 1.2786584348406474e-05, "logits/chosen": -0.5166175365447998, "logits/rejected": -0.5464215278625488, "logps/chosen": -76.59622955322266, "logps/rejected": -111.87140655517578, "loss": 0.885, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5369290113449097, "rewards/margins": 2.092848777770996, "rewards/rejected": -3.6297779083251953, "step": 4323 }, { "epoch": 0.7218421491740363, "grad_norm": 41.934932708740234, "learning_rate": 1.2781578508259638e-05, "logits/chosen": -0.9638637900352478, "logits/rejected": -0.9196130633354187, "logps/chosen": -95.38309478759766, "logps/rejected": -82.14160919189453, "loss": 0.4911, "rewards/accuracies": 1.0, "rewards/chosen": 0.2208166867494583, "rewards/margins": 2.649848222732544, "rewards/rejected": -2.4290316104888916, "step": 4326 }, { "epoch": 0.7223427331887202, "grad_norm": 28.406618118286133, "learning_rate": 1.27765726681128e-05, "logits/chosen": -0.6786004900932312, "logits/rejected": -0.6432227492332458, "logps/chosen": -131.00311279296875, "logps/rejected": -94.64473724365234, "loss": 0.7419, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.681912660598755, "rewards/margins": -0.44727516174316406, "rewards/rejected": -2.234637498855591, "step": 4329 }, { "epoch": 0.722843317203404, "grad_norm": 7.830700397491455, "learning_rate": 1.2771566827965961e-05, "logits/chosen": -0.6136944890022278, "logits/rejected": -0.6004111766815186, "logps/chosen": -114.15375518798828, "logps/rejected": -77.31954193115234, "loss": 0.5143, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.76616632938385, "rewards/margins": 1.3790208101272583, "rewards/rejected": -3.1451873779296875, "step": 4332 }, { "epoch": 0.7233439012180878, "grad_norm": 13.918981552124023, "learning_rate": 1.2766560987819123e-05, "logits/chosen": -0.4495302438735962, "logits/rejected": -0.4331190586090088, "logps/chosen": -82.19125366210938, "logps/rejected": -82.39617919921875, "loss": 0.479, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.25397035479545593, "rewards/margins": 1.104654312133789, "rewards/rejected": -0.8506839871406555, "step": 4335 }, { "epoch": 0.7238444852327716, "grad_norm": 39.56173324584961, "learning_rate": 1.2761555147672285e-05, "logits/chosen": -0.6896073818206787, "logits/rejected": -0.7609419226646423, "logps/chosen": -102.51531982421875, "logps/rejected": -110.33160400390625, "loss": 0.4023, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0601035356521606, "rewards/margins": 0.12660068273544312, "rewards/rejected": -1.1867042779922485, "step": 4338 }, { "epoch": 0.7243450692474553, "grad_norm": 6.834104537963867, "learning_rate": 1.2756549307525448e-05, "logits/chosen": -0.6158788204193115, "logits/rejected": -0.639561653137207, "logps/chosen": -162.78662109375, "logps/rejected": -207.81153869628906, "loss": 0.2125, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5377484560012817, "rewards/margins": 0.8974011540412903, "rewards/rejected": -1.4351496696472168, "step": 4341 }, { "epoch": 0.7248456532621391, "grad_norm": 34.630252838134766, "learning_rate": 1.2751543467378609e-05, "logits/chosen": -0.6132073998451233, "logits/rejected": -0.607724130153656, "logps/chosen": -56.677459716796875, "logps/rejected": -150.9667510986328, "loss": 0.7149, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.26035672426223755, "rewards/margins": 2.1490280628204346, "rewards/rejected": -2.4093847274780273, "step": 4344 }, { "epoch": 0.725346237276823, "grad_norm": 15.640362739562988, "learning_rate": 1.2746537627231772e-05, "logits/chosen": -0.7170100212097168, "logits/rejected": -0.7241289615631104, "logps/chosen": -79.90579223632812, "logps/rejected": -137.64329528808594, "loss": 0.3301, "rewards/accuracies": 1.0, "rewards/chosen": -0.506673276424408, "rewards/margins": 4.701854705810547, "rewards/rejected": -5.2085280418396, "step": 4347 }, { "epoch": 0.7258468212915068, "grad_norm": 29.88758659362793, "learning_rate": 1.2741531787084934e-05, "logits/chosen": -0.6588932871818542, "logits/rejected": -0.6697515845298767, "logps/chosen": -118.7291030883789, "logps/rejected": -118.17623138427734, "loss": 0.5587, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.3804585933685303, "rewards/margins": 1.0453675985336304, "rewards/rejected": -4.425826549530029, "step": 4350 }, { "epoch": 0.7263474053061906, "grad_norm": 12.167531967163086, "learning_rate": 1.2736525946938096e-05, "logits/chosen": -0.7413327693939209, "logits/rejected": -0.7704382538795471, "logps/chosen": -49.27999496459961, "logps/rejected": -158.78555297851562, "loss": 0.5167, "rewards/accuracies": 1.0, "rewards/chosen": -0.5433163046836853, "rewards/margins": 2.764955759048462, "rewards/rejected": -3.308272123336792, "step": 4353 }, { "epoch": 0.7268479893208744, "grad_norm": 44.28296661376953, "learning_rate": 1.2731520106791257e-05, "logits/chosen": -0.6363911628723145, "logits/rejected": -0.6235679984092712, "logps/chosen": -100.3027114868164, "logps/rejected": -90.35650634765625, "loss": 0.7127, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9986446499824524, "rewards/margins": -0.2905191481113434, "rewards/rejected": -0.7081255316734314, "step": 4356 }, { "epoch": 0.7273485733355581, "grad_norm": 27.555795669555664, "learning_rate": 1.272651426664442e-05, "logits/chosen": -0.7539337277412415, "logits/rejected": -0.6955503821372986, "logps/chosen": -106.1698226928711, "logps/rejected": -98.11205291748047, "loss": 0.433, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5361579656600952, "rewards/margins": 1.2678922414779663, "rewards/rejected": -2.8040504455566406, "step": 4359 }, { "epoch": 0.7278491573502419, "grad_norm": 24.89381217956543, "learning_rate": 1.2721508426497583e-05, "logits/chosen": -0.8389375805854797, "logits/rejected": -0.8660790324211121, "logps/chosen": -105.8901596069336, "logps/rejected": -126.0219497680664, "loss": 0.6994, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.0889663696289062, "rewards/margins": 0.030089596286416054, "rewards/rejected": -3.119055986404419, "step": 4362 }, { "epoch": 0.7283497413649257, "grad_norm": 25.75440788269043, "learning_rate": 1.2716502586350743e-05, "logits/chosen": -0.6146218180656433, "logits/rejected": -0.37873438000679016, "logps/chosen": -104.4961929321289, "logps/rejected": -81.39087677001953, "loss": 0.5304, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3767732381820679, "rewards/margins": 1.36001455783844, "rewards/rejected": -2.736787796020508, "step": 4365 }, { "epoch": 0.7288503253796096, "grad_norm": 24.997404098510742, "learning_rate": 1.2711496746203906e-05, "logits/chosen": -0.7700526714324951, "logits/rejected": -0.8466455340385437, "logps/chosen": -59.437896728515625, "logps/rejected": -143.115478515625, "loss": 0.3886, "rewards/accuracies": 1.0, "rewards/chosen": -0.6175075173377991, "rewards/margins": 4.289079666137695, "rewards/rejected": -4.90658712387085, "step": 4368 }, { "epoch": 0.7293509093942934, "grad_norm": 27.08710289001465, "learning_rate": 1.2706490906057068e-05, "logits/chosen": -0.5544652938842773, "logits/rejected": -0.6004021763801575, "logps/chosen": -79.25827026367188, "logps/rejected": -149.0019073486328, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": -0.04011930152773857, "rewards/margins": 3.2193453311920166, "rewards/rejected": -3.2594645023345947, "step": 4371 }, { "epoch": 0.7298514934089771, "grad_norm": 30.84951400756836, "learning_rate": 1.270148506591023e-05, "logits/chosen": -0.3652202785015106, "logits/rejected": -0.45579978823661804, "logps/chosen": -106.83060455322266, "logps/rejected": -217.60296630859375, "loss": 0.5912, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8318478465080261, "rewards/margins": 3.1105384826660156, "rewards/rejected": -3.9423863887786865, "step": 4374 }, { "epoch": 0.7303520774236609, "grad_norm": 32.022769927978516, "learning_rate": 1.2696479225763392e-05, "logits/chosen": -0.6704893708229065, "logits/rejected": -0.6448890566825867, "logps/chosen": -93.01912689208984, "logps/rejected": -106.76030731201172, "loss": 0.4034, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8036764860153198, "rewards/margins": 1.9370514154434204, "rewards/rejected": -3.740727663040161, "step": 4377 }, { "epoch": 0.7308526614383447, "grad_norm": 11.90835952758789, "learning_rate": 1.2691473385616553e-05, "logits/chosen": -0.5908802151679993, "logits/rejected": -0.573639452457428, "logps/chosen": -161.15101623535156, "logps/rejected": -134.12303161621094, "loss": 0.4071, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.388330340385437, "rewards/margins": 1.2188704013824463, "rewards/rejected": -2.607200860977173, "step": 4380 }, { "epoch": 0.7313532454530285, "grad_norm": 22.930715560913086, "learning_rate": 1.2686467545469717e-05, "logits/chosen": -0.5188859105110168, "logits/rejected": -0.5073332190513611, "logps/chosen": -72.00465393066406, "logps/rejected": -68.0886001586914, "loss": 0.7518, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.355494499206543, "rewards/margins": 0.8907501101493835, "rewards/rejected": -2.2462446689605713, "step": 4383 }, { "epoch": 0.7318538294677124, "grad_norm": 37.62380599975586, "learning_rate": 1.2681461705322877e-05, "logits/chosen": -0.6068174242973328, "logits/rejected": -0.6091874241828918, "logps/chosen": -126.67645263671875, "logps/rejected": -145.1027069091797, "loss": 0.7259, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1916338205337524, "rewards/margins": 0.3706298768520355, "rewards/rejected": -1.5622638463974, "step": 4386 }, { "epoch": 0.7323544134823962, "grad_norm": 10.500070571899414, "learning_rate": 1.267645586517604e-05, "logits/chosen": -0.6217055916786194, "logits/rejected": -0.6439533233642578, "logps/chosen": -71.1611099243164, "logps/rejected": -119.34756469726562, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": -0.6200476288795471, "rewards/margins": 3.0819311141967773, "rewards/rejected": -3.7019786834716797, "step": 4389 }, { "epoch": 0.7328549974970799, "grad_norm": 15.54166316986084, "learning_rate": 1.2671450025029202e-05, "logits/chosen": -0.6431033611297607, "logits/rejected": -0.6661655902862549, "logps/chosen": -41.0872688293457, "logps/rejected": -82.29322814941406, "loss": 0.6438, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7002987265586853, "rewards/margins": 0.6561150550842285, "rewards/rejected": -1.3564138412475586, "step": 4392 }, { "epoch": 0.7333555815117637, "grad_norm": 29.59764862060547, "learning_rate": 1.2666444184882362e-05, "logits/chosen": -0.6033642292022705, "logits/rejected": -0.6740811467170715, "logps/chosen": -139.9322052001953, "logps/rejected": -132.81869506835938, "loss": 0.7467, "rewards/accuracies": 0.0, "rewards/chosen": -1.1375781297683716, "rewards/margins": -0.7952634692192078, "rewards/rejected": -0.34231463074684143, "step": 4395 }, { "epoch": 0.7338561655264475, "grad_norm": 26.40423011779785, "learning_rate": 1.2661438344735526e-05, "logits/chosen": -0.5579873919487, "logits/rejected": -0.5932116508483887, "logps/chosen": -61.389495849609375, "logps/rejected": -138.37962341308594, "loss": 0.5019, "rewards/accuracies": 1.0, "rewards/chosen": -1.868703007698059, "rewards/margins": 1.1114915609359741, "rewards/rejected": -2.980194330215454, "step": 4398 }, { "epoch": 0.7343567495411313, "grad_norm": 34.884033203125, "learning_rate": 1.2656432504588688e-05, "logits/chosen": -0.5943103432655334, "logits/rejected": -0.6910012364387512, "logps/chosen": -56.46335220336914, "logps/rejected": -126.2109375, "loss": 1.1307, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.552106499671936, "rewards/margins": 1.175529956817627, "rewards/rejected": -1.7276363372802734, "step": 4401 }, { "epoch": 0.7348573335558152, "grad_norm": 11.567848205566406, "learning_rate": 1.2651426664441851e-05, "logits/chosen": -0.7084597945213318, "logits/rejected": -0.7167388796806335, "logps/chosen": -99.11652374267578, "logps/rejected": -155.3994598388672, "loss": 0.3103, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.2678797245025635, "rewards/margins": 0.09637401252985, "rewards/rejected": -2.3642537593841553, "step": 4404 }, { "epoch": 0.735357917570499, "grad_norm": 51.65326690673828, "learning_rate": 1.2646420824295011e-05, "logits/chosen": -0.8213978409767151, "logits/rejected": -0.7925841212272644, "logps/chosen": -61.48744201660156, "logps/rejected": -94.3245849609375, "loss": 0.5995, "rewards/accuracies": 1.0, "rewards/chosen": -0.24824893474578857, "rewards/margins": 2.802443742752075, "rewards/rejected": -3.0506927967071533, "step": 4407 }, { "epoch": 0.7358585015851827, "grad_norm": 16.59916114807129, "learning_rate": 1.2641414984148175e-05, "logits/chosen": -0.6390267014503479, "logits/rejected": -0.7369573712348938, "logps/chosen": -99.25809478759766, "logps/rejected": -168.55337524414062, "loss": 0.3626, "rewards/accuracies": 1.0, "rewards/chosen": -0.9702286124229431, "rewards/margins": 2.3332839012145996, "rewards/rejected": -3.3035125732421875, "step": 4410 }, { "epoch": 0.7363590855998665, "grad_norm": 13.309300422668457, "learning_rate": 1.2636409144001337e-05, "logits/chosen": -0.7364370226860046, "logits/rejected": -0.7090537548065186, "logps/chosen": -84.62871551513672, "logps/rejected": -154.66632080078125, "loss": 0.1949, "rewards/accuracies": 1.0, "rewards/chosen": -0.7213570475578308, "rewards/margins": 3.982362747192383, "rewards/rejected": -4.7037200927734375, "step": 4413 }, { "epoch": 0.7368596696145503, "grad_norm": 23.783538818359375, "learning_rate": 1.2631403303854497e-05, "logits/chosen": -0.565328061580658, "logits/rejected": -0.5337958931922913, "logps/chosen": -94.8548355102539, "logps/rejected": -88.8746337890625, "loss": 0.8958, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7152938842773438, "rewards/margins": 0.7903583645820618, "rewards/rejected": -1.5056524276733398, "step": 4416 }, { "epoch": 0.7373602536292341, "grad_norm": 14.151406288146973, "learning_rate": 1.262639746370766e-05, "logits/chosen": -0.7639009952545166, "logits/rejected": -0.724224865436554, "logps/chosen": -89.5262222290039, "logps/rejected": -100.86168670654297, "loss": 1.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.5607526302337646, "rewards/margins": 2.3357741832733154, "rewards/rejected": -2.896526575088501, "step": 4419 }, { "epoch": 0.7378608376439179, "grad_norm": 32.49443817138672, "learning_rate": 1.2621391623560822e-05, "logits/chosen": -0.7448675632476807, "logits/rejected": -0.7163227200508118, "logps/chosen": -178.0013427734375, "logps/rejected": -95.6807632446289, "loss": 1.232, "rewards/accuracies": 0.0, "rewards/chosen": -3.326223134994507, "rewards/margins": -3.131240129470825, "rewards/rejected": -0.1949828863143921, "step": 4422 }, { "epoch": 0.7383614216586017, "grad_norm": 13.782150268554688, "learning_rate": 1.2616385783413985e-05, "logits/chosen": -0.49355974793434143, "logits/rejected": -0.5261794328689575, "logps/chosen": -58.665924072265625, "logps/rejected": -99.1189193725586, "loss": 0.3826, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22205094993114471, "rewards/margins": 0.7938642501831055, "rewards/rejected": -1.0159151554107666, "step": 4425 }, { "epoch": 0.7388620056732855, "grad_norm": 10.38327407836914, "learning_rate": 1.2611379943267146e-05, "logits/chosen": -0.5935659408569336, "logits/rejected": -0.6064684391021729, "logps/chosen": -77.3058090209961, "logps/rejected": -131.1905059814453, "loss": 0.3985, "rewards/accuracies": 1.0, "rewards/chosen": -0.712135374546051, "rewards/margins": 2.925015449523926, "rewards/rejected": -3.637150764465332, "step": 4428 }, { "epoch": 0.7393625896879693, "grad_norm": 37.589134216308594, "learning_rate": 1.2606374103120309e-05, "logits/chosen": -0.4941329061985016, "logits/rejected": -0.5945188403129578, "logps/chosen": -73.15677642822266, "logps/rejected": -167.88832092285156, "loss": 0.7128, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8130707144737244, "rewards/margins": 1.3354159593582153, "rewards/rejected": -2.148486614227295, "step": 4431 }, { "epoch": 0.7398631737026531, "grad_norm": 18.26567840576172, "learning_rate": 1.2601368262973471e-05, "logits/chosen": -0.4851716458797455, "logits/rejected": -0.5506313443183899, "logps/chosen": -54.58311080932617, "logps/rejected": -146.9671630859375, "loss": 0.587, "rewards/accuracies": 1.0, "rewards/chosen": -0.1969827562570572, "rewards/margins": 3.3760859966278076, "rewards/rejected": -3.573068618774414, "step": 4434 }, { "epoch": 0.7403637577173369, "grad_norm": 27.881855010986328, "learning_rate": 1.2596362422826631e-05, "logits/chosen": -0.5070288181304932, "logits/rejected": -0.5916137099266052, "logps/chosen": -94.49349975585938, "logps/rejected": -155.55174255371094, "loss": 0.4421, "rewards/accuracies": 1.0, "rewards/chosen": 0.24201710522174835, "rewards/margins": 3.030590772628784, "rewards/rejected": -2.7885735034942627, "step": 4437 }, { "epoch": 0.7408643417320206, "grad_norm": 8.139666557312012, "learning_rate": 1.2591356582679794e-05, "logits/chosen": -0.7814763188362122, "logits/rejected": -0.8079932332038879, "logps/chosen": -70.98641204833984, "logps/rejected": -101.341064453125, "loss": 0.5157, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1144474744796753, "rewards/margins": 0.830893337726593, "rewards/rejected": -1.9453407526016235, "step": 4440 }, { "epoch": 0.7413649257467045, "grad_norm": 45.586734771728516, "learning_rate": 1.2586350742532956e-05, "logits/chosen": -0.6649969220161438, "logits/rejected": -0.673258364200592, "logps/chosen": -97.339599609375, "logps/rejected": -172.46034240722656, "loss": 0.8912, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.25762414932250977, "rewards/margins": 4.200137138366699, "rewards/rejected": -3.9425132274627686, "step": 4443 }, { "epoch": 0.7418655097613883, "grad_norm": 24.26229476928711, "learning_rate": 1.258134490238612e-05, "logits/chosen": -0.3446408808231354, "logits/rejected": -0.4212242066860199, "logps/chosen": -72.43669891357422, "logps/rejected": -154.15184020996094, "loss": 0.844, "rewards/accuracies": 1.0, "rewards/chosen": -0.5288956761360168, "rewards/margins": 1.1577163934707642, "rewards/rejected": -1.6866121292114258, "step": 4446 }, { "epoch": 0.7423660937760721, "grad_norm": 119.75019073486328, "learning_rate": 1.257633906223928e-05, "logits/chosen": -0.5058586597442627, "logits/rejected": -0.5492732524871826, "logps/chosen": -103.79656982421875, "logps/rejected": -137.72142028808594, "loss": 0.4217, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.013238251209259033, "rewards/margins": 1.3926340341567993, "rewards/rejected": -1.379395604133606, "step": 4449 }, { "epoch": 0.7428666777907559, "grad_norm": 27.478294372558594, "learning_rate": 1.2571333222092442e-05, "logits/chosen": -0.7709197998046875, "logits/rejected": -0.8149433732032776, "logps/chosen": -119.43790435791016, "logps/rejected": -134.7368927001953, "loss": 0.7013, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1668671369552612, "rewards/margins": 0.7007524371147156, "rewards/rejected": -1.867619514465332, "step": 4452 }, { "epoch": 0.7433672618054397, "grad_norm": 20.308183670043945, "learning_rate": 1.2566327381945605e-05, "logits/chosen": -0.4225856065750122, "logits/rejected": -0.47398504614830017, "logps/chosen": -117.5263442993164, "logps/rejected": -187.53541564941406, "loss": 0.4274, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9648584127426147, "rewards/margins": 0.5479124784469604, "rewards/rejected": -2.512770891189575, "step": 4455 }, { "epoch": 0.7438678458201234, "grad_norm": 5.906138896942139, "learning_rate": 1.2561321541798765e-05, "logits/chosen": -0.4515860080718994, "logits/rejected": -0.4765031635761261, "logps/chosen": -48.56144332885742, "logps/rejected": -84.66606903076172, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 0.45337995886802673, "rewards/margins": 1.8688435554504395, "rewards/rejected": -1.4154635667800903, "step": 4458 }, { "epoch": 0.7443684298348072, "grad_norm": 14.717655181884766, "learning_rate": 1.2556315701651929e-05, "logits/chosen": -0.7201780676841736, "logits/rejected": -0.733683168888092, "logps/chosen": -84.92243957519531, "logps/rejected": -112.24554443359375, "loss": 0.4592, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.636063575744629, "rewards/margins": 1.0935088396072388, "rewards/rejected": -2.729572296142578, "step": 4461 }, { "epoch": 0.7448690138494911, "grad_norm": 23.996055603027344, "learning_rate": 1.255130986150509e-05, "logits/chosen": -0.5997042655944824, "logits/rejected": -0.6920697689056396, "logps/chosen": -41.94247055053711, "logps/rejected": -127.734375, "loss": 0.4236, "rewards/accuracies": 1.0, "rewards/chosen": -0.3356330394744873, "rewards/margins": 2.028087854385376, "rewards/rejected": -2.363720655441284, "step": 4464 }, { "epoch": 0.7453695978641749, "grad_norm": 31.7568416595459, "learning_rate": 1.2546304021358254e-05, "logits/chosen": -0.40675094723701477, "logits/rejected": -0.46078386902809143, "logps/chosen": -77.34456634521484, "logps/rejected": -155.40362548828125, "loss": 0.448, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.066768765449524, "rewards/margins": 2.055821418762207, "rewards/rejected": -3.1225903034210205, "step": 4467 }, { "epoch": 0.7458701818788587, "grad_norm": 22.779809951782227, "learning_rate": 1.2541298181211414e-05, "logits/chosen": -0.2995390295982361, "logits/rejected": -0.5229693651199341, "logps/chosen": -42.869842529296875, "logps/rejected": -144.41400146484375, "loss": 0.1956, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3114040791988373, "rewards/margins": 1.104239583015442, "rewards/rejected": -1.4156436920166016, "step": 4470 }, { "epoch": 0.7463707658935425, "grad_norm": 34.86394500732422, "learning_rate": 1.2536292341064576e-05, "logits/chosen": -0.6649587750434875, "logits/rejected": -0.6724390983581543, "logps/chosen": -102.66881561279297, "logps/rejected": -120.7225570678711, "loss": 0.7229, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9854423403739929, "rewards/margins": 1.1525262594223022, "rewards/rejected": -2.1379687786102295, "step": 4473 }, { "epoch": 0.7468713499082262, "grad_norm": 34.8297233581543, "learning_rate": 1.253128650091774e-05, "logits/chosen": -0.4702790677547455, "logits/rejected": -0.5287682414054871, "logps/chosen": -110.51529693603516, "logps/rejected": -132.66505432128906, "loss": 0.6423, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.4414031505584717, "rewards/margins": 0.812457263469696, "rewards/rejected": -3.2538602352142334, "step": 4476 }, { "epoch": 0.74737193392291, "grad_norm": 3.100407123565674, "learning_rate": 1.25262806607709e-05, "logits/chosen": -0.6137983202934265, "logits/rejected": -0.5709285736083984, "logps/chosen": -116.65157318115234, "logps/rejected": -104.95958709716797, "loss": 0.2737, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.372929334640503, "rewards/margins": 1.270027756690979, "rewards/rejected": -3.6429569721221924, "step": 4479 }, { "epoch": 0.7478725179375939, "grad_norm": 12.150959014892578, "learning_rate": 1.2521274820624063e-05, "logits/chosen": -0.6523331999778748, "logits/rejected": -0.6748488545417786, "logps/chosen": -121.9202880859375, "logps/rejected": -118.76446533203125, "loss": 0.2276, "rewards/accuracies": 1.0, "rewards/chosen": -0.2906191349029541, "rewards/margins": 1.1521782875061035, "rewards/rejected": -1.442797303199768, "step": 4482 }, { "epoch": 0.7483731019522777, "grad_norm": 67.35621643066406, "learning_rate": 1.2516268980477225e-05, "logits/chosen": -0.7274240851402283, "logits/rejected": -0.747437059879303, "logps/chosen": -42.174922943115234, "logps/rejected": -97.57745361328125, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": 0.2593347728252411, "rewards/margins": 3.4433212280273438, "rewards/rejected": -3.1839866638183594, "step": 4485 }, { "epoch": 0.7488736859669615, "grad_norm": 32.631805419921875, "learning_rate": 1.2511263140330388e-05, "logits/chosen": -0.5978729724884033, "logits/rejected": -0.6464623212814331, "logps/chosen": -76.26444244384766, "logps/rejected": -148.50083923339844, "loss": 0.7316, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6288174390792847, "rewards/margins": 1.0884124040603638, "rewards/rejected": -2.7172298431396484, "step": 4488 }, { "epoch": 0.7493742699816452, "grad_norm": 12.081413269042969, "learning_rate": 1.2506257300183548e-05, "logits/chosen": -0.8086499571800232, "logits/rejected": -0.8707435131072998, "logps/chosen": -51.64918899536133, "logps/rejected": -134.8358154296875, "loss": 0.2771, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3856036961078644, "rewards/margins": 1.2929495573043823, "rewards/rejected": -0.9073458313941956, "step": 4491 }, { "epoch": 0.749874853996329, "grad_norm": 49.862152099609375, "learning_rate": 1.250125146003671e-05, "logits/chosen": -0.5032880306243896, "logits/rejected": -0.4806682765483856, "logps/chosen": -143.7633819580078, "logps/rejected": -134.68600463867188, "loss": 0.9025, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.75018310546875, "rewards/margins": -1.3528589010238647, "rewards/rejected": -2.397324323654175, "step": 4494 }, { "epoch": 0.7503754380110128, "grad_norm": 9.854738235473633, "learning_rate": 1.2496245619889874e-05, "logits/chosen": -0.7175175547599792, "logits/rejected": -0.7294173240661621, "logps/chosen": -72.67786407470703, "logps/rejected": -116.41702270507812, "loss": 0.2051, "rewards/accuracies": 1.0, "rewards/chosen": -1.2634882926940918, "rewards/margins": 2.2422099113464355, "rewards/rejected": -3.5056982040405273, "step": 4497 }, { "epoch": 0.7508760220256967, "grad_norm": 14.819962501525879, "learning_rate": 1.2491239779743034e-05, "logits/chosen": -0.8017845153808594, "logits/rejected": -0.7707542777061462, "logps/chosen": -116.19476318359375, "logps/rejected": -70.1706771850586, "loss": 0.7465, "rewards/accuracies": 0.0, "rewards/chosen": -0.9317202568054199, "rewards/margins": -1.8252002000808716, "rewards/rejected": 0.8934800028800964, "step": 4500 }, { "epoch": 0.7513766060403805, "grad_norm": 16.929759979248047, "learning_rate": 1.2486233939596197e-05, "logits/chosen": -0.5895442962646484, "logits/rejected": -0.6039531230926514, "logps/chosen": -75.36800384521484, "logps/rejected": -94.78414916992188, "loss": 0.8505, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.279670000076294, "rewards/margins": 1.1593482494354248, "rewards/rejected": -3.4390182495117188, "step": 4503 }, { "epoch": 0.7518771900550643, "grad_norm": 37.754337310791016, "learning_rate": 1.2481228099449359e-05, "logits/chosen": -0.5530858635902405, "logits/rejected": -0.5995059013366699, "logps/chosen": -73.08426666259766, "logps/rejected": -151.8746795654297, "loss": 0.3711, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7369458079338074, "rewards/margins": 4.103998184204102, "rewards/rejected": -4.840943813323975, "step": 4506 }, { "epoch": 0.752377774069748, "grad_norm": 9.905936241149902, "learning_rate": 1.2476222259302523e-05, "logits/chosen": -0.684258759021759, "logits/rejected": -0.7345819473266602, "logps/chosen": -85.88565826416016, "logps/rejected": -141.67657470703125, "loss": 0.5232, "rewards/accuracies": 1.0, "rewards/chosen": -1.7127290964126587, "rewards/margins": 3.6465060710906982, "rewards/rejected": -5.3592352867126465, "step": 4509 }, { "epoch": 0.7528783580844318, "grad_norm": 10.893828392028809, "learning_rate": 1.2471216419155683e-05, "logits/chosen": -0.5047503113746643, "logits/rejected": -0.5081237554550171, "logps/chosen": -78.26348876953125, "logps/rejected": -79.72937774658203, "loss": 0.3185, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0268598794937134, "rewards/margins": 0.8443062901496887, "rewards/rejected": -1.8711661100387573, "step": 4512 }, { "epoch": 0.7533789420991156, "grad_norm": 23.76875114440918, "learning_rate": 1.2466210579008844e-05, "logits/chosen": -0.870551347732544, "logits/rejected": -0.8583049774169922, "logps/chosen": -104.28643798828125, "logps/rejected": -174.8472137451172, "loss": 0.4147, "rewards/accuracies": 1.0, "rewards/chosen": -1.0025057792663574, "rewards/margins": 2.7163217067718506, "rewards/rejected": -3.718827247619629, "step": 4515 }, { "epoch": 0.7538795261137994, "grad_norm": 16.49224090576172, "learning_rate": 1.2461204738862008e-05, "logits/chosen": -0.5988377928733826, "logits/rejected": -0.6095018982887268, "logps/chosen": -76.33594512939453, "logps/rejected": -127.2244873046875, "loss": 0.5865, "rewards/accuracies": 1.0, "rewards/chosen": -1.0251778364181519, "rewards/margins": 2.1036605834960938, "rewards/rejected": -3.128838539123535, "step": 4518 }, { "epoch": 0.7543801101284833, "grad_norm": 20.801551818847656, "learning_rate": 1.2456198898715168e-05, "logits/chosen": -0.7697799801826477, "logits/rejected": -0.7736216187477112, "logps/chosen": -75.43681335449219, "logps/rejected": -75.00341033935547, "loss": 0.4291, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9006659388542175, "rewards/margins": -0.15167222917079926, "rewards/rejected": -0.7489936947822571, "step": 4521 }, { "epoch": 0.754880694143167, "grad_norm": 36.141632080078125, "learning_rate": 1.2451193058568331e-05, "logits/chosen": -0.6076696515083313, "logits/rejected": -0.6202167868614197, "logps/chosen": -137.04489135742188, "logps/rejected": -147.73077392578125, "loss": 0.8088, "rewards/accuracies": 1.0, "rewards/chosen": -0.6177358627319336, "rewards/margins": 2.901113271713257, "rewards/rejected": -3.5188491344451904, "step": 4524 }, { "epoch": 0.7553812781578508, "grad_norm": 5.486451148986816, "learning_rate": 1.2446187218421493e-05, "logits/chosen": -0.7109773755073547, "logits/rejected": -0.6790162920951843, "logps/chosen": -92.486328125, "logps/rejected": -96.24053955078125, "loss": 0.3004, "rewards/accuracies": 1.0, "rewards/chosen": -1.7664604187011719, "rewards/margins": 0.9761452078819275, "rewards/rejected": -2.742605447769165, "step": 4527 }, { "epoch": 0.7558818621725346, "grad_norm": 45.537696838378906, "learning_rate": 1.2441181378274653e-05, "logits/chosen": -0.6133930683135986, "logits/rejected": -0.6811738014221191, "logps/chosen": -140.20326232910156, "logps/rejected": -131.10313415527344, "loss": 0.6323, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8774687647819519, "rewards/margins": 1.4655481576919556, "rewards/rejected": -2.343017101287842, "step": 4530 }, { "epoch": 0.7563824461872184, "grad_norm": 3.5938899517059326, "learning_rate": 1.2436175538127817e-05, "logits/chosen": -0.6845152974128723, "logits/rejected": -0.6885396838188171, "logps/chosen": -43.86429214477539, "logps/rejected": -65.3650131225586, "loss": 0.3002, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.31317445635795593, "rewards/margins": 1.407361388206482, "rewards/rejected": -1.7205356359481812, "step": 4533 }, { "epoch": 0.7568830302019022, "grad_norm": 46.69532775878906, "learning_rate": 1.2431169697980979e-05, "logits/chosen": -0.5783218741416931, "logits/rejected": -0.6080317497253418, "logps/chosen": -63.3430061340332, "logps/rejected": -116.21047973632812, "loss": 0.8361, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.5347275733947754, "rewards/margins": 1.4253138303756714, "rewards/rejected": -2.9600412845611572, "step": 4536 }, { "epoch": 0.7573836142165861, "grad_norm": 36.89018630981445, "learning_rate": 1.2426163857834142e-05, "logits/chosen": -0.7227559685707092, "logits/rejected": -0.7648065686225891, "logps/chosen": -67.6288070678711, "logps/rejected": -117.79879760742188, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": -0.3303415775299072, "rewards/margins": 3.5904643535614014, "rewards/rejected": -3.9208059310913086, "step": 4539 }, { "epoch": 0.7578841982312698, "grad_norm": 44.40371322631836, "learning_rate": 1.2421158017687302e-05, "logits/chosen": -0.6233636736869812, "logits/rejected": -0.6180313229560852, "logps/chosen": -99.79537963867188, "logps/rejected": -164.0978546142578, "loss": 0.6803, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.717908501625061, "rewards/margins": 1.9049533605575562, "rewards/rejected": -3.622861862182617, "step": 4542 }, { "epoch": 0.7583847822459536, "grad_norm": 51.754051208496094, "learning_rate": 1.2416152177540466e-05, "logits/chosen": -0.5268328785896301, "logits/rejected": -0.4754127264022827, "logps/chosen": -122.34563446044922, "logps/rejected": -129.7395477294922, "loss": 0.8204, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.1174380779266357, "rewards/margins": -0.05401110649108887, "rewards/rejected": -2.063426971435547, "step": 4545 }, { "epoch": 0.7588853662606374, "grad_norm": 25.805891036987305, "learning_rate": 1.2411146337393628e-05, "logits/chosen": -0.6548484563827515, "logits/rejected": -0.65510493516922, "logps/chosen": -83.34357452392578, "logps/rejected": -124.94696044921875, "loss": 0.2942, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7112383842468262, "rewards/margins": 1.3908182382583618, "rewards/rejected": -3.1020565032958984, "step": 4548 }, { "epoch": 0.7593859502753212, "grad_norm": 16.61207389831543, "learning_rate": 1.2406140497246788e-05, "logits/chosen": -0.7337232232093811, "logits/rejected": -0.7206922173500061, "logps/chosen": -114.61421966552734, "logps/rejected": -137.9893798828125, "loss": 0.8524, "rewards/accuracies": 1.0, "rewards/chosen": -0.8321769833564758, "rewards/margins": 1.8940410614013672, "rewards/rejected": -2.7262182235717773, "step": 4551 }, { "epoch": 0.759886534290005, "grad_norm": 17.063310623168945, "learning_rate": 1.2401134657099951e-05, "logits/chosen": -0.7833730578422546, "logits/rejected": -0.7975446581840515, "logps/chosen": -76.56623840332031, "logps/rejected": -83.45553588867188, "loss": 0.514, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.710249900817871, "rewards/margins": 0.1935064196586609, "rewards/rejected": -1.9037562608718872, "step": 4554 }, { "epoch": 0.7603871183046889, "grad_norm": 43.26960372924805, "learning_rate": 1.2396128816953113e-05, "logits/chosen": -0.5605709552764893, "logits/rejected": -0.5286097526550293, "logps/chosen": -197.8551483154297, "logps/rejected": -138.4918212890625, "loss": 0.7681, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.628929853439331, "rewards/margins": -1.524119257926941, "rewards/rejected": -1.1048105955123901, "step": 4557 }, { "epoch": 0.7608877023193726, "grad_norm": 6.219507694244385, "learning_rate": 1.2391122976806276e-05, "logits/chosen": -0.7559311985969543, "logits/rejected": -0.7925798296928406, "logps/chosen": -31.948745727539062, "logps/rejected": -109.04407501220703, "loss": 0.3018, "rewards/accuracies": 1.0, "rewards/chosen": 0.5996642708778381, "rewards/margins": 3.824388265609741, "rewards/rejected": -3.2247238159179688, "step": 4560 }, { "epoch": 0.7613882863340564, "grad_norm": 29.529489517211914, "learning_rate": 1.2386117136659437e-05, "logits/chosen": -0.4549933671951294, "logits/rejected": -0.47123241424560547, "logps/chosen": -84.87540435791016, "logps/rejected": -156.2852325439453, "loss": 0.2792, "rewards/accuracies": 1.0, "rewards/chosen": -0.7078158259391785, "rewards/margins": 4.597820281982422, "rewards/rejected": -5.305636405944824, "step": 4563 }, { "epoch": 0.7618888703487402, "grad_norm": 8.207880020141602, "learning_rate": 1.23811112965126e-05, "logits/chosen": -0.7362386584281921, "logits/rejected": -0.7651764750480652, "logps/chosen": -84.4975357055664, "logps/rejected": -168.5562744140625, "loss": 0.2607, "rewards/accuracies": 1.0, "rewards/chosen": -0.033129651099443436, "rewards/margins": 3.3106019496917725, "rewards/rejected": -3.343731641769409, "step": 4566 }, { "epoch": 0.762389454363424, "grad_norm": 14.937304496765137, "learning_rate": 1.2376105456365762e-05, "logits/chosen": -0.662746250629425, "logits/rejected": -0.6779167056083679, "logps/chosen": -77.0301284790039, "logps/rejected": -124.9140396118164, "loss": 0.3855, "rewards/accuracies": 1.0, "rewards/chosen": -0.8500261306762695, "rewards/margins": 1.9113192558288574, "rewards/rejected": -2.761345624923706, "step": 4569 }, { "epoch": 0.7628900383781078, "grad_norm": 25.360872268676758, "learning_rate": 1.2371099616218922e-05, "logits/chosen": -0.5165292620658875, "logits/rejected": -0.5354292988777161, "logps/chosen": -122.19721221923828, "logps/rejected": -124.3678970336914, "loss": 0.9042, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0480241775512695, "rewards/margins": -0.08194788545370102, "rewards/rejected": -1.966076374053955, "step": 4572 }, { "epoch": 0.7633906223927915, "grad_norm": 18.981882095336914, "learning_rate": 1.2366093776072085e-05, "logits/chosen": -0.6057906150817871, "logits/rejected": -0.72905033826828, "logps/chosen": -65.9725112915039, "logps/rejected": -168.6581573486328, "loss": 0.5487, "rewards/accuracies": 1.0, "rewards/chosen": 1.0034441947937012, "rewards/margins": 4.268771648406982, "rewards/rejected": -3.2653274536132812, "step": 4575 }, { "epoch": 0.7638912064074754, "grad_norm": 33.79985046386719, "learning_rate": 1.2361087935925247e-05, "logits/chosen": -0.703599750995636, "logits/rejected": -0.760845959186554, "logps/chosen": -65.65361785888672, "logps/rejected": -170.2133331298828, "loss": 0.2387, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6428701281547546, "rewards/margins": 4.639395236968994, "rewards/rejected": -5.2822651863098145, "step": 4578 }, { "epoch": 0.7643917904221592, "grad_norm": 17.1864070892334, "learning_rate": 1.235608209577841e-05, "logits/chosen": -0.4876399040222168, "logits/rejected": -0.42877888679504395, "logps/chosen": -109.6162109375, "logps/rejected": -93.58377838134766, "loss": 0.3868, "rewards/accuracies": 1.0, "rewards/chosen": -0.17236481606960297, "rewards/margins": 3.2010984420776367, "rewards/rejected": -3.3734633922576904, "step": 4581 }, { "epoch": 0.764892374436843, "grad_norm": 23.238922119140625, "learning_rate": 1.235107625563157e-05, "logits/chosen": -0.4289269745349884, "logits/rejected": -0.46069416403770447, "logps/chosen": -27.340837478637695, "logps/rejected": -77.56175994873047, "loss": 0.4239, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.0348784923553467, "rewards/margins": 1.9902524948120117, "rewards/rejected": -0.9553741812705994, "step": 4584 }, { "epoch": 0.7653929584515268, "grad_norm": 34.99905776977539, "learning_rate": 1.2346070415484733e-05, "logits/chosen": -0.6370570063591003, "logits/rejected": -0.6390826106071472, "logps/chosen": -81.53577423095703, "logps/rejected": -92.90828704833984, "loss": 0.8849, "rewards/accuracies": 1.0, "rewards/chosen": -0.4942914545536041, "rewards/margins": 1.53605318069458, "rewards/rejected": -2.0303447246551514, "step": 4587 }, { "epoch": 0.7658935424662106, "grad_norm": 13.676335334777832, "learning_rate": 1.2341064575337896e-05, "logits/chosen": -0.6121665835380554, "logits/rejected": -0.5475462079048157, "logps/chosen": -126.33541107177734, "logps/rejected": -105.80654907226562, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": -1.7249646186828613, "rewards/margins": 2.349808692932129, "rewards/rejected": -4.07477331161499, "step": 4590 }, { "epoch": 0.7663941264808943, "grad_norm": 53.12432098388672, "learning_rate": 1.2336058735191056e-05, "logits/chosen": -0.6372451186180115, "logits/rejected": -0.6970848441123962, "logps/chosen": -75.75360107421875, "logps/rejected": -135.14112854003906, "loss": 0.6352, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8836913704872131, "rewards/margins": 0.8506147265434265, "rewards/rejected": -1.73430597782135, "step": 4593 }, { "epoch": 0.7668947104955782, "grad_norm": 46.78618240356445, "learning_rate": 1.233105289504422e-05, "logits/chosen": -0.6370261311531067, "logits/rejected": -0.5895012021064758, "logps/chosen": -115.8018569946289, "logps/rejected": -110.885498046875, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": -1.4627996683120728, "rewards/margins": 2.2094881534576416, "rewards/rejected": -3.672287940979004, "step": 4596 }, { "epoch": 0.767395294510262, "grad_norm": 2.049112319946289, "learning_rate": 1.2326047054897381e-05, "logits/chosen": -0.3930380642414093, "logits/rejected": -0.40639686584472656, "logps/chosen": -81.23108673095703, "logps/rejected": -113.7804183959961, "loss": 0.4319, "rewards/accuracies": 1.0, "rewards/chosen": -0.5494523644447327, "rewards/margins": 1.743259072303772, "rewards/rejected": -2.2927114963531494, "step": 4599 }, { "epoch": 0.7678958785249458, "grad_norm": 27.71993064880371, "learning_rate": 1.2321041214750545e-05, "logits/chosen": -0.49407970905303955, "logits/rejected": -0.5030515789985657, "logps/chosen": -52.97201919555664, "logps/rejected": -131.8582000732422, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": -0.1846572905778885, "rewards/margins": 6.180990695953369, "rewards/rejected": -6.365647792816162, "step": 4602 }, { "epoch": 0.7683964625396296, "grad_norm": 13.836877822875977, "learning_rate": 1.2316035374603705e-05, "logits/chosen": -0.5767556428909302, "logits/rejected": -0.6106069087982178, "logps/chosen": -50.827396392822266, "logps/rejected": -105.94024658203125, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": -0.8305582404136658, "rewards/margins": 1.6169458627700806, "rewards/rejected": -2.4475042819976807, "step": 4605 }, { "epoch": 0.7688970465543133, "grad_norm": 13.434618949890137, "learning_rate": 1.2311029534456867e-05, "logits/chosen": -0.6400447487831116, "logits/rejected": -0.7050043940544128, "logps/chosen": -81.12860107421875, "logps/rejected": -123.10868072509766, "loss": 0.7008, "rewards/accuracies": 1.0, "rewards/chosen": -1.125195026397705, "rewards/margins": 3.039766311645508, "rewards/rejected": -4.164961338043213, "step": 4608 }, { "epoch": 0.7693976305689971, "grad_norm": 22.81416893005371, "learning_rate": 1.230602369431003e-05, "logits/chosen": -0.6848018765449524, "logits/rejected": -0.6265903115272522, "logps/chosen": -71.16583251953125, "logps/rejected": -32.179412841796875, "loss": 0.8732, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.837975025177002, "rewards/margins": -0.7099432349205017, "rewards/rejected": -0.12803179025650024, "step": 4611 }, { "epoch": 0.7698982145836809, "grad_norm": 37.96695327758789, "learning_rate": 1.230101785416319e-05, "logits/chosen": -0.6748332977294922, "logits/rejected": -0.636959969997406, "logps/chosen": -108.6153793334961, "logps/rejected": -117.24246978759766, "loss": 0.9313, "rewards/accuracies": 1.0, "rewards/chosen": -0.9363566040992737, "rewards/margins": 1.4330633878707886, "rewards/rejected": -2.369420051574707, "step": 4614 }, { "epoch": 0.7703987985983648, "grad_norm": 3.149198055267334, "learning_rate": 1.2296012014016354e-05, "logits/chosen": -0.6089715361595154, "logits/rejected": -0.6099319458007812, "logps/chosen": -102.78862762451172, "logps/rejected": -73.73070526123047, "loss": 0.8358, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.814783811569214, "rewards/margins": -1.837294101715088, "rewards/rejected": -1.9774898290634155, "step": 4617 }, { "epoch": 0.7708993826130486, "grad_norm": 39.8233757019043, "learning_rate": 1.2291006173869516e-05, "logits/chosen": -0.7616517543792725, "logits/rejected": -0.7797960638999939, "logps/chosen": -67.53844451904297, "logps/rejected": -152.04542541503906, "loss": 0.2234, "rewards/accuracies": 1.0, "rewards/chosen": -0.36379316449165344, "rewards/margins": 1.7282981872558594, "rewards/rejected": -2.0920913219451904, "step": 4620 }, { "epoch": 0.7713999666277324, "grad_norm": 38.62008285522461, "learning_rate": 1.228600033372268e-05, "logits/chosen": -0.645913302898407, "logits/rejected": -0.6369662284851074, "logps/chosen": -117.4687271118164, "logps/rejected": -107.87653350830078, "loss": 0.3585, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.127732753753662, "rewards/margins": 0.3037683963775635, "rewards/rejected": -2.4315009117126465, "step": 4623 }, { "epoch": 0.7719005506424161, "grad_norm": 10.650192260742188, "learning_rate": 1.228099449357584e-05, "logits/chosen": -0.7052156329154968, "logits/rejected": -0.7203758358955383, "logps/chosen": -103.16033172607422, "logps/rejected": -129.7327117919922, "loss": 0.3394, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.6343142986297607, "rewards/margins": 0.5045884251594543, "rewards/rejected": -3.1389026641845703, "step": 4626 }, { "epoch": 0.7724011346570999, "grad_norm": 30.456623077392578, "learning_rate": 1.2275988653429001e-05, "logits/chosen": -0.43596020340919495, "logits/rejected": -0.41059422492980957, "logps/chosen": -103.03328704833984, "logps/rejected": -94.40198516845703, "loss": 0.4499, "rewards/accuracies": 1.0, "rewards/chosen": -0.5166099667549133, "rewards/margins": 1.5567097663879395, "rewards/rejected": -2.073319673538208, "step": 4629 }, { "epoch": 0.7729017186717837, "grad_norm": 20.96368408203125, "learning_rate": 1.2270982813282165e-05, "logits/chosen": -0.49607741832733154, "logits/rejected": -0.5800693035125732, "logps/chosen": -63.558746337890625, "logps/rejected": -138.59500122070312, "loss": 0.1465, "rewards/accuracies": 1.0, "rewards/chosen": 0.3683025538921356, "rewards/margins": 4.964234828948975, "rewards/rejected": -4.595932483673096, "step": 4632 }, { "epoch": 0.7734023026864676, "grad_norm": 9.267444610595703, "learning_rate": 1.2265976973135325e-05, "logits/chosen": -0.6354292035102844, "logits/rejected": -0.7412996292114258, "logps/chosen": -52.9574089050293, "logps/rejected": -159.58042907714844, "loss": 0.2287, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.852534294128418, "rewards/margins": 2.2016806602478027, "rewards/rejected": -4.054214954376221, "step": 4635 }, { "epoch": 0.7739028867011514, "grad_norm": 28.450382232666016, "learning_rate": 1.2260971132988488e-05, "logits/chosen": -0.6915757656097412, "logits/rejected": -0.6502706408500671, "logps/chosen": -129.41360473632812, "logps/rejected": -130.95945739746094, "loss": 1.0753, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.35495248436927795, "rewards/margins": 0.6220316290855408, "rewards/rejected": -0.9769842028617859, "step": 4638 }, { "epoch": 0.7744034707158352, "grad_norm": 26.060462951660156, "learning_rate": 1.225596529284165e-05, "logits/chosen": -0.6631572842597961, "logits/rejected": -0.7149341702461243, "logps/chosen": -73.6654052734375, "logps/rejected": -119.78682708740234, "loss": 0.4249, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8021589517593384, "rewards/margins": 0.3378049433231354, "rewards/rejected": -2.1399638652801514, "step": 4641 }, { "epoch": 0.7749040547305189, "grad_norm": 47.014930725097656, "learning_rate": 1.225095945269481e-05, "logits/chosen": -0.7834005951881409, "logits/rejected": -0.7504379749298096, "logps/chosen": -84.87184143066406, "logps/rejected": -74.65913391113281, "loss": 0.4705, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1974748373031616, "rewards/margins": 0.6997111439704895, "rewards/rejected": -1.8971859216690063, "step": 4644 }, { "epoch": 0.7754046387452027, "grad_norm": 16.834016799926758, "learning_rate": 1.2245953612547974e-05, "logits/chosen": -0.5912767648696899, "logits/rejected": -0.5785976052284241, "logps/chosen": -101.76424407958984, "logps/rejected": -117.144775390625, "loss": 0.5121, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6947161555290222, "rewards/margins": 0.2256980687379837, "rewards/rejected": -0.9204142689704895, "step": 4647 }, { "epoch": 0.7759052227598865, "grad_norm": 16.75829315185547, "learning_rate": 1.2240947772401135e-05, "logits/chosen": -0.4873858392238617, "logits/rejected": -0.5800918340682983, "logps/chosen": -73.03956604003906, "logps/rejected": -154.65887451171875, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": -0.4270133972167969, "rewards/margins": 4.525786399841309, "rewards/rejected": -4.952800273895264, "step": 4650 }, { "epoch": 0.7764058067745704, "grad_norm": 43.24107360839844, "learning_rate": 1.2235941932254299e-05, "logits/chosen": -0.8314626216888428, "logits/rejected": -0.8122494220733643, "logps/chosen": -118.18280029296875, "logps/rejected": -122.50332641601562, "loss": 1.6498, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.00943660736084, "rewards/margins": 1.0688838958740234, "rewards/rejected": -4.078320503234863, "step": 4653 }, { "epoch": 0.7769063907892542, "grad_norm": 45.9099235534668, "learning_rate": 1.2230936092107459e-05, "logits/chosen": -0.678442656993866, "logits/rejected": -0.60483318567276, "logps/chosen": -124.21903228759766, "logps/rejected": -63.23274612426758, "loss": 0.6143, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8521449565887451, "rewards/margins": 1.4789799451828003, "rewards/rejected": -2.331125020980835, "step": 4656 }, { "epoch": 0.777406974803938, "grad_norm": 10.332778930664062, "learning_rate": 1.2225930251960622e-05, "logits/chosen": -0.5798028707504272, "logits/rejected": -0.6114506125450134, "logps/chosen": -54.0555419921875, "logps/rejected": -116.38299560546875, "loss": 0.6649, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0374337434768677, "rewards/margins": 1.9661976099014282, "rewards/rejected": -3.003631591796875, "step": 4659 }, { "epoch": 0.7779075588186217, "grad_norm": 24.463071823120117, "learning_rate": 1.2220924411813784e-05, "logits/chosen": -0.7694666385650635, "logits/rejected": -0.7963917851448059, "logps/chosen": -72.0792007446289, "logps/rejected": -106.86315155029297, "loss": 0.5939, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6794666647911072, "rewards/margins": 1.0285285711288452, "rewards/rejected": -1.707995057106018, "step": 4662 }, { "epoch": 0.7784081428333055, "grad_norm": 31.958637237548828, "learning_rate": 1.2215918571666944e-05, "logits/chosen": -0.6917340755462646, "logits/rejected": -0.7087517380714417, "logps/chosen": -69.91536712646484, "logps/rejected": -103.15003204345703, "loss": 0.4251, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7633882761001587, "rewards/margins": 2.1588621139526367, "rewards/rejected": -3.922250509262085, "step": 4665 }, { "epoch": 0.7789087268479893, "grad_norm": 24.83547592163086, "learning_rate": 1.2210912731520108e-05, "logits/chosen": -0.6715838313102722, "logits/rejected": -0.7049241662025452, "logps/chosen": -94.07513427734375, "logps/rejected": -150.5056610107422, "loss": 0.6332, "rewards/accuracies": 1.0, "rewards/chosen": -1.5264686346054077, "rewards/margins": 3.6359317302703857, "rewards/rejected": -5.162400245666504, "step": 4668 }, { "epoch": 0.7794093108626731, "grad_norm": 60.81197738647461, "learning_rate": 1.220590689137327e-05, "logits/chosen": -0.7049067616462708, "logits/rejected": -0.6449599862098694, "logps/chosen": -149.75498962402344, "logps/rejected": -112.29170989990234, "loss": 0.4817, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.4569413661956787, "rewards/margins": -0.17239868640899658, "rewards/rejected": -3.2845427989959717, "step": 4671 }, { "epoch": 0.779909894877357, "grad_norm": 53.46412658691406, "learning_rate": 1.2200901051226433e-05, "logits/chosen": -0.7177543640136719, "logits/rejected": -0.6562318205833435, "logps/chosen": -126.74267578125, "logps/rejected": -92.5052490234375, "loss": 0.6476, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4004974365234375, "rewards/margins": 1.1200863122940063, "rewards/rejected": -2.5205838680267334, "step": 4674 }, { "epoch": 0.7804104788920407, "grad_norm": 10.349162101745605, "learning_rate": 1.2195895211079593e-05, "logits/chosen": -0.6020395159721375, "logits/rejected": -0.6703290939331055, "logps/chosen": -62.28837203979492, "logps/rejected": -108.0761947631836, "loss": 0.2018, "rewards/accuracies": 1.0, "rewards/chosen": -0.047011058777570724, "rewards/margins": 3.4930124282836914, "rewards/rejected": -3.5400235652923584, "step": 4677 }, { "epoch": 0.7809110629067245, "grad_norm": 15.50549602508545, "learning_rate": 1.2190889370932757e-05, "logits/chosen": -0.7011759281158447, "logits/rejected": -0.6705595850944519, "logps/chosen": -152.759033203125, "logps/rejected": -133.80889892578125, "loss": 0.2947, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7716784477233887, "rewards/margins": 1.1631792783737183, "rewards/rejected": -2.9348576068878174, "step": 4680 }, { "epoch": 0.7814116469214083, "grad_norm": 73.85627746582031, "learning_rate": 1.2185883530785919e-05, "logits/chosen": -0.4870587885379791, "logits/rejected": -0.6587666869163513, "logps/chosen": -73.53455352783203, "logps/rejected": -190.3455810546875, "loss": 0.4092, "rewards/accuracies": 1.0, "rewards/chosen": -1.8738527297973633, "rewards/margins": 2.3325564861297607, "rewards/rejected": -4.206408977508545, "step": 4683 }, { "epoch": 0.7819122309360921, "grad_norm": 42.086483001708984, "learning_rate": 1.2180877690639079e-05, "logits/chosen": -0.5965006351470947, "logits/rejected": -0.5868664383888245, "logps/chosen": -169.11134338378906, "logps/rejected": -101.31798553466797, "loss": 0.5005, "rewards/accuracies": 0.0, "rewards/chosen": -2.946444511413574, "rewards/margins": -1.52021062374115, "rewards/rejected": -1.4262338876724243, "step": 4686 }, { "epoch": 0.7824128149507759, "grad_norm": 35.09175491333008, "learning_rate": 1.2175871850492242e-05, "logits/chosen": -0.7378167510032654, "logits/rejected": -0.6757804751396179, "logps/chosen": -101.85770416259766, "logps/rejected": -73.03020477294922, "loss": 1.0273, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.8257731199264526, "rewards/margins": 0.7552888989448547, "rewards/rejected": -2.581061840057373, "step": 4689 }, { "epoch": 0.7829133989654598, "grad_norm": 7.850844383239746, "learning_rate": 1.2170866010345404e-05, "logits/chosen": -0.6667889952659607, "logits/rejected": -0.6959311366081238, "logps/chosen": -84.11470794677734, "logps/rejected": -147.42588806152344, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": -0.5223135948181152, "rewards/margins": 3.2706680297851562, "rewards/rejected": -3.7929813861846924, "step": 4692 }, { "epoch": 0.7834139829801435, "grad_norm": 19.339115142822266, "learning_rate": 1.2165860170198567e-05, "logits/chosen": -0.8551318645477295, "logits/rejected": -0.8572368025779724, "logps/chosen": -102.44738006591797, "logps/rejected": -144.9851531982422, "loss": 0.3381, "rewards/accuracies": 1.0, "rewards/chosen": -0.8571431040763855, "rewards/margins": 3.405918836593628, "rewards/rejected": -4.263062000274658, "step": 4695 }, { "epoch": 0.7839145669948273, "grad_norm": 54.178550720214844, "learning_rate": 1.2160854330051727e-05, "logits/chosen": -0.6816990971565247, "logits/rejected": -0.7411141395568848, "logps/chosen": -124.48675537109375, "logps/rejected": -134.64622497558594, "loss": 0.9686, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.925933837890625, "rewards/margins": -0.31293973326683044, "rewards/rejected": -3.6129941940307617, "step": 4698 }, { "epoch": 0.7844151510095111, "grad_norm": 12.239764213562012, "learning_rate": 1.215584848990489e-05, "logits/chosen": -0.6053391098976135, "logits/rejected": -0.6499602198600769, "logps/chosen": -73.32463073730469, "logps/rejected": -167.28860473632812, "loss": 0.5789, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.41835546493530273, "rewards/margins": 1.9767169952392578, "rewards/rejected": -2.3950726985931396, "step": 4701 }, { "epoch": 0.7849157350241949, "grad_norm": 33.476158142089844, "learning_rate": 1.2150842649758053e-05, "logits/chosen": -0.5896721482276917, "logits/rejected": -0.5361992716789246, "logps/chosen": -116.8318862915039, "logps/rejected": -95.43488311767578, "loss": 0.7656, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4215631484985352, "rewards/margins": 0.7198696732521057, "rewards/rejected": -2.141433000564575, "step": 4704 }, { "epoch": 0.7854163190388787, "grad_norm": 28.902101516723633, "learning_rate": 1.2145836809611213e-05, "logits/chosen": -0.7502867579460144, "logits/rejected": -0.6915771961212158, "logps/chosen": -132.0134735107422, "logps/rejected": -100.13555145263672, "loss": 0.4519, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.187791109085083, "rewards/margins": -0.23614053428173065, "rewards/rejected": -1.951650619506836, "step": 4707 }, { "epoch": 0.7859169030535625, "grad_norm": 29.628543853759766, "learning_rate": 1.2140830969464376e-05, "logits/chosen": -0.3306374251842499, "logits/rejected": -0.3938029706478119, "logps/chosen": -74.86851501464844, "logps/rejected": -120.58758544921875, "loss": 0.4976, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.342951774597168, "rewards/margins": 2.0248525142669678, "rewards/rejected": -3.3678042888641357, "step": 4710 }, { "epoch": 0.7864174870682463, "grad_norm": 13.741998672485352, "learning_rate": 1.2135825129317538e-05, "logits/chosen": -0.9090132117271423, "logits/rejected": -0.9177563190460205, "logps/chosen": -73.68726348876953, "logps/rejected": -114.21502685546875, "loss": 0.3076, "rewards/accuracies": 1.0, "rewards/chosen": -0.38580918312072754, "rewards/margins": 4.140902996063232, "rewards/rejected": -4.526711940765381, "step": 4713 }, { "epoch": 0.7869180710829301, "grad_norm": 18.517127990722656, "learning_rate": 1.2130819289170702e-05, "logits/chosen": -0.732044517993927, "logits/rejected": -0.8062427639961243, "logps/chosen": -82.64453887939453, "logps/rejected": -139.3571014404297, "loss": 0.3542, "rewards/accuracies": 1.0, "rewards/chosen": -1.0911535024642944, "rewards/margins": 3.8260176181793213, "rewards/rejected": -4.917171478271484, "step": 4716 }, { "epoch": 0.7874186550976139, "grad_norm": 30.211746215820312, "learning_rate": 1.2125813449023862e-05, "logits/chosen": -0.6810154318809509, "logits/rejected": -0.7214387059211731, "logps/chosen": -117.34481811523438, "logps/rejected": -157.00379943847656, "loss": 0.9771, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7822179198265076, "rewards/margins": -0.06832221895456314, "rewards/rejected": -0.7138956189155579, "step": 4719 }, { "epoch": 0.7879192391122977, "grad_norm": 21.693870544433594, "learning_rate": 1.2120807608877024e-05, "logits/chosen": -0.7717141509056091, "logits/rejected": -0.7432220578193665, "logps/chosen": -115.47408294677734, "logps/rejected": -72.64856719970703, "loss": 0.6029, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6631726622581482, "rewards/margins": 0.004544516559690237, "rewards/rejected": -0.6677171587944031, "step": 4722 }, { "epoch": 0.7884198231269814, "grad_norm": 50.37309265136719, "learning_rate": 1.2115801768730187e-05, "logits/chosen": -0.8502112030982971, "logits/rejected": -0.860531747341156, "logps/chosen": -80.49801635742188, "logps/rejected": -109.57770538330078, "loss": 0.7078, "rewards/accuracies": 1.0, "rewards/chosen": -0.6543586850166321, "rewards/margins": 1.9803484678268433, "rewards/rejected": -2.63470721244812, "step": 4725 }, { "epoch": 0.7889204071416652, "grad_norm": 4.938715934753418, "learning_rate": 1.2110795928583347e-05, "logits/chosen": -0.6434864401817322, "logits/rejected": -0.6601161360740662, "logps/chosen": -100.491455078125, "logps/rejected": -142.95069885253906, "loss": 0.562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9185188412666321, "rewards/margins": 2.593168258666992, "rewards/rejected": -3.5116872787475586, "step": 4728 }, { "epoch": 0.7894209911563491, "grad_norm": 33.989139556884766, "learning_rate": 1.210579008843651e-05, "logits/chosen": -0.6516656279563904, "logits/rejected": -0.7228431105613708, "logps/chosen": -100.08346557617188, "logps/rejected": -147.5094451904297, "loss": 0.5585, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5231761336326599, "rewards/margins": 1.296263337135315, "rewards/rejected": -1.8194395303726196, "step": 4731 }, { "epoch": 0.7899215751710329, "grad_norm": 23.997278213500977, "learning_rate": 1.2100784248289672e-05, "logits/chosen": -0.6960477828979492, "logits/rejected": -0.7088186144828796, "logps/chosen": -83.15808868408203, "logps/rejected": -115.97757720947266, "loss": 0.3602, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5114434957504272, "rewards/margins": 2.050447463989258, "rewards/rejected": -2.5618908405303955, "step": 4734 }, { "epoch": 0.7904221591857167, "grad_norm": 25.999067306518555, "learning_rate": 1.2095778408142836e-05, "logits/chosen": -0.7473788261413574, "logits/rejected": -0.6637793183326721, "logps/chosen": -76.1502685546875, "logps/rejected": -67.96269989013672, "loss": 0.3938, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4578012228012085, "rewards/margins": 1.3973544836044312, "rewards/rejected": -1.85515558719635, "step": 4737 }, { "epoch": 0.7909227432004005, "grad_norm": 35.32756423950195, "learning_rate": 1.2090772567995996e-05, "logits/chosen": -0.45896443724632263, "logits/rejected": -0.5114392638206482, "logps/chosen": -77.80282592773438, "logps/rejected": -134.85780334472656, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": -1.7399259805679321, "rewards/margins": 2.9169301986694336, "rewards/rejected": -4.656856060028076, "step": 4740 }, { "epoch": 0.7914233272150842, "grad_norm": 20.878767013549805, "learning_rate": 1.2085766727849158e-05, "logits/chosen": -0.7406766414642334, "logits/rejected": -0.6940555572509766, "logps/chosen": -107.83575439453125, "logps/rejected": -81.2938232421875, "loss": 0.2082, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6092614531517029, "rewards/margins": 1.4663020372390747, "rewards/rejected": -2.075563430786133, "step": 4743 }, { "epoch": 0.791923911229768, "grad_norm": 12.153913497924805, "learning_rate": 1.2080760887702321e-05, "logits/chosen": -0.5745278000831604, "logits/rejected": -0.5508928298950195, "logps/chosen": -102.66158294677734, "logps/rejected": -114.78280639648438, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": -0.9057989120483398, "rewards/margins": 2.2047595977783203, "rewards/rejected": -3.1105587482452393, "step": 4746 }, { "epoch": 0.7924244952444519, "grad_norm": 22.49517059326172, "learning_rate": 1.2075755047555481e-05, "logits/chosen": -0.6362589001655579, "logits/rejected": -0.6257770657539368, "logps/chosen": -72.28604125976562, "logps/rejected": -77.6517333984375, "loss": 0.4029, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6146400570869446, "rewards/margins": 1.0109614133834839, "rewards/rejected": -1.6256014108657837, "step": 4749 }, { "epoch": 0.7929250792591357, "grad_norm": 29.485143661499023, "learning_rate": 1.2070749207408645e-05, "logits/chosen": -0.6028030514717102, "logits/rejected": -0.5621187686920166, "logps/chosen": -116.7110824584961, "logps/rejected": -107.37808227539062, "loss": 0.5434, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8581289052963257, "rewards/margins": 1.3856090307235718, "rewards/rejected": -3.2437381744384766, "step": 4752 }, { "epoch": 0.7934256632738195, "grad_norm": 9.806266784667969, "learning_rate": 1.2065743367261807e-05, "logits/chosen": -0.735174834728241, "logits/rejected": -0.7353997826576233, "logps/chosen": -80.3104248046875, "logps/rejected": -92.5477066040039, "loss": 0.4274, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.053056240081787, "rewards/margins": 0.6253246665000916, "rewards/rejected": -1.6783809661865234, "step": 4755 }, { "epoch": 0.7939262472885033, "grad_norm": 10.499192237854004, "learning_rate": 1.2060737527114967e-05, "logits/chosen": -0.6417461633682251, "logits/rejected": -0.6317694783210754, "logps/chosen": -94.3705062866211, "logps/rejected": -92.40909576416016, "loss": 0.6805, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5222786664962769, "rewards/margins": 0.8928443789482117, "rewards/rejected": -2.4151229858398438, "step": 4758 }, { "epoch": 0.794426831303187, "grad_norm": 8.087518692016602, "learning_rate": 1.205573168696813e-05, "logits/chosen": -0.716630756855011, "logits/rejected": -0.7539856433868408, "logps/chosen": -54.965789794921875, "logps/rejected": -123.0936279296875, "loss": 0.1958, "rewards/accuracies": 1.0, "rewards/chosen": 0.8243936896324158, "rewards/margins": 3.3622429370880127, "rewards/rejected": -2.5378496646881104, "step": 4761 }, { "epoch": 0.7949274153178708, "grad_norm": 36.072383880615234, "learning_rate": 1.2050725846821292e-05, "logits/chosen": -0.6299071907997131, "logits/rejected": -0.6191338896751404, "logps/chosen": -60.63750076293945, "logps/rejected": -135.17654418945312, "loss": 0.9408, "rewards/accuracies": 1.0, "rewards/chosen": 0.036668796092271805, "rewards/margins": 6.5989861488342285, "rewards/rejected": -6.562317371368408, "step": 4764 }, { "epoch": 0.7954279993325546, "grad_norm": 13.1127347946167, "learning_rate": 1.2045720006674456e-05, "logits/chosen": -0.5857036709785461, "logits/rejected": -0.5898920297622681, "logps/chosen": -98.9785385131836, "logps/rejected": -94.30188751220703, "loss": 0.4634, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17900951206684113, "rewards/margins": 1.367660403251648, "rewards/rejected": -1.5466699600219727, "step": 4767 }, { "epoch": 0.7959285833472385, "grad_norm": 19.58861541748047, "learning_rate": 1.2040714166527616e-05, "logits/chosen": -0.4973507821559906, "logits/rejected": -0.5705735087394714, "logps/chosen": -33.55033874511719, "logps/rejected": -145.57737731933594, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 0.2534423768520355, "rewards/margins": 3.87198543548584, "rewards/rejected": -3.6185429096221924, "step": 4770 }, { "epoch": 0.7964291673619223, "grad_norm": 28.177623748779297, "learning_rate": 1.203570832638078e-05, "logits/chosen": -0.5493863224983215, "logits/rejected": -0.5649341940879822, "logps/chosen": -116.37947845458984, "logps/rejected": -134.65003967285156, "loss": 0.4819, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1962776184082031, "rewards/margins": 0.25228604674339294, "rewards/rejected": -1.448563575744629, "step": 4773 }, { "epoch": 0.796929751376606, "grad_norm": 10.67374038696289, "learning_rate": 1.2030702486233941e-05, "logits/chosen": -0.5981009006500244, "logits/rejected": -0.6112693548202515, "logps/chosen": -63.33354568481445, "logps/rejected": -142.06541442871094, "loss": 0.4402, "rewards/accuracies": 1.0, "rewards/chosen": 0.40123072266578674, "rewards/margins": 5.440328598022461, "rewards/rejected": -5.039098262786865, "step": 4776 }, { "epoch": 0.7974303353912898, "grad_norm": 43.601898193359375, "learning_rate": 1.2025696646087101e-05, "logits/chosen": -0.5848584175109863, "logits/rejected": -0.5722897052764893, "logps/chosen": -85.30078125, "logps/rejected": -84.30284118652344, "loss": 0.8842, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0966321229934692, "rewards/margins": -0.12034662812948227, "rewards/rejected": -0.9762855172157288, "step": 4779 }, { "epoch": 0.7979309194059736, "grad_norm": 19.170072555541992, "learning_rate": 1.2020690805940265e-05, "logits/chosen": -0.6431689858436584, "logits/rejected": -0.6630979776382446, "logps/chosen": -87.94830322265625, "logps/rejected": -163.7649383544922, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": -0.2675539553165436, "rewards/margins": 3.9631874561309814, "rewards/rejected": -4.230741500854492, "step": 4782 }, { "epoch": 0.7984315034206574, "grad_norm": 25.54289436340332, "learning_rate": 1.2015684965793426e-05, "logits/chosen": -0.4863002300262451, "logits/rejected": -0.5325522422790527, "logps/chosen": -40.046268463134766, "logps/rejected": -114.85932159423828, "loss": 0.4526, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.18769633769989014, "rewards/margins": 1.1864347457885742, "rewards/rejected": -1.374131202697754, "step": 4785 }, { "epoch": 0.7989320874353413, "grad_norm": 25.062576293945312, "learning_rate": 1.201067912564659e-05, "logits/chosen": -0.7998480796813965, "logits/rejected": -0.7678708434104919, "logps/chosen": -124.18038177490234, "logps/rejected": -110.69189453125, "loss": 0.4615, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.855715036392212, "rewards/margins": 0.3433268070220947, "rewards/rejected": -3.1990420818328857, "step": 4788 }, { "epoch": 0.7994326714500251, "grad_norm": 39.036251068115234, "learning_rate": 1.200567328549975e-05, "logits/chosen": -0.7158921360969543, "logits/rejected": -0.7745814919471741, "logps/chosen": -60.84431838989258, "logps/rejected": -131.9319305419922, "loss": 0.5968, "rewards/accuracies": 1.0, "rewards/chosen": -1.1791820526123047, "rewards/margins": 2.033374547958374, "rewards/rejected": -3.2125566005706787, "step": 4791 }, { "epoch": 0.7999332554647088, "grad_norm": 24.91571044921875, "learning_rate": 1.2000667445352913e-05, "logits/chosen": -0.6168060898780823, "logits/rejected": -0.6441259384155273, "logps/chosen": -95.28678131103516, "logps/rejected": -97.7038345336914, "loss": 0.7086, "rewards/accuracies": 1.0, "rewards/chosen": -0.37764617800712585, "rewards/margins": 2.852714776992798, "rewards/rejected": -3.230360984802246, "step": 4794 }, { "epoch": 0.8004338394793926, "grad_norm": 37.407169342041016, "learning_rate": 1.1995661605206075e-05, "logits/chosen": -0.6735632419586182, "logits/rejected": -0.5860263109207153, "logps/chosen": -154.69668579101562, "logps/rejected": -93.6204833984375, "loss": 0.8686, "rewards/accuracies": 1.0, "rewards/chosen": -1.5687309503555298, "rewards/margins": 0.9515002369880676, "rewards/rejected": -2.520231246948242, "step": 4797 }, { "epoch": 0.8009344234940764, "grad_norm": 20.459306716918945, "learning_rate": 1.1990655765059235e-05, "logits/chosen": -0.5231897830963135, "logits/rejected": -0.5468943119049072, "logps/chosen": -59.75860595703125, "logps/rejected": -111.68643188476562, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": -0.7235566973686218, "rewards/margins": 2.8402633666992188, "rewards/rejected": -3.5638198852539062, "step": 4800 }, { "epoch": 0.8009344234940764, "eval_logits/chosen": -0.6602983474731445, "eval_logits/rejected": -0.674521267414093, "eval_logps/chosen": -90.17123413085938, "eval_logps/rejected": -118.419189453125, "eval_loss": 0.5941592454910278, "eval_rewards/accuracies": 0.7387387156486511, "eval_rewards/chosen": -0.9913507699966431, "eval_rewards/margins": 1.537750005722046, "eval_rewards/rejected": -2.5291006565093994, "eval_runtime": 349.5161, "eval_samples_per_second": 7.622, "eval_steps_per_second": 1.905, "step": 4800 }, { "epoch": 0.8014350075087602, "grad_norm": 28.713520050048828, "learning_rate": 1.1985649924912399e-05, "logits/chosen": -0.8829894065856934, "logits/rejected": -0.8747537732124329, "logps/chosen": -120.05146026611328, "logps/rejected": -118.04979705810547, "loss": 0.8061, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.8248438835144043, "rewards/margins": 1.8092098236083984, "rewards/rejected": -0.9843659400939941, "step": 4803 }, { "epoch": 0.8019355915234441, "grad_norm": 8.402697563171387, "learning_rate": 1.198064408476556e-05, "logits/chosen": -0.46208634972572327, "logits/rejected": -0.5492787957191467, "logps/chosen": -35.47987747192383, "logps/rejected": -170.3285675048828, "loss": 0.3502, "rewards/accuracies": 1.0, "rewards/chosen": -0.030903689563274384, "rewards/margins": 3.972517728805542, "rewards/rejected": -4.003421306610107, "step": 4806 }, { "epoch": 0.8024361755381278, "grad_norm": 38.301429748535156, "learning_rate": 1.1975638244618724e-05, "logits/chosen": -0.8594996929168701, "logits/rejected": -0.8626102805137634, "logps/chosen": -108.67559051513672, "logps/rejected": -130.98541259765625, "loss": 0.3103, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.852541983127594, "rewards/margins": 1.2734299898147583, "rewards/rejected": -2.125972032546997, "step": 4809 }, { "epoch": 0.8029367595528116, "grad_norm": 8.382789611816406, "learning_rate": 1.1970632404471884e-05, "logits/chosen": -0.8761261105537415, "logits/rejected": -0.9091317057609558, "logps/chosen": -46.412105560302734, "logps/rejected": -155.7174530029297, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": -0.12306774407625198, "rewards/margins": 4.2215094566345215, "rewards/rejected": -4.344577312469482, "step": 4812 }, { "epoch": 0.8034373435674954, "grad_norm": 42.82002258300781, "learning_rate": 1.1965626564325046e-05, "logits/chosen": -0.757794201374054, "logits/rejected": -0.8058807253837585, "logps/chosen": -44.7805290222168, "logps/rejected": -126.2385482788086, "loss": 0.6382, "rewards/accuracies": 1.0, "rewards/chosen": -0.06754861027002335, "rewards/margins": 3.7778098583221436, "rewards/rejected": -3.8453586101531982, "step": 4815 }, { "epoch": 0.8039379275821792, "grad_norm": 8.35268497467041, "learning_rate": 1.196062072417821e-05, "logits/chosen": -0.7567765712738037, "logits/rejected": -0.7937803268432617, "logps/chosen": -45.597930908203125, "logps/rejected": -138.85986328125, "loss": 0.5033, "rewards/accuracies": 1.0, "rewards/chosen": 0.5699514746665955, "rewards/margins": 6.14788293838501, "rewards/rejected": -5.577931880950928, "step": 4818 }, { "epoch": 0.804438511596863, "grad_norm": 26.935285568237305, "learning_rate": 1.195561488403137e-05, "logits/chosen": -0.6433177590370178, "logits/rejected": -0.6962480545043945, "logps/chosen": -102.60248565673828, "logps/rejected": -134.61106872558594, "loss": 0.984, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.6892175674438477, "rewards/margins": 0.6010153889656067, "rewards/rejected": -2.2902328968048096, "step": 4821 }, { "epoch": 0.8049390956115468, "grad_norm": 34.12287902832031, "learning_rate": 1.1950609043884533e-05, "logits/chosen": -0.7866273522377014, "logits/rejected": -0.7738462090492249, "logps/chosen": -88.1214370727539, "logps/rejected": -115.6743392944336, "loss": 0.7136, "rewards/accuracies": 1.0, "rewards/chosen": -1.0073679685592651, "rewards/margins": 2.3980414867401123, "rewards/rejected": -3.405409574508667, "step": 4824 }, { "epoch": 0.8054396796262306, "grad_norm": 24.232059478759766, "learning_rate": 1.1945603203737695e-05, "logits/chosen": -0.709825336933136, "logits/rejected": -0.733708918094635, "logps/chosen": -69.14689636230469, "logps/rejected": -124.0473861694336, "loss": 0.5724, "rewards/accuracies": 1.0, "rewards/chosen": -0.2939033806324005, "rewards/margins": 4.290928363800049, "rewards/rejected": -4.584831714630127, "step": 4827 }, { "epoch": 0.8059402636409144, "grad_norm": 27.139894485473633, "learning_rate": 1.1940597363590858e-05, "logits/chosen": -0.6090039610862732, "logits/rejected": -0.6564838886260986, "logps/chosen": -40.4262580871582, "logps/rejected": -96.93732452392578, "loss": 0.3612, "rewards/accuracies": 1.0, "rewards/chosen": -0.02447129227221012, "rewards/margins": 1.6942195892333984, "rewards/rejected": -1.7186908721923828, "step": 4830 }, { "epoch": 0.8064408476555982, "grad_norm": 15.799322128295898, "learning_rate": 1.1935591523444018e-05, "logits/chosen": -0.7752747535705566, "logits/rejected": -0.7481756210327148, "logps/chosen": -123.1300277709961, "logps/rejected": -102.14288330078125, "loss": 0.9593, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.77817964553833, "rewards/margins": 0.006653626915067434, "rewards/rejected": -1.784833312034607, "step": 4833 }, { "epoch": 0.806941431670282, "grad_norm": 48.81414031982422, "learning_rate": 1.193058568329718e-05, "logits/chosen": -0.6908285021781921, "logits/rejected": -0.7449924945831299, "logps/chosen": -60.598934173583984, "logps/rejected": -87.38520050048828, "loss": 0.6889, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8894516825675964, "rewards/margins": 1.049249291419983, "rewards/rejected": -1.938700795173645, "step": 4836 }, { "epoch": 0.8074420156849658, "grad_norm": 56.727352142333984, "learning_rate": 1.1925579843150344e-05, "logits/chosen": -0.7134044766426086, "logits/rejected": -0.6906964182853699, "logps/chosen": -76.45771026611328, "logps/rejected": -95.66107940673828, "loss": 0.4658, "rewards/accuracies": 1.0, "rewards/chosen": -0.4072377681732178, "rewards/margins": 3.3942205905914307, "rewards/rejected": -3.8014583587646484, "step": 4839 }, { "epoch": 0.8079425996996495, "grad_norm": 82.91901397705078, "learning_rate": 1.1920574003003504e-05, "logits/chosen": -0.633024275302887, "logits/rejected": -0.6096370220184326, "logps/chosen": -123.69915771484375, "logps/rejected": -149.0301513671875, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": -0.7773692011833191, "rewards/margins": 2.7491729259490967, "rewards/rejected": -3.5265417098999023, "step": 4842 }, { "epoch": 0.8084431837143334, "grad_norm": 16.703794479370117, "learning_rate": 1.1915568162856667e-05, "logits/chosen": -0.7184147238731384, "logits/rejected": -0.7335394024848938, "logps/chosen": -88.53592681884766, "logps/rejected": -94.23967742919922, "loss": 0.6198, "rewards/accuracies": 0.0, "rewards/chosen": -0.7067891955375671, "rewards/margins": -0.28470122814178467, "rewards/rejected": -0.4220879077911377, "step": 4845 }, { "epoch": 0.8089437677290172, "grad_norm": 35.11865997314453, "learning_rate": 1.1910562322709829e-05, "logits/chosen": -0.6285073757171631, "logits/rejected": -0.6722933650016785, "logps/chosen": -62.7763786315918, "logps/rejected": -113.11410522460938, "loss": 0.6125, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5563310384750366, "rewards/margins": 0.4864487648010254, "rewards/rejected": -2.0427796840667725, "step": 4848 }, { "epoch": 0.809444351743701, "grad_norm": 29.875839233398438, "learning_rate": 1.1905556482562993e-05, "logits/chosen": -0.6811257004737854, "logits/rejected": -0.6956071257591248, "logps/chosen": -71.07868194580078, "logps/rejected": -133.9246368408203, "loss": 0.6502, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5560209155082703, "rewards/margins": 2.773235321044922, "rewards/rejected": -3.329256296157837, "step": 4851 }, { "epoch": 0.8099449357583848, "grad_norm": 11.433632850646973, "learning_rate": 1.1900550642416153e-05, "logits/chosen": -0.5475452542304993, "logits/rejected": -0.5764397978782654, "logps/chosen": -103.9015121459961, "logps/rejected": -131.53286743164062, "loss": 0.5001, "rewards/accuracies": 1.0, "rewards/chosen": -1.0573430061340332, "rewards/margins": 2.039180040359497, "rewards/rejected": -3.096522569656372, "step": 4854 }, { "epoch": 0.8104455197730686, "grad_norm": 8.613593101501465, "learning_rate": 1.1895544802269315e-05, "logits/chosen": -0.71490877866745, "logits/rejected": -0.7556203007698059, "logps/chosen": -45.55696105957031, "logps/rejected": -146.31520080566406, "loss": 0.5697, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 1.2739921808242798, "rewards/margins": 3.8700456619262695, "rewards/rejected": -2.5960533618927, "step": 4857 }, { "epoch": 0.8109461037877523, "grad_norm": 25.967588424682617, "learning_rate": 1.1890538962122478e-05, "logits/chosen": -0.4246024191379547, "logits/rejected": -0.45561036467552185, "logps/chosen": -109.39579010009766, "logps/rejected": -134.3561248779297, "loss": 0.5162, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1585569381713867, "rewards/margins": 0.5261847972869873, "rewards/rejected": -1.6847416162490845, "step": 4860 }, { "epoch": 0.8114466878024362, "grad_norm": 50.32585144042969, "learning_rate": 1.1885533121975638e-05, "logits/chosen": -0.7572731971740723, "logits/rejected": -0.7524073123931885, "logps/chosen": -109.4112777709961, "logps/rejected": -122.31153106689453, "loss": 0.7217, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.13956451416015625, "rewards/margins": 1.3249772787094116, "rewards/rejected": -1.4645417928695679, "step": 4863 }, { "epoch": 0.81194727181712, "grad_norm": 35.20433044433594, "learning_rate": 1.1880527281828802e-05, "logits/chosen": -0.678914487361908, "logits/rejected": -0.634049117565155, "logps/chosen": -121.17609405517578, "logps/rejected": -133.5517578125, "loss": 0.5404, "rewards/accuracies": 1.0, "rewards/chosen": -0.9616496562957764, "rewards/margins": 2.4986982345581055, "rewards/rejected": -3.460347890853882, "step": 4866 }, { "epoch": 0.8124478558318038, "grad_norm": 17.66283416748047, "learning_rate": 1.1875521441681963e-05, "logits/chosen": -0.7167152762413025, "logits/rejected": -0.7255390286445618, "logps/chosen": -75.99102783203125, "logps/rejected": -100.40447998046875, "loss": 0.4115, "rewards/accuracies": 1.0, "rewards/chosen": -0.9507917761802673, "rewards/margins": 2.2118170261383057, "rewards/rejected": -3.1626088619232178, "step": 4869 }, { "epoch": 0.8129484398464876, "grad_norm": 7.969927787780762, "learning_rate": 1.1870515601535127e-05, "logits/chosen": -0.6358327269554138, "logits/rejected": -0.6475924849510193, "logps/chosen": -96.30968475341797, "logps/rejected": -81.836669921875, "loss": 0.718, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.7531414031982422, "rewards/margins": -0.5948916077613831, "rewards/rejected": -1.1582497358322144, "step": 4872 }, { "epoch": 0.8134490238611713, "grad_norm": 36.58796310424805, "learning_rate": 1.1865509761388287e-05, "logits/chosen": -0.6865599751472473, "logits/rejected": -0.6667415499687195, "logps/chosen": -85.66155242919922, "logps/rejected": -86.58038330078125, "loss": 0.7399, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.30209219455718994, "rewards/margins": -0.4641905128955841, "rewards/rejected": 0.16209833323955536, "step": 4875 }, { "epoch": 0.8139496078758551, "grad_norm": 27.402774810791016, "learning_rate": 1.1860503921241449e-05, "logits/chosen": -0.4746299684047699, "logits/rejected": -0.5071284770965576, "logps/chosen": -89.7607192993164, "logps/rejected": -135.4557647705078, "loss": 0.6149, "rewards/accuracies": 1.0, "rewards/chosen": -1.3909344673156738, "rewards/margins": 1.7035188674926758, "rewards/rejected": -3.0944530963897705, "step": 4878 }, { "epoch": 0.8144501918905389, "grad_norm": 8.27346420288086, "learning_rate": 1.1855498081094612e-05, "logits/chosen": -0.6845474243164062, "logits/rejected": -0.6564413905143738, "logps/chosen": -114.4771499633789, "logps/rejected": -77.12931823730469, "loss": 0.2017, "rewards/accuracies": 1.0, "rewards/chosen": -0.050589483231306076, "rewards/margins": 1.5306429862976074, "rewards/rejected": -1.5812325477600098, "step": 4881 }, { "epoch": 0.8149507759052228, "grad_norm": 7.4539361000061035, "learning_rate": 1.1850492240947772e-05, "logits/chosen": -0.5997408628463745, "logits/rejected": -0.594006359577179, "logps/chosen": -83.02954864501953, "logps/rejected": -84.45572662353516, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": -0.7713682651519775, "rewards/margins": 1.3047531843185425, "rewards/rejected": -2.0761215686798096, "step": 4884 }, { "epoch": 0.8154513599199066, "grad_norm": 30.803876876831055, "learning_rate": 1.1845486400800936e-05, "logits/chosen": -0.8555935025215149, "logits/rejected": -0.873012125492096, "logps/chosen": -98.63936614990234, "logps/rejected": -140.56446838378906, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": -1.0584633350372314, "rewards/margins": 4.5794830322265625, "rewards/rejected": -5.637946605682373, "step": 4887 }, { "epoch": 0.8159519439345904, "grad_norm": 50.1743278503418, "learning_rate": 1.1840480560654098e-05, "logits/chosen": -0.6808226108551025, "logits/rejected": -0.734386146068573, "logps/chosen": -62.15995407104492, "logps/rejected": -173.1349334716797, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.0922519788146019, "rewards/margins": 2.672675132751465, "rewards/rejected": -2.580423355102539, "step": 4890 }, { "epoch": 0.8164525279492741, "grad_norm": 15.516919136047363, "learning_rate": 1.1835474720507258e-05, "logits/chosen": -0.5471050143241882, "logits/rejected": -0.6365674138069153, "logps/chosen": -82.10883331298828, "logps/rejected": -165.45741271972656, "loss": 0.159, "rewards/accuracies": 1.0, "rewards/chosen": -1.150957703590393, "rewards/margins": 4.421905040740967, "rewards/rejected": -5.5728631019592285, "step": 4893 }, { "epoch": 0.8169531119639579, "grad_norm": 20.90843963623047, "learning_rate": 1.1830468880360421e-05, "logits/chosen": -0.7541643977165222, "logits/rejected": -0.7996576428413391, "logps/chosen": -69.75609588623047, "logps/rejected": -116.2032241821289, "loss": 0.595, "rewards/accuracies": 1.0, "rewards/chosen": -0.7994391322135925, "rewards/margins": 1.4366483688354492, "rewards/rejected": -2.2360875606536865, "step": 4896 }, { "epoch": 0.8174536959786417, "grad_norm": 26.35274314880371, "learning_rate": 1.1825463040213583e-05, "logits/chosen": -0.5969111919403076, "logits/rejected": -0.5419101119041443, "logps/chosen": -129.5122528076172, "logps/rejected": -103.05452728271484, "loss": 0.4879, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8412187099456787, "rewards/margins": 1.8346253633499146, "rewards/rejected": -2.675844192504883, "step": 4899 }, { "epoch": 0.8179542799933256, "grad_norm": 12.547176361083984, "learning_rate": 1.1820457200066747e-05, "logits/chosen": -0.6920149922370911, "logits/rejected": -0.7692084312438965, "logps/chosen": -77.43961334228516, "logps/rejected": -133.4507293701172, "loss": 0.5389, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3275264501571655, "rewards/margins": 1.6375287771224976, "rewards/rejected": -2.965055227279663, "step": 4902 }, { "epoch": 0.8184548640080094, "grad_norm": 35.27403259277344, "learning_rate": 1.1815451359919907e-05, "logits/chosen": -0.8176629543304443, "logits/rejected": -0.7723124623298645, "logps/chosen": -112.15536499023438, "logps/rejected": -107.55304718017578, "loss": 0.7051, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3859037160873413, "rewards/margins": -0.14084625244140625, "rewards/rejected": -1.245057463645935, "step": 4905 }, { "epoch": 0.8189554480226932, "grad_norm": 34.65458679199219, "learning_rate": 1.181044551977307e-05, "logits/chosen": -0.54291170835495, "logits/rejected": -0.6030036807060242, "logps/chosen": -122.5442123413086, "logps/rejected": -181.58087158203125, "loss": 0.565, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5168496966362, "rewards/margins": 1.6470283269882202, "rewards/rejected": -2.1638777256011963, "step": 4908 }, { "epoch": 0.8194560320373769, "grad_norm": 15.591156005859375, "learning_rate": 1.1805439679626232e-05, "logits/chosen": -0.7005580067634583, "logits/rejected": -0.7204185128211975, "logps/chosen": -92.84719848632812, "logps/rejected": -136.8545684814453, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": -2.8604371547698975, "rewards/margins": 2.8364105224609375, "rewards/rejected": -5.696847438812256, "step": 4911 }, { "epoch": 0.8199566160520607, "grad_norm": 30.652099609375, "learning_rate": 1.1800433839479392e-05, "logits/chosen": -0.49039188027381897, "logits/rejected": -0.617062509059906, "logps/chosen": -52.334930419921875, "logps/rejected": -138.9425048828125, "loss": 0.5254, "rewards/accuracies": 1.0, "rewards/chosen": -0.5114602446556091, "rewards/margins": 2.8629825115203857, "rewards/rejected": -3.3744430541992188, "step": 4914 }, { "epoch": 0.8204572000667445, "grad_norm": 9.731557846069336, "learning_rate": 1.1795427999332556e-05, "logits/chosen": -0.784778356552124, "logits/rejected": -0.8068118095397949, "logps/chosen": -64.55777740478516, "logps/rejected": -79.07263946533203, "loss": 0.4128, "rewards/accuracies": 1.0, "rewards/chosen": -0.8319229483604431, "rewards/margins": 0.43640899658203125, "rewards/rejected": -1.2683318853378296, "step": 4917 }, { "epoch": 0.8209577840814284, "grad_norm": 17.48468780517578, "learning_rate": 1.1790422159185717e-05, "logits/chosen": -0.7453673481941223, "logits/rejected": -0.8050352931022644, "logps/chosen": -108.0350341796875, "logps/rejected": -174.55625915527344, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": -0.75460284948349, "rewards/margins": 3.1743619441986084, "rewards/rejected": -3.928964614868164, "step": 4920 }, { "epoch": 0.8214583680961122, "grad_norm": 28.339073181152344, "learning_rate": 1.178541631903888e-05, "logits/chosen": -0.7895223498344421, "logits/rejected": -0.7845120429992676, "logps/chosen": -68.68599700927734, "logps/rejected": -90.2817153930664, "loss": 0.6279, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6153075695037842, "rewards/margins": 2.1822588443756104, "rewards/rejected": -2.7975664138793945, "step": 4923 }, { "epoch": 0.821958952110796, "grad_norm": 12.046250343322754, "learning_rate": 1.1780410478892041e-05, "logits/chosen": -0.5782859325408936, "logits/rejected": -0.6534342765808105, "logps/chosen": -77.53882598876953, "logps/rejected": -82.15341186523438, "loss": 0.3729, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.6528979539871216, "rewards/margins": 0.4044300615787506, "rewards/rejected": -1.0573278665542603, "step": 4926 }, { "epoch": 0.8224595361254797, "grad_norm": 8.42126750946045, "learning_rate": 1.1775404638745204e-05, "logits/chosen": -0.6865412592887878, "logits/rejected": -0.6914072632789612, "logps/chosen": -97.42362213134766, "logps/rejected": -125.75411224365234, "loss": 0.3301, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.416253924369812, "rewards/margins": 1.6315059661865234, "rewards/rejected": -3.047760009765625, "step": 4929 }, { "epoch": 0.8229601201401635, "grad_norm": 28.493961334228516, "learning_rate": 1.1770398798598366e-05, "logits/chosen": -0.7924017906188965, "logits/rejected": -0.781475841999054, "logps/chosen": -110.4603500366211, "logps/rejected": -120.82659149169922, "loss": 0.6593, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.837956428527832, "rewards/margins": -1.020912766456604, "rewards/rejected": -1.8170439004898071, "step": 4932 }, { "epoch": 0.8234607041548473, "grad_norm": 26.887939453125, "learning_rate": 1.1765392958451526e-05, "logits/chosen": -0.9865245819091797, "logits/rejected": -0.9969286918640137, "logps/chosen": -107.43558502197266, "logps/rejected": -130.60679626464844, "loss": 0.2724, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0125346183776855, "rewards/margins": 1.467844843864441, "rewards/rejected": -2.480379581451416, "step": 4935 }, { "epoch": 0.8239612881695311, "grad_norm": 24.73293685913086, "learning_rate": 1.176038711830469e-05, "logits/chosen": -0.8361323475837708, "logits/rejected": -0.8855196833610535, "logps/chosen": -82.96228790283203, "logps/rejected": -109.9677963256836, "loss": 0.845, "rewards/accuracies": 1.0, "rewards/chosen": -1.9278501272201538, "rewards/margins": 1.5905532836914062, "rewards/rejected": -3.5184032917022705, "step": 4938 }, { "epoch": 0.824461872184215, "grad_norm": 61.82833480834961, "learning_rate": 1.1755381278157852e-05, "logits/chosen": -0.7599790692329407, "logits/rejected": -0.7298063635826111, "logps/chosen": -119.99512481689453, "logps/rejected": -93.5621337890625, "loss": 1.0588, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -4.833366870880127, "rewards/margins": -1.9901738166809082, "rewards/rejected": -2.8431930541992188, "step": 4941 }, { "epoch": 0.8249624561988987, "grad_norm": 75.59861755371094, "learning_rate": 1.1750375438011015e-05, "logits/chosen": -0.5262045860290527, "logits/rejected": -0.5111555457115173, "logps/chosen": -90.98889923095703, "logps/rejected": -58.41420364379883, "loss": 1.5657, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.7631300687789917, "rewards/margins": -1.3279472589492798, "rewards/rejected": -0.4351828098297119, "step": 4944 }, { "epoch": 0.8254630402135825, "grad_norm": 17.647907257080078, "learning_rate": 1.1745369597864175e-05, "logits/chosen": -0.7266044616699219, "logits/rejected": -0.7178294062614441, "logps/chosen": -86.49620819091797, "logps/rejected": -98.6144790649414, "loss": 0.9805, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.00244140625, "rewards/margins": -0.7903611063957214, "rewards/rejected": -2.212080240249634, "step": 4947 }, { "epoch": 0.8259636242282663, "grad_norm": 37.098506927490234, "learning_rate": 1.1740363757717337e-05, "logits/chosen": -0.6176678538322449, "logits/rejected": -0.5594131350517273, "logps/chosen": -131.13055419921875, "logps/rejected": -48.7182731628418, "loss": 0.3811, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.7292329668998718, "rewards/margins": 0.27180346846580505, "rewards/rejected": -1.001036286354065, "step": 4950 }, { "epoch": 0.8264642082429501, "grad_norm": 51.0628776550293, "learning_rate": 1.17353579175705e-05, "logits/chosen": -0.6309524178504944, "logits/rejected": -0.6991894841194153, "logps/chosen": -60.15093994140625, "logps/rejected": -101.76546478271484, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": -0.2621888220310211, "rewards/margins": 0.94743412733078, "rewards/rejected": -1.2096229791641235, "step": 4953 }, { "epoch": 0.8269647922576339, "grad_norm": 28.038740158081055, "learning_rate": 1.173035207742366e-05, "logits/chosen": -0.5073338150978088, "logits/rejected": -0.5084433555603027, "logps/chosen": -99.81253051757812, "logps/rejected": -148.5070037841797, "loss": 0.6395, "rewards/accuracies": 1.0, "rewards/chosen": -0.6384422779083252, "rewards/margins": 2.5257749557495117, "rewards/rejected": -3.164216995239258, "step": 4956 }, { "epoch": 0.8274653762723178, "grad_norm": 11.257852554321289, "learning_rate": 1.1725346237276824e-05, "logits/chosen": -0.6131691336631775, "logits/rejected": -0.6258828043937683, "logps/chosen": -127.016845703125, "logps/rejected": -204.83966064453125, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": -1.366683840751648, "rewards/margins": 1.252743124961853, "rewards/rejected": -2.619426727294922, "step": 4959 }, { "epoch": 0.8279659602870015, "grad_norm": 21.765445709228516, "learning_rate": 1.1720340397129986e-05, "logits/chosen": -0.5156077742576599, "logits/rejected": -0.6624132394790649, "logps/chosen": -54.52317428588867, "logps/rejected": -168.4021453857422, "loss": 0.8359, "rewards/accuracies": 1.0, "rewards/chosen": -0.12774302065372467, "rewards/margins": 3.81146240234375, "rewards/rejected": -3.9392054080963135, "step": 4962 }, { "epoch": 0.8284665443016853, "grad_norm": 12.547958374023438, "learning_rate": 1.171533455698315e-05, "logits/chosen": -0.7951271533966064, "logits/rejected": -0.8022934794425964, "logps/chosen": -76.53301239013672, "logps/rejected": -109.52249908447266, "loss": 0.4424, "rewards/accuracies": 1.0, "rewards/chosen": -0.6143582463264465, "rewards/margins": 1.70526123046875, "rewards/rejected": -2.3196194171905518, "step": 4965 }, { "epoch": 0.8289671283163691, "grad_norm": 9.770676612854004, "learning_rate": 1.171032871683631e-05, "logits/chosen": -0.6628302931785583, "logits/rejected": -0.7637226581573486, "logps/chosen": -80.01020050048828, "logps/rejected": -119.142333984375, "loss": 0.154, "rewards/accuracies": 1.0, "rewards/chosen": -0.6919257044792175, "rewards/margins": 3.6223089694976807, "rewards/rejected": -4.314234733581543, "step": 4968 }, { "epoch": 0.8294677123310529, "grad_norm": 36.643489837646484, "learning_rate": 1.1705322876689471e-05, "logits/chosen": -0.6304523944854736, "logits/rejected": -0.6574656367301941, "logps/chosen": -77.1795654296875, "logps/rejected": -93.1825180053711, "loss": 0.6679, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8196923732757568, "rewards/margins": 0.6425350308418274, "rewards/rejected": -1.4622273445129395, "step": 4971 }, { "epoch": 0.8299682963457367, "grad_norm": 48.50871658325195, "learning_rate": 1.1700317036542635e-05, "logits/chosen": -0.8295121192932129, "logits/rejected": -0.8281204104423523, "logps/chosen": -95.9006576538086, "logps/rejected": -126.4795913696289, "loss": 0.7706, "rewards/accuracies": 1.0, "rewards/chosen": 0.04432028532028198, "rewards/margins": 3.388981819152832, "rewards/rejected": -3.3446619510650635, "step": 4974 }, { "epoch": 0.8304688803604204, "grad_norm": 28.01030731201172, "learning_rate": 1.1695311196395795e-05, "logits/chosen": -0.6465082764625549, "logits/rejected": -0.649097204208374, "logps/chosen": -103.12979125976562, "logps/rejected": -147.71461486816406, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": -2.582157850265503, "rewards/margins": 0.5738978981971741, "rewards/rejected": -3.1560556888580322, "step": 4977 }, { "epoch": 0.8309694643751043, "grad_norm": 23.7664794921875, "learning_rate": 1.1690305356248958e-05, "logits/chosen": -0.5968215465545654, "logits/rejected": -0.5807682871818542, "logps/chosen": -90.62276458740234, "logps/rejected": -104.03775787353516, "loss": 0.5666, "rewards/accuracies": 1.0, "rewards/chosen": -0.4960825741291046, "rewards/margins": 3.1023731231689453, "rewards/rejected": -3.5984554290771484, "step": 4980 }, { "epoch": 0.8314700483897881, "grad_norm": 16.69741439819336, "learning_rate": 1.168529951610212e-05, "logits/chosen": -0.8234527111053467, "logits/rejected": -0.7949042320251465, "logps/chosen": -142.99362182617188, "logps/rejected": -96.86920166015625, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": -1.0370128154754639, "rewards/margins": 2.3865368366241455, "rewards/rejected": -3.4235496520996094, "step": 4983 }, { "epoch": 0.8319706324044719, "grad_norm": 47.68551254272461, "learning_rate": 1.1680293675955284e-05, "logits/chosen": -0.5172894597053528, "logits/rejected": -0.5333290696144104, "logps/chosen": -82.92975616455078, "logps/rejected": -92.09627532958984, "loss": 1.0743, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.22303760051727295, "rewards/margins": 1.6088687181472778, "rewards/rejected": -1.8319064378738403, "step": 4986 }, { "epoch": 0.8324712164191557, "grad_norm": 38.57456588745117, "learning_rate": 1.1675287835808444e-05, "logits/chosen": -0.6681806445121765, "logits/rejected": -0.6032946705818176, "logps/chosen": -72.2878646850586, "logps/rejected": -72.6908950805664, "loss": 0.5805, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8909347057342529, "rewards/margins": 1.0199536085128784, "rewards/rejected": -1.9108883142471313, "step": 4989 }, { "epoch": 0.8329718004338394, "grad_norm": 66.66463470458984, "learning_rate": 1.1670281995661605e-05, "logits/chosen": -0.6161339282989502, "logits/rejected": -0.6276154518127441, "logps/chosen": -105.81024169921875, "logps/rejected": -110.23856353759766, "loss": 1.1848, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8072320818901062, "rewards/margins": 1.497271180152893, "rewards/rejected": -2.3045032024383545, "step": 4992 }, { "epoch": 0.8334723844485232, "grad_norm": 31.1960506439209, "learning_rate": 1.1665276155514769e-05, "logits/chosen": -0.5572054982185364, "logits/rejected": -0.5932329893112183, "logps/chosen": -55.18545913696289, "logps/rejected": -132.6875762939453, "loss": 0.6429, "rewards/accuracies": 1.0, "rewards/chosen": -0.5739322304725647, "rewards/margins": 2.4918935298919678, "rewards/rejected": -3.0658254623413086, "step": 4995 }, { "epoch": 0.8339729684632071, "grad_norm": 18.32146644592285, "learning_rate": 1.1660270315367929e-05, "logits/chosen": -0.5883703827857971, "logits/rejected": -0.6230637431144714, "logps/chosen": -62.79490661621094, "logps/rejected": -123.94674682617188, "loss": 0.7556, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8669664859771729, "rewards/margins": 0.3499056398868561, "rewards/rejected": -1.2168720960617065, "step": 4998 }, { "epoch": 0.8344735524778909, "grad_norm": 45.573524475097656, "learning_rate": 1.1655264475221093e-05, "logits/chosen": -0.9313662648200989, "logits/rejected": -0.9255599975585938, "logps/chosen": -53.864933013916016, "logps/rejected": -90.23272705078125, "loss": 0.7607, "rewards/accuracies": 1.0, "rewards/chosen": -0.02309572696685791, "rewards/margins": 3.329523801803589, "rewards/rejected": -3.352619171142578, "step": 5001 }, { "epoch": 0.8349741364925747, "grad_norm": 21.93052864074707, "learning_rate": 1.1650258635074254e-05, "logits/chosen": -0.7002925276756287, "logits/rejected": -0.6479194760322571, "logps/chosen": -94.44525909423828, "logps/rejected": -39.36946487426758, "loss": 0.9001, "rewards/accuracies": 0.0, "rewards/chosen": -2.422678232192993, "rewards/margins": -1.6303905248641968, "rewards/rejected": -0.7922875285148621, "step": 5004 }, { "epoch": 0.8354747205072585, "grad_norm": 25.181909561157227, "learning_rate": 1.1645252794927414e-05, "logits/chosen": -0.5534301400184631, "logits/rejected": -0.714817225933075, "logps/chosen": -87.84636688232422, "logps/rejected": -128.5677032470703, "loss": 0.7306, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.7382961511611938, "rewards/margins": 0.9045858383178711, "rewards/rejected": -2.6428821086883545, "step": 5007 }, { "epoch": 0.8359753045219422, "grad_norm": 24.553768157958984, "learning_rate": 1.1640246954780578e-05, "logits/chosen": -0.4371105432510376, "logits/rejected": -0.46381711959838867, "logps/chosen": -83.6064224243164, "logps/rejected": -202.0635528564453, "loss": 0.5413, "rewards/accuracies": 1.0, "rewards/chosen": -1.481847882270813, "rewards/margins": 3.114593744277954, "rewards/rejected": -4.596441745758057, "step": 5010 }, { "epoch": 0.836475888536626, "grad_norm": 29.272565841674805, "learning_rate": 1.163524111463374e-05, "logits/chosen": -0.477062851190567, "logits/rejected": -0.5282806158065796, "logps/chosen": -93.06636810302734, "logps/rejected": -127.7776107788086, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 0.3938748836517334, "rewards/margins": 3.0637474060058594, "rewards/rejected": -2.669872999191284, "step": 5013 }, { "epoch": 0.8369764725513099, "grad_norm": 31.520315170288086, "learning_rate": 1.1630235274486903e-05, "logits/chosen": -0.6853213310241699, "logits/rejected": -0.6799471974372864, "logps/chosen": -98.90735626220703, "logps/rejected": -135.91262817382812, "loss": 0.6345, "rewards/accuracies": 1.0, "rewards/chosen": -0.03448257967829704, "rewards/margins": 2.3541107177734375, "rewards/rejected": -2.3885934352874756, "step": 5016 }, { "epoch": 0.8374770565659937, "grad_norm": 16.224241256713867, "learning_rate": 1.1625229434340063e-05, "logits/chosen": -0.7074047923088074, "logits/rejected": -0.7381412982940674, "logps/chosen": -111.13701629638672, "logps/rejected": -174.49586486816406, "loss": 0.4726, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.32741573452949524, "rewards/margins": 1.585319995880127, "rewards/rejected": -1.9127358198165894, "step": 5019 }, { "epoch": 0.8379776405806775, "grad_norm": 10.42617416381836, "learning_rate": 1.1620223594193227e-05, "logits/chosen": -0.5020742416381836, "logits/rejected": -0.5633450746536255, "logps/chosen": -70.18946838378906, "logps/rejected": -176.7946014404297, "loss": 0.2249, "rewards/accuracies": 1.0, "rewards/chosen": -0.4124637842178345, "rewards/margins": 4.486691951751709, "rewards/rejected": -4.899156093597412, "step": 5022 }, { "epoch": 0.8384782245953613, "grad_norm": 33.22736740112305, "learning_rate": 1.1615217754046389e-05, "logits/chosen": -0.6648035645484924, "logits/rejected": -0.6566687226295471, "logps/chosen": -132.0926055908203, "logps/rejected": -136.5511016845703, "loss": 0.7705, "rewards/accuracies": 1.0, "rewards/chosen": -0.35811591148376465, "rewards/margins": 2.626861810684204, "rewards/rejected": -2.9849777221679688, "step": 5025 }, { "epoch": 0.838978808610045, "grad_norm": 45.15287780761719, "learning_rate": 1.1610211913899549e-05, "logits/chosen": -0.5664445757865906, "logits/rejected": -0.634433388710022, "logps/chosen": -53.3731803894043, "logps/rejected": -91.12716674804688, "loss": 0.6283, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.42029500007629395, "rewards/margins": 2.9694297313690186, "rewards/rejected": -2.5491347312927246, "step": 5028 }, { "epoch": 0.8394793926247288, "grad_norm": 12.857115745544434, "learning_rate": 1.1605206073752712e-05, "logits/chosen": -0.6740157604217529, "logits/rejected": -0.5885576605796814, "logps/chosen": -182.80157470703125, "logps/rejected": -110.81342315673828, "loss": 0.4426, "rewards/accuracies": 1.0, "rewards/chosen": -0.1645914763212204, "rewards/margins": 2.530736207962036, "rewards/rejected": -2.6953275203704834, "step": 5031 }, { "epoch": 0.8399799766394126, "grad_norm": 37.54893112182617, "learning_rate": 1.1600200233605874e-05, "logits/chosen": -0.6130249500274658, "logits/rejected": -0.5945204496383667, "logps/chosen": -81.26940155029297, "logps/rejected": -118.20829010009766, "loss": 0.5216, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1485360860824585, "rewards/margins": 2.532064199447632, "rewards/rejected": -3.68060040473938, "step": 5034 }, { "epoch": 0.8404805606540965, "grad_norm": 11.368176460266113, "learning_rate": 1.1595194393459037e-05, "logits/chosen": -0.6160310506820679, "logits/rejected": -0.5247976779937744, "logps/chosen": -116.88101959228516, "logps/rejected": -107.32286834716797, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": -0.6946032643318176, "rewards/margins": 2.171674966812134, "rewards/rejected": -2.8662784099578857, "step": 5037 }, { "epoch": 0.8409811446687803, "grad_norm": 42.2730827331543, "learning_rate": 1.1590188553312198e-05, "logits/chosen": -0.6295682787895203, "logits/rejected": -0.6228063702583313, "logps/chosen": -67.40619659423828, "logps/rejected": -70.22201538085938, "loss": 0.6934, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6039130091667175, "rewards/margins": 1.1228609085083008, "rewards/rejected": -1.7267738580703735, "step": 5040 }, { "epoch": 0.841481728683464, "grad_norm": 15.675458908081055, "learning_rate": 1.1585182713165361e-05, "logits/chosen": -0.776655912399292, "logits/rejected": -0.7895822525024414, "logps/chosen": -88.7840576171875, "logps/rejected": -123.0484619140625, "loss": 0.2108, "rewards/accuracies": 1.0, "rewards/chosen": 0.2550518214702606, "rewards/margins": 2.2470896244049072, "rewards/rejected": -1.9920377731323242, "step": 5043 }, { "epoch": 0.8419823126981478, "grad_norm": 19.359588623046875, "learning_rate": 1.1580176873018523e-05, "logits/chosen": -0.7349691390991211, "logits/rejected": -0.767578661441803, "logps/chosen": -93.56331634521484, "logps/rejected": -116.96463775634766, "loss": 0.6652, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.044562816619873, "rewards/margins": 0.9453503489494324, "rewards/rejected": -1.9899133443832397, "step": 5046 }, { "epoch": 0.8424828967128316, "grad_norm": 16.853252410888672, "learning_rate": 1.1575171032871683e-05, "logits/chosen": -0.6311860084533691, "logits/rejected": -0.6515701413154602, "logps/chosen": -104.11666107177734, "logps/rejected": -153.18312072753906, "loss": 0.5874, "rewards/accuracies": 1.0, "rewards/chosen": 0.20245535671710968, "rewards/margins": 2.703801393508911, "rewards/rejected": -2.5013461112976074, "step": 5049 }, { "epoch": 0.8429834807275154, "grad_norm": 14.13357162475586, "learning_rate": 1.1570165192724846e-05, "logits/chosen": -0.6260302662849426, "logits/rejected": -0.5543820858001709, "logps/chosen": -103.35958099365234, "logps/rejected": -61.907379150390625, "loss": 0.2677, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.10173644870519638, "rewards/margins": 1.6809669733047485, "rewards/rejected": -1.7827032804489136, "step": 5052 }, { "epoch": 0.8434840647421993, "grad_norm": 51.91957473754883, "learning_rate": 1.1565159352578008e-05, "logits/chosen": -0.5790529847145081, "logits/rejected": -0.6048365831375122, "logps/chosen": -88.79959869384766, "logps/rejected": -122.0962905883789, "loss": 0.6875, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3738608658313751, "rewards/margins": 1.4964970350265503, "rewards/rejected": -1.8703579902648926, "step": 5055 }, { "epoch": 0.8439846487568831, "grad_norm": 7.458947658538818, "learning_rate": 1.1560153512431172e-05, "logits/chosen": -0.5559045672416687, "logits/rejected": -0.5331619381904602, "logps/chosen": -92.70513916015625, "logps/rejected": -106.56362915039062, "loss": 0.4143, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2057255059480667, "rewards/margins": 4.177408218383789, "rewards/rejected": -3.9716827869415283, "step": 5058 }, { "epoch": 0.8444852327715668, "grad_norm": 20.35560417175293, "learning_rate": 1.1555147672284332e-05, "logits/chosen": -0.7175582051277161, "logits/rejected": -0.7382633090019226, "logps/chosen": -63.43699645996094, "logps/rejected": -119.2605209350586, "loss": 0.3624, "rewards/accuracies": 1.0, "rewards/chosen": -0.5035426020622253, "rewards/margins": 2.2516136169433594, "rewards/rejected": -2.7551562786102295, "step": 5061 }, { "epoch": 0.8449858167862506, "grad_norm": 44.60270690917969, "learning_rate": 1.1550141832137494e-05, "logits/chosen": -0.6923839449882507, "logits/rejected": -0.7196595072746277, "logps/chosen": -101.3729248046875, "logps/rejected": -107.64261627197266, "loss": 0.9729, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2824279069900513, "rewards/margins": -0.13614852726459503, "rewards/rejected": -1.1462794542312622, "step": 5064 }, { "epoch": 0.8454864008009344, "grad_norm": 27.119327545166016, "learning_rate": 1.1545135991990657e-05, "logits/chosen": -0.5157762169837952, "logits/rejected": -0.5199601054191589, "logps/chosen": -63.886138916015625, "logps/rejected": -65.57999420166016, "loss": 0.6661, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.482429027557373, "rewards/margins": 0.03924059867858887, "rewards/rejected": -1.5216697454452515, "step": 5067 }, { "epoch": 0.8459869848156182, "grad_norm": 13.047514915466309, "learning_rate": 1.1540130151843817e-05, "logits/chosen": -0.6322150826454163, "logits/rejected": -0.6428828835487366, "logps/chosen": -84.93914031982422, "logps/rejected": -105.04495239257812, "loss": 0.4624, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9714462161064148, "rewards/margins": 1.4364486932754517, "rewards/rejected": -2.407895088195801, "step": 5070 }, { "epoch": 0.8464875688303021, "grad_norm": 31.73171615600586, "learning_rate": 1.153512431169698e-05, "logits/chosen": -0.6378186345100403, "logits/rejected": -0.651893138885498, "logps/chosen": -117.33934783935547, "logps/rejected": -125.2878189086914, "loss": 0.3998, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.3395986557006836, "rewards/margins": 0.18971745669841766, "rewards/rejected": -1.5293160676956177, "step": 5073 }, { "epoch": 0.8469881528449859, "grad_norm": 11.02761173248291, "learning_rate": 1.1530118471550143e-05, "logits/chosen": -0.7435739636421204, "logits/rejected": -0.7678763270378113, "logps/chosen": -82.6122817993164, "logps/rejected": -103.94466400146484, "loss": 0.3908, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3395693600177765, "rewards/margins": 2.9089434146881104, "rewards/rejected": -3.2485129833221436, "step": 5076 }, { "epoch": 0.8474887368596696, "grad_norm": 30.482990264892578, "learning_rate": 1.1525112631403306e-05, "logits/chosen": -0.7308120727539062, "logits/rejected": -0.6816281676292419, "logps/chosen": -91.76361083984375, "logps/rejected": -91.44082641601562, "loss": 0.6649, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.2612968981266022, "rewards/margins": 1.5075654983520508, "rewards/rejected": -1.7688623666763306, "step": 5079 }, { "epoch": 0.8479893208743534, "grad_norm": 16.347196578979492, "learning_rate": 1.1520106791256466e-05, "logits/chosen": -0.6559646129608154, "logits/rejected": -0.6495433449745178, "logps/chosen": -83.31485748291016, "logps/rejected": -88.5514144897461, "loss": 0.8059, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.05280814692378044, "rewards/margins": 1.2133325338363647, "rewards/rejected": -1.2661405801773071, "step": 5082 }, { "epoch": 0.8484899048890372, "grad_norm": 7.613106727600098, "learning_rate": 1.1515100951109628e-05, "logits/chosen": -0.7019670009613037, "logits/rejected": -0.7334936261177063, "logps/chosen": -71.13557434082031, "logps/rejected": -114.2586441040039, "loss": 0.5699, "rewards/accuracies": 1.0, "rewards/chosen": -1.0058807134628296, "rewards/margins": 3.1097183227539062, "rewards/rejected": -4.115599155426025, "step": 5085 }, { "epoch": 0.848990488903721, "grad_norm": 12.335651397705078, "learning_rate": 1.1510095110962791e-05, "logits/chosen": -0.647369384765625, "logits/rejected": -0.7089252471923828, "logps/chosen": -96.2765884399414, "logps/rejected": -200.4213409423828, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": -0.6626015901565552, "rewards/margins": 1.5065393447875977, "rewards/rejected": -2.1691410541534424, "step": 5088 }, { "epoch": 0.8494910729184048, "grad_norm": 33.23074722290039, "learning_rate": 1.1505089270815952e-05, "logits/chosen": -0.5778059363365173, "logits/rejected": -0.5866600871086121, "logps/chosen": -136.46893310546875, "logps/rejected": -144.318115234375, "loss": 0.5922, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4529973268508911, "rewards/margins": 0.13983333110809326, "rewards/rejected": -0.5928306579589844, "step": 5091 }, { "epoch": 0.8499916569330886, "grad_norm": 12.926718711853027, "learning_rate": 1.1500083430669115e-05, "logits/chosen": -0.5553819537162781, "logits/rejected": -0.5094043016433716, "logps/chosen": -109.65167236328125, "logps/rejected": -109.69287109375, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": -1.6452994346618652, "rewards/margins": 0.6420086026191711, "rewards/rejected": -2.2873079776763916, "step": 5094 }, { "epoch": 0.8504922409477724, "grad_norm": 28.447996139526367, "learning_rate": 1.1495077590522277e-05, "logits/chosen": -0.6639540195465088, "logits/rejected": -0.6827099919319153, "logps/chosen": -167.6713104248047, "logps/rejected": -178.08489990234375, "loss": 0.7854, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5806167721748352, "rewards/margins": -0.49187925457954407, "rewards/rejected": -0.08873748779296875, "step": 5097 }, { "epoch": 0.8509928249624562, "grad_norm": 15.887472152709961, "learning_rate": 1.149007175037544e-05, "logits/chosen": -0.5599517226219177, "logits/rejected": -0.5867295265197754, "logps/chosen": -64.96868133544922, "logps/rejected": -123.9774398803711, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": -1.7683062553405762, "rewards/margins": 2.4030230045318604, "rewards/rejected": -4.171329498291016, "step": 5100 }, { "epoch": 0.85149340897714, "grad_norm": 33.71933364868164, "learning_rate": 1.14850659102286e-05, "logits/chosen": -0.40627190470695496, "logits/rejected": -0.3479379415512085, "logps/chosen": -98.5226821899414, "logps/rejected": -113.79166412353516, "loss": 0.706, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0409573316574097, "rewards/margins": 3.3113858699798584, "rewards/rejected": -4.3523430824279785, "step": 5103 }, { "epoch": 0.8519939929918238, "grad_norm": 23.045082092285156, "learning_rate": 1.1480060070081762e-05, "logits/chosen": -0.7499777674674988, "logits/rejected": -0.7004281878471375, "logps/chosen": -169.8949432373047, "logps/rejected": -78.60411834716797, "loss": 0.8372, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.706210732460022, "rewards/margins": -0.9998567700386047, "rewards/rejected": -0.7063539028167725, "step": 5106 }, { "epoch": 0.8524945770065075, "grad_norm": 35.77865982055664, "learning_rate": 1.1475054229934926e-05, "logits/chosen": -0.5653671622276306, "logits/rejected": -0.6097895503044128, "logps/chosen": -60.82566452026367, "logps/rejected": -86.7147216796875, "loss": 0.9838, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9381634593009949, "rewards/margins": -0.17159098386764526, "rewards/rejected": -0.7665724754333496, "step": 5109 }, { "epoch": 0.8529951610211914, "grad_norm": 32.51142120361328, "learning_rate": 1.1470048389788086e-05, "logits/chosen": -0.4857204854488373, "logits/rejected": -0.49512454867362976, "logps/chosen": -69.02953338623047, "logps/rejected": -113.29379272460938, "loss": 0.5321, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.9015283584594727, "rewards/margins": 0.10697326809167862, "rewards/rejected": -1.008501648902893, "step": 5112 }, { "epoch": 0.8534957450358752, "grad_norm": 40.7663688659668, "learning_rate": 1.146504254964125e-05, "logits/chosen": -0.6925444602966309, "logits/rejected": -0.694467306137085, "logps/chosen": -47.13926696777344, "logps/rejected": -96.9444808959961, "loss": 0.8237, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.19644217193126678, "rewards/margins": 3.1387479305267334, "rewards/rejected": -2.94230580329895, "step": 5115 }, { "epoch": 0.853996329050559, "grad_norm": 6.310652732849121, "learning_rate": 1.1460036709494411e-05, "logits/chosen": -0.620966911315918, "logits/rejected": -0.5964429378509521, "logps/chosen": -89.0364761352539, "logps/rejected": -83.46581268310547, "loss": 0.3416, "rewards/accuracies": 1.0, "rewards/chosen": -0.11613384634256363, "rewards/margins": 0.852806568145752, "rewards/rejected": -0.9689404368400574, "step": 5118 }, { "epoch": 0.8544969130652428, "grad_norm": 8.947725296020508, "learning_rate": 1.1455030869347573e-05, "logits/chosen": -0.6180275678634644, "logits/rejected": -0.6648244857788086, "logps/chosen": -76.72222137451172, "logps/rejected": -89.83734893798828, "loss": 0.405, "rewards/accuracies": 1.0, "rewards/chosen": 0.06161561608314514, "rewards/margins": 2.118699312210083, "rewards/rejected": -2.05708384513855, "step": 5121 }, { "epoch": 0.8549974970799266, "grad_norm": 29.918766021728516, "learning_rate": 1.1450025029200735e-05, "logits/chosen": -0.6515944004058838, "logits/rejected": -0.660532534122467, "logps/chosen": -52.32736587524414, "logps/rejected": -73.70954132080078, "loss": 0.588, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5388384461402893, "rewards/margins": 0.49040189385414124, "rewards/rejected": -1.0292402505874634, "step": 5124 }, { "epoch": 0.8554980810946103, "grad_norm": 23.16463851928711, "learning_rate": 1.1445019189053896e-05, "logits/chosen": -0.6398717164993286, "logits/rejected": -0.668504536151886, "logps/chosen": -53.082275390625, "logps/rejected": -140.0329132080078, "loss": 0.5723, "rewards/accuracies": 1.0, "rewards/chosen": -0.3794601261615753, "rewards/margins": 5.079742908477783, "rewards/rejected": -5.459203243255615, "step": 5127 }, { "epoch": 0.8559986651092941, "grad_norm": 27.800968170166016, "learning_rate": 1.144001334890706e-05, "logits/chosen": -0.7105709910392761, "logits/rejected": -0.8146259188652039, "logps/chosen": -78.6358413696289, "logps/rejected": -111.59484100341797, "loss": 0.5078, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6329469084739685, "rewards/margins": 0.9839706420898438, "rewards/rejected": -1.6169174909591675, "step": 5130 }, { "epoch": 0.856499249123978, "grad_norm": 12.093179702758789, "learning_rate": 1.143500750876022e-05, "logits/chosen": -0.522887647151947, "logits/rejected": -0.45446860790252686, "logps/chosen": -101.65200805664062, "logps/rejected": -58.299564361572266, "loss": 0.5141, "rewards/accuracies": 1.0, "rewards/chosen": 0.5956390500068665, "rewards/margins": 2.0166847705841064, "rewards/rejected": -1.4210454225540161, "step": 5133 }, { "epoch": 0.8569998331386618, "grad_norm": 19.648954391479492, "learning_rate": 1.1430001668613384e-05, "logits/chosen": -0.695176362991333, "logits/rejected": -0.6846781373023987, "logps/chosen": -92.00350952148438, "logps/rejected": -93.7633056640625, "loss": 0.5623, "rewards/accuracies": 1.0, "rewards/chosen": 0.0731033906340599, "rewards/margins": 1.6535078287124634, "rewards/rejected": -1.580404281616211, "step": 5136 }, { "epoch": 0.8575004171533456, "grad_norm": 17.96504020690918, "learning_rate": 1.1424995828466545e-05, "logits/chosen": -0.8447085022926331, "logits/rejected": -0.7908287644386292, "logps/chosen": -94.26611328125, "logps/rejected": -61.70982360839844, "loss": 0.4263, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2399892807006836, "rewards/margins": 0.768886387348175, "rewards/rejected": -0.5288971066474915, "step": 5139 }, { "epoch": 0.8580010011680294, "grad_norm": 14.789609909057617, "learning_rate": 1.1419989988319707e-05, "logits/chosen": -0.6384769082069397, "logits/rejected": -0.644171416759491, "logps/chosen": -122.14116668701172, "logps/rejected": -111.75809478759766, "loss": 0.5483, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2826322317123413, "rewards/margins": 0.015152771957218647, "rewards/rejected": -1.2977849245071411, "step": 5142 }, { "epoch": 0.8585015851827131, "grad_norm": 9.10431957244873, "learning_rate": 1.1414984148172869e-05, "logits/chosen": -0.7150741219520569, "logits/rejected": -0.7078655362129211, "logps/chosen": -78.14484405517578, "logps/rejected": -86.7147216796875, "loss": 0.2871, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.06097952648997307, "rewards/margins": 3.0268611907958984, "rewards/rejected": -2.9658820629119873, "step": 5145 }, { "epoch": 0.8590021691973969, "grad_norm": 25.818769454956055, "learning_rate": 1.140997830802603e-05, "logits/chosen": -0.5104144811630249, "logits/rejected": -0.4689311981201172, "logps/chosen": -98.5882568359375, "logps/rejected": -100.49832153320312, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": -0.7105364799499512, "rewards/margins": 1.4616646766662598, "rewards/rejected": -2.172201156616211, "step": 5148 }, { "epoch": 0.8595027532120808, "grad_norm": 23.394752502441406, "learning_rate": 1.1404972467879194e-05, "logits/chosen": -0.4374943673610687, "logits/rejected": -0.4131690263748169, "logps/chosen": -176.4064178466797, "logps/rejected": -156.61477661132812, "loss": 1.0193, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.57876718044281, "rewards/margins": 0.2841891944408417, "rewards/rejected": -1.8629564046859741, "step": 5151 }, { "epoch": 0.8600033372267646, "grad_norm": 35.99729537963867, "learning_rate": 1.1399966627732354e-05, "logits/chosen": -0.6192367672920227, "logits/rejected": -0.6213169693946838, "logps/chosen": -103.49190521240234, "logps/rejected": -91.33255767822266, "loss": 0.4529, "rewards/accuracies": 0.0, "rewards/chosen": -3.156693696975708, "rewards/margins": -0.5654505491256714, "rewards/rejected": -2.591243267059326, "step": 5154 }, { "epoch": 0.8605039212414484, "grad_norm": 18.432456970214844, "learning_rate": 1.1394960787585518e-05, "logits/chosen": -0.716586172580719, "logits/rejected": -0.6471444964408875, "logps/chosen": -55.68172836303711, "logps/rejected": -51.427547454833984, "loss": 0.4403, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.12199803441762924, "rewards/margins": 0.4522179663181305, "rewards/rejected": -0.33021989464759827, "step": 5157 }, { "epoch": 0.8610045052561321, "grad_norm": 27.110212326049805, "learning_rate": 1.138995494743868e-05, "logits/chosen": -0.5332514643669128, "logits/rejected": -0.5798358917236328, "logps/chosen": -56.25673294067383, "logps/rejected": -136.95584106445312, "loss": 0.6516, "rewards/accuracies": 1.0, "rewards/chosen": -1.1892863512039185, "rewards/margins": 1.7572005987167358, "rewards/rejected": -2.946486711502075, "step": 5160 }, { "epoch": 0.8615050892708159, "grad_norm": 20.073057174682617, "learning_rate": 1.1384949107291841e-05, "logits/chosen": -0.5250779986381531, "logits/rejected": -0.5137545466423035, "logps/chosen": -41.994083404541016, "logps/rejected": -82.55565643310547, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": -0.37716564536094666, "rewards/margins": 2.1603829860687256, "rewards/rejected": -2.537548780441284, "step": 5163 }, { "epoch": 0.8620056732854997, "grad_norm": 27.63919448852539, "learning_rate": 1.1379943267145003e-05, "logits/chosen": -0.487172931432724, "logits/rejected": -0.46109580993652344, "logps/chosen": -141.37086486816406, "logps/rejected": -99.0966796875, "loss": 0.6839, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.27634504437446594, "rewards/margins": 0.6001647114753723, "rewards/rejected": -0.323819637298584, "step": 5166 }, { "epoch": 0.8625062573001836, "grad_norm": 12.090928077697754, "learning_rate": 1.1374937426998165e-05, "logits/chosen": -0.7261994481086731, "logits/rejected": -0.7362059950828552, "logps/chosen": -81.97888946533203, "logps/rejected": -128.4283447265625, "loss": 0.286, "rewards/accuracies": 1.0, "rewards/chosen": -1.052525281906128, "rewards/margins": 1.7581491470336914, "rewards/rejected": -2.8106744289398193, "step": 5169 }, { "epoch": 0.8630068413148674, "grad_norm": 22.55510902404785, "learning_rate": 1.1369931586851328e-05, "logits/chosen": -0.5597946047782898, "logits/rejected": -0.580158531665802, "logps/chosen": -80.8117446899414, "logps/rejected": -93.9738998413086, "loss": 0.4471, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.013125121593475342, "rewards/margins": 1.1612673997879028, "rewards/rejected": -1.1743927001953125, "step": 5172 }, { "epoch": 0.8635074253295512, "grad_norm": 14.091017723083496, "learning_rate": 1.1364925746704489e-05, "logits/chosen": -0.6318914890289307, "logits/rejected": -0.47231268882751465, "logps/chosen": -189.55747985839844, "logps/rejected": -125.5487289428711, "loss": 1.1019, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.4598078727722168, "rewards/margins": -0.7950718998908997, "rewards/rejected": 0.33526408672332764, "step": 5175 }, { "epoch": 0.8640080093442349, "grad_norm": 16.131038665771484, "learning_rate": 1.135991990655765e-05, "logits/chosen": -0.41046786308288574, "logits/rejected": -0.35715579986572266, "logps/chosen": -151.3508758544922, "logps/rejected": -128.2998809814453, "loss": 0.216, "rewards/accuracies": 1.0, "rewards/chosen": -0.183074951171875, "rewards/margins": 0.8906896114349365, "rewards/rejected": -1.0737645626068115, "step": 5178 }, { "epoch": 0.8645085933589187, "grad_norm": 31.304906845092773, "learning_rate": 1.1354914066410814e-05, "logits/chosen": -0.5203649401664734, "logits/rejected": -0.572272777557373, "logps/chosen": -69.52458953857422, "logps/rejected": -106.04407501220703, "loss": 0.7866, "rewards/accuracies": 1.0, "rewards/chosen": -0.004522820468991995, "rewards/margins": 2.2118120193481445, "rewards/rejected": -2.2163350582122803, "step": 5181 }, { "epoch": 0.8650091773736025, "grad_norm": 36.12967300415039, "learning_rate": 1.1349908226263976e-05, "logits/chosen": -0.6331565976142883, "logits/rejected": -0.6447506546974182, "logps/chosen": -77.53816986083984, "logps/rejected": -114.4379653930664, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": -0.8834386467933655, "rewards/margins": 2.4752867221832275, "rewards/rejected": -3.3587253093719482, "step": 5184 }, { "epoch": 0.8655097613882863, "grad_norm": 10.259912490844727, "learning_rate": 1.1344902386117137e-05, "logits/chosen": -0.6966433525085449, "logits/rejected": -0.7192214131355286, "logps/chosen": -66.0303726196289, "logps/rejected": -96.23230743408203, "loss": 0.7992, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5448360443115234, "rewards/margins": 2.2837631702423096, "rewards/rejected": -2.828598976135254, "step": 5187 }, { "epoch": 0.8660103454029702, "grad_norm": 11.147924423217773, "learning_rate": 1.13398965459703e-05, "logits/chosen": -0.638889491558075, "logits/rejected": -0.6581683158874512, "logps/chosen": -95.856689453125, "logps/rejected": -128.55873107910156, "loss": 0.3448, "rewards/accuracies": 1.0, "rewards/chosen": -0.6849102973937988, "rewards/margins": 3.7714948654174805, "rewards/rejected": -4.456405162811279, "step": 5190 }, { "epoch": 0.866510929417654, "grad_norm": 22.14396095275879, "learning_rate": 1.1334890705823463e-05, "logits/chosen": -0.5285084843635559, "logits/rejected": -0.605059802532196, "logps/chosen": -79.79026794433594, "logps/rejected": -155.76966857910156, "loss": 0.4149, "rewards/accuracies": 1.0, "rewards/chosen": -0.05508957430720329, "rewards/margins": 1.5466562509536743, "rewards/rejected": -1.6017459630966187, "step": 5193 }, { "epoch": 0.8670115134323377, "grad_norm": 5.491950035095215, "learning_rate": 1.1329884865676623e-05, "logits/chosen": -0.5323138236999512, "logits/rejected": -0.5767223238945007, "logps/chosen": -35.636348724365234, "logps/rejected": -63.78168869018555, "loss": 0.357, "rewards/accuracies": 1.0, "rewards/chosen": 0.5959950089454651, "rewards/margins": 1.4261912107467651, "rewards/rejected": -0.8301961421966553, "step": 5196 }, { "epoch": 0.8675120974470215, "grad_norm": 41.4188232421875, "learning_rate": 1.1324879025529785e-05, "logits/chosen": -0.53721022605896, "logits/rejected": -0.4423882067203522, "logps/chosen": -89.7837142944336, "logps/rejected": -107.6866683959961, "loss": 0.3635, "rewards/accuracies": 1.0, "rewards/chosen": 0.17710810899734497, "rewards/margins": 4.248526096343994, "rewards/rejected": -4.071417808532715, "step": 5199 }, { "epoch": 0.8680126814617053, "grad_norm": 35.70491409301758, "learning_rate": 1.1319873185382948e-05, "logits/chosen": -0.5490866303443909, "logits/rejected": -0.49774202704429626, "logps/chosen": -117.4071044921875, "logps/rejected": -108.45574188232422, "loss": 0.7161, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.8012904524803162, "rewards/margins": 0.6952689290046692, "rewards/rejected": -1.4965592622756958, "step": 5202 }, { "epoch": 0.8685132654763891, "grad_norm": 46.53582000732422, "learning_rate": 1.131486734523611e-05, "logits/chosen": -0.6050857305526733, "logits/rejected": -0.5891857743263245, "logps/chosen": -70.81758880615234, "logps/rejected": -90.06108856201172, "loss": 0.8071, "rewards/accuracies": 1.0, "rewards/chosen": -0.26027026772499084, "rewards/margins": 3.7147350311279297, "rewards/rejected": -3.9750053882598877, "step": 5205 }, { "epoch": 0.869013849491073, "grad_norm": 18.884967803955078, "learning_rate": 1.1309861505089272e-05, "logits/chosen": -0.6272644996643066, "logits/rejected": -0.6920938491821289, "logps/chosen": -117.0538558959961, "logps/rejected": -142.5388641357422, "loss": 0.3174, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.45401179790496826, "rewards/margins": 1.623949646949768, "rewards/rejected": -2.0779616832733154, "step": 5208 }, { "epoch": 0.8695144335057567, "grad_norm": 8.8042631149292, "learning_rate": 1.1304855664942433e-05, "logits/chosen": -0.475245863199234, "logits/rejected": -0.4834322929382324, "logps/chosen": -130.3167266845703, "logps/rejected": -176.6344451904297, "loss": 0.5519, "rewards/accuracies": 1.0, "rewards/chosen": -0.4806068241596222, "rewards/margins": 1.9914287328720093, "rewards/rejected": -2.4720356464385986, "step": 5211 }, { "epoch": 0.8700150175204405, "grad_norm": 23.56631088256836, "learning_rate": 1.1299849824795597e-05, "logits/chosen": -0.6312660574913025, "logits/rejected": -0.6507378220558167, "logps/chosen": -78.2089614868164, "logps/rejected": -120.3248519897461, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": -0.2939009666442871, "rewards/margins": 3.404039144515991, "rewards/rejected": -3.6979401111602783, "step": 5214 }, { "epoch": 0.8705156015351243, "grad_norm": 18.08479118347168, "learning_rate": 1.1294843984648757e-05, "logits/chosen": -0.4818453788757324, "logits/rejected": -0.5308169722557068, "logps/chosen": -88.3983383178711, "logps/rejected": -98.87451171875, "loss": 0.4571, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.8662620782852173, "rewards/margins": -0.5312475562095642, "rewards/rejected": -1.3350144624710083, "step": 5217 }, { "epoch": 0.8710161855498081, "grad_norm": 45.014591217041016, "learning_rate": 1.1289838144501919e-05, "logits/chosen": -0.5714976787567139, "logits/rejected": -0.5665468573570251, "logps/chosen": -98.08541107177734, "logps/rejected": -94.68924713134766, "loss": 0.8751, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0758800506591797, "rewards/margins": 0.11618876457214355, "rewards/rejected": -2.1920688152313232, "step": 5220 }, { "epoch": 0.8715167695644919, "grad_norm": 58.136966705322266, "learning_rate": 1.1284832304355082e-05, "logits/chosen": -0.7104697823524475, "logits/rejected": -0.7006548047065735, "logps/chosen": -114.9461898803711, "logps/rejected": -170.38490295410156, "loss": 0.5905, "rewards/accuracies": 1.0, "rewards/chosen": -0.7866702079772949, "rewards/margins": 1.9297451972961426, "rewards/rejected": -2.7164154052734375, "step": 5223 }, { "epoch": 0.8720173535791758, "grad_norm": 15.653696060180664, "learning_rate": 1.1279826464208244e-05, "logits/chosen": -0.6860346794128418, "logits/rejected": -0.680361270904541, "logps/chosen": -84.5916748046875, "logps/rejected": -97.14435577392578, "loss": 0.2383, "rewards/accuracies": 1.0, "rewards/chosen": -1.0850238800048828, "rewards/margins": 2.687147378921509, "rewards/rejected": -3.7721712589263916, "step": 5226 }, { "epoch": 0.8725179375938595, "grad_norm": 40.671051025390625, "learning_rate": 1.1274820624061406e-05, "logits/chosen": -0.5173263549804688, "logits/rejected": -0.6613742709159851, "logps/chosen": -22.763395309448242, "logps/rejected": -134.61097717285156, "loss": 0.7747, "rewards/accuracies": 1.0, "rewards/chosen": 0.4891248047351837, "rewards/margins": 3.015058755874634, "rewards/rejected": -2.5259339809417725, "step": 5229 }, { "epoch": 0.8730185216085433, "grad_norm": 32.038360595703125, "learning_rate": 1.1269814783914568e-05, "logits/chosen": -0.7385668754577637, "logits/rejected": -0.7902305722236633, "logps/chosen": -63.28190231323242, "logps/rejected": -129.6437530517578, "loss": 0.5834, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4830988943576813, "rewards/margins": 2.5267677307128906, "rewards/rejected": -3.009866714477539, "step": 5232 }, { "epoch": 0.8735191056232271, "grad_norm": 3.290879487991333, "learning_rate": 1.1264808943767731e-05, "logits/chosen": -0.6792020201683044, "logits/rejected": -0.6765751838684082, "logps/chosen": -71.08441162109375, "logps/rejected": -121.9891586303711, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": -0.45864740014076233, "rewards/margins": 2.211289644241333, "rewards/rejected": -2.6699371337890625, "step": 5235 }, { "epoch": 0.8740196896379109, "grad_norm": 31.111297607421875, "learning_rate": 1.1259803103620891e-05, "logits/chosen": -0.8957025408744812, "logits/rejected": -0.8704061508178711, "logps/chosen": -105.17910766601562, "logps/rejected": -72.72550201416016, "loss": 0.7883, "rewards/accuracies": 0.0, "rewards/chosen": -2.114806652069092, "rewards/margins": -1.5604091882705688, "rewards/rejected": -0.5543974041938782, "step": 5238 }, { "epoch": 0.8745202736525947, "grad_norm": 44.102664947509766, "learning_rate": 1.1254797263474053e-05, "logits/chosen": -0.6703649163246155, "logits/rejected": -0.6821444630622864, "logps/chosen": -91.18602752685547, "logps/rejected": -109.43844604492188, "loss": 0.5948, "rewards/accuracies": 1.0, "rewards/chosen": -0.8933432698249817, "rewards/margins": 1.8916559219360352, "rewards/rejected": -2.784999132156372, "step": 5241 }, { "epoch": 0.8750208576672784, "grad_norm": 8.825094223022461, "learning_rate": 1.1249791423327217e-05, "logits/chosen": -0.5146535038948059, "logits/rejected": -0.5155243277549744, "logps/chosen": -136.21287536621094, "logps/rejected": -104.7197265625, "loss": 0.7364, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.08938118070363998, "rewards/margins": 0.6451700925827026, "rewards/rejected": -0.5557889342308044, "step": 5244 }, { "epoch": 0.8755214416819623, "grad_norm": 29.343612670898438, "learning_rate": 1.1244785583180378e-05, "logits/chosen": -0.6775240898132324, "logits/rejected": -0.659226655960083, "logps/chosen": -95.07433319091797, "logps/rejected": -154.01876831054688, "loss": 0.4168, "rewards/accuracies": 1.0, "rewards/chosen": -1.0567528009414673, "rewards/margins": 3.0913448333740234, "rewards/rejected": -4.148097515106201, "step": 5247 }, { "epoch": 0.8760220256966461, "grad_norm": 33.541873931884766, "learning_rate": 1.123977974303354e-05, "logits/chosen": -0.684636116027832, "logits/rejected": -0.6338600516319275, "logps/chosen": -89.83553314208984, "logps/rejected": -103.36385345458984, "loss": 0.3125, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.1018017902970314, "rewards/margins": 1.0639077425003052, "rewards/rejected": -1.1657096147537231, "step": 5250 }, { "epoch": 0.8765226097113299, "grad_norm": 32.671260833740234, "learning_rate": 1.1234773902886702e-05, "logits/chosen": -0.647468090057373, "logits/rejected": -0.7004220485687256, "logps/chosen": -90.6304702758789, "logps/rejected": -114.1775131225586, "loss": 0.3353, "rewards/accuracies": 1.0, "rewards/chosen": -1.6365723609924316, "rewards/margins": 1.800628662109375, "rewards/rejected": -3.4372007846832275, "step": 5253 }, { "epoch": 0.8770231937260137, "grad_norm": 15.701973915100098, "learning_rate": 1.1229768062739864e-05, "logits/chosen": -0.6158221960067749, "logits/rejected": -0.5557249188423157, "logps/chosen": -127.44011688232422, "logps/rejected": -85.25337982177734, "loss": 0.2833, "rewards/accuracies": 1.0, "rewards/chosen": -0.9147098064422607, "rewards/margins": 1.7784538269042969, "rewards/rejected": -2.6931636333465576, "step": 5256 }, { "epoch": 0.8775237777406975, "grad_norm": 16.166616439819336, "learning_rate": 1.1224762222593026e-05, "logits/chosen": -0.6617043614387512, "logits/rejected": -0.6935828328132629, "logps/chosen": -157.0425567626953, "logps/rejected": -138.74911499023438, "loss": 0.4729, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.0908281803131104, "rewards/margins": -0.7436997294425964, "rewards/rejected": -2.3471286296844482, "step": 5259 }, { "epoch": 0.8780243617553812, "grad_norm": 35.05803298950195, "learning_rate": 1.1219756382446187e-05, "logits/chosen": -0.6273482441902161, "logits/rejected": -0.5715327858924866, "logps/chosen": -150.82984924316406, "logps/rejected": -129.40350341796875, "loss": 0.6378, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.1309449672698975, "rewards/margins": -0.2698698043823242, "rewards/rejected": -1.8610750436782837, "step": 5262 }, { "epoch": 0.8785249457700651, "grad_norm": 8.929173469543457, "learning_rate": 1.1214750542299351e-05, "logits/chosen": -0.4988724887371063, "logits/rejected": -0.5700047016143799, "logps/chosen": -51.70343017578125, "logps/rejected": -102.85436248779297, "loss": 0.1973, "rewards/accuracies": 1.0, "rewards/chosen": -0.07401975244283676, "rewards/margins": 1.1601070165634155, "rewards/rejected": -1.2341266870498657, "step": 5265 }, { "epoch": 0.8790255297847489, "grad_norm": 26.56364631652832, "learning_rate": 1.1209744702152513e-05, "logits/chosen": -0.7219931483268738, "logits/rejected": -0.7332339286804199, "logps/chosen": -91.0636215209961, "logps/rejected": -134.6190643310547, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": -0.6496039032936096, "rewards/margins": 4.551858901977539, "rewards/rejected": -5.201462745666504, "step": 5268 }, { "epoch": 0.8795261137994327, "grad_norm": 34.39509963989258, "learning_rate": 1.1204738862005674e-05, "logits/chosen": -0.7667047381401062, "logits/rejected": -0.7496486306190491, "logps/chosen": -89.22748565673828, "logps/rejected": -75.84186553955078, "loss": 0.4985, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8594398498535156, "rewards/margins": -0.15582895278930664, "rewards/rejected": -1.7036107778549194, "step": 5271 }, { "epoch": 0.8800266978141165, "grad_norm": 58.57990646362305, "learning_rate": 1.1199733021858836e-05, "logits/chosen": -0.5947504043579102, "logits/rejected": -0.6315538287162781, "logps/chosen": -81.00011444091797, "logps/rejected": -99.52349090576172, "loss": 0.7975, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.1540944576263428, "rewards/margins": 0.07545558363199234, "rewards/rejected": -2.2295498847961426, "step": 5274 }, { "epoch": 0.8805272818288002, "grad_norm": 7.313522815704346, "learning_rate": 1.1194727181711998e-05, "logits/chosen": -0.5274115204811096, "logits/rejected": -0.6227656006813049, "logps/chosen": -66.46949768066406, "logps/rejected": -156.7874755859375, "loss": 0.5936, "rewards/accuracies": 1.0, "rewards/chosen": -0.3555353879928589, "rewards/margins": 3.0532848834991455, "rewards/rejected": -3.408820390701294, "step": 5277 }, { "epoch": 0.881027865843484, "grad_norm": 99.98505401611328, "learning_rate": 1.118972134156516e-05, "logits/chosen": -0.5876986384391785, "logits/rejected": -0.624988853931427, "logps/chosen": -88.36621856689453, "logps/rejected": -114.67560577392578, "loss": 0.6324, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.641394853591919, "rewards/margins": 0.56208735704422, "rewards/rejected": -1.2034822702407837, "step": 5280 }, { "epoch": 0.8815284498581679, "grad_norm": 20.809185028076172, "learning_rate": 1.1184715501418322e-05, "logits/chosen": -0.5784502625465393, "logits/rejected": -0.5908973813056946, "logps/chosen": -46.80269241333008, "logps/rejected": -66.86150360107422, "loss": 0.7708, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1244295835494995, "rewards/margins": -0.5162593126296997, "rewards/rejected": -0.608170211315155, "step": 5283 }, { "epoch": 0.8820290338728517, "grad_norm": 10.9489107131958, "learning_rate": 1.1179709661271485e-05, "logits/chosen": -0.6665731072425842, "logits/rejected": -0.6577050089836121, "logps/chosen": -84.95243072509766, "logps/rejected": -120.51861572265625, "loss": 0.3203, "rewards/accuracies": 1.0, "rewards/chosen": -0.36551642417907715, "rewards/margins": 3.5568771362304688, "rewards/rejected": -3.922393560409546, "step": 5286 }, { "epoch": 0.8825296178875355, "grad_norm": 18.653274536132812, "learning_rate": 1.1174703821124647e-05, "logits/chosen": -0.8831267356872559, "logits/rejected": -0.86431884765625, "logps/chosen": -90.4432144165039, "logps/rejected": -114.50739288330078, "loss": 0.6842, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.11652807146310806, "rewards/margins": 1.2001526355743408, "rewards/rejected": -1.316680669784546, "step": 5289 }, { "epoch": 0.8830302019022193, "grad_norm": 17.802248001098633, "learning_rate": 1.1169697980977809e-05, "logits/chosen": -0.6039112210273743, "logits/rejected": -0.6244013905525208, "logps/chosen": -52.815521240234375, "logps/rejected": -71.29379272460938, "loss": 0.598, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.17220033705234528, "rewards/margins": 0.2531934380531311, "rewards/rejected": -0.4253937900066376, "step": 5292 }, { "epoch": 0.883530785916903, "grad_norm": 44.29862976074219, "learning_rate": 1.116469214083097e-05, "logits/chosen": -0.6027450561523438, "logits/rejected": -0.577419102191925, "logps/chosen": -100.06466674804688, "logps/rejected": -81.93515014648438, "loss": 0.6363, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2977741956710815, "rewards/margins": 0.7382469177246094, "rewards/rejected": -2.0360209941864014, "step": 5295 }, { "epoch": 0.8840313699315868, "grad_norm": 23.047456741333008, "learning_rate": 1.1159686300684132e-05, "logits/chosen": -0.6625749468803406, "logits/rejected": -0.6183480024337769, "logps/chosen": -134.8944091796875, "logps/rejected": -72.2652587890625, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 0.0217820405960083, "rewards/margins": 1.6203206777572632, "rewards/rejected": -1.5985385179519653, "step": 5298 }, { "epoch": 0.8845319539462706, "grad_norm": 26.102745056152344, "learning_rate": 1.1154680460537294e-05, "logits/chosen": -0.6546265482902527, "logits/rejected": -0.6823731064796448, "logps/chosen": -71.49433135986328, "logps/rejected": -107.71161651611328, "loss": 0.4322, "rewards/accuracies": 1.0, "rewards/chosen": -0.90607088804245, "rewards/margins": 1.8540762662887573, "rewards/rejected": -2.7601470947265625, "step": 5301 }, { "epoch": 0.8850325379609545, "grad_norm": 26.468839645385742, "learning_rate": 1.1149674620390456e-05, "logits/chosen": -0.5928848385810852, "logits/rejected": -0.5784701704978943, "logps/chosen": -162.8287353515625, "logps/rejected": -112.62897491455078, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": -2.403533458709717, "rewards/margins": -2.0499720573425293, "rewards/rejected": -0.3535614013671875, "step": 5304 }, { "epoch": 0.8855331219756383, "grad_norm": 2.2395787239074707, "learning_rate": 1.114466878024362e-05, "logits/chosen": -0.5697750449180603, "logits/rejected": -0.6159061789512634, "logps/chosen": -58.5128288269043, "logps/rejected": -134.6702117919922, "loss": 0.6662, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.09805703163146973, "rewards/margins": 3.9763717651367188, "rewards/rejected": -3.878314971923828, "step": 5307 }, { "epoch": 0.886033705990322, "grad_norm": 12.876724243164062, "learning_rate": 1.1139662940096781e-05, "logits/chosen": -0.5877766013145447, "logits/rejected": -0.671984851360321, "logps/chosen": -56.0899658203125, "logps/rejected": -211.9476318359375, "loss": 0.4058, "rewards/accuracies": 1.0, "rewards/chosen": 0.25957173109054565, "rewards/margins": 2.490169048309326, "rewards/rejected": -2.230597496032715, "step": 5310 }, { "epoch": 0.8865342900050058, "grad_norm": 23.6109619140625, "learning_rate": 1.1134657099949941e-05, "logits/chosen": -0.4910610616207123, "logits/rejected": -0.5413877367973328, "logps/chosen": -78.10687255859375, "logps/rejected": -93.10860443115234, "loss": 0.5112, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7172355055809021, "rewards/margins": 1.015048861503601, "rewards/rejected": -0.29781341552734375, "step": 5313 }, { "epoch": 0.8870348740196896, "grad_norm": 9.651638984680176, "learning_rate": 1.1129651259803105e-05, "logits/chosen": -0.6724743843078613, "logits/rejected": -0.619300365447998, "logps/chosen": -133.4291534423828, "logps/rejected": -116.95150756835938, "loss": 0.8986, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0044598579406738, "rewards/margins": 2.21441650390625, "rewards/rejected": -3.2188761234283447, "step": 5316 }, { "epoch": 0.8875354580343734, "grad_norm": 29.2908878326416, "learning_rate": 1.1124645419656267e-05, "logits/chosen": -0.7668601870536804, "logits/rejected": -0.790947675704956, "logps/chosen": -153.0433807373047, "logps/rejected": -145.6250457763672, "loss": 0.8291, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7576932907104492, "rewards/margins": 1.6218076944351196, "rewards/rejected": -3.3795011043548584, "step": 5319 }, { "epoch": 0.8880360420490573, "grad_norm": 35.68904495239258, "learning_rate": 1.1119639579509428e-05, "logits/chosen": -0.7498412132263184, "logits/rejected": -0.7102594375610352, "logps/chosen": -101.99532318115234, "logps/rejected": -125.10140991210938, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": -0.09166312217712402, "rewards/margins": 2.473553419113159, "rewards/rejected": -2.565216541290283, "step": 5322 }, { "epoch": 0.8885366260637411, "grad_norm": 30.699214935302734, "learning_rate": 1.111463373936259e-05, "logits/chosen": -0.5141251087188721, "logits/rejected": -0.4456705152988434, "logps/chosen": -108.58246612548828, "logps/rejected": -76.18553924560547, "loss": 0.8051, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6614186763763428, "rewards/margins": 0.8159614205360413, "rewards/rejected": -1.4773801565170288, "step": 5325 }, { "epoch": 0.8890372100784248, "grad_norm": 32.73302459716797, "learning_rate": 1.1109627899215754e-05, "logits/chosen": -0.6330336928367615, "logits/rejected": -0.6526468992233276, "logps/chosen": -71.61773681640625, "logps/rejected": -148.06546020507812, "loss": 0.3953, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.7397615909576416, "rewards/margins": 4.458266735076904, "rewards/rejected": -3.7185051441192627, "step": 5328 }, { "epoch": 0.8895377940931086, "grad_norm": 26.923368453979492, "learning_rate": 1.1104622059068915e-05, "logits/chosen": -0.6797322630882263, "logits/rejected": -0.7008499503135681, "logps/chosen": -108.92501068115234, "logps/rejected": -181.13941955566406, "loss": 0.6287, "rewards/accuracies": 1.0, "rewards/chosen": 0.1337856650352478, "rewards/margins": 1.3897953033447266, "rewards/rejected": -1.2560096979141235, "step": 5331 }, { "epoch": 0.8900383781077924, "grad_norm": 14.693853378295898, "learning_rate": 1.1099616218922076e-05, "logits/chosen": -0.7263872027397156, "logits/rejected": -0.7414796948432922, "logps/chosen": -97.11095428466797, "logps/rejected": -117.52741241455078, "loss": 0.4157, "rewards/accuracies": 1.0, "rewards/chosen": -0.5335645079612732, "rewards/margins": 2.0074288845062256, "rewards/rejected": -2.5409934520721436, "step": 5334 }, { "epoch": 0.8905389621224762, "grad_norm": 7.475099086761475, "learning_rate": 1.1094610378775239e-05, "logits/chosen": -0.6557307839393616, "logits/rejected": -0.5882272720336914, "logps/chosen": -107.0846939086914, "logps/rejected": -135.0264434814453, "loss": 0.261, "rewards/accuracies": 1.0, "rewards/chosen": -0.8437688946723938, "rewards/margins": 3.607093572616577, "rewards/rejected": -4.450862407684326, "step": 5337 }, { "epoch": 0.89103954613716, "grad_norm": 18.485342025756836, "learning_rate": 1.1089604538628401e-05, "logits/chosen": -0.603676974773407, "logits/rejected": -0.6369271278381348, "logps/chosen": -78.61273956298828, "logps/rejected": -107.71334838867188, "loss": 0.5801, "rewards/accuracies": 1.0, "rewards/chosen": -0.6071935296058655, "rewards/margins": 1.3906258344650269, "rewards/rejected": -1.9978193044662476, "step": 5340 }, { "epoch": 0.8915401301518439, "grad_norm": 6.755733489990234, "learning_rate": 1.1084598698481563e-05, "logits/chosen": -0.6010645031929016, "logits/rejected": -0.6178373098373413, "logps/chosen": -81.32304382324219, "logps/rejected": -105.45343780517578, "loss": 0.2237, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.028526872396469116, "rewards/margins": 2.7478256225585938, "rewards/rejected": -2.776352643966675, "step": 5343 }, { "epoch": 0.8920407141665276, "grad_norm": 18.79436492919922, "learning_rate": 1.1079592858334724e-05, "logits/chosen": -0.7367852330207825, "logits/rejected": -0.7451242804527283, "logps/chosen": -50.63143539428711, "logps/rejected": -68.6977310180664, "loss": 1.06, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.44164952635765076, "rewards/margins": 1.2385708093643188, "rewards/rejected": -1.6802202463150024, "step": 5346 }, { "epoch": 0.8925412981812114, "grad_norm": 8.06203842163086, "learning_rate": 1.1074587018187888e-05, "logits/chosen": -0.5320892333984375, "logits/rejected": -0.5932509303092957, "logps/chosen": -59.18855285644531, "logps/rejected": -154.80075073242188, "loss": 0.3542, "rewards/accuracies": 1.0, "rewards/chosen": -0.2114419937133789, "rewards/margins": 3.140695810317993, "rewards/rejected": -3.352137804031372, "step": 5349 }, { "epoch": 0.8930418821958952, "grad_norm": 32.854549407958984, "learning_rate": 1.106958117804105e-05, "logits/chosen": -0.5613064169883728, "logits/rejected": -0.65436190366745, "logps/chosen": -60.28396224975586, "logps/rejected": -91.58992767333984, "loss": 0.6744, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.3000906705856323, "rewards/margins": 0.2065768986940384, "rewards/rejected": 0.09351374953985214, "step": 5352 }, { "epoch": 0.893542466210579, "grad_norm": 25.408262252807617, "learning_rate": 1.106457533789421e-05, "logits/chosen": -0.7191190123558044, "logits/rejected": -0.7427474856376648, "logps/chosen": -101.4290542602539, "logps/rejected": -130.0488739013672, "loss": 0.6565, "rewards/accuracies": 0.0, "rewards/chosen": -0.583789050579071, "rewards/margins": -0.6573792099952698, "rewards/rejected": 0.07359009981155396, "step": 5355 }, { "epoch": 0.8940430502252628, "grad_norm": 36.134857177734375, "learning_rate": 1.1059569497747373e-05, "logits/chosen": -0.6445572972297668, "logits/rejected": -0.6963880062103271, "logps/chosen": -90.50484466552734, "logps/rejected": -147.96263122558594, "loss": 0.4882, "rewards/accuracies": 1.0, "rewards/chosen": -0.0692698135972023, "rewards/margins": 3.1764957904815674, "rewards/rejected": -3.2457656860351562, "step": 5358 }, { "epoch": 0.8945436342399467, "grad_norm": 24.70877456665039, "learning_rate": 1.1054563657600535e-05, "logits/chosen": -0.6518282294273376, "logits/rejected": -0.7215792536735535, "logps/chosen": -87.6937026977539, "logps/rejected": -168.19805908203125, "loss": 0.4945, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.38261619210243225, "rewards/margins": 0.36251118779182434, "rewards/rejected": -0.7451273798942566, "step": 5361 }, { "epoch": 0.8950442182546304, "grad_norm": 28.959278106689453, "learning_rate": 1.1049557817453697e-05, "logits/chosen": -0.49272966384887695, "logits/rejected": -0.5410664081573486, "logps/chosen": -76.9446792602539, "logps/rejected": -181.5521697998047, "loss": 0.3589, "rewards/accuracies": 1.0, "rewards/chosen": -0.08657632023096085, "rewards/margins": 2.4091293811798096, "rewards/rejected": -2.4957058429718018, "step": 5364 }, { "epoch": 0.8955448022693142, "grad_norm": 17.443220138549805, "learning_rate": 1.1044551977306859e-05, "logits/chosen": -0.5052759051322937, "logits/rejected": -0.572015106678009, "logps/chosen": -49.4862174987793, "logps/rejected": -167.072509765625, "loss": 0.7355, "rewards/accuracies": 1.0, "rewards/chosen": 0.26507097482681274, "rewards/margins": 5.723178863525391, "rewards/rejected": -5.458108425140381, "step": 5367 }, { "epoch": 0.896045386283998, "grad_norm": 19.283931732177734, "learning_rate": 1.103954613716002e-05, "logits/chosen": -0.512232780456543, "logits/rejected": -0.5409400463104248, "logps/chosen": -117.95349884033203, "logps/rejected": -111.68482208251953, "loss": 1.0567, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.09666164964437485, "rewards/margins": 0.9481565356254578, "rewards/rejected": -0.8514948487281799, "step": 5370 }, { "epoch": 0.8965459702986818, "grad_norm": 2.8979649543762207, "learning_rate": 1.1034540297013184e-05, "logits/chosen": -0.5766705870628357, "logits/rejected": -0.5825626850128174, "logps/chosen": -39.96491622924805, "logps/rejected": -110.27678680419922, "loss": 0.4033, "rewards/accuracies": 1.0, "rewards/chosen": 0.8126988410949707, "rewards/margins": 2.832322359085083, "rewards/rejected": -2.0196235179901123, "step": 5373 }, { "epoch": 0.8970465543133656, "grad_norm": 37.00758743286133, "learning_rate": 1.1029534456866344e-05, "logits/chosen": -0.6503428816795349, "logits/rejected": -0.6896453499794006, "logps/chosen": -51.07170486450195, "logps/rejected": -109.02645111083984, "loss": 0.7001, "rewards/accuracies": 1.0, "rewards/chosen": 0.408651202917099, "rewards/margins": 4.234193325042725, "rewards/rejected": -3.825542449951172, "step": 5376 }, { "epoch": 0.8975471383280494, "grad_norm": 16.00459098815918, "learning_rate": 1.1024528616719508e-05, "logits/chosen": -0.6209996342658997, "logits/rejected": -0.6135093569755554, "logps/chosen": -41.680973052978516, "logps/rejected": -72.2906265258789, "loss": 0.376, "rewards/accuracies": 1.0, "rewards/chosen": 0.5243629813194275, "rewards/margins": 2.2978384494781494, "rewards/rejected": -1.7734755277633667, "step": 5379 }, { "epoch": 0.8980477223427332, "grad_norm": 9.671673774719238, "learning_rate": 1.101952277657267e-05, "logits/chosen": -0.521334171295166, "logits/rejected": -0.5893716812133789, "logps/chosen": -63.81979751586914, "logps/rejected": -139.90858459472656, "loss": 0.2174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9384167194366455, "rewards/margins": 3.2847025394439697, "rewards/rejected": -2.3462860584259033, "step": 5382 }, { "epoch": 0.898548306357417, "grad_norm": 18.301097869873047, "learning_rate": 1.1014516936425831e-05, "logits/chosen": -0.49608877301216125, "logits/rejected": -0.48684701323509216, "logps/chosen": -95.35980224609375, "logps/rejected": -83.60261535644531, "loss": 0.641, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.63970547914505, "rewards/margins": 0.11973651498556137, "rewards/rejected": -0.7594420313835144, "step": 5385 }, { "epoch": 0.8990488903721008, "grad_norm": 12.856501579284668, "learning_rate": 1.1009511096278993e-05, "logits/chosen": -0.5853674411773682, "logits/rejected": -0.5963168740272522, "logps/chosen": -68.63239288330078, "logps/rejected": -80.4722671508789, "loss": 0.6658, "rewards/accuracies": 1.0, "rewards/chosen": -0.6592854857444763, "rewards/margins": 1.1261789798736572, "rewards/rejected": -1.7854646444320679, "step": 5388 }, { "epoch": 0.8995494743867846, "grad_norm": 21.610563278198242, "learning_rate": 1.1004505256132155e-05, "logits/chosen": -0.6587416529655457, "logits/rejected": -0.6390979290008545, "logps/chosen": -130.7488555908203, "logps/rejected": -98.79383087158203, "loss": 0.405, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9345352053642273, "rewards/margins": 1.6640609502792358, "rewards/rejected": -2.5985963344573975, "step": 5391 }, { "epoch": 0.9000500584014683, "grad_norm": 8.122838973999023, "learning_rate": 1.0999499415985318e-05, "logits/chosen": -0.53277188539505, "logits/rejected": -0.5035920143127441, "logps/chosen": -83.4658432006836, "logps/rejected": -158.26219177246094, "loss": 0.3109, "rewards/accuracies": 1.0, "rewards/chosen": -0.09391316026449203, "rewards/margins": 3.19511342048645, "rewards/rejected": -3.289026975631714, "step": 5394 }, { "epoch": 0.9005506424161521, "grad_norm": 14.959997177124023, "learning_rate": 1.0994493575838478e-05, "logits/chosen": -0.552044689655304, "logits/rejected": -0.5060982704162598, "logps/chosen": -190.67262268066406, "logps/rejected": -167.0072479248047, "loss": 0.6151, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.2258590459823608, "rewards/margins": -0.5549963116645813, "rewards/rejected": -0.6708626747131348, "step": 5397 }, { "epoch": 0.901051226430836, "grad_norm": 23.678016662597656, "learning_rate": 1.0989487735691642e-05, "logits/chosen": -0.5823565125465393, "logits/rejected": -0.6264680027961731, "logps/chosen": -70.27828216552734, "logps/rejected": -113.88177490234375, "loss": 0.566, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.06471113115549088, "rewards/margins": 2.4556076526641846, "rewards/rejected": -2.5203187465667725, "step": 5400 }, { "epoch": 0.901051226430836, "eval_logits/chosen": -0.6237972378730774, "eval_logits/rejected": -0.6371889114379883, "eval_logps/chosen": -85.97547149658203, "eval_logps/rejected": -113.56649017333984, "eval_loss": 0.5671418905258179, "eval_rewards/accuracies": 0.7297297120094299, "eval_rewards/chosen": -0.5717741250991821, "eval_rewards/margins": 1.4720572233200073, "eval_rewards/rejected": -2.0438315868377686, "eval_runtime": 346.1695, "eval_samples_per_second": 7.696, "eval_steps_per_second": 1.924, "step": 5400 }, { "epoch": 0.9015518104455198, "grad_norm": 12.130939483642578, "learning_rate": 1.0984481895544804e-05, "logits/chosen": -0.7026426792144775, "logits/rejected": -0.7235104441642761, "logps/chosen": -66.02165985107422, "logps/rejected": -118.3082275390625, "loss": 0.3409, "rewards/accuracies": 1.0, "rewards/chosen": -0.2717791795730591, "rewards/margins": 1.888107180595398, "rewards/rejected": -2.159886360168457, "step": 5403 }, { "epoch": 0.9020523944602036, "grad_norm": 11.802750587463379, "learning_rate": 1.0979476055397965e-05, "logits/chosen": -0.5886392593383789, "logits/rejected": -0.6533645987510681, "logps/chosen": -81.00809478759766, "logps/rejected": -137.98158264160156, "loss": 0.5183, "rewards/accuracies": 1.0, "rewards/chosen": 0.2057039588689804, "rewards/margins": 1.926371693611145, "rewards/rejected": -1.720667839050293, "step": 5406 }, { "epoch": 0.9025529784748874, "grad_norm": 3.364377498626709, "learning_rate": 1.0974470215251127e-05, "logits/chosen": -0.6468489170074463, "logits/rejected": -0.683965265750885, "logps/chosen": -54.907772064208984, "logps/rejected": -111.80084228515625, "loss": 0.2853, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.26674339175224304, "rewards/margins": 3.8060905933380127, "rewards/rejected": -4.072834014892578, "step": 5409 }, { "epoch": 0.9030535624895711, "grad_norm": 31.096416473388672, "learning_rate": 1.0969464375104289e-05, "logits/chosen": -0.7426639199256897, "logits/rejected": -0.788421094417572, "logps/chosen": -79.25226593017578, "logps/rejected": -90.8368911743164, "loss": 0.9776, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.126882314682007, "rewards/margins": -1.426831841468811, "rewards/rejected": -0.7000501155853271, "step": 5412 }, { "epoch": 0.9035541465042549, "grad_norm": 10.020857810974121, "learning_rate": 1.0964458534957453e-05, "logits/chosen": -0.5788666605949402, "logits/rejected": -0.6041801571846008, "logps/chosen": -142.99717712402344, "logps/rejected": -157.6408233642578, "loss": 0.2445, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.08566919714212418, "rewards/margins": 1.9557405710220337, "rewards/rejected": -2.041409730911255, "step": 5415 }, { "epoch": 0.9040547305189388, "grad_norm": 33.826751708984375, "learning_rate": 1.0959452694810613e-05, "logits/chosen": -0.7032422423362732, "logits/rejected": -0.7117488980293274, "logps/chosen": -60.48248291015625, "logps/rejected": -102.77557373046875, "loss": 0.3387, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9237132668495178, "rewards/margins": 0.8451229929924011, "rewards/rejected": -1.7688363790512085, "step": 5418 }, { "epoch": 0.9045553145336226, "grad_norm": 24.85650634765625, "learning_rate": 1.0954446854663776e-05, "logits/chosen": -0.8032479286193848, "logits/rejected": -0.7862002849578857, "logps/chosen": -101.32257080078125, "logps/rejected": -89.46663665771484, "loss": 0.2137, "rewards/accuracies": 1.0, "rewards/chosen": -0.39331158995628357, "rewards/margins": 1.1820377111434937, "rewards/rejected": -1.5753493309020996, "step": 5421 }, { "epoch": 0.9050558985483064, "grad_norm": 10.149628639221191, "learning_rate": 1.0949441014516938e-05, "logits/chosen": -0.7683329582214355, "logits/rejected": -0.8238511681556702, "logps/chosen": -72.66650390625, "logps/rejected": -170.4212646484375, "loss": 0.2745, "rewards/accuracies": 1.0, "rewards/chosen": 0.2537698745727539, "rewards/margins": 4.43013334274292, "rewards/rejected": -4.176363468170166, "step": 5424 }, { "epoch": 0.9055564825629902, "grad_norm": 14.426325798034668, "learning_rate": 1.0944435174370098e-05, "logits/chosen": -0.6625586152076721, "logits/rejected": -0.7144203782081604, "logps/chosen": -28.369415283203125, "logps/rejected": -128.22471618652344, "loss": 0.4888, "rewards/accuracies": 1.0, "rewards/chosen": 0.157108336687088, "rewards/margins": 3.160722017288208, "rewards/rejected": -3.0036137104034424, "step": 5427 }, { "epoch": 0.9060570665776739, "grad_norm": 36.61492156982422, "learning_rate": 1.0939429334223262e-05, "logits/chosen": -0.6839638352394104, "logits/rejected": -0.6904220581054688, "logps/chosen": -97.64521026611328, "logps/rejected": -115.27163696289062, "loss": 0.6988, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2246266603469849, "rewards/margins": 0.27083539962768555, "rewards/rejected": -1.4954618215560913, "step": 5430 }, { "epoch": 0.9065576505923577, "grad_norm": 10.141923904418945, "learning_rate": 1.0934423494076423e-05, "logits/chosen": -0.5097419619560242, "logits/rejected": -0.5644739270210266, "logps/chosen": -122.34134674072266, "logps/rejected": -135.03688049316406, "loss": 0.2267, "rewards/accuracies": 1.0, "rewards/chosen": -0.6015269160270691, "rewards/margins": 1.3498096466064453, "rewards/rejected": -1.9513365030288696, "step": 5433 }, { "epoch": 0.9070582346070416, "grad_norm": 6.081590175628662, "learning_rate": 1.0929417653929587e-05, "logits/chosen": -0.5407623648643494, "logits/rejected": -0.5947036147117615, "logps/chosen": -34.105262756347656, "logps/rejected": -77.22212982177734, "loss": 0.3976, "rewards/accuracies": 1.0, "rewards/chosen": 0.09384835511445999, "rewards/margins": 1.5757646560668945, "rewards/rejected": -1.481916069984436, "step": 5436 }, { "epoch": 0.9075588186217254, "grad_norm": 34.20344543457031, "learning_rate": 1.0924411813782747e-05, "logits/chosen": -0.6837905049324036, "logits/rejected": -0.6269432902336121, "logps/chosen": -113.78617095947266, "logps/rejected": -73.42179107666016, "loss": 0.8427, "rewards/accuracies": 0.0, "rewards/chosen": -2.235633134841919, "rewards/margins": -1.3649142980575562, "rewards/rejected": -0.870718777179718, "step": 5439 }, { "epoch": 0.9080594026364092, "grad_norm": 29.242101669311523, "learning_rate": 1.091940597363591e-05, "logits/chosen": -0.6259782910346985, "logits/rejected": -0.6368451118469238, "logps/chosen": -112.33234405517578, "logps/rejected": -124.2994155883789, "loss": 0.4419, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.118028163909912, "rewards/margins": 0.5916172862052917, "rewards/rejected": -2.7096455097198486, "step": 5442 }, { "epoch": 0.9085599866510929, "grad_norm": 28.07090950012207, "learning_rate": 1.0914400133489072e-05, "logits/chosen": -0.7033504843711853, "logits/rejected": -0.7122356295585632, "logps/chosen": -86.2286376953125, "logps/rejected": -118.693115234375, "loss": 0.4762, "rewards/accuracies": 1.0, "rewards/chosen": -0.7635664939880371, "rewards/margins": 2.7519891262054443, "rewards/rejected": -3.5155556201934814, "step": 5445 }, { "epoch": 0.9090605706657767, "grad_norm": 13.810389518737793, "learning_rate": 1.0909394293342232e-05, "logits/chosen": -0.5632876753807068, "logits/rejected": -0.5843488574028015, "logps/chosen": -82.42378997802734, "logps/rejected": -129.25547790527344, "loss": 0.7164, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.401224970817566, "rewards/margins": 0.587459146976471, "rewards/rejected": -1.9886841773986816, "step": 5448 }, { "epoch": 0.9095611546804605, "grad_norm": 19.00213623046875, "learning_rate": 1.0904388453195396e-05, "logits/chosen": -0.6882684826850891, "logits/rejected": -0.6895648837089539, "logps/chosen": -136.6562957763672, "logps/rejected": -143.5079803466797, "loss": 0.3836, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.6384364366531372, "rewards/margins": 0.8521387577056885, "rewards/rejected": -2.490575075149536, "step": 5451 }, { "epoch": 0.9100617386951443, "grad_norm": 12.04398250579834, "learning_rate": 1.0899382613048558e-05, "logits/chosen": -0.5259541869163513, "logits/rejected": -0.558539867401123, "logps/chosen": -93.3355941772461, "logps/rejected": -87.62298583984375, "loss": 0.5167, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.9122587442398071, "rewards/margins": 0.1978517770767212, "rewards/rejected": -2.1101105213165283, "step": 5454 }, { "epoch": 0.9105623227098282, "grad_norm": 25.27430534362793, "learning_rate": 1.0894376772901721e-05, "logits/chosen": -0.6008908152580261, "logits/rejected": -0.6623331904411316, "logps/chosen": -55.36851501464844, "logps/rejected": -164.12469482421875, "loss": 0.3196, "rewards/accuracies": 1.0, "rewards/chosen": 0.5789665579795837, "rewards/margins": 6.186411380767822, "rewards/rejected": -5.607444763183594, "step": 5457 }, { "epoch": 0.911062906724512, "grad_norm": 10.117852210998535, "learning_rate": 1.0889370932754881e-05, "logits/chosen": -0.534233570098877, "logits/rejected": -0.5442509055137634, "logps/chosen": -76.75177001953125, "logps/rejected": -98.1544189453125, "loss": 0.1772, "rewards/accuracies": 1.0, "rewards/chosen": -0.5960361361503601, "rewards/margins": 2.01312518119812, "rewards/rejected": -2.609161138534546, "step": 5460 }, { "epoch": 0.9115634907391957, "grad_norm": 23.286283493041992, "learning_rate": 1.0884365092608045e-05, "logits/chosen": -0.7541413307189941, "logits/rejected": -0.7835946083068848, "logps/chosen": -92.88431549072266, "logps/rejected": -119.53754425048828, "loss": 0.756, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4279645681381226, "rewards/margins": 0.8928284645080566, "rewards/rejected": -2.3207929134368896, "step": 5463 }, { "epoch": 0.9120640747538795, "grad_norm": 13.716800689697266, "learning_rate": 1.0879359252461206e-05, "logits/chosen": -0.7637357711791992, "logits/rejected": -0.7388032078742981, "logps/chosen": -60.45195388793945, "logps/rejected": -60.04587936401367, "loss": 0.3555, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9411842823028564, "rewards/margins": 0.8933179974555969, "rewards/rejected": -1.8345023393630981, "step": 5466 }, { "epoch": 0.9125646587685633, "grad_norm": 81.53092193603516, "learning_rate": 1.0874353412314367e-05, "logits/chosen": -0.7298396229743958, "logits/rejected": -0.736894428730011, "logps/chosen": -55.5102424621582, "logps/rejected": -77.63288116455078, "loss": 0.9665, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3426784574985504, "rewards/margins": 1.5059293508529663, "rewards/rejected": -1.8486076593399048, "step": 5469 }, { "epoch": 0.9130652427832471, "grad_norm": 9.386821746826172, "learning_rate": 1.086934757216753e-05, "logits/chosen": -0.6065738201141357, "logits/rejected": -0.6438892483711243, "logps/chosen": -45.273372650146484, "logps/rejected": -104.07485961914062, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 0.3174515664577484, "rewards/margins": 3.1974728107452393, "rewards/rejected": -2.880021333694458, "step": 5472 }, { "epoch": 0.913565826797931, "grad_norm": 7.961609363555908, "learning_rate": 1.0864341732020692e-05, "logits/chosen": -0.5113217234611511, "logits/rejected": -0.5183121562004089, "logps/chosen": -82.4371109008789, "logps/rejected": -91.96366119384766, "loss": 0.3673, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4582034647464752, "rewards/margins": 1.3708995580673218, "rewards/rejected": -1.8291029930114746, "step": 5475 }, { "epoch": 0.9140664108126147, "grad_norm": 36.679954528808594, "learning_rate": 1.0859335891873855e-05, "logits/chosen": -0.5654900670051575, "logits/rejected": -0.613146960735321, "logps/chosen": -69.07640838623047, "logps/rejected": -102.91207122802734, "loss": 0.8341, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.9813704490661621, "rewards/margins": 0.31878453493118286, "rewards/rejected": -1.3001550436019897, "step": 5478 }, { "epoch": 0.9145669948272985, "grad_norm": 21.483518600463867, "learning_rate": 1.0854330051727015e-05, "logits/chosen": -0.5787302851676941, "logits/rejected": -0.6225516200065613, "logps/chosen": -50.384918212890625, "logps/rejected": -91.01042938232422, "loss": 0.3524, "rewards/accuracies": 1.0, "rewards/chosen": 0.1106569766998291, "rewards/margins": 1.9095362424850464, "rewards/rejected": -1.7988791465759277, "step": 5481 }, { "epoch": 0.9150675788419823, "grad_norm": 29.98276710510254, "learning_rate": 1.0849324211580177e-05, "logits/chosen": -0.75628262758255, "logits/rejected": -0.7337880730628967, "logps/chosen": -65.97211456298828, "logps/rejected": -99.8214111328125, "loss": 0.3278, "rewards/accuracies": 1.0, "rewards/chosen": 0.21626949310302734, "rewards/margins": 5.10138463973999, "rewards/rejected": -4.885115146636963, "step": 5484 }, { "epoch": 0.9155681628566661, "grad_norm": 27.548044204711914, "learning_rate": 1.084431837143334e-05, "logits/chosen": -0.6320534348487854, "logits/rejected": -0.5892935991287231, "logps/chosen": -175.9044189453125, "logps/rejected": -142.10369873046875, "loss": 0.8395, "rewards/accuracies": 1.0, "rewards/chosen": -0.7682492733001709, "rewards/margins": 2.017223358154297, "rewards/rejected": -2.785472869873047, "step": 5487 }, { "epoch": 0.9160687468713499, "grad_norm": 37.83418273925781, "learning_rate": 1.08393125312865e-05, "logits/chosen": -0.619720995426178, "logits/rejected": -0.6405501365661621, "logps/chosen": -74.4150161743164, "logps/rejected": -119.30428314208984, "loss": 0.5866, "rewards/accuracies": 1.0, "rewards/chosen": -1.0359302759170532, "rewards/margins": 3.3095529079437256, "rewards/rejected": -4.345483303070068, "step": 5490 }, { "epoch": 0.9165693308860337, "grad_norm": 21.770042419433594, "learning_rate": 1.0834306691139664e-05, "logits/chosen": -0.6844028830528259, "logits/rejected": -0.6759741902351379, "logps/chosen": -107.5562973022461, "logps/rejected": -104.90728759765625, "loss": 0.8466, "rewards/accuracies": 1.0, "rewards/chosen": -0.40340423583984375, "rewards/margins": 2.148340940475464, "rewards/rejected": -2.5517451763153076, "step": 5493 }, { "epoch": 0.9170699149007175, "grad_norm": 29.801368713378906, "learning_rate": 1.0829300850992826e-05, "logits/chosen": -0.6739822030067444, "logits/rejected": -0.6724605560302734, "logps/chosen": -91.51432037353516, "logps/rejected": -96.11832427978516, "loss": 0.502, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.3456923961639404, "rewards/margins": -0.10386208444833755, "rewards/rejected": -2.241830587387085, "step": 5496 }, { "epoch": 0.9175704989154013, "grad_norm": 7.143894195556641, "learning_rate": 1.082429501084599e-05, "logits/chosen": -0.5234878063201904, "logits/rejected": -0.6729373335838318, "logps/chosen": -121.65313720703125, "logps/rejected": -132.36314392089844, "loss": 0.4954, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.112626552581787, "rewards/margins": 0.5420424938201904, "rewards/rejected": -2.6546690464019775, "step": 5499 }, { "epoch": 0.9180710829300851, "grad_norm": 25.14337730407715, "learning_rate": 1.081928917069915e-05, "logits/chosen": -0.7033976912498474, "logits/rejected": -0.7063103318214417, "logps/chosen": -105.18941497802734, "logps/rejected": -116.95531463623047, "loss": 0.4436, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5422724485397339, "rewards/margins": 0.7546712756156921, "rewards/rejected": -2.2969436645507812, "step": 5502 }, { "epoch": 0.9185716669447689, "grad_norm": 25.379711151123047, "learning_rate": 1.0814283330552311e-05, "logits/chosen": -0.6982913017272949, "logits/rejected": -0.722802460193634, "logps/chosen": -99.11090850830078, "logps/rejected": -105.96035766601562, "loss": 0.616, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.0659948587417603, "rewards/margins": 0.6516634821891785, "rewards/rejected": -1.7176584005355835, "step": 5505 }, { "epoch": 0.9190722509594527, "grad_norm": 55.20647048950195, "learning_rate": 1.0809277490405475e-05, "logits/chosen": -0.6080247163772583, "logits/rejected": -0.5668296217918396, "logps/chosen": -67.32259368896484, "logps/rejected": -79.62158203125, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": -0.5050459504127502, "rewards/margins": 2.2141594886779785, "rewards/rejected": -2.719205617904663, "step": 5508 }, { "epoch": 0.9195728349741364, "grad_norm": 28.64873504638672, "learning_rate": 1.0804271650258635e-05, "logits/chosen": -0.496351957321167, "logits/rejected": -0.5727551579475403, "logps/chosen": -81.98036193847656, "logps/rejected": -117.59130859375, "loss": 0.6816, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9781039953231812, "rewards/margins": 0.7944602370262146, "rewards/rejected": -2.772564172744751, "step": 5511 }, { "epoch": 0.9200734189888203, "grad_norm": 22.911182403564453, "learning_rate": 1.0799265810111799e-05, "logits/chosen": -0.3865449130535126, "logits/rejected": -0.38320398330688477, "logps/chosen": -105.7677993774414, "logps/rejected": -166.32565307617188, "loss": 0.5844, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.440479040145874, "rewards/margins": -0.015315691940486431, "rewards/rejected": -2.4251632690429688, "step": 5514 }, { "epoch": 0.9205740030035041, "grad_norm": 8.143973350524902, "learning_rate": 1.079425996996496e-05, "logits/chosen": -0.47185707092285156, "logits/rejected": -0.5184386968612671, "logps/chosen": -92.41934204101562, "logps/rejected": -131.2983856201172, "loss": 0.2328, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.11021081358194351, "rewards/margins": 1.2422372102737427, "rewards/rejected": -1.1320264339447021, "step": 5517 }, { "epoch": 0.9210745870181879, "grad_norm": 42.505958557128906, "learning_rate": 1.0789254129818124e-05, "logits/chosen": -0.5944816470146179, "logits/rejected": -0.6002377271652222, "logps/chosen": -62.25401306152344, "logps/rejected": -110.87100982666016, "loss": 0.7406, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7323775291442871, "rewards/margins": 2.40053653717041, "rewards/rejected": -3.132913827896118, "step": 5520 }, { "epoch": 0.9215751710328717, "grad_norm": 23.861711502075195, "learning_rate": 1.0784248289671284e-05, "logits/chosen": -0.6913261413574219, "logits/rejected": -0.7155334949493408, "logps/chosen": -117.0892105102539, "logps/rejected": -151.07220458984375, "loss": 0.5987, "rewards/accuracies": 1.0, "rewards/chosen": -0.25279417634010315, "rewards/margins": 2.243306875228882, "rewards/rejected": -2.496100902557373, "step": 5523 }, { "epoch": 0.9220757550475555, "grad_norm": 30.782785415649414, "learning_rate": 1.0779242449524446e-05, "logits/chosen": -0.4637725055217743, "logits/rejected": -0.539423406124115, "logps/chosen": -55.68259811401367, "logps/rejected": -162.6209716796875, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": -1.1342536211013794, "rewards/margins": 4.670748233795166, "rewards/rejected": -5.805002212524414, "step": 5526 }, { "epoch": 0.9225763390622392, "grad_norm": 21.408905029296875, "learning_rate": 1.077423660937761e-05, "logits/chosen": -0.6231053471565247, "logits/rejected": -0.6957612633705139, "logps/chosen": -63.57149887084961, "logps/rejected": -156.89080810546875, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": -0.6433630585670471, "rewards/margins": 4.7038445472717285, "rewards/rejected": -5.347207546234131, "step": 5529 }, { "epoch": 0.9230769230769231, "grad_norm": 31.98724365234375, "learning_rate": 1.076923076923077e-05, "logits/chosen": -0.6097513437271118, "logits/rejected": -0.6081416606903076, "logps/chosen": -37.49325180053711, "logps/rejected": -70.8251724243164, "loss": 0.3763, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.26087814569473267, "rewards/margins": 1.9283767938613892, "rewards/rejected": -1.6674989461898804, "step": 5532 }, { "epoch": 0.9235775070916069, "grad_norm": 17.25590705871582, "learning_rate": 1.0764224929083933e-05, "logits/chosen": -0.6139641404151917, "logits/rejected": -0.6626401543617249, "logps/chosen": -55.55269241333008, "logps/rejected": -93.562255859375, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": -0.6159115433692932, "rewards/margins": 2.544895648956299, "rewards/rejected": -3.1608073711395264, "step": 5535 }, { "epoch": 0.9240780911062907, "grad_norm": 58.6855583190918, "learning_rate": 1.0759219088937095e-05, "logits/chosen": -0.8430252075195312, "logits/rejected": -0.848371684551239, "logps/chosen": -94.54141998291016, "logps/rejected": -129.5771942138672, "loss": 0.4115, "rewards/accuracies": 1.0, "rewards/chosen": -0.9388254284858704, "rewards/margins": 1.9744302034378052, "rewards/rejected": -2.9132556915283203, "step": 5538 }, { "epoch": 0.9245786751209745, "grad_norm": 17.707286834716797, "learning_rate": 1.0754213248790255e-05, "logits/chosen": -0.5539646744728088, "logits/rejected": -0.5558473467826843, "logps/chosen": -138.60585021972656, "logps/rejected": -165.9581298828125, "loss": 0.5967, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.94390606880188, "rewards/margins": -0.2617870271205902, "rewards/rejected": -3.682119369506836, "step": 5541 }, { "epoch": 0.9250792591356582, "grad_norm": 28.428550720214844, "learning_rate": 1.0749207408643418e-05, "logits/chosen": -0.7123034596443176, "logits/rejected": -0.7578582763671875, "logps/chosen": -65.01754760742188, "logps/rejected": -131.36349487304688, "loss": 0.4499, "rewards/accuracies": 1.0, "rewards/chosen": 0.2196681946516037, "rewards/margins": 3.8839523792266846, "rewards/rejected": -3.6642839908599854, "step": 5544 }, { "epoch": 0.925579843150342, "grad_norm": 9.617027282714844, "learning_rate": 1.074420156849658e-05, "logits/chosen": -0.8679804801940918, "logits/rejected": -0.8382328152656555, "logps/chosen": -122.2378921508789, "logps/rejected": -94.53557586669922, "loss": 0.2365, "rewards/accuracies": 1.0, "rewards/chosen": -2.013190984725952, "rewards/margins": 1.3497289419174194, "rewards/rejected": -3.362919807434082, "step": 5547 }, { "epoch": 0.9260804271650258, "grad_norm": 23.047624588012695, "learning_rate": 1.0739195728349743e-05, "logits/chosen": -0.6375316977500916, "logits/rejected": -0.7156505584716797, "logps/chosen": -84.38166046142578, "logps/rejected": -163.1240997314453, "loss": 0.4416, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.059434175491333, "rewards/margins": 1.9227021932601929, "rewards/rejected": -3.9821364879608154, "step": 5550 }, { "epoch": 0.9265810111797097, "grad_norm": 28.856897354125977, "learning_rate": 1.0734189888202904e-05, "logits/chosen": -0.38850364089012146, "logits/rejected": -0.383632093667984, "logps/chosen": -62.1235466003418, "logps/rejected": -69.44121551513672, "loss": 0.8985, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.856675922870636, "rewards/margins": 2.7567222118377686, "rewards/rejected": -3.6133975982666016, "step": 5553 }, { "epoch": 0.9270815951943935, "grad_norm": 16.876256942749023, "learning_rate": 1.0729184048056067e-05, "logits/chosen": -0.6285013556480408, "logits/rejected": -0.6633102297782898, "logps/chosen": -120.62310791015625, "logps/rejected": -182.75865173339844, "loss": 0.3556, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.6594321727752686, "rewards/margins": 2.998350143432617, "rewards/rejected": -5.657782077789307, "step": 5556 }, { "epoch": 0.9275821792090773, "grad_norm": 23.632482528686523, "learning_rate": 1.0724178207909229e-05, "logits/chosen": -0.8808754086494446, "logits/rejected": -0.8356514573097229, "logps/chosen": -111.2391357421875, "logps/rejected": -120.5058822631836, "loss": 0.5441, "rewards/accuracies": 1.0, "rewards/chosen": -1.7538000345230103, "rewards/margins": 2.652615547180176, "rewards/rejected": -4.4064154624938965, "step": 5559 }, { "epoch": 0.928082763223761, "grad_norm": 1.2301015853881836, "learning_rate": 1.0719172367762389e-05, "logits/chosen": -0.6025797724723816, "logits/rejected": -0.6723361611366272, "logps/chosen": -56.15395736694336, "logps/rejected": -148.9537353515625, "loss": 0.6219, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.18869149684906, "rewards/margins": 2.2009217739105225, "rewards/rejected": -3.389613151550293, "step": 5562 }, { "epoch": 0.9285833472384448, "grad_norm": 40.71856689453125, "learning_rate": 1.0714166527615552e-05, "logits/chosen": -0.6533458828926086, "logits/rejected": -0.6608603596687317, "logps/chosen": -126.59698486328125, "logps/rejected": -164.10353088378906, "loss": 0.4143, "rewards/accuracies": 1.0, "rewards/chosen": -1.6979371309280396, "rewards/margins": 4.062926769256592, "rewards/rejected": -5.7608642578125, "step": 5565 }, { "epoch": 0.9290839312531286, "grad_norm": 32.519378662109375, "learning_rate": 1.0709160687468714e-05, "logits/chosen": -0.8354608416557312, "logits/rejected": -0.8198387622833252, "logps/chosen": -113.06034088134766, "logps/rejected": -79.51836395263672, "loss": 1.4082, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8852524161338806, "rewards/margins": 0.6354420781135559, "rewards/rejected": -1.520694375038147, "step": 5568 }, { "epoch": 0.9295845152678125, "grad_norm": 19.827829360961914, "learning_rate": 1.0704154847321878e-05, "logits/chosen": -0.6461171507835388, "logits/rejected": -0.6137771010398865, "logps/chosen": -72.47045135498047, "logps/rejected": -79.62896728515625, "loss": 0.7609, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5960538983345032, "rewards/margins": 0.7376298308372498, "rewards/rejected": -1.3336838483810425, "step": 5571 }, { "epoch": 0.9300850992824963, "grad_norm": 3.817145586013794, "learning_rate": 1.0699149007175038e-05, "logits/chosen": -0.698378324508667, "logits/rejected": -0.6473050713539124, "logps/chosen": -103.2818832397461, "logps/rejected": -95.09366607666016, "loss": 0.2264, "rewards/accuracies": 1.0, "rewards/chosen": -0.15468163788318634, "rewards/margins": 1.7846208810806274, "rewards/rejected": -1.9393028020858765, "step": 5574 }, { "epoch": 0.93058568329718, "grad_norm": 9.520485877990723, "learning_rate": 1.0694143167028201e-05, "logits/chosen": -0.7882154583930969, "logits/rejected": -0.7723803520202637, "logps/chosen": -82.01349639892578, "logps/rejected": -119.11286163330078, "loss": 0.3928, "rewards/accuracies": 1.0, "rewards/chosen": -1.121904969215393, "rewards/margins": 4.108085632324219, "rewards/rejected": -5.229990482330322, "step": 5577 }, { "epoch": 0.9310862673118638, "grad_norm": 39.13595962524414, "learning_rate": 1.0689137326881363e-05, "logits/chosen": -0.6671347618103027, "logits/rejected": -0.6526820659637451, "logps/chosen": -85.87662506103516, "logps/rejected": -128.9417266845703, "loss": 0.5053, "rewards/accuracies": 1.0, "rewards/chosen": -1.9774256944656372, "rewards/margins": 3.3368358612060547, "rewards/rejected": -5.3142619132995605, "step": 5580 }, { "epoch": 0.9315868513265476, "grad_norm": 19.42824363708496, "learning_rate": 1.0684131486734523e-05, "logits/chosen": -0.7409420013427734, "logits/rejected": -0.7832544445991516, "logps/chosen": -95.03375244140625, "logps/rejected": -116.73617553710938, "loss": 0.6187, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9940065145492554, "rewards/margins": 1.1230409145355225, "rewards/rejected": -3.1170475482940674, "step": 5583 }, { "epoch": 0.9320874353412314, "grad_norm": 32.51823806762695, "learning_rate": 1.0679125646587687e-05, "logits/chosen": -0.7914358973503113, "logits/rejected": -0.7891551852226257, "logps/chosen": -93.10428619384766, "logps/rejected": -88.9514389038086, "loss": 0.6768, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.551257610321045, "rewards/margins": 0.13092423975467682, "rewards/rejected": -1.6821818351745605, "step": 5586 }, { "epoch": 0.9325880193559153, "grad_norm": 15.228116989135742, "learning_rate": 1.0674119806440849e-05, "logits/chosen": -0.6361737251281738, "logits/rejected": -0.5552834868431091, "logps/chosen": -119.01666259765625, "logps/rejected": -76.18901062011719, "loss": 0.6088, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1418617963790894, "rewards/margins": -0.09883657842874527, "rewards/rejected": -1.0430251359939575, "step": 5589 }, { "epoch": 0.9330886033705991, "grad_norm": 9.202020645141602, "learning_rate": 1.0669113966294012e-05, "logits/chosen": -0.6349454522132874, "logits/rejected": -0.6438732743263245, "logps/chosen": -69.88956451416016, "logps/rejected": -85.63191986083984, "loss": 0.294, "rewards/accuracies": 0.0, "rewards/chosen": -1.343010425567627, "rewards/margins": -0.44867992401123047, "rewards/rejected": -0.8943305015563965, "step": 5592 }, { "epoch": 0.9335891873852828, "grad_norm": 30.868568420410156, "learning_rate": 1.0664108126147172e-05, "logits/chosen": -0.7586324214935303, "logits/rejected": -0.6949176788330078, "logps/chosen": -117.72447967529297, "logps/rejected": -121.9999771118164, "loss": 0.3414, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.525567054748535, "rewards/margins": 2.7192776203155518, "rewards/rejected": -5.244844913482666, "step": 5595 }, { "epoch": 0.9340897713999666, "grad_norm": 61.35625076293945, "learning_rate": 1.0659102286000336e-05, "logits/chosen": -0.5663193464279175, "logits/rejected": -0.5535247325897217, "logps/chosen": -112.98722076416016, "logps/rejected": -67.54228973388672, "loss": 1.471, "rewards/accuracies": 0.0, "rewards/chosen": -3.0514633655548096, "rewards/margins": -2.883775472640991, "rewards/rejected": -0.1676882654428482, "step": 5598 }, { "epoch": 0.9345903554146504, "grad_norm": 15.907732963562012, "learning_rate": 1.0654096445853497e-05, "logits/chosen": -0.6227132678031921, "logits/rejected": -0.6990304589271545, "logps/chosen": -76.15198516845703, "logps/rejected": -120.69158172607422, "loss": 0.5035, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4695554971694946, "rewards/margins": 1.9663686752319336, "rewards/rejected": -3.4359242916107178, "step": 5601 }, { "epoch": 0.9350909394293342, "grad_norm": 20.019058227539062, "learning_rate": 1.0649090605706658e-05, "logits/chosen": -0.7254145741462708, "logits/rejected": -0.7795384526252747, "logps/chosen": -49.5029411315918, "logps/rejected": -137.81439208984375, "loss": 0.2776, "rewards/accuracies": 1.0, "rewards/chosen": -0.2835797965526581, "rewards/margins": 3.3222649097442627, "rewards/rejected": -3.605844736099243, "step": 5604 }, { "epoch": 0.935591523444018, "grad_norm": 42.0247802734375, "learning_rate": 1.0644084765559821e-05, "logits/chosen": -0.4800817668437958, "logits/rejected": -0.4551752507686615, "logps/chosen": -86.31488800048828, "logps/rejected": -97.4597396850586, "loss": 0.8243, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1032065153121948, "rewards/margins": 1.2154165506362915, "rewards/rejected": -2.3186233043670654, "step": 5607 }, { "epoch": 0.9360921074587019, "grad_norm": 52.09622573852539, "learning_rate": 1.0639078925412983e-05, "logits/chosen": -0.6854901313781738, "logits/rejected": -0.6030774116516113, "logps/chosen": -112.23468017578125, "logps/rejected": -103.18619537353516, "loss": 0.9084, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.5528879165649414, "rewards/margins": 0.9762592315673828, "rewards/rejected": -3.529147148132324, "step": 5610 }, { "epoch": 0.9365926914733856, "grad_norm": 41.02309799194336, "learning_rate": 1.0634073085266146e-05, "logits/chosen": -0.8039348125457764, "logits/rejected": -0.8131715655326843, "logps/chosen": -111.91363525390625, "logps/rejected": -142.87843322753906, "loss": 0.8448, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.352773904800415, "rewards/margins": 2.08901047706604, "rewards/rejected": -4.441784381866455, "step": 5613 }, { "epoch": 0.9370932754880694, "grad_norm": 19.908790588378906, "learning_rate": 1.0629067245119306e-05, "logits/chosen": -0.687576949596405, "logits/rejected": -0.7124568819999695, "logps/chosen": -80.86971282958984, "logps/rejected": -102.167236328125, "loss": 0.455, "rewards/accuracies": 1.0, "rewards/chosen": -0.399560809135437, "rewards/margins": 2.3865997791290283, "rewards/rejected": -2.786160469055176, "step": 5616 }, { "epoch": 0.9375938595027532, "grad_norm": 12.29057502746582, "learning_rate": 1.0624061404972468e-05, "logits/chosen": -0.6228405833244324, "logits/rejected": -0.7080097794532776, "logps/chosen": -68.95291137695312, "logps/rejected": -136.32652282714844, "loss": 0.6178, "rewards/accuracies": 1.0, "rewards/chosen": -2.2957370281219482, "rewards/margins": 2.352947473526001, "rewards/rejected": -4.648684501647949, "step": 5619 }, { "epoch": 0.938094443517437, "grad_norm": 14.704208374023438, "learning_rate": 1.0619055564825632e-05, "logits/chosen": -0.630001962184906, "logits/rejected": -0.6984277367591858, "logps/chosen": -36.1566047668457, "logps/rejected": -122.47583770751953, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 0.0838097631931305, "rewards/margins": 2.9425222873687744, "rewards/rejected": -2.8587124347686768, "step": 5622 }, { "epoch": 0.9385950275321208, "grad_norm": 23.807178497314453, "learning_rate": 1.0614049724678792e-05, "logits/chosen": -0.7221109867095947, "logits/rejected": -0.6824519634246826, "logps/chosen": -109.62383270263672, "logps/rejected": -134.38389587402344, "loss": 0.2856, "rewards/accuracies": 1.0, "rewards/chosen": -0.6562768220901489, "rewards/margins": 2.7978477478027344, "rewards/rejected": -3.4541244506835938, "step": 5625 }, { "epoch": 0.9390956115468047, "grad_norm": 8.766377449035645, "learning_rate": 1.0609043884531955e-05, "logits/chosen": -0.5190708041191101, "logits/rejected": -0.46744203567504883, "logps/chosen": -53.780364990234375, "logps/rejected": -56.38350296020508, "loss": 0.3946, "rewards/accuracies": 1.0, "rewards/chosen": -0.5338215231895447, "rewards/margins": 1.0156275033950806, "rewards/rejected": -1.54944908618927, "step": 5628 }, { "epoch": 0.9395961955614884, "grad_norm": 45.98735046386719, "learning_rate": 1.0604038044385117e-05, "logits/chosen": -0.9410316348075867, "logits/rejected": -0.9025008082389832, "logps/chosen": -118.5510482788086, "logps/rejected": -83.5490951538086, "loss": 0.8396, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3329681158065796, "rewards/margins": 1.2244716882705688, "rewards/rejected": -2.5574400424957275, "step": 5631 }, { "epoch": 0.9400967795761722, "grad_norm": 19.06101417541504, "learning_rate": 1.059903220423828e-05, "logits/chosen": -0.5625102519989014, "logits/rejected": -0.6458279490470886, "logps/chosen": -70.05862426757812, "logps/rejected": -156.39678955078125, "loss": 1.1169, "rewards/accuracies": 1.0, "rewards/chosen": -1.9431883096694946, "rewards/margins": 3.8774850368499756, "rewards/rejected": -5.82067346572876, "step": 5634 }, { "epoch": 0.940597363590856, "grad_norm": 18.833337783813477, "learning_rate": 1.059402636409144e-05, "logits/chosen": -0.388261079788208, "logits/rejected": -0.45537319779396057, "logps/chosen": -87.08344268798828, "logps/rejected": -107.29083251953125, "loss": 0.6452, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9569755792617798, "rewards/margins": 0.35972046852111816, "rewards/rejected": -2.3166961669921875, "step": 5637 }, { "epoch": 0.9410979476055398, "grad_norm": 29.81479263305664, "learning_rate": 1.0589020523944602e-05, "logits/chosen": -0.7085742950439453, "logits/rejected": -0.7442991137504578, "logps/chosen": -85.29682159423828, "logps/rejected": -98.15350341796875, "loss": 1.459, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.8294341564178467, "rewards/margins": -1.4936670064926147, "rewards/rejected": -2.3357672691345215, "step": 5640 }, { "epoch": 0.9415985316202236, "grad_norm": 31.886428833007812, "learning_rate": 1.0584014683797766e-05, "logits/chosen": -0.5915260910987854, "logits/rejected": -0.5806593894958496, "logps/chosen": -86.1102523803711, "logps/rejected": -100.44734954833984, "loss": 0.4432, "rewards/accuracies": 1.0, "rewards/chosen": -0.8133783340454102, "rewards/margins": 1.750518798828125, "rewards/rejected": -2.563897132873535, "step": 5643 }, { "epoch": 0.9420991156349074, "grad_norm": 27.1453857421875, "learning_rate": 1.0579008843650926e-05, "logits/chosen": -0.7008989453315735, "logits/rejected": -0.6764888763427734, "logps/chosen": -95.9089584350586, "logps/rejected": -111.1478271484375, "loss": 0.5136, "rewards/accuracies": 1.0, "rewards/chosen": -0.9413856863975525, "rewards/margins": 3.120652437210083, "rewards/rejected": -4.062037944793701, "step": 5646 }, { "epoch": 0.9425996996495912, "grad_norm": 20.332216262817383, "learning_rate": 1.057400300350409e-05, "logits/chosen": -0.6401054263114929, "logits/rejected": -0.5989338755607605, "logps/chosen": -80.48332977294922, "logps/rejected": -101.65396118164062, "loss": 0.2491, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0216363668441772, "rewards/margins": 1.0310648679733276, "rewards/rejected": -2.052701234817505, "step": 5649 }, { "epoch": 0.943100283664275, "grad_norm": 77.74995422363281, "learning_rate": 1.0568997163357251e-05, "logits/chosen": -0.7967925667762756, "logits/rejected": -0.7783470749855042, "logps/chosen": -79.76419830322266, "logps/rejected": -88.97322845458984, "loss": 0.6007, "rewards/accuracies": 1.0, "rewards/chosen": -1.3284131288528442, "rewards/margins": 2.225534677505493, "rewards/rejected": -3.553948163986206, "step": 5652 }, { "epoch": 0.9436008676789588, "grad_norm": 12.541274070739746, "learning_rate": 1.0563991323210415e-05, "logits/chosen": -0.7690091133117676, "logits/rejected": -0.810711681842804, "logps/chosen": -63.32084274291992, "logps/rejected": -72.97693634033203, "loss": 0.2957, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.853295922279358, "rewards/margins": -0.012747645378112793, "rewards/rejected": -1.8405481576919556, "step": 5655 }, { "epoch": 0.9441014516936426, "grad_norm": 3.6328790187835693, "learning_rate": 1.0558985483063575e-05, "logits/chosen": -0.8909973502159119, "logits/rejected": -0.8410583138465881, "logps/chosen": -157.83763122558594, "logps/rejected": -144.32493591308594, "loss": 0.4438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9150973558425903, "rewards/margins": 1.247390866279602, "rewards/rejected": -3.1624882221221924, "step": 5658 }, { "epoch": 0.9446020357083263, "grad_norm": 40.49634552001953, "learning_rate": 1.0553979642916737e-05, "logits/chosen": -0.5738058090209961, "logits/rejected": -0.642731249332428, "logps/chosen": -57.10017013549805, "logps/rejected": -160.90928649902344, "loss": 0.7036, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.4235374927520752, "rewards/margins": 2.4131994247436523, "rewards/rejected": -2.8367366790771484, "step": 5661 }, { "epoch": 0.9451026197230101, "grad_norm": 28.16604995727539, "learning_rate": 1.05489738027699e-05, "logits/chosen": -0.46891582012176514, "logits/rejected": -0.5084471702575684, "logps/chosen": -89.02974700927734, "logps/rejected": -147.45765686035156, "loss": 0.4647, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.465988874435425, "rewards/margins": 1.7235711812973022, "rewards/rejected": -4.189559459686279, "step": 5664 }, { "epoch": 0.945603203737694, "grad_norm": 33.20941162109375, "learning_rate": 1.054396796262306e-05, "logits/chosen": -0.5807009935379028, "logits/rejected": -0.5728015899658203, "logps/chosen": -111.5472412109375, "logps/rejected": -92.46603393554688, "loss": 1.2819, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4128074645996094, "rewards/margins": 0.2860759496688843, "rewards/rejected": -1.6988835334777832, "step": 5667 }, { "epoch": 0.9461037877523778, "grad_norm": 36.70108413696289, "learning_rate": 1.0538962122476224e-05, "logits/chosen": -0.7003076672554016, "logits/rejected": -0.7625824809074402, "logps/chosen": -50.84548568725586, "logps/rejected": -94.30664825439453, "loss": 0.7981, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0653775930404663, "rewards/margins": 1.2037595510482788, "rewards/rejected": -2.269137144088745, "step": 5670 }, { "epoch": 0.9466043717670616, "grad_norm": 13.197216033935547, "learning_rate": 1.0533956282329386e-05, "logits/chosen": -0.6783313751220703, "logits/rejected": -0.6727798581123352, "logps/chosen": -198.62744140625, "logps/rejected": -169.039306640625, "loss": 0.3631, "rewards/accuracies": 1.0, "rewards/chosen": -1.1238478422164917, "rewards/margins": 2.65120792388916, "rewards/rejected": -3.7750556468963623, "step": 5673 }, { "epoch": 0.9471049557817454, "grad_norm": 31.287307739257812, "learning_rate": 1.0528950442182546e-05, "logits/chosen": -0.8685415387153625, "logits/rejected": -0.901295006275177, "logps/chosen": -99.2375717163086, "logps/rejected": -178.1962890625, "loss": 0.391, "rewards/accuracies": 1.0, "rewards/chosen": -1.4561843872070312, "rewards/margins": 2.2880818843841553, "rewards/rejected": -3.7442657947540283, "step": 5676 }, { "epoch": 0.9476055397964291, "grad_norm": 9.077438354492188, "learning_rate": 1.052394460203571e-05, "logits/chosen": -0.4375194311141968, "logits/rejected": -0.36743366718292236, "logps/chosen": -147.85191345214844, "logps/rejected": -125.8599853515625, "loss": 0.2515, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7034549117088318, "rewards/margins": 2.6398255825042725, "rewards/rejected": -3.343280553817749, "step": 5679 }, { "epoch": 0.9481061238111129, "grad_norm": 41.422996520996094, "learning_rate": 1.0518938761888871e-05, "logits/chosen": -0.47629690170288086, "logits/rejected": -0.4656808376312256, "logps/chosen": -116.20601654052734, "logps/rejected": -112.78705596923828, "loss": 0.6228, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.809192180633545, "rewards/margins": 0.5601024031639099, "rewards/rejected": -2.3692944049835205, "step": 5682 }, { "epoch": 0.9486067078257968, "grad_norm": 6.072824478149414, "learning_rate": 1.0513932921742034e-05, "logits/chosen": -0.5649673938751221, "logits/rejected": -0.6149089932441711, "logps/chosen": -95.02054595947266, "logps/rejected": -111.80377197265625, "loss": 0.4194, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5992286801338196, "rewards/margins": 1.3920269012451172, "rewards/rejected": -1.991255760192871, "step": 5685 }, { "epoch": 0.9491072918404806, "grad_norm": 83.30311584472656, "learning_rate": 1.0508927081595195e-05, "logits/chosen": -0.5192160606384277, "logits/rejected": -0.6209257245063782, "logps/chosen": -45.98050308227539, "logps/rejected": -174.17771911621094, "loss": 1.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.8444290161132812, "rewards/margins": 3.045154333114624, "rewards/rejected": -3.8895833492279053, "step": 5688 }, { "epoch": 0.9496078758551644, "grad_norm": 70.70833587646484, "learning_rate": 1.0503921241448358e-05, "logits/chosen": -0.716463029384613, "logits/rejected": -0.6297950148582458, "logps/chosen": -109.5261459350586, "logps/rejected": -131.20045471191406, "loss": 0.8721, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4026473760604858, "rewards/margins": 1.5601590871810913, "rewards/rejected": -2.962806463241577, "step": 5691 }, { "epoch": 0.9501084598698482, "grad_norm": 40.0439567565918, "learning_rate": 1.049891540130152e-05, "logits/chosen": -0.6968851089477539, "logits/rejected": -0.7299894690513611, "logps/chosen": -87.7176742553711, "logps/rejected": -100.3803939819336, "loss": 0.8339, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.9952787160873413, "rewards/margins": -0.5449820756912231, "rewards/rejected": -1.4502967596054077, "step": 5694 }, { "epoch": 0.9506090438845319, "grad_norm": 2.441497325897217, "learning_rate": 1.049390956115468e-05, "logits/chosen": -0.644644021987915, "logits/rejected": -0.6698442101478577, "logps/chosen": -53.29490661621094, "logps/rejected": -131.48060607910156, "loss": 0.4622, "rewards/accuracies": 1.0, "rewards/chosen": -0.554847776889801, "rewards/margins": 4.375833511352539, "rewards/rejected": -4.930681228637695, "step": 5697 }, { "epoch": 0.9511096278992157, "grad_norm": 37.4140625, "learning_rate": 1.0488903721007843e-05, "logits/chosen": -0.7045989632606506, "logits/rejected": -0.730919599533081, "logps/chosen": -86.75501251220703, "logps/rejected": -103.10761260986328, "loss": 0.3216, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4387410879135132, "rewards/margins": 1.3353344202041626, "rewards/rejected": -2.774075746536255, "step": 5700 }, { "epoch": 0.9516102119138995, "grad_norm": 20.450096130371094, "learning_rate": 1.0483897880861005e-05, "logits/chosen": -0.6768110394477844, "logits/rejected": -0.7400787472724915, "logps/chosen": -60.6419563293457, "logps/rejected": -116.40010833740234, "loss": 1.0722, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.025693655014038, "rewards/margins": 0.25069043040275574, "rewards/rejected": -2.276384115219116, "step": 5703 }, { "epoch": 0.9521107959285834, "grad_norm": 16.80157470703125, "learning_rate": 1.0478892040714169e-05, "logits/chosen": -0.728666365146637, "logits/rejected": -0.7200694680213928, "logps/chosen": -110.75487518310547, "logps/rejected": -122.19905853271484, "loss": 0.3872, "rewards/accuracies": 1.0, "rewards/chosen": -1.5371251106262207, "rewards/margins": 1.6375513076782227, "rewards/rejected": -3.1746766567230225, "step": 5706 }, { "epoch": 0.9526113799432672, "grad_norm": 11.093334197998047, "learning_rate": 1.0473886200567329e-05, "logits/chosen": -0.6850495338439941, "logits/rejected": -0.6494889855384827, "logps/chosen": -85.0336685180664, "logps/rejected": -66.63214111328125, "loss": 0.3989, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.1502999067306519, "rewards/margins": 1.0025838613510132, "rewards/rejected": -2.152883529663086, "step": 5709 }, { "epoch": 0.953111963957951, "grad_norm": 19.3232421875, "learning_rate": 1.0468880360420492e-05, "logits/chosen": -0.6695547103881836, "logits/rejected": -0.6715318560600281, "logps/chosen": -120.67352294921875, "logps/rejected": -83.32561492919922, "loss": 0.498, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6996147632598877, "rewards/margins": 1.298009991645813, "rewards/rejected": -1.9976247549057007, "step": 5712 }, { "epoch": 0.9536125479726347, "grad_norm": 24.03635025024414, "learning_rate": 1.0463874520273654e-05, "logits/chosen": -0.6255770325660706, "logits/rejected": -0.6342204213142395, "logps/chosen": -98.2320785522461, "logps/rejected": -131.5086212158203, "loss": 0.5537, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5758439302444458, "rewards/margins": 3.1541402339935303, "rewards/rejected": -4.729984283447266, "step": 5715 }, { "epoch": 0.9541131319873185, "grad_norm": 91.43265533447266, "learning_rate": 1.0458868680126814e-05, "logits/chosen": -0.6748539805412292, "logits/rejected": -0.6334225535392761, "logps/chosen": -117.96392822265625, "logps/rejected": -110.2718505859375, "loss": 0.7592, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.7637523412704468, "rewards/margins": 1.2583370208740234, "rewards/rejected": -3.0220892429351807, "step": 5718 }, { "epoch": 0.9546137160020023, "grad_norm": 36.59763717651367, "learning_rate": 1.0453862839979978e-05, "logits/chosen": -0.6555138826370239, "logits/rejected": -0.6193931102752686, "logps/chosen": -71.07333374023438, "logps/rejected": -67.67059326171875, "loss": 0.8248, "rewards/accuracies": 1.0, "rewards/chosen": -0.805262565612793, "rewards/margins": 1.4137330055236816, "rewards/rejected": -2.2189955711364746, "step": 5721 }, { "epoch": 0.9551143000166862, "grad_norm": 16.537370681762695, "learning_rate": 1.044885699983314e-05, "logits/chosen": -0.6122894287109375, "logits/rejected": -0.6089706420898438, "logps/chosen": -61.62144470214844, "logps/rejected": -100.7585678100586, "loss": 0.3068, "rewards/accuracies": 1.0, "rewards/chosen": -0.2144392728805542, "rewards/margins": 2.9293699264526367, "rewards/rejected": -3.1438090801239014, "step": 5724 }, { "epoch": 0.95561488403137, "grad_norm": 60.12942886352539, "learning_rate": 1.0443851159686303e-05, "logits/chosen": -0.5891962647438049, "logits/rejected": -0.543563723564148, "logps/chosen": -114.47696685791016, "logps/rejected": -103.27153778076172, "loss": 0.8348, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2088912725448608, "rewards/margins": 1.1771953105926514, "rewards/rejected": -2.3860864639282227, "step": 5727 }, { "epoch": 0.9561154680460537, "grad_norm": 32.8414306640625, "learning_rate": 1.0438845319539463e-05, "logits/chosen": -0.6686211228370667, "logits/rejected": -0.671219527721405, "logps/chosen": -97.50115966796875, "logps/rejected": -102.39498138427734, "loss": 0.4329, "rewards/accuracies": 1.0, "rewards/chosen": -1.8219562768936157, "rewards/margins": 1.6309236288070679, "rewards/rejected": -3.4528801441192627, "step": 5730 }, { "epoch": 0.9566160520607375, "grad_norm": 69.3248062133789, "learning_rate": 1.0433839479392625e-05, "logits/chosen": -0.65853351354599, "logits/rejected": -0.7078776359558105, "logps/chosen": -97.88726806640625, "logps/rejected": -157.28102111816406, "loss": 0.8706, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.4127485752105713, "rewards/margins": 0.6916987895965576, "rewards/rejected": -3.104447364807129, "step": 5733 }, { "epoch": 0.9571166360754213, "grad_norm": 24.409759521484375, "learning_rate": 1.0428833639245788e-05, "logits/chosen": -0.6231107115745544, "logits/rejected": -0.6088959574699402, "logps/chosen": -105.6960678100586, "logps/rejected": -79.3731918334961, "loss": 0.4032, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5615993738174438, "rewards/margins": -0.027040204033255577, "rewards/rejected": -1.5345592498779297, "step": 5736 }, { "epoch": 0.9576172200901051, "grad_norm": 29.044689178466797, "learning_rate": 1.0423827799098948e-05, "logits/chosen": -0.7401151061058044, "logits/rejected": -0.7270610332489014, "logps/chosen": -83.39998626708984, "logps/rejected": -95.86237335205078, "loss": 0.3227, "rewards/accuracies": 1.0, "rewards/chosen": -1.0950456857681274, "rewards/margins": 1.3332682847976685, "rewards/rejected": -2.428313970565796, "step": 5739 }, { "epoch": 0.958117804104789, "grad_norm": 17.818857192993164, "learning_rate": 1.0418821958952112e-05, "logits/chosen": -0.5130643844604492, "logits/rejected": -0.5032057166099548, "logps/chosen": -63.2470588684082, "logps/rejected": -76.89459991455078, "loss": 0.3624, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8603561520576477, "rewards/margins": 0.7408967018127441, "rewards/rejected": -1.6012529134750366, "step": 5742 }, { "epoch": 0.9586183881194728, "grad_norm": 19.331262588500977, "learning_rate": 1.0413816118805274e-05, "logits/chosen": -0.47760114073753357, "logits/rejected": -0.4386617839336395, "logps/chosen": -110.46004486083984, "logps/rejected": -105.79779052734375, "loss": 0.2615, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.8093531131744385, "rewards/margins": 1.8524452447891235, "rewards/rejected": -5.661798477172852, "step": 5745 }, { "epoch": 0.9591189721341565, "grad_norm": 16.898082733154297, "learning_rate": 1.0408810278658437e-05, "logits/chosen": -0.5008504986763, "logits/rejected": -0.5272768139839172, "logps/chosen": -81.86376953125, "logps/rejected": -154.95802307128906, "loss": 0.374, "rewards/accuracies": 1.0, "rewards/chosen": -1.121017336845398, "rewards/margins": 4.287626266479492, "rewards/rejected": -5.40864372253418, "step": 5748 }, { "epoch": 0.9596195561488403, "grad_norm": 19.025218963623047, "learning_rate": 1.0403804438511597e-05, "logits/chosen": -0.6768975257873535, "logits/rejected": -0.721947968006134, "logps/chosen": -101.8407211303711, "logps/rejected": -127.57537078857422, "loss": 0.3039, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.8566226959228516, "rewards/margins": 0.8789928555488586, "rewards/rejected": -2.7356154918670654, "step": 5751 }, { "epoch": 0.9601201401635241, "grad_norm": 11.39237117767334, "learning_rate": 1.0398798598364759e-05, "logits/chosen": -0.415240079164505, "logits/rejected": -0.48230698704719543, "logps/chosen": -47.03114318847656, "logps/rejected": -156.9573211669922, "loss": 0.1635, "rewards/accuracies": 1.0, "rewards/chosen": -0.8761196732521057, "rewards/margins": 5.805479526519775, "rewards/rejected": -6.681599140167236, "step": 5754 }, { "epoch": 0.9606207241782079, "grad_norm": 15.468509674072266, "learning_rate": 1.0393792758217923e-05, "logits/chosen": -0.5260145664215088, "logits/rejected": -0.5041020512580872, "logps/chosen": -113.6801986694336, "logps/rejected": -99.80938720703125, "loss": 0.6134, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.353601336479187, "rewards/margins": 1.6046124696731567, "rewards/rejected": -1.9582138061523438, "step": 5757 }, { "epoch": 0.9611213081928917, "grad_norm": 9.120880126953125, "learning_rate": 1.0388786918071083e-05, "logits/chosen": -0.7729110717773438, "logits/rejected": -0.7519119381904602, "logps/chosen": -153.70977783203125, "logps/rejected": -149.3419952392578, "loss": 0.1685, "rewards/accuracies": 1.0, "rewards/chosen": -3.428642511367798, "rewards/margins": 1.9521011114120483, "rewards/rejected": -5.380743503570557, "step": 5760 }, { "epoch": 0.9616218922075755, "grad_norm": 20.801889419555664, "learning_rate": 1.0383781077924246e-05, "logits/chosen": -0.4963759183883667, "logits/rejected": -0.6042072176933289, "logps/chosen": -114.89705657958984, "logps/rejected": -170.52476501464844, "loss": 1.0598, "rewards/accuracies": 1.0, "rewards/chosen": -0.5877676606178284, "rewards/margins": 2.6636180877685547, "rewards/rejected": -3.2513856887817383, "step": 5763 }, { "epoch": 0.9621224762222593, "grad_norm": 18.166574478149414, "learning_rate": 1.0378775237777408e-05, "logits/chosen": -0.6522171497344971, "logits/rejected": -0.6436622142791748, "logps/chosen": -89.7136001586914, "logps/rejected": -80.26616668701172, "loss": 0.4469, "rewards/accuracies": 1.0, "rewards/chosen": -1.6040164232254028, "rewards/margins": 0.950638473033905, "rewards/rejected": -2.554654836654663, "step": 5766 }, { "epoch": 0.9626230602369431, "grad_norm": 7.025177955627441, "learning_rate": 1.0373769397630572e-05, "logits/chosen": -0.7185309529304504, "logits/rejected": -0.6852602958679199, "logps/chosen": -89.3947982788086, "logps/rejected": -89.7505111694336, "loss": 0.7122, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.195812940597534, "rewards/margins": 1.3479247093200684, "rewards/rejected": -3.5437376499176025, "step": 5769 }, { "epoch": 0.9631236442516269, "grad_norm": 6.442964553833008, "learning_rate": 1.0368763557483732e-05, "logits/chosen": -0.5555461049079895, "logits/rejected": -0.6102260947227478, "logps/chosen": -73.8615951538086, "logps/rejected": -115.82952880859375, "loss": 0.3429, "rewards/accuracies": 1.0, "rewards/chosen": -0.4442721903324127, "rewards/margins": 2.319333076477051, "rewards/rejected": -2.7636051177978516, "step": 5772 }, { "epoch": 0.9636242282663107, "grad_norm": 5.54202127456665, "learning_rate": 1.0363757717336893e-05, "logits/chosen": -0.8215970396995544, "logits/rejected": -0.802789032459259, "logps/chosen": -102.71417999267578, "logps/rejected": -114.5169906616211, "loss": 0.3353, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.6564786434173584, "rewards/margins": 2.7724602222442627, "rewards/rejected": -5.428939342498779, "step": 5775 }, { "epoch": 0.9641248122809944, "grad_norm": 30.84662437438965, "learning_rate": 1.0358751877190057e-05, "logits/chosen": -0.7922849059104919, "logits/rejected": -0.7250692844390869, "logps/chosen": -141.18597412109375, "logps/rejected": -72.55960845947266, "loss": 0.4226, "rewards/accuracies": 0.0, "rewards/chosen": -2.838085412979126, "rewards/margins": -0.6697919964790344, "rewards/rejected": -2.1682937145233154, "step": 5778 }, { "epoch": 0.9646253962956783, "grad_norm": 4.065211772918701, "learning_rate": 1.0353746037043217e-05, "logits/chosen": -0.5801126956939697, "logits/rejected": -0.5862779021263123, "logps/chosen": -46.7691535949707, "logps/rejected": -91.14261627197266, "loss": 0.327, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6936717629432678, "rewards/margins": 1.1855188608169556, "rewards/rejected": -1.8791908025741577, "step": 5781 }, { "epoch": 0.9651259803103621, "grad_norm": 5.478382587432861, "learning_rate": 1.034874019689638e-05, "logits/chosen": -0.7056596875190735, "logits/rejected": -0.6865260601043701, "logps/chosen": -87.66947174072266, "logps/rejected": -115.9867935180664, "loss": 0.6518, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.11273825168609619, "rewards/margins": 3.497267961502075, "rewards/rejected": -3.610006332397461, "step": 5784 }, { "epoch": 0.9656265643250459, "grad_norm": 10.25281047821045, "learning_rate": 1.0343734356749542e-05, "logits/chosen": -0.6918315887451172, "logits/rejected": -0.7596766948699951, "logps/chosen": -54.54079055786133, "logps/rejected": -150.17420959472656, "loss": 0.1989, "rewards/accuracies": 1.0, "rewards/chosen": 1.0473870038986206, "rewards/margins": 6.513626575469971, "rewards/rejected": -5.466239929199219, "step": 5787 }, { "epoch": 0.9661271483397297, "grad_norm": 41.07115173339844, "learning_rate": 1.0338728516602702e-05, "logits/chosen": -0.6079862713813782, "logits/rejected": -0.6033329367637634, "logps/chosen": -109.24664306640625, "logps/rejected": -134.09423828125, "loss": 0.4788, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9951467514038086, "rewards/margins": 1.0371471643447876, "rewards/rejected": -3.0322940349578857, "step": 5790 }, { "epoch": 0.9666277323544135, "grad_norm": 28.878698348999023, "learning_rate": 1.0333722676455866e-05, "logits/chosen": -0.533518373966217, "logits/rejected": -0.49566951394081116, "logps/chosen": -121.00006103515625, "logps/rejected": -67.6621322631836, "loss": 0.7419, "rewards/accuracies": 1.0, "rewards/chosen": -1.4404574632644653, "rewards/margins": 1.1678105592727661, "rewards/rejected": -2.6082680225372314, "step": 5793 }, { "epoch": 0.9671283163690972, "grad_norm": 16.070816040039062, "learning_rate": 1.0328716836309028e-05, "logits/chosen": -0.4834069311618805, "logits/rejected": -0.4520985186100006, "logps/chosen": -121.13944244384766, "logps/rejected": -53.76544189453125, "loss": 1.0495, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -4.073585033416748, "rewards/margins": -1.9944761991500854, "rewards/rejected": -2.079108953475952, "step": 5796 }, { "epoch": 0.9676289003837811, "grad_norm": 61.68289566040039, "learning_rate": 1.0323710996162191e-05, "logits/chosen": -0.6672951579093933, "logits/rejected": -0.6489725708961487, "logps/chosen": -127.3138656616211, "logps/rejected": -120.27324676513672, "loss": 0.914, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -4.377901077270508, "rewards/margins": 0.4654479920864105, "rewards/rejected": -4.843349456787109, "step": 5799 }, { "epoch": 0.9681294843984649, "grad_norm": 15.852374076843262, "learning_rate": 1.0318705156015351e-05, "logits/chosen": -0.5749604105949402, "logits/rejected": -0.5997905731201172, "logps/chosen": -94.0472640991211, "logps/rejected": -187.49928283691406, "loss": 0.3361, "rewards/accuracies": 1.0, "rewards/chosen": -2.5943660736083984, "rewards/margins": 5.202942371368408, "rewards/rejected": -7.797308444976807, "step": 5802 }, { "epoch": 0.9686300684131487, "grad_norm": 35.942508697509766, "learning_rate": 1.0313699315868515e-05, "logits/chosen": -0.6760953068733215, "logits/rejected": -0.6853839755058289, "logps/chosen": -75.97833251953125, "logps/rejected": -97.3304214477539, "loss": 0.5435, "rewards/accuracies": 1.0, "rewards/chosen": -0.6535493731498718, "rewards/margins": 2.561871290206909, "rewards/rejected": -3.2154204845428467, "step": 5805 }, { "epoch": 0.9691306524278325, "grad_norm": 56.72478103637695, "learning_rate": 1.0308693475721677e-05, "logits/chosen": -0.5892464518547058, "logits/rejected": -0.6143026947975159, "logps/chosen": -121.61104583740234, "logps/rejected": -166.35479736328125, "loss": 1.2143, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.5881605744361877, "rewards/margins": 2.6763556003570557, "rewards/rejected": -3.2645161151885986, "step": 5808 }, { "epoch": 0.9696312364425163, "grad_norm": 20.646303176879883, "learning_rate": 1.0303687635574837e-05, "logits/chosen": -0.6643162369728088, "logits/rejected": -0.6809868216514587, "logps/chosen": -70.309814453125, "logps/rejected": -75.55442810058594, "loss": 0.6377, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5379894971847534, "rewards/margins": 0.6216807961463928, "rewards/rejected": -2.159670352935791, "step": 5811 }, { "epoch": 0.9701318204572, "grad_norm": 8.36655044555664, "learning_rate": 1.0298681795428e-05, "logits/chosen": -0.6323795914649963, "logits/rejected": -0.6334194540977478, "logps/chosen": -62.76090621948242, "logps/rejected": -82.02969360351562, "loss": 0.6379, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -0.88719242811203, "rewards/margins": 0.3782660961151123, "rewards/rejected": -1.2654584646224976, "step": 5814 }, { "epoch": 0.9706324044718838, "grad_norm": 34.133766174316406, "learning_rate": 1.0293675955281162e-05, "logits/chosen": -0.5534782409667969, "logits/rejected": -0.5882678627967834, "logps/chosen": -134.12107849121094, "logps/rejected": -126.00679779052734, "loss": 0.5539, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0533409118652344, "rewards/margins": 1.8202894926071167, "rewards/rejected": -3.8736302852630615, "step": 5817 }, { "epoch": 0.9711329884865677, "grad_norm": 16.96253776550293, "learning_rate": 1.0288670115134325e-05, "logits/chosen": -0.645195484161377, "logits/rejected": -0.6393564343452454, "logps/chosen": -93.78255462646484, "logps/rejected": -116.15714263916016, "loss": 0.5611, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0452194213867188, "rewards/margins": 1.9826465845108032, "rewards/rejected": -4.027865886688232, "step": 5820 }, { "epoch": 0.9716335725012515, "grad_norm": 22.200927734375, "learning_rate": 1.0283664274987486e-05, "logits/chosen": -0.7919809818267822, "logits/rejected": -0.7701253294944763, "logps/chosen": -121.50650787353516, "logps/rejected": -133.7177276611328, "loss": 0.181, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.31203031539917, "rewards/margins": 3.8825862407684326, "rewards/rejected": -5.194616317749023, "step": 5823 }, { "epoch": 0.9721341565159353, "grad_norm": 19.919740676879883, "learning_rate": 1.0278658434840649e-05, "logits/chosen": -0.6458188891410828, "logits/rejected": -0.7580789923667908, "logps/chosen": -83.5090103149414, "logps/rejected": -146.2368927001953, "loss": 0.4261, "rewards/accuracies": 1.0, "rewards/chosen": -1.3288395404815674, "rewards/margins": 2.0337960720062256, "rewards/rejected": -3.362635374069214, "step": 5826 }, { "epoch": 0.972634740530619, "grad_norm": 25.796504974365234, "learning_rate": 1.027365259469381e-05, "logits/chosen": -0.8823478817939758, "logits/rejected": -0.9038284420967102, "logps/chosen": -68.20743560791016, "logps/rejected": -132.75633239746094, "loss": 0.4968, "rewards/accuracies": 1.0, "rewards/chosen": -0.6387256979942322, "rewards/margins": 2.953518867492676, "rewards/rejected": -3.5922443866729736, "step": 5829 }, { "epoch": 0.9731353245453028, "grad_norm": 8.071237564086914, "learning_rate": 1.0268646754546971e-05, "logits/chosen": -0.7607783675193787, "logits/rejected": -0.7793622016906738, "logps/chosen": -120.3111801147461, "logps/rejected": -137.49795532226562, "loss": 0.4182, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.2702653408050537, "rewards/margins": 2.590043783187866, "rewards/rejected": -3.860308885574341, "step": 5832 }, { "epoch": 0.9736359085599866, "grad_norm": 21.468963623046875, "learning_rate": 1.0263640914400134e-05, "logits/chosen": -0.5573767423629761, "logits/rejected": -0.5614635944366455, "logps/chosen": -66.23323822021484, "logps/rejected": -90.32465362548828, "loss": 0.4517, "rewards/accuracies": 1.0, "rewards/chosen": -1.2089077234268188, "rewards/margins": 2.03426194190979, "rewards/rejected": -3.2431697845458984, "step": 5835 }, { "epoch": 0.9741364925746705, "grad_norm": 33.156272888183594, "learning_rate": 1.0258635074253296e-05, "logits/chosen": -0.538891077041626, "logits/rejected": -0.4906749725341797, "logps/chosen": -76.91388702392578, "logps/rejected": -69.11775970458984, "loss": 0.3084, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.3003906309604645, "rewards/margins": 1.238423466682434, "rewards/rejected": -1.5388141870498657, "step": 5838 }, { "epoch": 0.9746370765893543, "grad_norm": 32.48591232299805, "learning_rate": 1.025362923410646e-05, "logits/chosen": -0.7130990028381348, "logits/rejected": -0.7868921756744385, "logps/chosen": -120.4989242553711, "logps/rejected": -140.5595245361328, "loss": 0.4391, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.5976076126098633, "rewards/margins": 1.2022229433059692, "rewards/rejected": -2.799830675125122, "step": 5841 }, { "epoch": 0.9751376606040381, "grad_norm": 23.414535522460938, "learning_rate": 1.024862339395962e-05, "logits/chosen": -0.6725640296936035, "logits/rejected": -0.6482098698616028, "logps/chosen": -88.64306640625, "logps/rejected": -110.38903045654297, "loss": 0.406, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3562440872192383, "rewards/margins": 1.6085104942321777, "rewards/rejected": -2.964754819869995, "step": 5844 }, { "epoch": 0.9756382446187218, "grad_norm": 22.788719177246094, "learning_rate": 1.0243617553812782e-05, "logits/chosen": -0.6790230870246887, "logits/rejected": -0.6761794686317444, "logps/chosen": -61.61457443237305, "logps/rejected": -104.9035415649414, "loss": 0.6346, "rewards/accuracies": 1.0, "rewards/chosen": -0.8566184043884277, "rewards/margins": 3.6864755153656006, "rewards/rejected": -4.543094158172607, "step": 5847 }, { "epoch": 0.9761388286334056, "grad_norm": 61.591819763183594, "learning_rate": 1.0238611713665945e-05, "logits/chosen": -0.6358290314674377, "logits/rejected": -0.725055992603302, "logps/chosen": -44.11879348754883, "logps/rejected": -134.75091552734375, "loss": 0.9016, "rewards/accuracies": 1.0, "rewards/chosen": -0.9708324074745178, "rewards/margins": 1.8768702745437622, "rewards/rejected": -2.847702741622925, "step": 5850 }, { "epoch": 0.9766394126480894, "grad_norm": 20.181272506713867, "learning_rate": 1.0233605873519105e-05, "logits/chosen": -0.5042391419410706, "logits/rejected": -0.5109841227531433, "logps/chosen": -66.62165069580078, "logps/rejected": -88.97452545166016, "loss": 0.3252, "rewards/accuracies": 1.0, "rewards/chosen": -0.5471473932266235, "rewards/margins": 1.2648416757583618, "rewards/rejected": -1.811989188194275, "step": 5853 }, { "epoch": 0.9771399966627732, "grad_norm": 10.223540306091309, "learning_rate": 1.0228600033372269e-05, "logits/chosen": -0.7012014985084534, "logits/rejected": -0.6841281056404114, "logps/chosen": -84.78369903564453, "logps/rejected": -82.11304473876953, "loss": 0.7083, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.5251179337501526, "rewards/margins": 0.8841919302940369, "rewards/rejected": -1.409309983253479, "step": 5856 }, { "epoch": 0.9776405806774571, "grad_norm": 42.21370315551758, "learning_rate": 1.022359419322543e-05, "logits/chosen": -0.5885344743728638, "logits/rejected": -0.632952868938446, "logps/chosen": -72.67676544189453, "logps/rejected": -123.20442962646484, "loss": 0.6202, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.3720723390579224, "rewards/margins": 1.3099311590194702, "rewards/rejected": -2.6820037364959717, "step": 5859 }, { "epoch": 0.9781411646921409, "grad_norm": 45.26822280883789, "learning_rate": 1.0218588353078594e-05, "logits/chosen": -0.6047407984733582, "logits/rejected": -0.6886982917785645, "logps/chosen": -86.15715789794922, "logps/rejected": -124.35189819335938, "loss": 0.3078, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6183640956878662, "rewards/margins": 1.8009533882141113, "rewards/rejected": -2.4193174839019775, "step": 5862 }, { "epoch": 0.9786417487068246, "grad_norm": 46.20836639404297, "learning_rate": 1.0213582512931754e-05, "logits/chosen": -0.6704095005989075, "logits/rejected": -0.6574876308441162, "logps/chosen": -88.65128326416016, "logps/rejected": -82.84589385986328, "loss": 0.9802, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.0716500282287598, "rewards/margins": -0.727548360824585, "rewards/rejected": -1.3441017866134644, "step": 5865 }, { "epoch": 0.9791423327215084, "grad_norm": 37.22377014160156, "learning_rate": 1.0208576672784916e-05, "logits/chosen": -0.5960958003997803, "logits/rejected": -0.5723369121551514, "logps/chosen": -167.80323791503906, "logps/rejected": -139.9673309326172, "loss": 0.7163, "rewards/accuracies": 0.0, "rewards/chosen": -2.7516002655029297, "rewards/margins": -0.30821236968040466, "rewards/rejected": -2.443387985229492, "step": 5868 }, { "epoch": 0.9796429167361922, "grad_norm": 29.81600570678711, "learning_rate": 1.020357083263808e-05, "logits/chosen": -0.6657021641731262, "logits/rejected": -0.6068364381790161, "logps/chosen": -69.82079315185547, "logps/rejected": -85.68085479736328, "loss": 0.5516, "rewards/accuracies": 1.0, "rewards/chosen": -0.4215784966945648, "rewards/margins": 0.70143061876297, "rewards/rejected": -1.1230090856552124, "step": 5871 }, { "epoch": 0.980143500750876, "grad_norm": 79.67949676513672, "learning_rate": 1.019856499249124e-05, "logits/chosen": -0.5948833227157593, "logits/rejected": -0.4859018325805664, "logps/chosen": -114.2275390625, "logps/rejected": -40.81536865234375, "loss": 0.7252, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -1.1180719137191772, "rewards/margins": 0.22406084835529327, "rewards/rejected": -1.3421329259872437, "step": 5874 }, { "epoch": 0.9806440847655599, "grad_norm": 25.982147216796875, "learning_rate": 1.0193559152344403e-05, "logits/chosen": -0.728985071182251, "logits/rejected": -0.6631262898445129, "logps/chosen": -108.40545654296875, "logps/rejected": -100.3663101196289, "loss": 0.4387, "rewards/accuracies": 1.0, "rewards/chosen": -0.09302303940057755, "rewards/margins": 2.1553103923797607, "rewards/rejected": -2.24833345413208, "step": 5877 }, { "epoch": 0.9811446687802436, "grad_norm": 17.102920532226562, "learning_rate": 1.0188553312197565e-05, "logits/chosen": -0.6077149510383606, "logits/rejected": -0.6288801431655884, "logps/chosen": -42.066654205322266, "logps/rejected": -79.65828704833984, "loss": 0.3599, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.6477475166320801, "rewards/margins": 2.4009287357330322, "rewards/rejected": -1.7531813383102417, "step": 5880 }, { "epoch": 0.9816452527949274, "grad_norm": 48.59499740600586, "learning_rate": 1.0183547472050728e-05, "logits/chosen": -0.5900495648384094, "logits/rejected": -0.6551650762557983, "logps/chosen": -47.88041305541992, "logps/rejected": -155.71116638183594, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": -1.5384012460708618, "rewards/margins": 4.2105913162231445, "rewards/rejected": -5.748992919921875, "step": 5883 }, { "epoch": 0.9821458368096112, "grad_norm": 26.642250061035156, "learning_rate": 1.0178541631903888e-05, "logits/chosen": -0.796210765838623, "logits/rejected": -0.7991473078727722, "logps/chosen": -90.46656036376953, "logps/rejected": -123.68639373779297, "loss": 0.5566, "rewards/accuracies": 1.0, "rewards/chosen": -1.8096827268600464, "rewards/margins": 2.109022378921509, "rewards/rejected": -3.9187049865722656, "step": 5886 }, { "epoch": 0.982646420824295, "grad_norm": 2.579056978225708, "learning_rate": 1.017353579175705e-05, "logits/chosen": -0.6197190284729004, "logits/rejected": -0.6299130320549011, "logps/chosen": -78.35083770751953, "logps/rejected": -109.93234252929688, "loss": 0.6477, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.2320687770843506, "rewards/margins": 1.2300163507461548, "rewards/rejected": -3.462085008621216, "step": 5889 }, { "epoch": 0.9831470048389788, "grad_norm": 38.01673126220703, "learning_rate": 1.0168529951610214e-05, "logits/chosen": -0.5165979266166687, "logits/rejected": -0.4953150749206543, "logps/chosen": -79.25796508789062, "logps/rejected": -101.46073150634766, "loss": 0.5846, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.8120987415313721, "rewards/margins": 1.7132154703140259, "rewards/rejected": -2.5253143310546875, "step": 5892 }, { "epoch": 0.9836475888536627, "grad_norm": 21.23751449584961, "learning_rate": 1.0163524111463374e-05, "logits/chosen": -0.521443784236908, "logits/rejected": -0.5094780325889587, "logps/chosen": -72.26902770996094, "logps/rejected": -108.32022857666016, "loss": 0.5607, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.7249963879585266, "rewards/margins": 3.0217459201812744, "rewards/rejected": -3.7467424869537354, "step": 5895 }, { "epoch": 0.9841481728683464, "grad_norm": 33.02896499633789, "learning_rate": 1.0158518271316537e-05, "logits/chosen": -0.6698977947235107, "logits/rejected": -0.6438191533088684, "logps/chosen": -107.64208984375, "logps/rejected": -121.76220703125, "loss": 0.5896, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6626971960067749, "rewards/margins": 1.6123672723770142, "rewards/rejected": -2.275064706802368, "step": 5898 }, { "epoch": 0.9846487568830302, "grad_norm": 17.38744354248047, "learning_rate": 1.0153512431169699e-05, "logits/chosen": -0.717420756816864, "logits/rejected": -0.7328644394874573, "logps/chosen": -59.8335075378418, "logps/rejected": -107.69701385498047, "loss": 0.7491, "rewards/accuracies": 1.0, "rewards/chosen": -1.4711066484451294, "rewards/margins": 2.7185776233673096, "rewards/rejected": -4.1896843910217285, "step": 5901 }, { "epoch": 0.985149340897714, "grad_norm": 23.65190315246582, "learning_rate": 1.0148506591022859e-05, "logits/chosen": -0.463807612657547, "logits/rejected": -0.49922847747802734, "logps/chosen": -66.12628173828125, "logps/rejected": -134.74009704589844, "loss": 0.4677, "rewards/accuracies": 1.0, "rewards/chosen": -0.9344840049743652, "rewards/margins": 3.7393360137939453, "rewards/rejected": -4.673820495605469, "step": 5904 }, { "epoch": 0.9856499249123978, "grad_norm": 11.853607177734375, "learning_rate": 1.0143500750876023e-05, "logits/chosen": -0.45316287875175476, "logits/rejected": -0.4522649049758911, "logps/chosen": -120.46858978271484, "logps/rejected": -133.70767211914062, "loss": 0.5252, "rewards/accuracies": 0.0, "rewards/chosen": -1.1251360177993774, "rewards/margins": -0.5611315965652466, "rewards/rejected": -0.5640044808387756, "step": 5907 }, { "epoch": 0.9861505089270816, "grad_norm": 22.166107177734375, "learning_rate": 1.0138494910729184e-05, "logits/chosen": -0.609734058380127, "logits/rejected": -0.5446200966835022, "logps/chosen": -110.26956176757812, "logps/rejected": -76.43306732177734, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": -0.3074691593647003, "rewards/margins": 0.7376669049263, "rewards/rejected": -1.0451360940933228, "step": 5910 }, { "epoch": 0.9866510929417653, "grad_norm": 40.725948333740234, "learning_rate": 1.0133489070582348e-05, "logits/chosen": -0.5931665301322937, "logits/rejected": -0.62173992395401, "logps/chosen": -48.782318115234375, "logps/rejected": -145.39813232421875, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 0.2937028110027313, "rewards/margins": 4.731624126434326, "rewards/rejected": -4.437921524047852, "step": 5913 }, { "epoch": 0.9871516769564492, "grad_norm": 12.003742218017578, "learning_rate": 1.0128483230435508e-05, "logits/chosen": -0.6967945098876953, "logits/rejected": -0.6755608916282654, "logps/chosen": -89.1728286743164, "logps/rejected": -101.9582290649414, "loss": 0.403, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.24808669090271, "rewards/margins": 0.7828080654144287, "rewards/rejected": -2.0308947563171387, "step": 5916 }, { "epoch": 0.987652260971133, "grad_norm": 16.978544235229492, "learning_rate": 1.0123477390288671e-05, "logits/chosen": -0.5761151313781738, "logits/rejected": -0.5810842514038086, "logps/chosen": -89.8564224243164, "logps/rejected": -122.57462310791016, "loss": 0.6503, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.6826798915863037, "rewards/margins": 0.7293849587440491, "rewards/rejected": -1.4120649099349976, "step": 5919 }, { "epoch": 0.9881528449858168, "grad_norm": 35.496212005615234, "learning_rate": 1.0118471550141833e-05, "logits/chosen": -0.3836287558078766, "logits/rejected": -0.432603120803833, "logps/chosen": -52.91775131225586, "logps/rejected": -93.48296356201172, "loss": 0.8812, "rewards/accuracies": 1.0, "rewards/chosen": 1.378503680229187, "rewards/margins": 4.017092227935791, "rewards/rejected": -2.6385886669158936, "step": 5922 }, { "epoch": 0.9886534290005006, "grad_norm": 19.219026565551758, "learning_rate": 1.0113465709994993e-05, "logits/chosen": -0.5730059742927551, "logits/rejected": -0.5674970746040344, "logps/chosen": -101.3906021118164, "logps/rejected": -129.24290466308594, "loss": 0.6506, "rewards/accuracies": 1.0, "rewards/chosen": -0.7929606437683105, "rewards/margins": 2.7228939533233643, "rewards/rejected": -3.515854597091675, "step": 5925 }, { "epoch": 0.9891540130151844, "grad_norm": 11.486767768859863, "learning_rate": 1.0108459869848157e-05, "logits/chosen": -0.7853805422782898, "logits/rejected": -0.7344425320625305, "logps/chosen": -146.22410583496094, "logps/rejected": -105.97431182861328, "loss": 0.3832, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.294327735900879, "rewards/margins": -0.12297888845205307, "rewards/rejected": -3.1713485717773438, "step": 5928 }, { "epoch": 0.9896545970298681, "grad_norm": 18.40277862548828, "learning_rate": 1.0103454029701319e-05, "logits/chosen": -0.5430110692977905, "logits/rejected": -0.5886316895484924, "logps/chosen": -130.2638397216797, "logps/rejected": -165.79989624023438, "loss": 0.4394, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4199258089065552, "rewards/margins": 1.049368143081665, "rewards/rejected": -2.4692938327789307, "step": 5931 }, { "epoch": 0.990155181044552, "grad_norm": 40.01869201660156, "learning_rate": 1.0098448189554482e-05, "logits/chosen": -0.7396709322929382, "logits/rejected": -0.7769517302513123, "logps/chosen": -47.55375671386719, "logps/rejected": -139.8062286376953, "loss": 0.3675, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.23373006284236908, "rewards/margins": 5.095940113067627, "rewards/rejected": -5.329669952392578, "step": 5934 }, { "epoch": 0.9906557650592358, "grad_norm": 20.149681091308594, "learning_rate": 1.0093442349407642e-05, "logits/chosen": -0.7187082171440125, "logits/rejected": -0.6396586298942566, "logps/chosen": -83.3122329711914, "logps/rejected": -87.80157470703125, "loss": 0.5951, "rewards/accuracies": 1.0, "rewards/chosen": -0.2457452267408371, "rewards/margins": 0.8288728594779968, "rewards/rejected": -1.0746179819107056, "step": 5937 }, { "epoch": 0.9911563490739196, "grad_norm": 12.922137260437012, "learning_rate": 1.0088436509260806e-05, "logits/chosen": -0.8066849708557129, "logits/rejected": -0.7944676876068115, "logps/chosen": -95.2300796508789, "logps/rejected": -100.69720458984375, "loss": 0.5121, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.4201767444610596, "rewards/margins": -0.22381043434143066, "rewards/rejected": -2.196366548538208, "step": 5940 }, { "epoch": 0.9916569330886034, "grad_norm": 78.92549133300781, "learning_rate": 1.0083430669113968e-05, "logits/chosen": -0.722023069858551, "logits/rejected": -0.7651321291923523, "logps/chosen": -88.33084869384766, "logps/rejected": -136.8129119873047, "loss": 0.681, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.9990304708480835, "rewards/margins": -0.893358051776886, "rewards/rejected": -1.1056724786758423, "step": 5943 }, { "epoch": 0.9921575171032871, "grad_norm": 21.94681167602539, "learning_rate": 1.0078424828967128e-05, "logits/chosen": -0.7868771553039551, "logits/rejected": -0.7186253666877747, "logps/chosen": -100.58708953857422, "logps/rejected": -84.99822235107422, "loss": 0.8066, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.06537926197052, "rewards/margins": 0.21820537745952606, "rewards/rejected": -1.283584713935852, "step": 5946 }, { "epoch": 0.9926581011179709, "grad_norm": 49.7817497253418, "learning_rate": 1.0073418988820291e-05, "logits/chosen": -0.44231489300727844, "logits/rejected": -0.543285608291626, "logps/chosen": -62.364131927490234, "logps/rejected": -164.70684814453125, "loss": 0.7599, "rewards/accuracies": 1.0, "rewards/chosen": -0.36180901527404785, "rewards/margins": 4.458548545837402, "rewards/rejected": -4.820357322692871, "step": 5949 }, { "epoch": 0.9931586851326548, "grad_norm": 33.304290771484375, "learning_rate": 1.0068413148673453e-05, "logits/chosen": -0.5614752173423767, "logits/rejected": -0.5623475909233093, "logps/chosen": -137.03990173339844, "logps/rejected": -124.03887176513672, "loss": 0.8152, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -2.5267045497894287, "rewards/margins": -0.3057180345058441, "rewards/rejected": -2.2209863662719727, "step": 5952 }, { "epoch": 0.9936592691473386, "grad_norm": 19.296226501464844, "learning_rate": 1.0063407308526616e-05, "logits/chosen": -0.6045448780059814, "logits/rejected": -0.6362645626068115, "logps/chosen": -110.53035736083984, "logps/rejected": -127.71346282958984, "loss": 0.8249, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.035768747329712, "rewards/margins": 0.8152100443840027, "rewards/rejected": -2.8509788513183594, "step": 5955 }, { "epoch": 0.9941598531620224, "grad_norm": 83.70564270019531, "learning_rate": 1.0058401468379776e-05, "logits/chosen": -0.6152710318565369, "logits/rejected": -0.6157931685447693, "logps/chosen": -129.16258239746094, "logps/rejected": -103.81485748291016, "loss": 0.9848, "rewards/accuracies": 0.3333333432674408, "rewards/chosen": -3.4606246948242188, "rewards/margins": -0.2532081604003906, "rewards/rejected": -3.207416534423828, "step": 5958 }, { "epoch": 0.9946604371767062, "grad_norm": 11.809561729431152, "learning_rate": 1.005339562823294e-05, "logits/chosen": -0.628492534160614, "logits/rejected": -0.599431037902832, "logps/chosen": -190.2292938232422, "logps/rejected": -163.8637237548828, "loss": 0.3943, "rewards/accuracies": 1.0, "rewards/chosen": -0.9799209237098694, "rewards/margins": 1.4701493978500366, "rewards/rejected": -2.4500701427459717, "step": 5961 }, { "epoch": 0.9951610211913899, "grad_norm": 15.091608047485352, "learning_rate": 1.0048389788086102e-05, "logits/chosen": -0.6678030490875244, "logits/rejected": -0.5958754420280457, "logps/chosen": -112.01766204833984, "logps/rejected": -75.23958587646484, "loss": 0.7985, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.0746322870254517, "rewards/margins": 0.3634457588195801, "rewards/rejected": -1.4380780458450317, "step": 5964 }, { "epoch": 0.9956616052060737, "grad_norm": 44.52152633666992, "learning_rate": 1.0043383947939262e-05, "logits/chosen": -0.8139623999595642, "logits/rejected": -0.796004593372345, "logps/chosen": -73.96990966796875, "logps/rejected": -96.15194702148438, "loss": 1.017, "rewards/accuracies": 1.0, "rewards/chosen": -0.33817920088768005, "rewards/margins": 2.2772231101989746, "rewards/rejected": -2.6154022216796875, "step": 5967 }, { "epoch": 0.9961621892207575, "grad_norm": 31.067920684814453, "learning_rate": 1.0038378107792425e-05, "logits/chosen": -0.6423477530479431, "logits/rejected": -0.6769251823425293, "logps/chosen": -84.88925170898438, "logps/rejected": -128.72789001464844, "loss": 0.7674, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.1522750854492188, "rewards/margins": 0.9177346229553223, "rewards/rejected": -4.070009708404541, "step": 5970 }, { "epoch": 0.9966627732354414, "grad_norm": 15.90886402130127, "learning_rate": 1.0033372267645587e-05, "logits/chosen": -0.6208155751228333, "logits/rejected": -0.5972766876220703, "logps/chosen": -98.71825408935547, "logps/rejected": -136.9268341064453, "loss": 0.2055, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -3.054030656814575, "rewards/margins": 3.8751440048217773, "rewards/rejected": -6.929174423217773, "step": 5973 }, { "epoch": 0.9971633572501252, "grad_norm": 24.704593658447266, "learning_rate": 1.002836642749875e-05, "logits/chosen": -0.6297455430030823, "logits/rejected": -0.7109382152557373, "logps/chosen": -69.6208724975586, "logps/rejected": -147.1343231201172, "loss": 0.2768, "rewards/accuracies": 1.0, "rewards/chosen": -1.2903612852096558, "rewards/margins": 2.2519383430480957, "rewards/rejected": -3.542299509048462, "step": 5976 }, { "epoch": 0.997663941264809, "grad_norm": 43.7061653137207, "learning_rate": 1.002336058735191e-05, "logits/chosen": -0.7568328976631165, "logits/rejected": -0.7600057721138, "logps/chosen": -103.40011596679688, "logps/rejected": -119.77545928955078, "loss": 0.4438, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.0767714977264404, "rewards/margins": 1.3913499116897583, "rewards/rejected": -3.468121290206909, "step": 5979 }, { "epoch": 0.9981645252794927, "grad_norm": 52.34920120239258, "learning_rate": 1.0018354747205073e-05, "logits/chosen": -0.6468203663825989, "logits/rejected": -0.6560245156288147, "logps/chosen": -68.14534759521484, "logps/rejected": -101.0033187866211, "loss": 1.0014, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.420539528131485, "rewards/margins": 2.060666799545288, "rewards/rejected": -1.640127182006836, "step": 5982 }, { "epoch": 0.9986651092941765, "grad_norm": 8.211867332458496, "learning_rate": 1.0013348907058236e-05, "logits/chosen": -0.666168749332428, "logits/rejected": -0.770773708820343, "logps/chosen": -61.39739990234375, "logps/rejected": -157.11721801757812, "loss": 0.2068, "rewards/accuracies": 1.0, "rewards/chosen": 0.040189869701862335, "rewards/margins": 4.900651454925537, "rewards/rejected": -4.860461235046387, "step": 5985 }, { "epoch": 0.9991656933088603, "grad_norm": 53.18135452270508, "learning_rate": 1.0008343066911396e-05, "logits/chosen": -0.7134937644004822, "logits/rejected": -0.7311527132987976, "logps/chosen": -134.27391052246094, "logps/rejected": -137.89599609375, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": -1.3798189163208008, "rewards/margins": 1.5935115814208984, "rewards/rejected": -2.973330497741699, "step": 5988 }, { "epoch": 0.9996662773235442, "grad_norm": 126.24494171142578, "learning_rate": 1.000333722676456e-05, "logits/chosen": -0.5932393670082092, "logits/rejected": -0.6667121052742004, "logps/chosen": -109.84716033935547, "logps/rejected": -122.84353637695312, "loss": 1.1912, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -2.747086763381958, "rewards/margins": 0.22638647258281708, "rewards/rejected": -2.973473310470581, "step": 5991 }, { "epoch": 1.000166861338228, "grad_norm": 11.692079544067383, "learning_rate": 9.998331386617721e-06, "logits/chosen": -0.8824734687805176, "logits/rejected": -0.8829240798950195, "logps/chosen": -85.66119384765625, "logps/rejected": -91.9903335571289, "loss": 0.24, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.619892954826355, "rewards/margins": 2.0899288654327393, "rewards/rejected": -3.7098217010498047, "step": 5994 }, { "epoch": 1.0006674453529116, "grad_norm": 22.853519439697266, "learning_rate": 9.993325546470883e-06, "logits/chosen": -0.5438070893287659, "logits/rejected": -0.545119047164917, "logps/chosen": -57.91835021972656, "logps/rejected": -68.80912017822266, "loss": 0.499, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -1.4087408781051636, "rewards/margins": 0.8234202265739441, "rewards/rejected": -2.232161045074463, "step": 5997 }, { "epoch": 1.0011680293675955, "grad_norm": 20.210033416748047, "learning_rate": 9.988319706324045e-06, "logits/chosen": -0.5319482684135437, "logits/rejected": -0.5518211126327515, "logps/chosen": -89.40306854248047, "logps/rejected": -114.4407958984375, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": -2.0268688201904297, "rewards/margins": 2.2551050186157227, "rewards/rejected": -4.281973838806152, "step": 6000 }, { "epoch": 1.0011680293675955, "eval_logits/chosen": -0.6805323958396912, "eval_logits/rejected": -0.6946524977684021, "eval_logps/chosen": -96.65752410888672, "eval_logps/rejected": -126.01219940185547, "eval_loss": 0.5788983702659607, "eval_rewards/accuracies": 0.7252252101898193, "eval_rewards/chosen": -1.6399788856506348, "eval_rewards/margins": 1.6484228372573853, "eval_rewards/rejected": -3.2884018421173096, "eval_runtime": 355.9104, "eval_samples_per_second": 7.485, "eval_steps_per_second": 1.871, "step": 6000 } ], "logging_steps": 3, "max_steps": 11986, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 600, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }