diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4044 @@ +{ + "best_metric": 0.6631070971488953, + "best_model_checkpoint": "./output/checkpoints/2024-05-27_09-03-33/checkpoint-1100", + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1271, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.003933910306845004, + "grad_norm": 27.324785232543945, + "learning_rate": 6.25e-07, + "logits/chosen": -0.23312029242515564, + "logits/rejected": -0.7136957049369812, + "logps/chosen": -206.98876953125, + "logps/rejected": -177.72207641601562, + "loss": 0.6946, + "rewards/accuracies": 0.22499999403953552, + "rewards/chosen": -0.0011991311330348253, + "rewards/margins": -0.0031457520090043545, + "rewards/rejected": 0.001946620992384851, + "step": 5 + }, + { + "epoch": 0.007867820613690008, + "grad_norm": 26.920639038085938, + "learning_rate": 1.40625e-06, + "logits/chosen": -0.3985660672187805, + "logits/rejected": -0.7379584908485413, + "logps/chosen": -201.005859375, + "logps/rejected": -177.08181762695312, + "loss": 0.688, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.010929527692496777, + "rewards/margins": 0.013672275468707085, + "rewards/rejected": -0.002742747776210308, + "step": 10 + }, + { + "epoch": 0.011801730920535013, + "grad_norm": 34.40425109863281, + "learning_rate": 2.1875000000000002e-06, + "logits/chosen": -0.35717901587486267, + "logits/rejected": -0.660548746585846, + "logps/chosen": -217.42825317382812, + "logps/rejected": -194.10195922851562, + "loss": 0.6924, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.034006841480731964, + "rewards/margins": 0.0028066448867321014, + "rewards/rejected": 0.03120020031929016, + "step": 15 + }, + { + "epoch": 0.015735641227380016, + "grad_norm": 27.097261428833008, + "learning_rate": 2.96875e-06, + "logits/chosen": -0.3896491825580597, + "logits/rejected": -0.7307055592536926, + "logps/chosen": -209.29373168945312, + "logps/rejected": -179.78488159179688, + "loss": 0.6839, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.08366340398788452, + "rewards/margins": 0.025963936001062393, + "rewards/rejected": 0.05769947171211243, + "step": 20 + }, + { + "epoch": 0.01966955153422502, + "grad_norm": 29.19064712524414, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": -0.24666282534599304, + "logits/rejected": -0.7009283900260925, + "logps/chosen": -196.3118438720703, + "logps/rejected": -178.7552032470703, + "loss": 0.683, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.13745614886283875, + "rewards/margins": 0.03245489299297333, + "rewards/rejected": 0.1050012856721878, + "step": 25 + }, + { + "epoch": 0.023603461841070025, + "grad_norm": 31.083709716796875, + "learning_rate": 4.53125e-06, + "logits/chosen": -0.3193593919277191, + "logits/rejected": -0.6126649379730225, + "logps/chosen": -208.44863891601562, + "logps/rejected": -184.2353057861328, + "loss": 0.6852, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.22270426154136658, + "rewards/margins": 0.03411892056465149, + "rewards/rejected": 0.18858537077903748, + "step": 30 + }, + { + "epoch": 0.02753737214791503, + "grad_norm": 25.83799171447754, + "learning_rate": 5.3125e-06, + "logits/chosen": -0.46783486008644104, + "logits/rejected": -0.7504000067710876, + "logps/chosen": -221.98843383789062, + "logps/rejected": -199.54000854492188, + "loss": 0.6705, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3932109773159027, + "rewards/margins": 0.08941729366779327, + "rewards/rejected": 0.30379369854927063, + "step": 35 + }, + { + "epoch": 0.03147128245476003, + "grad_norm": 24.734338760375977, + "learning_rate": 6.093750000000001e-06, + "logits/chosen": -0.3396364748477936, + "logits/rejected": -0.7113901376724243, + "logps/chosen": -196.3134765625, + "logps/rejected": -179.5933380126953, + "loss": 0.6879, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.4551575779914856, + "rewards/margins": 0.05487058684229851, + "rewards/rejected": 0.4002869725227356, + "step": 40 + }, + { + "epoch": 0.03540519276160504, + "grad_norm": 28.34064292907715, + "learning_rate": 6.718750000000001e-06, + "logits/chosen": -0.667598307132721, + "logits/rejected": -1.014026403427124, + "logps/chosen": -196.5115966796875, + "logps/rejected": -165.67092895507812, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5074445605278015, + "rewards/margins": 0.09343204647302628, + "rewards/rejected": 0.41401252150535583, + "step": 45 + }, + { + "epoch": 0.03933910306845004, + "grad_norm": 30.12347984313965, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": -0.2210284173488617, + "logits/rejected": -0.32401731610298157, + "logps/chosen": -210.63818359375, + "logps/rejected": -205.76895141601562, + "loss": 0.6641, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5923845171928406, + "rewards/margins": 0.12340062856674194, + "rewards/rejected": 0.4689839482307434, + "step": 50 + }, + { + "epoch": 0.043273013375295044, + "grad_norm": 47.19338607788086, + "learning_rate": 8.281250000000001e-06, + "logits/chosen": -0.5629546642303467, + "logits/rejected": -0.7718995213508606, + "logps/chosen": -194.5259552001953, + "logps/rejected": -179.5989532470703, + "loss": 0.6902, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6510985493659973, + "rewards/margins": 0.10642552375793457, + "rewards/rejected": 0.544672966003418, + "step": 55 + }, + { + "epoch": 0.04720692368214005, + "grad_norm": 23.202775955200195, + "learning_rate": 9.0625e-06, + "logits/chosen": -0.3029821217060089, + "logits/rejected": -0.7788914442062378, + "logps/chosen": -214.9969940185547, + "logps/rejected": -167.64263916015625, + "loss": 0.6472, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6703575849533081, + "rewards/margins": 0.22439488768577576, + "rewards/rejected": 0.44596266746520996, + "step": 60 + }, + { + "epoch": 0.05114083398898505, + "grad_norm": 35.26408386230469, + "learning_rate": 9.84375e-06, + "logits/chosen": -0.4141275882720947, + "logits/rejected": -0.7083785533905029, + "logps/chosen": -212.9031524658203, + "logps/rejected": -198.8483428955078, + "loss": 0.662, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7610660791397095, + "rewards/margins": 0.2469903975725174, + "rewards/rejected": 0.514075756072998, + "step": 65 + }, + { + "epoch": 0.05507474429583006, + "grad_norm": 19.10537338256836, + "learning_rate": 1.0625e-05, + "logits/chosen": -0.4033733308315277, + "logits/rejected": -0.7651963829994202, + "logps/chosen": -212.84487915039062, + "logps/rejected": -174.28073120117188, + "loss": 0.6534, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7663796544075012, + "rewards/margins": 0.24841317534446716, + "rewards/rejected": 0.5179664492607117, + "step": 70 + }, + { + "epoch": 0.059008654602675056, + "grad_norm": 26.261890411376953, + "learning_rate": 1.1406250000000001e-05, + "logits/chosen": -0.10389180481433868, + "logits/rejected": -0.5258628129959106, + "logps/chosen": -206.84921264648438, + "logps/rejected": -186.50869750976562, + "loss": 0.6808, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8156352043151855, + "rewards/margins": 0.17864595353603363, + "rewards/rejected": 0.6369892358779907, + "step": 75 + }, + { + "epoch": 0.06294256490952006, + "grad_norm": 32.33486557006836, + "learning_rate": 1.2187500000000001e-05, + "logits/chosen": -0.22502727806568146, + "logits/rejected": -0.49946776032447815, + "logps/chosen": -209.71426391601562, + "logps/rejected": -198.34292602539062, + "loss": 0.6852, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6370053291320801, + "rewards/margins": 0.15727970004081726, + "rewards/rejected": 0.4797256886959076, + "step": 80 + }, + { + "epoch": 0.06687647521636507, + "grad_norm": 38.13333511352539, + "learning_rate": 1.2968750000000002e-05, + "logits/chosen": -0.25742509961128235, + "logits/rejected": -0.7358572483062744, + "logps/chosen": -206.3865966796875, + "logps/rejected": -178.12637329101562, + "loss": 0.6652, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3785225450992584, + "rewards/margins": 0.16723336279392242, + "rewards/rejected": 0.2112891674041748, + "step": 85 + }, + { + "epoch": 0.07081038552321008, + "grad_norm": 23.647096633911133, + "learning_rate": 1.375e-05, + "logits/chosen": -0.3365253806114197, + "logits/rejected": -0.5771717429161072, + "logps/chosen": -208.416748046875, + "logps/rejected": -184.40476989746094, + "loss": 0.7024, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2850777506828308, + "rewards/margins": 0.14986075460910797, + "rewards/rejected": 0.13521698117256165, + "step": 90 + }, + { + "epoch": 0.07474429583005507, + "grad_norm": 22.20098114013672, + "learning_rate": 1.453125e-05, + "logits/chosen": -0.21254411339759827, + "logits/rejected": -0.6303216218948364, + "logps/chosen": -201.83139038085938, + "logps/rejected": -183.7214813232422, + "loss": 0.6843, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3604539632797241, + "rewards/margins": 0.1408630609512329, + "rewards/rejected": 0.2195909023284912, + "step": 95 + }, + { + "epoch": 0.07867820613690008, + "grad_norm": 29.343482971191406, + "learning_rate": 1.5312500000000003e-05, + "logits/chosen": -0.41852107644081116, + "logits/rejected": -0.7636915445327759, + "logps/chosen": -208.08035278320312, + "logps/rejected": -178.69972229003906, + "loss": 0.6731, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.4267478585243225, + "rewards/margins": 0.16477522253990173, + "rewards/rejected": 0.2619726061820984, + "step": 100 + }, + { + "epoch": 0.07867820613690008, + "eval_logits/chosen": 1.3246409893035889, + "eval_logits/rejected": 1.0977884531021118, + "eval_logps/chosen": -206.3737030029297, + "eval_logps/rejected": -179.28366088867188, + "eval_loss": 0.6665228009223938, + "eval_rewards/accuracies": 0.635937511920929, + "eval_rewards/chosen": 0.6386381387710571, + "eval_rewards/margins": 0.19896559417247772, + "eval_rewards/rejected": 0.4396725594997406, + "eval_runtime": 307.3381, + "eval_samples_per_second": 2.082, + "eval_steps_per_second": 0.13, + "step": 100 + }, + { + "epoch": 0.08261211644374508, + "grad_norm": 24.263774871826172, + "learning_rate": 1.609375e-05, + "logits/chosen": -0.16335585713386536, + "logits/rejected": -0.4457281231880188, + "logps/chosen": -201.37017822265625, + "logps/rejected": -176.67379760742188, + "loss": 0.6641, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6822856068611145, + "rewards/margins": 0.18253257870674133, + "rewards/rejected": 0.4997529983520508, + "step": 105 + }, + { + "epoch": 0.08654602675059009, + "grad_norm": 25.775903701782227, + "learning_rate": 1.6875e-05, + "logits/chosen": -0.436201810836792, + "logits/rejected": -0.9347764849662781, + "logps/chosen": -195.61062622070312, + "logps/rejected": -169.15048217773438, + "loss": 0.6596, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7915258407592773, + "rewards/margins": 0.2815794348716736, + "rewards/rejected": 0.5099464654922485, + "step": 110 + }, + { + "epoch": 0.0904799370574351, + "grad_norm": 30.208763122558594, + "learning_rate": 1.7656250000000002e-05, + "logits/chosen": -0.5659558176994324, + "logits/rejected": -0.855063796043396, + "logps/chosen": -198.71206665039062, + "logps/rejected": -174.78524780273438, + "loss": 0.7202, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6501097679138184, + "rewards/margins": 0.17246408760547638, + "rewards/rejected": 0.4776456952095032, + "step": 115 + }, + { + "epoch": 0.0944138473642801, + "grad_norm": 23.550596237182617, + "learning_rate": 1.84375e-05, + "logits/chosen": -0.5133547186851501, + "logits/rejected": -0.734718382358551, + "logps/chosen": -193.6223602294922, + "logps/rejected": -179.42771911621094, + "loss": 0.7313, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.5589785575866699, + "rewards/margins": 0.10251788794994354, + "rewards/rejected": 0.4564606547355652, + "step": 120 + }, + { + "epoch": 0.0983477576711251, + "grad_norm": 29.921533584594727, + "learning_rate": 1.9062500000000003e-05, + "logits/chosen": -0.3889247179031372, + "logits/rejected": -0.6225888133049011, + "logps/chosen": -187.0243377685547, + "logps/rejected": -176.29808044433594, + "loss": 0.6273, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.40915530920028687, + "rewards/margins": 0.3502606451511383, + "rewards/rejected": 0.058894671499729156, + "step": 125 + }, + { + "epoch": 0.1022816679779701, + "grad_norm": 29.90145492553711, + "learning_rate": 1.984375e-05, + "logits/chosen": -0.34609144926071167, + "logits/rejected": -0.7598401308059692, + "logps/chosen": -201.13104248046875, + "logps/rejected": -173.50753784179688, + "loss": 0.6626, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.1300664246082306, + "rewards/margins": 0.21786466240882874, + "rewards/rejected": -0.08779821544885635, + "step": 130 + }, + { + "epoch": 0.10621557828481511, + "grad_norm": 23.906503677368164, + "learning_rate": 1.9999395643917957e-05, + "logits/chosen": -0.41295546293258667, + "logits/rejected": -0.8447906374931335, + "logps/chosen": -201.5752716064453, + "logps/rejected": -165.7244415283203, + "loss": 0.6405, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21347875893115997, + "rewards/margins": 0.3085169196128845, + "rewards/rejected": -0.09503819793462753, + "step": 135 + }, + { + "epoch": 0.11014948859166011, + "grad_norm": 25.38832664489746, + "learning_rate": 1.999694057253083e-05, + "logits/chosen": -0.2702675759792328, + "logits/rejected": -0.6757915019989014, + "logps/chosen": -198.8104705810547, + "logps/rejected": -175.73355102539062, + "loss": 0.6331, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.7145684361457825, + "rewards/margins": 0.3979041576385498, + "rewards/rejected": 0.31666427850723267, + "step": 140 + }, + { + "epoch": 0.11408339889850512, + "grad_norm": 25.388601303100586, + "learning_rate": 1.9992597476892096e-05, + "logits/chosen": -0.20559760928153992, + "logits/rejected": -0.6221147775650024, + "logps/chosen": -203.33877563476562, + "logps/rejected": -177.6593780517578, + "loss": 0.6278, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.8970493078231812, + "rewards/margins": 0.41804951429367065, + "rewards/rejected": 0.4789998531341553, + "step": 145 + }, + { + "epoch": 0.11801730920535011, + "grad_norm": 26.157350540161133, + "learning_rate": 1.9986367177239688e-05, + "logits/chosen": -0.34933823347091675, + "logits/rejected": -0.5474187135696411, + "logps/chosen": -192.22409057617188, + "logps/rejected": -179.11972045898438, + "loss": 0.7403, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7421566843986511, + "rewards/margins": 0.24275951087474823, + "rewards/rejected": 0.4993972182273865, + "step": 150 + }, + { + "epoch": 0.12195121951219512, + "grad_norm": 27.657987594604492, + "learning_rate": 1.9978250850229278e-05, + "logits/chosen": -0.5602678060531616, + "logits/rejected": -0.7431076765060425, + "logps/chosen": -197.28172302246094, + "logps/rejected": -180.0853271484375, + "loss": 0.718, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6257942914962769, + "rewards/margins": 0.286087304353714, + "rewards/rejected": 0.3397069573402405, + "step": 155 + }, + { + "epoch": 0.12588512981904013, + "grad_norm": 27.8662166595459, + "learning_rate": 1.996825002871205e-05, + "logits/chosen": -0.3598572611808777, + "logits/rejected": -0.8388012647628784, + "logps/chosen": -192.58541870117188, + "logps/rejected": -165.87228393554688, + "loss": 0.6815, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.49491995573043823, + "rewards/margins": 0.3221299648284912, + "rewards/rejected": 0.1727900207042694, + "step": 160 + }, + { + "epoch": 0.12981904012588513, + "grad_norm": 21.444156646728516, + "learning_rate": 1.9956366601445212e-05, + "logits/chosen": -0.18239173293113708, + "logits/rejected": -0.6315879225730896, + "logps/chosen": -214.19509887695312, + "logps/rejected": -185.4246368408203, + "loss": 0.6328, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5859188437461853, + "rewards/margins": 0.4131649136543274, + "rewards/rejected": 0.1727539300918579, + "step": 165 + }, + { + "epoch": 0.13375295043273014, + "grad_norm": 22.295812606811523, + "learning_rate": 1.994260281273529e-05, + "logits/chosen": -0.27679482102394104, + "logits/rejected": -0.7712021470069885, + "logps/chosen": -206.1096954345703, + "logps/rejected": -173.62576293945312, + "loss": 0.6613, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.6996821165084839, + "rewards/margins": 0.3059811294078827, + "rewards/rejected": 0.3937010169029236, + "step": 170 + }, + { + "epoch": 0.13768686073957515, + "grad_norm": 33.50761413574219, + "learning_rate": 1.9926961262014237e-05, + "logits/chosen": -0.3116024136543274, + "logits/rejected": -0.625832736492157, + "logps/chosen": -219.8788604736328, + "logps/rejected": -187.32510375976562, + "loss": 0.746, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1785697937011719, + "rewards/margins": 0.21889865398406982, + "rewards/rejected": 0.9596711993217468, + "step": 175 + }, + { + "epoch": 0.14162077104642015, + "grad_norm": 15.657761573791504, + "learning_rate": 1.9909444903348546e-05, + "logits/chosen": -0.005524394102394581, + "logits/rejected": -0.3487216532230377, + "logps/chosen": -228.5839385986328, + "logps/rejected": -201.77001953125, + "loss": 0.7435, + "rewards/accuracies": 0.5625, + "rewards/chosen": 1.1422548294067383, + "rewards/margins": 0.15804262459278107, + "rewards/rejected": 0.9842122793197632, + "step": 180 + }, + { + "epoch": 0.14555468135326516, + "grad_norm": 26.140518188476562, + "learning_rate": 1.9890057044881308e-05, + "logits/chosen": -0.12314258515834808, + "logits/rejected": -0.5814956426620483, + "logps/chosen": -201.1555633544922, + "logps/rejected": -167.4046173095703, + "loss": 0.6795, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.0470006465911865, + "rewards/margins": 0.35150283575057983, + "rewards/rejected": 0.6954978108406067, + "step": 185 + }, + { + "epoch": 0.14948859166011014, + "grad_norm": 19.782007217407227, + "learning_rate": 1.9868801348207467e-05, + "logits/chosen": -0.11235501617193222, + "logits/rejected": -0.5538455247879028, + "logps/chosen": -204.25839233398438, + "logps/rejected": -181.46743774414062, + "loss": 0.685, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 1.1285860538482666, + "rewards/margins": 0.33753544092178345, + "rewards/rejected": 0.7910505533218384, + "step": 190 + }, + { + "epoch": 0.15342250196695514, + "grad_norm": 19.97163200378418, + "learning_rate": 1.9845681827682263e-05, + "logits/chosen": -0.16671855747699738, + "logits/rejected": -0.540806233882904, + "logps/chosen": -194.2422332763672, + "logps/rejected": -163.8104705810547, + "loss": 0.6713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6743755340576172, + "rewards/margins": 0.26031339168548584, + "rewards/rejected": 0.4140622019767761, + "step": 195 + }, + { + "epoch": 0.15735641227380015, + "grad_norm": 18.71397590637207, + "learning_rate": 1.982070284966309e-05, + "logits/chosen": -0.1493137627840042, + "logits/rejected": -0.43618321418762207, + "logps/chosen": -202.78318786621094, + "logps/rejected": -177.56668090820312, + "loss": 0.6528, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5738676190376282, + "rewards/margins": 0.31430238485336304, + "rewards/rejected": 0.25956520438194275, + "step": 200 + }, + { + "epoch": 0.15735641227380015, + "eval_logits/chosen": 1.3314845561981201, + "eval_logits/rejected": 1.1080169677734375, + "eval_logps/chosen": -206.4569549560547, + "eval_logps/rejected": -179.43057250976562, + "eval_loss": 0.6942009329795837, + "eval_rewards/accuracies": 0.604687511920929, + "eval_rewards/chosen": 0.6053363680839539, + "eval_rewards/margins": 0.2244330197572708, + "eval_rewards/rejected": 0.38090336322784424, + "eval_runtime": 309.8464, + "eval_samples_per_second": 2.066, + "eval_steps_per_second": 0.129, + "step": 200 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 15.415759086608887, + "learning_rate": 1.9793869131684884e-05, + "logits/chosen": -0.08272367715835571, + "logits/rejected": -0.4305300712585449, + "logps/chosen": -196.86305236816406, + "logps/rejected": -178.54037475585938, + "loss": 0.7078, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5516935586929321, + "rewards/margins": 0.21864008903503418, + "rewards/rejected": 0.33305343985557556, + "step": 205 + }, + { + "epoch": 0.16522423288749016, + "grad_norm": 28.38641929626465, + "learning_rate": 1.9765185741569126e-05, + "logits/chosen": -0.14836929738521576, + "logits/rejected": -0.4139153063297272, + "logps/chosen": -215.8746795654297, + "logps/rejected": -190.37954711914062, + "loss": 0.7474, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.6253814697265625, + "rewards/margins": 0.11703801155090332, + "rewards/rejected": 0.5083434581756592, + "step": 210 + }, + { + "epoch": 0.16915814319433517, + "grad_norm": 23.663591384887695, + "learning_rate": 1.9734658096466774e-05, + "logits/chosen": 0.011041751131415367, + "logits/rejected": -0.4074042737483978, + "logps/chosen": -209.1394500732422, + "logps/rejected": -178.0277099609375, + "loss": 0.6711, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3468799591064453, + "rewards/margins": 0.2509341835975647, + "rewards/rejected": 0.09594579041004181, + "step": 215 + }, + { + "epoch": 0.17309205350118018, + "grad_norm": 32.677852630615234, + "learning_rate": 1.970229196183516e-05, + "logits/chosen": -0.020372604951262474, + "logits/rejected": -0.37563034892082214, + "logps/chosen": -209.47402954101562, + "logps/rejected": -177.0091094970703, + "loss": 0.6983, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.594801127910614, + "rewards/margins": 0.25161081552505493, + "rewards/rejected": 0.34319034218788147, + "step": 220 + }, + { + "epoch": 0.17702596380802518, + "grad_norm": 22.306182861328125, + "learning_rate": 1.9668093450349125e-05, + "logits/chosen": -0.1756196916103363, + "logits/rejected": -0.5201798677444458, + "logps/chosen": -217.6730499267578, + "logps/rejected": -185.24819946289062, + "loss": 0.6923, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.9194382429122925, + "rewards/margins": 0.3321036696434021, + "rewards/rejected": 0.5873345136642456, + "step": 225 + }, + { + "epoch": 0.1809598741148702, + "grad_norm": 31.994035720825195, + "learning_rate": 1.9632069020746574e-05, + "logits/chosen": -0.3013627529144287, + "logits/rejected": -0.7145218849182129, + "logps/chosen": -206.0642547607422, + "logps/rejected": -178.27896118164062, + "loss": 0.6459, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8781298398971558, + "rewards/margins": 0.5241779088973999, + "rewards/rejected": 0.353952020406723, + "step": 230 + }, + { + "epoch": 0.1848937844217152, + "grad_norm": 29.714988708496094, + "learning_rate": 1.959422547660869e-05, + "logits/chosen": -0.2492908537387848, + "logits/rejected": -0.779377818107605, + "logps/chosen": -198.94345092773438, + "logps/rejected": -169.714599609375, + "loss": 0.6366, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5711004734039307, + "rewards/margins": 0.413928359746933, + "rewards/rejected": 0.15717211365699768, + "step": 235 + }, + { + "epoch": 0.1888276947285602, + "grad_norm": 24.506587982177734, + "learning_rate": 1.955456996507499e-05, + "logits/chosen": -0.019927600398659706, + "logits/rejected": -0.43524104356765747, + "logps/chosen": -197.2928009033203, + "logps/rejected": -168.06382751464844, + "loss": 0.6361, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.787044107913971, + "rewards/margins": 0.3754611909389496, + "rewards/rejected": 0.4115828573703766, + "step": 240 + }, + { + "epoch": 0.19276160503540518, + "grad_norm": 24.652503967285156, + "learning_rate": 1.9513109975493553e-05, + "logits/chosen": -0.30659085512161255, + "logits/rejected": -0.6158447265625, + "logps/chosen": -207.3615264892578, + "logps/rejected": -198.04635620117188, + "loss": 0.6338, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.8901578783988953, + "rewards/margins": 0.45508089661598206, + "rewards/rejected": 0.4350770115852356, + "step": 245 + }, + { + "epoch": 0.1966955153422502, + "grad_norm": 22.106698989868164, + "learning_rate": 1.9469853338006515e-05, + "logits/chosen": -0.07243610918521881, + "logits/rejected": -0.2781897187232971, + "logps/chosen": -203.30215454101562, + "logps/rejected": -188.57080078125, + "loss": 0.7046, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.7840886116027832, + "rewards/margins": 0.2757692337036133, + "rewards/rejected": 0.5083193778991699, + "step": 250 + }, + { + "epoch": 0.2006294256490952, + "grad_norm": 17.76561164855957, + "learning_rate": 1.9424808222071337e-05, + "logits/chosen": -0.1372375786304474, + "logits/rejected": -0.4728778898715973, + "logps/chosen": -218.58462524414062, + "logps/rejected": -192.29983520507812, + "loss": 0.622, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.8517538905143738, + "rewards/margins": 0.47115468978881836, + "rewards/rejected": 0.3805992603302002, + "step": 255 + }, + { + "epoch": 0.2045633359559402, + "grad_norm": 21.741724014282227, + "learning_rate": 1.9377983134917868e-05, + "logits/chosen": -0.42930954694747925, + "logits/rejected": -0.6508566737174988, + "logps/chosen": -196.40382385253906, + "logps/rejected": -180.81784057617188, + "loss": 0.6814, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.42375677824020386, + "rewards/margins": 0.3472265601158142, + "rewards/rejected": 0.07653021067380905, + "step": 260 + }, + { + "epoch": 0.2084972462627852, + "grad_norm": 25.856201171875, + "learning_rate": 1.9329386919941694e-05, + "logits/chosen": -0.5100887417793274, + "logits/rejected": -0.896782398223877, + "logps/chosen": -200.4944610595703, + "logps/rejected": -168.5055694580078, + "loss": 0.631, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.5903893709182739, + "rewards/margins": 0.45923447608947754, + "rewards/rejected": 0.13115492463111877, + "step": 265 + }, + { + "epoch": 0.21243115656963021, + "grad_norm": 21.10732078552246, + "learning_rate": 1.927902875503397e-05, + "logits/chosen": -0.2257436066865921, + "logits/rejected": -0.6618258953094482, + "logps/chosen": -216.7244415283203, + "logps/rejected": -172.3234405517578, + "loss": 0.6436, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.9558561444282532, + "rewards/margins": 0.5136295557022095, + "rewards/rejected": 0.44222649931907654, + "step": 270 + }, + { + "epoch": 0.21636506687647522, + "grad_norm": 21.297080993652344, + "learning_rate": 1.9226918150848067e-05, + "logits/chosen": -0.325428307056427, + "logits/rejected": -0.6309774518013, + "logps/chosen": -190.9318389892578, + "logps/rejected": -179.4983673095703, + "loss": 0.6904, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.1083195209503174, + "rewards/margins": 0.31002935767173767, + "rewards/rejected": 0.7982901334762573, + "step": 275 + }, + { + "epoch": 0.22029897718332023, + "grad_norm": 21.540422439575195, + "learning_rate": 1.9173064949003408e-05, + "logits/chosen": -0.05009857565164566, + "logits/rejected": -0.3596547245979309, + "logps/chosen": -200.29823303222656, + "logps/rejected": -180.3629150390625, + "loss": 0.6645, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.4049633741378784, + "rewards/margins": 0.5104038119316101, + "rewards/rejected": 0.8945595026016235, + "step": 280 + }, + { + "epoch": 0.22423288749016523, + "grad_norm": NaN, + "learning_rate": 1.9128734540932494e-05, + "logits/chosen": -0.3485383987426758, + "logits/rejected": -0.5194178223609924, + "logps/chosen": -197.75784301757812, + "logps/rejected": -181.0018768310547, + "loss": 0.7351, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.2131370306015015, + "rewards/margins": 0.22653412818908691, + "rewards/rejected": 0.9866029620170593, + "step": 285 + }, + { + "epoch": 0.22816679779701024, + "grad_norm": 24.915868759155273, + "learning_rate": 1.9071770513468988e-05, + "logits/chosen": -0.17852464318275452, + "logits/rejected": -0.35372194647789, + "logps/chosen": -193.89865112304688, + "logps/rejected": -187.19973754882812, + "loss": 0.7047, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.9371153116226196, + "rewards/margins": 0.20712292194366455, + "rewards/rejected": 0.7299925088882446, + "step": 290 + }, + { + "epoch": 0.23210070810385522, + "grad_norm": 19.513757705688477, + "learning_rate": 1.901309318956141e-05, + "logits/chosen": -0.4217872619628906, + "logits/rejected": -0.7518173456192017, + "logps/chosen": -194.53421020507812, + "logps/rejected": -168.0951385498047, + "loss": 0.7308, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6975895166397095, + "rewards/margins": 0.23306536674499512, + "rewards/rejected": 0.46452417969703674, + "step": 295 + }, + { + "epoch": 0.23603461841070023, + "grad_norm": 18.220582962036133, + "learning_rate": 1.8952713651021227e-05, + "logits/chosen": -0.14223751425743103, + "logits/rejected": -0.4979272484779358, + "logps/chosen": -199.91549682617188, + "logps/rejected": -177.2222900390625, + "loss": 0.6827, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.819624125957489, + "rewards/margins": 0.39503517746925354, + "rewards/rejected": 0.42458897829055786, + "step": 300 + }, + { + "epoch": 0.23603461841070023, + "eval_logits/chosen": 1.2563122510910034, + "eval_logits/rejected": 1.0339769124984741, + "eval_logps/chosen": -206.1991424560547, + "eval_logps/rejected": -179.33786010742188, + "eval_loss": 0.7167426347732544, + "eval_rewards/accuracies": 0.6171875, + "eval_rewards/chosen": 0.708461582660675, + "eval_rewards/margins": 0.2904762327671051, + "eval_rewards/rejected": 0.41798537969589233, + "eval_runtime": 284.7459, + "eval_samples_per_second": 2.248, + "eval_steps_per_second": 0.14, + "step": 300 + }, + { + "epoch": 0.23996852871754523, + "grad_norm": 23.576587677001953, + "learning_rate": 1.8890643301140487e-05, + "logits/chosen": -0.5384713411331177, + "logits/rejected": -0.8448705673217773, + "logps/chosen": -197.2958526611328, + "logps/rejected": -165.64370727539062, + "loss": 0.6409, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.6531845331192017, + "rewards/margins": 0.39299410581588745, + "rewards/rejected": 0.2601904273033142, + "step": 305 + }, + { + "epoch": 0.24390243902439024, + "grad_norm": 18.40612030029297, + "learning_rate": 1.8826893862538233e-05, + "logits/chosen": -0.3022890090942383, + "logits/rejected": -0.5158249735832214, + "logps/chosen": -207.9346160888672, + "logps/rejected": -193.0900115966797, + "loss": 0.7895, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.7925726771354675, + "rewards/margins": 0.08937112987041473, + "rewards/rejected": 0.7032015919685364, + "step": 310 + }, + { + "epoch": 0.24783634933123525, + "grad_norm": 18.7589168548584, + "learning_rate": 1.8761477374946548e-05, + "logits/chosen": -0.12031130492687225, + "logits/rejected": -0.4747944474220276, + "logps/chosen": -211.0299530029297, + "logps/rejected": -186.3873291015625, + "loss": 0.6952, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.9918599128723145, + "rewards/margins": 0.28354746103286743, + "rewards/rejected": 0.7083123922348022, + "step": 315 + }, + { + "epoch": 0.25177025963808025, + "grad_norm": 20.57366180419922, + "learning_rate": 1.869440619293672e-05, + "logits/chosen": 0.015002071857452393, + "logits/rejected": -0.4523535668849945, + "logps/chosen": -215.18704223632812, + "logps/rejected": -179.958984375, + "loss": 0.6336, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.9286333918571472, + "rewards/margins": 0.4492555558681488, + "rewards/rejected": 0.4793778359889984, + "step": 320 + }, + { + "epoch": 0.25570416994492523, + "grad_norm": 24.69734001159668, + "learning_rate": 1.8625692983585976e-05, + "logits/chosen": -0.3278903663158417, + "logits/rejected": -0.9296085238456726, + "logps/chosen": -212.3651580810547, + "logps/rejected": -168.00753784179688, + "loss": 0.6633, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.7741891741752625, + "rewards/margins": 0.3930490016937256, + "rewards/rejected": 0.38114017248153687, + "step": 325 + }, + { + "epoch": 0.25963808025177026, + "grad_norm": 27.854631423950195, + "learning_rate": 1.855535072408516e-05, + "logits/chosen": -0.4728453755378723, + "logits/rejected": -0.6778625249862671, + "logps/chosen": -211.9385528564453, + "logps/rejected": -193.85667419433594, + "loss": 0.6953, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9169828295707703, + "rewards/margins": 0.32869625091552734, + "rewards/rejected": 0.5882865786552429, + "step": 330 + }, + { + "epoch": 0.26357199055861524, + "grad_norm": 18.423259735107422, + "learning_rate": 1.8483392699287858e-05, + "logits/chosen": -0.05396045371890068, + "logits/rejected": -0.5624040365219116, + "logps/chosen": -222.1643524169922, + "logps/rejected": -177.35289001464844, + "loss": 0.6206, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 1.1117911338806152, + "rewards/margins": 0.5459399223327637, + "rewards/rejected": 0.5658511519432068, + "step": 335 + }, + { + "epoch": 0.2675059008654603, + "grad_norm": 23.744850158691406, + "learning_rate": 1.840983249920143e-05, + "logits/chosen": -0.3244122564792633, + "logits/rejected": -0.5297374725341797, + "logps/chosen": -196.14691162109375, + "logps/rejected": -188.9138946533203, + "loss": 0.7056, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.8795296549797058, + "rewards/margins": 0.3909408748149872, + "rewards/rejected": 0.488588809967041, + "step": 340 + }, + { + "epoch": 0.27143981117230526, + "grad_norm": 18.513778686523438, + "learning_rate": 1.8334684016420383e-05, + "logits/chosen": -0.08137266337871552, + "logits/rejected": -0.5458197593688965, + "logps/chosen": -232.447509765625, + "logps/rejected": -191.580078125, + "loss": 0.6264, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.163153052330017, + "rewards/margins": 0.4738085865974426, + "rewards/rejected": 0.6893445253372192, + "step": 345 + }, + { + "epoch": 0.2753737214791503, + "grad_norm": 15.827184677124023, + "learning_rate": 1.8257961443502626e-05, + "logits/chosen": -0.30110448598861694, + "logits/rejected": -0.6258831024169922, + "logps/chosen": -190.89808654785156, + "logps/rejected": -173.31884765625, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.946982204914093, + "rewards/margins": 0.39443182945251465, + "rewards/rejected": 0.5525503754615784, + "step": 350 + }, + { + "epoch": 0.27930763178599527, + "grad_norm": 19.0930118560791, + "learning_rate": 1.8179679270289048e-05, + "logits/chosen": -0.2574307322502136, + "logits/rejected": -0.7561649680137634, + "logps/chosen": -201.4808349609375, + "logps/rejected": -172.31173706054688, + "loss": 0.6453, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 1.1022285223007202, + "rewards/margins": 0.5637288689613342, + "rewards/rejected": 0.5384997129440308, + "step": 355 + }, + { + "epoch": 0.2832415420928403, + "grad_norm": 22.383216857910156, + "learning_rate": 1.8099852281166974e-05, + "logits/chosen": -0.2120940238237381, + "logits/rejected": -0.7636501789093018, + "logps/chosen": -209.04806518554688, + "logps/rejected": -166.7012481689453, + "loss": 0.6576, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.3941724300384521, + "rewards/margins": 0.5536119341850281, + "rewards/rejected": 0.8405605554580688, + "step": 360 + }, + { + "epoch": 0.2871754523996853, + "grad_norm": 18.3509578704834, + "learning_rate": 1.8018495552277987e-05, + "logits/chosen": 0.07260416448116302, + "logits/rejected": -0.2597780227661133, + "logps/chosen": -208.8731689453125, + "logps/rejected": -187.85023498535156, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.4085180759429932, + "rewards/margins": 0.5379746556282043, + "rewards/rejected": 0.8705434799194336, + "step": 365 + }, + { + "epoch": 0.2911093627065303, + "grad_norm": 21.863872528076172, + "learning_rate": 1.7935624448670625e-05, + "logits/chosen": -0.4248635172843933, + "logits/rejected": -0.4336097836494446, + "logps/chosen": -179.680908203125, + "logps/rejected": -173.14013671875, + "loss": 0.75, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 1.0196665525436401, + "rewards/margins": 0.20690293610095978, + "rewards/rejected": 0.8127636909484863, + "step": 370 + }, + { + "epoch": 0.2950432730133753, + "grad_norm": 26.93684196472168, + "learning_rate": 1.785125462139855e-05, + "logits/chosen": -0.16947659850120544, + "logits/rejected": -0.451927125453949, + "logps/chosen": -198.48106384277344, + "logps/rejected": -174.99111938476562, + "loss": 0.7696, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1577861309051514, + "rewards/margins": 0.21412566304206848, + "rewards/rejected": 0.9436607360839844, + "step": 375 + }, + { + "epoch": 0.2989771833202203, + "grad_norm": 15.670443534851074, + "learning_rate": 1.7765402004564687e-05, + "logits/chosen": -0.1878432035446167, + "logits/rejected": -0.5365083813667297, + "logps/chosen": -204.27255249023438, + "logps/rejected": -175.6739959716797, + "loss": 0.6793, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 1.1427654027938843, + "rewards/margins": 0.44054698944091797, + "rewards/rejected": 0.7022184133529663, + "step": 380 + }, + { + "epoch": 0.3029110936270653, + "grad_norm": 20.738510131835938, + "learning_rate": 1.76780828123119e-05, + "logits/chosen": -0.22227105498313904, + "logits/rejected": -0.4939172863960266, + "logps/chosen": -204.56930541992188, + "logps/rejected": -187.81863403320312, + "loss": 0.6359, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.086004376411438, + "rewards/margins": 0.5049671530723572, + "rewards/rejected": 0.5810372233390808, + "step": 385 + }, + { + "epoch": 0.3068450039339103, + "grad_norm": 15.985719680786133, + "learning_rate": 1.7589313535760787e-05, + "logits/chosen": -0.33505499362945557, + "logits/rejected": -0.5057377219200134, + "logps/chosen": -203.09201049804688, + "logps/rejected": -186.1582489013672, + "loss": 0.728, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.9475752115249634, + "rewards/margins": 0.21062707901000977, + "rewards/rejected": 0.7369481325149536, + "step": 390 + }, + { + "epoch": 0.3107789142407553, + "grad_norm": 15.00536823272705, + "learning_rate": 1.7499110939895162e-05, + "logits/chosen": -0.2682803273200989, + "logits/rejected": -0.6644273400306702, + "logps/chosen": -197.18655395507812, + "logps/rejected": -184.64974975585938, + "loss": 0.7331, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4725784361362457, + "rewards/margins": 0.17313337326049805, + "rewards/rejected": 0.29944509267807007, + "step": 395 + }, + { + "epoch": 0.3147128245476003, + "grad_norm": 18.541942596435547, + "learning_rate": 1.7407492060395835e-05, + "logits/chosen": -0.3485754132270813, + "logits/rejected": -0.6408174633979797, + "logps/chosen": -196.4596710205078, + "logps/rejected": -178.34701538085938, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.33864206075668335, + "rewards/margins": 0.22059743106365204, + "rewards/rejected": 0.11804463714361191, + "step": 400 + }, + { + "epoch": 0.3147128245476003, + "eval_logits/chosen": 1.2971076965332031, + "eval_logits/rejected": 1.0804717540740967, + "eval_logps/chosen": -207.33456420898438, + "eval_logps/rejected": -180.31930541992188, + "eval_loss": 0.7093836069107056, + "eval_rewards/accuracies": 0.598437488079071, + "eval_rewards/chosen": 0.2542892098426819, + "eval_rewards/margins": 0.22887463867664337, + "eval_rewards/rejected": 0.025414561852812767, + "eval_runtime": 301.2073, + "eval_samples_per_second": 2.125, + "eval_steps_per_second": 0.133, + "step": 400 + }, + { + "epoch": 0.31864673485444533, + "grad_norm": 22.79604148864746, + "learning_rate": 1.731447420042321e-05, + "logits/chosen": -0.33927303552627563, + "logits/rejected": -0.5682342052459717, + "logps/chosen": -190.31930541992188, + "logps/rejected": -173.07032775878906, + "loss": 0.7979, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0728757381439209, + "rewards/margins": 0.02971130609512329, + "rewards/rejected": 0.0431644432246685, + "step": 405 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 22.005783081054688, + "learning_rate": 1.7220074927349452e-05, + "logits/chosen": -0.3349539339542389, + "logits/rejected": -0.6785364151000977, + "logps/chosen": -205.6999969482422, + "logps/rejected": -174.34982299804688, + "loss": 0.6723, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.04839733988046646, + "rewards/margins": 0.2823019027709961, + "rewards/rejected": -0.23390455543994904, + "step": 410 + }, + { + "epoch": 0.32651455546813535, + "grad_norm": 18.50445556640625, + "learning_rate": 1.712431206944067e-05, + "logits/chosen": -0.31676384806632996, + "logits/rejected": -0.47476306557655334, + "logps/chosen": -194.7633056640625, + "logps/rejected": -185.64987182617188, + "loss": 0.6637, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.2732119560241699, + "rewards/margins": 0.3499985337257385, + "rewards/rejected": -0.07678655534982681, + "step": 415 + }, + { + "epoch": 0.3304484657749803, + "grad_norm": 21.16750144958496, + "learning_rate": 1.7027203712489902e-05, + "logits/chosen": -0.22730335593223572, + "logits/rejected": -0.6324140429496765, + "logps/chosen": -209.23678588867188, + "logps/rejected": -177.7320098876953, + "loss": 0.7066, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.41964513063430786, + "rewards/margins": 0.263131707906723, + "rewards/rejected": 0.15651337802410126, + "step": 420 + }, + { + "epoch": 0.33438237608182536, + "grad_norm": 21.21584129333496, + "learning_rate": 1.6928768196401403e-05, + "logits/chosen": -0.19787462055683136, + "logits/rejected": -0.5100497007369995, + "logps/chosen": -213.1494140625, + "logps/rejected": -194.2113800048828, + "loss": 0.7113, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.434047132730484, + "rewards/margins": 0.20316064357757568, + "rewards/rejected": 0.2308865338563919, + "step": 425 + }, + { + "epoch": 0.33831628638867034, + "grad_norm": 26.320444107055664, + "learning_rate": 1.682902411172698e-05, + "logits/chosen": -0.27940934896469116, + "logits/rejected": -0.6819210052490234, + "logps/chosen": -191.19189453125, + "logps/rejected": -160.06234741210938, + "loss": 0.672, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6531416177749634, + "rewards/margins": 0.32751747965812683, + "rewards/rejected": 0.32562416791915894, + "step": 430 + }, + { + "epoch": 0.3422501966955153, + "grad_norm": 16.507688522338867, + "learning_rate": 1.6727990296154962e-05, + "logits/chosen": -0.43093472719192505, + "logits/rejected": -0.6659766435623169, + "logps/chosen": -194.37916564941406, + "logps/rejected": -175.87298583984375, + "loss": 0.6782, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.9038249254226685, + "rewards/margins": 0.3305993974208832, + "rewards/rejected": 0.5732254385948181, + "step": 435 + }, + { + "epoch": 0.34618410700236035, + "grad_norm": 15.00309944152832, + "learning_rate": 1.6625685830952533e-05, + "logits/chosen": -0.017139725387096405, + "logits/rejected": -0.5116509199142456, + "logps/chosen": -203.77554321289062, + "logps/rejected": -166.87571716308594, + "loss": 0.6715, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.8991168141365051, + "rewards/margins": 0.4240299165248871, + "rewards/rejected": 0.47508686780929565, + "step": 440 + }, + { + "epoch": 0.35011801730920533, + "grad_norm": 22.238525390625, + "learning_rate": 1.6522130037362018e-05, + "logits/chosen": -0.4809524416923523, + "logits/rejected": -0.77618408203125, + "logps/chosen": -183.9463348388672, + "logps/rejected": -168.94070434570312, + "loss": 0.7005, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.9337043762207031, + "rewards/margins": 0.2749274969100952, + "rewards/rejected": 0.6587768197059631, + "step": 445 + }, + { + "epoch": 0.35405192761605037, + "grad_norm": 17.745378494262695, + "learning_rate": 1.641734247295189e-05, + "logits/chosen": -0.4837673306465149, + "logits/rejected": -0.8133207559585571, + "logps/chosen": -187.5880126953125, + "logps/rejected": -172.59933471679688, + "loss": 0.6777, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.9450648427009583, + "rewards/margins": 0.3392513394355774, + "rewards/rejected": 0.6058135032653809, + "step": 450 + }, + { + "epoch": 0.35798583792289534, + "grad_norm": 21.806243896484375, + "learning_rate": 1.63113429279231e-05, + "logits/chosen": -0.3670351207256317, + "logits/rejected": -0.7418017387390137, + "logps/chosen": -221.2038116455078, + "logps/rejected": -184.3399200439453, + "loss": 0.7212, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8858639001846313, + "rewards/margins": 0.2686173915863037, + "rewards/rejected": 0.6172465085983276, + "step": 455 + }, + { + "epoch": 0.3619197482297404, + "grad_norm": 19.19058609008789, + "learning_rate": 1.6204151421371504e-05, + "logits/chosen": -0.5260201692581177, + "logits/rejected": -0.887170672416687, + "logps/chosen": -198.56930541992188, + "logps/rejected": -170.34158325195312, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6595619320869446, + "rewards/margins": 0.25892138481140137, + "rewards/rejected": 0.4006405472755432, + "step": 460 + }, + { + "epoch": 0.36585365853658536, + "grad_norm": 16.740882873535156, + "learning_rate": 1.609578819750708e-05, + "logits/chosen": -0.21146011352539062, + "logits/rejected": -0.41337770223617554, + "logps/chosen": -186.92779541015625, + "logps/rejected": -183.7529754638672, + "loss": 0.6911, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.568415641784668, + "rewards/margins": 0.27034991979599, + "rewards/rejected": 0.298065721988678, + "step": 465 + }, + { + "epoch": 0.3697875688434304, + "grad_norm": 22.620988845825195, + "learning_rate": 1.5986273721830557e-05, + "logits/chosen": -0.17011170089244843, + "logits/rejected": -0.5642642974853516, + "logps/chosen": -206.16073608398438, + "logps/rejected": -187.0243377685547, + "loss": 0.73, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.44800883531570435, + "rewards/margins": 0.19431404769420624, + "rewards/rejected": 0.2536947727203369, + "step": 470 + }, + { + "epoch": 0.37372147915027537, + "grad_norm": 19.39198112487793, + "learning_rate": 1.587562867726832e-05, + "logits/chosen": -0.18244773149490356, + "logits/rejected": -0.5230101346969604, + "logps/chosen": -223.02371215820312, + "logps/rejected": -198.8177032470703, + "loss": 0.6721, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.4324628710746765, + "rewards/margins": 0.22442837059497833, + "rewards/rejected": 0.20803451538085938, + "step": 475 + }, + { + "epoch": 0.3776553894571204, + "grad_norm": 19.32149314880371, + "learning_rate": 1.5763873960266236e-05, + "logits/chosen": -0.29324209690093994, + "logits/rejected": -0.5279776453971863, + "logps/chosen": -206.15469360351562, + "logps/rejected": -188.80137634277344, + "loss": 0.6942, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.4868395924568176, + "rewards/margins": 0.33774086833000183, + "rewards/rejected": 0.14909867942333221, + "step": 480 + }, + { + "epoch": 0.3815892997639654, + "grad_norm": 19.483469009399414, + "learning_rate": 1.5673685398812467e-05, + "logits/chosen": -0.1828387826681137, + "logits/rejected": -0.41064882278442383, + "logps/chosen": -217.49295043945312, + "logps/rejected": -198.88177490234375, + "loss": 0.7507, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 1.0268093347549438, + "rewards/margins": 0.21613208949565887, + "rewards/rejected": 0.810677170753479, + "step": 485 + }, + { + "epoch": 0.38552321007081036, + "grad_norm": 28.394817352294922, + "learning_rate": 1.555998659687541e-05, + "logits/chosen": -0.49702200293540955, + "logits/rejected": -1.0014259815216064, + "logps/chosen": -197.88128662109375, + "logps/rejected": -160.67999267578125, + "loss": 0.6519, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.9069635272026062, + "rewards/margins": 0.40647339820861816, + "rewards/rejected": 0.5004900693893433, + "step": 490 + }, + { + "epoch": 0.3894571203776554, + "grad_norm": 20.914031982421875, + "learning_rate": 1.544523773472669e-05, + "logits/chosen": 0.02130720391869545, + "logits/rejected": -0.4486933648586273, + "logps/chosen": -211.362060546875, + "logps/rejected": -175.72430419921875, + "loss": 0.685, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 1.1584622859954834, + "rewards/margins": 0.3790398836135864, + "rewards/rejected": 0.7794222831726074, + "step": 495 + }, + { + "epoch": 0.3933910306845004, + "grad_norm": 14.320610046386719, + "learning_rate": 1.532946048386001e-05, + "logits/chosen": -0.010864943265914917, + "logits/rejected": -0.5150319337844849, + "logps/chosen": -207.92333984375, + "logps/rejected": -178.11700439453125, + "loss": 0.6224, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.1484225988388062, + "rewards/margins": 0.4593985676765442, + "rewards/rejected": 0.6890240907669067, + "step": 500 + }, + { + "epoch": 0.3933910306845004, + "eval_logits/chosen": 1.327344536781311, + "eval_logits/rejected": 1.1055529117584229, + "eval_logps/chosen": -205.45755004882812, + "eval_logps/rejected": -178.61904907226562, + "eval_loss": 0.7026852369308472, + "eval_rewards/accuracies": 0.620312511920929, + "eval_rewards/chosen": 1.0051077604293823, + "eval_rewards/margins": 0.29959002137184143, + "eval_rewards/rejected": 0.7055177688598633, + "eval_runtime": 297.7987, + "eval_samples_per_second": 2.149, + "eval_steps_per_second": 0.134, + "step": 500 + }, + { + "epoch": 0.3973249409913454, + "grad_norm": 17.606443405151367, + "learning_rate": 1.5212676709990762e-05, + "logits/chosen": 0.12024303525686264, + "logits/rejected": -0.33552008867263794, + "logps/chosen": -205.59109497070312, + "logps/rejected": -181.02566528320312, + "loss": 0.6522, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9811790585517883, + "rewards/margins": 0.43993645906448364, + "rewards/rejected": 0.5412425994873047, + "step": 505 + }, + { + "epoch": 0.4012588512981904, + "grad_norm": 23.3114070892334, + "learning_rate": 1.509490846892649e-05, + "logits/chosen": 0.01656034216284752, + "logits/rejected": -0.5744299292564392, + "logps/chosen": -211.2788543701172, + "logps/rejected": -167.57276916503906, + "loss": 0.6138, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8017475008964539, + "rewards/margins": 0.5002428293228149, + "rewards/rejected": 0.3015046715736389, + "step": 510 + }, + { + "epoch": 0.4051927616050354, + "grad_norm": 14.10328197479248, + "learning_rate": 1.4976178002401408e-05, + "logits/chosen": -0.3282383978366852, + "logits/rejected": -0.48758015036582947, + "logps/chosen": -200.8679962158203, + "logps/rejected": -179.44241333007812, + "loss": 0.6479, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6766945719718933, + "rewards/margins": 0.3457737863063812, + "rewards/rejected": 0.33092084527015686, + "step": 515 + }, + { + "epoch": 0.4091266719118804, + "grad_norm": 26.593978881835938, + "learning_rate": 1.4856507733875837e-05, + "logits/chosen": -0.1160442978143692, + "logits/rejected": -0.4207191467285156, + "logps/chosen": -190.7376708984375, + "logps/rejected": -169.13816833496094, + "loss": 0.7379, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.7750043869018555, + "rewards/margins": 0.34026703238487244, + "rewards/rejected": 0.43473726511001587, + "step": 520 + }, + { + "epoch": 0.41306058221872544, + "grad_norm": 17.67402458190918, + "learning_rate": 1.4735920264301288e-05, + "logits/chosen": -0.17023354768753052, + "logits/rejected": -0.5197206735610962, + "logps/chosen": -207.9748077392578, + "logps/rejected": -182.002197265625, + "loss": 0.7135, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5062464475631714, + "rewards/margins": 0.19488921761512756, + "rewards/rejected": 0.31135720014572144, + "step": 525 + }, + { + "epoch": 0.4169944925255704, + "grad_norm": 16.364791870117188, + "learning_rate": 1.4614438367852056e-05, + "logits/chosen": -0.35339441895484924, + "logits/rejected": -0.6959262490272522, + "logps/chosen": -202.8052215576172, + "logps/rejected": -167.2289276123047, + "loss": 0.6573, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.89134281873703, + "rewards/margins": 0.39820951223373413, + "rewards/rejected": 0.4931332468986511, + "step": 530 + }, + { + "epoch": 0.4209284028324154, + "grad_norm": 19.59364891052246, + "learning_rate": 1.4492084987624071e-05, + "logits/chosen": -0.1122426763176918, + "logits/rejected": -0.44985610246658325, + "logps/chosen": -204.77981567382812, + "logps/rejected": -181.18716430664062, + "loss": 0.6709, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.9437880516052246, + "rewards/margins": 0.4574647545814514, + "rewards/rejected": 0.48632335662841797, + "step": 535 + }, + { + "epoch": 0.42486231313926043, + "grad_norm": 17.59402084350586, + "learning_rate": 1.4368883231301885e-05, + "logits/chosen": -0.17638197541236877, + "logits/rejected": -0.5632339715957642, + "logps/chosen": -201.26885986328125, + "logps/rejected": -170.08328247070312, + "loss": 0.6228, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.184136986732483, + "rewards/margins": 0.756480872631073, + "rewards/rejected": 0.42765602469444275, + "step": 540 + }, + { + "epoch": 0.4287962234461054, + "grad_norm": 27.206796646118164, + "learning_rate": 1.4244856366794517e-05, + "logits/chosen": -0.057549990713596344, + "logits/rejected": -0.4487794041633606, + "logps/chosen": -205.1177215576172, + "logps/rejected": -177.13014221191406, + "loss": 0.6294, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0669742822647095, + "rewards/margins": 0.5120534896850586, + "rewards/rejected": 0.5549208521842957, + "step": 545 + }, + { + "epoch": 0.43273013375295044, + "grad_norm": 16.399995803833008, + "learning_rate": 1.4120027817841098e-05, + "logits/chosen": -0.133390873670578, + "logits/rejected": -0.47696390748023987, + "logps/chosen": -214.5057373046875, + "logps/rejected": -193.0947265625, + "loss": 0.808, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.8214758038520813, + "rewards/margins": 0.04125159978866577, + "rewards/rejected": 0.7802242040634155, + "step": 550 + }, + { + "epoch": 0.4366640440597954, + "grad_norm": 18.979785919189453, + "learning_rate": 1.399442115958704e-05, + "logits/chosen": -0.569675862789154, + "logits/rejected": -0.8924716711044312, + "logps/chosen": -211.4713897705078, + "logps/rejected": -183.01220703125, + "loss": 0.6587, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8996235132217407, + "rewards/margins": 0.45010414719581604, + "rewards/rejected": 0.4495193362236023, + "step": 555 + }, + { + "epoch": 0.44059795436664045, + "grad_norm": 21.638757705688477, + "learning_rate": 1.3868060114131644e-05, + "logits/chosen": -0.22702725231647491, + "logits/rejected": -0.5234431028366089, + "logps/chosen": -210.87393188476562, + "logps/rejected": -195.6029052734375, + "loss": 0.738, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 1.0586285591125488, + "rewards/margins": 0.27768781781196594, + "rewards/rejected": 0.7809406518936157, + "step": 560 + }, + { + "epoch": 0.44453186467348543, + "grad_norm": 23.013927459716797, + "learning_rate": 1.3740968546047935e-05, + "logits/chosen": -0.17697608470916748, + "logits/rejected": -0.4483562409877777, + "logps/chosen": -211.2060089111328, + "logps/rejected": -197.86001586914062, + "loss": 0.7594, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.9211471676826477, + "rewards/margins": 0.0961461290717125, + "rewards/rejected": 0.825001060962677, + "step": 565 + }, + { + "epoch": 0.44846577498033047, + "grad_norm": 20.101484298706055, + "learning_rate": 1.3613170457875579e-05, + "logits/chosen": -0.22834663093090057, + "logits/rejected": -0.6228377223014832, + "logps/chosen": -207.5561065673828, + "logps/rejected": -182.3037567138672, + "loss": 0.6097, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 1.1122691631317139, + "rewards/margins": 0.5503975749015808, + "rewards/rejected": 0.5618715882301331, + "step": 570 + }, + { + "epoch": 0.45239968528717545, + "grad_norm": 26.358943939208984, + "learning_rate": 1.348468998558779e-05, + "logits/chosen": -0.13707995414733887, + "logits/rejected": -0.44805946946144104, + "logps/chosen": -220.7776641845703, + "logps/rejected": -201.1964874267578, + "loss": 0.713, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9859493374824524, + "rewards/margins": 0.3383699953556061, + "rewards/rejected": 0.6475793123245239, + "step": 575 + }, + { + "epoch": 0.4563335955940205, + "grad_norm": 16.33328628540039, + "learning_rate": 1.3355551394032968e-05, + "logits/chosen": -0.31562569737434387, + "logits/rejected": -0.6708458065986633, + "logps/chosen": -203.0553436279297, + "logps/rejected": -176.8132781982422, + "loss": 0.6889, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7495515942573547, + "rewards/margins": 0.3594974875450134, + "rewards/rejected": 0.39005404710769653, + "step": 580 + }, + { + "epoch": 0.46026750590086546, + "grad_norm": 29.162113189697266, + "learning_rate": 1.3225779072352066e-05, + "logits/chosen": -0.32384806871414185, + "logits/rejected": -0.6729586124420166, + "logps/chosen": -214.14102172851562, + "logps/rejected": -184.0008087158203, + "loss": 0.6698, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8371032476425171, + "rewards/margins": 0.3700554370880127, + "rewards/rejected": 0.4670478403568268, + "step": 585 + }, + { + "epoch": 0.46420141620771044, + "grad_norm": 25.16128921508789, + "learning_rate": 1.309539752937243e-05, + "logits/chosen": -0.256720632314682, + "logits/rejected": -0.4291699528694153, + "logps/chosen": -191.2805938720703, + "logps/rejected": -184.6292266845703, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6577237248420715, + "rewards/margins": 0.28180426359176636, + "rewards/rejected": 0.3759194016456604, + "step": 590 + }, + { + "epoch": 0.46813532651455547, + "grad_norm": 20.09102439880371, + "learning_rate": 1.2964431388979075e-05, + "logits/chosen": -0.3570843040943146, + "logits/rejected": -0.8670114278793335, + "logps/chosen": -203.76992797851562, + "logps/rejected": -163.80783081054688, + "loss": 0.6412, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7835728526115417, + "rewards/margins": 0.5176677703857422, + "rewards/rejected": 0.2659050524234772, + "step": 595 + }, + { + "epoch": 0.47206923682140045, + "grad_norm": 22.330236434936523, + "learning_rate": 1.2832905385464193e-05, + "logits/chosen": -0.3153493404388428, + "logits/rejected": -0.6954606771469116, + "logps/chosen": -199.0489501953125, + "logps/rejected": -172.42919921875, + "loss": 0.6764, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7396122217178345, + "rewards/margins": 0.3455941677093506, + "rewards/rejected": 0.39401811361312866, + "step": 600 + }, + { + "epoch": 0.47206923682140045, + "eval_logits/chosen": 1.3154770135879517, + "eval_logits/rejected": 1.0959367752075195, + "eval_logps/chosen": -205.95361328125, + "eval_logps/rejected": -179.14404296875, + "eval_loss": 0.688846230506897, + "eval_rewards/accuracies": 0.6234375238418579, + "eval_rewards/chosen": 0.8066827058792114, + "eval_rewards/margins": 0.3111591935157776, + "eval_rewards/rejected": 0.49552351236343384, + "eval_runtime": 282.0013, + "eval_samples_per_second": 2.269, + "eval_steps_per_second": 0.142, + "step": 600 + }, + { + "epoch": 0.4760031471282455, + "grad_norm": 13.301490783691406, + "learning_rate": 1.2700844358855853e-05, + "logits/chosen": -0.2941150367259979, + "logits/rejected": -0.7340162992477417, + "logps/chosen": -194.4886932373047, + "logps/rejected": -159.5877227783203, + "loss": 0.6895, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.846507728099823, + "rewards/margins": 0.3602963089942932, + "rewards/rejected": 0.48621147871017456, + "step": 605 + }, + { + "epoch": 0.47993705743509046, + "grad_norm": 19.667444229125977, + "learning_rate": 1.2568273250226681e-05, + "logits/chosen": -0.2455168217420578, + "logits/rejected": -0.608180820941925, + "logps/chosen": -225.4668426513672, + "logps/rejected": -192.55905151367188, + "loss": 0.6672, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 1.017348289489746, + "rewards/margins": 0.38524192571640015, + "rewards/rejected": 0.632106363773346, + "step": 610 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 24.933828353881836, + "learning_rate": 1.243521709698351e-05, + "logits/chosen": -0.28044039011001587, + "logits/rejected": -0.5124521255493164, + "logps/chosen": -199.1013641357422, + "logps/rejected": -195.05728149414062, + "loss": 0.6967, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.9729631543159485, + "rewards/margins": 0.31783193349838257, + "rewards/rejected": 0.6551311016082764, + "step": 615 + }, + { + "epoch": 0.4878048780487805, + "grad_norm": 21.9912109375, + "learning_rate": 1.230170102813879e-05, + "logits/chosen": -0.6046349406242371, + "logits/rejected": -0.8912727236747742, + "logps/chosen": -193.95303344726562, + "logps/rejected": -169.863037109375, + "loss": 0.6994, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.9892646670341492, + "rewards/margins": 0.35485339164733887, + "rewards/rejected": 0.6344112753868103, + "step": 620 + }, + { + "epoch": 0.4917387883556255, + "grad_norm": 14.393425941467285, + "learning_rate": 1.2167750259564733e-05, + "logits/chosen": -0.21057292819023132, + "logits/rejected": -0.6453763246536255, + "logps/chosen": -197.05722045898438, + "logps/rejected": -194.5146942138672, + "loss": 0.6655, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8438342809677124, + "rewards/margins": 0.3562160134315491, + "rewards/rejected": 0.4876182973384857, + "step": 625 + }, + { + "epoch": 0.4956726986624705, + "grad_norm": 27.751855850219727, + "learning_rate": 1.203339008923103e-05, + "logits/chosen": -0.08632899820804596, + "logits/rejected": -0.5858111381530762, + "logps/chosen": -210.37890625, + "logps/rejected": -181.04751586914062, + "loss": 0.7106, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.0096272230148315, + "rewards/margins": 0.4222971796989441, + "rewards/rejected": 0.587330162525177, + "step": 630 + }, + { + "epoch": 0.4996066089693155, + "grad_norm": 21.017240524291992, + "learning_rate": 1.1898645892427064e-05, + "logits/chosen": -0.48605161905288696, + "logits/rejected": -0.6945669651031494, + "logps/chosen": -182.28805541992188, + "logps/rejected": -169.93661499023438, + "loss": 0.7755, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.5226560831069946, + "rewards/margins": 0.05550839379429817, + "rewards/rejected": 0.46714773774147034, + "step": 635 + }, + { + "epoch": 0.5035405192761605, + "grad_norm": 20.2221622467041, + "learning_rate": 1.1763543116969549e-05, + "logits/chosen": -0.10474424064159393, + "logits/rejected": -0.5913185477256775, + "logps/chosen": -209.303466796875, + "logps/rejected": -173.1480255126953, + "loss": 0.6692, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6786335706710815, + "rewards/margins": 0.3942939341068268, + "rewards/rejected": 0.28433966636657715, + "step": 640 + }, + { + "epoch": 0.5074744295830055, + "grad_norm": 15.26221752166748, + "learning_rate": 1.1628107278396432e-05, + "logits/chosen": -0.06124790757894516, + "logits/rejected": -0.3360343873500824, + "logps/chosen": -202.93270874023438, + "logps/rejected": -184.75259399414062, + "loss": 0.6547, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.39857378602027893, + "rewards/margins": 0.2742787301540375, + "rewards/rejected": 0.12429501861333847, + "step": 645 + }, + { + "epoch": 0.5114083398898505, + "grad_norm": 18.45632553100586, + "learning_rate": 1.1492363955148023e-05, + "logits/chosen": -0.1759663075208664, + "logits/rejected": -0.6530739665031433, + "logps/chosen": -218.36123657226562, + "logps/rejected": -199.7471160888672, + "loss": 0.653, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5292393565177917, + "rewards/margins": 0.3620988726615906, + "rewards/rejected": 0.16714049875736237, + "step": 650 + }, + { + "epoch": 0.5153422501966956, + "grad_norm": 16.891386032104492, + "learning_rate": 1.1356338783736256e-05, + "logits/chosen": -0.4392605721950531, + "logits/rejected": -0.7525895237922668, + "logps/chosen": -194.24301147460938, + "logps/rejected": -182.4429473876953, + "loss": 0.6259, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4986444115638733, + "rewards/margins": 0.49716418981552124, + "rewards/rejected": 0.0014802322257310152, + "step": 655 + }, + { + "epoch": 0.5192761605035405, + "grad_norm": 18.568416595458984, + "learning_rate": 1.1220057453902973e-05, + "logits/chosen": -0.2285362035036087, + "logits/rejected": -0.6583995223045349, + "logps/chosen": -219.6389617919922, + "logps/rejected": -176.62965393066406, + "loss": 0.6604, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6904179453849792, + "rewards/margins": 0.3659079670906067, + "rewards/rejected": 0.32451000809669495, + "step": 660 + }, + { + "epoch": 0.5232100708103855, + "grad_norm": 16.81451416015625, + "learning_rate": 1.1083545703768137e-05, + "logits/chosen": -0.3168891370296478, + "logits/rejected": -0.5861741304397583, + "logps/chosen": -198.4099578857422, + "logps/rejected": -181.83871459960938, + "loss": 0.736, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.6643240451812744, + "rewards/margins": 0.17423763871192932, + "rewards/rejected": 0.4900864064693451, + "step": 665 + }, + { + "epoch": 0.5271439811172305, + "grad_norm": 20.030567169189453, + "learning_rate": 1.0946829314968936e-05, + "logits/chosen": -0.22313520312309265, + "logits/rejected": -0.6608983874320984, + "logps/chosen": -206.3205108642578, + "logps/rejected": -178.14974975585938, + "loss": 0.6314, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.784034252166748, + "rewards/margins": 0.45540714263916016, + "rewards/rejected": 0.3286270797252655, + "step": 670 + }, + { + "epoch": 0.5310778914240756, + "grad_norm": 12.727190017700195, + "learning_rate": 1.0809934107790675e-05, + "logits/chosen": -0.1376127302646637, + "logits/rejected": -0.5582663416862488, + "logps/chosen": -207.121337890625, + "logps/rejected": -189.23037719726562, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1340868473052979, + "rewards/margins": 0.6862513422966003, + "rewards/rejected": 0.4478355050086975, + "step": 675 + }, + { + "epoch": 0.5350118017309206, + "grad_norm": 15.704160690307617, + "learning_rate": 1.0672885936290316e-05, + "logits/chosen": -0.11958789825439453, + "logits/rejected": -0.41796404123306274, + "logps/chosen": -200.3405303955078, + "logps/rejected": -185.74917602539062, + "loss": 0.7025, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 1.0963573455810547, + "rewards/margins": 0.3328610956668854, + "rewards/rejected": 0.7634962797164917, + "step": 680 + }, + { + "epoch": 0.5389457120377655, + "grad_norm": 16.583145141601562, + "learning_rate": 1.05357106834137e-05, + "logits/chosen": -0.035154812037944794, + "logits/rejected": -0.6018010377883911, + "logps/chosen": -214.5799102783203, + "logps/rejected": -181.4016571044922, + "loss": 0.6904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9387739300727844, + "rewards/margins": 0.34907636046409607, + "rewards/rejected": 0.5896975994110107, + "step": 685 + }, + { + "epoch": 0.5428796223446105, + "grad_norm": 15.397040367126465, + "learning_rate": 1.0398434256107291e-05, + "logits/chosen": -0.3040166199207306, + "logits/rejected": -0.6104984283447266, + "logps/chosen": -190.73818969726562, + "logps/rejected": -172.9613037109375, + "loss": 0.6723, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.8516994714736938, + "rewards/margins": 0.3647121787071228, + "rewards/rejected": 0.48698729276657104, + "step": 690 + }, + { + "epoch": 0.5468135326514555, + "grad_norm": 17.214340209960938, + "learning_rate": 1.0261082580425366e-05, + "logits/chosen": -0.25491005182266235, + "logits/rejected": -0.7748223543167114, + "logps/chosen": -205.028564453125, + "logps/rejected": -169.1365966796875, + "loss": 0.6359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8545015454292297, + "rewards/margins": 0.4432094693183899, + "rewards/rejected": 0.41129201650619507, + "step": 695 + }, + { + "epoch": 0.5507474429583006, + "grad_norm": 18.72207260131836, + "learning_rate": 1.012368159663363e-05, + "logits/chosen": -0.43465644121170044, + "logits/rejected": -0.6075267195701599, + "logps/chosen": -198.85336303710938, + "logps/rejected": -185.84034729003906, + "loss": 0.6205, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.8257676959037781, + "rewards/margins": 0.442889541387558, + "rewards/rejected": 0.38287803530693054, + "step": 700 + }, + { + "epoch": 0.5507474429583006, + "eval_logits/chosen": 1.289400339126587, + "eval_logits/rejected": 1.06741201877594, + "eval_logps/chosen": -206.27685546875, + "eval_logps/rejected": -179.56541442871094, + "eval_loss": 0.6758726835250854, + "eval_rewards/accuracies": 0.6343749761581421, + "eval_rewards/chosen": 0.6773768067359924, + "eval_rewards/margins": 0.3504090905189514, + "eval_rewards/rejected": 0.32696765661239624, + "eval_runtime": 264.1292, + "eval_samples_per_second": 2.423, + "eval_steps_per_second": 0.151, + "step": 700 + }, + { + "epoch": 0.5546813532651456, + "grad_norm": 20.8519344329834, + "learning_rate": 9.98625725431013e-06, + "logits/chosen": -0.020856428891420364, + "logits/rejected": -0.20043806731700897, + "logps/chosen": -193.96920776367188, + "logps/rejected": -172.1241912841797, + "loss": 0.7039, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5150532722473145, + "rewards/margins": 0.1648593544960022, + "rewards/rejected": 0.35019388794898987, + "step": 705 + }, + { + "epoch": 0.5586152635719905, + "grad_norm": 18.23834800720215, + "learning_rate": 9.848835507444405e-06, + "logits/chosen": -0.17138266563415527, + "logits/rejected": -0.5400444269180298, + "logps/chosen": -213.20947265625, + "logps/rejected": -179.41683959960938, + "loss": 0.5993, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.878060519695282, + "rewards/margins": 0.5326789617538452, + "rewards/rejected": 0.34538155794143677, + "step": 710 + }, + { + "epoch": 0.5625491738788355, + "grad_norm": 17.19778060913086, + "learning_rate": 9.71144230953582e-06, + "logits/chosen": -0.15033751726150513, + "logits/rejected": -0.6573851108551025, + "logps/chosen": -209.91763305664062, + "logps/rejected": -173.20547485351562, + "loss": 0.637, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7313550710678101, + "rewards/margins": 0.45394793152809143, + "rewards/rejected": 0.27740710973739624, + "step": 715 + }, + { + "epoch": 0.5664830841856806, + "grad_norm": 17.859058380126953, + "learning_rate": 9.574103608691974e-06, + "logits/chosen": -0.1018882766366005, + "logits/rejected": -0.3827294111251831, + "logps/chosen": -217.5899658203125, + "logps/rejected": -190.86546325683594, + "loss": 0.7034, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7803667187690735, + "rewards/margins": 0.14793583750724792, + "rewards/rejected": 0.632430911064148, + "step": 720 + }, + { + "epoch": 0.5704169944925256, + "grad_norm": 17.891475677490234, + "learning_rate": 9.436845342728142e-06, + "logits/chosen": -0.23665161430835724, + "logits/rejected": -0.6916168928146362, + "logps/chosen": -198.93873596191406, + "logps/rejected": -166.03292846679688, + "loss": 0.6421, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.801128089427948, + "rewards/margins": 0.4237571656703949, + "rewards/rejected": 0.3773708939552307, + "step": 725 + }, + { + "epoch": 0.5743509047993706, + "grad_norm": 17.744354248046875, + "learning_rate": 9.299693434268653e-06, + "logits/chosen": -0.01328353863209486, + "logits/rejected": -0.2819923758506775, + "logps/chosen": -207.9522705078125, + "logps/rejected": -188.49993896484375, + "loss": 0.6878, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.904397189617157, + "rewards/margins": 0.36915481090545654, + "rewards/rejected": 0.5352423787117004, + "step": 730 + }, + { + "epoch": 0.5782848151062155, + "grad_norm": 18.68268394470215, + "learning_rate": 9.162673785851131e-06, + "logits/chosen": -0.39516356587409973, + "logits/rejected": -0.7670010328292847, + "logps/chosen": -204.0966796875, + "logps/rejected": -170.11227416992188, + "loss": 0.6341, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.8424245715141296, + "rewards/margins": 0.40797433257102966, + "rewards/rejected": 0.43445029854774475, + "step": 735 + }, + { + "epoch": 0.5822187254130606, + "grad_norm": 14.530721664428711, + "learning_rate": 9.025812275034541e-06, + "logits/chosen": -0.14751622080802917, + "logits/rejected": -0.5135005116462708, + "logps/chosen": -225.6256866455078, + "logps/rejected": -200.2797393798828, + "loss": 0.621, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 1.029211401939392, + "rewards/margins": 0.5424867868423462, + "rewards/rejected": 0.48672476410865784, + "step": 740 + }, + { + "epoch": 0.5861526357199056, + "grad_norm": 18.743927001953125, + "learning_rate": 8.889134749511956e-06, + "logits/chosen": -0.11462094634771347, + "logits/rejected": -0.38805294036865234, + "logps/chosen": -207.6776123046875, + "logps/rejected": -181.88101196289062, + "loss": 0.7368, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.7982211709022522, + "rewards/margins": 0.21776151657104492, + "rewards/rejected": 0.580459713935852, + "step": 745 + }, + { + "epoch": 0.5900865460267506, + "grad_norm": 14.667529106140137, + "learning_rate": 8.752667022228936e-06, + "logits/chosen": -0.022926175966858864, + "logits/rejected": -0.4718795418739319, + "logps/chosen": -216.82284545898438, + "logps/rejected": -186.5943603515625, + "loss": 0.622, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.8725186586380005, + "rewards/margins": 0.6078484058380127, + "rewards/rejected": 0.264670193195343, + "step": 750 + }, + { + "epoch": 0.5940204563335956, + "grad_norm": 20.248031616210938, + "learning_rate": 8.616434866508519e-06, + "logits/chosen": -0.15943610668182373, + "logits/rejected": -0.6148089170455933, + "logps/chosen": -209.1900177001953, + "logps/rejected": -184.60047912597656, + "loss": 0.6446, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.7891548871994019, + "rewards/margins": 0.48758840560913086, + "rewards/rejected": 0.30156660079956055, + "step": 755 + }, + { + "epoch": 0.5979543666404405, + "grad_norm": 16.850963592529297, + "learning_rate": 8.480464011183631e-06, + "logits/chosen": -0.2673138678073883, + "logits/rejected": -0.6848293542861938, + "logps/chosen": -201.9542999267578, + "logps/rejected": -168.80638122558594, + "loss": 0.6669, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6175512671470642, + "rewards/margins": 0.30348506569862366, + "rewards/rejected": 0.31406617164611816, + "step": 760 + }, + { + "epoch": 0.6018882769472856, + "grad_norm": 18.8007755279541, + "learning_rate": 8.344780135737962e-06, + "logits/chosen": -0.31253287196159363, + "logits/rejected": -0.8586766123771667, + "logps/chosen": -212.3469696044922, + "logps/rejected": -163.8748321533203, + "loss": 0.6595, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.8451669812202454, + "rewards/margins": 0.5855604410171509, + "rewards/rejected": 0.2596065402030945, + "step": 765 + }, + { + "epoch": 0.6058221872541306, + "grad_norm": 13.551706314086914, + "learning_rate": 8.209408865456127e-06, + "logits/chosen": -0.13036459684371948, + "logits/rejected": -0.4954930245876312, + "logps/chosen": -213.2278289794922, + "logps/rejected": -188.24514770507812, + "loss": 0.662, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8455514907836914, + "rewards/margins": 0.34862059354782104, + "rewards/rejected": 0.49693092703819275, + "step": 770 + }, + { + "epoch": 0.6097560975609756, + "grad_norm": 17.73063087463379, + "learning_rate": 8.074375766584053e-06, + "logits/chosen": 0.0039010108448565006, + "logits/rejected": -0.5214850306510925, + "logps/chosen": -213.3166046142578, + "logps/rejected": -174.0699005126953, + "loss": 0.717, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.6939308643341064, + "rewards/margins": 0.32082659006118774, + "rewards/rejected": 0.3731042742729187, + "step": 775 + }, + { + "epoch": 0.6136900078678206, + "grad_norm": 14.620991706848145, + "learning_rate": 7.939706341500555e-06, + "logits/chosen": -0.04872986674308777, + "logits/rejected": -0.4084659516811371, + "logps/chosen": -194.51834106445312, + "logps/rejected": -185.00225830078125, + "loss": 0.5966, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.8033088445663452, + "rewards/margins": 0.5693622827529907, + "rewards/rejected": 0.23394668102264404, + "step": 780 + }, + { + "epoch": 0.6176239181746657, + "grad_norm": 13.0098876953125, + "learning_rate": 7.805426023900938e-06, + "logits/chosen": -0.4255433976650238, + "logits/rejected": -0.7939322590827942, + "logps/chosen": -190.10177612304688, + "logps/rejected": -162.91436767578125, + "loss": 0.7034, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6892917156219482, + "rewards/margins": 0.3028218150138855, + "rewards/rejected": 0.38646987080574036, + "step": 785 + }, + { + "epoch": 0.6215578284815106, + "grad_norm": 22.03873634338379, + "learning_rate": 7.671560173993588e-06, + "logits/chosen": -0.08852169662714005, + "logits/rejected": -0.4719138741493225, + "logps/chosen": -199.76376342773438, + "logps/rejected": -182.2493896484375, + "loss": 0.6744, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7909868359565735, + "rewards/margins": 0.3397650420665741, + "rewards/rejected": 0.4512217938899994, + "step": 790 + }, + { + "epoch": 0.6254917387883556, + "grad_norm": 18.647151947021484, + "learning_rate": 7.538134073710437e-06, + "logits/chosen": -0.38996896147727966, + "logits/rejected": -0.6869844198226929, + "logps/chosen": -198.90866088867188, + "logps/rejected": -178.61019897460938, + "loss": 0.7028, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7868278622627258, + "rewards/margins": 0.44276612997055054, + "rewards/rejected": 0.3440617322921753, + "step": 795 + }, + { + "epoch": 0.6294256490952006, + "grad_norm": 17.837268829345703, + "learning_rate": 7.405172921932214e-06, + "logits/chosen": -0.09680289775133133, + "logits/rejected": -0.4570208191871643, + "logps/chosen": -196.43899536132812, + "logps/rejected": -173.35025024414062, + "loss": 0.6309, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7571867108345032, + "rewards/margins": 0.43233370780944824, + "rewards/rejected": 0.32485300302505493, + "step": 800 + }, + { + "epoch": 0.6294256490952006, + "eval_logits/chosen": 1.2894115447998047, + "eval_logits/rejected": 1.0707098245620728, + "eval_logps/chosen": -206.11080932617188, + "eval_logps/rejected": -179.48574829101562, + "eval_loss": 0.6793522834777832, + "eval_rewards/accuracies": 0.6265624761581421, + "eval_rewards/chosen": 0.7437959313392639, + "eval_rewards/margins": 0.384955495595932, + "eval_rewards/rejected": 0.3588404655456543, + "eval_runtime": 298.0621, + "eval_samples_per_second": 2.147, + "eval_steps_per_second": 0.134, + "step": 800 + }, + { + "epoch": 0.6333595594020456, + "grad_norm": 23.481149673461914, + "learning_rate": 7.272701829729378e-06, + "logits/chosen": -0.09348127245903015, + "logits/rejected": -0.39429792761802673, + "logps/chosen": -222.31369018554688, + "logps/rejected": -189.89024353027344, + "loss": 0.7434, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.7474627494812012, + "rewards/margins": 0.24622318148612976, + "rewards/rejected": 0.5012395977973938, + "step": 805 + }, + { + "epoch": 0.6372934697088907, + "grad_norm": 18.71939468383789, + "learning_rate": 7.140745815619632e-06, + "logits/chosen": -0.09522039443254471, + "logits/rejected": -0.4288865923881531, + "logps/chosen": -198.81405639648438, + "logps/rejected": -192.83120727539062, + "loss": 0.6662, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.610317587852478, + "rewards/margins": 0.3116861879825592, + "rewards/rejected": 0.2986314296722412, + "step": 810 + }, + { + "epoch": 0.6412273800157356, + "grad_norm": 18.34478759765625, + "learning_rate": 7.009329800842929e-06, + "logits/chosen": 0.017814218997955322, + "logits/rejected": -0.3244866132736206, + "logps/chosen": -229.75381469726562, + "logps/rejected": -199.60000610351562, + "loss": 0.7092, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.5720285177230835, + "rewards/margins": 0.19818969070911407, + "rewards/rejected": 0.3738388121128082, + "step": 815 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 16.03777313232422, + "learning_rate": 6.878478604654835e-06, + "logits/chosen": -0.284344345331192, + "logits/rejected": -0.6540359258651733, + "logps/chosen": -195.71812438964844, + "logps/rejected": -176.70550537109375, + "loss": 0.5904, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6928594708442688, + "rewards/margins": 0.6011512875556946, + "rewards/rejected": 0.09170810133218765, + "step": 820 + }, + { + "epoch": 0.6490952006294256, + "grad_norm": 22.05975914001465, + "learning_rate": 6.748216939639158e-06, + "logits/chosen": 0.07760115712881088, + "logits/rejected": -0.4913705885410309, + "logps/chosen": -190.44102478027344, + "logps/rejected": -163.40457153320312, + "loss": 0.6636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5673459768295288, + "rewards/margins": 0.46832141280174255, + "rewards/rejected": 0.09902457147836685, + "step": 825 + }, + { + "epoch": 0.6530291109362707, + "grad_norm": 19.04427146911621, + "learning_rate": 6.618569407040736e-06, + "logits/chosen": -0.2564006745815277, + "logits/rejected": -0.621497392654419, + "logps/chosen": -198.78524780273438, + "logps/rejected": -172.7997283935547, + "loss": 0.6624, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.6063997149467468, + "rewards/margins": 0.4061097204685211, + "rewards/rejected": 0.2002900391817093, + "step": 830 + }, + { + "epoch": 0.6569630212431157, + "grad_norm": 13.502724647521973, + "learning_rate": 6.489560492119225e-06, + "logits/chosen": 0.06354556977748871, + "logits/rejected": -0.4314854145050049, + "logps/chosen": -215.6816864013672, + "logps/rejected": -183.03579711914062, + "loss": 0.6743, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.43139171600341797, + "rewards/margins": 0.3207935392856598, + "rewards/rejected": 0.11059819161891937, + "step": 835 + }, + { + "epoch": 0.6608969315499607, + "grad_norm": 15.181354522705078, + "learning_rate": 6.361214559524817e-06, + "logits/chosen": -0.3440548777580261, + "logits/rejected": -0.6467902660369873, + "logps/chosen": -194.0684814453125, + "logps/rejected": -180.21780395507812, + "loss": 0.615, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5782068967819214, + "rewards/margins": 0.4565269947052002, + "rewards/rejected": 0.12167992442846298, + "step": 840 + }, + { + "epoch": 0.6648308418568056, + "grad_norm": 79.10075378417969, + "learning_rate": 6.233555848696724e-06, + "logits/chosen": -0.293182373046875, + "logits/rejected": -0.5915425419807434, + "logps/chosen": -208.3809356689453, + "logps/rejected": -191.13064575195312, + "loss": 0.7247, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.5359665155410767, + "rewards/margins": 0.28759217262268066, + "rewards/rejected": 0.24837426841259003, + "step": 845 + }, + { + "epoch": 0.6687647521636507, + "grad_norm": 18.02682113647461, + "learning_rate": 6.1066084692853224e-06, + "logits/chosen": -0.03417937830090523, + "logits/rejected": -0.43492475152015686, + "logps/chosen": -212.67398071289062, + "logps/rejected": -183.54196166992188, + "loss": 0.6832, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3686201870441437, + "rewards/margins": 0.2786737084388733, + "rewards/rejected": 0.08994650840759277, + "step": 850 + }, + { + "epoch": 0.6726986624704957, + "grad_norm": 17.677215576171875, + "learning_rate": 5.980396396598777e-06, + "logits/chosen": -0.2180563509464264, + "logits/rejected": -0.3799629211425781, + "logps/chosen": -192.2188720703125, + "logps/rejected": -187.93289184570312, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4506203234195709, + "rewards/margins": 0.31998997926712036, + "rewards/rejected": 0.13063031435012817, + "step": 855 + }, + { + "epoch": 0.6766325727773407, + "grad_norm": 13.698114395141602, + "learning_rate": 5.854943467075087e-06, + "logits/chosen": -0.22957925498485565, + "logits/rejected": -0.5203697085380554, + "logps/chosen": -198.90037536621094, + "logps/rejected": -180.50279235839844, + "loss": 0.6282, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4433286786079407, + "rewards/margins": 0.4702211916446686, + "rewards/rejected": -0.026892513036727905, + "step": 860 + }, + { + "epoch": 0.6805664830841857, + "grad_norm": 16.75077247619629, + "learning_rate": 5.730273373780309e-06, + "logits/chosen": -0.3643267750740051, + "logits/rejected": -0.7527881860733032, + "logps/chosen": -193.90756225585938, + "logps/rejected": -173.71755981445312, + "loss": 0.6902, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.46958428621292114, + "rewards/margins": 0.43391847610473633, + "rewards/rejected": 0.03566574305295944, + "step": 865 + }, + { + "epoch": 0.6845003933910306, + "grad_norm": 21.622961044311523, + "learning_rate": 5.606409661933889e-06, + "logits/chosen": -0.023716717958450317, + "logits/rejected": -0.3822089731693268, + "logps/chosen": -221.4508056640625, + "logps/rejected": -188.75930786132812, + "loss": 0.7406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6389329433441162, + "rewards/margins": 0.3061096668243408, + "rewards/rejected": 0.332823246717453, + "step": 870 + }, + { + "epoch": 0.6884343036978757, + "grad_norm": 19.141998291015625, + "learning_rate": 5.483375724461918e-06, + "logits/chosen": -0.36916786432266235, + "logits/rejected": -0.8393670320510864, + "logps/chosen": -201.64920043945312, + "logps/rejected": -163.6253662109375, + "loss": 0.6788, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.36916905641555786, + "rewards/margins": 0.3855450749397278, + "rewards/rejected": -0.016376061365008354, + "step": 875 + }, + { + "epoch": 0.6923682140047207, + "grad_norm": 20.840383529663086, + "learning_rate": 5.361194797579108e-06, + "logits/chosen": -0.27600985765457153, + "logits/rejected": -0.7273412346839905, + "logps/chosen": -213.323486328125, + "logps/rejected": -172.2437286376953, + "loss": 0.7035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7029854655265808, + "rewards/margins": 0.3768990635871887, + "rewards/rejected": 0.3260864317417145, + "step": 880 + }, + { + "epoch": 0.6963021243115657, + "grad_norm": 20.24435806274414, + "learning_rate": 5.239889956400435e-06, + "logits/chosen": 0.13340488076210022, + "logits/rejected": -0.46101540327072144, + "logps/chosen": -217.2809295654297, + "logps/rejected": -176.47802734375, + "loss": 0.6408, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5957245230674744, + "rewards/margins": 0.416795551776886, + "rewards/rejected": 0.17892900109291077, + "step": 885 + }, + { + "epoch": 0.7002360346184107, + "grad_norm": 18.37978172302246, + "learning_rate": 5.119484110583135e-06, + "logits/chosen": -0.4709344506263733, + "logits/rejected": -0.7668399810791016, + "logps/chosen": -200.41390991210938, + "logps/rejected": -169.01779174804688, + "loss": 0.6936, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.39610370993614197, + "rewards/margins": 0.3159303665161133, + "rewards/rejected": 0.08017335832118988, + "step": 890 + }, + { + "epoch": 0.7041699449252558, + "grad_norm": 14.384517669677734, + "learning_rate": 5.000000000000003e-06, + "logits/chosen": -0.2237352579832077, + "logits/rejected": -0.7978562116622925, + "logps/chosen": -200.4236297607422, + "logps/rejected": -168.09664916992188, + "loss": 0.6054, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5839098691940308, + "rewards/margins": 0.5763157606124878, + "rewards/rejected": 0.007594155613332987, + "step": 895 + }, + { + "epoch": 0.7081038552321007, + "grad_norm": 23.844955444335938, + "learning_rate": 4.881460190444726e-06, + "logits/chosen": -0.57319176197052, + "logits/rejected": -0.7391110062599182, + "logps/chosen": -205.91015625, + "logps/rejected": -186.86459350585938, + "loss": 0.6875, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5059628486633301, + "rewards/margins": 0.3120475113391876, + "rewards/rejected": 0.19391539692878723, + "step": 900 + }, + { + "epoch": 0.7081038552321007, + "eval_logits/chosen": 1.2854810953140259, + "eval_logits/rejected": 1.0660665035247803, + "eval_logps/chosen": -206.6718292236328, + "eval_logps/rejected": -179.9932861328125, + "eval_loss": 0.6678879857063293, + "eval_rewards/accuracies": 0.6265624761581421, + "eval_rewards/chosen": 0.519389808177948, + "eval_rewards/margins": 0.3635701537132263, + "eval_rewards/rejected": 0.15581969916820526, + "eval_runtime": 296.5851, + "eval_samples_per_second": 2.158, + "eval_steps_per_second": 0.135, + "step": 900 + }, + { + "epoch": 0.7120377655389457, + "grad_norm": 14.835896492004395, + "learning_rate": 4.763887069370107e-06, + "logits/chosen": -0.1812276542186737, + "logits/rejected": -0.5340962409973145, + "logps/chosen": -184.94485473632812, + "logps/rejected": -169.592041015625, + "loss": 0.6794, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.4734250605106354, + "rewards/margins": 0.3993573486804962, + "rewards/rejected": 0.07406774908304214, + "step": 905 + }, + { + "epoch": 0.7159716758457907, + "grad_norm": 24.3856143951416, + "learning_rate": 4.64730284165996e-06, + "logits/chosen": -0.04929916188120842, + "logits/rejected": -0.5009157061576843, + "logps/chosen": -225.0531768798828, + "logps/rejected": -193.8749237060547, + "loss": 0.6584, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7070298194885254, + "rewards/margins": 0.4039214551448822, + "rewards/rejected": 0.3031083941459656, + "step": 910 + }, + { + "epoch": 0.7199055861526357, + "grad_norm": 22.8303279876709, + "learning_rate": 4.531729525435501e-06, + "logits/chosen": 0.0025139451026916504, + "logits/rejected": -0.6012422442436218, + "logps/chosen": -205.25, + "logps/rejected": -166.71438598632812, + "loss": 0.639, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.6045829653739929, + "rewards/margins": 0.42883044481277466, + "rewards/rejected": 0.17575259506702423, + "step": 915 + }, + { + "epoch": 0.7238394964594808, + "grad_norm": 14.778836250305176, + "learning_rate": 4.417188947896983e-06, + "logits/chosen": -0.30647343397140503, + "logits/rejected": -0.6068025827407837, + "logps/chosen": -185.31884765625, + "logps/rejected": -171.61390686035156, + "loss": 0.6358, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5817626118659973, + "rewards/margins": 0.4069378972053528, + "rewards/rejected": 0.17482469975948334, + "step": 920 + }, + { + "epoch": 0.7277734067663257, + "grad_norm": 14.139073371887207, + "learning_rate": 4.303702741201431e-06, + "logits/chosen": -0.5711551904678345, + "logits/rejected": -0.8691667318344116, + "logps/chosen": -192.8331298828125, + "logps/rejected": -175.0562286376953, + "loss": 0.6808, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.38311484456062317, + "rewards/margins": 0.3181079924106598, + "rewards/rejected": 0.0650068148970604, + "step": 925 + }, + { + "epoch": 0.7317073170731707, + "grad_norm": 15.895308494567871, + "learning_rate": 4.1912923383771685e-06, + "logits/chosen": -0.36842986941337585, + "logits/rejected": -0.7152490019798279, + "logps/chosen": -211.0810089111328, + "logps/rejected": -196.27755737304688, + "loss": 0.6735, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5359424948692322, + "rewards/margins": 0.3752870559692383, + "rewards/rejected": 0.16065548360347748, + "step": 930 + }, + { + "epoch": 0.7356412273800157, + "grad_norm": 14.658058166503906, + "learning_rate": 4.079978969275984e-06, + "logits/chosen": -0.5706170797348022, + "logits/rejected": -0.852310299873352, + "logps/chosen": -176.20578002929688, + "logps/rejected": -158.9827423095703, + "loss": 0.7049, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.29048237204551697, + "rewards/margins": 0.3135663866996765, + "rewards/rejected": -0.023084009066224098, + "step": 935 + }, + { + "epoch": 0.7395751376868608, + "grad_norm": 14.667938232421875, + "learning_rate": 3.9697836565636484e-06, + "logits/chosen": -0.0873163565993309, + "logits/rejected": -0.4978067874908447, + "logps/chosen": -219.19210815429688, + "logps/rejected": -186.28640747070312, + "loss": 0.6177, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.48243242502212524, + "rewards/margins": 0.47774791717529297, + "rewards/rejected": 0.004684485495090485, + "step": 940 + }, + { + "epoch": 0.7435090479937058, + "grad_norm": 20.62685775756836, + "learning_rate": 3.860727211749572e-06, + "logits/chosen": -0.3459232449531555, + "logits/rejected": -0.6185725927352905, + "logps/chosen": -204.01295471191406, + "logps/rejected": -179.14883422851562, + "loss": 0.6996, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7561392188072205, + "rewards/margins": 0.43435636162757874, + "rewards/rejected": 0.3217828571796417, + "step": 945 + }, + { + "epoch": 0.7474429583005507, + "grad_norm": 17.138633728027344, + "learning_rate": 3.7528302312563447e-06, + "logits/chosen": -0.21280460059642792, + "logits/rejected": -0.6648741960525513, + "logps/chosen": -207.45266723632812, + "logps/rejected": -169.14617919921875, + "loss": 0.7068, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.38267606496810913, + "rewards/margins": 0.17945952713489532, + "rewards/rejected": 0.2032165229320526, + "step": 950 + }, + { + "epoch": 0.7513768686073957, + "grad_norm": 58.220947265625, + "learning_rate": 3.646113092529878e-06, + "logits/chosen": -0.21766535937786102, + "logits/rejected": -0.6996904611587524, + "logps/chosen": -225.0487060546875, + "logps/rejected": -184.19442749023438, + "loss": 0.7056, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6735895872116089, + "rewards/margins": 0.4389261305332184, + "rewards/rejected": 0.2346634566783905, + "step": 955 + }, + { + "epoch": 0.7553107789142408, + "grad_norm": 20.379343032836914, + "learning_rate": 3.5405959501909313e-06, + "logits/chosen": -0.18848784267902374, + "logits/rejected": -0.5305780172348022, + "logps/chosen": -212.13162231445312, + "logps/rejected": -186.52542114257812, + "loss": 0.6872, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.3702337145805359, + "rewards/margins": 0.29016590118408203, + "rewards/rejected": 0.08006780594587326, + "step": 960 + }, + { + "epoch": 0.7592446892210858, + "grad_norm": 17.178056716918945, + "learning_rate": 3.436298732228699e-06, + "logits/chosen": -0.21896116435527802, + "logits/rejected": -0.6624099612236023, + "logps/chosen": -205.2207794189453, + "logps/rejected": -170.05699157714844, + "loss": 0.6446, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.4240780472755432, + "rewards/margins": 0.37573105096817017, + "rewards/rejected": 0.04834695905447006, + "step": 965 + }, + { + "epoch": 0.7631785995279308, + "grad_norm": 19.06415557861328, + "learning_rate": 3.3332411362372063e-06, + "logits/chosen": -0.15206289291381836, + "logits/rejected": -0.4406839907169342, + "logps/chosen": -186.83627319335938, + "logps/rejected": -164.04739379882812, + "loss": 0.6972, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.5690515041351318, + "rewards/margins": 0.338064044713974, + "rewards/rejected": 0.23098750412464142, + "step": 970 + }, + { + "epoch": 0.7671125098347757, + "grad_norm": 19.997249603271484, + "learning_rate": 3.231442625695217e-06, + "logits/chosen": -0.4492325186729431, + "logits/rejected": -0.6821542978286743, + "logps/chosen": -192.6551971435547, + "logps/rejected": -174.02772521972656, + "loss": 0.6523, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.41465169191360474, + "rewards/margins": 0.4233173727989197, + "rewards/rejected": -0.008665725588798523, + "step": 975 + }, + { + "epoch": 0.7710464201416207, + "grad_norm": 21.12126350402832, + "learning_rate": 3.1309224262903614e-06, + "logits/chosen": -0.0248140636831522, + "logits/rejected": -0.2627066373825073, + "logps/chosen": -214.6104278564453, + "logps/rejected": -192.9540557861328, + "loss": 0.6733, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.5803993940353394, + "rewards/margins": 0.2992710471153259, + "rewards/rejected": 0.28112831711769104, + "step": 980 + }, + { + "epoch": 0.7749803304484658, + "grad_norm": 12.457499504089355, + "learning_rate": 3.0316995222881584e-06, + "logits/chosen": -0.40065187215805054, + "logits/rejected": -0.8357529640197754, + "logps/chosen": -192.20655822753906, + "logps/rejected": -164.68626403808594, + "loss": 0.6292, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5738979578018188, + "rewards/margins": 0.4073941111564636, + "rewards/rejected": 0.1665038764476776, + "step": 985 + }, + { + "epoch": 0.7789142407553108, + "grad_norm": 12.965932846069336, + "learning_rate": 2.9337926529466578e-06, + "logits/chosen": -0.5754062533378601, + "logits/rejected": -0.9457462430000305, + "logps/chosen": -189.44522094726562, + "logps/rejected": -169.0963897705078, + "loss": 0.6242, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.49839162826538086, + "rewards/margins": 0.4758077561855316, + "rewards/rejected": 0.02258378639817238, + "step": 990 + }, + { + "epoch": 0.7828481510621558, + "grad_norm": 14.567062377929688, + "learning_rate": 2.83722030897733e-06, + "logits/chosen": 0.24449042975902557, + "logits/rejected": -0.30078762769699097, + "logps/chosen": -205.9731903076172, + "logps/rejected": -173.31008911132812, + "loss": 0.5947, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.5201369524002075, + "rewards/margins": 0.5564968585968018, + "rewards/rejected": -0.036359887570142746, + "step": 995 + }, + { + "epoch": 0.7867820613690008, + "grad_norm": 18.595260620117188, + "learning_rate": 2.7420007290529118e-06, + "logits/chosen": -0.1308153122663498, + "logits/rejected": -0.6352800726890564, + "logps/chosen": -224.5437469482422, + "logps/rejected": -178.47549438476562, + "loss": 0.6361, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.6366292238235474, + "rewards/margins": 0.4467080235481262, + "rewards/rejected": 0.18992114067077637, + "step": 1000 + }, + { + "epoch": 0.7867820613690008, + "eval_logits/chosen": 1.278507113456726, + "eval_logits/rejected": 1.058009147644043, + "eval_logps/chosen": -206.54354858398438, + "eval_logps/rejected": -179.86978149414062, + "eval_loss": 0.6649525165557861, + "eval_rewards/accuracies": 0.625, + "eval_rewards/chosen": 0.5706965923309326, + "eval_rewards/margins": 0.3654647767543793, + "eval_rewards/rejected": 0.20523183047771454, + "eval_runtime": 301.4428, + "eval_samples_per_second": 2.123, + "eval_steps_per_second": 0.133, + "step": 1000 + }, + { + "epoch": 0.7907159716758458, + "grad_norm": 19.2440242767334, + "learning_rate": 2.6481518963628383e-06, + "logits/chosen": -0.11340751498937607, + "logits/rejected": -0.31099405884742737, + "logps/chosen": -212.424072265625, + "logps/rejected": -195.0722198486328, + "loss": 0.6193, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5051929354667664, + "rewards/margins": 0.5108169317245483, + "rewards/rejected": -0.00562392920255661, + "step": 1005 + }, + { + "epoch": 0.7946498819826908, + "grad_norm": 17.27981185913086, + "learning_rate": 2.555691535216944e-06, + "logits/chosen": -0.2921395003795624, + "logits/rejected": -0.7080395817756653, + "logps/chosen": -208.31747436523438, + "logps/rejected": -180.02212524414062, + "loss": 0.676, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.4920189380645752, + "rewards/margins": 0.2944754660129547, + "rewards/rejected": 0.19754347205162048, + "step": 1010 + }, + { + "epoch": 0.7985837922895358, + "grad_norm": 12.276522636413574, + "learning_rate": 2.464637107698046e-06, + "logits/chosen": -0.3768986165523529, + "logits/rejected": -0.9090649485588074, + "logps/chosen": -195.61764526367188, + "logps/rejected": -158.5428466796875, + "loss": 0.676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35970592498779297, + "rewards/margins": 0.34655410051345825, + "rewards/rejected": 0.013151821680366993, + "step": 1015 + }, + { + "epoch": 0.8025177025963808, + "grad_norm": 12.748953819274902, + "learning_rate": 2.3750058103640427e-06, + "logits/chosen": -0.3452379107475281, + "logits/rejected": -0.8985518217086792, + "logps/chosen": -209.6136016845703, + "logps/rejected": -173.26414489746094, + "loss": 0.6122, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5482198596000671, + "rewards/margins": 0.4972603917121887, + "rewards/rejected": 0.05095947906374931, + "step": 1020 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 15.02308177947998, + "learning_rate": 2.286814571000171e-06, + "logits/chosen": -0.2370177060365677, + "logits/rejected": -0.6736031770706177, + "logps/chosen": -194.3092041015625, + "logps/rejected": -164.15817260742188, + "loss": 0.6226, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5286887288093567, + "rewards/margins": 0.4581621289253235, + "rewards/rejected": 0.0705266147851944, + "step": 1025 + }, + { + "epoch": 0.8103855232100708, + "grad_norm": 15.919551849365234, + "learning_rate": 2.2000800454220285e-06, + "logits/chosen": -0.04363623261451721, + "logits/rejected": -0.4236673414707184, + "logps/chosen": -209.69235229492188, + "logps/rejected": -175.1033935546875, + "loss": 0.6664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5712024569511414, + "rewards/margins": 0.4244155287742615, + "rewards/rejected": 0.1467868834733963, + "step": 1030 + }, + { + "epoch": 0.8143194335169158, + "grad_norm": 16.688159942626953, + "learning_rate": 2.114818614329945e-06, + "logits/chosen": -0.18427999317646027, + "logits/rejected": -0.4734131693840027, + "logps/chosen": -200.3739776611328, + "logps/rejected": -180.11984252929688, + "loss": 0.6447, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.6805271506309509, + "rewards/margins": 0.4109037518501282, + "rewards/rejected": 0.26962336897850037, + "step": 1035 + }, + { + "epoch": 0.8182533438237608, + "grad_norm": 15.36899471282959, + "learning_rate": 2.031046380215327e-06, + "logits/chosen": -0.5546427965164185, + "logits/rejected": -0.8263591527938843, + "logps/chosen": -180.88345336914062, + "logps/rejected": -167.74163818359375, + "loss": 0.6247, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5958175659179688, + "rewards/margins": 0.5058714151382446, + "rewards/rejected": 0.08994609862565994, + "step": 1040 + }, + { + "epoch": 0.8221872541306058, + "grad_norm": 24.130155563354492, + "learning_rate": 1.9487791643195276e-06, + "logits/chosen": -0.3917720317840576, + "logits/rejected": -0.7242711782455444, + "logps/chosen": -206.98135375976562, + "logps/rejected": -183.48318481445312, + "loss": 0.7598, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.24247002601623535, + "rewards/margins": 0.1793862134218216, + "rewards/rejected": 0.06308381259441376, + "step": 1045 + }, + { + "epoch": 0.8261211644374509, + "grad_norm": 16.544754028320312, + "learning_rate": 1.8680325036458535e-06, + "logits/chosen": -0.16317354142665863, + "logits/rejected": -0.5910676717758179, + "logps/chosen": -204.1961669921875, + "logps/rejected": -173.1997833251953, + "loss": 0.706, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.40617918968200684, + "rewards/margins": 0.3612835705280304, + "rewards/rejected": 0.04489566385746002, + "step": 1050 + }, + { + "epoch": 0.8300550747442959, + "grad_norm": 28.87963104248047, + "learning_rate": 1.788821648025242e-06, + "logits/chosen": -0.46491608023643494, + "logits/rejected": -0.5262236595153809, + "logps/chosen": -198.90652465820312, + "logps/rejected": -188.23049926757812, + "loss": 0.7507, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.4018153250217438, + "rewards/margins": 0.1714794784784317, + "rewards/rejected": 0.23033586144447327, + "step": 1055 + }, + { + "epoch": 0.8339889850511408, + "grad_norm": 21.89056968688965, + "learning_rate": 1.7111615572361628e-06, + "logits/chosen": -0.1197819709777832, + "logits/rejected": -0.40464717149734497, + "logps/chosen": -211.6194305419922, + "logps/rejected": -192.4689483642578, + "loss": 0.7031, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.44010597467422485, + "rewards/margins": 0.23361381888389587, + "rewards/rejected": 0.20649214088916779, + "step": 1060 + }, + { + "epoch": 0.8379228953579858, + "grad_norm": 25.025197982788086, + "learning_rate": 1.6350668981793304e-06, + "logits/chosen": -0.21810774505138397, + "logits/rejected": -0.536165714263916, + "logps/chosen": -195.72702026367188, + "logps/rejected": -185.3990478515625, + "loss": 0.6838, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22649447619915009, + "rewards/margins": 0.3016238212585449, + "rewards/rejected": -0.07512933015823364, + "step": 1065 + }, + { + "epoch": 0.8418568056648308, + "grad_norm": 15.117574691772461, + "learning_rate": 1.5605520421076969e-06, + "logits/chosen": -0.34034574031829834, + "logits/rejected": -0.5113102793693542, + "logps/chosen": -195.9296417236328, + "logps/rejected": -186.29287719726562, + "loss": 0.6485, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23202356696128845, + "rewards/margins": 0.3296849727630615, + "rewards/rejected": -0.09766140580177307, + "step": 1070 + }, + { + "epoch": 0.8457907159716759, + "grad_norm": 16.99416732788086, + "learning_rate": 1.487631061912298e-06, + "logits/chosen": -0.5572665929794312, + "logits/rejected": -0.8171085119247437, + "logps/chosen": -193.6608123779297, + "logps/rejected": -176.0238800048828, + "loss": 0.6605, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.35814136266708374, + "rewards/margins": 0.37609419226646423, + "rewards/rejected": -0.017952853813767433, + "step": 1075 + }, + { + "epoch": 0.8497246262785209, + "grad_norm": 14.536643981933594, + "learning_rate": 1.4163177294644438e-06, + "logits/chosen": -0.2895492613315582, + "logits/rejected": -0.48721733689308167, + "logps/chosen": -198.87753295898438, + "logps/rejected": -183.21096801757812, + "loss": 0.6838, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.22944995760917664, + "rewards/margins": 0.3464585840702057, + "rewards/rejected": -0.11700858920812607, + "step": 1080 + }, + { + "epoch": 0.8536585365853658, + "grad_norm": 25.793216705322266, + "learning_rate": 1.3466255130147622e-06, + "logits/chosen": -0.36471131443977356, + "logits/rejected": -0.5930619239807129, + "logps/chosen": -187.9856719970703, + "logps/rejected": -175.9360809326172, + "loss": 0.683, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1035921722650528, + "rewards/margins": 0.23180215060710907, + "rewards/rejected": -0.12820999324321747, + "step": 1085 + }, + { + "epoch": 0.8575924468922108, + "grad_norm": 20.578927993774414, + "learning_rate": 1.2785675746495752e-06, + "logits/chosen": -0.24610432982444763, + "logits/rejected": -0.7905102968215942, + "logps/chosen": -188.7552032470703, + "logps/rejected": -163.68289184570312, + "loss": 0.6594, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19958016276359558, + "rewards/margins": 0.35812973976135254, + "rewards/rejected": -0.15854960680007935, + "step": 1090 + }, + { + "epoch": 0.8615263571990559, + "grad_norm": 17.24201011657715, + "learning_rate": 1.212156767805115e-06, + "logits/chosen": -0.3163990080356598, + "logits/rejected": -0.8110219240188599, + "logps/chosen": -175.55859375, + "logps/rejected": -141.836181640625, + "loss": 0.6571, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13926038146018982, + "rewards/margins": 0.2834976315498352, + "rewards/rejected": -0.1442372053861618, + "step": 1095 + }, + { + "epoch": 0.8654602675059009, + "grad_norm": 13.562137603759766, + "learning_rate": 1.1474056348400141e-06, + "logits/chosen": -0.25132131576538086, + "logits/rejected": -0.5677313804626465, + "logps/chosen": -192.50961303710938, + "logps/rejected": -171.54611206054688, + "loss": 0.6721, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3157256245613098, + "rewards/margins": 0.3246195316314697, + "rewards/rejected": -0.008893907070159912, + "step": 1100 + }, + { + "epoch": 0.8654602675059009, + "eval_logits/chosen": 1.2642682790756226, + "eval_logits/rejected": 1.043653964996338, + "eval_logps/chosen": -207.01547241210938, + "eval_logps/rejected": -180.30709838867188, + "eval_loss": 0.6631070971488953, + "eval_rewards/accuracies": 0.6265624761581421, + "eval_rewards/chosen": 0.38192370533943176, + "eval_rewards/margins": 0.3516288101673126, + "eval_rewards/rejected": 0.03029490076005459, + "eval_runtime": 300.501, + "eval_samples_per_second": 2.13, + "eval_steps_per_second": 0.133, + "step": 1100 + }, + { + "epoch": 0.8693941778127459, + "grad_norm": 21.122217178344727, + "learning_rate": 1.0843264046665558e-06, + "logits/chosen": -0.5116424560546875, + "logits/rejected": -0.6911696195602417, + "logps/chosen": -184.15603637695312, + "logps/rejected": -175.35256958007812, + "loss": 0.757, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.15398895740509033, + "rewards/margins": 0.07341472804546356, + "rewards/rejected": 0.08057420700788498, + "step": 1105 + }, + { + "epoch": 0.8733280881195908, + "grad_norm": 12.891325950622559, + "learning_rate": 1.0229309904411178e-06, + "logits/chosen": -0.5018507838249207, + "logits/rejected": -0.8595576286315918, + "logps/chosen": -195.92578125, + "logps/rejected": -172.9355010986328, + "loss": 0.6866, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.32908183336257935, + "rewards/margins": 0.38747507333755493, + "rewards/rejected": -0.05839322879910469, + "step": 1110 + }, + { + "epoch": 0.8772619984264359, + "grad_norm": 17.477975845336914, + "learning_rate": 9.63230987314251e-07, + "logits/chosen": -0.27941471338272095, + "logits/rejected": -0.5305674076080322, + "logps/chosen": -191.43380737304688, + "logps/rejected": -170.3942108154297, + "loss": 0.6999, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.39622825384140015, + "rewards/margins": 0.23104743659496307, + "rewards/rejected": 0.16518081724643707, + "step": 1115 + }, + { + "epoch": 0.8811959087332809, + "grad_norm": 19.63365936279297, + "learning_rate": 9.052376702408206e-07, + "logits/chosen": -0.4624987542629242, + "logits/rejected": -0.5762002468109131, + "logps/chosen": -187.87295532226562, + "logps/rejected": -193.58670043945312, + "loss": 0.7027, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.4038239121437073, + "rewards/margins": 0.24877241253852844, + "rewards/rejected": 0.15505146980285645, + "step": 1120 + }, + { + "epoch": 0.8851298190401259, + "grad_norm": 20.428455352783203, + "learning_rate": 8.489619918506098e-07, + "logits/chosen": -0.23860251903533936, + "logits/rejected": -0.6500253677368164, + "logps/chosen": -212.96658325195312, + "logps/rejected": -179.9956512451172, + "loss": 0.7073, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.24726350605487823, + "rewards/margins": 0.2762225866317749, + "rewards/rejected": -0.028959061950445175, + "step": 1125 + }, + { + "epoch": 0.8890637293469709, + "grad_norm": 19.340242385864258, + "learning_rate": 7.944145803798064e-07, + "logits/chosen": -0.23527947068214417, + "logits/rejected": -0.59322589635849, + "logps/chosen": -203.28225708007812, + "logps/rejected": -180.2418670654297, + "loss": 0.681, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.3411061465740204, + "rewards/margins": 0.18520446121692657, + "rewards/rejected": 0.155901700258255, + "step": 1130 + }, + { + "epoch": 0.8929976396538158, + "grad_norm": 79.16990661621094, + "learning_rate": 7.416057376637543e-07, + "logits/chosen": -0.3579210638999939, + "logits/rejected": -0.6960107088088989, + "logps/chosen": -200.02012634277344, + "logps/rejected": -180.67965698242188, + "loss": 0.6985, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.4765182435512543, + "rewards/margins": 0.23227711021900177, + "rewards/rejected": 0.2442411184310913, + "step": 1135 + }, + { + "epoch": 0.8969315499606609, + "grad_norm": 17.410009384155273, + "learning_rate": 6.905454371913467e-07, + "logits/chosen": -0.1638367921113968, + "logits/rejected": -0.5099595189094543, + "logps/chosen": -195.05340576171875, + "logps/rejected": -173.5426788330078, + "loss": 0.586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.4057907164096832, + "rewards/margins": 0.6160932183265686, + "rewards/rejected": -0.21030254662036896, + "step": 1140 + }, + { + "epoch": 0.9008654602675059, + "grad_norm": 22.917627334594727, + "learning_rate": 6.412433222214265e-07, + "logits/chosen": -0.2664688527584076, + "logits/rejected": -0.6332502365112305, + "logps/chosen": -216.44711303710938, + "logps/rejected": -192.39352416992188, + "loss": 0.6699, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.47267699241638184, + "rewards/margins": 0.3270387649536133, + "rewards/rejected": 0.14563825726509094, + "step": 1145 + }, + { + "epoch": 0.9047993705743509, + "grad_norm": 17.761707305908203, + "learning_rate": 5.937087039615619e-07, + "logits/chosen": 0.004246175289154053, + "logits/rejected": -0.3583109974861145, + "logps/chosen": -208.1468963623047, + "logps/rejected": -186.03244018554688, + "loss": 0.647, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.37165918946266174, + "rewards/margins": 0.42197996377944946, + "rewards/rejected": -0.05032079294323921, + "step": 1150 + }, + { + "epoch": 0.9087332808811959, + "grad_norm": 26.38233184814453, + "learning_rate": 5.479505598095292e-07, + "logits/chosen": -0.12539446353912354, + "logits/rejected": -0.085462287068367, + "logps/chosen": -205.96804809570312, + "logps/rejected": -210.92672729492188, + "loss": 0.7508, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.34662169218063354, + "rewards/margins": 0.055609725415706635, + "rewards/rejected": 0.2910119593143463, + "step": 1155 + }, + { + "epoch": 0.912667191188041, + "grad_norm": 25.847694396972656, + "learning_rate": 5.03977531657841e-07, + "logits/chosen": -0.023742878809571266, + "logits/rejected": -0.445591539144516, + "logps/chosen": -206.13525390625, + "logps/rejected": -183.71890258789062, + "loss": 0.6829, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.3747442364692688, + "rewards/margins": 0.33868470788002014, + "rewards/rejected": 0.036059536039829254, + "step": 1160 + }, + { + "epoch": 0.9166011014948859, + "grad_norm": 14.531253814697266, + "learning_rate": 4.6179792426163107e-07, + "logits/chosen": -0.13202346861362457, + "logits/rejected": -0.539734423160553, + "logps/chosen": -192.2351531982422, + "logps/rejected": -167.59829711914062, + "loss": 0.6574, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.4072590470314026, + "rewards/margins": 0.43480420112609863, + "rewards/rejected": -0.027545183897018433, + "step": 1165 + }, + { + "epoch": 0.9205350118017309, + "grad_norm": 18.72174835205078, + "learning_rate": 4.214197036702239e-07, + "logits/chosen": 0.10880019515752792, + "logits/rejected": -0.2607296109199524, + "logps/chosen": -215.71939086914062, + "logps/rejected": -196.47320556640625, + "loss": 0.657, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.5132928490638733, + "rewards/margins": 0.36024293303489685, + "rewards/rejected": 0.15304993093013763, + "step": 1170 + }, + { + "epoch": 0.9244689221085759, + "grad_norm": 16.053632736206055, + "learning_rate": 3.82850495722662e-07, + "logits/chosen": -0.07127988338470459, + "logits/rejected": -0.5435328483581543, + "logps/chosen": -210.20547485351562, + "logps/rejected": -173.3409881591797, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4504272937774658, + "rewards/margins": 0.3692251741886139, + "rewards/rejected": 0.08120210468769073, + "step": 1175 + }, + { + "epoch": 0.9284028324154209, + "grad_norm": 20.035791397094727, + "learning_rate": 3.4609758460748656e-07, + "logits/chosen": -0.1992299109697342, + "logits/rejected": -0.43638792634010315, + "logps/chosen": -196.8170623779297, + "logps/rejected": -181.31607055664062, + "loss": 0.6511, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.29465168714523315, + "rewards/margins": 0.3416779041290283, + "rewards/rejected": -0.04702623561024666, + "step": 1180 + }, + { + "epoch": 0.932336742722266, + "grad_norm": 14.104338645935059, + "learning_rate": 3.1116791148704584e-07, + "logits/chosen": -0.5095082521438599, + "logits/rejected": -0.933671772480011, + "logps/chosen": -181.0245819091797, + "logps/rejected": -145.5948944091797, + "loss": 0.6582, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.405670702457428, + "rewards/margins": 0.39661210775375366, + "rewards/rejected": 0.009058552794158459, + "step": 1185 + }, + { + "epoch": 0.9362706530291109, + "grad_norm": 24.393505096435547, + "learning_rate": 2.78068073186587e-07, + "logits/chosen": -0.07540292292833328, + "logits/rejected": -0.5439732670783997, + "logps/chosen": -220.9651336669922, + "logps/rejected": -198.6578826904297, + "loss": 0.662, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.47583404183387756, + "rewards/margins": 0.4002237915992737, + "rewards/rejected": 0.07561029493808746, + "step": 1190 + }, + { + "epoch": 0.9402045633359559, + "grad_norm": 22.48759651184082, + "learning_rate": 2.4680432094837394e-07, + "logits/chosen": -0.030767759308218956, + "logits/rejected": -0.40518251061439514, + "logps/chosen": -192.23971557617188, + "logps/rejected": -165.24063110351562, + "loss": 0.6944, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.32550159096717834, + "rewards/margins": 0.2751082479953766, + "rewards/rejected": 0.05039336532354355, + "step": 1195 + }, + { + "epoch": 0.9441384736428009, + "grad_norm": 16.363256454467773, + "learning_rate": 2.1738255925108253e-07, + "logits/chosen": -0.5227106809616089, + "logits/rejected": -0.7640475034713745, + "logps/chosen": -218.41708374023438, + "logps/rejected": -194.16444396972656, + "loss": 0.668, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.38994377851486206, + "rewards/margins": 0.3536146283149719, + "rewards/rejected": 0.03632917255163193, + "step": 1200 + }, + { + "epoch": 0.9441384736428009, + "eval_logits/chosen": 1.2622064352035522, + "eval_logits/rejected": 1.0416359901428223, + "eval_logps/chosen": -207.0163116455078, + "eval_logps/rejected": -180.30044555664062, + "eval_loss": 0.6637659072875977, + "eval_rewards/accuracies": 0.6390625238418579, + "eval_rewards/chosen": 0.38159698247909546, + "eval_rewards/margins": 0.34863370656967163, + "eval_rewards/rejected": 0.03296329826116562, + "eval_runtime": 307.2933, + "eval_samples_per_second": 2.083, + "eval_steps_per_second": 0.13, + "step": 1200 + }, + { + "epoch": 0.948072383949646, + "grad_norm": 15.801830291748047, + "learning_rate": 1.8980834469467523e-07, + "logits/chosen": 0.028558891266584396, + "logits/rejected": -0.36049187183380127, + "logps/chosen": -225.1962127685547, + "logps/rejected": -196.6998748779297, + "loss": 0.7157, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.30342918634414673, + "rewards/margins": 0.18997251987457275, + "rewards/rejected": 0.11345665156841278, + "step": 1205 + }, + { + "epoch": 0.952006294256491, + "grad_norm": 21.53165054321289, + "learning_rate": 1.6408688495098134e-07, + "logits/chosen": -0.09858529269695282, + "logits/rejected": -0.52873694896698, + "logps/chosen": -208.2776336669922, + "logps/rejected": -179.6067657470703, + "loss": 0.701, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3496444821357727, + "rewards/margins": 0.24224546551704407, + "rewards/rejected": 0.10739902406930923, + "step": 1210 + }, + { + "epoch": 0.955940204563336, + "grad_norm": 15.417522430419922, + "learning_rate": 1.402230377801761e-07, + "logits/chosen": -0.12817321717739105, + "logits/rejected": -0.5611924529075623, + "logps/chosen": -223.1984405517578, + "logps/rejected": -191.31808471679688, + "loss": 0.673, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.5014004707336426, + "rewards/margins": 0.3005516231060028, + "rewards/rejected": 0.20084881782531738, + "step": 1215 + }, + { + "epoch": 0.9598741148701809, + "grad_norm": 21.139495849609375, + "learning_rate": 1.1822131011334003e-07, + "logits/chosen": -0.330310583114624, + "logits/rejected": -0.6778287887573242, + "logps/chosen": -206.1497802734375, + "logps/rejected": -175.0183563232422, + "loss": 0.6634, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.40439096093177795, + "rewards/margins": 0.39146164059638977, + "rewards/rejected": 0.012929338030517101, + "step": 1220 + }, + { + "epoch": 0.963808025177026, + "grad_norm": 15.441524505615234, + "learning_rate": 9.80858572012866e-08, + "logits/chosen": -0.10460350662469864, + "logits/rejected": -0.46022725105285645, + "logps/chosen": -223.1492156982422, + "logps/rejected": -193.82369995117188, + "loss": 0.6415, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.3919476568698883, + "rewards/margins": 0.37521207332611084, + "rewards/rejected": 0.016735553741455078, + "step": 1225 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 19.24515724182129, + "learning_rate": 7.982048182978985e-08, + "logits/chosen": -0.3437039256095886, + "logits/rejected": -0.7036724090576172, + "logps/chosen": -210.358642578125, + "logps/rejected": -189.95278930664062, + "loss": 0.676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3740697503089905, + "rewards/margins": 0.34888529777526855, + "rewards/rejected": 0.02518446370959282, + "step": 1230 + }, + { + "epoch": 0.971675845790716, + "grad_norm": 16.690387725830078, + "learning_rate": 6.342863360139672e-08, + "logits/chosen": -0.29954901337623596, + "logits/rejected": -0.7138617634773254, + "logps/chosen": -181.06094360351562, + "logps/rejected": -157.12701416015625, + "loss": 0.6961, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.3025146424770355, + "rewards/margins": 0.25972747802734375, + "rewards/rejected": 0.04278718680143356, + "step": 1235 + }, + { + "epoch": 0.975609756097561, + "grad_norm": 17.477008819580078, + "learning_rate": 4.8913408283934874e-08, + "logits/chosen": -0.19394654035568237, + "logits/rejected": -0.5592636466026306, + "logps/chosen": -211.7626495361328, + "logps/rejected": -190.55416870117188, + "loss": 0.6955, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.39831337332725525, + "rewards/margins": 0.3408041000366211, + "rewards/rejected": 0.05750928074121475, + "step": 1240 + }, + { + "epoch": 0.9795436664044059, + "grad_norm": 14.256926536560059, + "learning_rate": 3.627754722584031e-08, + "logits/chosen": -0.15048038959503174, + "logits/rejected": -0.5208483934402466, + "logps/chosen": -223.10110473632812, + "logps/rejected": -190.59140014648438, + "loss": 0.6593, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.49489039182662964, + "rewards/margins": 0.3687785863876343, + "rewards/rejected": 0.12611182034015656, + "step": 1245 + }, + { + "epoch": 0.983477576711251, + "grad_norm": 29.595378875732422, + "learning_rate": 2.5523436838430503e-08, + "logits/chosen": -0.3160143494606018, + "logits/rejected": -0.6430375576019287, + "logps/chosen": -196.36361694335938, + "logps/rejected": -166.49758911132812, + "loss": 0.6625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.36816468834877014, + "rewards/margins": 0.36051416397094727, + "rewards/rejected": 0.007650518324226141, + "step": 1250 + }, + { + "epoch": 0.987411487018096, + "grad_norm": 13.689908027648926, + "learning_rate": 1.665310814520482e-08, + "logits/chosen": -0.6328016519546509, + "logits/rejected": -0.9240643382072449, + "logps/chosen": -188.88470458984375, + "logps/rejected": -166.7686767578125, + "loss": 0.6975, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.09804626554250717, + "rewards/margins": 0.26263147592544556, + "rewards/rejected": -0.1645852029323578, + "step": 1255 + }, + { + "epoch": 0.991345397324941, + "grad_norm": 17.047653198242188, + "learning_rate": 9.668236398262532e-09, + "logits/chosen": -0.35158300399780273, + "logits/rejected": -0.6125014424324036, + "logps/chosen": -203.73788452148438, + "logps/rejected": -189.255126953125, + "loss": 0.6549, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1564028114080429, + "rewards/margins": 0.3770085275173187, + "rewards/rejected": -0.22060570120811462, + "step": 1260 + }, + { + "epoch": 0.995279307631786, + "grad_norm": 21.35641098022461, + "learning_rate": 4.570140761918085e-09, + "logits/chosen": -0.744472861289978, + "logits/rejected": -0.9415663480758667, + "logps/chosen": -186.4073028564453, + "logps/rejected": -176.38418579101562, + "loss": 0.6604, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.32688194513320923, + "rewards/margins": 0.42695555090904236, + "rewards/rejected": -0.10007365047931671, + "step": 1265 + }, + { + "epoch": 0.999213217938631, + "grad_norm": 14.891934394836426, + "learning_rate": 1.3597840635615201e-09, + "logits/chosen": -0.14978916943073273, + "logits/rejected": -0.6677058935165405, + "logps/chosen": -209.85635375976562, + "logps/rejected": -172.8124237060547, + "loss": 0.6708, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.3781249225139618, + "rewards/margins": 0.32455307245254517, + "rewards/rejected": 0.05357181280851364, + "step": 1270 + }, + { + "epoch": 1.0, + "step": 1271, + "total_flos": 0.0, + "train_loss": 0.675776368140424, + "train_runtime": 24039.6181, + "train_samples_per_second": 0.846, + "train_steps_per_second": 0.053 + } + ], + "logging_steps": 5, + "max_steps": 1271, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}