diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13883 @@ +{ + "best_global_step": 850, + "best_metric": 0.31901347637176514, + "best_model_checkpoint": "/experiment_results/dpo/A-vibe_OPEN_SOURCE_checkpoint-1600_dpo_chosen_OUR_super_unsafe_from_PR_x15_NEW_CORRECT_04_10_25_v9/checkpoint-850", + "epoch": 1.0, + "eval_steps": 50, + "global_step": 904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011061946902654867, + "grad_norm": 21.33904266357422, + "learning_rate": 0.0, + "logits/chosen": -1.55078125, + "logits/rejected": -1.46875, + "logps/chosen": -288.0, + "logps/rejected": -235.5, + "loss": 0.7017, + "rewards/accuracies": 0.078125, + "rewards/chosen": -0.007916450500488281, + "rewards/margins": -0.0164794921875, + "rewards/rejected": 0.00848388671875, + "step": 1 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 19.44487762451172, + "learning_rate": 1.7857142857142856e-08, + "logits/chosen": -1.5, + "logits/rejected": -1.43359375, + "logps/chosen": -259.0, + "logps/rejected": -226.0, + "loss": 0.6987, + "rewards/accuracies": 0.171875, + "rewards/chosen": -0.006072998046875, + "rewards/margins": -0.007415771484375, + "rewards/rejected": 0.0013580322265625, + "step": 2 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 21.772796630859375, + "learning_rate": 3.571428571428571e-08, + "logits/chosen": -1.58984375, + "logits/rejected": -1.54296875, + "logps/chosen": -288.0, + "logps/rejected": -286.0, + "loss": 0.6943, + "rewards/accuracies": 0.296875, + "rewards/chosen": -0.0041046142578125, + "rewards/margins": 0.002166748046875, + "rewards/rejected": -0.0062713623046875, + "step": 3 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 20.7520751953125, + "learning_rate": 5.3571428571428564e-08, + "logits/chosen": -1.65625, + "logits/rejected": -1.6015625, + "logps/chosen": -257.5, + "logps/rejected": -243.0, + "loss": 0.6858, + "rewards/accuracies": 0.328125, + "rewards/chosen": 0.0084075927734375, + "rewards/margins": 0.0184326171875, + "rewards/rejected": -0.009979248046875, + "step": 4 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 22.113121032714844, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": -1.5234375, + "logits/rejected": -1.53515625, + "logps/chosen": -263.0, + "logps/rejected": -262.5, + "loss": 0.6965, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.008130073547363281, + "rewards/margins": -0.003143310546875, + "rewards/rejected": -0.0049991607666015625, + "step": 5 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 22.67697525024414, + "learning_rate": 8.928571428571429e-08, + "logits/chosen": -1.4609375, + "logits/rejected": -1.62109375, + "logps/chosen": -252.5, + "logps/rejected": -259.5, + "loss": 0.6851, + "rewards/accuracies": 0.3046875, + "rewards/chosen": 0.00469970703125, + "rewards/margins": 0.013885498046875, + "rewards/rejected": -0.009189605712890625, + "step": 6 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 23.316373825073242, + "learning_rate": 1.0714285714285713e-07, + "logits/chosen": -1.46484375, + "logits/rejected": -1.3984375, + "logps/chosen": -279.0, + "logps/rejected": -271.0, + "loss": 0.698, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.00156402587890625, + "rewards/margins": -0.0079498291015625, + "rewards/rejected": 0.00946044921875, + "step": 7 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 24.865726470947266, + "learning_rate": 1.25e-07, + "logits/chosen": -1.43359375, + "logits/rejected": -1.546875, + "logps/chosen": -275.0, + "logps/rejected": -292.0, + "loss": 0.7039, + "rewards/accuracies": 0.203125, + "rewards/chosen": -0.006072998046875, + "rewards/margins": -0.01904296875, + "rewards/rejected": 0.01300048828125, + "step": 8 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 20.924415588378906, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": -1.55859375, + "logits/rejected": -1.51953125, + "logps/chosen": -238.5, + "logps/rejected": -238.5, + "loss": 0.6892, + "rewards/accuracies": 0.296875, + "rewards/chosen": 0.0125274658203125, + "rewards/margins": 0.0072021484375, + "rewards/rejected": 0.0052642822265625, + "step": 9 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 19.864246368408203, + "learning_rate": 1.6071428571428573e-07, + "logits/chosen": -1.56640625, + "logits/rejected": -1.48046875, + "logps/chosen": -249.0, + "logps/rejected": -230.0, + "loss": 0.6956, + "rewards/accuracies": 0.3046875, + "rewards/chosen": 0.0086822509765625, + "rewards/margins": 0.00128173828125, + "rewards/rejected": 0.0074615478515625, + "step": 10 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 22.528316497802734, + "learning_rate": 1.7857142857142858e-07, + "logits/chosen": -1.59375, + "logits/rejected": -1.5, + "logps/chosen": -272.0, + "logps/rejected": -290.0, + "loss": 0.6936, + "rewards/accuracies": 0.3359375, + "rewards/chosen": -0.0045032501220703125, + "rewards/margins": 0.0057544708251953125, + "rewards/rejected": -0.01029062271118164, + "step": 11 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 21.385112762451172, + "learning_rate": 1.964285714285714e-07, + "logits/chosen": -1.43359375, + "logits/rejected": -1.38671875, + "logps/chosen": -270.0, + "logps/rejected": -281.0, + "loss": 0.6895, + "rewards/accuracies": 0.3359375, + "rewards/chosen": 0.001373291015625, + "rewards/margins": 0.0108795166015625, + "rewards/rejected": -0.009471893310546875, + "step": 12 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 21.703392028808594, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -1.51953125, + "logits/rejected": -1.35546875, + "logps/chosen": -258.0, + "logps/rejected": -263.0, + "loss": 0.7104, + "rewards/accuracies": 0.2421875, + "rewards/chosen": -0.0135040283203125, + "rewards/margins": -0.0324249267578125, + "rewards/rejected": 0.01898193359375, + "step": 13 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 19.697071075439453, + "learning_rate": 2.3214285714285714e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -1.56640625, + "logps/chosen": -248.0, + "logps/rejected": -233.5, + "loss": 0.6953, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.005401611328125, + "rewards/margins": -0.001678466796875, + "rewards/rejected": 0.007049560546875, + "step": 14 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 21.335206985473633, + "learning_rate": 2.5e-07, + "logits/chosen": -1.56640625, + "logits/rejected": -1.51953125, + "logps/chosen": -272.0, + "logps/rejected": -270.0, + "loss": 0.6838, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0098724365234375, + "rewards/margins": 0.0190277099609375, + "rewards/rejected": -0.009204864501953125, + "step": 15 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 21.42949867248535, + "learning_rate": 2.6785714285714284e-07, + "logits/chosen": -1.515625, + "logits/rejected": -1.6328125, + "logps/chosen": -248.5, + "logps/rejected": -244.5, + "loss": 0.6785, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.00927734375, + "rewards/margins": 0.0289306640625, + "rewards/rejected": -0.0196533203125, + "step": 16 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 20.796878814697266, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": -1.60546875, + "logits/rejected": -1.625, + "logps/chosen": -231.5, + "logps/rejected": -231.5, + "loss": 0.6899, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.00724029541015625, + "rewards/margins": 0.011138916015625, + "rewards/rejected": -0.00391387939453125, + "step": 17 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 20.082786560058594, + "learning_rate": 3.0357142857142855e-07, + "logits/chosen": -1.46875, + "logits/rejected": -1.40625, + "logps/chosen": -251.0, + "logps/rejected": -248.5, + "loss": 0.688, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.016357421875, + "rewards/margins": 0.0147705078125, + "rewards/rejected": 0.0015716552734375, + "step": 18 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 21.640682220458984, + "learning_rate": 3.2142857142857145e-07, + "logits/chosen": -1.59765625, + "logits/rejected": -1.3515625, + "logps/chosen": -264.0, + "logps/rejected": -262.0, + "loss": 0.6912, + "rewards/accuracies": 0.3359375, + "rewards/chosen": 0.01506805419921875, + "rewards/margins": 0.005462646484375, + "rewards/rejected": 0.00958251953125, + "step": 19 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 22.128896713256836, + "learning_rate": 3.392857142857143e-07, + "logits/chosen": -1.57421875, + "logits/rejected": -1.47265625, + "logps/chosen": -267.5, + "logps/rejected": -267.0, + "loss": 0.6917, + "rewards/accuracies": 0.3515625, + "rewards/chosen": 0.01458740234375, + "rewards/margins": 0.0075225830078125, + "rewards/rejected": 0.007049560546875, + "step": 20 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 20.139122009277344, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -1.58203125, + "logits/rejected": -1.46484375, + "logps/chosen": -251.5, + "logps/rejected": -251.0, + "loss": 0.699, + "rewards/accuracies": 0.3203125, + "rewards/chosen": 0.003238677978515625, + "rewards/margins": -0.0052642822265625, + "rewards/rejected": 0.008502960205078125, + "step": 21 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 20.964323043823242, + "learning_rate": 3.75e-07, + "logits/chosen": -1.55859375, + "logits/rejected": -1.49609375, + "logps/chosen": -236.0, + "logps/rejected": -260.5, + "loss": 0.6882, + "rewards/accuracies": 0.3984375, + "rewards/chosen": 0.02685546875, + "rewards/margins": 0.013885498046875, + "rewards/rejected": 0.01297760009765625, + "step": 22 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 19.556018829345703, + "learning_rate": 3.928571428571428e-07, + "logits/chosen": -1.6015625, + "logits/rejected": -1.4296875, + "logps/chosen": -234.0, + "logps/rejected": -210.0, + "loss": 0.6941, + "rewards/accuracies": 0.359375, + "rewards/chosen": 0.01114654541015625, + "rewards/margins": 0.00146484375, + "rewards/rejected": 0.0096893310546875, + "step": 23 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 195.61749267578125, + "learning_rate": 4.1071428571428566e-07, + "logits/chosen": -1.59375, + "logits/rejected": -1.328125, + "logps/chosen": -264.0, + "logps/rejected": -329.5, + "loss": 0.676, + "rewards/accuracies": 0.4921875, + "rewards/chosen": 0.0570068359375, + "rewards/margins": -0.006103515625, + "rewards/rejected": 0.0631866455078125, + "step": 24 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 21.722719192504883, + "learning_rate": 4.285714285714285e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.546875, + "logps/chosen": -259.0, + "logps/rejected": -269.0, + "loss": 0.6887, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.0538330078125, + "rewards/margins": 0.0157470703125, + "rewards/rejected": 0.03802490234375, + "step": 25 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 22.364490509033203, + "learning_rate": 4.464285714285714e-07, + "logits/chosen": -1.41015625, + "logits/rejected": -1.36328125, + "logps/chosen": -296.0, + "logps/rejected": -305.0, + "loss": 0.6882, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.0601806640625, + "rewards/margins": 0.01422119140625, + "rewards/rejected": 0.0460205078125, + "step": 26 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 20.38817024230957, + "learning_rate": 4.6428571428571427e-07, + "logits/chosen": -1.44140625, + "logits/rejected": -1.390625, + "logps/chosen": -280.0, + "logps/rejected": -265.0, + "loss": 0.6743, + "rewards/accuracies": 0.4609375, + "rewards/chosen": 0.0726318359375, + "rewards/margins": 0.0447998046875, + "rewards/rejected": 0.02783203125, + "step": 27 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 21.340524673461914, + "learning_rate": 4.821428571428571e-07, + "logits/chosen": -1.4609375, + "logits/rejected": -1.49609375, + "logps/chosen": -263.0, + "logps/rejected": -233.5, + "loss": 0.6704, + "rewards/accuracies": 0.4921875, + "rewards/chosen": 0.0859375, + "rewards/margins": 0.05419921875, + "rewards/rejected": 0.03167724609375, + "step": 28 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 22.794097900390625, + "learning_rate": 5e-07, + "logits/chosen": -1.47265625, + "logits/rejected": -1.52734375, + "logps/chosen": -251.5, + "logps/rejected": -277.0, + "loss": 0.6665, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.100341796875, + "rewards/margins": 0.0615234375, + "rewards/rejected": 0.03863525390625, + "step": 29 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 21.916282653808594, + "learning_rate": 4.999983923145526e-07, + "logits/chosen": -1.45703125, + "logits/rejected": -1.44140625, + "logps/chosen": -268.0, + "logps/rejected": -271.0, + "loss": 0.6672, + "rewards/accuracies": 0.5078125, + "rewards/chosen": 0.087158203125, + "rewards/margins": 0.052978515625, + "rewards/rejected": 0.03411865234375, + "step": 30 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 20.50246810913086, + "learning_rate": 4.999935692788877e-07, + "logits/chosen": -1.44140625, + "logits/rejected": -1.42578125, + "logps/chosen": -263.0, + "logps/rejected": -280.0, + "loss": 0.6626, + "rewards/accuracies": 0.4921875, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.071533203125, + "rewards/rejected": 0.03887939453125, + "step": 31 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 21.142545700073242, + "learning_rate": 4.999855309550366e-07, + "logits/chosen": -1.54296875, + "logits/rejected": -1.5859375, + "logps/chosen": -291.0, + "logps/rejected": -268.0, + "loss": 0.6704, + "rewards/accuracies": 0.5390625, + "rewards/chosen": 0.091552734375, + "rewards/margins": 0.0552978515625, + "rewards/rejected": 0.03607177734375, + "step": 32 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 20.50800895690918, + "learning_rate": 4.999742774463842e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.40234375, + "logps/chosen": -256.5, + "logps/rejected": -270.0, + "loss": 0.6494, + "rewards/accuracies": 0.6171875, + "rewards/chosen": 0.1484375, + "rewards/margins": 0.092041015625, + "rewards/rejected": 0.0565185546875, + "step": 33 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 19.532590866088867, + "learning_rate": 4.999598088976672e-07, + "logits/chosen": -1.49609375, + "logits/rejected": -1.4765625, + "logps/chosen": -250.0, + "logps/rejected": -260.0, + "loss": 0.6445, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.177734375, + "rewards/margins": 0.105224609375, + "rewards/rejected": 0.072509765625, + "step": 34 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 20.883621215820312, + "learning_rate": 4.999421254949727e-07, + "logits/chosen": -1.5390625, + "logits/rejected": -1.4140625, + "logps/chosen": -271.0, + "logps/rejected": -270.0, + "loss": 0.6501, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 0.16943359375, + "rewards/margins": 0.103271484375, + "rewards/rejected": 0.0655517578125, + "step": 35 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 20.3232479095459, + "learning_rate": 4.999212274657353e-07, + "logits/chosen": -1.51953125, + "logits/rejected": -1.46484375, + "logps/chosen": -257.0, + "logps/rejected": -255.5, + "loss": 0.6428, + "rewards/accuracies": 0.6015625, + "rewards/chosen": 0.2080078125, + "rewards/margins": 0.114013671875, + "rewards/rejected": 0.09375, + "step": 36 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 21.007495880126953, + "learning_rate": 4.99897115078735e-07, + "logits/chosen": -1.4609375, + "logits/rejected": -1.58984375, + "logps/chosen": -259.5, + "logps/rejected": -253.0, + "loss": 0.636, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.24609375, + "rewards/margins": 0.13525390625, + "rewards/rejected": 0.110595703125, + "step": 37 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 18.463993072509766, + "learning_rate": 4.998697886440926e-07, + "logits/chosen": -1.5078125, + "logits/rejected": -1.4375, + "logps/chosen": -242.0, + "logps/rejected": -246.0, + "loss": 0.6384, + "rewards/accuracies": 0.5234375, + "rewards/chosen": 0.23681640625, + "rewards/margins": 0.130859375, + "rewards/rejected": 0.106201171875, + "step": 38 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 20.67741584777832, + "learning_rate": 4.998392485132666e-07, + "logits/chosen": -1.49609375, + "logits/rejected": -1.375, + "logps/chosen": -267.0, + "logps/rejected": -275.0, + "loss": 0.6331, + "rewards/accuracies": 0.5546875, + "rewards/chosen": 0.275390625, + "rewards/margins": 0.14990234375, + "rewards/rejected": 0.12548828125, + "step": 39 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 20.392776489257812, + "learning_rate": 4.998054950790485e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.4609375, + "logps/chosen": -275.0, + "logps/rejected": -285.0, + "loss": 0.6218, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.2841796875, + "rewards/margins": 0.1650390625, + "rewards/rejected": 0.11865234375, + "step": 40 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 19.42493438720703, + "learning_rate": 4.997685287755575e-07, + "logits/chosen": -1.515625, + "logits/rejected": -1.3828125, + "logps/chosen": -271.0, + "logps/rejected": -262.5, + "loss": 0.6274, + "rewards/accuracies": 0.5703125, + "rewards/chosen": 0.2744140625, + "rewards/margins": 0.15234375, + "rewards/rejected": 0.12158203125, + "step": 41 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 19.07245635986328, + "learning_rate": 4.99728350078235e-07, + "logits/chosen": -1.53515625, + "logits/rejected": -1.4453125, + "logps/chosen": -274.0, + "logps/rejected": -251.5, + "loss": 0.6108, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.3173828125, + "rewards/margins": 0.18896484375, + "rewards/rejected": 0.127685546875, + "step": 42 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 19.7177677154541, + "learning_rate": 4.996849595038388e-07, + "logits/chosen": -1.515625, + "logits/rejected": -1.49609375, + "logps/chosen": -273.5, + "logps/rejected": -281.0, + "loss": 0.6208, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.3310546875, + "rewards/margins": 0.17529296875, + "rewards/rejected": 0.15625, + "step": 43 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 19.820003509521484, + "learning_rate": 4.996383576104361e-07, + "logits/chosen": -1.5234375, + "logits/rejected": -1.421875, + "logps/chosen": -261.0, + "logps/rejected": -263.5, + "loss": 0.6196, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.330078125, + "rewards/margins": 0.1796875, + "rewards/rejected": 0.15087890625, + "step": 44 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 20.092729568481445, + "learning_rate": 4.995885449973962e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.39453125, + "logps/chosen": -293.0, + "logps/rejected": -295.0, + "loss": 0.6111, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.3408203125, + "rewards/margins": 0.20068359375, + "rewards/rejected": 0.140380859375, + "step": 45 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 18.567899703979492, + "learning_rate": 4.995355223053834e-07, + "logits/chosen": -1.5, + "logits/rejected": -1.44921875, + "logps/chosen": -260.5, + "logps/rejected": -255.5, + "loss": 0.6146, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.3359375, + "rewards/margins": 0.1982421875, + "rewards/rejected": 0.13720703125, + "step": 46 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 20.356060028076172, + "learning_rate": 4.994792902163481e-07, + "logits/chosen": -1.45703125, + "logits/rejected": -1.29296875, + "logps/chosen": -280.0, + "logps/rejected": -260.0, + "loss": 0.627, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 0.357421875, + "rewards/margins": 0.1787109375, + "rewards/rejected": 0.1787109375, + "step": 47 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 20.717748641967773, + "learning_rate": 4.994198494535182e-07, + "logits/chosen": -1.4765625, + "logits/rejected": -1.41796875, + "logps/chosen": -280.0, + "logps/rejected": -281.0, + "loss": 0.5881, + "rewards/accuracies": 0.6171875, + "rewards/chosen": 0.4091796875, + "rewards/margins": 0.25390625, + "rewards/rejected": 0.1552734375, + "step": 48 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 19.05792236328125, + "learning_rate": 4.993572007813904e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.35546875, + "logps/chosen": -251.5, + "logps/rejected": -277.0, + "loss": 0.5889, + "rewards/accuracies": 0.6015625, + "rewards/chosen": 0.41015625, + "rewards/margins": 0.263671875, + "rewards/rejected": 0.14599609375, + "step": 49 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 17.29762840270996, + "learning_rate": 4.992913450057195e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.35546875, + "logps/chosen": -237.0, + "logps/rejected": -224.5, + "loss": 0.5867, + "rewards/accuracies": 0.6015625, + "rewards/chosen": 0.4501953125, + "rewards/margins": 0.26953125, + "rewards/rejected": 0.1796875, + "step": 50 + }, + { + "epoch": 0.05530973451327434, + "eval_logits/chosen": -1.4197372198104858, + "eval_logits/rejected": -1.4136348962783813, + "eval_logps/chosen": -255.96517944335938, + "eval_logps/rejected": -257.37811279296875, + "eval_loss": 0.5778365731239319, + "eval_rewards/accuracies": 0.6217424273490906, + "eval_rewards/chosen": 0.4856770932674408, + "eval_rewards/margins": 0.2985657751560211, + "eval_rewards/rejected": 0.1872473508119583, + "eval_runtime": 210.1095, + "eval_samples_per_second": 61.173, + "eval_steps_per_second": 0.957, + "step": 50 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 18.575069427490234, + "learning_rate": 4.992222829735082e-07, + "logits/chosen": -1.5078125, + "logits/rejected": -1.421875, + "logps/chosen": -260.0, + "logps/rejected": -259.0, + "loss": 0.5874, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.48046875, + "rewards/margins": 0.27734375, + "rewards/rejected": 0.203125, + "step": 51 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 17.976177215576172, + "learning_rate": 4.991500155729971e-07, + "logits/chosen": -1.42578125, + "logits/rejected": -1.43359375, + "logps/chosen": -252.5, + "logps/rejected": -256.0, + "loss": 0.575, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 0.50390625, + "rewards/margins": 0.30078125, + "rewards/rejected": 0.20458984375, + "step": 52 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 17.979496002197266, + "learning_rate": 4.99074543733652e-07, + "logits/chosen": -1.4453125, + "logits/rejected": -1.46875, + "logps/chosen": -265.0, + "logps/rejected": -267.0, + "loss": 0.5444, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 0.55859375, + "rewards/margins": 0.3984375, + "rewards/rejected": 0.1611328125, + "step": 53 + }, + { + "epoch": 0.059734513274336286, + "grad_norm": 17.736249923706055, + "learning_rate": 4.989958684261526e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.375, + "logps/chosen": -255.5, + "logps/rejected": -292.0, + "loss": 0.5529, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.568359375, + "rewards/margins": 0.39453125, + "rewards/rejected": 0.17333984375, + "step": 54 + }, + { + "epoch": 0.06084070796460177, + "grad_norm": 17.284420013427734, + "learning_rate": 4.989139906623802e-07, + "logits/chosen": -1.44140625, + "logits/rejected": -1.42578125, + "logps/chosen": -253.0, + "logps/rejected": -256.5, + "loss": 0.5522, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 0.599609375, + "rewards/margins": 0.373046875, + "rewards/rejected": 0.2255859375, + "step": 55 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 17.704713821411133, + "learning_rate": 4.988289114954044e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.35546875, + "logps/chosen": -237.5, + "logps/rejected": -260.0, + "loss": 0.5504, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.654296875, + "rewards/margins": 0.3896484375, + "rewards/rejected": 0.263671875, + "step": 56 + }, + { + "epoch": 0.06305309734513274, + "grad_norm": 17.688888549804688, + "learning_rate": 4.987406320194694e-07, + "logits/chosen": -1.453125, + "logits/rejected": -1.3359375, + "logps/chosen": -242.5, + "logps/rejected": -247.5, + "loss": 0.5537, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.69140625, + "rewards/margins": 0.3857421875, + "rewards/rejected": 0.3046875, + "step": 57 + }, + { + "epoch": 0.06415929203539823, + "grad_norm": 17.646419525146484, + "learning_rate": 4.986491533699802e-07, + "logits/chosen": -1.41015625, + "logits/rejected": -1.37109375, + "logps/chosen": -256.5, + "logps/rejected": -281.0, + "loss": 0.5431, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.71484375, + "rewards/margins": 0.439453125, + "rewards/rejected": 0.27490234375, + "step": 58 + }, + { + "epoch": 0.06526548672566372, + "grad_norm": 17.19894027709961, + "learning_rate": 4.985544767234879e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.4453125, + "logps/chosen": -245.5, + "logps/rejected": -250.5, + "loss": 0.5403, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.7421875, + "rewards/margins": 0.4541015625, + "rewards/rejected": 0.287109375, + "step": 59 + }, + { + "epoch": 0.06637168141592921, + "grad_norm": 16.630441665649414, + "learning_rate": 4.984566032976749e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.33984375, + "logps/chosen": -248.0, + "logps/rejected": -254.5, + "loss": 0.5386, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 0.828125, + "rewards/margins": 0.466796875, + "rewards/rejected": 0.361328125, + "step": 60 + }, + { + "epoch": 0.06747787610619468, + "grad_norm": 17.252979278564453, + "learning_rate": 4.983555343513384e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.4375, + "logps/chosen": -250.5, + "logps/rejected": -275.5, + "loss": 0.5028, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.8515625, + "rewards/margins": 0.578125, + "rewards/rejected": 0.271484375, + "step": 61 + }, + { + "epoch": 0.06858407079646017, + "grad_norm": 17.014602661132812, + "learning_rate": 4.982512711843752e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.30078125, + "logps/chosen": -242.0, + "logps/rejected": -243.5, + "loss": 0.5159, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.94140625, + "rewards/margins": 0.5458984375, + "rewards/rejected": 0.39453125, + "step": 62 + }, + { + "epoch": 0.06969026548672566, + "grad_norm": 16.877351760864258, + "learning_rate": 4.98143815137764e-07, + "logits/chosen": -1.40234375, + "logits/rejected": -1.36328125, + "logps/chosen": -264.0, + "logps/rejected": -285.0, + "loss": 0.5343, + "rewards/accuracies": 0.6328125, + "rewards/chosen": 0.857421875, + "rewards/margins": 0.4853515625, + "rewards/rejected": 0.373046875, + "step": 63 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 17.047161102294922, + "learning_rate": 4.980331675935493e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.34375, + "logps/chosen": -254.5, + "logps/rejected": -291.0, + "loss": 0.5211, + "rewards/accuracies": 0.6484375, + "rewards/chosen": 0.962890625, + "rewards/margins": 0.533203125, + "rewards/rejected": 0.4296875, + "step": 64 + }, + { + "epoch": 0.07190265486725664, + "grad_norm": 17.61670684814453, + "learning_rate": 4.979193299748224e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.3828125, + "logps/chosen": -265.0, + "logps/rejected": -278.0, + "loss": 0.4878, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 0.96484375, + "rewards/margins": 0.6640625, + "rewards/rejected": 0.2998046875, + "step": 65 + }, + { + "epoch": 0.07300884955752213, + "grad_norm": 17.460668563842773, + "learning_rate": 4.978023037457043e-07, + "logits/chosen": -1.44921875, + "logits/rejected": -1.3515625, + "logps/chosen": -267.0, + "logps/rejected": -279.0, + "loss": 0.536, + "rewards/accuracies": 0.5703125, + "rewards/chosen": 0.95703125, + "rewards/margins": 0.5390625, + "rewards/rejected": 0.416015625, + "step": 66 + }, + { + "epoch": 0.07411504424778761, + "grad_norm": 181.168212890625, + "learning_rate": 4.976820904113256e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.30859375, + "logps/chosen": -233.5, + "logps/rejected": -340.0, + "loss": 0.489, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.05859375, + "rewards/margins": 0.646484375, + "rewards/rejected": 0.41015625, + "step": 67 + }, + { + "epoch": 0.0752212389380531, + "grad_norm": 15.64547061920166, + "learning_rate": 4.975586915178084e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.40234375, + "logps/chosen": -241.5, + "logps/rejected": -256.0, + "loss": 0.4702, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 1.041015625, + "rewards/margins": 0.716796875, + "rewards/rejected": 0.32421875, + "step": 68 + }, + { + "epoch": 0.07632743362831858, + "grad_norm": 16.069597244262695, + "learning_rate": 4.974321086522452e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.234375, + "logps/chosen": -256.5, + "logps/rejected": -251.5, + "loss": 0.5188, + "rewards/accuracies": 0.6328125, + "rewards/chosen": 0.986328125, + "rewards/margins": 0.568359375, + "rewards/rejected": 0.4208984375, + "step": 69 + }, + { + "epoch": 0.07743362831858407, + "grad_norm": 149.1916046142578, + "learning_rate": 4.973023434426798e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -1.421875, + "logps/chosen": -248.5, + "logps/rejected": -247.0, + "loss": 0.5642, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.96875, + "rewards/margins": 0.5107421875, + "rewards/rejected": 0.458984375, + "step": 70 + }, + { + "epoch": 0.07853982300884955, + "grad_norm": 16.17060089111328, + "learning_rate": 4.971693975580851e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.28515625, + "logps/chosen": -232.0, + "logps/rejected": -241.0, + "loss": 0.5079, + "rewards/accuracies": 0.6328125, + "rewards/chosen": 1.025390625, + "rewards/margins": 0.61328125, + "rewards/rejected": 0.4140625, + "step": 71 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 17.631471633911133, + "learning_rate": 4.970332727083425e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.35546875, + "logps/chosen": -271.5, + "logps/rejected": -283.0, + "loss": 0.5212, + "rewards/accuracies": 0.625, + "rewards/chosen": 1.1328125, + "rewards/margins": 0.619140625, + "rewards/rejected": 0.5166015625, + "step": 72 + }, + { + "epoch": 0.08075221238938053, + "grad_norm": 17.61063003540039, + "learning_rate": 4.968939706442195e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.203125, + "logps/chosen": -275.0, + "logps/rejected": -255.5, + "loss": 0.5211, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.99609375, + "rewards/margins": 0.583984375, + "rewards/rejected": 0.4130859375, + "step": 73 + }, + { + "epoch": 0.08185840707964602, + "grad_norm": 16.28321075439453, + "learning_rate": 4.967514931573472e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.41015625, + "logps/chosen": -243.0, + "logps/rejected": -258.0, + "loss": 0.4977, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 1.20703125, + "rewards/margins": 0.689453125, + "rewards/rejected": 0.5146484375, + "step": 74 + }, + { + "epoch": 0.08296460176991151, + "grad_norm": 16.079599380493164, + "learning_rate": 4.966058420801977e-07, + "logits/chosen": -1.34765625, + "logits/rejected": -1.35546875, + "logps/chosen": -259.5, + "logps/rejected": -244.0, + "loss": 0.4648, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 1.13671875, + "rewards/margins": 0.779296875, + "rewards/rejected": 0.3583984375, + "step": 75 + }, + { + "epoch": 0.084070796460177, + "grad_norm": 18.725271224975586, + "learning_rate": 4.964570192860596e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.36328125, + "logps/chosen": -287.0, + "logps/rejected": -250.5, + "loss": 0.5436, + "rewards/accuracies": 0.5859375, + "rewards/chosen": 1.109375, + "rewards/margins": 0.5234375, + "rewards/rejected": 0.583984375, + "step": 76 + }, + { + "epoch": 0.08517699115044247, + "grad_norm": 15.678024291992188, + "learning_rate": 4.963050266890152e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.4609375, + "logps/chosen": -253.0, + "logps/rejected": -246.0, + "loss": 0.4833, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.15625, + "rewards/margins": 0.767578125, + "rewards/rejected": 0.390625, + "step": 77 + }, + { + "epoch": 0.08628318584070796, + "grad_norm": 15.54704761505127, + "learning_rate": 4.961498662439145e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.359375, + "logps/chosen": -230.5, + "logps/rejected": -249.0, + "loss": 0.4718, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.3046875, + "rewards/margins": 0.82421875, + "rewards/rejected": 0.48046875, + "step": 78 + }, + { + "epoch": 0.08738938053097345, + "grad_norm": 16.073156356811523, + "learning_rate": 4.959915399463512e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.27734375, + "logps/chosen": -246.0, + "logps/rejected": -259.5, + "loss": 0.4602, + "rewards/accuracies": 0.671875, + "rewards/chosen": 1.27734375, + "rewards/margins": 0.85546875, + "rewards/rejected": 0.423828125, + "step": 79 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 14.94100284576416, + "learning_rate": 4.958300498326362e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.40234375, + "logps/chosen": -231.0, + "logps/rejected": -264.5, + "loss": 0.4397, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.30078125, + "rewards/margins": 0.953125, + "rewards/rejected": 0.345703125, + "step": 80 + }, + { + "epoch": 0.08960176991150443, + "grad_norm": 17.28353500366211, + "learning_rate": 4.956653979797721e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.3828125, + "logps/chosen": -280.0, + "logps/rejected": -259.5, + "loss": 0.5153, + "rewards/accuracies": 0.609375, + "rewards/chosen": 1.18359375, + "rewards/margins": 0.71875, + "rewards/rejected": 0.4658203125, + "step": 81 + }, + { + "epoch": 0.09070796460176991, + "grad_norm": 15.823220252990723, + "learning_rate": 4.954975865054259e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.32421875, + "logps/chosen": -255.5, + "logps/rejected": -256.5, + "loss": 0.4614, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.34375, + "rewards/margins": 0.87109375, + "rewards/rejected": 0.474609375, + "step": 82 + }, + { + "epoch": 0.0918141592920354, + "grad_norm": 14.873113632202148, + "learning_rate": 4.953266175679023e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.3125, + "logps/chosen": -236.5, + "logps/rejected": -241.5, + "loss": 0.4586, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.33984375, + "rewards/margins": 0.869140625, + "rewards/rejected": 0.47265625, + "step": 83 + }, + { + "epoch": 0.09292035398230089, + "grad_norm": 16.7225341796875, + "learning_rate": 4.951524933661154e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.30078125, + "logps/chosen": -256.0, + "logps/rejected": -237.0, + "loss": 0.5129, + "rewards/accuracies": 0.640625, + "rewards/chosen": 1.1953125, + "rewards/margins": 0.6953125, + "rewards/rejected": 0.5009765625, + "step": 84 + }, + { + "epoch": 0.09402654867256637, + "grad_norm": 15.362020492553711, + "learning_rate": 4.949752161395605e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.24609375, + "logps/chosen": -257.0, + "logps/rejected": -252.0, + "loss": 0.4339, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.2890625, + "rewards/margins": 0.990234375, + "rewards/rejected": 0.30078125, + "step": 85 + }, + { + "epoch": 0.09513274336283185, + "grad_norm": 15.91197681427002, + "learning_rate": 4.94794788168286e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.24609375, + "logps/chosen": -229.5, + "logps/rejected": -255.5, + "loss": 0.4675, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.3203125, + "rewards/margins": 0.85546875, + "rewards/rejected": 0.4658203125, + "step": 86 + }, + { + "epoch": 0.09623893805309734, + "grad_norm": 15.647570610046387, + "learning_rate": 4.946112117728634e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.3359375, + "logps/chosen": -243.0, + "logps/rejected": -235.5, + "loss": 0.4574, + "rewards/accuracies": 0.671875, + "rewards/chosen": 1.28515625, + "rewards/margins": 0.865234375, + "rewards/rejected": 0.41796875, + "step": 87 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 17.21525001525879, + "learning_rate": 4.944244893143572e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.26953125, + "logps/chosen": -268.0, + "logps/rejected": -264.0, + "loss": 0.4832, + "rewards/accuracies": 0.671875, + "rewards/chosen": 1.28515625, + "rewards/margins": 0.826171875, + "rewards/rejected": 0.4580078125, + "step": 88 + }, + { + "epoch": 0.09845132743362832, + "grad_norm": 16.55433464050293, + "learning_rate": 4.942346231942955e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.3828125, + "logps/chosen": -255.0, + "logps/rejected": -258.5, + "loss": 0.4967, + "rewards/accuracies": 0.609375, + "rewards/chosen": 1.3359375, + "rewards/margins": 0.80078125, + "rewards/rejected": 0.53515625, + "step": 89 + }, + { + "epoch": 0.09955752212389381, + "grad_norm": 15.434545516967773, + "learning_rate": 4.94041615854638e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.28515625, + "logps/chosen": -265.0, + "logps/rejected": -262.5, + "loss": 0.4258, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 1.42578125, + "rewards/margins": 1.0390625, + "rewards/rejected": 0.3876953125, + "step": 90 + }, + { + "epoch": 0.1006637168141593, + "grad_norm": 16.600032806396484, + "learning_rate": 4.938454697777457e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.1953125, + "logps/chosen": -279.0, + "logps/rejected": -278.0, + "loss": 0.473, + "rewards/accuracies": 0.671875, + "rewards/chosen": 1.265625, + "rewards/margins": 0.8828125, + "rewards/rejected": 0.3818359375, + "step": 91 + }, + { + "epoch": 0.10176991150442478, + "grad_norm": 17.01357078552246, + "learning_rate": 4.936461874863479e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.3828125, + "logps/chosen": -255.5, + "logps/rejected": -286.0, + "loss": 0.495, + "rewards/accuracies": 0.640625, + "rewards/chosen": 1.3125, + "rewards/margins": 0.833984375, + "rewards/rejected": 0.4765625, + "step": 92 + }, + { + "epoch": 0.10287610619469026, + "grad_norm": 15.263469696044922, + "learning_rate": 4.934437715435107e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.33203125, + "logps/chosen": -244.5, + "logps/rejected": -245.0, + "loss": 0.4545, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.328125, + "rewards/margins": 0.955078125, + "rewards/rejected": 0.3759765625, + "step": 93 + }, + { + "epoch": 0.10398230088495575, + "grad_norm": 15.016999244689941, + "learning_rate": 4.932382245526034e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.21875, + "logps/chosen": -245.0, + "logps/rejected": -256.5, + "loss": 0.4381, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.34375, + "rewards/margins": 0.970703125, + "rewards/rejected": 0.3740234375, + "step": 94 + }, + { + "epoch": 0.10508849557522124, + "grad_norm": 16.34784507751465, + "learning_rate": 4.930295491572653e-07, + "logits/chosen": -1.37109375, + "logits/rejected": -1.3203125, + "logps/chosen": -247.0, + "logps/rejected": -260.0, + "loss": 0.4766, + "rewards/accuracies": 0.640625, + "rewards/chosen": 1.34765625, + "rewards/margins": 0.87890625, + "rewards/rejected": 0.46875, + "step": 95 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 15.943212509155273, + "learning_rate": 4.928177480413714e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.28125, + "logps/chosen": -259.0, + "logps/rejected": -270.5, + "loss": 0.4727, + "rewards/accuracies": 0.6484375, + "rewards/chosen": 1.3515625, + "rewards/margins": 0.955078125, + "rewards/rejected": 0.3955078125, + "step": 96 + }, + { + "epoch": 0.10730088495575221, + "grad_norm": 16.164276123046875, + "learning_rate": 4.926028239289984e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.38671875, + "logps/chosen": -273.5, + "logps/rejected": -268.5, + "loss": 0.4556, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 1.37890625, + "rewards/margins": 0.97265625, + "rewards/rejected": 0.4072265625, + "step": 97 + }, + { + "epoch": 0.1084070796460177, + "grad_norm": 16.16509437561035, + "learning_rate": 4.923847795843893e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.2265625, + "logps/chosen": -270.0, + "logps/rejected": -277.0, + "loss": 0.4657, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.265625, + "rewards/margins": 0.947265625, + "rewards/rejected": 0.3173828125, + "step": 98 + }, + { + "epoch": 0.10951327433628319, + "grad_norm": 16.42505645751953, + "learning_rate": 4.921636178119177e-07, + "logits/chosen": -1.47265625, + "logits/rejected": -1.16796875, + "logps/chosen": -251.0, + "logps/rejected": -232.5, + "loss": 0.4747, + "rewards/accuracies": 0.6328125, + "rewards/chosen": 1.296875, + "rewards/margins": 0.8671875, + "rewards/rejected": 0.4287109375, + "step": 99 + }, + { + "epoch": 0.11061946902654868, + "grad_norm": 23.872831344604492, + "learning_rate": 4.919393414560522e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.3125, + "logps/chosen": -246.0, + "logps/rejected": -249.5, + "loss": 0.4521, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.26171875, + "rewards/margins": 0.93359375, + "rewards/rejected": 0.328125, + "step": 100 + }, + { + "epoch": 0.11061946902654868, + "eval_logits/chosen": -1.329796314239502, + "eval_logits/rejected": -1.3044931888580322, + "eval_logps/chosen": -247.93531799316406, + "eval_logps/rejected": -256.3333435058594, + "eval_loss": 0.4438014328479767, + "eval_rewards/accuracies": 0.7061508893966675, + "eval_rewards/chosen": 1.2971081733703613, + "eval_rewards/margins": 1.0102806091308594, + "eval_rewards/rejected": 0.2868901491165161, + "eval_runtime": 193.1281, + "eval_samples_per_second": 66.552, + "eval_steps_per_second": 1.041, + "step": 100 + }, + { + "epoch": 0.11172566371681415, + "grad_norm": 13.894512176513672, + "learning_rate": 4.917119534013193e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.234375, + "logps/chosen": -233.0, + "logps/rejected": -233.0, + "loss": 0.418, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.25390625, + "rewards/margins": 1.0546875, + "rewards/rejected": 0.1982421875, + "step": 101 + }, + { + "epoch": 0.11283185840707964, + "grad_norm": 15.946537017822266, + "learning_rate": 4.91481456572267e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.23828125, + "logps/chosen": -251.5, + "logps/rejected": -249.5, + "loss": 0.4642, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 1.22265625, + "rewards/margins": 0.91796875, + "rewards/rejected": 0.30517578125, + "step": 102 + }, + { + "epoch": 0.11393805309734513, + "grad_norm": 13.790292739868164, + "learning_rate": 4.912478539334264e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.35546875, + "logps/chosen": -223.5, + "logps/rejected": -241.0, + "loss": 0.3972, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.40625, + "rewards/margins": 1.14453125, + "rewards/rejected": 0.25634765625, + "step": 103 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 14.399200439453125, + "learning_rate": 4.910111484892739e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.26171875, + "logps/chosen": -240.5, + "logps/rejected": -260.5, + "loss": 0.3929, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.36328125, + "rewards/margins": 1.23046875, + "rewards/rejected": 0.1336669921875, + "step": 104 + }, + { + "epoch": 0.1161504424778761, + "grad_norm": 16.46123504638672, + "learning_rate": 4.907713432841928e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.14453125, + "logps/chosen": -255.0, + "logps/rejected": -229.0, + "loss": 0.5001, + "rewards/accuracies": 0.6171875, + "rewards/chosen": 1.0625, + "rewards/margins": 0.828125, + "rewards/rejected": 0.23486328125, + "step": 105 + }, + { + "epoch": 0.1172566371681416, + "grad_norm": 16.047285079956055, + "learning_rate": 4.905284414024337e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.40234375, + "logps/chosen": -242.5, + "logps/rejected": -284.0, + "loss": 0.4525, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.22265625, + "rewards/margins": 0.986328125, + "rewards/rejected": 0.23388671875, + "step": 106 + }, + { + "epoch": 0.11836283185840708, + "grad_norm": 16.925933837890625, + "learning_rate": 4.902824459680752e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.3125, + "logps/chosen": -265.0, + "logps/rejected": -266.0, + "loss": 0.46, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.19140625, + "rewards/margins": 0.978515625, + "rewards/rejected": 0.209228515625, + "step": 107 + }, + { + "epoch": 0.11946902654867257, + "grad_norm": 15.083377838134766, + "learning_rate": 4.900333601449835e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.33203125, + "logps/chosen": -266.0, + "logps/rejected": -265.0, + "loss": 0.4376, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.21484375, + "rewards/margins": 1.05078125, + "rewards/rejected": 0.162109375, + "step": 108 + }, + { + "epoch": 0.12057522123893805, + "grad_norm": 17.161651611328125, + "learning_rate": 4.89781187136772e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.27734375, + "logps/chosen": -254.0, + "logps/rejected": -276.0, + "loss": 0.4388, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.26171875, + "rewards/margins": 1.09375, + "rewards/rejected": 0.16796875, + "step": 109 + }, + { + "epoch": 0.12168141592920353, + "grad_norm": 15.041230201721191, + "learning_rate": 4.895259301867595e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.328125, + "logps/chosen": -246.0, + "logps/rejected": -280.0, + "loss": 0.4269, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.3046875, + "rewards/margins": 1.1171875, + "rewards/rejected": 0.18603515625, + "step": 110 + }, + { + "epoch": 0.12278761061946902, + "grad_norm": 13.183340072631836, + "learning_rate": 4.892675925779292e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.36328125, + "logps/chosen": -207.5, + "logps/rejected": -250.0, + "loss": 0.4122, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.328125, + "rewards/margins": 1.26171875, + "rewards/rejected": 0.0699310302734375, + "step": 111 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 15.673736572265625, + "learning_rate": 4.89006177632886e-07, + "logits/chosen": -1.375, + "logits/rejected": -1.37109375, + "logps/chosen": -263.0, + "logps/rejected": -272.0, + "loss": 0.4412, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.24609375, + "rewards/margins": 1.08203125, + "rewards/rejected": 0.1640625, + "step": 112 + }, + { + "epoch": 0.125, + "grad_norm": 14.536067008972168, + "learning_rate": 4.887416887138138e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.15625, + "logps/chosen": -257.5, + "logps/rejected": -270.0, + "loss": 0.4604, + "rewards/accuracies": 0.671875, + "rewards/chosen": 1.1953125, + "rewards/margins": 1.09375, + "rewards/rejected": 0.10595703125, + "step": 113 + }, + { + "epoch": 0.1261061946902655, + "grad_norm": 14.573882102966309, + "learning_rate": 4.884741292224326e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.34765625, + "logps/chosen": -240.0, + "logps/rejected": -268.5, + "loss": 0.4091, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.265625, + "rewards/margins": 1.16796875, + "rewards/rejected": 0.09814453125, + "step": 114 + }, + { + "epoch": 0.12721238938053098, + "grad_norm": 15.746649742126465, + "learning_rate": 4.882035025999544e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.1875, + "logps/chosen": -273.5, + "logps/rejected": -270.5, + "loss": 0.4313, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.15234375, + "rewards/margins": 1.1640625, + "rewards/rejected": -0.0108642578125, + "step": 115 + }, + { + "epoch": 0.12831858407079647, + "grad_norm": 15.159698486328125, + "learning_rate": 4.879298123270391e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.36328125, + "logps/chosen": -244.5, + "logps/rejected": -256.0, + "loss": 0.4331, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 1.23046875, + "rewards/margins": 1.171875, + "rewards/rejected": 0.0592041015625, + "step": 116 + }, + { + "epoch": 0.12942477876106195, + "grad_norm": 14.491214752197266, + "learning_rate": 4.876530619237495e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.2578125, + "logps/chosen": -235.5, + "logps/rejected": -233.5, + "loss": 0.4267, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 1.19140625, + "rewards/margins": 1.20703125, + "rewards/rejected": -0.018310546875, + "step": 117 + }, + { + "epoch": 0.13053097345132744, + "grad_norm": 15.388319969177246, + "learning_rate": 4.873732549495065e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.25, + "logps/chosen": -263.0, + "logps/rejected": -254.5, + "loss": 0.4351, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.16796875, + "rewards/margins": 1.1171875, + "rewards/rejected": 0.05218505859375, + "step": 118 + }, + { + "epoch": 0.13163716814159293, + "grad_norm": 14.733474731445312, + "learning_rate": 4.870903950030428e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.31640625, + "logps/chosen": -241.5, + "logps/rejected": -274.0, + "loss": 0.376, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.390625, + "rewards/margins": 1.30859375, + "rewards/rejected": 0.086669921875, + "step": 119 + }, + { + "epoch": 0.13274336283185842, + "grad_norm": 14.26261043548584, + "learning_rate": 4.868044857223571e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.29296875, + "logps/chosen": -248.5, + "logps/rejected": -281.0, + "loss": 0.3815, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.296875, + "rewards/margins": 1.3046875, + "rewards/rejected": -0.008544921875, + "step": 120 + }, + { + "epoch": 0.1338495575221239, + "grad_norm": 15.668647766113281, + "learning_rate": 4.865155307846669e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.4375, + "logps/chosen": -232.0, + "logps/rejected": -240.0, + "loss": 0.4114, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.2734375, + "rewards/margins": 1.2421875, + "rewards/rejected": 0.0323486328125, + "step": 121 + }, + { + "epoch": 0.13495575221238937, + "grad_norm": 16.117341995239258, + "learning_rate": 4.862235339063613e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.26171875, + "logps/chosen": -252.5, + "logps/rejected": -268.0, + "loss": 0.4789, + "rewards/accuracies": 0.6640625, + "rewards/chosen": 1.10546875, + "rewards/margins": 0.9375, + "rewards/rejected": 0.169189453125, + "step": 122 + }, + { + "epoch": 0.13606194690265486, + "grad_norm": 15.315237998962402, + "learning_rate": 4.859284988429533e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.37109375, + "logps/chosen": -264.0, + "logps/rejected": -302.0, + "loss": 0.4574, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 1.09375, + "rewards/margins": 1.0234375, + "rewards/rejected": 0.0693359375, + "step": 123 + }, + { + "epoch": 0.13716814159292035, + "grad_norm": 13.38134765625, + "learning_rate": 4.856304293890317e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.18359375, + "logps/chosen": -255.0, + "logps/rejected": -253.5, + "loss": 0.3681, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.3984375, + "rewards/margins": 1.484375, + "rewards/rejected": -0.08575439453125, + "step": 124 + }, + { + "epoch": 0.13827433628318583, + "grad_norm": 17.225801467895508, + "learning_rate": 4.853293293782118e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.4140625, + "logps/chosen": -276.0, + "logps/rejected": -280.0, + "loss": 0.458, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 1.1796875, + "rewards/margins": 1.08203125, + "rewards/rejected": 0.09991455078125, + "step": 125 + }, + { + "epoch": 0.13938053097345132, + "grad_norm": 14.186132431030273, + "learning_rate": 4.850252026830863e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.26953125, + "logps/chosen": -234.5, + "logps/rejected": -252.5, + "loss": 0.4436, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.25390625, + "rewards/margins": 1.123046875, + "rewards/rejected": 0.1328125, + "step": 126 + }, + { + "epoch": 0.1404867256637168, + "grad_norm": 14.477481842041016, + "learning_rate": 4.84718053215176e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.23046875, + "logps/chosen": -249.5, + "logps/rejected": -256.0, + "loss": 0.4314, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.296875, + "rewards/margins": 1.140625, + "rewards/rejected": 0.1552734375, + "step": 127 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 15.153040885925293, + "learning_rate": 4.844078849248785e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.3125, + "logps/chosen": -260.0, + "logps/rejected": -292.0, + "loss": 0.3964, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.421875, + "rewards/margins": 1.37890625, + "rewards/rejected": 0.0396728515625, + "step": 128 + }, + { + "epoch": 0.1426991150442478, + "grad_norm": 14.35177230834961, + "learning_rate": 4.840947018014182e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.19140625, + "logps/chosen": -256.5, + "logps/rejected": -251.5, + "loss": 0.4107, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.33203125, + "rewards/margins": 1.2734375, + "rewards/rejected": 0.060028076171875, + "step": 129 + }, + { + "epoch": 0.14380530973451328, + "grad_norm": 14.168734550476074, + "learning_rate": 4.837785078727948e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.19140625, + "logps/chosen": -248.0, + "logps/rejected": -284.0, + "loss": 0.3812, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.3984375, + "rewards/margins": 1.390625, + "rewards/rejected": 0.00927734375, + "step": 130 + }, + { + "epoch": 0.14491150442477876, + "grad_norm": 15.743026733398438, + "learning_rate": 4.834593072057313e-07, + "logits/chosen": -1.28125, + "logits/rejected": -1.30078125, + "logps/chosen": -246.0, + "logps/rejected": -265.0, + "loss": 0.4586, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.31640625, + "rewards/margins": 1.08984375, + "rewards/rejected": 0.2255859375, + "step": 131 + }, + { + "epoch": 0.14601769911504425, + "grad_norm": 16.969074249267578, + "learning_rate": 4.831371039056217e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.1484375, + "logps/chosen": -275.0, + "logps/rejected": -296.0, + "loss": 0.4373, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.2109375, + "rewards/margins": 1.19921875, + "rewards/rejected": 0.0108642578125, + "step": 132 + }, + { + "epoch": 0.14712389380530974, + "grad_norm": 14.101773262023926, + "learning_rate": 4.828119021164786e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.296875, + "logps/chosen": -246.5, + "logps/rejected": -277.0, + "loss": 0.3919, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.359375, + "rewards/margins": 1.43359375, + "rewards/rejected": -0.07568359375, + "step": 133 + }, + { + "epoch": 0.14823008849557523, + "grad_norm": 15.83488941192627, + "learning_rate": 4.824837060208795e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.265625, + "logps/chosen": -275.0, + "logps/rejected": -268.5, + "loss": 0.4578, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.1953125, + "rewards/margins": 0.998046875, + "rewards/rejected": 0.193359375, + "step": 134 + }, + { + "epoch": 0.14933628318584072, + "grad_norm": 13.669934272766113, + "learning_rate": 4.82152519839913e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.2578125, + "logps/chosen": -241.5, + "logps/rejected": -243.5, + "loss": 0.3765, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.4375, + "rewards/margins": 1.40625, + "rewards/rejected": 0.0296630859375, + "step": 135 + }, + { + "epoch": 0.1504424778761062, + "grad_norm": 16.85657501220703, + "learning_rate": 4.818183478331247e-07, + "logits/chosen": -1.13671875, + "logits/rejected": -1.25390625, + "logps/chosen": -257.5, + "logps/rejected": -277.5, + "loss": 0.4258, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 1.390625, + "rewards/margins": 1.3125, + "rewards/rejected": 0.0772705078125, + "step": 136 + }, + { + "epoch": 0.1515486725663717, + "grad_norm": 15.21373462677002, + "learning_rate": 4.814811942984625e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.1953125, + "logps/chosen": -256.5, + "logps/rejected": -240.0, + "loss": 0.4232, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.19921875, + "rewards/margins": 1.12890625, + "rewards/rejected": 0.0693359375, + "step": 137 + }, + { + "epoch": 0.15265486725663716, + "grad_norm": 13.69796085357666, + "learning_rate": 4.811410635722209e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.21875, + "logps/chosen": -236.5, + "logps/rejected": -257.0, + "loss": 0.3722, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.4453125, + "rewards/margins": 1.55078125, + "rewards/rejected": -0.10595703125, + "step": 138 + }, + { + "epoch": 0.15376106194690264, + "grad_norm": 15.000753402709961, + "learning_rate": 4.807979600289857e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.27734375, + "logps/chosen": -274.0, + "logps/rejected": -297.0, + "loss": 0.3709, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.30078125, + "rewards/margins": 1.515625, + "rewards/rejected": -0.21240234375, + "step": 139 + }, + { + "epoch": 0.15486725663716813, + "grad_norm": 13.44487476348877, + "learning_rate": 4.804518880815776e-07, + "logits/chosen": -1.15625, + "logits/rejected": -1.27734375, + "logps/chosen": -248.5, + "logps/rejected": -267.5, + "loss": 0.3818, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.37109375, + "rewards/margins": 1.515625, + "rewards/rejected": -0.144775390625, + "step": 140 + }, + { + "epoch": 0.15597345132743362, + "grad_norm": 15.1209135055542, + "learning_rate": 4.801028521809951e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.16796875, + "logps/chosen": -273.0, + "logps/rejected": -271.5, + "loss": 0.4027, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.19140625, + "rewards/margins": 1.3125, + "rewards/rejected": -0.122802734375, + "step": 141 + }, + { + "epoch": 0.1570796460176991, + "grad_norm": 16.363567352294922, + "learning_rate": 4.797508568163578e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.2109375, + "logps/chosen": -262.0, + "logps/rejected": -269.0, + "loss": 0.4581, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.072265625, + "rewards/margins": 1.169921875, + "rewards/rejected": -0.097412109375, + "step": 142 + }, + { + "epoch": 0.1581858407079646, + "grad_norm": 13.670063972473145, + "learning_rate": 4.793959065148484e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.2421875, + "logps/chosen": -240.0, + "logps/rejected": -254.5, + "loss": 0.3719, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.296875, + "rewards/margins": 1.4765625, + "rewards/rejected": -0.179443359375, + "step": 143 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 14.17078971862793, + "learning_rate": 4.790380058416542e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.23046875, + "logps/chosen": -240.0, + "logps/rejected": -259.5, + "loss": 0.3726, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.35546875, + "rewards/margins": 1.625, + "rewards/rejected": -0.2666015625, + "step": 144 + }, + { + "epoch": 0.16039823008849557, + "grad_norm": 13.858586311340332, + "learning_rate": 4.786771593999089e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.25, + "logps/chosen": -242.5, + "logps/rejected": -251.5, + "loss": 0.377, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.3515625, + "rewards/margins": 1.5078125, + "rewards/rejected": -0.154296875, + "step": 145 + }, + { + "epoch": 0.16150442477876106, + "grad_norm": 15.108954429626465, + "learning_rate": 4.783133718306331e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.28125, + "logps/chosen": -266.0, + "logps/rejected": -305.0, + "loss": 0.4185, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.21875, + "rewards/margins": 1.37109375, + "rewards/rejected": -0.15087890625, + "step": 146 + }, + { + "epoch": 0.16261061946902655, + "grad_norm": 14.861040115356445, + "learning_rate": 4.779466478126746e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.30078125, + "logps/chosen": -242.0, + "logps/rejected": -239.5, + "loss": 0.3849, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.140625, + "rewards/margins": 1.4296875, + "rewards/rejected": -0.2919921875, + "step": 147 + }, + { + "epoch": 0.16371681415929204, + "grad_norm": 14.671175956726074, + "learning_rate": 4.775769920626483e-07, + "logits/chosen": -1.37890625, + "logits/rejected": -1.27734375, + "logps/chosen": -238.5, + "logps/rejected": -251.0, + "loss": 0.4109, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.1484375, + "rewards/margins": 1.27734375, + "rewards/rejected": -0.129150390625, + "step": 148 + }, + { + "epoch": 0.16482300884955753, + "grad_norm": 13.885614395141602, + "learning_rate": 4.772044093348757e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.23828125, + "logps/chosen": -245.5, + "logps/rejected": -247.0, + "loss": 0.4042, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.162109375, + "rewards/margins": 1.361328125, + "rewards/rejected": -0.19720458984375, + "step": 149 + }, + { + "epoch": 0.16592920353982302, + "grad_norm": 15.551752090454102, + "learning_rate": 4.7682890442132336e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.2265625, + "logps/chosen": -255.0, + "logps/rejected": -252.0, + "loss": 0.415, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.095703125, + "rewards/margins": 1.37890625, + "rewards/rejected": -0.28369140625, + "step": 150 + }, + { + "epoch": 0.16592920353982302, + "eval_logits/chosen": -1.2870413064956665, + "eval_logits/rejected": -1.2431591749191284, + "eval_logps/chosen": -248.97512817382812, + "eval_logps/rejected": -261.86566162109375, + "eval_loss": 0.39721065759658813, + "eval_rewards/accuracies": 0.7473672032356262, + "eval_rewards/chosen": 1.184818148612976, + "eval_rewards/margins": 1.4427666664123535, + "eval_rewards/rejected": -0.25743111968040466, + "eval_runtime": 193.0648, + "eval_samples_per_second": 66.573, + "eval_steps_per_second": 1.041, + "step": 150 + }, + { + "epoch": 0.1670353982300885, + "grad_norm": 15.598936080932617, + "learning_rate": 4.7645048215154156e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.2890625, + "logps/chosen": -242.0, + "logps/rejected": -260.0, + "loss": 0.4404, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.146484375, + "rewards/margins": 1.39453125, + "rewards/rejected": -0.24755859375, + "step": 151 + }, + { + "epoch": 0.168141592920354, + "grad_norm": 13.759398460388184, + "learning_rate": 4.760691473926021e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.234375, + "logps/chosen": -248.5, + "logps/rejected": -269.0, + "loss": 0.3753, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2890625, + "rewards/margins": 1.55859375, + "rewards/rejected": -0.265625, + "step": 152 + }, + { + "epoch": 0.16924778761061948, + "grad_norm": 17.32530975341797, + "learning_rate": 4.756849050490357e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.1640625, + "logps/chosen": -287.0, + "logps/rejected": -302.0, + "loss": 0.4487, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.01953125, + "rewards/margins": 1.23828125, + "rewards/rejected": -0.21826171875, + "step": 153 + }, + { + "epoch": 0.17035398230088494, + "grad_norm": 16.289810180664062, + "learning_rate": 4.75297760062769e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.296875, + "logps/chosen": -271.0, + "logps/rejected": -266.5, + "loss": 0.4189, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.0703125, + "rewards/margins": 1.375, + "rewards/rejected": -0.30419921875, + "step": 154 + }, + { + "epoch": 0.17146017699115043, + "grad_norm": 15.245888710021973, + "learning_rate": 4.749077174130608e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.23828125, + "logps/chosen": -264.0, + "logps/rejected": -282.0, + "loss": 0.4183, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.16015625, + "rewards/margins": 1.40234375, + "rewards/rejected": -0.240234375, + "step": 155 + }, + { + "epoch": 0.17256637168141592, + "grad_norm": 14.452110290527344, + "learning_rate": 4.7451478211643835e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.30859375, + "logps/chosen": -253.0, + "logps/rejected": -256.0, + "loss": 0.3993, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.2421875, + "rewards/margins": 1.44921875, + "rewards/rejected": -0.20654296875, + "step": 156 + }, + { + "epoch": 0.1736725663716814, + "grad_norm": 14.378584861755371, + "learning_rate": 4.741189592266325e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.26171875, + "logps/chosen": -231.5, + "logps/rejected": -273.5, + "loss": 0.3664, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.3125, + "rewards/margins": 1.70703125, + "rewards/rejected": -0.39453125, + "step": 157 + }, + { + "epoch": 0.1747787610619469, + "grad_norm": 13.193842887878418, + "learning_rate": 4.7372025383451274e-07, + "logits/chosen": -1.12109375, + "logits/rejected": -1.203125, + "logps/chosen": -240.0, + "logps/rejected": -260.0, + "loss": 0.3485, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.12109375, + "rewards/margins": 1.6171875, + "rewards/rejected": -0.4912109375, + "step": 158 + }, + { + "epoch": 0.17588495575221239, + "grad_norm": 13.745351791381836, + "learning_rate": 4.7331867106802204e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.1875, + "logps/chosen": -258.5, + "logps/rejected": -265.0, + "loss": 0.3891, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.23046875, + "rewards/margins": 1.5859375, + "rewards/rejected": -0.35546875, + "step": 159 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 14.0711669921875, + "learning_rate": 4.7291421609211045e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.24609375, + "logps/chosen": -251.5, + "logps/rejected": -282.0, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1875, + "rewards/margins": 1.3984375, + "rewards/rejected": -0.20654296875, + "step": 160 + }, + { + "epoch": 0.17809734513274336, + "grad_norm": 13.304108619689941, + "learning_rate": 4.725068941086692e-07, + "logits/chosen": -1.37109375, + "logits/rejected": -1.1953125, + "logps/chosen": -255.5, + "logps/rejected": -262.0, + "loss": 0.3558, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.2265625, + "rewards/margins": 1.63671875, + "rewards/rejected": -0.4140625, + "step": 161 + }, + { + "epoch": 0.17920353982300885, + "grad_norm": 13.896252632141113, + "learning_rate": 4.7209671035646304e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.2265625, + "logps/chosen": -248.5, + "logps/rejected": -264.0, + "loss": 0.3942, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.15234375, + "rewards/margins": 1.42578125, + "rewards/rejected": -0.27294921875, + "step": 162 + }, + { + "epoch": 0.18030973451327434, + "grad_norm": 14.796649932861328, + "learning_rate": 4.7168367011106367e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.30078125, + "logps/chosen": -245.0, + "logps/rejected": -262.5, + "loss": 0.3799, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.13671875, + "rewards/margins": 1.5703125, + "rewards/rejected": -0.431640625, + "step": 163 + }, + { + "epoch": 0.18141592920353983, + "grad_norm": 16.078460693359375, + "learning_rate": 4.712677786847814e-07, + "logits/chosen": -1.44140625, + "logits/rejected": -1.1875, + "logps/chosen": -243.5, + "logps/rejected": -250.0, + "loss": 0.4507, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.044921875, + "rewards/margins": 1.2265625, + "rewards/rejected": -0.18359375, + "step": 164 + }, + { + "epoch": 0.18252212389380532, + "grad_norm": 13.583531379699707, + "learning_rate": 4.708490414265971e-07, + "logits/chosen": -1.375, + "logits/rejected": -1.1796875, + "logps/chosen": -262.0, + "logps/rejected": -272.5, + "loss": 0.3486, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.18359375, + "rewards/margins": 1.63671875, + "rewards/rejected": -0.4521484375, + "step": 165 + }, + { + "epoch": 0.1836283185840708, + "grad_norm": 14.29465389251709, + "learning_rate": 4.7042746372209296e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.32421875, + "logps/chosen": -249.5, + "logps/rejected": -278.0, + "loss": 0.357, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.3046875, + "rewards/margins": 1.71875, + "rewards/rejected": -0.416015625, + "step": 166 + }, + { + "epoch": 0.1847345132743363, + "grad_norm": 14.11926555633545, + "learning_rate": 4.700030509933839e-07, + "logits/chosen": -1.12890625, + "logits/rejected": -1.1484375, + "logps/chosen": -235.5, + "logps/rejected": -273.0, + "loss": 0.3775, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.33984375, + "rewards/margins": 1.60546875, + "rewards/rejected": -0.2646484375, + "step": 167 + }, + { + "epoch": 0.18584070796460178, + "grad_norm": 13.987667083740234, + "learning_rate": 4.6957580869904707e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.140625, + "logps/chosen": -266.0, + "logps/rejected": -280.0, + "loss": 0.3593, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.15625, + "rewards/margins": 1.55078125, + "rewards/rejected": -0.3935546875, + "step": 168 + }, + { + "epoch": 0.18694690265486727, + "grad_norm": 14.725763320922852, + "learning_rate": 4.691457423340524e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.076171875, + "logps/chosen": -261.0, + "logps/rejected": -248.5, + "loss": 0.3935, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.953125, + "rewards/margins": 1.39453125, + "rewards/rejected": -0.439453125, + "step": 169 + }, + { + "epoch": 0.18805309734513273, + "grad_norm": 15.593293190002441, + "learning_rate": 4.6871285742969114e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.21875, + "logps/chosen": -267.0, + "logps/rejected": -278.0, + "loss": 0.4233, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.23828125, + "rewards/margins": 1.5234375, + "rewards/rejected": -0.279296875, + "step": 170 + }, + { + "epoch": 0.18915929203539822, + "grad_norm": 13.978684425354004, + "learning_rate": 4.682771595535056e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.3046875, + "logps/chosen": -244.5, + "logps/rejected": -274.0, + "loss": 0.3605, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.41796875, + "rewards/margins": 1.6171875, + "rewards/rejected": -0.19677734375, + "step": 171 + }, + { + "epoch": 0.1902654867256637, + "grad_norm": 12.64192008972168, + "learning_rate": 4.678386543092168e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.16015625, + "logps/chosen": -243.5, + "logps/rejected": -267.0, + "loss": 0.35, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.32421875, + "rewards/margins": 1.8515625, + "rewards/rejected": -0.52734375, + "step": 172 + }, + { + "epoch": 0.1913716814159292, + "grad_norm": 15.251437187194824, + "learning_rate": 4.673973473366527e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.2578125, + "logps/chosen": -252.5, + "logps/rejected": -270.5, + "loss": 0.386, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.33984375, + "rewards/margins": 1.71875, + "rewards/rejected": -0.3818359375, + "step": 173 + }, + { + "epoch": 0.19247787610619468, + "grad_norm": 11.346704483032227, + "learning_rate": 4.669532443116757e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.20703125, + "logps/chosen": -227.0, + "logps/rejected": -244.5, + "loss": 0.2852, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.46484375, + "rewards/margins": 2.0625, + "rewards/rejected": -0.59765625, + "step": 174 + }, + { + "epoch": 0.19358407079646017, + "grad_norm": 17.457523345947266, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.12109375, + "logps/chosen": -280.0, + "logps/rejected": -277.0, + "loss": 0.4692, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 1.05859375, + "rewards/margins": 1.22265625, + "rewards/rejected": -0.166748046875, + "step": 175 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 14.530098915100098, + "learning_rate": 4.6605667298766607e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.21875, + "logps/chosen": -241.5, + "logps/rejected": -260.0, + "loss": 0.3907, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.17578125, + "rewards/margins": 1.67578125, + "rewards/rejected": -0.501953125, + "step": 176 + }, + { + "epoch": 0.19579646017699115, + "grad_norm": 15.266855239868164, + "learning_rate": 4.656042162198708e-07, + "logits/chosen": -1.43359375, + "logits/rejected": -1.3046875, + "logps/chosen": -235.0, + "logps/rejected": -265.0, + "loss": 0.4364, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0703125, + "rewards/margins": 1.4296875, + "rewards/rejected": -0.357421875, + "step": 177 + }, + { + "epoch": 0.19690265486725664, + "grad_norm": 12.054651260375977, + "learning_rate": 4.6514898646198896e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.26171875, + "logps/chosen": -257.0, + "logps/rejected": -271.0, + "loss": 0.3194, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.2734375, + "rewards/margins": 1.91015625, + "rewards/rejected": -0.6328125, + "step": 178 + }, + { + "epoch": 0.19800884955752213, + "grad_norm": 15.28715705871582, + "learning_rate": 4.6469098956895076e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.22265625, + "logps/chosen": -265.5, + "logps/rejected": -277.0, + "loss": 0.3848, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.28125, + "rewards/margins": 1.65234375, + "rewards/rejected": -0.373046875, + "step": 179 + }, + { + "epoch": 0.19911504424778761, + "grad_norm": 14.788736343383789, + "learning_rate": 4.6423023143127557e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.3984375, + "logps/chosen": -252.0, + "logps/rejected": -272.0, + "loss": 0.3994, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.0859375, + "rewards/margins": 1.5, + "rewards/rejected": -0.4150390625, + "step": 180 + }, + { + "epoch": 0.2002212389380531, + "grad_norm": 14.42548942565918, + "learning_rate": 4.637667179749968e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.21484375, + "logps/chosen": -272.5, + "logps/rejected": -274.5, + "loss": 0.3871, + "rewards/accuracies": 0.6875, + "rewards/chosen": 1.0390625, + "rewards/margins": 1.515625, + "rewards/rejected": -0.48046875, + "step": 181 + }, + { + "epoch": 0.2013274336283186, + "grad_norm": 13.830480575561523, + "learning_rate": 4.63300455161585e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.19140625, + "logps/chosen": -250.0, + "logps/rejected": -248.0, + "loss": 0.3167, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.34375, + "rewards/margins": 1.92578125, + "rewards/rejected": -0.5859375, + "step": 182 + }, + { + "epoch": 0.20243362831858408, + "grad_norm": 14.639776229858398, + "learning_rate": 4.6283144898787174e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.20703125, + "logps/chosen": -247.5, + "logps/rejected": -279.0, + "loss": 0.3672, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.32421875, + "rewards/margins": 1.79296875, + "rewards/rejected": -0.46484375, + "step": 183 + }, + { + "epoch": 0.20353982300884957, + "grad_norm": 13.662202835083008, + "learning_rate": 4.6235970548597224e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.234375, + "logps/chosen": -231.0, + "logps/rejected": -240.0, + "loss": 0.3531, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.28125, + "rewards/margins": 1.78515625, + "rewards/rejected": -0.505859375, + "step": 184 + }, + { + "epoch": 0.20464601769911506, + "grad_norm": 13.101706504821777, + "learning_rate": 4.6188523072320777e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.14453125, + "logps/chosen": -253.0, + "logps/rejected": -273.0, + "loss": 0.3276, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.2421875, + "rewards/margins": 1.82421875, + "rewards/rejected": -0.5830078125, + "step": 185 + }, + { + "epoch": 0.20575221238938052, + "grad_norm": 16.33759307861328, + "learning_rate": 4.614080308020277e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.2265625, + "logps/chosen": -258.0, + "logps/rejected": -290.0, + "loss": 0.3694, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.203125, + "rewards/margins": 1.69140625, + "rewards/rejected": -0.48828125, + "step": 186 + }, + { + "epoch": 0.206858407079646, + "grad_norm": 13.627776145935059, + "learning_rate": 4.609281118599311e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.21875, + "logps/chosen": -238.5, + "logps/rejected": -239.0, + "loss": 0.4007, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.056640625, + "rewards/margins": 1.55078125, + "rewards/rejected": -0.4931640625, + "step": 187 + }, + { + "epoch": 0.2079646017699115, + "grad_norm": 13.673922538757324, + "learning_rate": 4.6044548006938734e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.1796875, + "logps/chosen": -247.5, + "logps/rejected": -254.5, + "loss": 0.3592, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.29296875, + "rewards/margins": 1.7265625, + "rewards/rejected": -0.4375, + "step": 188 + }, + { + "epoch": 0.20907079646017698, + "grad_norm": 14.20157527923584, + "learning_rate": 4.5996014163775745e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.26953125, + "logps/chosen": -268.5, + "logps/rejected": -272.0, + "loss": 0.3429, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.32421875, + "rewards/margins": 1.84765625, + "rewards/rejected": -0.5234375, + "step": 189 + }, + { + "epoch": 0.21017699115044247, + "grad_norm": 14.90439510345459, + "learning_rate": 4.5947210280721353e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.2421875, + "logps/chosen": -248.0, + "logps/rejected": -285.0, + "loss": 0.373, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.203125, + "rewards/margins": 1.765625, + "rewards/rejected": -0.5625, + "step": 190 + }, + { + "epoch": 0.21128318584070796, + "grad_norm": 14.063448905944824, + "learning_rate": 4.589813698546592e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.1328125, + "logps/chosen": -256.0, + "logps/rejected": -274.0, + "loss": 0.3471, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.21875, + "rewards/margins": 1.828125, + "rewards/rejected": -0.607421875, + "step": 191 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 13.391234397888184, + "learning_rate": 4.584879490916481e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.234375, + "logps/chosen": -247.5, + "logps/rejected": -241.5, + "loss": 0.356, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.0390625, + "rewards/margins": 1.9296875, + "rewards/rejected": -0.892578125, + "step": 192 + }, + { + "epoch": 0.21349557522123894, + "grad_norm": 13.415105819702148, + "learning_rate": 4.5799184686430343e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.09375, + "logps/chosen": -251.0, + "logps/rejected": -257.5, + "loss": 0.34, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.1875, + "rewards/margins": 1.87109375, + "rewards/rejected": -0.689453125, + "step": 193 + }, + { + "epoch": 0.21460176991150443, + "grad_norm": 13.00170612335205, + "learning_rate": 4.574930695532356e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.32421875, + "logps/chosen": -257.0, + "logps/rejected": -273.0, + "loss": 0.3455, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.203125, + "rewards/margins": 1.82421875, + "rewards/rejected": -0.623046875, + "step": 194 + }, + { + "epoch": 0.2157079646017699, + "grad_norm": 13.366878509521484, + "learning_rate": 4.569916235734611e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.19921875, + "logps/chosen": -240.5, + "logps/rejected": -272.0, + "loss": 0.3792, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.2109375, + "rewards/margins": 1.7578125, + "rewards/rejected": -0.544921875, + "step": 195 + }, + { + "epoch": 0.2168141592920354, + "grad_norm": 14.402266502380371, + "learning_rate": 4.5648751537431897e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.171875, + "logps/chosen": -250.5, + "logps/rejected": -286.0, + "loss": 0.428, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.9453125, + "rewards/margins": 1.3984375, + "rewards/rejected": -0.4560546875, + "step": 196 + }, + { + "epoch": 0.2179203539823009, + "grad_norm": 15.003867149353027, + "learning_rate": 4.559807514393885e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.2421875, + "logps/chosen": -276.5, + "logps/rejected": -286.0, + "loss": 0.35, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.296875, + "rewards/margins": 1.87109375, + "rewards/rejected": -0.5810546875, + "step": 197 + }, + { + "epoch": 0.21902654867256638, + "grad_norm": 14.217790603637695, + "learning_rate": 4.5547133828640595e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.24609375, + "logps/chosen": -267.0, + "logps/rejected": -266.0, + "loss": 0.3393, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.27734375, + "rewards/margins": 1.83203125, + "rewards/rejected": -0.5556640625, + "step": 198 + }, + { + "epoch": 0.22013274336283187, + "grad_norm": 13.773700714111328, + "learning_rate": 4.5495928246717995e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.28125, + "logps/chosen": -265.0, + "logps/rejected": -292.0, + "loss": 0.3351, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.375, + "rewards/margins": 2.0546875, + "rewards/rejected": -0.67578125, + "step": 199 + }, + { + "epoch": 0.22123893805309736, + "grad_norm": 14.733463287353516, + "learning_rate": 4.544445905675081e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.18359375, + "logps/chosen": -266.0, + "logps/rejected": -281.5, + "loss": 0.3673, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0859375, + "rewards/margins": 1.7265625, + "rewards/rejected": -0.642578125, + "step": 200 + }, + { + "epoch": 0.22123893805309736, + "eval_logits/chosen": -1.276119351387024, + "eval_logits/rejected": -1.2190414667129517, + "eval_logps/chosen": -249.2039794921875, + "eval_logps/rejected": -265.0248718261719, + "eval_loss": 0.37490636110305786, + "eval_rewards/accuracies": 0.7651365399360657, + "eval_rewards/chosen": 1.1634211540222168, + "eval_rewards/margins": 1.7447527647018433, + "eval_rewards/rejected": -0.5807631611824036, + "eval_runtime": 192.9266, + "eval_samples_per_second": 66.621, + "eval_steps_per_second": 1.042, + "step": 200 + }, + { + "epoch": 0.22234513274336284, + "grad_norm": 14.183818817138672, + "learning_rate": 4.539272692070919e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.203125, + "logps/chosen": -270.0, + "logps/rejected": -238.5, + "loss": 0.3398, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.2421875, + "rewards/margins": 1.84375, + "rewards/rejected": -0.6015625, + "step": 201 + }, + { + "epoch": 0.2234513274336283, + "grad_norm": 14.671875953674316, + "learning_rate": 4.534073250394515e-07, + "logits/chosen": -1.40625, + "logits/rejected": -1.25390625, + "logps/chosen": -245.5, + "logps/rejected": -261.5, + "loss": 0.4247, + "rewards/accuracies": 0.6796875, + "rewards/chosen": 0.904296875, + "rewards/margins": 1.375, + "rewards/rejected": -0.47265625, + "step": 202 + }, + { + "epoch": 0.2245575221238938, + "grad_norm": 14.409346580505371, + "learning_rate": 4.5288476475184025e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.15234375, + "logps/chosen": -251.5, + "logps/rejected": -259.0, + "loss": 0.3738, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.10546875, + "rewards/margins": 1.73046875, + "rewards/rejected": -0.623046875, + "step": 203 + }, + { + "epoch": 0.22566371681415928, + "grad_norm": 16.879392623901367, + "learning_rate": 4.523595950651587e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.25390625, + "logps/chosen": -272.0, + "logps/rejected": -281.0, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0546875, + "rewards/margins": 1.640625, + "rewards/rejected": -0.58203125, + "step": 204 + }, + { + "epoch": 0.22676991150442477, + "grad_norm": 13.093546867370605, + "learning_rate": 4.518318227338681e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.1953125, + "logps/chosen": -272.0, + "logps/rejected": -275.0, + "loss": 0.3398, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.07421875, + "rewards/margins": 1.71484375, + "rewards/rejected": -0.640625, + "step": 205 + }, + { + "epoch": 0.22787610619469026, + "grad_norm": 14.883780479431152, + "learning_rate": 4.5130145454590374e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.140625, + "logps/chosen": -247.0, + "logps/rejected": -279.0, + "loss": 0.3714, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.12890625, + "rewards/margins": 1.84375, + "rewards/rejected": -0.71875, + "step": 206 + }, + { + "epoch": 0.22898230088495575, + "grad_norm": 13.462334632873535, + "learning_rate": 4.5076849732258737e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.1953125, + "logps/chosen": -233.0, + "logps/rejected": -231.5, + "loss": 0.3624, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.2578125, + "rewards/margins": 1.75, + "rewards/rejected": -0.4931640625, + "step": 207 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 13.485892295837402, + "learning_rate": 4.5023295791853937e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.25390625, + "logps/chosen": -243.0, + "logps/rejected": -284.0, + "loss": 0.3465, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.36328125, + "rewards/margins": 1.86328125, + "rewards/rejected": -0.4951171875, + "step": 208 + }, + { + "epoch": 0.23119469026548672, + "grad_norm": 13.468306541442871, + "learning_rate": 4.496948432215912e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.14453125, + "logps/chosen": -239.0, + "logps/rejected": -231.5, + "loss": 0.3881, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.07421875, + "rewards/margins": 1.6484375, + "rewards/rejected": -0.576171875, + "step": 209 + }, + { + "epoch": 0.2323008849557522, + "grad_norm": 14.274983406066895, + "learning_rate": 4.4915416015269614e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.265625, + "logps/chosen": -271.0, + "logps/rejected": -279.5, + "loss": 0.3449, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.24609375, + "rewards/margins": 1.875, + "rewards/rejected": -0.6298828125, + "step": 210 + }, + { + "epoch": 0.2334070796460177, + "grad_norm": 14.726081848144531, + "learning_rate": 4.486109156658405e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.30078125, + "logps/chosen": -223.0, + "logps/rejected": -258.0, + "loss": 0.3548, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.26953125, + "rewards/margins": 1.83203125, + "rewards/rejected": -0.5556640625, + "step": 211 + }, + { + "epoch": 0.2345132743362832, + "grad_norm": 14.424053192138672, + "learning_rate": 4.480651167479544e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.18359375, + "logps/chosen": -235.5, + "logps/rejected": -251.0, + "loss": 0.3725, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.3125, + "rewards/margins": 1.7109375, + "rewards/rejected": -0.3974609375, + "step": 212 + }, + { + "epoch": 0.23561946902654868, + "grad_norm": 16.50137710571289, + "learning_rate": 4.475167704188218e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.33203125, + "logps/chosen": -261.0, + "logps/rejected": -274.0, + "loss": 0.4309, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.0625, + "rewards/margins": 1.57421875, + "rewards/rejected": -0.513671875, + "step": 213 + }, + { + "epoch": 0.23672566371681417, + "grad_norm": 13.223847389221191, + "learning_rate": 4.4696588373098973e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.26953125, + "logps/chosen": -246.0, + "logps/rejected": -262.5, + "loss": 0.3152, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.40234375, + "rewards/margins": 2.12109375, + "rewards/rejected": -0.7177734375, + "step": 214 + }, + { + "epoch": 0.23783185840707965, + "grad_norm": 15.553281784057617, + "learning_rate": 4.4641246376967854e-07, + "logits/chosen": -1.19140625, + "logits/rejected": -1.1640625, + "logps/chosen": -256.5, + "logps/rejected": -271.0, + "loss": 0.3849, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.203125, + "rewards/margins": 1.73046875, + "rewards/rejected": -0.52734375, + "step": 215 + }, + { + "epoch": 0.23893805309734514, + "grad_norm": 14.652776718139648, + "learning_rate": 4.4585651765268983e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.16796875, + "logps/chosen": -249.0, + "logps/rejected": -240.0, + "loss": 0.394, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.15625, + "rewards/margins": 1.73828125, + "rewards/rejected": -0.5859375, + "step": 216 + }, + { + "epoch": 0.24004424778761063, + "grad_norm": 15.165270805358887, + "learning_rate": 4.452980525303155e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.22265625, + "logps/chosen": -272.5, + "logps/rejected": -269.0, + "loss": 0.3583, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.25, + "rewards/margins": 1.81640625, + "rewards/rejected": -0.56640625, + "step": 217 + }, + { + "epoch": 0.2411504424778761, + "grad_norm": 13.010436058044434, + "learning_rate": 4.4473707558524553e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.1640625, + "logps/chosen": -248.5, + "logps/rejected": -276.0, + "loss": 0.3244, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.30859375, + "rewards/margins": 2.05859375, + "rewards/rejected": -0.751953125, + "step": 218 + }, + { + "epoch": 0.24225663716814158, + "grad_norm": 14.902968406677246, + "learning_rate": 4.4417359403247567e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.10546875, + "logps/chosen": -255.0, + "logps/rejected": -276.0, + "loss": 0.3569, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.453125, + "rewards/margins": 2.0234375, + "rewards/rejected": -0.568359375, + "step": 219 + }, + { + "epoch": 0.24336283185840707, + "grad_norm": 13.878840446472168, + "learning_rate": 4.436076151192146e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.26953125, + "logps/chosen": -218.0, + "logps/rejected": -246.5, + "loss": 0.3976, + "rewards/accuracies": 0.703125, + "rewards/chosen": 1.1484375, + "rewards/margins": 1.8046875, + "rewards/rejected": -0.654296875, + "step": 220 + }, + { + "epoch": 0.24446902654867256, + "grad_norm": 13.981918334960938, + "learning_rate": 4.4303914612479104e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.25, + "logps/chosen": -237.0, + "logps/rejected": -273.0, + "loss": 0.3427, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.36328125, + "rewards/margins": 2.0390625, + "rewards/rejected": -0.673828125, + "step": 221 + }, + { + "epoch": 0.24557522123893805, + "grad_norm": 12.754227638244629, + "learning_rate": 4.4246819436055946e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.140625, + "logps/chosen": -248.0, + "logps/rejected": -250.0, + "loss": 0.3383, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.109375, + "rewards/margins": 1.87890625, + "rewards/rejected": -0.76171875, + "step": 222 + }, + { + "epoch": 0.24668141592920353, + "grad_norm": 14.764009475708008, + "learning_rate": 4.418947671698066e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.234375, + "logps/chosen": -250.5, + "logps/rejected": -266.0, + "loss": 0.3845, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.060546875, + "rewards/margins": 1.7265625, + "rewards/rejected": -0.66015625, + "step": 223 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 15.158235549926758, + "learning_rate": 4.4131887192765684e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.26953125, + "logps/chosen": -244.0, + "logps/rejected": -265.0, + "loss": 0.3368, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.19140625, + "rewards/margins": 2.09375, + "rewards/rejected": -0.904296875, + "step": 224 + }, + { + "epoch": 0.2488938053097345, + "grad_norm": 13.44605827331543, + "learning_rate": 4.4074051604097753e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.26171875, + "logps/chosen": -248.0, + "logps/rejected": -269.0, + "loss": 0.3464, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.28515625, + "rewards/margins": 2.03125, + "rewards/rejected": -0.744140625, + "step": 225 + }, + { + "epoch": 0.25, + "grad_norm": 15.778076171875, + "learning_rate": 4.401597069482832e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.26171875, + "logps/chosen": -248.5, + "logps/rejected": -265.0, + "loss": 0.4139, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.962890625, + "rewards/margins": 1.66015625, + "rewards/rejected": -0.697265625, + "step": 226 + }, + { + "epoch": 0.25110619469026546, + "grad_norm": 13.870752334594727, + "learning_rate": 4.395764521196406e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.2265625, + "logps/chosen": -234.5, + "logps/rejected": -281.0, + "loss": 0.3158, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.3984375, + "rewards/margins": 2.2734375, + "rewards/rejected": -0.87109375, + "step": 227 + }, + { + "epoch": 0.252212389380531, + "grad_norm": 13.615601539611816, + "learning_rate": 4.389907590565721e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.15234375, + "logps/chosen": -268.0, + "logps/rejected": -290.0, + "loss": 0.3724, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.087890625, + "rewards/margins": 1.91796875, + "rewards/rejected": -0.830078125, + "step": 228 + }, + { + "epoch": 0.25331858407079644, + "grad_norm": 14.186120986938477, + "learning_rate": 4.3840263529195943e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.2109375, + "logps/chosen": -248.5, + "logps/rejected": -262.0, + "loss": 0.3415, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.072265625, + "rewards/margins": 1.9375, + "rewards/rejected": -0.865234375, + "step": 229 + }, + { + "epoch": 0.25442477876106195, + "grad_norm": 12.267884254455566, + "learning_rate": 4.3781208838994663e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.23828125, + "logps/chosen": -246.0, + "logps/rejected": -257.5, + "loss": 0.3271, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.15234375, + "rewards/margins": 1.97265625, + "rewards/rejected": -0.8203125, + "step": 230 + }, + { + "epoch": 0.2555309734513274, + "grad_norm": 14.3861722946167, + "learning_rate": 4.372191259458432e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.21875, + "logps/chosen": -234.5, + "logps/rejected": -251.0, + "loss": 0.3735, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.111328125, + "rewards/margins": 1.92578125, + "rewards/rejected": -0.81640625, + "step": 231 + }, + { + "epoch": 0.25663716814159293, + "grad_norm": 13.046867370605469, + "learning_rate": 4.366237555860256e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.2109375, + "logps/chosen": -246.0, + "logps/rejected": -270.0, + "loss": 0.3317, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.16015625, + "rewards/margins": 2.109375, + "rewards/rejected": -0.9453125, + "step": 232 + }, + { + "epoch": 0.2577433628318584, + "grad_norm": 15.247108459472656, + "learning_rate": 4.3602598496784013e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.140625, + "logps/chosen": -272.0, + "logps/rejected": -268.0, + "loss": 0.3798, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.9765625, + "rewards/margins": 1.875, + "rewards/rejected": -0.896484375, + "step": 233 + }, + { + "epoch": 0.2588495575221239, + "grad_norm": 13.2136812210083, + "learning_rate": 4.3542582177950373e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.1484375, + "logps/chosen": -227.5, + "logps/rejected": -262.5, + "loss": 0.3171, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.26953125, + "rewards/margins": 2.1015625, + "rewards/rejected": -0.822265625, + "step": 234 + }, + { + "epoch": 0.25995575221238937, + "grad_norm": 13.574021339416504, + "learning_rate": 4.348232737400054e-07, + "logits/chosen": -1.13671875, + "logits/rejected": -1.171875, + "logps/chosen": -239.0, + "logps/rejected": -267.0, + "loss": 0.3749, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.05859375, + "rewards/margins": 1.78125, + "rewards/rejected": -0.720703125, + "step": 235 + }, + { + "epoch": 0.2610619469026549, + "grad_norm": 13.393758773803711, + "learning_rate": 4.3421834859900685e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.140625, + "logps/chosen": -236.5, + "logps/rejected": -255.0, + "loss": 0.3454, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.1796875, + "rewards/margins": 2.140625, + "rewards/rejected": -0.962890625, + "step": 236 + }, + { + "epoch": 0.26216814159292035, + "grad_norm": 17.910938262939453, + "learning_rate": 4.336110541367428e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.19921875, + "logps/chosen": -245.5, + "logps/rejected": -272.0, + "loss": 0.4424, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.943359375, + "rewards/margins": 1.58984375, + "rewards/rejected": -0.646484375, + "step": 237 + }, + { + "epoch": 0.26327433628318586, + "grad_norm": 14.35909366607666, + "learning_rate": 4.33001398163921e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.2265625, + "logps/chosen": -243.5, + "logps/rejected": -260.5, + "loss": 0.3525, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.30078125, + "rewards/margins": 2.2109375, + "rewards/rejected": -0.91015625, + "step": 238 + }, + { + "epoch": 0.2643805309734513, + "grad_norm": 15.5848970413208, + "learning_rate": 4.3238938852162187e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.3046875, + "logps/chosen": -250.5, + "logps/rejected": -273.0, + "loss": 0.3839, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.1640625, + "rewards/margins": 2.03125, + "rewards/rejected": -0.861328125, + "step": 239 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 13.962175369262695, + "learning_rate": 4.317750330811972e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.328125, + "logps/chosen": -250.5, + "logps/rejected": -275.0, + "loss": 0.3394, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.1875, + "rewards/margins": 1.91015625, + "rewards/rejected": -0.72265625, + "step": 240 + }, + { + "epoch": 0.2665929203539823, + "grad_norm": 13.130892753601074, + "learning_rate": 4.311583397441696e-07, + "logits/chosen": -1.18359375, + "logits/rejected": -1.26171875, + "logps/chosen": -240.0, + "logps/rejected": -255.5, + "loss": 0.3364, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.25, + "rewards/margins": 2.203125, + "rewards/rejected": -0.9453125, + "step": 241 + }, + { + "epoch": 0.2676991150442478, + "grad_norm": 15.227952003479004, + "learning_rate": 4.3053931644213e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.1640625, + "logps/chosen": -261.0, + "logps/rejected": -269.5, + "loss": 0.4343, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.15234375, + "rewards/margins": 1.65234375, + "rewards/rejected": -0.4970703125, + "step": 242 + }, + { + "epoch": 0.2688053097345133, + "grad_norm": 11.86292552947998, + "learning_rate": 4.2991797113663676e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.1875, + "logps/chosen": -239.5, + "logps/rejected": -268.0, + "loss": 0.2865, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.3125, + "rewards/margins": 2.296875, + "rewards/rejected": -0.98046875, + "step": 243 + }, + { + "epoch": 0.26991150442477874, + "grad_norm": 12.915170669555664, + "learning_rate": 4.292943118191121e-07, + "logits/chosen": -1.19140625, + "logits/rejected": -1.23828125, + "logps/chosen": -243.0, + "logps/rejected": -257.0, + "loss": 0.3192, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.26953125, + "rewards/margins": 2.1328125, + "rewards/rejected": -0.869140625, + "step": 244 + }, + { + "epoch": 0.27101769911504425, + "grad_norm": 16.35938262939453, + "learning_rate": 4.2866834651074024e-07, + "logits/chosen": -1.16015625, + "logits/rejected": -1.12890625, + "logps/chosen": -283.0, + "logps/rejected": -308.0, + "loss": 0.3896, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.19921875, + "rewards/margins": 1.8046875, + "rewards/rejected": -0.607421875, + "step": 245 + }, + { + "epoch": 0.2721238938053097, + "grad_norm": 14.654645919799805, + "learning_rate": 4.280400832623636e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.10546875, + "logps/chosen": -269.5, + "logps/rejected": -273.0, + "loss": 0.3785, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 1.234375, + "rewards/margins": 1.95703125, + "rewards/rejected": -0.71484375, + "step": 246 + }, + { + "epoch": 0.27323008849557523, + "grad_norm": 12.577658653259277, + "learning_rate": 4.274095301543796e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -1.234375, + "logps/chosen": -222.5, + "logps/rejected": -252.0, + "loss": 0.3402, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.33984375, + "rewards/margins": 2.06640625, + "rewards/rejected": -0.73046875, + "step": 247 + }, + { + "epoch": 0.2743362831858407, + "grad_norm": 13.634322166442871, + "learning_rate": 4.2677669529663686e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.1640625, + "logps/chosen": -266.0, + "logps/rejected": -267.5, + "loss": 0.3221, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.41015625, + "rewards/margins": 2.2734375, + "rewards/rejected": -0.87109375, + "step": 248 + }, + { + "epoch": 0.2754424778761062, + "grad_norm": 14.120111465454102, + "learning_rate": 4.2614158682833037e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.1328125, + "logps/chosen": -251.0, + "logps/rejected": -281.0, + "loss": 0.3739, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.32421875, + "rewards/margins": 1.90625, + "rewards/rejected": -0.5791015625, + "step": 249 + }, + { + "epoch": 0.27654867256637167, + "grad_norm": 14.189047813415527, + "learning_rate": 4.255042129178973e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.21484375, + "logps/chosen": -237.0, + "logps/rejected": -268.0, + "loss": 0.3868, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.14453125, + "rewards/margins": 1.9375, + "rewards/rejected": -0.794921875, + "step": 250 + }, + { + "epoch": 0.27654867256637167, + "eval_logits/chosen": -1.2774020433425903, + "eval_logits/rejected": -1.207769751548767, + "eval_logps/chosen": -248.95523071289062, + "eval_logps/rejected": -267.1990051269531, + "eval_loss": 0.36062541604042053, + "eval_rewards/accuracies": 0.7771241068840027, + "eval_rewards/chosen": 1.1926889419555664, + "eval_rewards/margins": 1.988767147064209, + "eval_rewards/rejected": -0.7957963943481445, + "eval_runtime": 193.0793, + "eval_samples_per_second": 66.568, + "eval_steps_per_second": 1.041, + "step": 250 + }, + { + "epoch": 0.2776548672566372, + "grad_norm": 14.157464981079102, + "learning_rate": 4.248645817629117e-07, + "logits/chosen": -1.40625, + "logits/rejected": -1.26171875, + "logps/chosen": -262.0, + "logps/rejected": -279.0, + "loss": 0.3588, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.033203125, + "rewards/margins": 1.91796875, + "rewards/rejected": -0.88671875, + "step": 251 + }, + { + "epoch": 0.27876106194690264, + "grad_norm": 12.822221755981445, + "learning_rate": 4.242227015899793e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.19140625, + "logps/chosen": -245.5, + "logps/rejected": -273.0, + "loss": 0.3323, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.41796875, + "rewards/margins": 2.2734375, + "rewards/rejected": -0.853515625, + "step": 252 + }, + { + "epoch": 0.27986725663716816, + "grad_norm": 15.107699394226074, + "learning_rate": 4.2357858065463124e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.13671875, + "logps/chosen": -243.5, + "logps/rejected": -275.0, + "loss": 0.4063, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.2421875, + "rewards/margins": 1.88671875, + "rewards/rejected": -0.642578125, + "step": 253 + }, + { + "epoch": 0.2809734513274336, + "grad_norm": 14.644704818725586, + "learning_rate": 4.229322272412185e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.203125, + "logps/chosen": -274.0, + "logps/rejected": -289.0, + "loss": 0.3511, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.009765625, + "rewards/margins": 1.98046875, + "rewards/rejected": -0.970703125, + "step": 254 + }, + { + "epoch": 0.28207964601769914, + "grad_norm": 14.453044891357422, + "learning_rate": 4.222836496628047e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.21875, + "logps/chosen": -264.0, + "logps/rejected": -286.0, + "loss": 0.3342, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.21484375, + "rewards/margins": 1.9375, + "rewards/rejected": -0.72265625, + "step": 255 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 12.731569290161133, + "learning_rate": 4.216328562610599e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.2109375, + "logps/chosen": -231.5, + "logps/rejected": -262.5, + "loss": 0.3542, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.06640625, + "rewards/margins": 2.12890625, + "rewards/rejected": -1.064453125, + "step": 256 + }, + { + "epoch": 0.2842920353982301, + "grad_norm": 14.310720443725586, + "learning_rate": 4.209798554061527e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.21484375, + "logps/chosen": -258.0, + "logps/rejected": -282.0, + "loss": 0.3884, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.966796875, + "rewards/margins": 1.828125, + "rewards/rejected": -0.861328125, + "step": 257 + }, + { + "epoch": 0.2853982300884956, + "grad_norm": 14.716500282287598, + "learning_rate": 4.203246554966428e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.3046875, + "logps/chosen": -243.0, + "logps/rejected": -253.0, + "loss": 0.4139, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.962890625, + "rewards/margins": 1.71484375, + "rewards/rejected": -0.75, + "step": 258 + }, + { + "epoch": 0.28650442477876104, + "grad_norm": 14.436864852905273, + "learning_rate": 4.1966726495937305e-07, + "logits/chosen": -1.42578125, + "logits/rejected": -1.20703125, + "logps/chosen": -252.5, + "logps/rejected": -273.0, + "loss": 0.3439, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.08984375, + "rewards/margins": 1.95703125, + "rewards/rejected": -0.86328125, + "step": 259 + }, + { + "epoch": 0.28761061946902655, + "grad_norm": 15.182847023010254, + "learning_rate": 4.1900769224936124e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.19140625, + "logps/chosen": -286.0, + "logps/rejected": -310.0, + "loss": 0.3774, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 0.96875, + "rewards/margins": 1.99609375, + "rewards/rejected": -1.03125, + "step": 260 + }, + { + "epoch": 0.288716814159292, + "grad_norm": 13.360356330871582, + "learning_rate": 4.1834594584969077e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.2109375, + "logps/chosen": -248.5, + "logps/rejected": -266.0, + "loss": 0.3638, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.115234375, + "rewards/margins": 1.9140625, + "rewards/rejected": -0.802734375, + "step": 261 + }, + { + "epoch": 0.28982300884955753, + "grad_norm": 13.982027053833008, + "learning_rate": 4.176820342714022e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.26171875, + "logps/chosen": -259.0, + "logps/rejected": -281.0, + "loss": 0.3449, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.068359375, + "rewards/margins": 1.99609375, + "rewards/rejected": -0.921875, + "step": 262 + }, + { + "epoch": 0.290929203539823, + "grad_norm": 13.159867286682129, + "learning_rate": 4.1701596605338334e-07, + "logits/chosen": -1.40234375, + "logits/rejected": -1.234375, + "logps/chosen": -242.5, + "logps/rejected": -271.0, + "loss": 0.3395, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.25, + "rewards/margins": 1.99609375, + "rewards/rejected": -0.751953125, + "step": 263 + }, + { + "epoch": 0.2920353982300885, + "grad_norm": 12.9893798828125, + "learning_rate": 4.1634774976225965e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.2109375, + "logps/chosen": -234.5, + "logps/rejected": -277.0, + "loss": 0.3156, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.171875, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.0703125, + "step": 264 + }, + { + "epoch": 0.29314159292035397, + "grad_norm": 13.78116226196289, + "learning_rate": 4.15677393992284e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.2578125, + "logps/chosen": -253.5, + "logps/rejected": -279.0, + "loss": 0.3418, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.20703125, + "rewards/margins": 2.171875, + "rewards/rejected": -0.966796875, + "step": 265 + }, + { + "epoch": 0.2942477876106195, + "grad_norm": 14.935332298278809, + "learning_rate": 4.150049073652261e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.15625, + "logps/chosen": -265.0, + "logps/rejected": -291.0, + "loss": 0.3503, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.21875, + "rewards/margins": 2.1640625, + "rewards/rejected": -0.94140625, + "step": 266 + }, + { + "epoch": 0.29535398230088494, + "grad_norm": 15.937322616577148, + "learning_rate": 4.1433029853026163e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.203125, + "logps/chosen": -245.5, + "logps/rejected": -294.0, + "loss": 0.3923, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.0625, + "rewards/margins": 1.91796875, + "rewards/rejected": -0.85546875, + "step": 267 + }, + { + "epoch": 0.29646017699115046, + "grad_norm": 14.759867668151855, + "learning_rate": 4.136535761638611e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.09375, + "logps/chosen": -276.0, + "logps/rejected": -295.0, + "loss": 0.356, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.98828125, + "rewards/margins": 1.98828125, + "rewards/rejected": -1.001953125, + "step": 268 + }, + { + "epoch": 0.2975663716814159, + "grad_norm": 13.723134994506836, + "learning_rate": 4.129747489696781e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.12890625, + "logps/chosen": -252.5, + "logps/rejected": -246.5, + "loss": 0.3215, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.06640625, + "rewards/margins": 2.3671875, + "rewards/rejected": -1.29296875, + "step": 269 + }, + { + "epoch": 0.29867256637168144, + "grad_norm": 12.263731002807617, + "learning_rate": 4.122938256784374e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.30859375, + "logps/chosen": -216.5, + "logps/rejected": -275.0, + "loss": 0.3189, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.16796875, + "rewards/margins": 2.265625, + "rewards/rejected": -1.09375, + "step": 270 + }, + { + "epoch": 0.2997787610619469, + "grad_norm": 15.063496589660645, + "learning_rate": 4.116108150478228e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.125, + "logps/chosen": -255.5, + "logps/rejected": -256.5, + "loss": 0.3799, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.818359375, + "rewards/margins": 1.7890625, + "rewards/rejected": -0.974609375, + "step": 271 + }, + { + "epoch": 0.3008849557522124, + "grad_norm": 15.24313735961914, + "learning_rate": 4.109257258623643e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.09765625, + "logps/chosen": -238.5, + "logps/rejected": -274.0, + "loss": 0.3779, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.15625, + "rewards/margins": 2.1875, + "rewards/rejected": -1.03125, + "step": 272 + }, + { + "epoch": 0.3019911504424779, + "grad_norm": 14.098630905151367, + "learning_rate": 4.1023856693332516e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.17578125, + "logps/chosen": -248.0, + "logps/rejected": -272.0, + "loss": 0.3197, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.982421875, + "rewards/margins": 2.25, + "rewards/rejected": -1.26953125, + "step": 273 + }, + { + "epoch": 0.3030973451327434, + "grad_norm": 13.230525970458984, + "learning_rate": 4.0954934709858857e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.16796875, + "logps/chosen": -268.0, + "logps/rejected": -287.0, + "loss": 0.3215, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.15234375, + "rewards/margins": 2.125, + "rewards/rejected": -0.97265625, + "step": 274 + }, + { + "epoch": 0.30420353982300885, + "grad_norm": 12.722634315490723, + "learning_rate": 4.0885807522254433e-07, + "logits/chosen": -1.375, + "logits/rejected": -1.25390625, + "logps/chosen": -256.5, + "logps/rejected": -319.0, + "loss": 0.3175, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.37890625, + "rewards/margins": 2.3515625, + "rewards/rejected": -0.96875, + "step": 275 + }, + { + "epoch": 0.3053097345132743, + "grad_norm": 12.688482284545898, + "learning_rate": 4.0816476019597423e-07, + "logits/chosen": -1.41015625, + "logits/rejected": -1.2890625, + "logps/chosen": -235.5, + "logps/rejected": -256.0, + "loss": 0.3222, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.28515625, + "rewards/margins": 2.20703125, + "rewards/rejected": -0.92578125, + "step": 276 + }, + { + "epoch": 0.3064159292035398, + "grad_norm": 14.044715881347656, + "learning_rate": 4.0746941093593807e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.27734375, + "logps/chosen": -249.0, + "logps/rejected": -295.0, + "loss": 0.2954, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.359375, + "rewards/margins": 2.3671875, + "rewards/rejected": -1.009765625, + "step": 277 + }, + { + "epoch": 0.3075221238938053, + "grad_norm": 15.867609024047852, + "learning_rate": 4.0677203638565893e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.25390625, + "logps/chosen": -260.0, + "logps/rejected": -275.0, + "loss": 0.3278, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.19921875, + "rewards/margins": 2.296875, + "rewards/rejected": -1.10546875, + "step": 278 + }, + { + "epoch": 0.3086283185840708, + "grad_norm": 16.124387741088867, + "learning_rate": 4.060726455144082e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.16015625, + "logps/chosen": -240.5, + "logps/rejected": -281.0, + "loss": 0.3936, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.025390625, + "rewards/margins": 1.96484375, + "rewards/rejected": -0.9375, + "step": 279 + }, + { + "epoch": 0.30973451327433627, + "grad_norm": 14.164496421813965, + "learning_rate": 4.0537124731739003e-07, + "logits/chosen": -1.34765625, + "logits/rejected": -1.1953125, + "logps/chosen": -250.0, + "logps/rejected": -270.0, + "loss": 0.3594, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.005859375, + "rewards/margins": 2.02734375, + "rewards/rejected": -1.02734375, + "step": 280 + }, + { + "epoch": 0.3108407079646018, + "grad_norm": 14.754056930541992, + "learning_rate": 4.0466785081562583e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.2265625, + "logps/chosen": -258.5, + "logps/rejected": -247.0, + "loss": 0.3625, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.095703125, + "rewards/margins": 2.06640625, + "rewards/rejected": -0.970703125, + "step": 281 + }, + { + "epoch": 0.31194690265486724, + "grad_norm": 14.682291030883789, + "learning_rate": 4.039624650558382e-07, + "logits/chosen": -1.13671875, + "logits/rejected": -1.21484375, + "logps/chosen": -239.0, + "logps/rejected": -265.5, + "loss": 0.3439, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.28125, + "rewards/margins": 2.2890625, + "rewards/rejected": -1.009765625, + "step": 282 + }, + { + "epoch": 0.31305309734513276, + "grad_norm": 13.215510368347168, + "learning_rate": 4.032550991103344e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -1.30859375, + "logps/chosen": -218.5, + "logps/rejected": -263.5, + "loss": 0.3302, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.0078125, + "rewards/margins": 2.03515625, + "rewards/rejected": -1.029296875, + "step": 283 + }, + { + "epoch": 0.3141592920353982, + "grad_norm": 14.175792694091797, + "learning_rate": 4.0254576207689004e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.21484375, + "logps/chosen": -268.0, + "logps/rejected": -307.0, + "loss": 0.3466, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.083984375, + "rewards/margins": 2.0703125, + "rewards/rejected": -0.986328125, + "step": 284 + }, + { + "epoch": 0.31526548672566373, + "grad_norm": 15.70940113067627, + "learning_rate": 4.0183446307863174e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.20703125, + "logps/chosen": -249.0, + "logps/rejected": -281.0, + "loss": 0.3759, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.927734375, + "rewards/margins": 1.94921875, + "rewards/rejected": -1.015625, + "step": 285 + }, + { + "epoch": 0.3163716814159292, + "grad_norm": 14.439340591430664, + "learning_rate": 4.0112121126391967e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.26171875, + "logps/chosen": -278.0, + "logps/rejected": -298.0, + "loss": 0.3487, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.2421875, + "rewards/margins": 2.3515625, + "rewards/rejected": -1.109375, + "step": 286 + }, + { + "epoch": 0.3174778761061947, + "grad_norm": 13.696681022644043, + "learning_rate": 4.0040601580623054e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.3515625, + "logps/chosen": -236.0, + "logps/rejected": -246.0, + "loss": 0.3344, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.181640625, + "rewards/margins": 2.3515625, + "rewards/rejected": -1.16796875, + "step": 287 + }, + { + "epoch": 0.3185840707964602, + "grad_norm": 13.659770011901855, + "learning_rate": 3.9968888590403904e-07, + "logits/chosen": -1.28125, + "logits/rejected": -1.3828125, + "logps/chosen": -248.5, + "logps/rejected": -280.0, + "loss": 0.3278, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.33203125, + "rewards/margins": 2.375, + "rewards/rejected": -1.046875, + "step": 288 + }, + { + "epoch": 0.3196902654867257, + "grad_norm": 12.041626930236816, + "learning_rate": 3.9896983078069947e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.16796875, + "logps/chosen": -245.0, + "logps/rejected": -273.5, + "loss": 0.3141, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.22265625, + "rewards/margins": 2.296875, + "rewards/rejected": -1.078125, + "step": 289 + }, + { + "epoch": 0.32079646017699115, + "grad_norm": 14.61534595489502, + "learning_rate": 3.9824885968432755e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.1875, + "logps/chosen": -241.5, + "logps/rejected": -251.0, + "loss": 0.3742, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.05859375, + "rewards/margins": 2.0859375, + "rewards/rejected": -1.025390625, + "step": 290 + }, + { + "epoch": 0.3219026548672566, + "grad_norm": 13.926555633544922, + "learning_rate": 3.975259818876811e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.2578125, + "logps/chosen": -262.0, + "logps/rejected": -259.0, + "loss": 0.298, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.2265625, + "rewards/margins": 2.328125, + "rewards/rejected": -1.1015625, + "step": 291 + }, + { + "epoch": 0.3230088495575221, + "grad_norm": 12.315802574157715, + "learning_rate": 3.968012066880412e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.2265625, + "logps/chosen": -259.0, + "logps/rejected": -267.0, + "loss": 0.3022, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.2109375, + "rewards/margins": 2.515625, + "rewards/rejected": -1.30078125, + "step": 292 + }, + { + "epoch": 0.3241150442477876, + "grad_norm": 12.437846183776855, + "learning_rate": 3.960745434070921e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.06640625, + "logps/chosen": -256.5, + "logps/rejected": -281.0, + "loss": 0.3422, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.744140625, + "rewards/margins": 1.89453125, + "rewards/rejected": -1.1484375, + "step": 293 + }, + { + "epoch": 0.3252212389380531, + "grad_norm": 13.978434562683105, + "learning_rate": 3.9534600139080163e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.140625, + "logps/chosen": -237.0, + "logps/rejected": -274.0, + "loss": 0.366, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.888671875, + "rewards/margins": 2.2109375, + "rewards/rejected": -1.32421875, + "step": 294 + }, + { + "epoch": 0.32632743362831856, + "grad_norm": 17.65538787841797, + "learning_rate": 3.94615590009301e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.25390625, + "logps/chosen": -264.0, + "logps/rejected": -285.0, + "loss": 0.4392, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 0.931640625, + "rewards/margins": 1.875, + "rewards/rejected": -0.947265625, + "step": 295 + }, + { + "epoch": 0.3274336283185841, + "grad_norm": 12.776206016540527, + "learning_rate": 3.9388331865676425e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.234375, + "logps/chosen": -246.5, + "logps/rejected": -260.0, + "loss": 0.2823, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0234375, + "rewards/margins": 2.453125, + "rewards/rejected": -1.42578125, + "step": 296 + }, + { + "epoch": 0.32853982300884954, + "grad_norm": 15.579447746276855, + "learning_rate": 3.931491967512872e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -1.2734375, + "logps/chosen": -252.5, + "logps/rejected": -283.0, + "loss": 0.3896, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.94140625, + "rewards/margins": 1.859375, + "rewards/rejected": -0.91796875, + "step": 297 + }, + { + "epoch": 0.32964601769911506, + "grad_norm": 13.556859970092773, + "learning_rate": 3.9241323373476686e-07, + "logits/chosen": -1.16015625, + "logits/rejected": -1.125, + "logps/chosen": -258.0, + "logps/rejected": -265.0, + "loss": 0.3322, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.1015625, + "rewards/margins": 2.296875, + "rewards/rejected": -1.1953125, + "step": 298 + }, + { + "epoch": 0.3307522123893805, + "grad_norm": 12.573343276977539, + "learning_rate": 3.916754390727794e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.15234375, + "logps/chosen": -251.0, + "logps/rejected": -285.0, + "loss": 0.2524, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.375, + "rewards/margins": 2.71875, + "rewards/rejected": -1.34765625, + "step": 299 + }, + { + "epoch": 0.33185840707964603, + "grad_norm": 13.832542419433594, + "learning_rate": 3.9093582225445877e-07, + "logits/chosen": -1.28125, + "logits/rejected": -1.1953125, + "logps/chosen": -263.0, + "logps/rejected": -283.5, + "loss": 0.3695, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.689453125, + "rewards/margins": 1.85546875, + "rewards/rejected": -1.1640625, + "step": 300 + }, + { + "epoch": 0.33185840707964603, + "eval_logits/chosen": -1.2725435495376587, + "eval_logits/rejected": -1.1930581331253052, + "eval_logps/chosen": -251.15921020507812, + "eval_logps/rejected": -271.43780517578125, + "eval_loss": 0.35059425234794617, + "eval_rewards/accuracies": 0.7869349718093872, + "eval_rewards/chosen": 0.9655628204345703, + "eval_rewards/margins": 2.1816697120666504, + "eval_rewards/rejected": -1.2169232368469238, + "eval_runtime": 193.0334, + "eval_samples_per_second": 66.584, + "eval_steps_per_second": 1.041, + "step": 300 + }, + { + "epoch": 0.3329646017699115, + "grad_norm": 13.506171226501465, + "learning_rate": 3.901943927923744e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.15234375, + "logps/chosen": -258.5, + "logps/rejected": -279.0, + "loss": 0.3567, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.908203125, + "rewards/margins": 2.03125, + "rewards/rejected": -1.119140625, + "step": 301 + }, + { + "epoch": 0.334070796460177, + "grad_norm": 14.112154960632324, + "learning_rate": 3.8945116022240937e-07, + "logits/chosen": -1.18359375, + "logits/rejected": -1.08203125, + "logps/chosen": -268.0, + "logps/rejected": -313.0, + "loss": 0.3424, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.9765625, + "rewards/margins": 2.2109375, + "rewards/rejected": -1.23046875, + "step": 302 + }, + { + "epoch": 0.33517699115044247, + "grad_norm": 13.437678337097168, + "learning_rate": 3.8870613410363707e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.18359375, + "logps/chosen": -269.0, + "logps/rejected": -273.5, + "loss": 0.361, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.830078125, + "rewards/margins": 1.97265625, + "rewards/rejected": -1.14453125, + "step": 303 + }, + { + "epoch": 0.336283185840708, + "grad_norm": 14.396577835083008, + "learning_rate": 3.8795932401819863e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.1171875, + "logps/chosen": -272.0, + "logps/rejected": -286.0, + "loss": 0.3308, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.861328125, + "rewards/margins": 2.1796875, + "rewards/rejected": -1.31640625, + "step": 304 + }, + { + "epoch": 0.33738938053097345, + "grad_norm": 15.585335731506348, + "learning_rate": 3.872107395711798e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.08984375, + "logps/chosen": -280.0, + "logps/rejected": -337.0, + "loss": 0.369, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9921875, + "rewards/margins": 1.98046875, + "rewards/rejected": -0.984375, + "step": 305 + }, + { + "epoch": 0.33849557522123896, + "grad_norm": 16.997081756591797, + "learning_rate": 3.864603903904871e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.171875, + "logps/chosen": -283.0, + "logps/rejected": -291.0, + "loss": 0.3989, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.982421875, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.2578125, + "step": 306 + }, + { + "epoch": 0.3396017699115044, + "grad_norm": 16.18097496032715, + "learning_rate": 3.857082861267242e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.23828125, + "logps/chosen": -249.0, + "logps/rejected": -266.0, + "loss": 0.402, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.83984375, + "rewards/margins": 1.9921875, + "rewards/rejected": -1.15234375, + "step": 307 + }, + { + "epoch": 0.3407079646017699, + "grad_norm": 13.513619422912598, + "learning_rate": 3.849544364530677e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.2421875, + "logps/chosen": -264.5, + "logps/rejected": -273.0, + "loss": 0.2981, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.98046875, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.515625, + "step": 308 + }, + { + "epoch": 0.3418141592920354, + "grad_norm": 15.130499839782715, + "learning_rate": 3.8419885106514295e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.18359375, + "logps/chosen": -271.5, + "logps/rejected": -284.0, + "loss": 0.3542, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.962890625, + "rewards/margins": 2.1484375, + "rewards/rejected": -1.1875, + "step": 309 + }, + { + "epoch": 0.34292035398230086, + "grad_norm": 18.18540382385254, + "learning_rate": 3.834415396808988e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.1640625, + "logps/chosen": -251.0, + "logps/rejected": -289.0, + "loss": 0.3976, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.78515625, + "rewards/margins": 2.03515625, + "rewards/rejected": -1.24609375, + "step": 310 + }, + { + "epoch": 0.3440265486725664, + "grad_norm": 13.235279083251953, + "learning_rate": 3.826825120404833e-07, + "logits/chosen": -1.14453125, + "logits/rejected": -1.18359375, + "logps/chosen": -265.0, + "logps/rejected": -271.0, + "loss": 0.3018, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.990234375, + "rewards/margins": 2.40625, + "rewards/rejected": -1.4140625, + "step": 311 + }, + { + "epoch": 0.34513274336283184, + "grad_norm": 14.219352722167969, + "learning_rate": 3.81921777906118e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.16796875, + "logps/chosen": -243.0, + "logps/rejected": -265.0, + "loss": 0.3433, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.10546875, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.2109375, + "step": 312 + }, + { + "epoch": 0.34623893805309736, + "grad_norm": 14.085458755493164, + "learning_rate": 3.8115934706197244e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.234375, + "logps/chosen": -263.0, + "logps/rejected": -260.0, + "loss": 0.3526, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.12890625, + "rewards/margins": 2.17578125, + "rewards/rejected": -1.046875, + "step": 313 + }, + { + "epoch": 0.3473451327433628, + "grad_norm": 13.884740829467773, + "learning_rate": 3.8039522931403847e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.22265625, + "logps/chosen": -257.0, + "logps/rejected": -274.5, + "loss": 0.3197, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.1328125, + "rewards/margins": 2.29296875, + "rewards/rejected": -1.15625, + "step": 314 + }, + { + "epoch": 0.34845132743362833, + "grad_norm": 15.969679832458496, + "learning_rate": 3.7962943449000377e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.1484375, + "logps/chosen": -260.0, + "logps/rejected": -283.0, + "loss": 0.4191, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.90625, + "rewards/margins": 1.7890625, + "rewards/rejected": -0.880859375, + "step": 315 + }, + { + "epoch": 0.3495575221238938, + "grad_norm": 14.207815170288086, + "learning_rate": 3.7886197243912607e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.1015625, + "logps/chosen": -256.0, + "logps/rejected": -279.0, + "loss": 0.3409, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.19140625, + "rewards/margins": 2.34375, + "rewards/rejected": -1.15625, + "step": 316 + }, + { + "epoch": 0.3506637168141593, + "grad_norm": 14.867213249206543, + "learning_rate": 3.7809285303210593e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.09375, + "logps/chosen": -248.5, + "logps/rejected": -246.0, + "loss": 0.3668, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.06640625, + "rewards/margins": 2.0, + "rewards/rejected": -0.93359375, + "step": 317 + }, + { + "epoch": 0.35176991150442477, + "grad_norm": 13.388772964477539, + "learning_rate": 3.7732208616095986e-07, + "logits/chosen": -1.12890625, + "logits/rejected": -1.1484375, + "logps/chosen": -249.0, + "logps/rejected": -276.0, + "loss": 0.3055, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.44140625, + "rewards/margins": 2.578125, + "rewards/rejected": -1.13671875, + "step": 318 + }, + { + "epoch": 0.3528761061946903, + "grad_norm": 13.385259628295898, + "learning_rate": 3.7654968173889334e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.14453125, + "logps/chosen": -240.5, + "logps/rejected": -279.0, + "loss": 0.3375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.28125, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.19140625, + "step": 319 + }, + { + "epoch": 0.35398230088495575, + "grad_norm": 13.495279312133789, + "learning_rate": 3.7577564970017336e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.12890625, + "logps/chosen": -237.5, + "logps/rejected": -251.0, + "loss": 0.3125, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.16796875, + "rewards/margins": 2.546875, + "rewards/rejected": -1.37890625, + "step": 320 + }, + { + "epoch": 0.35508849557522126, + "grad_norm": 14.151552200317383, + "learning_rate": 3.75e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.1171875, + "logps/chosen": -251.0, + "logps/rejected": -281.0, + "loss": 0.3316, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.12109375, + "rewards/margins": 2.3125, + "rewards/rejected": -1.19140625, + "step": 321 + }, + { + "epoch": 0.3561946902654867, + "grad_norm": 13.408705711364746, + "learning_rate": 3.742227426143793e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.12890625, + "logps/chosen": -229.5, + "logps/rejected": -235.0, + "loss": 0.3559, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.94140625, + "rewards/margins": 2.0625, + "rewards/rejected": -1.1171875, + "step": 322 + }, + { + "epoch": 0.3573008849557522, + "grad_norm": 14.06219482421875, + "learning_rate": 3.734438875399943e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.22265625, + "logps/chosen": -270.0, + "logps/rejected": -296.0, + "loss": 0.3082, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.3046875, + "rewards/margins": 2.5, + "rewards/rejected": -1.19921875, + "step": 323 + }, + { + "epoch": 0.3584070796460177, + "grad_norm": 13.949469566345215, + "learning_rate": 3.726634447940768e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.17578125, + "logps/chosen": -277.0, + "logps/rejected": -297.0, + "loss": 0.3666, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.04296875, + "rewards/margins": 1.87109375, + "rewards/rejected": -0.83203125, + "step": 324 + }, + { + "epoch": 0.35951327433628316, + "grad_norm": 14.00977897644043, + "learning_rate": 3.7188142441427836e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.1640625, + "logps/chosen": -237.5, + "logps/rejected": -263.0, + "loss": 0.3086, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.328125, + "rewards/margins": 2.53125, + "rewards/rejected": -1.203125, + "step": 325 + }, + { + "epoch": 0.3606194690265487, + "grad_norm": 17.24125099182129, + "learning_rate": 3.710978364585411e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.23046875, + "logps/chosen": -265.5, + "logps/rejected": -274.0, + "loss": 0.4063, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.005859375, + "rewards/margins": 2.12890625, + "rewards/rejected": -1.126953125, + "step": 326 + }, + { + "epoch": 0.36172566371681414, + "grad_norm": 13.335806846618652, + "learning_rate": 3.7031269100496897e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.1328125, + "logps/chosen": -246.0, + "logps/rejected": -255.5, + "loss": 0.3012, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.142578125, + "rewards/margins": 2.578125, + "rewards/rejected": -1.42578125, + "step": 327 + }, + { + "epoch": 0.36283185840707965, + "grad_norm": 14.329955101013184, + "learning_rate": 3.69525998151697e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.140625, + "logps/chosen": -260.0, + "logps/rejected": -284.0, + "loss": 0.3322, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.23046875, + "rewards/margins": 2.453125, + "rewards/rejected": -1.22265625, + "step": 328 + }, + { + "epoch": 0.3639380530973451, + "grad_norm": 14.133960723876953, + "learning_rate": 3.687377680167626e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.125, + "logps/chosen": -253.0, + "logps/rejected": -271.0, + "loss": 0.3381, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.23046875, + "rewards/margins": 2.5390625, + "rewards/rejected": -1.3125, + "step": 329 + }, + { + "epoch": 0.36504424778761063, + "grad_norm": 15.067541122436523, + "learning_rate": 3.6794801073797453e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.15234375, + "logps/chosen": -262.0, + "logps/rejected": -284.0, + "loss": 0.3784, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.06640625, + "rewards/margins": 2.1640625, + "rewards/rejected": -1.09375, + "step": 330 + }, + { + "epoch": 0.3661504424778761, + "grad_norm": 14.064321517944336, + "learning_rate": 3.671567364727833e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.140625, + "logps/chosen": -234.5, + "logps/rejected": -260.0, + "loss": 0.3866, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.0234375, + "rewards/margins": 2.15625, + "rewards/rejected": -1.12890625, + "step": 331 + }, + { + "epoch": 0.3672566371681416, + "grad_norm": 14.258491516113281, + "learning_rate": 3.663639553981497e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.2421875, + "logps/chosen": -236.0, + "logps/rejected": -253.5, + "loss": 0.3042, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.4140625, + "rewards/margins": 2.6640625, + "rewards/rejected": -1.25, + "step": 332 + }, + { + "epoch": 0.36836283185840707, + "grad_norm": 14.841512680053711, + "learning_rate": 3.655696777104146e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.21875, + "logps/chosen": -251.5, + "logps/rejected": -276.0, + "loss": 0.338, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.28515625, + "rewards/margins": 2.53125, + "rewards/rejected": -1.24609375, + "step": 333 + }, + { + "epoch": 0.3694690265486726, + "grad_norm": 13.853322982788086, + "learning_rate": 3.647739136251673e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.20703125, + "logps/chosen": -265.0, + "logps/rejected": -308.0, + "loss": 0.3455, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.1796875, + "rewards/margins": 2.234375, + "rewards/rejected": -1.0546875, + "step": 334 + }, + { + "epoch": 0.37057522123893805, + "grad_norm": 15.147529602050781, + "learning_rate": 3.639766733771147e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.25, + "logps/chosen": -250.5, + "logps/rejected": -287.0, + "loss": 0.3692, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.10546875, + "rewards/margins": 2.26171875, + "rewards/rejected": -1.15625, + "step": 335 + }, + { + "epoch": 0.37168141592920356, + "grad_norm": 13.71894359588623, + "learning_rate": 3.6317796721994903e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.1953125, + "logps/chosen": -272.0, + "logps/rejected": -268.0, + "loss": 0.3311, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.083984375, + "rewards/margins": 2.33984375, + "rewards/rejected": -1.25390625, + "step": 336 + }, + { + "epoch": 0.372787610619469, + "grad_norm": 12.683527946472168, + "learning_rate": 3.623778054262164e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.23046875, + "logps/chosen": -268.0, + "logps/rejected": -275.0, + "loss": 0.302, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.26171875, + "rewards/margins": 2.3515625, + "rewards/rejected": -1.09375, + "step": 337 + }, + { + "epoch": 0.37389380530973454, + "grad_norm": 12.87300968170166, + "learning_rate": 3.6157619828718473e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.140625, + "logps/chosen": -248.0, + "logps/rejected": -249.5, + "loss": 0.3173, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.939453125, + "rewards/margins": 2.3046875, + "rewards/rejected": -1.36328125, + "step": 338 + }, + { + "epoch": 0.375, + "grad_norm": 13.38857650756836, + "learning_rate": 3.6077315611271095e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.28125, + "logps/chosen": -247.5, + "logps/rejected": -256.0, + "loss": 0.3028, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.03125, + "rewards/margins": 2.3671875, + "rewards/rejected": -1.33984375, + "step": 339 + }, + { + "epoch": 0.37610619469026546, + "grad_norm": 13.413825988769531, + "learning_rate": 3.5996868923110883e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.1796875, + "logps/chosen": -229.0, + "logps/rejected": -270.5, + "loss": 0.3433, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.015625, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.2265625, + "step": 340 + }, + { + "epoch": 0.377212389380531, + "grad_norm": 14.234886169433594, + "learning_rate": 3.59162807989016e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.13671875, + "logps/chosen": -248.5, + "logps/rejected": -233.5, + "loss": 0.3125, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.125, + "rewards/margins": 2.6796875, + "rewards/rejected": -1.55078125, + "step": 341 + }, + { + "epoch": 0.37831858407079644, + "grad_norm": 14.096612930297852, + "learning_rate": 3.583555227512607e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.12109375, + "logps/chosen": -238.5, + "logps/rejected": -265.0, + "loss": 0.356, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.1875, + "rewards/margins": 2.65625, + "rewards/rejected": -1.47265625, + "step": 342 + }, + { + "epoch": 0.37942477876106195, + "grad_norm": 13.053836822509766, + "learning_rate": 3.5754684390072886e-07, + "logits/chosen": -1.16796875, + "logits/rejected": -1.0625, + "logps/chosen": -241.0, + "logps/rejected": -280.0, + "loss": 0.3579, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.033203125, + "rewards/margins": 2.10546875, + "rewards/rejected": -1.07421875, + "step": 343 + }, + { + "epoch": 0.3805309734513274, + "grad_norm": 13.854188919067383, + "learning_rate": 3.5673678183823024e-07, + "logits/chosen": -1.13671875, + "logits/rejected": -1.0546875, + "logps/chosen": -284.0, + "logps/rejected": -302.0, + "loss": 0.3128, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.25390625, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.2578125, + "step": 344 + }, + { + "epoch": 0.38163716814159293, + "grad_norm": 15.331599235534668, + "learning_rate": 3.559253469823647e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.048828125, + "logps/chosen": -250.5, + "logps/rejected": -278.0, + "loss": 0.3974, + "rewards/accuracies": 0.7109375, + "rewards/chosen": 1.05078125, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.2734375, + "step": 345 + }, + { + "epoch": 0.3827433628318584, + "grad_norm": 13.891538619995117, + "learning_rate": 3.5511254976938834e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.1484375, + "logps/chosen": -269.0, + "logps/rejected": -274.0, + "loss": 0.3552, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.0703125, + "rewards/margins": 2.125, + "rewards/rejected": -1.05078125, + "step": 346 + }, + { + "epoch": 0.3838495575221239, + "grad_norm": 14.387117385864258, + "learning_rate": 3.542984006530792e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.16015625, + "logps/chosen": -237.0, + "logps/rejected": -272.0, + "loss": 0.3257, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.044921875, + "rewards/margins": 2.546875, + "rewards/rejected": -1.5078125, + "step": 347 + }, + { + "epoch": 0.38495575221238937, + "grad_norm": 14.693575859069824, + "learning_rate": 3.534829101046027e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.19921875, + "logps/chosen": -253.0, + "logps/rejected": -271.5, + "loss": 0.4006, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.859375, + "rewards/margins": 1.90234375, + "rewards/rejected": -1.046875, + "step": 348 + }, + { + "epoch": 0.3860619469026549, + "grad_norm": 14.893670082092285, + "learning_rate": 3.5266608861237723e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.16015625, + "logps/chosen": -257.0, + "logps/rejected": -273.0, + "loss": 0.3469, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.845703125, + "rewards/margins": 2.171875, + "rewards/rejected": -1.32421875, + "step": 349 + }, + { + "epoch": 0.38716814159292035, + "grad_norm": 13.093351364135742, + "learning_rate": 3.518479466819389e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.20703125, + "logps/chosen": -251.0, + "logps/rejected": -290.0, + "loss": 0.3118, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.056640625, + "rewards/margins": 2.5546875, + "rewards/rejected": -1.5, + "step": 350 + }, + { + "epoch": 0.38716814159292035, + "eval_logits/chosen": -1.260883092880249, + "eval_logits/rejected": -1.1730799674987793, + "eval_logps/chosen": -250.82586669921875, + "eval_logps/rejected": -272.52239990234375, + "eval_loss": 0.3436649739742279, + "eval_rewards/accuracies": 0.7924543023109436, + "eval_rewards/chosen": 1.0035176277160645, + "eval_rewards/margins": 2.3350045680999756, + "eval_rewards/rejected": -1.3310012817382812, + "eval_runtime": 192.8803, + "eval_samples_per_second": 66.637, + "eval_steps_per_second": 1.042, + "step": 350 + }, + { + "epoch": 0.38827433628318586, + "grad_norm": 17.082809448242188, + "learning_rate": 3.510284948358068e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.1328125, + "logps/chosen": -264.0, + "logps/rejected": -286.0, + "loss": 0.4283, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.78125, + "rewards/margins": 2.02734375, + "rewards/rejected": -1.25, + "step": 351 + }, + { + "epoch": 0.3893805309734513, + "grad_norm": 14.743417739868164, + "learning_rate": 3.5020774361334744e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.28125, + "logps/chosen": -235.0, + "logps/rejected": -291.0, + "loss": 0.3538, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.1015625, + "rewards/margins": 2.375, + "rewards/rejected": -1.26953125, + "step": 352 + }, + { + "epoch": 0.39048672566371684, + "grad_norm": 12.676244735717773, + "learning_rate": 3.49385703570639e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.19921875, + "logps/chosen": -243.5, + "logps/rejected": -254.0, + "loss": 0.2715, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.2578125, + "rewards/margins": 2.7578125, + "rewards/rejected": -1.5, + "step": 353 + }, + { + "epoch": 0.3915929203539823, + "grad_norm": 13.607057571411133, + "learning_rate": 3.485623852803361e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.12890625, + "logps/chosen": -248.0, + "logps/rejected": -263.5, + "loss": 0.3456, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.962890625, + "rewards/margins": 2.46875, + "rewards/rejected": -1.5, + "step": 354 + }, + { + "epoch": 0.3926991150442478, + "grad_norm": 12.650440216064453, + "learning_rate": 3.4773779933153343e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.2578125, + "logps/chosen": -222.5, + "logps/rejected": -240.0, + "loss": 0.3298, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.037109375, + "rewards/margins": 2.265625, + "rewards/rejected": -1.2265625, + "step": 355 + }, + { + "epoch": 0.3938053097345133, + "grad_norm": 13.600220680236816, + "learning_rate": 3.4691195632962957e-07, + "logits/chosen": -1.41015625, + "logits/rejected": -1.1796875, + "logps/chosen": -225.0, + "logps/rejected": -251.0, + "loss": 0.3439, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.978515625, + "rewards/margins": 2.3515625, + "rewards/rejected": -1.375, + "step": 356 + }, + { + "epoch": 0.39491150442477874, + "grad_norm": 14.466880798339844, + "learning_rate": 3.4608486689619083e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.171875, + "logps/chosen": -252.0, + "logps/rejected": -254.0, + "loss": 0.3437, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.904296875, + "rewards/margins": 2.4375, + "rewards/rejected": -1.53125, + "step": 357 + }, + { + "epoch": 0.39601769911504425, + "grad_norm": 13.527942657470703, + "learning_rate": 3.4525654166881426e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.19140625, + "logps/chosen": -256.5, + "logps/rejected": -281.0, + "loss": 0.3267, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.962890625, + "rewards/margins": 2.40625, + "rewards/rejected": -1.44140625, + "step": 358 + }, + { + "epoch": 0.3971238938053097, + "grad_norm": 13.268882751464844, + "learning_rate": 3.4442699130099116e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.1328125, + "logps/chosen": -268.0, + "logps/rejected": -297.0, + "loss": 0.293, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.990234375, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.6328125, + "step": 359 + }, + { + "epoch": 0.39823008849557523, + "grad_norm": 14.783308982849121, + "learning_rate": 3.435962264619702e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.12109375, + "logps/chosen": -239.5, + "logps/rejected": -278.0, + "loss": 0.3438, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.765625, + "rewards/margins": 2.25390625, + "rewards/rejected": -1.484375, + "step": 360 + }, + { + "epoch": 0.3993362831858407, + "grad_norm": 14.32043743133545, + "learning_rate": 3.427642578366194e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.19140625, + "logps/chosen": -256.5, + "logps/rejected": -278.0, + "loss": 0.3622, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.9375, + "rewards/margins": 2.359375, + "rewards/rejected": -1.421875, + "step": 361 + }, + { + "epoch": 0.4004424778761062, + "grad_norm": 175.1194305419922, + "learning_rate": 3.419310961252897e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.10546875, + "logps/chosen": -234.5, + "logps/rejected": -354.0, + "loss": 0.3211, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.06640625, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.53515625, + "step": 362 + }, + { + "epoch": 0.40154867256637167, + "grad_norm": 13.051432609558105, + "learning_rate": 3.4109675204367686e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.06640625, + "logps/chosen": -269.0, + "logps/rejected": -313.0, + "loss": 0.3161, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.140625, + "rewards/margins": 2.5703125, + "rewards/rejected": -1.4296875, + "step": 363 + }, + { + "epoch": 0.4026548672566372, + "grad_norm": 12.166739463806152, + "learning_rate": 3.4026123632268354e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.09375, + "logps/chosen": -235.0, + "logps/rejected": -244.0, + "loss": 0.3185, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.6875, + "rewards/margins": 2.1953125, + "rewards/rejected": -1.50390625, + "step": 364 + }, + { + "epoch": 0.40376106194690264, + "grad_norm": 13.57703971862793, + "learning_rate": 3.3942455970828146e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.11328125, + "logps/chosen": -249.5, + "logps/rejected": -265.0, + "loss": 0.3227, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.013671875, + "rewards/margins": 2.546875, + "rewards/rejected": -1.52734375, + "step": 365 + }, + { + "epoch": 0.40486725663716816, + "grad_norm": 12.446300506591797, + "learning_rate": 3.38586732961373e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.21875, + "logps/chosen": -232.5, + "logps/rejected": -237.5, + "loss": 0.3575, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.560546875, + "rewards/margins": 2.234375, + "rewards/rejected": -1.6796875, + "step": 366 + }, + { + "epoch": 0.4059734513274336, + "grad_norm": 13.510902404785156, + "learning_rate": 3.3774776685765327e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.265625, + "logps/chosen": -242.5, + "logps/rejected": -264.5, + "loss": 0.3312, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.79296875, + "rewards/margins": 2.3671875, + "rewards/rejected": -1.57421875, + "step": 367 + }, + { + "epoch": 0.40707964601769914, + "grad_norm": 12.729259490966797, + "learning_rate": 3.3690767218747104e-07, + "logits/chosen": -1.1015625, + "logits/rejected": -1.1328125, + "logps/chosen": -243.5, + "logps/rejected": -272.0, + "loss": 0.2871, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.12109375, + "rewards/margins": 2.7734375, + "rewards/rejected": -1.6484375, + "step": 368 + }, + { + "epoch": 0.4081858407079646, + "grad_norm": 14.687115669250488, + "learning_rate": 3.3606645975569e-07, + "logits/chosen": -1.34765625, + "logits/rejected": -1.078125, + "logps/chosen": -248.5, + "logps/rejected": -254.5, + "loss": 0.3694, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.65625, + "rewards/margins": 2.125, + "rewards/rejected": -1.47265625, + "step": 369 + }, + { + "epoch": 0.4092920353982301, + "grad_norm": 11.254127502441406, + "learning_rate": 3.3522414038155016e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.12109375, + "logps/chosen": -225.0, + "logps/rejected": -259.5, + "loss": 0.2835, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.970703125, + "rewards/margins": 2.65625, + "rewards/rejected": -1.68359375, + "step": 370 + }, + { + "epoch": 0.4103982300884956, + "grad_norm": 15.060591697692871, + "learning_rate": 3.343807248985283e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.16015625, + "logps/chosen": -240.0, + "logps/rejected": -265.5, + "loss": 0.3759, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.689453125, + "rewards/margins": 2.2109375, + "rewards/rejected": -1.5234375, + "step": 371 + }, + { + "epoch": 0.41150442477876104, + "grad_norm": 11.981417655944824, + "learning_rate": 3.335362241541988e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.09375, + "logps/chosen": -260.0, + "logps/rejected": -280.0, + "loss": 0.3012, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.732421875, + "rewards/margins": 2.3828125, + "rewards/rejected": -1.64453125, + "step": 372 + }, + { + "epoch": 0.41261061946902655, + "grad_norm": 13.114727020263672, + "learning_rate": 3.32690649010094e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.16796875, + "logps/chosen": -244.5, + "logps/rejected": -267.5, + "loss": 0.2875, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 0.927734375, + "rewards/margins": 2.8125, + "rewards/rejected": -1.8828125, + "step": 373 + }, + { + "epoch": 0.413716814159292, + "grad_norm": 13.435688972473145, + "learning_rate": 3.3184401034156484e-07, + "logits/chosen": -1.16796875, + "logits/rejected": -1.015625, + "logps/chosen": -263.0, + "logps/rejected": -271.0, + "loss": 0.344, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.658203125, + "rewards/margins": 2.2890625, + "rewards/rejected": -1.625, + "step": 374 + }, + { + "epoch": 0.41482300884955753, + "grad_norm": 16.25585174560547, + "learning_rate": 3.3099631903764064e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.125, + "logps/chosen": -270.0, + "logps/rejected": -291.0, + "loss": 0.4301, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.572265625, + "rewards/margins": 1.765625, + "rewards/rejected": -1.1953125, + "step": 375 + }, + { + "epoch": 0.415929203539823, + "grad_norm": 13.53650188446045, + "learning_rate": 3.3014758600088923e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.2265625, + "logps/chosen": -232.0, + "logps/rejected": -266.5, + "loss": 0.3326, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.755859375, + "rewards/margins": 2.234375, + "rewards/rejected": -1.48046875, + "step": 376 + }, + { + "epoch": 0.4170353982300885, + "grad_norm": 12.601165771484375, + "learning_rate": 3.2929782214727653e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.18359375, + "logps/chosen": -246.5, + "logps/rejected": -263.5, + "loss": 0.3436, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.771484375, + "rewards/margins": 2.515625, + "rewards/rejected": -1.7421875, + "step": 377 + }, + { + "epoch": 0.41814159292035397, + "grad_norm": 13.9395112991333, + "learning_rate": 3.2844703840602636e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.30859375, + "logps/chosen": -243.5, + "logps/rejected": -262.5, + "loss": 0.3515, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.92578125, + "rewards/margins": 2.390625, + "rewards/rejected": -1.46484375, + "step": 378 + }, + { + "epoch": 0.4192477876106195, + "grad_norm": 13.737992286682129, + "learning_rate": 3.2759524571948e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.11328125, + "logps/chosen": -258.0, + "logps/rejected": -293.0, + "loss": 0.2964, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.970703125, + "rewards/margins": 2.5859375, + "rewards/rejected": -1.61328125, + "step": 379 + }, + { + "epoch": 0.42035398230088494, + "grad_norm": 14.736825942993164, + "learning_rate": 3.26742455042955e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.2109375, + "logps/chosen": -247.5, + "logps/rejected": -238.0, + "loss": 0.3616, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.634765625, + "rewards/margins": 2.1875, + "rewards/rejected": -1.55078125, + "step": 380 + }, + { + "epoch": 0.42146017699115046, + "grad_norm": 14.469903945922852, + "learning_rate": 3.2588867734460464e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.15234375, + "logps/chosen": -252.0, + "logps/rejected": -260.5, + "loss": 0.355, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.9296875, + "rewards/margins": 2.5546875, + "rewards/rejected": -1.62109375, + "step": 381 + }, + { + "epoch": 0.4225663716814159, + "grad_norm": 14.506092071533203, + "learning_rate": 3.250339236052767e-07, + "logits/chosen": -1.140625, + "logits/rejected": -1.18359375, + "logps/chosen": -261.0, + "logps/rejected": -289.0, + "loss": 0.3673, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.951171875, + "rewards/margins": 2.3359375, + "rewards/rejected": -1.37890625, + "step": 382 + }, + { + "epoch": 0.42367256637168144, + "grad_norm": 13.806063652038574, + "learning_rate": 3.2417820481837256e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.16015625, + "logps/chosen": -254.0, + "logps/rejected": -274.0, + "loss": 0.3272, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.712890625, + "rewards/margins": 2.40625, + "rewards/rejected": -1.69140625, + "step": 383 + }, + { + "epoch": 0.4247787610619469, + "grad_norm": 13.990365982055664, + "learning_rate": 3.2332153198970517e-07, + "logits/chosen": -1.28125, + "logits/rejected": -1.2109375, + "logps/chosen": -250.5, + "logps/rejected": -279.0, + "loss": 0.334, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.66796875, + "rewards/margins": 2.1640625, + "rewards/rejected": -1.49609375, + "step": 384 + }, + { + "epoch": 0.4258849557522124, + "grad_norm": 15.526028633117676, + "learning_rate": 3.2246391613735815e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.1953125, + "logps/chosen": -253.0, + "logps/rejected": -258.5, + "loss": 0.3283, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.92578125, + "rewards/margins": 2.421875, + "rewards/rejected": -1.49609375, + "step": 385 + }, + { + "epoch": 0.4269911504424779, + "grad_norm": 13.068387985229492, + "learning_rate": 3.2160536829154356e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.23046875, + "logps/chosen": -248.5, + "logps/rejected": -286.0, + "loss": 0.2813, + "rewards/accuracies": 0.859375, + "rewards/chosen": 1.46875, + "rewards/margins": 3.0078125, + "rewards/rejected": -1.54296875, + "step": 386 + }, + { + "epoch": 0.4280973451327434, + "grad_norm": 14.58014965057373, + "learning_rate": 3.207458994944606e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.16796875, + "logps/chosen": -261.0, + "logps/rejected": -269.0, + "loss": 0.3732, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.6640625, + "rewards/margins": 2.0546875, + "rewards/rejected": -1.390625, + "step": 387 + }, + { + "epoch": 0.42920353982300885, + "grad_norm": 13.451835632324219, + "learning_rate": 3.1988552080015294e-07, + "logits/chosen": -1.33984375, + "logits/rejected": -1.15234375, + "logps/chosen": -256.5, + "logps/rejected": -264.0, + "loss": 0.3112, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.11328125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.40234375, + "step": 388 + }, + { + "epoch": 0.4303097345132743, + "grad_norm": 15.350495338439941, + "learning_rate": 3.1902424327436725e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.0625, + "logps/chosen": -273.0, + "logps/rejected": -264.0, + "loss": 0.3406, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.712890625, + "rewards/margins": 2.1484375, + "rewards/rejected": -1.44140625, + "step": 389 + }, + { + "epoch": 0.4314159292035398, + "grad_norm": 17.061744689941406, + "learning_rate": 3.1816207799440996e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.12890625, + "logps/chosen": -278.5, + "logps/rejected": -318.0, + "loss": 0.3654, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.16015625, + "rewards/margins": 2.4375, + "rewards/rejected": -1.28125, + "step": 390 + }, + { + "epoch": 0.4325221238938053, + "grad_norm": 16.2224063873291, + "learning_rate": 3.1729903604900595e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.08203125, + "logps/chosen": -244.0, + "logps/rejected": -281.0, + "loss": 0.3328, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.923828125, + "rewards/margins": 2.5234375, + "rewards/rejected": -1.59375, + "step": 391 + }, + { + "epoch": 0.4336283185840708, + "grad_norm": 13.922826766967773, + "learning_rate": 3.1643512853815487e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.25390625, + "logps/chosen": -249.0, + "logps/rejected": -280.0, + "loss": 0.3626, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.89453125, + "rewards/margins": 2.12890625, + "rewards/rejected": -1.2421875, + "step": 392 + }, + { + "epoch": 0.43473451327433627, + "grad_norm": 14.233848571777344, + "learning_rate": 3.15570366572989e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.18359375, + "logps/chosen": -246.5, + "logps/rejected": -255.5, + "loss": 0.33, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.875, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.3671875, + "step": 393 + }, + { + "epoch": 0.4358407079646018, + "grad_norm": 15.168607711791992, + "learning_rate": 3.147047612756302e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.17578125, + "logps/chosen": -281.0, + "logps/rejected": -290.0, + "loss": 0.345, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.892578125, + "rewards/margins": 2.3125, + "rewards/rejected": -1.421875, + "step": 394 + }, + { + "epoch": 0.43694690265486724, + "grad_norm": 14.825115203857422, + "learning_rate": 3.138383237790467e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.1484375, + "logps/chosen": -250.5, + "logps/rejected": -272.0, + "loss": 0.3428, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.98828125, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.484375, + "step": 395 + }, + { + "epoch": 0.43805309734513276, + "grad_norm": 12.729918479919434, + "learning_rate": 3.129710652269103e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.1171875, + "logps/chosen": -230.0, + "logps/rejected": -255.5, + "loss": 0.2864, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.20703125, + "rewards/margins": 2.875, + "rewards/rejected": -1.66796875, + "step": 396 + }, + { + "epoch": 0.4391592920353982, + "grad_norm": 12.986177444458008, + "learning_rate": 3.1210299677345253e-07, + "logits/chosen": -1.17578125, + "logits/rejected": -1.140625, + "logps/chosen": -257.0, + "logps/rejected": -279.0, + "loss": 0.3394, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.013671875, + "rewards/margins": 2.46875, + "rewards/rejected": -1.45703125, + "step": 397 + }, + { + "epoch": 0.44026548672566373, + "grad_norm": 15.79924488067627, + "learning_rate": 3.1123412958332153e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.15234375, + "logps/chosen": -248.0, + "logps/rejected": -275.0, + "loss": 0.3804, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.84375, + "rewards/margins": 2.34375, + "rewards/rejected": -1.5, + "step": 398 + }, + { + "epoch": 0.4413716814159292, + "grad_norm": 14.248608589172363, + "learning_rate": 3.1036447483143834e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.22265625, + "logps/chosen": -261.5, + "logps/rejected": -275.0, + "loss": 0.3299, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.99609375, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.609375, + "step": 399 + }, + { + "epoch": 0.4424778761061947, + "grad_norm": 14.430908203125, + "learning_rate": 3.094940437028535e-07, + "logits/chosen": -1.1171875, + "logits/rejected": -1.125, + "logps/chosen": -250.5, + "logps/rejected": -251.5, + "loss": 0.3726, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.953125, + "rewards/margins": 2.26953125, + "rewards/rejected": -1.3203125, + "step": 400 + }, + { + "epoch": 0.4424778761061947, + "eval_logits/chosen": -1.2572294473648071, + "eval_logits/rejected": -1.1652674674987793, + "eval_logps/chosen": -250.63681030273438, + "eval_logps/rejected": -273.5472717285156, + "eval_loss": 0.3369702994823456, + "eval_rewards/accuracies": 0.7978180646896362, + "eval_rewards/chosen": 1.024176001548767, + "eval_rewards/margins": 2.449626922607422, + "eval_rewards/rejected": -1.4251010417938232, + "eval_runtime": 193.115, + "eval_samples_per_second": 66.556, + "eval_steps_per_second": 1.041, + "step": 400 + }, + { + "epoch": 0.4435840707964602, + "grad_norm": 14.084267616271973, + "learning_rate": 3.086228473926024e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.26171875, + "logps/chosen": -242.5, + "logps/rejected": -257.0, + "loss": 0.3172, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.12109375, + "rewards/margins": 2.5390625, + "rewards/rejected": -1.4140625, + "step": 401 + }, + { + "epoch": 0.4446902654867257, + "grad_norm": 13.272000312805176, + "learning_rate": 3.077508971055623e-07, + "logits/chosen": -1.1015625, + "logits/rejected": -1.171875, + "logps/chosen": -246.5, + "logps/rejected": -295.0, + "loss": 0.2771, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.2265625, + "rewards/margins": 2.90625, + "rewards/rejected": -1.6796875, + "step": 402 + }, + { + "epoch": 0.44579646017699115, + "grad_norm": 13.017451286315918, + "learning_rate": 3.0687820405630736e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.2265625, + "logps/chosen": -258.5, + "logps/rejected": -286.0, + "loss": 0.2997, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.3125, + "rewards/margins": 2.8125, + "rewards/rejected": -1.5, + "step": 403 + }, + { + "epoch": 0.4469026548672566, + "grad_norm": 11.719470024108887, + "learning_rate": 3.060047794689649e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.171875, + "logps/chosen": -246.0, + "logps/rejected": -252.0, + "loss": 0.273, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.154296875, + "rewards/margins": 2.7109375, + "rewards/rejected": -1.55078125, + "step": 404 + }, + { + "epoch": 0.4480088495575221, + "grad_norm": 12.74482250213623, + "learning_rate": 3.0513063457707106e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.17578125, + "logps/chosen": -238.5, + "logps/rejected": -227.0, + "loss": 0.3567, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.908203125, + "rewards/margins": 2.234375, + "rewards/rejected": -1.32421875, + "step": 405 + }, + { + "epoch": 0.4491150442477876, + "grad_norm": 14.130414962768555, + "learning_rate": 3.0425578062342577e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.18359375, + "logps/chosen": -241.5, + "logps/rejected": -268.0, + "loss": 0.3743, + "rewards/accuracies": 0.71875, + "rewards/chosen": 1.017578125, + "rewards/margins": 2.203125, + "rewards/rejected": -1.18359375, + "step": 406 + }, + { + "epoch": 0.4502212389380531, + "grad_norm": 15.725412368774414, + "learning_rate": 3.03380228859949e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.13671875, + "logps/chosen": -271.5, + "logps/rejected": -291.0, + "loss": 0.3421, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.140625, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.26953125, + "step": 407 + }, + { + "epoch": 0.45132743362831856, + "grad_norm": 13.073963165283203, + "learning_rate": 3.0250399054753526e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.1328125, + "logps/chosen": -271.0, + "logps/rejected": -265.0, + "loss": 0.3024, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.078125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.4375, + "step": 408 + }, + { + "epoch": 0.4524336283185841, + "grad_norm": 12.850948333740234, + "learning_rate": 3.016270769559093e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.06640625, + "logps/chosen": -258.0, + "logps/rejected": -275.0, + "loss": 0.3189, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.041015625, + "rewards/margins": 2.234375, + "rewards/rejected": -1.1953125, + "step": 409 + }, + { + "epoch": 0.45353982300884954, + "grad_norm": 13.47533130645752, + "learning_rate": 3.007494993634808e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.16015625, + "logps/chosen": -259.0, + "logps/rejected": -269.0, + "loss": 0.3222, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.125, + "rewards/margins": 2.53125, + "rewards/rejected": -1.41015625, + "step": 410 + }, + { + "epoch": 0.45464601769911506, + "grad_norm": 13.043135643005371, + "learning_rate": 2.9987126905719965e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.1484375, + "logps/chosen": -265.5, + "logps/rejected": -272.5, + "loss": 0.3374, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9140625, + "rewards/margins": 2.3125, + "rewards/rejected": -1.40234375, + "step": 411 + }, + { + "epoch": 0.4557522123893805, + "grad_norm": 14.49729061126709, + "learning_rate": 2.989923973324105e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.08984375, + "logps/chosen": -252.5, + "logps/rejected": -281.0, + "loss": 0.3668, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.09765625, + "rewards/margins": 2.265625, + "rewards/rejected": -1.1640625, + "step": 412 + }, + { + "epoch": 0.45685840707964603, + "grad_norm": 15.509222030639648, + "learning_rate": 2.9811289549270745e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.2109375, + "logps/chosen": -250.5, + "logps/rejected": -286.0, + "loss": 0.3665, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 1.203125, + "rewards/margins": 2.5, + "rewards/rejected": -1.2890625, + "step": 413 + }, + { + "epoch": 0.4579646017699115, + "grad_norm": 13.909936904907227, + "learning_rate": 2.9723277484978917e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.2109375, + "logps/chosen": -270.0, + "logps/rejected": -291.0, + "loss": 0.2915, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.17578125, + "rewards/margins": 2.765625, + "rewards/rejected": -1.58984375, + "step": 414 + }, + { + "epoch": 0.459070796460177, + "grad_norm": 13.325665473937988, + "learning_rate": 2.963520467233127e-07, + "logits/chosen": -1.45703125, + "logits/rejected": -1.19140625, + "logps/chosen": -252.0, + "logps/rejected": -262.5, + "loss": 0.3212, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.07421875, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.34375, + "step": 415 + }, + { + "epoch": 0.46017699115044247, + "grad_norm": 15.239510536193848, + "learning_rate": 2.954707224407485e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.24609375, + "logps/chosen": -261.5, + "logps/rejected": -285.0, + "loss": 0.3534, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0625, + "rewards/margins": 2.3046875, + "rewards/rejected": -1.2421875, + "step": 416 + }, + { + "epoch": 0.461283185840708, + "grad_norm": 15.735715866088867, + "learning_rate": 2.945888133372343e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.12890625, + "logps/chosen": -287.0, + "logps/rejected": -288.0, + "loss": 0.3967, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.88671875, + "rewards/margins": 2.125, + "rewards/rejected": -1.23828125, + "step": 417 + }, + { + "epoch": 0.46238938053097345, + "grad_norm": 15.247976303100586, + "learning_rate": 2.937063307554295e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.21484375, + "logps/chosen": -226.0, + "logps/rejected": -250.0, + "loss": 0.3726, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.12890625, + "rewards/margins": 2.375, + "rewards/rejected": -1.24609375, + "step": 418 + }, + { + "epoch": 0.46349557522123896, + "grad_norm": 12.819127082824707, + "learning_rate": 2.9282328604536937e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.2109375, + "logps/chosen": -249.5, + "logps/rejected": -271.0, + "loss": 0.3065, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.14453125, + "rewards/margins": 2.578125, + "rewards/rejected": -1.4296875, + "step": 419 + }, + { + "epoch": 0.4646017699115044, + "grad_norm": 13.217004776000977, + "learning_rate": 2.9193969056431907e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.15234375, + "logps/chosen": -254.5, + "logps/rejected": -270.0, + "loss": 0.3139, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.09765625, + "rewards/margins": 2.78125, + "rewards/rejected": -1.68359375, + "step": 420 + }, + { + "epoch": 0.4657079646017699, + "grad_norm": 14.80079174041748, + "learning_rate": 2.910555556766272e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.234375, + "logps/chosen": -226.5, + "logps/rejected": -263.0, + "loss": 0.3987, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.94140625, + "rewards/margins": 2.12109375, + "rewards/rejected": -1.17578125, + "step": 421 + }, + { + "epoch": 0.4668141592920354, + "grad_norm": 13.704545974731445, + "learning_rate": 2.9017089275358014e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.1328125, + "logps/chosen": -271.0, + "logps/rejected": -287.0, + "loss": 0.3016, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9609375, + "rewards/margins": 2.546875, + "rewards/rejected": -1.5859375, + "step": 422 + }, + { + "epoch": 0.46792035398230086, + "grad_norm": 14.536904335021973, + "learning_rate": 2.8928571317325564e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.0703125, + "logps/chosen": -279.0, + "logps/rejected": -291.0, + "loss": 0.3234, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.19140625, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.4296875, + "step": 423 + }, + { + "epoch": 0.4690265486725664, + "grad_norm": 15.45283317565918, + "learning_rate": 2.8840002832037625e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.1875, + "logps/chosen": -261.0, + "logps/rejected": -283.0, + "loss": 0.365, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.171875, + "rewards/margins": 2.234375, + "rewards/rejected": -1.0703125, + "step": 424 + }, + { + "epoch": 0.47013274336283184, + "grad_norm": 14.761160850524902, + "learning_rate": 2.8751384958616316e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.1484375, + "logps/chosen": -257.0, + "logps/rejected": -285.0, + "loss": 0.3295, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.2578125, + "rewards/margins": 2.9296875, + "rewards/rejected": -1.671875, + "step": 425 + }, + { + "epoch": 0.47123893805309736, + "grad_norm": 14.022229194641113, + "learning_rate": 2.8662718836818964e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.1796875, + "logps/chosen": -249.5, + "logps/rejected": -275.0, + "loss": 0.3165, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.984375, + "rewards/margins": 2.53125, + "rewards/rejected": -1.546875, + "step": 426 + }, + { + "epoch": 0.4723451327433628, + "grad_norm": 13.972189903259277, + "learning_rate": 2.8574005607023444e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.15234375, + "logps/chosen": -253.0, + "logps/rejected": -286.0, + "loss": 0.3595, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.17578125, + "rewards/margins": 2.3359375, + "rewards/rejected": -1.16015625, + "step": 427 + }, + { + "epoch": 0.47345132743362833, + "grad_norm": 13.316848754882812, + "learning_rate": 2.848524641021349e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.15625, + "logps/chosen": -279.0, + "logps/rejected": -304.0, + "loss": 0.2876, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.14453125, + "rewards/margins": 2.734375, + "rewards/rejected": -1.5859375, + "step": 428 + }, + { + "epoch": 0.4745575221238938, + "grad_norm": 15.443883895874023, + "learning_rate": 2.839644238796407e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.14453125, + "logps/chosen": -279.0, + "logps/rejected": -291.0, + "loss": 0.3446, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.017578125, + "rewards/margins": 2.453125, + "rewards/rejected": -1.4375, + "step": 429 + }, + { + "epoch": 0.4756637168141593, + "grad_norm": 14.372305870056152, + "learning_rate": 2.8307594682426637e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.1484375, + "logps/chosen": -260.5, + "logps/rejected": -309.0, + "loss": 0.2813, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.0, + "rewards/margins": 2.9609375, + "rewards/rejected": -1.95703125, + "step": 430 + }, + { + "epoch": 0.47676991150442477, + "grad_norm": 13.688916206359863, + "learning_rate": 2.8218704436314524e-07, + "logits/chosen": -1.46875, + "logits/rejected": -1.22265625, + "logps/chosen": -253.5, + "logps/rejected": -276.0, + "loss": 0.341, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.798828125, + "rewards/margins": 2.171875, + "rewards/rejected": -1.37109375, + "step": 431 + }, + { + "epoch": 0.4778761061946903, + "grad_norm": 12.372815132141113, + "learning_rate": 2.8129772792888145e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.1171875, + "logps/chosen": -235.0, + "logps/rejected": -281.0, + "loss": 0.2966, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.869140625, + "rewards/margins": 2.6328125, + "rewards/rejected": -1.76171875, + "step": 432 + }, + { + "epoch": 0.47898230088495575, + "grad_norm": 15.202491760253906, + "learning_rate": 2.804080089594039e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.109375, + "logps/chosen": -260.0, + "logps/rejected": -263.0, + "loss": 0.3812, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.490234375, + "rewards/margins": 1.98046875, + "rewards/rejected": -1.484375, + "step": 433 + }, + { + "epoch": 0.48008849557522126, + "grad_norm": 15.252046585083008, + "learning_rate": 2.7951789889781845e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.125, + "logps/chosen": -261.0, + "logps/rejected": -299.0, + "loss": 0.3649, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.919921875, + "rewards/margins": 2.328125, + "rewards/rejected": -1.40625, + "step": 434 + }, + { + "epoch": 0.4811946902654867, + "grad_norm": 11.937089920043945, + "learning_rate": 2.786274091922611e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.1484375, + "logps/chosen": -257.0, + "logps/rejected": -279.0, + "loss": 0.2799, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9375, + "rewards/margins": 2.703125, + "rewards/rejected": -1.76953125, + "step": 435 + }, + { + "epoch": 0.4823008849557522, + "grad_norm": 12.86253833770752, + "learning_rate": 2.7773655129575043e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.11328125, + "logps/chosen": -237.5, + "logps/rejected": -266.5, + "loss": 0.3076, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.802734375, + "rewards/margins": 2.75, + "rewards/rejected": -1.953125, + "step": 436 + }, + { + "epoch": 0.4834070796460177, + "grad_norm": 12.546619415283203, + "learning_rate": 2.7684533666604076e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.05078125, + "logps/chosen": -253.5, + "logps/rejected": -257.0, + "loss": 0.3184, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.689453125, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.71484375, + "step": 437 + }, + { + "epoch": 0.48451327433628316, + "grad_norm": 18.16977310180664, + "learning_rate": 2.759537767654744e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.1796875, + "logps/chosen": -274.0, + "logps/rejected": -294.0, + "loss": 0.387, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.615234375, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.703125, + "step": 438 + }, + { + "epoch": 0.4856194690265487, + "grad_norm": 12.303600311279297, + "learning_rate": 2.750618830608343e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.06640625, + "logps/chosen": -235.0, + "logps/rejected": -242.5, + "loss": 0.2887, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.74609375, + "rewards/margins": 2.5546875, + "rewards/rejected": -1.8125, + "step": 439 + }, + { + "epoch": 0.48672566371681414, + "grad_norm": 13.665416717529297, + "learning_rate": 2.7416966702319683e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.1171875, + "logps/chosen": -283.5, + "logps/rejected": -304.0, + "loss": 0.2974, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.880859375, + "rewards/margins": 2.6328125, + "rewards/rejected": -1.75390625, + "step": 440 + }, + { + "epoch": 0.48783185840707965, + "grad_norm": 14.621048927307129, + "learning_rate": 2.732771401277838e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.109375, + "logps/chosen": -266.5, + "logps/rejected": -264.5, + "loss": 0.3651, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.4580078125, + "rewards/margins": 2.140625, + "rewards/rejected": -1.68359375, + "step": 441 + }, + { + "epoch": 0.4889380530973451, + "grad_norm": 12.872169494628906, + "learning_rate": 2.7238431385381523e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.1484375, + "logps/chosen": -245.5, + "logps/rejected": -279.0, + "loss": 0.3244, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.765625, + "rewards/margins": 2.640625, + "rewards/rejected": -1.875, + "step": 442 + }, + { + "epoch": 0.49004424778761063, + "grad_norm": 13.446981430053711, + "learning_rate": 2.714911996843616e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.14453125, + "logps/chosen": -263.0, + "logps/rejected": -300.0, + "loss": 0.3075, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.671875, + "rewards/margins": 2.5703125, + "rewards/rejected": -1.90234375, + "step": 443 + }, + { + "epoch": 0.4911504424778761, + "grad_norm": 14.347339630126953, + "learning_rate": 2.7059780910619617e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.08203125, + "logps/chosen": -275.0, + "logps/rejected": -310.0, + "loss": 0.3042, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.0625, + "rewards/margins": 2.9140625, + "rewards/rejected": -1.84765625, + "step": 444 + }, + { + "epoch": 0.4922566371681416, + "grad_norm": 14.27387523651123, + "learning_rate": 2.6970415360964716e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.2109375, + "logps/chosen": -237.0, + "logps/rejected": -258.0, + "loss": 0.3354, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.98046875, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.76171875, + "step": 445 + }, + { + "epoch": 0.49336283185840707, + "grad_norm": 13.943259239196777, + "learning_rate": 2.6881024468845e-07, + "logits/chosen": -1.15234375, + "logits/rejected": -1.16796875, + "logps/chosen": -247.5, + "logps/rejected": -275.5, + "loss": 0.3356, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.69140625, + "rewards/margins": 2.734375, + "rewards/rejected": -2.046875, + "step": 446 + }, + { + "epoch": 0.4944690265486726, + "grad_norm": 15.396841049194336, + "learning_rate": 2.679160938395997e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.203125, + "logps/chosen": -251.0, + "logps/rejected": -283.0, + "loss": 0.3342, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.9921875, + "rewards/margins": 2.6796875, + "rewards/rejected": -1.68359375, + "step": 447 + }, + { + "epoch": 0.49557522123893805, + "grad_norm": 12.922630310058594, + "learning_rate": 2.670217125632027e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.17578125, + "logps/chosen": -248.5, + "logps/rejected": -258.5, + "loss": 0.3361, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.529296875, + "rewards/margins": 2.1875, + "rewards/rejected": -1.66015625, + "step": 448 + }, + { + "epoch": 0.49668141592920356, + "grad_norm": 18.911989212036133, + "learning_rate": 2.661271123623291e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.19921875, + "logps/chosen": -288.0, + "logps/rejected": -278.0, + "loss": 0.4185, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.599609375, + "rewards/margins": 2.078125, + "rewards/rejected": -1.4765625, + "step": 449 + }, + { + "epoch": 0.497787610619469, + "grad_norm": 16.120946884155273, + "learning_rate": 2.652323047428646e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.109375, + "logps/chosen": -279.0, + "logps/rejected": -303.0, + "loss": 0.363, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.9609375, + "rewards/margins": 2.609375, + "rewards/rejected": -1.65625, + "step": 450 + }, + { + "epoch": 0.497787610619469, + "eval_logits/chosen": -1.2554415464401245, + "eval_logits/rejected": -1.1580379009246826, + "eval_logps/chosen": -252.30845642089844, + "eval_logps/rejected": -276.3631896972656, + "eval_loss": 0.3314497768878937, + "eval_rewards/accuracies": 0.8052030205726624, + "eval_rewards/chosen": 0.85384601354599, + "eval_rewards/margins": 2.567397356033325, + "eval_rewards/rejected": -1.7136777639389038, + "eval_runtime": 193.0141, + "eval_samples_per_second": 66.591, + "eval_steps_per_second": 1.041, + "step": 450 + }, + { + "epoch": 0.49889380530973454, + "grad_norm": 13.481291770935059, + "learning_rate": 2.6433730121336283e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.234375, + "logps/chosen": -241.5, + "logps/rejected": -278.0, + "loss": 0.3044, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.791015625, + "rewards/margins": 2.7578125, + "rewards/rejected": -1.9765625, + "step": 451 + }, + { + "epoch": 0.5, + "grad_norm": 15.87637996673584, + "learning_rate": 2.6344211328489696e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.1953125, + "logps/chosen": -269.0, + "logps/rejected": -291.0, + "loss": 0.3646, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.68359375, + "rewards/margins": 2.28515625, + "rewards/rejected": -1.6015625, + "step": 452 + }, + { + "epoch": 0.5011061946902655, + "grad_norm": 12.020176887512207, + "learning_rate": 2.625467524709118e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.10546875, + "logps/chosen": -255.0, + "logps/rejected": -283.0, + "loss": 0.2739, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.91796875, + "rewards/margins": 2.8828125, + "rewards/rejected": -1.96484375, + "step": 453 + }, + { + "epoch": 0.5022123893805309, + "grad_norm": 13.10580825805664, + "learning_rate": 2.616512302870757e-07, + "logits/chosen": -1.23046875, + "logits/rejected": -1.109375, + "logps/chosen": -280.0, + "logps/rejected": -286.0, + "loss": 0.33, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.7685546875, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.55078125, + "step": 454 + }, + { + "epoch": 0.5033185840707964, + "grad_norm": 15.729186058044434, + "learning_rate": 2.607555582511326e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.140625, + "logps/chosen": -289.0, + "logps/rejected": -285.0, + "loss": 0.3862, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.671875, + "rewards/margins": 2.1796875, + "rewards/rejected": -1.50390625, + "step": 455 + }, + { + "epoch": 0.504424778761062, + "grad_norm": 13.986113548278809, + "learning_rate": 2.5985974788275374e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.203125, + "logps/chosen": -230.5, + "logps/rejected": -267.0, + "loss": 0.3423, + "rewards/accuracies": 0.765625, + "rewards/chosen": 1.1640625, + "rewards/margins": 2.8515625, + "rewards/rejected": -1.6875, + "step": 456 + }, + { + "epoch": 0.5055309734513275, + "grad_norm": 14.71418285369873, + "learning_rate": 2.5896381070338933e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.16796875, + "logps/chosen": -274.0, + "logps/rejected": -273.0, + "loss": 0.3394, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.775390625, + "rewards/margins": 2.19921875, + "rewards/rejected": -1.421875, + "step": 457 + }, + { + "epoch": 0.5066371681415929, + "grad_norm": 13.852214813232422, + "learning_rate": 2.5806775823612076e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.203125, + "logps/chosen": -244.0, + "logps/rejected": -284.0, + "loss": 0.3206, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.888671875, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.62109375, + "step": 458 + }, + { + "epoch": 0.5077433628318584, + "grad_norm": 13.243755340576172, + "learning_rate": 2.5717160200551213e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.0546875, + "logps/chosen": -242.0, + "logps/rejected": -268.0, + "loss": 0.3353, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.626953125, + "rewards/margins": 2.453125, + "rewards/rejected": -1.828125, + "step": 459 + }, + { + "epoch": 0.5088495575221239, + "grad_norm": 13.583708763122559, + "learning_rate": 2.562753535374621e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.1015625, + "logps/chosen": -244.5, + "logps/rejected": -266.5, + "loss": 0.3068, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.912109375, + "rewards/margins": 2.515625, + "rewards/rejected": -1.6015625, + "step": 460 + }, + { + "epoch": 0.5099557522123894, + "grad_norm": 14.627636909484863, + "learning_rate": 2.553790243590556e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.21875, + "logps/chosen": -230.0, + "logps/rejected": -265.0, + "loss": 0.3564, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.998046875, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.40234375, + "step": 461 + }, + { + "epoch": 0.5110619469026548, + "grad_norm": 16.162841796875, + "learning_rate": 2.5448262599841556e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -1.23046875, + "logps/chosen": -256.5, + "logps/rejected": -280.0, + "loss": 0.3297, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.83984375, + "rewards/margins": 2.453125, + "rewards/rejected": -1.6171875, + "step": 462 + }, + { + "epoch": 0.5121681415929203, + "grad_norm": 13.07744026184082, + "learning_rate": 2.535861699845549e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.1796875, + "logps/chosen": -244.5, + "logps/rejected": -279.0, + "loss": 0.3268, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.818359375, + "rewards/margins": 2.5, + "rewards/rejected": -1.68359375, + "step": 463 + }, + { + "epoch": 0.5132743362831859, + "grad_norm": 13.961365699768066, + "learning_rate": 2.526896678472279e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.12109375, + "logps/chosen": -267.0, + "logps/rejected": -273.0, + "loss": 0.3112, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.732421875, + "rewards/margins": 2.671875, + "rewards/rejected": -1.94140625, + "step": 464 + }, + { + "epoch": 0.5143805309734514, + "grad_norm": 12.809479713439941, + "learning_rate": 2.51793131116782e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.15625, + "logps/chosen": -223.0, + "logps/rejected": -244.5, + "loss": 0.3162, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.744140625, + "rewards/margins": 2.7109375, + "rewards/rejected": -1.9765625, + "step": 465 + }, + { + "epoch": 0.5154867256637168, + "grad_norm": 12.547861099243164, + "learning_rate": 2.5089657132400964e-07, + "logits/chosen": -1.14453125, + "logits/rejected": -1.08984375, + "logps/chosen": -261.0, + "logps/rejected": -273.0, + "loss": 0.2895, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.85546875, + "rewards/margins": 2.6796875, + "rewards/rejected": -1.828125, + "step": 466 + }, + { + "epoch": 0.5165929203539823, + "grad_norm": 13.852119445800781, + "learning_rate": 2.5e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.18359375, + "logps/chosen": -256.5, + "logps/rejected": -294.0, + "loss": 0.3104, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.8046875, + "rewards/margins": 2.6875, + "rewards/rejected": -1.8828125, + "step": 467 + }, + { + "epoch": 0.5176991150442478, + "grad_norm": 12.272007942199707, + "learning_rate": 2.491034286759903e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.14453125, + "logps/chosen": -254.0, + "logps/rejected": -284.0, + "loss": 0.2965, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.87109375, + "rewards/margins": 2.75, + "rewards/rejected": -1.87109375, + "step": 468 + }, + { + "epoch": 0.5188053097345132, + "grad_norm": 13.289767265319824, + "learning_rate": 2.482068688832181e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.13671875, + "logps/chosen": -236.5, + "logps/rejected": -258.0, + "loss": 0.3045, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.90234375, + "rewards/margins": 2.734375, + "rewards/rejected": -1.82421875, + "step": 469 + }, + { + "epoch": 0.5199115044247787, + "grad_norm": 13.52807903289795, + "learning_rate": 2.4731033215277213e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.17578125, + "logps/chosen": -251.5, + "logps/rejected": -285.0, + "loss": 0.3189, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.828125, + "rewards/margins": 2.640625, + "rewards/rejected": -1.8203125, + "step": 470 + }, + { + "epoch": 0.5210176991150443, + "grad_norm": 14.39229679107666, + "learning_rate": 2.464138300154451e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.1640625, + "logps/chosen": -254.5, + "logps/rejected": -280.0, + "loss": 0.3246, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.787109375, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.81640625, + "step": 471 + }, + { + "epoch": 0.5221238938053098, + "grad_norm": 14.51116943359375, + "learning_rate": 2.455173740015845e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.12109375, + "logps/chosen": -246.0, + "logps/rejected": -269.5, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.65234375, + "rewards/margins": 2.35546875, + "rewards/rejected": -1.70703125, + "step": 472 + }, + { + "epoch": 0.5232300884955752, + "grad_norm": 14.468950271606445, + "learning_rate": 2.4462097564094445e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.2421875, + "logps/chosen": -250.0, + "logps/rejected": -296.0, + "loss": 0.3396, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.703125, + "rewards/margins": 2.546875, + "rewards/rejected": -1.83984375, + "step": 473 + }, + { + "epoch": 0.5243362831858407, + "grad_norm": 12.03284740447998, + "learning_rate": 2.4372464646253794e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.09765625, + "logps/chosen": -255.0, + "logps/rejected": -277.0, + "loss": 0.266, + "rewards/accuracies": 0.859375, + "rewards/chosen": 1.017578125, + "rewards/margins": 2.96875, + "rewards/rejected": -1.9453125, + "step": 474 + }, + { + "epoch": 0.5254424778761062, + "grad_norm": 19.379676818847656, + "learning_rate": 2.4282839799448785e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.17578125, + "logps/chosen": -277.0, + "logps/rejected": -316.0, + "loss": 0.3512, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.763671875, + "rewards/margins": 2.7578125, + "rewards/rejected": -1.99609375, + "step": 475 + }, + { + "epoch": 0.5265486725663717, + "grad_norm": 13.170727729797363, + "learning_rate": 2.419322417638792e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.1171875, + "logps/chosen": -255.5, + "logps/rejected": -274.0, + "loss": 0.3296, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.4990234375, + "rewards/margins": 2.421875, + "rewards/rejected": -1.9296875, + "step": 476 + }, + { + "epoch": 0.5276548672566371, + "grad_norm": 10.943991661071777, + "learning_rate": 2.410361892966107e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.11328125, + "logps/chosen": -223.5, + "logps/rejected": -244.0, + "loss": 0.2629, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.806640625, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.3125, + "step": 477 + }, + { + "epoch": 0.5287610619469026, + "grad_norm": 14.212424278259277, + "learning_rate": 2.401402521172463e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.2109375, + "logps/chosen": -249.5, + "logps/rejected": -274.0, + "loss": 0.3509, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.65625, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.75, + "step": 478 + }, + { + "epoch": 0.5298672566371682, + "grad_norm": 11.761503219604492, + "learning_rate": 2.392444417488673e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.234375, + "logps/chosen": -234.5, + "logps/rejected": -278.0, + "loss": 0.2504, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.00390625, + "rewards/margins": 2.953125, + "rewards/rejected": -1.953125, + "step": 479 + }, + { + "epoch": 0.5309734513274337, + "grad_norm": 15.108154296875, + "learning_rate": 2.3834876971292433e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.2265625, + "logps/chosen": -285.0, + "logps/rejected": -303.0, + "loss": 0.3124, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.97265625, + "rewards/margins": 3.0546875, + "rewards/rejected": -2.0859375, + "step": 480 + }, + { + "epoch": 0.5320796460176991, + "grad_norm": 13.407039642333984, + "learning_rate": 2.3745324752908822e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.1796875, + "logps/chosen": -253.0, + "logps/rejected": -283.0, + "loss": 0.2827, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.830078125, + "rewards/margins": 2.71875, + "rewards/rejected": -1.890625, + "step": 481 + }, + { + "epoch": 0.5331858407079646, + "grad_norm": 13.764703750610352, + "learning_rate": 2.365578867151031e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.1640625, + "logps/chosen": -249.0, + "logps/rejected": -260.0, + "loss": 0.3393, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.607421875, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.2265625, + "step": 482 + }, + { + "epoch": 0.5342920353982301, + "grad_norm": 14.601144790649414, + "learning_rate": 2.3566269878663714e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.2109375, + "logps/chosen": -264.5, + "logps/rejected": -291.0, + "loss": 0.3486, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.587890625, + "rewards/margins": 2.3046875, + "rewards/rejected": -1.71484375, + "step": 483 + }, + { + "epoch": 0.5353982300884956, + "grad_norm": 13.12956714630127, + "learning_rate": 2.347676952571354e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.16015625, + "logps/chosen": -218.0, + "logps/rejected": -243.0, + "loss": 0.3522, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.751953125, + "rewards/margins": 2.7265625, + "rewards/rejected": -1.9765625, + "step": 484 + }, + { + "epoch": 0.536504424778761, + "grad_norm": 13.433536529541016, + "learning_rate": 2.3387288763767095e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.0703125, + "logps/chosen": -266.0, + "logps/rejected": -267.0, + "loss": 0.3058, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.6484375, + "rewards/margins": 2.7734375, + "rewards/rejected": -2.125, + "step": 485 + }, + { + "epoch": 0.5376106194690266, + "grad_norm": 13.798343658447266, + "learning_rate": 2.329782874367973e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.125, + "logps/chosen": -250.5, + "logps/rejected": -260.0, + "loss": 0.2997, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.568359375, + "rewards/margins": 2.703125, + "rewards/rejected": -2.125, + "step": 486 + }, + { + "epoch": 0.5387168141592921, + "grad_norm": 13.606473922729492, + "learning_rate": 2.3208390616040025e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.20703125, + "logps/chosen": -265.5, + "logps/rejected": -323.0, + "loss": 0.3473, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.7421875, + "rewards/margins": 2.4609375, + "rewards/rejected": -1.71875, + "step": 487 + }, + { + "epoch": 0.5398230088495575, + "grad_norm": 14.441394805908203, + "learning_rate": 2.3118975531155003e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.14453125, + "logps/chosen": -257.5, + "logps/rejected": -281.0, + "loss": 0.3566, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.501953125, + "rewards/margins": 2.21875, + "rewards/rejected": -1.71875, + "step": 488 + }, + { + "epoch": 0.540929203539823, + "grad_norm": 13.99435043334961, + "learning_rate": 2.3029584639035284e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.16796875, + "logps/chosen": -251.0, + "logps/rejected": -293.0, + "loss": 0.3419, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.701171875, + "rewards/margins": 2.5234375, + "rewards/rejected": -1.828125, + "step": 489 + }, + { + "epoch": 0.5420353982300885, + "grad_norm": 12.558284759521484, + "learning_rate": 2.294021908938039e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.05859375, + "logps/chosen": -243.0, + "logps/rejected": -249.5, + "loss": 0.2931, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.7470703125, + "rewards/margins": 2.9140625, + "rewards/rejected": -2.1640625, + "step": 490 + }, + { + "epoch": 0.543141592920354, + "grad_norm": 13.195667266845703, + "learning_rate": 2.285088003156384e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.0625, + "logps/chosen": -268.0, + "logps/rejected": -308.0, + "loss": 0.33, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.669921875, + "rewards/margins": 2.5234375, + "rewards/rejected": -1.8515625, + "step": 491 + }, + { + "epoch": 0.5442477876106194, + "grad_norm": 13.966536521911621, + "learning_rate": 2.2761568614618472e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.3203125, + "logps/chosen": -250.5, + "logps/rejected": -266.0, + "loss": 0.3732, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.763671875, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.5546875, + "step": 492 + }, + { + "epoch": 0.5453539823008849, + "grad_norm": 13.840253829956055, + "learning_rate": 2.2672285987221625e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.16015625, + "logps/chosen": -263.0, + "logps/rejected": -279.5, + "loss": 0.3326, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.62109375, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.7890625, + "step": 493 + }, + { + "epoch": 0.5464601769911505, + "grad_norm": 13.471453666687012, + "learning_rate": 2.2583033297680315e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.06640625, + "logps/chosen": -274.0, + "logps/rejected": -307.0, + "loss": 0.3214, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.6025390625, + "rewards/margins": 2.6953125, + "rewards/rejected": -2.08984375, + "step": 494 + }, + { + "epoch": 0.547566371681416, + "grad_norm": 13.170902252197266, + "learning_rate": 2.2493811693916567e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -1.13671875, + "logps/chosen": -255.5, + "logps/rejected": -285.0, + "loss": 0.2704, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.822265625, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.91796875, + "step": 495 + }, + { + "epoch": 0.5486725663716814, + "grad_norm": 14.177343368530273, + "learning_rate": 2.2404622323452562e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.171875, + "logps/chosen": -238.0, + "logps/rejected": -284.0, + "loss": 0.3352, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.916015625, + "rewards/margins": 2.515625, + "rewards/rejected": -1.59765625, + "step": 496 + }, + { + "epoch": 0.5497787610619469, + "grad_norm": 10.760467529296875, + "learning_rate": 2.2315466333395924e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.19921875, + "logps/chosen": -222.5, + "logps/rejected": -285.0, + "loss": 0.2274, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.1875, + "rewards/margins": 3.3515625, + "rewards/rejected": -2.1640625, + "step": 497 + }, + { + "epoch": 0.5508849557522124, + "grad_norm": 15.023584365844727, + "learning_rate": 2.222634487042496e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.2109375, + "logps/chosen": -256.5, + "logps/rejected": -279.5, + "loss": 0.3246, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.76953125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.75, + "step": 498 + }, + { + "epoch": 0.5519911504424779, + "grad_norm": 12.802349090576172, + "learning_rate": 2.2137259080773896e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.21875, + "logps/chosen": -246.5, + "logps/rejected": -256.0, + "loss": 0.2918, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 0.83203125, + "rewards/margins": 2.8203125, + "rewards/rejected": -1.98828125, + "step": 499 + }, + { + "epoch": 0.5530973451327433, + "grad_norm": 13.720869064331055, + "learning_rate": 2.204821011021815e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.09765625, + "logps/chosen": -242.0, + "logps/rejected": -278.5, + "loss": 0.3394, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.6025390625, + "rewards/margins": 2.28125, + "rewards/rejected": -1.6796875, + "step": 500 + }, + { + "epoch": 0.5530973451327433, + "eval_logits/chosen": -1.2517879009246826, + "eval_logits/rejected": -1.1513915061950684, + "eval_logps/chosen": -252.49253845214844, + "eval_logps/rejected": -277.5074768066406, + "eval_loss": 0.3277411162853241, + "eval_rewards/accuracies": 0.8062752485275269, + "eval_rewards/chosen": 0.83104008436203, + "eval_rewards/margins": 2.661613702774048, + "eval_rewards/rejected": -1.8305736780166626, + "eval_runtime": 193.0734, + "eval_samples_per_second": 66.571, + "eval_steps_per_second": 1.041, + "step": 500 + }, + { + "epoch": 0.5542035398230089, + "grad_norm": 13.983589172363281, + "learning_rate": 2.195919910405961e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.0546875, + "logps/chosen": -243.5, + "logps/rejected": -268.0, + "loss": 0.3415, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.810546875, + "rewards/margins": 2.7109375, + "rewards/rejected": -1.89453125, + "step": 501 + }, + { + "epoch": 0.5553097345132744, + "grad_norm": 13.887030601501465, + "learning_rate": 2.1870227207111853e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.0703125, + "logps/chosen": -271.0, + "logps/rejected": -282.0, + "loss": 0.3074, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.68359375, + "rewards/margins": 2.65625, + "rewards/rejected": -1.9765625, + "step": 502 + }, + { + "epoch": 0.5564159292035398, + "grad_norm": 13.680130004882812, + "learning_rate": 2.1781295563685476e-07, + "logits/chosen": -1.15625, + "logits/rejected": -1.009765625, + "logps/chosen": -280.0, + "logps/rejected": -288.0, + "loss": 0.3024, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8046875, + "rewards/margins": 2.8828125, + "rewards/rejected": -2.0859375, + "step": 503 + }, + { + "epoch": 0.5575221238938053, + "grad_norm": 14.351888656616211, + "learning_rate": 2.1692405317573366e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.16796875, + "logps/chosen": -257.0, + "logps/rejected": -259.5, + "loss": 0.3655, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.689453125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.82421875, + "step": 504 + }, + { + "epoch": 0.5586283185840708, + "grad_norm": 13.825343132019043, + "learning_rate": 2.1603557612035932e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.1875, + "logps/chosen": -274.0, + "logps/rejected": -302.0, + "loss": 0.2957, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.046875, + "rewards/margins": 2.78125, + "rewards/rejected": -1.73828125, + "step": 505 + }, + { + "epoch": 0.5597345132743363, + "grad_norm": 15.617433547973633, + "learning_rate": 2.1514753589786516e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.1484375, + "logps/chosen": -257.5, + "logps/rejected": -283.0, + "loss": 0.3659, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.775390625, + "rewards/margins": 2.359375, + "rewards/rejected": -1.578125, + "step": 506 + }, + { + "epoch": 0.5608407079646017, + "grad_norm": 12.907238960266113, + "learning_rate": 2.1425994392976559e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.1875, + "logps/chosen": -253.5, + "logps/rejected": -284.0, + "loss": 0.3268, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.8828125, + "rewards/margins": 2.875, + "rewards/rejected": -1.98828125, + "step": 507 + }, + { + "epoch": 0.5619469026548672, + "grad_norm": 12.990286827087402, + "learning_rate": 2.1337281163181034e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.16015625, + "logps/chosen": -280.0, + "logps/rejected": -278.0, + "loss": 0.2744, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.853515625, + "rewards/margins": 2.7265625, + "rewards/rejected": -1.875, + "step": 508 + }, + { + "epoch": 0.5630530973451328, + "grad_norm": 10.47897720336914, + "learning_rate": 2.1248615041383682e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.1328125, + "logps/chosen": -234.5, + "logps/rejected": -279.5, + "loss": 0.2341, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.822265625, + "rewards/margins": 3.046875, + "rewards/rejected": -2.21875, + "step": 509 + }, + { + "epoch": 0.5641592920353983, + "grad_norm": 12.620576858520508, + "learning_rate": 2.1159997167962378e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.078125, + "logps/chosen": -229.5, + "logps/rejected": -265.0, + "loss": 0.3217, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.779296875, + "rewards/margins": 2.5625, + "rewards/rejected": -1.78515625, + "step": 510 + }, + { + "epoch": 0.5652654867256637, + "grad_norm": 14.882222175598145, + "learning_rate": 2.1071428682674436e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.140625, + "logps/chosen": -258.0, + "logps/rejected": -293.0, + "loss": 0.3461, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 1.03125, + "rewards/margins": 2.65625, + "rewards/rejected": -1.625, + "step": 511 + }, + { + "epoch": 0.5663716814159292, + "grad_norm": 14.687088966369629, + "learning_rate": 2.098291072464199e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.21875, + "logps/chosen": -250.5, + "logps/rejected": -307.0, + "loss": 0.3347, + "rewards/accuracies": 0.734375, + "rewards/chosen": 1.025390625, + "rewards/margins": 2.8203125, + "rewards/rejected": -1.796875, + "step": 512 + }, + { + "epoch": 0.5674778761061947, + "grad_norm": 13.261859893798828, + "learning_rate": 2.0894444432337282e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.18359375, + "logps/chosen": -252.5, + "logps/rejected": -263.0, + "loss": 0.2804, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.830078125, + "rewards/margins": 2.796875, + "rewards/rejected": -1.96875, + "step": 513 + }, + { + "epoch": 0.5685840707964602, + "grad_norm": 15.263360977172852, + "learning_rate": 2.08060309435681e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.21484375, + "logps/chosen": -267.0, + "logps/rejected": -302.0, + "loss": 0.3238, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.931640625, + "rewards/margins": 2.7734375, + "rewards/rejected": -1.84375, + "step": 514 + }, + { + "epoch": 0.5696902654867256, + "grad_norm": 13.30583667755127, + "learning_rate": 2.071767139546306e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.09375, + "logps/chosen": -253.5, + "logps/rejected": -301.0, + "loss": 0.3201, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.810546875, + "rewards/margins": 2.609375, + "rewards/rejected": -1.796875, + "step": 515 + }, + { + "epoch": 0.5707964601769911, + "grad_norm": 13.454291343688965, + "learning_rate": 2.062936692445705e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.125, + "logps/chosen": -244.0, + "logps/rejected": -285.0, + "loss": 0.3014, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.8046875, + "rewards/margins": 2.6953125, + "rewards/rejected": -1.8984375, + "step": 516 + }, + { + "epoch": 0.5719026548672567, + "grad_norm": 14.328228950500488, + "learning_rate": 2.0541118666276577e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.17578125, + "logps/chosen": -261.0, + "logps/rejected": -314.0, + "loss": 0.331, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.90625, + "rewards/margins": 2.609375, + "rewards/rejected": -1.70703125, + "step": 517 + }, + { + "epoch": 0.5730088495575221, + "grad_norm": 15.810956001281738, + "learning_rate": 2.045292775592515e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.1875, + "logps/chosen": -258.5, + "logps/rejected": -287.0, + "loss": 0.3654, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.828125, + "rewards/margins": 2.765625, + "rewards/rejected": -1.9375, + "step": 518 + }, + { + "epoch": 0.5741150442477876, + "grad_norm": 14.267369270324707, + "learning_rate": 2.0364795327668722e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.078125, + "logps/chosen": -290.0, + "logps/rejected": -282.0, + "loss": 0.3298, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.5419921875, + "rewards/margins": 2.453125, + "rewards/rejected": -1.9140625, + "step": 519 + }, + { + "epoch": 0.5752212389380531, + "grad_norm": 17.044023513793945, + "learning_rate": 2.0276722515021084e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.15625, + "logps/chosen": -254.5, + "logps/rejected": -288.0, + "loss": 0.4207, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.66015625, + "rewards/margins": 2.12890625, + "rewards/rejected": -1.46875, + "step": 520 + }, + { + "epoch": 0.5763274336283186, + "grad_norm": 14.049108505249023, + "learning_rate": 2.0188710450729253e-07, + "logits/chosen": -1.19140625, + "logits/rejected": -1.21875, + "logps/chosen": -235.5, + "logps/rejected": -283.0, + "loss": 0.3075, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.232421875, + "rewards/margins": 3.1640625, + "rewards/rejected": -1.9375, + "step": 521 + }, + { + "epoch": 0.577433628318584, + "grad_norm": 14.81714153289795, + "learning_rate": 2.0100760266758953e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.08984375, + "logps/chosen": -253.5, + "logps/rejected": -244.5, + "loss": 0.3601, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.3740234375, + "rewards/margins": 2.3671875, + "rewards/rejected": -2.0, + "step": 522 + }, + { + "epoch": 0.5785398230088495, + "grad_norm": 12.647814750671387, + "learning_rate": 2.0012873094280032e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.140625, + "logps/chosen": -254.5, + "logps/rejected": -297.0, + "loss": 0.2831, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.060546875, + "rewards/margins": 3.125, + "rewards/rejected": -2.0625, + "step": 523 + }, + { + "epoch": 0.5796460176991151, + "grad_norm": 13.28409481048584, + "learning_rate": 1.992505006365191e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.11328125, + "logps/chosen": -268.0, + "logps/rejected": -295.0, + "loss": 0.2986, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.8828125, + "rewards/margins": 2.859375, + "rewards/rejected": -1.96875, + "step": 524 + }, + { + "epoch": 0.5807522123893806, + "grad_norm": 12.828714370727539, + "learning_rate": 1.983729230440907e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.26171875, + "logps/chosen": -241.5, + "logps/rejected": -292.0, + "loss": 0.2867, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.021484375, + "rewards/margins": 3.0703125, + "rewards/rejected": -2.0546875, + "step": 525 + }, + { + "epoch": 0.581858407079646, + "grad_norm": 13.270194053649902, + "learning_rate": 1.974960094524647e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.13671875, + "logps/chosen": -252.5, + "logps/rejected": -284.0, + "loss": 0.3036, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.837890625, + "rewards/margins": 2.71875, + "rewards/rejected": -1.87890625, + "step": 526 + }, + { + "epoch": 0.5829646017699115, + "grad_norm": 14.76196575164795, + "learning_rate": 1.9661977114005095e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.09375, + "logps/chosen": -266.0, + "logps/rejected": -282.0, + "loss": 0.3643, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.83984375, + "rewards/margins": 2.65625, + "rewards/rejected": -1.81640625, + "step": 527 + }, + { + "epoch": 0.584070796460177, + "grad_norm": 13.863215446472168, + "learning_rate": 1.9574421937657423e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.171875, + "logps/chosen": -261.0, + "logps/rejected": -295.0, + "loss": 0.2729, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.02734375, + "rewards/margins": 3.078125, + "rewards/rejected": -2.0546875, + "step": 528 + }, + { + "epoch": 0.5851769911504425, + "grad_norm": 17.09484100341797, + "learning_rate": 1.9486936542292897e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.1484375, + "logps/chosen": -281.0, + "logps/rejected": -284.0, + "loss": 0.4144, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.681640625, + "rewards/margins": 2.203125, + "rewards/rejected": -1.5234375, + "step": 529 + }, + { + "epoch": 0.5862831858407079, + "grad_norm": 13.073324203491211, + "learning_rate": 1.9399522053103512e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.16015625, + "logps/chosen": -256.5, + "logps/rejected": -272.5, + "loss": 0.3109, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.90234375, + "rewards/margins": 2.734375, + "rewards/rejected": -1.83984375, + "step": 530 + }, + { + "epoch": 0.5873893805309734, + "grad_norm": 16.40873908996582, + "learning_rate": 1.9312179594369267e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.1015625, + "logps/chosen": -270.0, + "logps/rejected": -298.0, + "loss": 0.3547, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.759765625, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.75, + "step": 531 + }, + { + "epoch": 0.588495575221239, + "grad_norm": 13.864864349365234, + "learning_rate": 1.9224910289443766e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.13671875, + "logps/chosen": -233.5, + "logps/rejected": -259.0, + "loss": 0.367, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.880859375, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.6015625, + "step": 532 + }, + { + "epoch": 0.5896017699115044, + "grad_norm": 11.59555721282959, + "learning_rate": 1.913771526073976e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.12890625, + "logps/chosen": -254.0, + "logps/rejected": -295.0, + "loss": 0.2546, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.154296875, + "rewards/margins": 2.9140625, + "rewards/rejected": -1.75390625, + "step": 533 + }, + { + "epoch": 0.5907079646017699, + "grad_norm": 559.9500122070312, + "learning_rate": 1.9050595629714654e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -0.99609375, + "logps/chosen": -270.0, + "logps/rejected": -344.0, + "loss": 0.3657, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.978515625, + "rewards/margins": 2.65625, + "rewards/rejected": -1.671875, + "step": 534 + }, + { + "epoch": 0.5918141592920354, + "grad_norm": 12.260162353515625, + "learning_rate": 1.8963552516856158e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.16015625, + "logps/chosen": -242.0, + "logps/rejected": -265.5, + "loss": 0.2995, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.09375, + "rewards/margins": 3.015625, + "rewards/rejected": -1.921875, + "step": 535 + }, + { + "epoch": 0.5929203539823009, + "grad_norm": 15.339058876037598, + "learning_rate": 1.8876587041667852e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.20703125, + "logps/chosen": -241.0, + "logps/rejected": -265.0, + "loss": 0.3577, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.865234375, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.52734375, + "step": 536 + }, + { + "epoch": 0.5940265486725663, + "grad_norm": 16.724506378173828, + "learning_rate": 1.8789700322654747e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.046875, + "logps/chosen": -251.0, + "logps/rejected": -289.0, + "loss": 0.2921, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.802734375, + "rewards/margins": 2.90625, + "rewards/rejected": -2.1015625, + "step": 537 + }, + { + "epoch": 0.5951327433628318, + "grad_norm": 14.673587799072266, + "learning_rate": 1.8702893477308972e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.078125, + "logps/chosen": -253.0, + "logps/rejected": -261.0, + "loss": 0.3511, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.869140625, + "rewards/margins": 2.671875, + "rewards/rejected": -1.8046875, + "step": 538 + }, + { + "epoch": 0.5962389380530974, + "grad_norm": 15.79550552368164, + "learning_rate": 1.8616167622095324e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.12109375, + "logps/chosen": -267.0, + "logps/rejected": -313.0, + "loss": 0.3384, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.021484375, + "rewards/margins": 2.671875, + "rewards/rejected": -1.6484375, + "step": 539 + }, + { + "epoch": 0.5973451327433629, + "grad_norm": 14.441572189331055, + "learning_rate": 1.8529523872436977e-07, + "logits/chosen": -1.37890625, + "logits/rejected": -1.1640625, + "logps/chosen": -249.5, + "logps/rejected": -272.0, + "loss": 0.3185, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.921875, + "rewards/margins": 2.671875, + "rewards/rejected": -1.75, + "step": 540 + }, + { + "epoch": 0.5984513274336283, + "grad_norm": 13.769876480102539, + "learning_rate": 1.8442963342701105e-07, + "logits/chosen": -1.14453125, + "logits/rejected": -1.1875, + "logps/chosen": -277.0, + "logps/rejected": -275.0, + "loss": 0.2794, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.994140625, + "rewards/margins": 2.9375, + "rewards/rejected": -1.9453125, + "step": 541 + }, + { + "epoch": 0.5995575221238938, + "grad_norm": 14.039227485656738, + "learning_rate": 1.8356487146184516e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.20703125, + "logps/chosen": -234.0, + "logps/rejected": -252.5, + "loss": 0.3448, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.119140625, + "rewards/margins": 2.78125, + "rewards/rejected": -1.6640625, + "step": 542 + }, + { + "epoch": 0.6006637168141593, + "grad_norm": 12.930294036865234, + "learning_rate": 1.8270096395099403e-07, + "logits/chosen": -1.31640625, + "logits/rejected": -1.171875, + "logps/chosen": -242.5, + "logps/rejected": -273.0, + "loss": 0.298, + "rewards/accuracies": 0.859375, + "rewards/chosen": 1.0859375, + "rewards/margins": 2.703125, + "rewards/rejected": -1.6171875, + "step": 543 + }, + { + "epoch": 0.6017699115044248, + "grad_norm": 13.82011890411377, + "learning_rate": 1.8183792200559e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.1640625, + "logps/chosen": -256.0, + "logps/rejected": -287.0, + "loss": 0.3418, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.943359375, + "rewards/margins": 2.359375, + "rewards/rejected": -1.4140625, + "step": 544 + }, + { + "epoch": 0.6028761061946902, + "grad_norm": 15.775805473327637, + "learning_rate": 1.8097575672563275e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.15625, + "logps/chosen": -249.0, + "logps/rejected": -275.0, + "loss": 0.2854, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.1875, + "rewards/margins": 3.1171875, + "rewards/rejected": -1.92578125, + "step": 545 + }, + { + "epoch": 0.6039823008849557, + "grad_norm": 12.980159759521484, + "learning_rate": 1.80114479199847e-07, + "logits/chosen": -1.1171875, + "logits/rejected": -1.140625, + "logps/chosen": -263.0, + "logps/rejected": -277.0, + "loss": 0.2444, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.23046875, + "rewards/margins": 3.28125, + "rewards/rejected": -2.0546875, + "step": 546 + }, + { + "epoch": 0.6050884955752213, + "grad_norm": 13.192628860473633, + "learning_rate": 1.792541005055394e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.22265625, + "logps/chosen": -254.0, + "logps/rejected": -286.0, + "loss": 0.3065, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.138671875, + "rewards/margins": 2.8984375, + "rewards/rejected": -1.7578125, + "step": 547 + }, + { + "epoch": 0.6061946902654868, + "grad_norm": 13.52103328704834, + "learning_rate": 1.783946317084564e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.08984375, + "logps/chosen": -253.0, + "logps/rejected": -269.0, + "loss": 0.2691, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.01953125, + "rewards/margins": 2.9765625, + "rewards/rejected": -1.953125, + "step": 548 + }, + { + "epoch": 0.6073008849557522, + "grad_norm": 12.777830123901367, + "learning_rate": 1.7753608386264193e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.20703125, + "logps/chosen": -225.0, + "logps/rejected": -274.0, + "loss": 0.3203, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.09765625, + "rewards/margins": 2.7578125, + "rewards/rejected": -1.66796875, + "step": 549 + }, + { + "epoch": 0.6084070796460177, + "grad_norm": 13.757856369018555, + "learning_rate": 1.7667846801029486e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.13671875, + "logps/chosen": -264.0, + "logps/rejected": -278.0, + "loss": 0.2789, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.126953125, + "rewards/margins": 2.640625, + "rewards/rejected": -1.515625, + "step": 550 + }, + { + "epoch": 0.6084070796460177, + "eval_logits/chosen": -1.262554407119751, + "eval_logits/rejected": -1.1598647832870483, + "eval_logps/chosen": -250.73133850097656, + "eval_logps/rejected": -276.39801025390625, + "eval_loss": 0.32494351267814636, + "eval_rewards/accuracies": 0.8066800236701965, + "eval_rewards/chosen": 1.011232852935791, + "eval_rewards/margins": 2.7289724349975586, + "eval_rewards/rejected": -1.717836618423462, + "eval_runtime": 193.0762, + "eval_samples_per_second": 66.57, + "eval_steps_per_second": 1.041, + "step": 550 + }, + { + "epoch": 0.6095132743362832, + "grad_norm": 16.458438873291016, + "learning_rate": 1.758217951816274e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.15625, + "logps/chosen": -289.0, + "logps/rejected": -310.0, + "loss": 0.3871, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.57421875, + "rewards/margins": 2.15625, + "rewards/rejected": -1.5859375, + "step": 551 + }, + { + "epoch": 0.6106194690265486, + "grad_norm": 15.513572692871094, + "learning_rate": 1.7496607639472327e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.19921875, + "logps/chosen": -242.0, + "logps/rejected": -270.0, + "loss": 0.33, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.00390625, + "rewards/margins": 2.9375, + "rewards/rejected": -1.9375, + "step": 552 + }, + { + "epoch": 0.6117256637168141, + "grad_norm": 15.365250587463379, + "learning_rate": 1.7411132265539536e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.0625, + "logps/chosen": -250.0, + "logps/rejected": -297.0, + "loss": 0.3456, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.931640625, + "rewards/margins": 2.546875, + "rewards/rejected": -1.6171875, + "step": 553 + }, + { + "epoch": 0.6128318584070797, + "grad_norm": 14.896991729736328, + "learning_rate": 1.7325754495704507e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.1875, + "logps/chosen": -267.5, + "logps/rejected": -315.0, + "loss": 0.3605, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.787109375, + "rewards/margins": 2.3671875, + "rewards/rejected": -1.578125, + "step": 554 + }, + { + "epoch": 0.6139380530973452, + "grad_norm": 14.317460060119629, + "learning_rate": 1.7240475428051997e-07, + "logits/chosen": -1.34765625, + "logits/rejected": -1.1328125, + "logps/chosen": -247.0, + "logps/rejected": -268.0, + "loss": 0.3123, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.96875, + "rewards/margins": 2.75, + "rewards/rejected": -1.78515625, + "step": 555 + }, + { + "epoch": 0.6150442477876106, + "grad_norm": 13.919463157653809, + "learning_rate": 1.7155296159397356e-07, + "logits/chosen": -1.22265625, + "logits/rejected": -1.19140625, + "logps/chosen": -261.5, + "logps/rejected": -304.0, + "loss": 0.3188, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.88671875, + "rewards/margins": 2.7734375, + "rewards/rejected": -1.890625, + "step": 556 + }, + { + "epoch": 0.6161504424778761, + "grad_norm": 15.724615097045898, + "learning_rate": 1.707021778527235e-07, + "logits/chosen": -1.34765625, + "logits/rejected": -1.1953125, + "logps/chosen": -278.0, + "logps/rejected": -298.0, + "loss": 0.3411, + "rewards/accuracies": 0.78125, + "rewards/chosen": 1.0625, + "rewards/margins": 2.5234375, + "rewards/rejected": -1.4609375, + "step": 557 + }, + { + "epoch": 0.6172566371681416, + "grad_norm": 11.178709983825684, + "learning_rate": 1.6985241399911082e-07, + "logits/chosen": -1.41796875, + "logits/rejected": -1.21484375, + "logps/chosen": -234.5, + "logps/rejected": -259.0, + "loss": 0.2349, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.91015625, + "rewards/margins": 3.2265625, + "rewards/rejected": -2.3125, + "step": 558 + }, + { + "epoch": 0.6183628318584071, + "grad_norm": 13.019828796386719, + "learning_rate": 1.6900368096235931e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.1953125, + "logps/chosen": -227.5, + "logps/rejected": -288.0, + "loss": 0.3063, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.208984375, + "rewards/margins": 3.265625, + "rewards/rejected": -2.05078125, + "step": 559 + }, + { + "epoch": 0.6194690265486725, + "grad_norm": 15.409528732299805, + "learning_rate": 1.6815598965843519e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.15625, + "logps/chosen": -266.0, + "logps/rejected": -326.0, + "loss": 0.2972, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.12109375, + "rewards/margins": 3.1875, + "rewards/rejected": -2.0703125, + "step": 560 + }, + { + "epoch": 0.620575221238938, + "grad_norm": 13.341466903686523, + "learning_rate": 1.67309350989906e-07, + "logits/chosen": -1.37109375, + "logits/rejected": -1.09765625, + "logps/chosen": -257.0, + "logps/rejected": -259.5, + "loss": 0.3021, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.78125, + "rewards/margins": 2.65625, + "rewards/rejected": -1.875, + "step": 561 + }, + { + "epoch": 0.6216814159292036, + "grad_norm": 13.608122825622559, + "learning_rate": 1.664637758458013e-07, + "logits/chosen": -1.3828125, + "logits/rejected": -1.078125, + "logps/chosen": -248.5, + "logps/rejected": -238.5, + "loss": 0.3346, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.4716796875, + "rewards/margins": 2.3359375, + "rewards/rejected": -1.86328125, + "step": 562 + }, + { + "epoch": 0.6227876106194691, + "grad_norm": 14.688355445861816, + "learning_rate": 1.656192751014717e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.13671875, + "logps/chosen": -266.0, + "logps/rejected": -298.0, + "loss": 0.3522, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.90234375, + "rewards/margins": 2.578125, + "rewards/rejected": -1.67578125, + "step": 563 + }, + { + "epoch": 0.6238938053097345, + "grad_norm": 13.295293807983398, + "learning_rate": 1.647758596184498e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.17578125, + "logps/chosen": -262.5, + "logps/rejected": -287.0, + "loss": 0.3039, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.849609375, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.64453125, + "step": 564 + }, + { + "epoch": 0.625, + "grad_norm": 12.697134017944336, + "learning_rate": 1.6393354024431e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.05078125, + "logps/chosen": -257.5, + "logps/rejected": -265.0, + "loss": 0.2807, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.87109375, + "rewards/margins": 3.015625, + "rewards/rejected": -2.14453125, + "step": 565 + }, + { + "epoch": 0.6261061946902655, + "grad_norm": 23.94145965576172, + "learning_rate": 1.63092327812529e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.2734375, + "logps/chosen": -246.5, + "logps/rejected": -232.5, + "loss": 0.3647, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.576171875, + "rewards/margins": 2.421875, + "rewards/rejected": -1.84375, + "step": 566 + }, + { + "epoch": 0.6272123893805309, + "grad_norm": 13.088120460510254, + "learning_rate": 1.622522331423467e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.1640625, + "logps/chosen": -261.0, + "logps/rejected": -307.0, + "loss": 0.307, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.875, + "rewards/margins": 2.8046875, + "rewards/rejected": -1.92578125, + "step": 567 + }, + { + "epoch": 0.6283185840707964, + "grad_norm": 16.868358612060547, + "learning_rate": 1.6141326703862706e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.24609375, + "logps/chosen": -260.0, + "logps/rejected": -292.0, + "loss": 0.4, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.720703125, + "rewards/margins": 2.5703125, + "rewards/rejected": -1.8515625, + "step": 568 + }, + { + "epoch": 0.629424778761062, + "grad_norm": 13.88227653503418, + "learning_rate": 1.605754402917186e-07, + "logits/chosen": -1.43359375, + "logits/rejected": -1.28125, + "logps/chosen": -245.0, + "logps/rejected": -266.5, + "loss": 0.2863, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.96484375, + "rewards/margins": 2.9296875, + "rewards/rejected": -1.96875, + "step": 569 + }, + { + "epoch": 0.6305309734513275, + "grad_norm": 13.3289794921875, + "learning_rate": 1.5973876367731651e-07, + "logits/chosen": -1.35546875, + "logits/rejected": -1.08203125, + "logps/chosen": -280.0, + "logps/rejected": -306.0, + "loss": 0.2719, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.14453125, + "rewards/margins": 3.078125, + "rewards/rejected": -1.93359375, + "step": 570 + }, + { + "epoch": 0.6316371681415929, + "grad_norm": 13.53636360168457, + "learning_rate": 1.5890324795632315e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -1.28515625, + "logps/chosen": -223.0, + "logps/rejected": -262.0, + "loss": 0.3, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.017578125, + "rewards/margins": 2.90625, + "rewards/rejected": -1.89453125, + "step": 571 + }, + { + "epoch": 0.6327433628318584, + "grad_norm": 14.343642234802246, + "learning_rate": 1.5806890387471023e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.12109375, + "logps/chosen": -267.0, + "logps/rejected": -291.0, + "loss": 0.2824, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9453125, + "rewards/margins": 2.984375, + "rewards/rejected": -2.04296875, + "step": 572 + }, + { + "epoch": 0.6338495575221239, + "grad_norm": 13.577574729919434, + "learning_rate": 1.5723574216338065e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.17578125, + "logps/chosen": -272.0, + "logps/rejected": -276.0, + "loss": 0.2799, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9296875, + "rewards/margins": 2.8125, + "rewards/rejected": -1.8828125, + "step": 573 + }, + { + "epoch": 0.6349557522123894, + "grad_norm": 17.00868797302246, + "learning_rate": 1.5640377353802985e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.025390625, + "logps/chosen": -286.0, + "logps/rejected": -286.0, + "loss": 0.3574, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.73828125, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.859375, + "step": 574 + }, + { + "epoch": 0.6360619469026548, + "grad_norm": 14.934076309204102, + "learning_rate": 1.5557300869900874e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.09375, + "logps/chosen": -281.5, + "logps/rejected": -320.0, + "loss": 0.347, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.7509765625, + "rewards/margins": 2.47265625, + "rewards/rejected": -1.7265625, + "step": 575 + }, + { + "epoch": 0.6371681415929203, + "grad_norm": 14.064445495605469, + "learning_rate": 1.547434583311858e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.06640625, + "logps/chosen": -262.0, + "logps/rejected": -262.0, + "loss": 0.374, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.5078125, + "rewards/margins": 2.16796875, + "rewards/rejected": -1.65625, + "step": 576 + }, + { + "epoch": 0.6382743362831859, + "grad_norm": 14.291051864624023, + "learning_rate": 1.5391513310380923e-07, + "logits/chosen": -1.17578125, + "logits/rejected": -1.1484375, + "logps/chosen": -264.5, + "logps/rejected": -322.0, + "loss": 0.2885, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.8671875, + "rewards/margins": 2.8359375, + "rewards/rejected": -1.96484375, + "step": 577 + }, + { + "epoch": 0.6393805309734514, + "grad_norm": 15.690069198608398, + "learning_rate": 1.5308804367037049e-07, + "logits/chosen": -1.37890625, + "logits/rejected": -1.12109375, + "logps/chosen": -271.0, + "logps/rejected": -313.0, + "loss": 0.3193, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.921875, + "rewards/margins": 2.8125, + "rewards/rejected": -1.88671875, + "step": 578 + }, + { + "epoch": 0.6404867256637168, + "grad_norm": 14.940479278564453, + "learning_rate": 1.5226220066846662e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.12109375, + "logps/chosen": -277.0, + "logps/rejected": -313.0, + "loss": 0.317, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.912109375, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.828125, + "step": 579 + }, + { + "epoch": 0.6415929203539823, + "grad_norm": 14.204512596130371, + "learning_rate": 1.5143761471966387e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.1953125, + "logps/chosen": -267.0, + "logps/rejected": -296.0, + "loss": 0.2923, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.935546875, + "rewards/margins": 2.9453125, + "rewards/rejected": -2.01953125, + "step": 580 + }, + { + "epoch": 0.6426991150442478, + "grad_norm": 12.345739364624023, + "learning_rate": 1.5061429642936104e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.15625, + "logps/chosen": -238.5, + "logps/rejected": -271.0, + "loss": 0.2898, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.078125, + "rewards/margins": 3.0390625, + "rewards/rejected": -1.9609375, + "step": 581 + }, + { + "epoch": 0.6438053097345132, + "grad_norm": 14.108118057250977, + "learning_rate": 1.497922563866526e-07, + "logits/chosen": -1.25390625, + "logits/rejected": -1.25390625, + "logps/chosen": -225.5, + "logps/rejected": -276.0, + "loss": 0.3588, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.01171875, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.4921875, + "step": 582 + }, + { + "epoch": 0.6449115044247787, + "grad_norm": 15.642598152160645, + "learning_rate": 1.4897150516419315e-07, + "logits/chosen": -1.33984375, + "logits/rejected": -1.09765625, + "logps/chosen": -262.5, + "logps/rejected": -281.0, + "loss": 0.3357, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.962890625, + "rewards/margins": 2.671875, + "rewards/rejected": -1.70703125, + "step": 583 + }, + { + "epoch": 0.6460176991150443, + "grad_norm": 13.628485679626465, + "learning_rate": 1.481520533180611e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.16796875, + "logps/chosen": -245.0, + "logps/rejected": -250.0, + "loss": 0.2903, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8515625, + "rewards/margins": 2.8359375, + "rewards/rejected": -1.9921875, + "step": 584 + }, + { + "epoch": 0.6471238938053098, + "grad_norm": 12.115748405456543, + "learning_rate": 1.4733391138762275e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.21484375, + "logps/chosen": -237.0, + "logps/rejected": -255.5, + "loss": 0.2511, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.16796875, + "rewards/margins": 3.484375, + "rewards/rejected": -2.3125, + "step": 585 + }, + { + "epoch": 0.6482300884955752, + "grad_norm": 12.073527336120605, + "learning_rate": 1.4651708989539733e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.2265625, + "logps/chosen": -255.0, + "logps/rejected": -251.5, + "loss": 0.27, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.025390625, + "rewards/margins": 2.8671875, + "rewards/rejected": -1.84375, + "step": 586 + }, + { + "epoch": 0.6493362831858407, + "grad_norm": 15.416234016418457, + "learning_rate": 1.4570159934692084e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.234375, + "logps/chosen": -264.0, + "logps/rejected": -290.0, + "loss": 0.4144, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.69921875, + "rewards/margins": 2.203125, + "rewards/rejected": -1.50390625, + "step": 587 + }, + { + "epoch": 0.6504424778761062, + "grad_norm": 14.182881355285645, + "learning_rate": 1.448874502306116e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.0859375, + "logps/chosen": -262.5, + "logps/rejected": -280.0, + "loss": 0.3193, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.9921875, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.75390625, + "step": 588 + }, + { + "epoch": 0.6515486725663717, + "grad_norm": 14.108832359313965, + "learning_rate": 1.4407465301763532e-07, + "logits/chosen": -1.37890625, + "logits/rejected": -1.21875, + "logps/chosen": -249.0, + "logps/rejected": -257.5, + "loss": 0.355, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.75, + "rewards/margins": 2.734375, + "rewards/rejected": -1.98046875, + "step": 589 + }, + { + "epoch": 0.6526548672566371, + "grad_norm": 14.618428230285645, + "learning_rate": 1.432632181617698e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.02734375, + "logps/chosen": -243.0, + "logps/rejected": -281.0, + "loss": 0.313, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0078125, + "rewards/margins": 2.9296875, + "rewards/rejected": -1.92578125, + "step": 590 + }, + { + "epoch": 0.6537610619469026, + "grad_norm": 15.182010650634766, + "learning_rate": 1.4245315609927112e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.14453125, + "logps/chosen": -262.0, + "logps/rejected": -269.0, + "loss": 0.3443, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.732421875, + "rewards/margins": 2.6484375, + "rewards/rejected": -1.91015625, + "step": 591 + }, + { + "epoch": 0.6548672566371682, + "grad_norm": 14.359109878540039, + "learning_rate": 1.4164447724873933e-07, + "logits/chosen": -1.19140625, + "logits/rejected": -1.16015625, + "logps/chosen": -253.5, + "logps/rejected": -288.0, + "loss": 0.3191, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.93359375, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.6875, + "step": 592 + }, + { + "epoch": 0.6559734513274337, + "grad_norm": 14.253448486328125, + "learning_rate": 1.4083719201098402e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.2421875, + "logps/chosen": -251.5, + "logps/rejected": -288.0, + "loss": 0.3304, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.8203125, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.65625, + "step": 593 + }, + { + "epoch": 0.6570796460176991, + "grad_norm": 14.521297454833984, + "learning_rate": 1.400313107688912e-07, + "logits/chosen": -1.37109375, + "logits/rejected": -1.19140625, + "logps/chosen": -250.0, + "logps/rejected": -260.0, + "loss": 0.3297, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.982421875, + "rewards/margins": 2.859375, + "rewards/rejected": -1.875, + "step": 594 + }, + { + "epoch": 0.6581858407079646, + "grad_norm": 13.660658836364746, + "learning_rate": 1.39226843887289e-07, + "logits/chosen": -1.26171875, + "logits/rejected": -1.19140625, + "logps/chosen": -235.0, + "logps/rejected": -296.0, + "loss": 0.3347, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.96875, + "rewards/margins": 2.9609375, + "rewards/rejected": -1.9921875, + "step": 595 + }, + { + "epoch": 0.6592920353982301, + "grad_norm": 15.24756908416748, + "learning_rate": 1.384238017128152e-07, + "logits/chosen": -1.2734375, + "logits/rejected": -1.12109375, + "logps/chosen": -241.5, + "logps/rejected": -274.0, + "loss": 0.3958, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.884765625, + "rewards/margins": 2.6875, + "rewards/rejected": -1.80859375, + "step": 596 + }, + { + "epoch": 0.6603982300884956, + "grad_norm": 14.213970184326172, + "learning_rate": 1.3762219457378354e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.12109375, + "logps/chosen": -240.0, + "logps/rejected": -288.0, + "loss": 0.2724, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.1484375, + "rewards/margins": 2.84375, + "rewards/rejected": -1.6953125, + "step": 597 + }, + { + "epoch": 0.661504424778761, + "grad_norm": 13.295817375183105, + "learning_rate": 1.3682203278005095e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.11328125, + "logps/chosen": -267.0, + "logps/rejected": -287.0, + "loss": 0.2403, + "rewards/accuracies": 0.890625, + "rewards/chosen": 1.296875, + "rewards/margins": 3.078125, + "rewards/rejected": -1.77734375, + "step": 598 + }, + { + "epoch": 0.6626106194690266, + "grad_norm": 13.04489517211914, + "learning_rate": 1.3602332662288534e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.078125, + "logps/chosen": -262.0, + "logps/rejected": -269.0, + "loss": 0.2891, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.86328125, + "rewards/margins": 2.71875, + "rewards/rejected": -1.859375, + "step": 599 + }, + { + "epoch": 0.6637168141592921, + "grad_norm": 15.907817840576172, + "learning_rate": 1.3522608637483266e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.203125, + "logps/chosen": -241.5, + "logps/rejected": -275.0, + "loss": 0.3724, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.548828125, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.6953125, + "step": 600 + }, + { + "epoch": 0.6637168141592921, + "eval_logits/chosen": -1.2614272832870483, + "eval_logits/rejected": -1.1564831733703613, + "eval_logps/chosen": -251.43780517578125, + "eval_logps/rejected": -277.5970153808594, + "eval_loss": 0.3234591782093048, + "eval_rewards/accuracies": 0.809928834438324, + "eval_rewards/chosen": 0.9355371594429016, + "eval_rewards/margins": 2.773709535598755, + "eval_rewards/rejected": -1.8374922275543213, + "eval_runtime": 193.0898, + "eval_samples_per_second": 66.565, + "eval_steps_per_second": 1.041, + "step": 600 + }, + { + "epoch": 0.6648230088495575, + "grad_norm": 13.516127586364746, + "learning_rate": 1.3443032228958545e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.1015625, + "logps/chosen": -252.0, + "logps/rejected": -284.0, + "loss": 0.3214, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.916015625, + "rewards/margins": 2.734375, + "rewards/rejected": -1.8203125, + "step": 601 + }, + { + "epoch": 0.665929203539823, + "grad_norm": 13.290761947631836, + "learning_rate": 1.336360446018503e-07, + "logits/chosen": -1.37890625, + "logits/rejected": -1.1953125, + "logps/chosen": -240.5, + "logps/rejected": -248.5, + "loss": 0.3253, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 1.048828125, + "rewards/margins": 2.65625, + "rewards/rejected": -1.60546875, + "step": 602 + }, + { + "epoch": 0.6670353982300885, + "grad_norm": 13.730428695678711, + "learning_rate": 1.3284326352721675e-07, + "logits/chosen": -1.24609375, + "logits/rejected": -1.15625, + "logps/chosen": -237.0, + "logps/rejected": -265.0, + "loss": 0.3161, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.96875, + "rewards/margins": 3.171875, + "rewards/rejected": -2.1953125, + "step": 603 + }, + { + "epoch": 0.668141592920354, + "grad_norm": 13.699116706848145, + "learning_rate": 1.3205198926202544e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.125, + "logps/chosen": -254.5, + "logps/rejected": -295.0, + "loss": 0.3262, + "rewards/accuracies": 0.796875, + "rewards/chosen": 1.0390625, + "rewards/margins": 2.7265625, + "rewards/rejected": -1.6875, + "step": 604 + }, + { + "epoch": 0.6692477876106194, + "grad_norm": 15.250642776489258, + "learning_rate": 1.312622319832375e-07, + "logits/chosen": -1.234375, + "logits/rejected": -1.13671875, + "logps/chosen": -262.0, + "logps/rejected": -275.0, + "loss": 0.3704, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.822265625, + "rewards/margins": 2.515625, + "rewards/rejected": -1.69140625, + "step": 605 + }, + { + "epoch": 0.6703539823008849, + "grad_norm": 14.902206420898438, + "learning_rate": 1.3047400184830303e-07, + "logits/chosen": -1.18359375, + "logits/rejected": -1.064453125, + "logps/chosen": -248.5, + "logps/rejected": -277.0, + "loss": 0.3634, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.76953125, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.625, + "step": 606 + }, + { + "epoch": 0.6714601769911505, + "grad_norm": 15.015229225158691, + "learning_rate": 1.2968730899503106e-07, + "logits/chosen": -1.39453125, + "logits/rejected": -1.265625, + "logps/chosen": -254.5, + "logps/rejected": -271.0, + "loss": 0.3279, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.943359375, + "rewards/margins": 2.625, + "rewards/rejected": -1.6796875, + "step": 607 + }, + { + "epoch": 0.672566371681416, + "grad_norm": 14.369139671325684, + "learning_rate": 1.2890216354145888e-07, + "logits/chosen": -1.28515625, + "logits/rejected": -1.1796875, + "logps/chosen": -241.0, + "logps/rejected": -251.0, + "loss": 0.3464, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.953125, + "rewards/margins": 2.625, + "rewards/rejected": -1.671875, + "step": 608 + }, + { + "epoch": 0.6736725663716814, + "grad_norm": 13.686366081237793, + "learning_rate": 1.2811857558572167e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.056640625, + "logps/chosen": -259.0, + "logps/rejected": -262.0, + "loss": 0.3501, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.763671875, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.9765625, + "step": 609 + }, + { + "epoch": 0.6747787610619469, + "grad_norm": 13.95426082611084, + "learning_rate": 1.2733655520592326e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.1640625, + "logps/chosen": -257.5, + "logps/rejected": -308.0, + "loss": 0.2923, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.10546875, + "rewards/margins": 2.9765625, + "rewards/rejected": -1.875, + "step": 610 + }, + { + "epoch": 0.6758849557522124, + "grad_norm": 13.288588523864746, + "learning_rate": 1.265561124600057e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.0703125, + "logps/chosen": -250.0, + "logps/rejected": -277.0, + "loss": 0.3124, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.875, + "rewards/margins": 2.765625, + "rewards/rejected": -1.890625, + "step": 611 + }, + { + "epoch": 0.6769911504424779, + "grad_norm": 14.142792701721191, + "learning_rate": 1.2577725738562068e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.18359375, + "logps/chosen": -244.5, + "logps/rejected": -249.0, + "loss": 0.3795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.533203125, + "rewards/margins": 2.19921875, + "rewards/rejected": -1.66796875, + "step": 612 + }, + { + "epoch": 0.6780973451327433, + "grad_norm": 12.952162742614746, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -1.08984375, + "logits/rejected": -1.109375, + "logps/chosen": -259.5, + "logps/rejected": -315.0, + "loss": 0.2972, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.9453125, + "rewards/margins": 2.984375, + "rewards/rejected": -2.03515625, + "step": 613 + }, + { + "epoch": 0.6792035398230089, + "grad_norm": 12.563096046447754, + "learning_rate": 1.2422435029982667e-07, + "logits/chosen": -1.33203125, + "logits/rejected": -1.2109375, + "logps/chosen": -251.0, + "logps/rejected": -277.0, + "loss": 0.2854, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.1640625, + "rewards/margins": 2.9921875, + "rewards/rejected": -1.828125, + "step": 614 + }, + { + "epoch": 0.6803097345132744, + "grad_norm": 15.870928764343262, + "learning_rate": 1.234503182611066e-07, + "logits/chosen": -1.38671875, + "logits/rejected": -1.22265625, + "logps/chosen": -271.5, + "logps/rejected": -310.0, + "loss": 0.3588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6884765625, + "rewards/margins": 2.390625, + "rewards/rejected": -1.70703125, + "step": 615 + }, + { + "epoch": 0.6814159292035398, + "grad_norm": 12.973315238952637, + "learning_rate": 1.2267791383904017e-07, + "logits/chosen": -1.23828125, + "logits/rejected": -1.12890625, + "logps/chosen": -229.0, + "logps/rejected": -272.5, + "loss": 0.2962, + "rewards/accuracies": 0.859375, + "rewards/chosen": 1.130859375, + "rewards/margins": 3.2109375, + "rewards/rejected": -2.09375, + "step": 616 + }, + { + "epoch": 0.6825221238938053, + "grad_norm": 15.841652870178223, + "learning_rate": 1.2190714696789407e-07, + "logits/chosen": -1.20703125, + "logits/rejected": -1.11328125, + "logps/chosen": -266.0, + "logps/rejected": -282.0, + "loss": 0.3952, + "rewards/accuracies": 0.6953125, + "rewards/chosen": 0.6005859375, + "rewards/margins": 2.2421875, + "rewards/rejected": -1.640625, + "step": 617 + }, + { + "epoch": 0.6836283185840708, + "grad_norm": 14.495512008666992, + "learning_rate": 1.2113802756087396e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.15625, + "logps/chosen": -251.5, + "logps/rejected": -270.5, + "loss": 0.3808, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5576171875, + "rewards/margins": 2.3359375, + "rewards/rejected": -1.77734375, + "step": 618 + }, + { + "epoch": 0.6847345132743363, + "grad_norm": 13.138040542602539, + "learning_rate": 1.2037056550999623e-07, + "logits/chosen": -1.08984375, + "logits/rejected": -1.046875, + "logps/chosen": -261.0, + "logps/rejected": -308.0, + "loss": 0.3147, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.82421875, + "rewards/margins": 2.8203125, + "rewards/rejected": -1.99609375, + "step": 619 + }, + { + "epoch": 0.6858407079646017, + "grad_norm": 15.598456382751465, + "learning_rate": 1.1960477068596154e-07, + "logits/chosen": -1.36328125, + "logits/rejected": -1.08984375, + "logps/chosen": -266.0, + "logps/rejected": -286.0, + "loss": 0.3759, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.62890625, + "rewards/margins": 2.1875, + "rewards/rejected": -1.55859375, + "step": 620 + }, + { + "epoch": 0.6869469026548672, + "grad_norm": 13.848457336425781, + "learning_rate": 1.1884065293802756e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.18359375, + "logps/chosen": -244.5, + "logps/rejected": -257.0, + "loss": 0.3068, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.958984375, + "rewards/margins": 2.8828125, + "rewards/rejected": -1.92578125, + "step": 621 + }, + { + "epoch": 0.6880530973451328, + "grad_norm": 12.871940612792969, + "learning_rate": 1.1807822209388196e-07, + "logits/chosen": -1.2890625, + "logits/rejected": -1.2109375, + "logps/chosen": -239.0, + "logps/rejected": -281.0, + "loss": 0.2818, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 1.08984375, + "rewards/margins": 3.171875, + "rewards/rejected": -2.0859375, + "step": 622 + }, + { + "epoch": 0.6891592920353983, + "grad_norm": 13.695356369018555, + "learning_rate": 1.173174879595166e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.1484375, + "logps/chosen": -244.5, + "logps/rejected": -276.0, + "loss": 0.3137, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.9609375, + "rewards/margins": 2.875, + "rewards/rejected": -1.91796875, + "step": 623 + }, + { + "epoch": 0.6902654867256637, + "grad_norm": 16.23243522644043, + "learning_rate": 1.1655846031910119e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.21484375, + "logps/chosen": -253.0, + "logps/rejected": -301.0, + "loss": 0.3016, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.837890625, + "rewards/margins": 3.0625, + "rewards/rejected": -2.2265625, + "step": 624 + }, + { + "epoch": 0.6913716814159292, + "grad_norm": 14.047713279724121, + "learning_rate": 1.1580114893485712e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.18359375, + "logps/chosen": -241.0, + "logps/rejected": -286.0, + "loss": 0.2963, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.08203125, + "rewards/margins": 3.265625, + "rewards/rejected": -2.1796875, + "step": 625 + }, + { + "epoch": 0.6924778761061947, + "grad_norm": 13.80639934539795, + "learning_rate": 1.1504556354693226e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.15625, + "logps/chosen": -248.5, + "logps/rejected": -274.0, + "loss": 0.317, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.9296875, + "rewards/margins": 2.84375, + "rewards/rejected": -1.91796875, + "step": 626 + }, + { + "epoch": 0.6935840707964602, + "grad_norm": 13.272629737854004, + "learning_rate": 1.1429171387327585e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.2109375, + "logps/chosen": -238.5, + "logps/rejected": -286.0, + "loss": 0.2575, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.07421875, + "rewards/margins": 3.328125, + "rewards/rejected": -2.25, + "step": 627 + }, + { + "epoch": 0.6946902654867256, + "grad_norm": 15.396360397338867, + "learning_rate": 1.1353960960951293e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.15625, + "logps/chosen": -276.0, + "logps/rejected": -276.0, + "loss": 0.3754, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.822265625, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.671875, + "step": 628 + }, + { + "epoch": 0.6957964601769911, + "grad_norm": 13.207889556884766, + "learning_rate": 1.1278926042882026e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.1484375, + "logps/chosen": -249.5, + "logps/rejected": -302.0, + "loss": 0.3109, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.80078125, + "rewards/margins": 2.9140625, + "rewards/rejected": -2.11328125, + "step": 629 + }, + { + "epoch": 0.6969026548672567, + "grad_norm": 13.04702091217041, + "learning_rate": 1.120406759818014e-07, + "logits/chosen": -1.29296875, + "logits/rejected": -1.1640625, + "logps/chosen": -236.5, + "logps/rejected": -270.0, + "loss": 0.3229, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.900390625, + "rewards/margins": 2.6640625, + "rewards/rejected": -1.76953125, + "step": 630 + }, + { + "epoch": 0.6980088495575221, + "grad_norm": 14.894906997680664, + "learning_rate": 1.1129386589636292e-07, + "logits/chosen": -1.3125, + "logits/rejected": -1.18359375, + "logps/chosen": -280.0, + "logps/rejected": -280.5, + "loss": 0.316, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.83203125, + "rewards/margins": 2.828125, + "rewards/rejected": -2.0, + "step": 631 + }, + { + "epoch": 0.6991150442477876, + "grad_norm": 16.062137603759766, + "learning_rate": 1.1054883977759066e-07, + "logits/chosen": -1.26953125, + "logits/rejected": -1.140625, + "logps/chosen": -275.0, + "logps/rejected": -277.0, + "loss": 0.3502, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.69140625, + "rewards/margins": 2.71875, + "rewards/rejected": -2.0234375, + "step": 632 + }, + { + "epoch": 0.7002212389380531, + "grad_norm": 14.050618171691895, + "learning_rate": 1.0980560720762555e-07, + "logits/chosen": -1.19921875, + "logits/rejected": -1.1484375, + "logps/chosen": -248.0, + "logps/rejected": -288.0, + "loss": 0.3215, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.7216796875, + "rewards/margins": 2.8125, + "rewards/rejected": -2.0859375, + "step": 633 + }, + { + "epoch": 0.7013274336283186, + "grad_norm": 11.265563011169434, + "learning_rate": 1.0906417774554132e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.21484375, + "logps/chosen": -234.0, + "logps/rejected": -249.5, + "loss": 0.2667, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.05859375, + "rewards/margins": 3.203125, + "rewards/rejected": -2.140625, + "step": 634 + }, + { + "epoch": 0.702433628318584, + "grad_norm": 13.785270690917969, + "learning_rate": 1.0832456092722062e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.18359375, + "logps/chosen": -268.0, + "logps/rejected": -271.0, + "loss": 0.3269, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.63671875, + "rewards/margins": 2.421875, + "rewards/rejected": -1.7890625, + "step": 635 + }, + { + "epoch": 0.7035398230088495, + "grad_norm": 14.249685287475586, + "learning_rate": 1.0758676626523311e-07, + "logits/chosen": -1.32421875, + "logits/rejected": -1.1796875, + "logps/chosen": -265.0, + "logps/rejected": -286.0, + "loss": 0.314, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.705078125, + "rewards/margins": 2.75, + "rewards/rejected": -2.046875, + "step": 636 + }, + { + "epoch": 0.7046460176991151, + "grad_norm": 12.366557121276855, + "learning_rate": 1.0685080324871278e-07, + "logits/chosen": -1.2421875, + "logits/rejected": -1.021484375, + "logps/chosen": -256.0, + "logps/rejected": -298.0, + "loss": 0.27, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.75390625, + "rewards/margins": 2.65625, + "rewards/rejected": -1.90234375, + "step": 637 + }, + { + "epoch": 0.7057522123893806, + "grad_norm": 16.30191421508789, + "learning_rate": 1.0611668134323575e-07, + "logits/chosen": -1.30078125, + "logits/rejected": -1.1484375, + "logps/chosen": -282.0, + "logps/rejected": -299.0, + "loss": 0.3438, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4140625, + "rewards/margins": 2.54296875, + "rewards/rejected": -2.1328125, + "step": 638 + }, + { + "epoch": 0.706858407079646, + "grad_norm": 14.99670696258545, + "learning_rate": 1.0538440999069895e-07, + "logits/chosen": -1.30859375, + "logits/rejected": -1.19921875, + "logps/chosen": -255.5, + "logps/rejected": -298.0, + "loss": 0.3104, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.896484375, + "rewards/margins": 2.8671875, + "rewards/rejected": -1.96875, + "step": 639 + }, + { + "epoch": 0.7079646017699115, + "grad_norm": 12.429228782653809, + "learning_rate": 1.0465399860919838e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.1953125, + "logps/chosen": -255.5, + "logps/rejected": -273.0, + "loss": 0.2869, + "rewards/accuracies": 0.828125, + "rewards/chosen": 1.080078125, + "rewards/margins": 3.0859375, + "rewards/rejected": -2.00390625, + "step": 640 + }, + { + "epoch": 0.709070796460177, + "grad_norm": 12.204998970031738, + "learning_rate": 1.0392545659290788e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.21875, + "logps/chosen": -260.5, + "logps/rejected": -274.0, + "loss": 0.2817, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.810546875, + "rewards/margins": 2.9921875, + "rewards/rejected": -2.1875, + "step": 641 + }, + { + "epoch": 0.7101769911504425, + "grad_norm": 14.068879127502441, + "learning_rate": 1.0319879331195882e-07, + "logits/chosen": -1.21484375, + "logits/rejected": -1.0703125, + "logps/chosen": -254.0, + "logps/rejected": -272.0, + "loss": 0.3538, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.591796875, + "rewards/margins": 2.5859375, + "rewards/rejected": -2.0, + "step": 642 + }, + { + "epoch": 0.7112831858407079, + "grad_norm": 12.932374954223633, + "learning_rate": 1.0247401811231887e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.18359375, + "logps/chosen": -233.0, + "logps/rejected": -259.0, + "loss": 0.2886, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.841796875, + "rewards/margins": 2.9375, + "rewards/rejected": -2.09375, + "step": 643 + }, + { + "epoch": 0.7123893805309734, + "grad_norm": 12.754419326782227, + "learning_rate": 1.0175114031567245e-07, + "logits/chosen": -1.27734375, + "logits/rejected": -1.2109375, + "logps/chosen": -253.0, + "logps/rejected": -288.0, + "loss": 0.2941, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.798828125, + "rewards/margins": 2.671875, + "rewards/rejected": -1.875, + "step": 644 + }, + { + "epoch": 0.713495575221239, + "grad_norm": 13.075281143188477, + "learning_rate": 1.0103016921930055e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.18359375, + "logps/chosen": -247.5, + "logps/rejected": -276.0, + "loss": 0.324, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.845703125, + "rewards/margins": 2.5546875, + "rewards/rejected": -1.70703125, + "step": 645 + }, + { + "epoch": 0.7146017699115044, + "grad_norm": 15.02340030670166, + "learning_rate": 1.0031111409596091e-07, + "logits/chosen": -1.15625, + "logits/rejected": -1.2109375, + "logps/chosen": -246.5, + "logps/rejected": -258.0, + "loss": 0.3851, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.669921875, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.7421875, + "step": 646 + }, + { + "epoch": 0.7157079646017699, + "grad_norm": 12.193872451782227, + "learning_rate": 9.95939841937693e-08, + "logits/chosen": -1.30859375, + "logits/rejected": -1.2578125, + "logps/chosen": -259.5, + "logps/rejected": -265.5, + "loss": 0.2392, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.86328125, + "rewards/margins": 3.2734375, + "rewards/rejected": -2.4140625, + "step": 647 + }, + { + "epoch": 0.7168141592920354, + "grad_norm": 13.42468547821045, + "learning_rate": 9.887878873608027e-08, + "logits/chosen": -1.16015625, + "logits/rejected": -1.11328125, + "logps/chosen": -263.5, + "logps/rejected": -290.0, + "loss": 0.3087, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.791015625, + "rewards/margins": 2.875, + "rewards/rejected": -2.08203125, + "step": 648 + }, + { + "epoch": 0.7179203539823009, + "grad_norm": 13.621614456176758, + "learning_rate": 9.816553692136834e-08, + "logits/chosen": -1.17578125, + "logits/rejected": -1.109375, + "logps/chosen": -256.5, + "logps/rejected": -282.0, + "loss": 0.2806, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.1484375, + "rewards/margins": 3.046875, + "rewards/rejected": -1.89453125, + "step": 649 + }, + { + "epoch": 0.7190265486725663, + "grad_norm": 13.231938362121582, + "learning_rate": 9.745423792310995e-08, + "logits/chosen": -1.29296875, + "logits/rejected": -1.12890625, + "logps/chosen": -243.0, + "logps/rejected": -258.5, + "loss": 0.2872, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.7265625, + "rewards/margins": 2.90625, + "rewards/rejected": -2.1796875, + "step": 650 + }, + { + "epoch": 0.7190265486725663, + "eval_logits/chosen": -1.2634872198104858, + "eval_logits/rejected": -1.1566191911697388, + "eval_logps/chosen": -252.76119995117188, + "eval_logps/rejected": -279.27362060546875, + "eval_loss": 0.3217768967151642, + "eval_rewards/accuracies": 0.8134269714355469, + "eval_rewards/chosen": 0.8057758212089539, + "eval_rewards/margins": 2.8031716346740723, + "eval_rewards/rejected": -1.996579647064209, + "eval_runtime": 193.0564, + "eval_samples_per_second": 66.576, + "eval_steps_per_second": 1.041, + "step": 650 + }, + { + "epoch": 0.7201327433628318, + "grad_norm": 13.893576622009277, + "learning_rate": 9.674490088966562e-08, + "logits/chosen": -1.2734375, + "logits/rejected": -1.12109375, + "logps/chosen": -262.5, + "logps/rejected": -296.0, + "loss": 0.2924, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.919921875, + "rewards/margins": 3.0859375, + "rewards/rejected": -2.1640625, + "step": 651 + }, + { + "epoch": 0.7212389380530974, + "grad_norm": 13.017692565917969, + "learning_rate": 9.603753494416184e-08, + "logits/chosen": -1.328125, + "logits/rejected": -1.28515625, + "logps/chosen": -242.0, + "logps/rejected": -248.0, + "loss": 0.2897, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.791015625, + "rewards/margins": 2.7109375, + "rewards/rejected": -1.9140625, + "step": 652 + }, + { + "epoch": 0.7223451327433629, + "grad_norm": 17.18537712097168, + "learning_rate": 9.533214918437421e-08, + "logits/chosen": -1.25390625, + "logits/rejected": -1.23828125, + "logps/chosen": -283.0, + "logps/rejected": -287.0, + "loss": 0.402, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.509765625, + "rewards/margins": 2.1640625, + "rewards/rejected": -1.65625, + "step": 653 + }, + { + "epoch": 0.7234513274336283, + "grad_norm": 17.223974227905273, + "learning_rate": 9.462875268261e-08, + "logits/chosen": -1.19140625, + "logits/rejected": -1.2578125, + "logps/chosen": -297.0, + "logps/rejected": -311.0, + "loss": 0.3244, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.814453125, + "rewards/margins": 2.859375, + "rewards/rejected": -2.046875, + "step": 654 + }, + { + "epoch": 0.7245575221238938, + "grad_norm": 14.414237976074219, + "learning_rate": 9.39273544855918e-08, + "logits/chosen": -1.24609375, + "logits/rejected": -1.10546875, + "logps/chosen": -259.0, + "logps/rejected": -303.0, + "loss": 0.3058, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.888671875, + "rewards/margins": 3.140625, + "rewards/rejected": -2.25, + "step": 655 + }, + { + "epoch": 0.7256637168141593, + "grad_norm": 13.708487510681152, + "learning_rate": 9.32279636143411e-08, + "logits/chosen": -1.359375, + "logits/rejected": -1.16015625, + "logps/chosen": -271.0, + "logps/rejected": -277.5, + "loss": 0.3109, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.89453125, + "rewards/margins": 2.640625, + "rewards/rejected": -1.7421875, + "step": 656 + }, + { + "epoch": 0.7267699115044248, + "grad_norm": 14.687643051147461, + "learning_rate": 9.253058906406194e-08, + "logits/chosen": -1.203125, + "logits/rejected": -1.0625, + "logps/chosen": -280.0, + "logps/rejected": -306.0, + "loss": 0.314, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.810546875, + "rewards/margins": 2.7734375, + "rewards/rejected": -1.96875, + "step": 657 + }, + { + "epoch": 0.7278761061946902, + "grad_norm": 13.893321990966797, + "learning_rate": 9.183523980402582e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.203125, + "logps/chosen": -240.0, + "logps/rejected": -291.0, + "loss": 0.3241, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.82421875, + "rewards/margins": 2.6875, + "rewards/rejected": -1.87109375, + "step": 658 + }, + { + "epoch": 0.7289823008849557, + "grad_norm": 13.528450965881348, + "learning_rate": 9.114192477745566e-08, + "logits/chosen": -1.3359375, + "logits/rejected": -1.1484375, + "logps/chosen": -262.0, + "logps/rejected": -278.0, + "loss": 0.3098, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.6640625, + "rewards/margins": 2.6953125, + "rewards/rejected": -2.03515625, + "step": 659 + }, + { + "epoch": 0.7300884955752213, + "grad_norm": 15.182424545288086, + "learning_rate": 9.045065290141138e-08, + "logits/chosen": -1.22265625, + "logits/rejected": -1.1015625, + "logps/chosen": -275.0, + "logps/rejected": -305.0, + "loss": 0.3081, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.822265625, + "rewards/margins": 2.8046875, + "rewards/rejected": -1.984375, + "step": 660 + }, + { + "epoch": 0.7311946902654868, + "grad_norm": 14.025420188903809, + "learning_rate": 8.976143306667491e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.16015625, + "logps/chosen": -255.5, + "logps/rejected": -290.0, + "loss": 0.2861, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.857421875, + "rewards/margins": 2.9765625, + "rewards/rejected": -2.1171875, + "step": 661 + }, + { + "epoch": 0.7323008849557522, + "grad_norm": 12.591769218444824, + "learning_rate": 8.907427413763572e-08, + "logits/chosen": -1.32421875, + "logits/rejected": -1.1171875, + "logps/chosen": -268.0, + "logps/rejected": -275.5, + "loss": 0.2648, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 0.677734375, + "rewards/margins": 2.9453125, + "rewards/rejected": -2.265625, + "step": 662 + }, + { + "epoch": 0.7334070796460177, + "grad_norm": 15.431063652038574, + "learning_rate": 8.838918495217712e-08, + "logits/chosen": -1.25390625, + "logits/rejected": -1.14453125, + "logps/chosen": -269.5, + "logps/rejected": -304.0, + "loss": 0.3575, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.787109375, + "rewards/margins": 2.640625, + "rewards/rejected": -1.84765625, + "step": 663 + }, + { + "epoch": 0.7345132743362832, + "grad_norm": 14.970857620239258, + "learning_rate": 8.770617432156257e-08, + "logits/chosen": -1.28515625, + "logits/rejected": -1.05859375, + "logps/chosen": -268.0, + "logps/rejected": -298.0, + "loss": 0.3506, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.708984375, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.78125, + "step": 664 + }, + { + "epoch": 0.7356194690265486, + "grad_norm": 15.439310073852539, + "learning_rate": 8.702525103032184e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.07421875, + "logps/chosen": -248.0, + "logps/rejected": -280.5, + "loss": 0.3629, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.7626953125, + "rewards/margins": 2.59375, + "rewards/rejected": -1.828125, + "step": 665 + }, + { + "epoch": 0.7367256637168141, + "grad_norm": 13.227315902709961, + "learning_rate": 8.634642383613891e-08, + "logits/chosen": -1.21875, + "logits/rejected": -1.11328125, + "logps/chosen": -254.5, + "logps/rejected": -285.0, + "loss": 0.3095, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.8984375, + "rewards/margins": 2.875, + "rewards/rejected": -1.97265625, + "step": 666 + }, + { + "epoch": 0.7378318584070797, + "grad_norm": 12.241044044494629, + "learning_rate": 8.566970146973835e-08, + "logits/chosen": -1.30859375, + "logits/rejected": -1.125, + "logps/chosen": -257.0, + "logps/rejected": -293.0, + "loss": 0.2911, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.849609375, + "rewards/margins": 3.0, + "rewards/rejected": -2.15234375, + "step": 667 + }, + { + "epoch": 0.7389380530973452, + "grad_norm": 12.409917831420898, + "learning_rate": 8.499509263477387e-08, + "logits/chosen": -1.375, + "logits/rejected": -1.15234375, + "logps/chosen": -222.0, + "logps/rejected": -269.0, + "loss": 0.285, + "rewards/accuracies": 0.84375, + "rewards/chosen": 1.044921875, + "rewards/margins": 3.109375, + "rewards/rejected": -2.05859375, + "step": 668 + }, + { + "epoch": 0.7400442477876106, + "grad_norm": 16.232877731323242, + "learning_rate": 8.432260600771599e-08, + "logits/chosen": -1.31640625, + "logits/rejected": -1.171875, + "logps/chosen": -278.0, + "logps/rejected": -274.0, + "loss": 0.3434, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6806640625, + "rewards/margins": 2.796875, + "rewards/rejected": -2.11328125, + "step": 669 + }, + { + "epoch": 0.7411504424778761, + "grad_norm": 12.330305099487305, + "learning_rate": 8.36522502377403e-08, + "logits/chosen": -1.34375, + "logits/rejected": -1.1640625, + "logps/chosen": -239.5, + "logps/rejected": -292.0, + "loss": 0.2725, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.86328125, + "rewards/margins": 2.7734375, + "rewards/rejected": -1.9140625, + "step": 670 + }, + { + "epoch": 0.7422566371681416, + "grad_norm": 15.042512893676758, + "learning_rate": 8.298403394661657e-08, + "logits/chosen": -1.24609375, + "logits/rejected": -1.18359375, + "logps/chosen": -278.0, + "logps/rejected": -262.0, + "loss": 0.3643, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.53515625, + "rewards/margins": 2.4609375, + "rewards/rejected": -1.92578125, + "step": 671 + }, + { + "epoch": 0.7433628318584071, + "grad_norm": 15.917474746704102, + "learning_rate": 8.231796572859778e-08, + "logits/chosen": -1.09765625, + "logits/rejected": -1.12109375, + "logps/chosen": -250.5, + "logps/rejected": -301.0, + "loss": 0.2963, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.0, + "rewards/margins": 3.1796875, + "rewards/rejected": -2.1796875, + "step": 672 + }, + { + "epoch": 0.7444690265486725, + "grad_norm": 13.662030220031738, + "learning_rate": 8.165405415030915e-08, + "logits/chosen": -1.35546875, + "logits/rejected": -1.14453125, + "logps/chosen": -288.0, + "logps/rejected": -279.0, + "loss": 0.2763, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.91015625, + "rewards/margins": 2.953125, + "rewards/rejected": -2.0390625, + "step": 673 + }, + { + "epoch": 0.745575221238938, + "grad_norm": 14.487608909606934, + "learning_rate": 8.099230775063879e-08, + "logits/chosen": -1.2890625, + "logits/rejected": -1.140625, + "logps/chosen": -261.0, + "logps/rejected": -277.0, + "loss": 0.319, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.75390625, + "rewards/margins": 2.7109375, + "rewards/rejected": -1.953125, + "step": 674 + }, + { + "epoch": 0.7466814159292036, + "grad_norm": 15.394200325012207, + "learning_rate": 8.033273504062698e-08, + "logits/chosen": -1.12109375, + "logits/rejected": -1.11328125, + "logps/chosen": -267.0, + "logps/rejected": -314.0, + "loss": 0.3292, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.70703125, + "rewards/margins": 2.84375, + "rewards/rejected": -2.1328125, + "step": 675 + }, + { + "epoch": 0.7477876106194691, + "grad_norm": 16.063007354736328, + "learning_rate": 7.967534450335728e-08, + "logits/chosen": -1.32421875, + "logits/rejected": -1.19140625, + "logps/chosen": -253.5, + "logps/rejected": -266.5, + "loss": 0.3824, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.556640625, + "rewards/margins": 2.4453125, + "rewards/rejected": -1.88671875, + "step": 676 + }, + { + "epoch": 0.7488938053097345, + "grad_norm": 15.266008377075195, + "learning_rate": 7.902014459384742e-08, + "logits/chosen": -1.21875, + "logits/rejected": -1.03515625, + "logps/chosen": -259.0, + "logps/rejected": -301.0, + "loss": 0.3159, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.875, + "rewards/margins": 3.1015625, + "rewards/rejected": -2.2265625, + "step": 677 + }, + { + "epoch": 0.75, + "grad_norm": 11.596745491027832, + "learning_rate": 7.836714373894015e-08, + "logits/chosen": -1.1484375, + "logits/rejected": -1.05078125, + "logps/chosen": -248.5, + "logps/rejected": -268.5, + "loss": 0.224, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.046875, + "rewards/margins": 3.390625, + "rewards/rejected": -2.3359375, + "step": 678 + }, + { + "epoch": 0.7511061946902655, + "grad_norm": 12.86449909210205, + "learning_rate": 7.771635033719528e-08, + "logits/chosen": -1.26171875, + "logits/rejected": -1.09375, + "logps/chosen": -271.0, + "logps/rejected": -258.5, + "loss": 0.2782, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.80078125, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.03125, + "step": 679 + }, + { + "epoch": 0.7522123893805309, + "grad_norm": 12.727359771728516, + "learning_rate": 7.70677727587816e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.0859375, + "logps/chosen": -257.0, + "logps/rejected": -295.0, + "loss": 0.2793, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.70703125, + "rewards/margins": 2.734375, + "rewards/rejected": -2.0234375, + "step": 680 + }, + { + "epoch": 0.7533185840707964, + "grad_norm": 12.862136840820312, + "learning_rate": 7.642141934536874e-08, + "logits/chosen": -1.3203125, + "logits/rejected": -1.1953125, + "logps/chosen": -242.5, + "logps/rejected": -268.0, + "loss": 0.2937, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.720703125, + "rewards/margins": 3.0546875, + "rewards/rejected": -2.328125, + "step": 681 + }, + { + "epoch": 0.754424778761062, + "grad_norm": 13.950096130371094, + "learning_rate": 7.577729841002075e-08, + "logits/chosen": -1.17578125, + "logits/rejected": -1.12890625, + "logps/chosen": -279.0, + "logps/rejected": -308.0, + "loss": 0.2855, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.703125, + "rewards/margins": 2.875, + "rewards/rejected": -2.171875, + "step": 682 + }, + { + "epoch": 0.7555309734513275, + "grad_norm": 15.024590492248535, + "learning_rate": 7.513541823708827e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.140625, + "logps/chosen": -251.0, + "logps/rejected": -292.0, + "loss": 0.3303, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.697265625, + "rewards/margins": 2.75, + "rewards/rejected": -2.05078125, + "step": 683 + }, + { + "epoch": 0.7566371681415929, + "grad_norm": 14.337872505187988, + "learning_rate": 7.449578708210267e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.21484375, + "logps/chosen": -283.0, + "logps/rejected": -276.0, + "loss": 0.3292, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.71484375, + "rewards/margins": 2.8515625, + "rewards/rejected": -2.14453125, + "step": 684 + }, + { + "epoch": 0.7577433628318584, + "grad_norm": 13.712812423706055, + "learning_rate": 7.385841317166966e-08, + "logits/chosen": -1.30078125, + "logits/rejected": -1.1953125, + "logps/chosen": -250.5, + "logps/rejected": -283.0, + "loss": 0.309, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.654296875, + "rewards/margins": 2.8515625, + "rewards/rejected": -2.1953125, + "step": 685 + }, + { + "epoch": 0.7588495575221239, + "grad_norm": 11.522303581237793, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.43359375, + "logits/rejected": -1.15625, + "logps/chosen": -247.0, + "logps/rejected": -287.0, + "loss": 0.2535, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.541015625, + "rewards/margins": 2.796875, + "rewards/rejected": -2.265625, + "step": 686 + }, + { + "epoch": 0.7599557522123894, + "grad_norm": 16.617996215820312, + "learning_rate": 7.25904698456203e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.0859375, + "logps/chosen": -283.0, + "logps/rejected": -304.0, + "loss": 0.3768, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.59375, + "rewards/margins": 2.578125, + "rewards/rejected": -1.98828125, + "step": 687 + }, + { + "epoch": 0.7610619469026548, + "grad_norm": 13.16273021697998, + "learning_rate": 7.195991673763644e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.140625, + "logps/chosen": -256.0, + "logps/rejected": -261.5, + "loss": 0.342, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.5322265625, + "rewards/margins": 2.6015625, + "rewards/rejected": -2.078125, + "step": 688 + }, + { + "epoch": 0.7621681415929203, + "grad_norm": 14.337390899658203, + "learning_rate": 7.133165348925976e-08, + "logits/chosen": -1.25, + "logits/rejected": -1.2734375, + "logps/chosen": -253.5, + "logps/rejected": -281.0, + "loss": 0.3474, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 1.0078125, + "rewards/margins": 2.8359375, + "rewards/rejected": -1.83203125, + "step": 689 + }, + { + "epoch": 0.7632743362831859, + "grad_norm": 13.05460262298584, + "learning_rate": 7.070568818088782e-08, + "logits/chosen": -1.29296875, + "logits/rejected": -1.06640625, + "logps/chosen": -266.5, + "logps/rejected": -289.5, + "loss": 0.3306, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.76171875, + "rewards/margins": 2.6875, + "rewards/rejected": -1.921875, + "step": 690 + }, + { + "epoch": 0.7643805309734514, + "grad_norm": 13.12061595916748, + "learning_rate": 7.008202886336323e-08, + "logits/chosen": -1.296875, + "logits/rejected": -1.11328125, + "logps/chosen": -252.0, + "logps/rejected": -294.0, + "loss": 0.3064, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.6943359375, + "rewards/margins": 2.96875, + "rewards/rejected": -2.2734375, + "step": 691 + }, + { + "epoch": 0.7654867256637168, + "grad_norm": 15.881913185119629, + "learning_rate": 6.94606835578699e-08, + "logits/chosen": -1.2734375, + "logits/rejected": -1.09375, + "logps/chosen": -267.5, + "logps/rejected": -279.0, + "loss": 0.39, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.5244140625, + "rewards/margins": 2.265625, + "rewards/rejected": -1.734375, + "step": 692 + }, + { + "epoch": 0.7665929203539823, + "grad_norm": 18.117141723632812, + "learning_rate": 6.884166025583043e-08, + "logits/chosen": -1.19140625, + "logits/rejected": -1.13671875, + "logps/chosen": -289.0, + "logps/rejected": -318.0, + "loss": 0.3893, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.591796875, + "rewards/margins": 2.71875, + "rewards/rejected": -2.1328125, + "step": 693 + }, + { + "epoch": 0.7676991150442478, + "grad_norm": 14.054704666137695, + "learning_rate": 6.822496691880275e-08, + "logits/chosen": -1.34765625, + "logits/rejected": -1.18359375, + "logps/chosen": -250.5, + "logps/rejected": -272.0, + "loss": 0.3268, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.708984375, + "rewards/margins": 2.6875, + "rewards/rejected": -1.9765625, + "step": 694 + }, + { + "epoch": 0.7688053097345132, + "grad_norm": 11.454045295715332, + "learning_rate": 6.761061147837807e-08, + "logits/chosen": -1.41796875, + "logits/rejected": -1.12890625, + "logps/chosen": -254.5, + "logps/rejected": -293.0, + "loss": 0.2523, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.861328125, + "rewards/margins": 3.1484375, + "rewards/rejected": -2.28125, + "step": 695 + }, + { + "epoch": 0.7699115044247787, + "grad_norm": 13.245506286621094, + "learning_rate": 6.699860183607894e-08, + "logits/chosen": -1.359375, + "logits/rejected": -1.140625, + "logps/chosen": -275.0, + "logps/rejected": -273.0, + "loss": 0.3098, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4716796875, + "rewards/margins": 2.71875, + "rewards/rejected": -2.25, + "step": 696 + }, + { + "epoch": 0.7710176991150443, + "grad_norm": 13.23479175567627, + "learning_rate": 6.638894586325719e-08, + "logits/chosen": -1.2890625, + "logits/rejected": -1.19140625, + "logps/chosen": -246.0, + "logps/rejected": -288.0, + "loss": 0.2909, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.5712890625, + "rewards/margins": 2.859375, + "rewards/rejected": -2.296875, + "step": 697 + }, + { + "epoch": 0.7721238938053098, + "grad_norm": 14.469060897827148, + "learning_rate": 6.578165140099317e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.20703125, + "logps/chosen": -252.5, + "logps/rejected": -277.0, + "loss": 0.3493, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.79296875, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.04296875, + "step": 698 + }, + { + "epoch": 0.7732300884955752, + "grad_norm": 15.471959114074707, + "learning_rate": 6.517672625999465e-08, + "logits/chosen": -1.20703125, + "logits/rejected": -1.09375, + "logps/chosen": -254.0, + "logps/rejected": -284.0, + "loss": 0.3456, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.5849609375, + "rewards/margins": 2.78125, + "rewards/rejected": -2.203125, + "step": 699 + }, + { + "epoch": 0.7743362831858407, + "grad_norm": 13.424947738647461, + "learning_rate": 6.457417822049627e-08, + "logits/chosen": -1.35546875, + "logits/rejected": -1.13671875, + "logps/chosen": -260.0, + "logps/rejected": -280.0, + "loss": 0.3278, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.681640625, + "rewards/margins": 2.5703125, + "rewards/rejected": -1.890625, + "step": 700 + }, + { + "epoch": 0.7743362831858407, + "eval_logits/chosen": -1.267957091331482, + "eval_logits/rejected": -1.1595537662506104, + "eval_logps/chosen": -253.52735900878906, + "eval_logps/rejected": -279.9950256347656, + "eval_loss": 0.3205508887767792, + "eval_rewards/accuracies": 0.8137379288673401, + "eval_rewards/chosen": 0.73013836145401, + "eval_rewards/margins": 2.814093589782715, + "eval_rewards/rejected": -2.0833332538604736, + "eval_runtime": 193.1227, + "eval_samples_per_second": 66.554, + "eval_steps_per_second": 1.041, + "step": 700 + }, + { + "epoch": 0.7754424778761062, + "grad_norm": 14.452095985412598, + "learning_rate": 6.397401503215991e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.125, + "logps/chosen": -268.0, + "logps/rejected": -296.0, + "loss": 0.3012, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.92578125, + "rewards/margins": 3.0703125, + "rewards/rejected": -2.14453125, + "step": 701 + }, + { + "epoch": 0.7765486725663717, + "grad_norm": 13.497214317321777, + "learning_rate": 6.33762444139744e-08, + "logits/chosen": -1.53515625, + "logits/rejected": -1.15625, + "logps/chosen": -244.5, + "logps/rejected": -292.0, + "loss": 0.3147, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.78515625, + "rewards/margins": 2.78125, + "rewards/rejected": -2.0, + "step": 702 + }, + { + "epoch": 0.7776548672566371, + "grad_norm": 13.972166061401367, + "learning_rate": 6.278087405415683e-08, + "logits/chosen": -1.3125, + "logits/rejected": -1.140625, + "logps/chosen": -258.0, + "logps/rejected": -260.0, + "loss": 0.2868, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.8046875, + "rewards/margins": 3.03125, + "rewards/rejected": -2.2265625, + "step": 703 + }, + { + "epoch": 0.7787610619469026, + "grad_norm": 13.861830711364746, + "learning_rate": 6.218791161005335e-08, + "logits/chosen": -1.2265625, + "logits/rejected": -1.09765625, + "logps/chosen": -238.5, + "logps/rejected": -299.0, + "loss": 0.2945, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.76953125, + "rewards/margins": 3.0, + "rewards/rejected": -2.234375, + "step": 704 + }, + { + "epoch": 0.7798672566371682, + "grad_norm": 15.766735076904297, + "learning_rate": 6.159736470804059e-08, + "logits/chosen": -1.3046875, + "logits/rejected": -1.2421875, + "logps/chosen": -250.5, + "logps/rejected": -261.0, + "loss": 0.3834, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.6416015625, + "rewards/margins": 2.21875, + "rewards/rejected": -1.57421875, + "step": 705 + }, + { + "epoch": 0.7809734513274337, + "grad_norm": 13.845602989196777, + "learning_rate": 6.100924094342785e-08, + "logits/chosen": -1.4140625, + "logits/rejected": -1.19921875, + "logps/chosen": -230.5, + "logps/rejected": -236.0, + "loss": 0.3024, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.705078125, + "rewards/margins": 2.9609375, + "rewards/rejected": -2.2578125, + "step": 706 + }, + { + "epoch": 0.7820796460176991, + "grad_norm": 14.929936408996582, + "learning_rate": 6.042354788035942e-08, + "logits/chosen": -1.18359375, + "logits/rejected": -1.04296875, + "logps/chosen": -269.0, + "logps/rejected": -291.0, + "loss": 0.3403, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.6162109375, + "rewards/margins": 2.7109375, + "rewards/rejected": -2.09765625, + "step": 707 + }, + { + "epoch": 0.7831858407079646, + "grad_norm": 13.97938346862793, + "learning_rate": 5.984029305171678e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.2734375, + "logps/chosen": -245.0, + "logps/rejected": -287.0, + "loss": 0.2896, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 1.0, + "rewards/margins": 3.21875, + "rewards/rejected": -2.21875, + "step": 708 + }, + { + "epoch": 0.7842920353982301, + "grad_norm": 14.008685111999512, + "learning_rate": 5.925948395902253e-08, + "logits/chosen": -1.32421875, + "logits/rejected": -1.21875, + "logps/chosen": -272.0, + "logps/rejected": -313.0, + "loss": 0.3008, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.884765625, + "rewards/margins": 2.9765625, + "rewards/rejected": -2.0859375, + "step": 709 + }, + { + "epoch": 0.7853982300884956, + "grad_norm": 12.459348678588867, + "learning_rate": 5.868112807234313e-08, + "logits/chosen": -1.29296875, + "logits/rejected": -1.140625, + "logps/chosen": -269.0, + "logps/rejected": -371.0, + "loss": 0.262, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.994140625, + "rewards/margins": 3.0390625, + "rewards/rejected": -2.046875, + "step": 710 + }, + { + "epoch": 0.786504424778761, + "grad_norm": 14.358124732971191, + "learning_rate": 5.810523283019339e-08, + "logits/chosen": -1.3046875, + "logits/rejected": -1.1875, + "logps/chosen": -282.0, + "logps/rejected": -273.0, + "loss": 0.3423, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.484375, + "rewards/margins": 2.484375, + "rewards/rejected": -2.00390625, + "step": 711 + }, + { + "epoch": 0.7876106194690266, + "grad_norm": 12.388589859008789, + "learning_rate": 5.753180563944057e-08, + "logits/chosen": -1.37109375, + "logits/rejected": -1.09375, + "logps/chosen": -232.0, + "logps/rejected": -247.5, + "loss": 0.2437, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.931640625, + "rewards/margins": 3.46875, + "rewards/rejected": -2.5390625, + "step": 712 + }, + { + "epoch": 0.7887168141592921, + "grad_norm": 12.301764488220215, + "learning_rate": 5.6960853875208935e-08, + "logits/chosen": -1.19921875, + "logits/rejected": -1.1640625, + "logps/chosen": -252.0, + "logps/rejected": -267.0, + "loss": 0.3027, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.671875, + "rewards/margins": 2.8125, + "rewards/rejected": -2.1328125, + "step": 713 + }, + { + "epoch": 0.7898230088495575, + "grad_norm": 14.501238822937012, + "learning_rate": 5.6392384880785294e-08, + "logits/chosen": -1.37109375, + "logits/rejected": -1.2265625, + "logps/chosen": -276.0, + "logps/rejected": -285.0, + "loss": 0.3198, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.501953125, + "rewards/margins": 2.6328125, + "rewards/rejected": -2.125, + "step": 714 + }, + { + "epoch": 0.790929203539823, + "grad_norm": 12.956294059753418, + "learning_rate": 5.5826405967524357e-08, + "logits/chosen": -1.1484375, + "logits/rejected": -1.0859375, + "logps/chosen": -255.0, + "logps/rejected": -297.0, + "loss": 0.272, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.779296875, + "rewards/margins": 3.203125, + "rewards/rejected": -2.421875, + "step": 715 + }, + { + "epoch": 0.7920353982300885, + "grad_norm": 14.246673583984375, + "learning_rate": 5.526292441475447e-08, + "logits/chosen": -1.32421875, + "logits/rejected": -1.140625, + "logps/chosen": -269.0, + "logps/rejected": -308.0, + "loss": 0.2897, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.818359375, + "rewards/margins": 2.765625, + "rewards/rejected": -1.94921875, + "step": 716 + }, + { + "epoch": 0.793141592920354, + "grad_norm": 14.141976356506348, + "learning_rate": 5.470194746968451e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.2265625, + "logps/chosen": -246.0, + "logps/rejected": -288.0, + "loss": 0.3056, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.7265625, + "rewards/margins": 3.0859375, + "rewards/rejected": -2.359375, + "step": 717 + }, + { + "epoch": 0.7942477876106194, + "grad_norm": 13.89908218383789, + "learning_rate": 5.4143482347310116e-08, + "logits/chosen": -1.3046875, + "logits/rejected": -1.10546875, + "logps/chosen": -273.0, + "logps/rejected": -295.0, + "loss": 0.3041, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.87890625, + "rewards/margins": 2.828125, + "rewards/rejected": -1.94921875, + "step": 718 + }, + { + "epoch": 0.7953539823008849, + "grad_norm": 11.489982604980469, + "learning_rate": 5.358753623032136e-08, + "logits/chosen": -1.359375, + "logits/rejected": -1.12109375, + "logps/chosen": -228.0, + "logps/rejected": -249.0, + "loss": 0.2602, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.88671875, + "rewards/margins": 3.09375, + "rewards/rejected": -2.2109375, + "step": 719 + }, + { + "epoch": 0.7964601769911505, + "grad_norm": 13.466360092163086, + "learning_rate": 5.3034116269010194e-08, + "logits/chosen": -1.453125, + "logits/rejected": -1.234375, + "logps/chosen": -269.0, + "logps/rejected": -295.0, + "loss": 0.3119, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.59765625, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.890625, + "step": 720 + }, + { + "epoch": 0.797566371681416, + "grad_norm": 15.516824722290039, + "learning_rate": 5.248322958117815e-08, + "logits/chosen": -1.1875, + "logits/rejected": -1.15625, + "logps/chosen": -261.0, + "logps/rejected": -273.0, + "loss": 0.361, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.91015625, + "rewards/margins": 2.6875, + "rewards/rejected": -1.78125, + "step": 721 + }, + { + "epoch": 0.7986725663716814, + "grad_norm": 14.861969947814941, + "learning_rate": 5.1934883252045507e-08, + "logits/chosen": -1.234375, + "logits/rejected": -1.16796875, + "logps/chosen": -249.5, + "logps/rejected": -279.0, + "loss": 0.3549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.65625, + "rewards/margins": 2.8046875, + "rewards/rejected": -2.1484375, + "step": 722 + }, + { + "epoch": 0.7997787610619469, + "grad_norm": 14.74849796295166, + "learning_rate": 5.138908433415945e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.23828125, + "logps/chosen": -271.0, + "logps/rejected": -311.0, + "loss": 0.2943, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.8203125, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.3046875, + "step": 723 + }, + { + "epoch": 0.8008849557522124, + "grad_norm": 13.291254043579102, + "learning_rate": 5.0845839847303894e-08, + "logits/chosen": -1.25390625, + "logits/rejected": -1.11328125, + "logps/chosen": -244.5, + "logps/rejected": -257.0, + "loss": 0.3242, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.666015625, + "rewards/margins": 2.8671875, + "rewards/rejected": -2.1953125, + "step": 724 + }, + { + "epoch": 0.8019911504424779, + "grad_norm": 12.395694732666016, + "learning_rate": 5.030515677840882e-08, + "logits/chosen": -1.1875, + "logits/rejected": -1.1171875, + "logps/chosen": -240.5, + "logps/rejected": -276.0, + "loss": 0.3041, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.7890625, + "rewards/margins": 3.03125, + "rewards/rejected": -2.2421875, + "step": 725 + }, + { + "epoch": 0.8030973451327433, + "grad_norm": 13.156864166259766, + "learning_rate": 4.9767042081460626e-08, + "logits/chosen": -1.421875, + "logits/rejected": -1.12109375, + "logps/chosen": -253.5, + "logps/rejected": -286.0, + "loss": 0.2806, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.701171875, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.1328125, + "step": 726 + }, + { + "epoch": 0.8042035398230089, + "grad_norm": 13.708073616027832, + "learning_rate": 4.923150267741266e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.13671875, + "logps/chosen": -272.0, + "logps/rejected": -310.0, + "loss": 0.2606, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.796875, + "rewards/margins": 3.2734375, + "rewards/rejected": -2.4765625, + "step": 727 + }, + { + "epoch": 0.8053097345132744, + "grad_norm": 13.454339981079102, + "learning_rate": 4.869854545409627e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.2109375, + "logps/chosen": -243.0, + "logps/rejected": -296.0, + "loss": 0.2951, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.841796875, + "rewards/margins": 2.8671875, + "rewards/rejected": -2.02734375, + "step": 728 + }, + { + "epoch": 0.8064159292035398, + "grad_norm": 13.385002136230469, + "learning_rate": 4.816817726613187e-08, + "logits/chosen": -1.30078125, + "logits/rejected": -1.20703125, + "logps/chosen": -255.5, + "logps/rejected": -267.0, + "loss": 0.3009, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.5869140625, + "rewards/margins": 2.875, + "rewards/rejected": -2.2890625, + "step": 729 + }, + { + "epoch": 0.8075221238938053, + "grad_norm": 11.77560806274414, + "learning_rate": 4.7640404934841284e-08, + "logits/chosen": -1.25390625, + "logits/rejected": -1.1171875, + "logps/chosen": -239.5, + "logps/rejected": -256.5, + "loss": 0.2937, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.65234375, + "rewards/margins": 2.7734375, + "rewards/rejected": -2.12109375, + "step": 730 + }, + { + "epoch": 0.8086283185840708, + "grad_norm": 14.025035858154297, + "learning_rate": 4.7115235248159776e-08, + "logits/chosen": -1.3359375, + "logits/rejected": -1.14453125, + "logps/chosen": -283.0, + "logps/rejected": -303.0, + "loss": 0.2726, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.037109375, + "rewards/margins": 3.1328125, + "rewards/rejected": -2.10546875, + "step": 731 + }, + { + "epoch": 0.8097345132743363, + "grad_norm": 13.514138221740723, + "learning_rate": 4.659267496054847e-08, + "logits/chosen": -1.2890625, + "logits/rejected": -1.091796875, + "logps/chosen": -248.5, + "logps/rejected": -266.5, + "loss": 0.2988, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.646484375, + "rewards/margins": 2.84375, + "rewards/rejected": -2.1953125, + "step": 732 + }, + { + "epoch": 0.8108407079646017, + "grad_norm": 15.020828247070312, + "learning_rate": 4.60727307929081e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.06640625, + "logps/chosen": -258.5, + "logps/rejected": -275.0, + "loss": 0.3037, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.75390625, + "rewards/margins": 3.09375, + "rewards/rejected": -2.34375, + "step": 733 + }, + { + "epoch": 0.8119469026548672, + "grad_norm": 14.957762718200684, + "learning_rate": 4.555540943249187e-08, + "logits/chosen": -1.3515625, + "logits/rejected": -1.19921875, + "logps/chosen": -248.5, + "logps/rejected": -304.0, + "loss": 0.3, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.705078125, + "rewards/margins": 2.8515625, + "rewards/rejected": -2.1484375, + "step": 734 + }, + { + "epoch": 0.8130530973451328, + "grad_norm": 12.412934303283691, + "learning_rate": 4.5040717532820046e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.15234375, + "logps/chosen": -257.5, + "logps/rejected": -296.0, + "loss": 0.282, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.7607421875, + "rewards/margins": 2.953125, + "rewards/rejected": -2.1953125, + "step": 735 + }, + { + "epoch": 0.8141592920353983, + "grad_norm": 15.76734733581543, + "learning_rate": 4.4528661713594125e-08, + "logits/chosen": -1.3515625, + "logits/rejected": -1.1796875, + "logps/chosen": -238.5, + "logps/rejected": -262.0, + "loss": 0.3355, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8427734375, + "rewards/margins": 2.96875, + "rewards/rejected": -2.125, + "step": 736 + }, + { + "epoch": 0.8152654867256637, + "grad_norm": 16.009498596191406, + "learning_rate": 4.4019248560611454e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.16796875, + "logps/chosen": -271.0, + "logps/rejected": -292.0, + "loss": 0.34, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.8203125, + "rewards/margins": 2.7578125, + "rewards/rejected": -1.9375, + "step": 737 + }, + { + "epoch": 0.8163716814159292, + "grad_norm": 12.171030044555664, + "learning_rate": 4.3512484625681e-08, + "logits/chosen": -1.29296875, + "logits/rejected": -1.046875, + "logps/chosen": -257.0, + "logps/rejected": -285.0, + "loss": 0.2528, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.6953125, + "rewards/margins": 2.9921875, + "rewards/rejected": -2.2890625, + "step": 738 + }, + { + "epoch": 0.8174778761061947, + "grad_norm": 14.278532981872559, + "learning_rate": 4.3008376426538903e-08, + "logits/chosen": -1.31640625, + "logits/rejected": -1.2265625, + "logps/chosen": -250.5, + "logps/rejected": -258.5, + "loss": 0.3722, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.5029296875, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.9921875, + "step": 739 + }, + { + "epoch": 0.8185840707964602, + "grad_norm": 12.9563570022583, + "learning_rate": 4.250693044676429e-08, + "logits/chosen": -1.2734375, + "logits/rejected": -1.17578125, + "logps/chosen": -270.0, + "logps/rejected": -287.0, + "loss": 0.2685, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.685546875, + "rewards/margins": 3.1640625, + "rewards/rejected": -2.4765625, + "step": 740 + }, + { + "epoch": 0.8196902654867256, + "grad_norm": 13.755107879638672, + "learning_rate": 4.2008153135696584e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.09375, + "logps/chosen": -246.5, + "logps/rejected": -285.0, + "loss": 0.3042, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.736328125, + "rewards/margins": 2.9765625, + "rewards/rejected": -2.2421875, + "step": 741 + }, + { + "epoch": 0.8207964601769911, + "grad_norm": 12.855173110961914, + "learning_rate": 4.151205090835183e-08, + "logits/chosen": -1.2734375, + "logits/rejected": -1.10546875, + "logps/chosen": -243.5, + "logps/rejected": -283.0, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.916015625, + "rewards/margins": 3.0859375, + "rewards/rejected": -2.1640625, + "step": 742 + }, + { + "epoch": 0.8219026548672567, + "grad_norm": 15.404345512390137, + "learning_rate": 4.1018630145340735e-08, + "logits/chosen": -1.29296875, + "logits/rejected": -1.34375, + "logps/chosen": -255.0, + "logps/rejected": -262.5, + "loss": 0.2993, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.810546875, + "rewards/margins": 3.09375, + "rewards/rejected": -2.28125, + "step": 743 + }, + { + "epoch": 0.8230088495575221, + "grad_norm": 13.697175979614258, + "learning_rate": 4.0527897192786433e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.171875, + "logps/chosen": -279.0, + "logps/rejected": -289.0, + "loss": 0.2732, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.83984375, + "rewards/margins": 3.046875, + "rewards/rejected": -2.19921875, + "step": 744 + }, + { + "epoch": 0.8241150442477876, + "grad_norm": 14.881061553955078, + "learning_rate": 4.003985836224255e-08, + "logits/chosen": -1.296875, + "logits/rejected": -1.30859375, + "logps/chosen": -256.5, + "logps/rejected": -284.0, + "loss": 0.3474, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.5068359375, + "rewards/margins": 2.375, + "rewards/rejected": -1.87109375, + "step": 745 + }, + { + "epoch": 0.8252212389380531, + "grad_norm": 13.085796356201172, + "learning_rate": 3.955451993061268e-08, + "logits/chosen": -1.33984375, + "logits/rejected": -1.12890625, + "logps/chosen": -258.0, + "logps/rejected": -292.0, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.91796875, + "rewards/margins": 3.09375, + "rewards/rejected": -2.1796875, + "step": 746 + }, + { + "epoch": 0.8263274336283186, + "grad_norm": 13.392922401428223, + "learning_rate": 3.9071888140068926e-08, + "logits/chosen": -1.2109375, + "logits/rejected": -1.16796875, + "logps/chosen": -256.0, + "logps/rejected": -316.0, + "loss": 0.2815, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.986328125, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.1328125, + "step": 747 + }, + { + "epoch": 0.827433628318584, + "grad_norm": 12.065234184265137, + "learning_rate": 3.859196919797228e-08, + "logits/chosen": -1.3359375, + "logits/rejected": -1.12109375, + "logps/chosen": -247.5, + "logps/rejected": -264.0, + "loss": 0.3147, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.7265625, + "rewards/margins": 2.9609375, + "rewards/rejected": -2.2421875, + "step": 748 + }, + { + "epoch": 0.8285398230088495, + "grad_norm": 14.233034133911133, + "learning_rate": 3.811476927679227e-08, + "logits/chosen": -1.16015625, + "logits/rejected": -1.1640625, + "logps/chosen": -265.0, + "logps/rejected": -300.0, + "loss": 0.3261, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.6884765625, + "rewards/margins": 2.75, + "rewards/rejected": -2.0703125, + "step": 749 + }, + { + "epoch": 0.8296460176991151, + "grad_norm": 14.785301208496094, + "learning_rate": 3.764029451402778e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.19140625, + "logps/chosen": -236.0, + "logps/rejected": -282.0, + "loss": 0.297, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.791015625, + "rewards/margins": 3.1640625, + "rewards/rejected": -2.375, + "step": 750 + }, + { + "epoch": 0.8296460176991151, + "eval_logits/chosen": -1.270017147064209, + "eval_logits/rejected": -1.1617498397827148, + "eval_logps/chosen": -253.53233337402344, + "eval_logps/rejected": -280.19403076171875, + "eval_loss": 0.31980380415916443, + "eval_rewards/accuracies": 0.8147646188735962, + "eval_rewards/chosen": 0.72982257604599, + "eval_rewards/margins": 2.827347755432129, + "eval_rewards/rejected": -2.09759783744812, + "eval_runtime": 193.0983, + "eval_samples_per_second": 66.562, + "eval_steps_per_second": 1.041, + "step": 750 + }, + { + "epoch": 0.8307522123893806, + "grad_norm": 13.682051658630371, + "learning_rate": 3.716855101212826e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.1953125, + "logps/chosen": -270.0, + "logps/rejected": -284.5, + "loss": 0.3091, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.740234375, + "rewards/margins": 2.8125, + "rewards/rejected": -2.0703125, + "step": 751 + }, + { + "epoch": 0.831858407079646, + "grad_norm": 14.422385215759277, + "learning_rate": 3.6699544838415034e-08, + "logits/chosen": -1.328125, + "logits/rejected": -1.1015625, + "logps/chosen": -268.0, + "logps/rejected": -266.0, + "loss": 0.3043, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.724609375, + "rewards/margins": 2.875, + "rewards/rejected": -2.15234375, + "step": 752 + }, + { + "epoch": 0.8329646017699115, + "grad_norm": 14.678279876708984, + "learning_rate": 3.623328202500322e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.13671875, + "logps/chosen": -280.0, + "logps/rejected": -305.0, + "loss": 0.3304, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.6796875, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.796875, + "step": 753 + }, + { + "epoch": 0.834070796460177, + "grad_norm": 12.621984481811523, + "learning_rate": 3.576976856872438e-08, + "logits/chosen": -1.421875, + "logits/rejected": -1.0859375, + "logps/chosen": -252.0, + "logps/rejected": -276.0, + "loss": 0.294, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.73046875, + "rewards/margins": 2.8515625, + "rewards/rejected": -2.1171875, + "step": 754 + }, + { + "epoch": 0.8351769911504425, + "grad_norm": 11.676498413085938, + "learning_rate": 3.530901043104928e-08, + "logits/chosen": -1.26171875, + "logits/rejected": -1.1953125, + "logps/chosen": -227.0, + "logps/rejected": -266.0, + "loss": 0.2778, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.048828125, + "rewards/margins": 3.109375, + "rewards/rejected": -2.0625, + "step": 755 + }, + { + "epoch": 0.8362831858407079, + "grad_norm": 12.77115249633789, + "learning_rate": 3.4851013538011035e-08, + "logits/chosen": -1.3125, + "logits/rejected": -1.17578125, + "logps/chosen": -252.5, + "logps/rejected": -290.0, + "loss": 0.2771, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.7578125, + "rewards/margins": 3.0078125, + "rewards/rejected": -2.2578125, + "step": 756 + }, + { + "epoch": 0.8373893805309734, + "grad_norm": 13.537567138671875, + "learning_rate": 3.439578378012925e-08, + "logits/chosen": -1.34765625, + "logits/rejected": -1.17578125, + "logps/chosen": -251.0, + "logps/rejected": -285.5, + "loss": 0.2978, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.9921875, + "rewards/margins": 3.140625, + "rewards/rejected": -2.1484375, + "step": 757 + }, + { + "epoch": 0.838495575221239, + "grad_norm": 12.62022590637207, + "learning_rate": 3.394332701233391e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.1796875, + "logps/chosen": -242.5, + "logps/rejected": -261.0, + "loss": 0.2755, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.8359375, + "rewards/margins": 3.0078125, + "rewards/rejected": -2.1640625, + "step": 758 + }, + { + "epoch": 0.8396017699115044, + "grad_norm": 14.283227920532227, + "learning_rate": 3.349364905389032e-08, + "logits/chosen": -1.15234375, + "logits/rejected": -1.158203125, + "logps/chosen": -269.0, + "logps/rejected": -290.0, + "loss": 0.3305, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.658203125, + "rewards/margins": 2.8828125, + "rewards/rejected": -2.2265625, + "step": 759 + }, + { + "epoch": 0.8407079646017699, + "grad_norm": 12.961087226867676, + "learning_rate": 3.304675568832427e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.109375, + "logps/chosen": -263.0, + "logps/rejected": -279.5, + "loss": 0.3033, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.755859375, + "rewards/margins": 2.5078125, + "rewards/rejected": -1.75, + "step": 760 + }, + { + "epoch": 0.8418141592920354, + "grad_norm": 14.768875122070312, + "learning_rate": 3.260265266334725e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.125, + "logps/chosen": -256.0, + "logps/rejected": -282.0, + "loss": 0.382, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.765625, + "rewards/margins": 2.40234375, + "rewards/rejected": -1.63671875, + "step": 761 + }, + { + "epoch": 0.8429203539823009, + "grad_norm": 16.72699546813965, + "learning_rate": 3.216134569078316e-08, + "logits/chosen": -1.25, + "logits/rejected": -1.22265625, + "logps/chosen": -266.0, + "logps/rejected": -300.0, + "loss": 0.3642, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.90625, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.83203125, + "step": 762 + }, + { + "epoch": 0.8440265486725663, + "grad_norm": 12.911907196044922, + "learning_rate": 3.172284044649437e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.1171875, + "logps/chosen": -260.5, + "logps/rejected": -308.0, + "loss": 0.3017, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.6640625, + "rewards/margins": 2.6640625, + "rewards/rejected": -2.0, + "step": 763 + }, + { + "epoch": 0.8451327433628318, + "grad_norm": 15.997196197509766, + "learning_rate": 3.128714257030882e-08, + "logits/chosen": -1.27734375, + "logits/rejected": -1.15625, + "logps/chosen": -284.0, + "logps/rejected": -301.0, + "loss": 0.3964, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.607421875, + "rewards/margins": 2.3203125, + "rewards/rejected": -1.71875, + "step": 764 + }, + { + "epoch": 0.8462389380530974, + "grad_norm": 14.732622146606445, + "learning_rate": 3.085425766594768e-08, + "logits/chosen": -1.2265625, + "logits/rejected": -1.23828125, + "logps/chosen": -262.0, + "logps/rejected": -245.0, + "loss": 0.3107, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.724609375, + "rewards/margins": 2.9375, + "rewards/rejected": -2.21875, + "step": 765 + }, + { + "epoch": 0.8473451327433629, + "grad_norm": 14.123418807983398, + "learning_rate": 3.042419130095292e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.19140625, + "logps/chosen": -249.0, + "logps/rejected": -303.0, + "loss": 0.2951, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.73828125, + "rewards/margins": 3.2109375, + "rewards/rejected": -2.4765625, + "step": 766 + }, + { + "epoch": 0.8484513274336283, + "grad_norm": 15.25007438659668, + "learning_rate": 2.999694900661609e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.12109375, + "logps/chosen": -280.0, + "logps/rejected": -286.0, + "loss": 0.3976, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.296875, + "rewards/margins": 1.96484375, + "rewards/rejected": -1.66796875, + "step": 767 + }, + { + "epoch": 0.8495575221238938, + "grad_norm": 14.552936553955078, + "learning_rate": 2.9572536277906984e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.15234375, + "logps/chosen": -251.5, + "logps/rejected": -294.0, + "loss": 0.3292, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.5849609375, + "rewards/margins": 2.734375, + "rewards/rejected": -2.1484375, + "step": 768 + }, + { + "epoch": 0.8506637168141593, + "grad_norm": 12.925614356994629, + "learning_rate": 2.9150958573402885e-08, + "logits/chosen": -1.3828125, + "logits/rejected": -1.171875, + "logps/chosen": -267.0, + "logps/rejected": -305.0, + "loss": 0.2762, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.775390625, + "rewards/margins": 2.8984375, + "rewards/rejected": -2.125, + "step": 769 + }, + { + "epoch": 0.8517699115044248, + "grad_norm": 14.300766944885254, + "learning_rate": 2.8732221315218573e-08, + "logits/chosen": -1.18359375, + "logits/rejected": -1.12109375, + "logps/chosen": -257.0, + "logps/rejected": -279.0, + "loss": 0.344, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.52587890625, + "rewards/margins": 2.484375, + "rewards/rejected": -1.953125, + "step": 770 + }, + { + "epoch": 0.8528761061946902, + "grad_norm": 13.133272171020508, + "learning_rate": 2.8316329888936315e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.1015625, + "logps/chosen": -250.5, + "logps/rejected": -269.0, + "loss": 0.2487, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.927734375, + "rewards/margins": 3.046875, + "rewards/rejected": -2.12109375, + "step": 771 + }, + { + "epoch": 0.8539823008849557, + "grad_norm": 12.045042991638184, + "learning_rate": 2.7903289643537e-08, + "logits/chosen": -1.34375, + "logits/rejected": -1.1171875, + "logps/chosen": -256.5, + "logps/rejected": -267.5, + "loss": 0.2765, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.76171875, + "rewards/margins": 2.96875, + "rewards/rejected": -2.2109375, + "step": 772 + }, + { + "epoch": 0.8550884955752213, + "grad_norm": 12.052350044250488, + "learning_rate": 2.7493105891330832e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.16015625, + "logps/chosen": -240.0, + "logps/rejected": -274.0, + "loss": 0.2838, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.81640625, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.296875, + "step": 773 + }, + { + "epoch": 0.8561946902654868, + "grad_norm": 12.869089126586914, + "learning_rate": 2.7085783907889514e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.1796875, + "logps/chosen": -260.0, + "logps/rejected": -274.0, + "loss": 0.3115, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.6953125, + "rewards/margins": 2.546875, + "rewards/rejected": -1.8515625, + "step": 774 + }, + { + "epoch": 0.8573008849557522, + "grad_norm": 13.210247993469238, + "learning_rate": 2.6681328931977942e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.03125, + "logps/chosen": -247.5, + "logps/rejected": -286.0, + "loss": 0.2939, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.896484375, + "rewards/margins": 2.9765625, + "rewards/rejected": -2.078125, + "step": 775 + }, + { + "epoch": 0.8584070796460177, + "grad_norm": 13.413789749145508, + "learning_rate": 2.6279746165487255e-08, + "logits/chosen": -1.26171875, + "logits/rejected": -1.171875, + "logps/chosen": -267.0, + "logps/rejected": -282.0, + "loss": 0.3004, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.658203125, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.94921875, + "step": 776 + }, + { + "epoch": 0.8595132743362832, + "grad_norm": 13.01457691192627, + "learning_rate": 2.5881040773367502e-08, + "logits/chosen": -1.1875, + "logits/rejected": -1.05859375, + "logps/chosen": -240.0, + "logps/rejected": -257.0, + "loss": 0.3088, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.716796875, + "rewards/margins": 3.03125, + "rewards/rejected": -2.31640625, + "step": 777 + }, + { + "epoch": 0.8606194690265486, + "grad_norm": 12.700637817382812, + "learning_rate": 2.5485217883561616e-08, + "logits/chosen": -1.30859375, + "logits/rejected": -1.1484375, + "logps/chosen": -248.5, + "logps/rejected": -279.0, + "loss": 0.2977, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.826171875, + "rewards/margins": 2.9375, + "rewards/rejected": -2.11328125, + "step": 778 + }, + { + "epoch": 0.8617256637168141, + "grad_norm": 13.09081745147705, + "learning_rate": 2.5092282586939183e-08, + "logits/chosen": -1.3828125, + "logits/rejected": -1.1328125, + "logps/chosen": -272.0, + "logps/rejected": -284.5, + "loss": 0.2959, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.755859375, + "rewards/margins": 2.7265625, + "rewards/rejected": -1.96875, + "step": 779 + }, + { + "epoch": 0.8628318584070797, + "grad_norm": 12.912965774536133, + "learning_rate": 2.470223993723103e-08, + "logits/chosen": -1.171875, + "logits/rejected": -1.10546875, + "logps/chosen": -259.0, + "logps/rejected": -283.5, + "loss": 0.304, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.73828125, + "rewards/margins": 2.9609375, + "rewards/rejected": -2.21875, + "step": 780 + }, + { + "epoch": 0.8639380530973452, + "grad_norm": 13.398490905761719, + "learning_rate": 2.4315094950964343e-08, + "logits/chosen": -1.375, + "logits/rejected": -1.1953125, + "logps/chosen": -272.5, + "logps/rejected": -278.5, + "loss": 0.3286, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.61328125, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.796875, + "step": 781 + }, + { + "epoch": 0.8650442477876106, + "grad_norm": 13.045671463012695, + "learning_rate": 2.393085260739794e-08, + "logits/chosen": -1.36328125, + "logits/rejected": -1.15625, + "logps/chosen": -242.0, + "logps/rejected": -263.5, + "loss": 0.3228, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.80859375, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.94140625, + "step": 782 + }, + { + "epoch": 0.8661504424778761, + "grad_norm": 15.309684753417969, + "learning_rate": 2.3549517848458435e-08, + "logits/chosen": -1.26171875, + "logits/rejected": -1.1328125, + "logps/chosen": -279.0, + "logps/rejected": -301.0, + "loss": 0.3618, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.724609375, + "rewards/margins": 2.4765625, + "rewards/rejected": -1.75, + "step": 783 + }, + { + "epoch": 0.8672566371681416, + "grad_norm": 12.972829818725586, + "learning_rate": 2.3171095578676637e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.1171875, + "logps/chosen": -255.5, + "logps/rejected": -305.0, + "loss": 0.2948, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.84375, + "rewards/margins": 2.9140625, + "rewards/rejected": -2.078125, + "step": 784 + }, + { + "epoch": 0.8683628318584071, + "grad_norm": 12.639619827270508, + "learning_rate": 2.2795590665124263e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.09375, + "logps/chosen": -235.0, + "logps/rejected": -269.5, + "loss": 0.2619, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 0.7578125, + "rewards/margins": 3.171875, + "rewards/rejected": -2.421875, + "step": 785 + }, + { + "epoch": 0.8694690265486725, + "grad_norm": 12.34381103515625, + "learning_rate": 2.2423007937351634e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.13671875, + "logps/chosen": -254.0, + "logps/rejected": -264.0, + "loss": 0.2839, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.521484375, + "rewards/margins": 2.8828125, + "rewards/rejected": -2.359375, + "step": 786 + }, + { + "epoch": 0.870575221238938, + "grad_norm": 13.31490707397461, + "learning_rate": 2.205335218732543e-08, + "logits/chosen": -1.2734375, + "logits/rejected": -1.21875, + "logps/chosen": -259.0, + "logps/rejected": -280.0, + "loss": 0.3176, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.666015625, + "rewards/margins": 2.65625, + "rewards/rejected": -1.9921875, + "step": 787 + }, + { + "epoch": 0.8716814159292036, + "grad_norm": 14.77593994140625, + "learning_rate": 2.1686628169366923e-08, + "logits/chosen": -1.109375, + "logits/rejected": -1.1171875, + "logps/chosen": -266.0, + "logps/rejected": -297.0, + "loss": 0.3291, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.771484375, + "rewards/margins": 2.8671875, + "rewards/rejected": -2.1015625, + "step": 788 + }, + { + "epoch": 0.8727876106194691, + "grad_norm": 12.58286190032959, + "learning_rate": 2.1322840600091096e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.15625, + "logps/chosen": -249.5, + "logps/rejected": -260.0, + "loss": 0.2995, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.3896484375, + "rewards/margins": 2.6328125, + "rewards/rejected": -2.2421875, + "step": 789 + }, + { + "epoch": 0.8738938053097345, + "grad_norm": 13.99928092956543, + "learning_rate": 2.0961994158345763e-08, + "logits/chosen": -1.34765625, + "logits/rejected": -1.0859375, + "logps/chosen": -254.5, + "logps/rejected": -263.5, + "loss": 0.2972, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.662109375, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.16796875, + "step": 790 + }, + { + "epoch": 0.875, + "grad_norm": 11.941873550415039, + "learning_rate": 2.0604093485151548e-08, + "logits/chosen": -1.31640625, + "logits/rejected": -1.1171875, + "logps/chosen": -261.0, + "logps/rejected": -270.0, + "loss": 0.2886, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.818359375, + "rewards/margins": 3.15625, + "rewards/rejected": -2.3359375, + "step": 791 + }, + { + "epoch": 0.8761061946902655, + "grad_norm": 17.870344161987305, + "learning_rate": 2.0249143183642097e-08, + "logits/chosen": -1.3984375, + "logits/rejected": -1.1953125, + "logps/chosen": -244.0, + "logps/rejected": -268.0, + "loss": 0.4293, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.4189453125, + "rewards/margins": 2.26171875, + "rewards/rejected": -1.83984375, + "step": 792 + }, + { + "epoch": 0.8772123893805309, + "grad_norm": 12.3770112991333, + "learning_rate": 1.989714781900484e-08, + "logits/chosen": -1.3671875, + "logits/rejected": -1.14453125, + "logps/chosen": -264.0, + "logps/rejected": -284.0, + "loss": 0.2621, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.869140625, + "rewards/margins": 3.03125, + "rewards/rejected": -2.1640625, + "step": 793 + }, + { + "epoch": 0.8783185840707964, + "grad_norm": 14.804219245910645, + "learning_rate": 1.95481119184224e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.140625, + "logps/chosen": -245.0, + "logps/rejected": -302.0, + "loss": 0.3552, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.638671875, + "rewards/margins": 2.6875, + "rewards/rejected": -2.04296875, + "step": 794 + }, + { + "epoch": 0.879424778761062, + "grad_norm": 13.518996238708496, + "learning_rate": 1.9202039971014243e-08, + "logits/chosen": -1.375, + "logits/rejected": -1.20703125, + "logps/chosen": -241.5, + "logps/rejected": -263.0, + "loss": 0.3375, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.69921875, + "rewards/margins": 2.734375, + "rewards/rejected": -2.03125, + "step": 795 + }, + { + "epoch": 0.8805309734513275, + "grad_norm": 13.753449440002441, + "learning_rate": 1.8858936427779137e-08, + "logits/chosen": -1.21875, + "logits/rejected": -1.12109375, + "logps/chosen": -262.5, + "logps/rejected": -289.0, + "loss": 0.2857, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.75, + "rewards/margins": 2.8203125, + "rewards/rejected": -2.0703125, + "step": 796 + }, + { + "epoch": 0.8816371681415929, + "grad_norm": 15.924559593200684, + "learning_rate": 1.8518805701537548e-08, + "logits/chosen": -1.25, + "logits/rejected": -1.09765625, + "logps/chosen": -253.5, + "logps/rejected": -264.5, + "loss": 0.3678, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.671875, + "rewards/margins": 2.640625, + "rewards/rejected": -1.96484375, + "step": 797 + }, + { + "epoch": 0.8827433628318584, + "grad_norm": 12.930898666381836, + "learning_rate": 1.818165216687531e-08, + "logits/chosen": -1.26171875, + "logits/rejected": -1.1640625, + "logps/chosen": -248.5, + "logps/rejected": -258.0, + "loss": 0.2994, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.732421875, + "rewards/margins": 2.9296875, + "rewards/rejected": -2.1953125, + "step": 798 + }, + { + "epoch": 0.8838495575221239, + "grad_norm": 14.774980545043945, + "learning_rate": 1.7847480160087025e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.125, + "logps/chosen": -250.5, + "logps/rejected": -290.0, + "loss": 0.3111, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.724609375, + "rewards/margins": 2.953125, + "rewards/rejected": -2.234375, + "step": 799 + }, + { + "epoch": 0.8849557522123894, + "grad_norm": 13.716545104980469, + "learning_rate": 1.7516293979120523e-08, + "logits/chosen": -1.2578125, + "logits/rejected": -1.1328125, + "logps/chosen": -262.5, + "logps/rejected": -267.0, + "loss": 0.3317, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.5703125, + "rewards/margins": 2.640625, + "rewards/rejected": -2.0625, + "step": 800 + }, + { + "epoch": 0.8849557522123894, + "eval_logits/chosen": -1.2699394226074219, + "eval_logits/rejected": -1.161244511604309, + "eval_logps/chosen": -253.7014923095703, + "eval_logps/rejected": -280.33831787109375, + "eval_loss": 0.31928393244743347, + "eval_rewards/accuracies": 0.8145930171012878, + "eval_rewards/chosen": 0.7206642627716064, + "eval_rewards/margins": 2.836987018585205, + "eval_rewards/rejected": -2.1172263622283936, + "eval_runtime": 193.0847, + "eval_samples_per_second": 66.567, + "eval_steps_per_second": 1.041, + "step": 800 + }, + { + "epoch": 0.8860619469026548, + "grad_norm": 14.249678611755371, + "learning_rate": 1.7188097883521352e-08, + "logits/chosen": -1.28515625, + "logits/rejected": -1.171875, + "logps/chosen": -248.5, + "logps/rejected": -251.0, + "loss": 0.2843, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.751953125, + "rewards/margins": 2.984375, + "rewards/rejected": -2.234375, + "step": 801 + }, + { + "epoch": 0.8871681415929203, + "grad_norm": 12.044215202331543, + "learning_rate": 1.6862896094378244e-08, + "logits/chosen": -1.296875, + "logits/rejected": -1.2578125, + "logps/chosen": -236.0, + "logps/rejected": -261.5, + "loss": 0.2971, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.8515625, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.265625, + "step": 802 + }, + { + "epoch": 0.8882743362831859, + "grad_norm": 13.170328140258789, + "learning_rate": 1.654069279426873e-08, + "logits/chosen": -1.21484375, + "logits/rejected": -1.125, + "logps/chosen": -255.5, + "logps/rejected": -300.0, + "loss": 0.2789, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 1.017578125, + "rewards/margins": 3.1328125, + "rewards/rejected": -2.1171875, + "step": 803 + }, + { + "epoch": 0.8893805309734514, + "grad_norm": 13.188612937927246, + "learning_rate": 1.6221492127205166e-08, + "logits/chosen": -1.26953125, + "logits/rejected": -1.24609375, + "logps/chosen": -269.0, + "logps/rejected": -293.0, + "loss": 0.2959, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.900390625, + "rewards/margins": 2.8125, + "rewards/rejected": -1.921875, + "step": 804 + }, + { + "epoch": 0.8904867256637168, + "grad_norm": 17.259361267089844, + "learning_rate": 1.5905298198581774e-08, + "logits/chosen": -1.25, + "logits/rejected": -1.171875, + "logps/chosen": -271.0, + "logps/rejected": -301.0, + "loss": 0.3979, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.814453125, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.6796875, + "step": 805 + }, + { + "epoch": 0.8915929203539823, + "grad_norm": 13.314188957214355, + "learning_rate": 1.5592115075121508e-08, + "logits/chosen": -1.3203125, + "logits/rejected": -1.15234375, + "logps/chosen": -250.5, + "logps/rejected": -294.0, + "loss": 0.3297, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.822265625, + "rewards/margins": 2.546875, + "rewards/rejected": -1.7265625, + "step": 806 + }, + { + "epoch": 0.8926991150442478, + "grad_norm": 12.087140083312988, + "learning_rate": 1.5281946784824002e-08, + "logits/chosen": -1.33203125, + "logits/rejected": -1.15625, + "logps/chosen": -250.0, + "logps/rejected": -297.0, + "loss": 0.2368, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 0.66015625, + "rewards/margins": 3.1953125, + "rewards/rejected": -2.5234375, + "step": 807 + }, + { + "epoch": 0.8938053097345132, + "grad_norm": 15.906932830810547, + "learning_rate": 1.4974797316913673e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.15625, + "logps/chosen": -287.0, + "logps/rejected": -302.0, + "loss": 0.3282, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.671875, + "rewards/margins": 2.625, + "rewards/rejected": -1.9609375, + "step": 808 + }, + { + "epoch": 0.8949115044247787, + "grad_norm": 13.53934383392334, + "learning_rate": 1.4670670621788229e-08, + "logits/chosen": -1.1328125, + "logits/rejected": -1.1015625, + "logps/chosen": -266.0, + "logps/rejected": -293.0, + "loss": 0.2885, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 1.05859375, + "rewards/margins": 3.359375, + "rewards/rejected": -2.3046875, + "step": 809 + }, + { + "epoch": 0.8960176991150443, + "grad_norm": 13.705190658569336, + "learning_rate": 1.4369570610968274e-08, + "logits/chosen": -1.3515625, + "logits/rejected": -1.17578125, + "logps/chosen": -250.0, + "logps/rejected": -271.0, + "loss": 0.346, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.693359375, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.92578125, + "step": 810 + }, + { + "epoch": 0.8971238938053098, + "grad_norm": 12.113191604614258, + "learning_rate": 1.4071501157046666e-08, + "logits/chosen": -1.19921875, + "logits/rejected": -1.1328125, + "logps/chosen": -256.0, + "logps/rejected": -274.5, + "loss": 0.2523, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.84375, + "rewards/margins": 3.1171875, + "rewards/rejected": -2.27734375, + "step": 811 + }, + { + "epoch": 0.8982300884955752, + "grad_norm": 13.494206428527832, + "learning_rate": 1.3776466093638695e-08, + "logits/chosen": -1.234375, + "logits/rejected": -1.0546875, + "logps/chosen": -241.0, + "logps/rejected": -272.0, + "loss": 0.2704, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.990234375, + "rewards/margins": 3.4140625, + "rewards/rejected": -2.4296875, + "step": 812 + }, + { + "epoch": 0.8993362831858407, + "grad_norm": 11.447568893432617, + "learning_rate": 1.3484469215333082e-08, + "logits/chosen": -1.34765625, + "logits/rejected": -1.19921875, + "logps/chosen": -252.5, + "logps/rejected": -244.0, + "loss": 0.257, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6953125, + "rewards/margins": 3.109375, + "rewards/rejected": -2.4140625, + "step": 813 + }, + { + "epoch": 0.9004424778761062, + "grad_norm": 17.08716583251953, + "learning_rate": 1.3195514277642817e-08, + "logits/chosen": -1.41015625, + "logits/rejected": -1.3125, + "logps/chosen": -264.0, + "logps/rejected": -257.5, + "loss": 0.4052, + "rewards/accuracies": 0.7265625, + "rewards/chosen": 0.4658203125, + "rewards/margins": 2.4140625, + "rewards/rejected": -1.94921875, + "step": 814 + }, + { + "epoch": 0.9015486725663717, + "grad_norm": 12.849235534667969, + "learning_rate": 1.2909604996957091e-08, + "logits/chosen": -1.2890625, + "logits/rejected": -1.14453125, + "logps/chosen": -252.5, + "logps/rejected": -279.0, + "loss": 0.2986, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.75, + "rewards/margins": 2.734375, + "rewards/rejected": -1.98046875, + "step": 815 + }, + { + "epoch": 0.9026548672566371, + "grad_norm": 16.0147647857666, + "learning_rate": 1.2626745050493493e-08, + "logits/chosen": -1.36328125, + "logits/rejected": -1.23828125, + "logps/chosen": -245.5, + "logps/rejected": -296.0, + "loss": 0.3544, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.650390625, + "rewards/margins": 2.578125, + "rewards/rejected": -1.93359375, + "step": 816 + }, + { + "epoch": 0.9037610619469026, + "grad_norm": 12.192747116088867, + "learning_rate": 1.234693807625048e-08, + "logits/chosen": -1.234375, + "logits/rejected": -1.21875, + "logps/chosen": -256.0, + "logps/rejected": -277.0, + "loss": 0.2743, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.802734375, + "rewards/margins": 3.2421875, + "rewards/rejected": -2.4296875, + "step": 817 + }, + { + "epoch": 0.9048672566371682, + "grad_norm": 19.156158447265625, + "learning_rate": 1.2070187672960947e-08, + "logits/chosen": -1.4296875, + "logits/rejected": -1.125, + "logps/chosen": -261.0, + "logps/rejected": -283.0, + "loss": 0.4435, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.70703125, + "rewards/margins": 2.515625, + "rewards/rejected": -1.8125, + "step": 818 + }, + { + "epoch": 0.9059734513274337, + "grad_norm": 14.084782600402832, + "learning_rate": 1.179649740004557e-08, + "logits/chosen": -1.24609375, + "logits/rejected": -1.0703125, + "logps/chosen": -273.0, + "logps/rejected": -272.5, + "loss": 0.2877, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.4453125, + "rewards/margins": 2.546875, + "rewards/rejected": -2.1015625, + "step": 819 + }, + { + "epoch": 0.9070796460176991, + "grad_norm": 14.487624168395996, + "learning_rate": 1.1525870777567393e-08, + "logits/chosen": -1.234375, + "logits/rejected": -1.16015625, + "logps/chosen": -273.0, + "logps/rejected": -278.0, + "loss": 0.3505, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.5, + "rewards/margins": 2.34375, + "rewards/rejected": -1.83984375, + "step": 820 + }, + { + "epoch": 0.9081858407079646, + "grad_norm": 13.851645469665527, + "learning_rate": 1.1258311286186207e-08, + "logits/chosen": -1.28125, + "logits/rejected": -1.1015625, + "logps/chosen": -244.0, + "logps/rejected": -292.0, + "loss": 0.2884, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.9921875, + "rewards/margins": 3.2578125, + "rewards/rejected": -2.265625, + "step": 821 + }, + { + "epoch": 0.9092920353982301, + "grad_norm": 13.431646347045898, + "learning_rate": 1.0993822367114047e-08, + "logits/chosen": -1.359375, + "logits/rejected": -1.1640625, + "logps/chosen": -285.0, + "logps/rejected": -291.0, + "loss": 0.2858, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.703125, + "rewards/margins": 2.6796875, + "rewards/rejected": -1.96875, + "step": 822 + }, + { + "epoch": 0.9103982300884956, + "grad_norm": 15.518174171447754, + "learning_rate": 1.0732407422070794e-08, + "logits/chosen": -1.3359375, + "logits/rejected": -1.19140625, + "logps/chosen": -230.5, + "logps/rejected": -270.0, + "loss": 0.3882, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.5517578125, + "rewards/margins": 2.5546875, + "rewards/rejected": -1.99609375, + "step": 823 + }, + { + "epoch": 0.911504424778761, + "grad_norm": 12.571428298950195, + "learning_rate": 1.0474069813240505e-08, + "logits/chosen": -1.23046875, + "logits/rejected": -1.1328125, + "logps/chosen": -241.0, + "logps/rejected": -290.0, + "loss": 0.3193, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.53515625, + "rewards/margins": 2.59375, + "rewards/rejected": -2.0546875, + "step": 824 + }, + { + "epoch": 0.9126106194690266, + "grad_norm": 14.974266052246094, + "learning_rate": 1.021881286322801e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.109375, + "logps/chosen": -264.5, + "logps/rejected": -283.0, + "loss": 0.3549, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.5546875, + "rewards/margins": 2.23828125, + "rewards/rejected": -1.6796875, + "step": 825 + }, + { + "epoch": 0.9137168141592921, + "grad_norm": 12.049909591674805, + "learning_rate": 9.966639855016446e-09, + "logits/chosen": -1.3984375, + "logits/rejected": -1.203125, + "logps/chosen": -238.0, + "logps/rejected": -257.0, + "loss": 0.2548, + "rewards/accuracies": 0.890625, + "rewards/chosen": 0.775390625, + "rewards/margins": 3.203125, + "rewards/rejected": -2.4296875, + "step": 826 + }, + { + "epoch": 0.9148230088495575, + "grad_norm": 16.12934112548828, + "learning_rate": 9.71755403192484e-09, + "logits/chosen": -1.27734375, + "logits/rejected": -1.12890625, + "logps/chosen": -274.0, + "logps/rejected": -281.0, + "loss": 0.3717, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.437744140625, + "rewards/margins": 2.4609375, + "rewards/rejected": -2.0234375, + "step": 827 + }, + { + "epoch": 0.915929203539823, + "grad_norm": 15.575227737426758, + "learning_rate": 9.47155859756632e-09, + "logits/chosen": -1.34765625, + "logits/rejected": -1.2265625, + "logps/chosen": -244.0, + "logps/rejected": -277.0, + "loss": 0.3755, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.701171875, + "rewards/margins": 2.8046875, + "rewards/rejected": -2.109375, + "step": 828 + }, + { + "epoch": 0.9170353982300885, + "grad_norm": 13.580742835998535, + "learning_rate": 9.228656715807249e-09, + "logits/chosen": -1.2265625, + "logits/rejected": -1.125, + "logps/chosen": -264.0, + "logps/rejected": -301.0, + "loss": 0.2762, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.828125, + "rewards/margins": 3.140625, + "rewards/rejected": -2.3125, + "step": 829 + }, + { + "epoch": 0.918141592920354, + "grad_norm": 14.229433059692383, + "learning_rate": 8.988851510726092e-09, + "logits/chosen": -1.375, + "logits/rejected": -1.109375, + "logps/chosen": -269.0, + "logps/rejected": -276.0, + "loss": 0.2769, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.5478515625, + "rewards/margins": 2.8828125, + "rewards/rejected": -2.3359375, + "step": 830 + }, + { + "epoch": 0.9192477876106194, + "grad_norm": 12.20298957824707, + "learning_rate": 8.752146066573597e-09, + "logits/chosen": -1.171875, + "logits/rejected": -1.1484375, + "logps/chosen": -254.0, + "logps/rejected": -291.0, + "loss": 0.2699, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.734375, + "rewards/margins": 2.8125, + "rewards/rejected": -2.08203125, + "step": 831 + }, + { + "epoch": 0.9203539823008849, + "grad_norm": 14.036704063415527, + "learning_rate": 8.518543427732949e-09, + "logits/chosen": -1.296875, + "logits/rejected": -1.1171875, + "logps/chosen": -265.0, + "logps/rejected": -267.0, + "loss": 0.3249, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.578125, + "rewards/margins": 2.5625, + "rewards/rejected": -1.98046875, + "step": 832 + }, + { + "epoch": 0.9214601769911505, + "grad_norm": 12.48025131225586, + "learning_rate": 8.288046598680627e-09, + "logits/chosen": -1.234375, + "logits/rejected": -1.12109375, + "logps/chosen": -260.0, + "logps/rejected": -268.0, + "loss": 0.2814, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.66796875, + "rewards/margins": 3.0859375, + "rewards/rejected": -2.421875, + "step": 833 + }, + { + "epoch": 0.922566371681416, + "grad_norm": 12.8703031539917, + "learning_rate": 8.060658543947829e-09, + "logits/chosen": -1.2890625, + "logits/rejected": -1.30078125, + "logps/chosen": -223.0, + "logps/rejected": -258.5, + "loss": 0.2808, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.8125, + "rewards/margins": 2.953125, + "rewards/rejected": -2.1484375, + "step": 834 + }, + { + "epoch": 0.9236725663716814, + "grad_norm": 13.693394660949707, + "learning_rate": 7.836382188082302e-09, + "logits/chosen": -1.234375, + "logits/rejected": -1.203125, + "logps/chosen": -264.0, + "logps/rejected": -289.0, + "loss": 0.2979, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.48828125, + "rewards/margins": 2.7421875, + "rewards/rejected": -2.2421875, + "step": 835 + }, + { + "epoch": 0.9247787610619469, + "grad_norm": 12.683737754821777, + "learning_rate": 7.61522041561069e-09, + "logits/chosen": -1.30859375, + "logits/rejected": -1.11328125, + "logps/chosen": -246.0, + "logps/rejected": -265.0, + "loss": 0.2762, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.81640625, + "rewards/margins": 2.859375, + "rewards/rejected": -2.046875, + "step": 836 + }, + { + "epoch": 0.9258849557522124, + "grad_norm": 15.07400894165039, + "learning_rate": 7.397176071001543e-09, + "logits/chosen": -1.35546875, + "logits/rejected": -1.1796875, + "logps/chosen": -251.0, + "logps/rejected": -267.0, + "loss": 0.3266, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.708984375, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.90625, + "step": 837 + }, + { + "epoch": 0.9269911504424779, + "grad_norm": 12.571556091308594, + "learning_rate": 7.182251958628538e-09, + "logits/chosen": -1.33984375, + "logits/rejected": -1.21875, + "logps/chosen": -236.5, + "logps/rejected": -259.0, + "loss": 0.2943, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.5966796875, + "rewards/margins": 2.7109375, + "rewards/rejected": -2.12109375, + "step": 838 + }, + { + "epoch": 0.9280973451327433, + "grad_norm": 12.665489196777344, + "learning_rate": 6.970450842734649e-09, + "logits/chosen": -1.375, + "logits/rejected": -1.171875, + "logps/chosen": -260.0, + "logps/rejected": -276.0, + "loss": 0.2713, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7734375, + "rewards/margins": 3.0078125, + "rewards/rejected": -2.234375, + "step": 839 + }, + { + "epoch": 0.9292035398230089, + "grad_norm": 15.426192283630371, + "learning_rate": 6.761775447396506e-09, + "logits/chosen": -1.26171875, + "logits/rejected": -1.203125, + "logps/chosen": -244.0, + "logps/rejected": -297.0, + "loss": 0.3234, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.791015625, + "rewards/margins": 3.015625, + "rewards/rejected": -2.234375, + "step": 840 + }, + { + "epoch": 0.9303097345132744, + "grad_norm": 12.329756736755371, + "learning_rate": 6.556228456489232e-09, + "logits/chosen": -1.1875, + "logits/rejected": -1.0859375, + "logps/chosen": -253.5, + "logps/rejected": -280.0, + "loss": 0.2926, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.6484375, + "rewards/margins": 2.9453125, + "rewards/rejected": -2.296875, + "step": 841 + }, + { + "epoch": 0.9314159292035398, + "grad_norm": 16.28516387939453, + "learning_rate": 6.353812513652052e-09, + "logits/chosen": -1.2109375, + "logits/rejected": -1.10546875, + "logps/chosen": -260.0, + "logps/rejected": -282.0, + "loss": 0.3844, + "rewards/accuracies": 0.7578125, + "rewards/chosen": 0.529296875, + "rewards/margins": 2.4453125, + "rewards/rejected": -1.9140625, + "step": 842 + }, + { + "epoch": 0.9325221238938053, + "grad_norm": 16.69934844970703, + "learning_rate": 6.154530222254372e-09, + "logits/chosen": -1.25390625, + "logits/rejected": -1.21875, + "logps/chosen": -245.5, + "logps/rejected": -280.0, + "loss": 0.3776, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.6630859375, + "rewards/margins": 2.53125, + "rewards/rejected": -1.8671875, + "step": 843 + }, + { + "epoch": 0.9336283185840708, + "grad_norm": 15.312355995178223, + "learning_rate": 5.958384145362038e-09, + "logits/chosen": -1.27734375, + "logits/rejected": -1.1796875, + "logps/chosen": -267.5, + "logps/rejected": -304.0, + "loss": 0.3446, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.619140625, + "rewards/margins": 2.7265625, + "rewards/rejected": -2.109375, + "step": 844 + }, + { + "epoch": 0.9347345132743363, + "grad_norm": 13.851134300231934, + "learning_rate": 5.765376805704575e-09, + "logits/chosen": -1.296875, + "logits/rejected": -1.16015625, + "logps/chosen": -242.5, + "logps/rejected": -286.0, + "loss": 0.312, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.826171875, + "rewards/margins": 2.78125, + "rewards/rejected": -1.953125, + "step": 845 + }, + { + "epoch": 0.9358407079646017, + "grad_norm": 12.364534378051758, + "learning_rate": 5.575510685642798e-09, + "logits/chosen": -1.1328125, + "logits/rejected": -1.1875, + "logps/chosen": -265.0, + "logps/rejected": -298.0, + "loss": 0.2532, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 1.015625, + "rewards/margins": 3.3671875, + "rewards/rejected": -2.3515625, + "step": 846 + }, + { + "epoch": 0.9369469026548672, + "grad_norm": 15.209588050842285, + "learning_rate": 5.38878822713662e-09, + "logits/chosen": -1.25390625, + "logits/rejected": -1.10546875, + "logps/chosen": -279.0, + "logps/rejected": -300.0, + "loss": 0.3528, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.505859375, + "rewards/margins": 2.6796875, + "rewards/rejected": -2.1796875, + "step": 847 + }, + { + "epoch": 0.9380530973451328, + "grad_norm": 13.730789184570312, + "learning_rate": 5.205211831713935e-09, + "logits/chosen": -1.37109375, + "logits/rejected": -1.125, + "logps/chosen": -239.5, + "logps/rejected": -240.0, + "loss": 0.3282, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.6494140625, + "rewards/margins": 2.921875, + "rewards/rejected": -2.2734375, + "step": 848 + }, + { + "epoch": 0.9391592920353983, + "grad_norm": 13.921919822692871, + "learning_rate": 5.024783860439474e-09, + "logits/chosen": -1.28125, + "logits/rejected": -1.08203125, + "logps/chosen": -228.0, + "logps/rejected": -262.0, + "loss": 0.3565, + "rewards/accuracies": 0.7421875, + "rewards/chosen": 0.517578125, + "rewards/margins": 2.5703125, + "rewards/rejected": -2.046875, + "step": 849 + }, + { + "epoch": 0.9402654867256637, + "grad_norm": 15.472764015197754, + "learning_rate": 4.8475066338846685e-09, + "logits/chosen": -1.3515625, + "logits/rejected": -1.15234375, + "logps/chosen": -252.0, + "logps/rejected": -282.0, + "loss": 0.3386, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.658203125, + "rewards/margins": 2.8203125, + "rewards/rejected": -2.1640625, + "step": 850 + }, + { + "epoch": 0.9402654867256637, + "eval_logits/chosen": -1.2672574520111084, + "eval_logits/rejected": -1.1583489179611206, + "eval_logps/chosen": -253.592041015625, + "eval_logps/rejected": -280.3034973144531, + "eval_loss": 0.31901347637176514, + "eval_rewards/accuracies": 0.8145152926445007, + "eval_rewards/chosen": 0.7268248796463013, + "eval_rewards/margins": 2.8418843746185303, + "eval_rewards/rejected": -2.1149721145629883, + "eval_runtime": 192.9475, + "eval_samples_per_second": 66.614, + "eval_steps_per_second": 1.042, + "step": 850 + }, + { + "epoch": 0.9413716814159292, + "grad_norm": 13.594935417175293, + "learning_rate": 4.673382432097667e-09, + "logits/chosen": -1.3515625, + "logits/rejected": -1.23046875, + "logps/chosen": -256.0, + "logps/rejected": -263.0, + "loss": 0.3324, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.6171875, + "rewards/margins": 2.7265625, + "rewards/rejected": -2.1171875, + "step": 851 + }, + { + "epoch": 0.9424778761061947, + "grad_norm": 14.526602745056152, + "learning_rate": 4.5024134945740036e-09, + "logits/chosen": -1.3203125, + "logits/rejected": -1.22265625, + "logps/chosen": -229.5, + "logps/rejected": -244.5, + "loss": 0.3492, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.546875, + "rewards/margins": 2.6640625, + "rewards/rejected": -2.109375, + "step": 852 + }, + { + "epoch": 0.9435840707964602, + "grad_norm": 16.662525177001953, + "learning_rate": 4.334602020227867e-09, + "logits/chosen": -1.34375, + "logits/rejected": -1.15625, + "logps/chosen": -285.0, + "logps/rejected": -290.0, + "loss": 0.3672, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.4638671875, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.94140625, + "step": 853 + }, + { + "epoch": 0.9446902654867256, + "grad_norm": 14.094331741333008, + "learning_rate": 4.169950167363767e-09, + "logits/chosen": -1.265625, + "logits/rejected": -1.0625, + "logps/chosen": -263.0, + "logps/rejected": -297.0, + "loss": 0.3088, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.84765625, + "rewards/margins": 2.9375, + "rewards/rejected": -2.09375, + "step": 854 + }, + { + "epoch": 0.9457964601769911, + "grad_norm": 14.868205070495605, + "learning_rate": 4.0084600536488265e-09, + "logits/chosen": -1.38671875, + "logits/rejected": -1.17578125, + "logps/chosen": -238.0, + "logps/rejected": -290.0, + "loss": 0.3156, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.8359375, + "rewards/margins": 2.8125, + "rewards/rejected": -1.9765625, + "step": 855 + }, + { + "epoch": 0.9469026548672567, + "grad_norm": 13.155553817749023, + "learning_rate": 3.850133756085505e-09, + "logits/chosen": -1.31640625, + "logits/rejected": -1.15234375, + "logps/chosen": -270.0, + "logps/rejected": -290.0, + "loss": 0.3135, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.7265625, + "rewards/margins": 2.7890625, + "rewards/rejected": -2.0625, + "step": 856 + }, + { + "epoch": 0.9480088495575221, + "grad_norm": 13.842921257019043, + "learning_rate": 3.694973310984839e-09, + "logits/chosen": -1.359375, + "logits/rejected": -1.15625, + "logps/chosen": -258.0, + "logps/rejected": -281.0, + "loss": 0.3115, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.759765625, + "rewards/margins": 2.6171875, + "rewards/rejected": -1.859375, + "step": 857 + }, + { + "epoch": 0.9491150442477876, + "grad_norm": 13.213567733764648, + "learning_rate": 3.5429807139403524e-09, + "logits/chosen": -1.22265625, + "logits/rejected": -1.048828125, + "logps/chosen": -243.0, + "logps/rejected": -300.0, + "loss": 0.2749, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.87890625, + "rewards/margins": 3.2109375, + "rewards/rejected": -2.328125, + "step": 858 + }, + { + "epoch": 0.9502212389380531, + "grad_norm": 11.955760955810547, + "learning_rate": 3.3941579198023816e-09, + "logits/chosen": -1.484375, + "logits/rejected": -1.13671875, + "logps/chosen": -218.0, + "logps/rejected": -260.0, + "loss": 0.2961, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.8046875, + "rewards/margins": 2.7421875, + "rewards/rejected": -1.93359375, + "step": 859 + }, + { + "epoch": 0.9513274336283186, + "grad_norm": 13.337422370910645, + "learning_rate": 3.248506842652793e-09, + "logits/chosen": -1.2578125, + "logits/rejected": -1.12109375, + "logps/chosen": -249.0, + "logps/rejected": -309.0, + "loss": 0.2853, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.8349609375, + "rewards/margins": 3.078125, + "rewards/rejected": -2.25, + "step": 860 + }, + { + "epoch": 0.952433628318584, + "grad_norm": 12.912832260131836, + "learning_rate": 3.106029355780582e-09, + "logits/chosen": -1.234375, + "logits/rejected": -1.15234375, + "logps/chosen": -271.0, + "logps/rejected": -282.0, + "loss": 0.3052, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.447265625, + "rewards/margins": 2.65625, + "rewards/rejected": -2.20703125, + "step": 861 + }, + { + "epoch": 0.9535398230088495, + "grad_norm": 14.942134857177734, + "learning_rate": 2.9667272916575337e-09, + "logits/chosen": -1.20703125, + "logits/rejected": -1.04296875, + "logps/chosen": -247.5, + "logps/rejected": -279.0, + "loss": 0.3356, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.69921875, + "rewards/margins": 2.8671875, + "rewards/rejected": -2.1640625, + "step": 862 + }, + { + "epoch": 0.9546460176991151, + "grad_norm": 11.314682960510254, + "learning_rate": 2.830602441914881e-09, + "logits/chosen": -1.23828125, + "logits/rejected": -1.1875, + "logps/chosen": -263.0, + "logps/rejected": -277.0, + "loss": 0.2615, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.80859375, + "rewards/margins": 3.1953125, + "rewards/rejected": -2.390625, + "step": 863 + }, + { + "epoch": 0.9557522123893806, + "grad_norm": 13.024490356445312, + "learning_rate": 2.6976565573202102e-09, + "logits/chosen": -1.23828125, + "logits/rejected": -1.23046875, + "logps/chosen": -249.0, + "logps/rejected": -275.0, + "loss": 0.2961, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.734375, + "rewards/margins": 2.7890625, + "rewards/rejected": -2.0546875, + "step": 864 + }, + { + "epoch": 0.956858407079646, + "grad_norm": 13.718770980834961, + "learning_rate": 2.5678913477547302e-09, + "logits/chosen": -1.39453125, + "logits/rejected": -1.1796875, + "logps/chosen": -274.0, + "logps/rejected": -312.0, + "loss": 0.2869, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.853515625, + "rewards/margins": 2.7890625, + "rewards/rejected": -1.9296875, + "step": 865 + }, + { + "epoch": 0.9579646017699115, + "grad_norm": 13.562867164611816, + "learning_rate": 2.441308482191623e-09, + "logits/chosen": -1.12890625, + "logits/rejected": -1.0078125, + "logps/chosen": -252.5, + "logps/rejected": -291.0, + "loss": 0.3117, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.845703125, + "rewards/margins": 2.8359375, + "rewards/rejected": -1.98828125, + "step": 866 + }, + { + "epoch": 0.959070796460177, + "grad_norm": 13.698179244995117, + "learning_rate": 2.3179095886743384e-09, + "logits/chosen": -1.2890625, + "logits/rejected": -1.234375, + "logps/chosen": -230.5, + "logps/rejected": -266.5, + "loss": 0.3103, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.9375, + "rewards/margins": 3.1484375, + "rewards/rejected": -2.20703125, + "step": 867 + }, + { + "epoch": 0.9601769911504425, + "grad_norm": 13.498557090759277, + "learning_rate": 2.1976962542956945e-09, + "logits/chosen": -1.2890625, + "logits/rejected": -1.22265625, + "logps/chosen": -245.5, + "logps/rejected": -281.0, + "loss": 0.3036, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.935546875, + "rewards/margins": 2.9453125, + "rewards/rejected": -2.0, + "step": 868 + }, + { + "epoch": 0.9612831858407079, + "grad_norm": 12.500775337219238, + "learning_rate": 2.0806700251775055e-09, + "logits/chosen": -1.296875, + "logits/rejected": -1.171875, + "logps/chosen": -232.5, + "logps/rejected": -262.0, + "loss": 0.2973, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.724609375, + "rewards/margins": 2.953125, + "rewards/rejected": -2.2265625, + "step": 869 + }, + { + "epoch": 0.9623893805309734, + "grad_norm": 13.277873992919922, + "learning_rate": 1.966832406450708e-09, + "logits/chosen": -1.3828125, + "logits/rejected": -1.15234375, + "logps/chosen": -232.0, + "logps/rejected": -260.0, + "loss": 0.3434, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.5185546875, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.87890625, + "step": 870 + }, + { + "epoch": 0.963495575221239, + "grad_norm": 14.751419067382812, + "learning_rate": 1.85618486223596e-09, + "logits/chosen": -1.234375, + "logits/rejected": -1.1484375, + "logps/chosen": -269.0, + "logps/rejected": -291.0, + "loss": 0.3578, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.4072265625, + "rewards/margins": 2.484375, + "rewards/rejected": -2.07421875, + "step": 871 + }, + { + "epoch": 0.9646017699115044, + "grad_norm": 16.29852294921875, + "learning_rate": 1.748728815624878e-09, + "logits/chosen": -1.328125, + "logits/rejected": -1.109375, + "logps/chosen": -273.0, + "logps/rejected": -264.0, + "loss": 0.3518, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.50927734375, + "rewards/margins": 2.734375, + "rewards/rejected": -2.2265625, + "step": 872 + }, + { + "epoch": 0.9657079646017699, + "grad_norm": 16.783334732055664, + "learning_rate": 1.6444656486615805e-09, + "logits/chosen": -1.1328125, + "logits/rejected": -1.07421875, + "logps/chosen": -287.0, + "logps/rejected": -307.0, + "loss": 0.3656, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.478515625, + "rewards/margins": 2.4765625, + "rewards/rejected": -2.00390625, + "step": 873 + }, + { + "epoch": 0.9668141592920354, + "grad_norm": 16.244199752807617, + "learning_rate": 1.5433967023250894e-09, + "logits/chosen": -1.37109375, + "logits/rejected": -1.1015625, + "logps/chosen": -275.0, + "logps/rejected": -317.0, + "loss": 0.3542, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.7353515625, + "rewards/margins": 2.8359375, + "rewards/rejected": -2.09375, + "step": 874 + }, + { + "epoch": 0.9679203539823009, + "grad_norm": 13.663660049438477, + "learning_rate": 1.4455232765120396e-09, + "logits/chosen": -1.3359375, + "logits/rejected": -1.22265625, + "logps/chosen": -244.5, + "logps/rejected": -268.0, + "loss": 0.3567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.701171875, + "rewards/margins": 2.625, + "rewards/rejected": -1.921875, + "step": 875 + }, + { + "epoch": 0.9690265486725663, + "grad_norm": 12.790926933288574, + "learning_rate": 1.3508466300198306e-09, + "logits/chosen": -1.4296875, + "logits/rejected": -1.21875, + "logps/chosen": -232.5, + "logps/rejected": -262.0, + "loss": 0.3053, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.685546875, + "rewards/margins": 2.921875, + "rewards/rejected": -2.234375, + "step": 876 + }, + { + "epoch": 0.9701327433628318, + "grad_norm": 15.329063415527344, + "learning_rate": 1.2593679805306401e-09, + "logits/chosen": -1.20703125, + "logits/rejected": -1.20703125, + "logps/chosen": -254.5, + "logps/rejected": -278.0, + "loss": 0.3161, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.87890625, + "rewards/margins": 2.9609375, + "rewards/rejected": -2.07421875, + "step": 877 + }, + { + "epoch": 0.9712389380530974, + "grad_norm": 15.826077461242676, + "learning_rate": 1.1710885045956021e-09, + "logits/chosen": -1.41015625, + "logits/rejected": -1.26953125, + "logps/chosen": -257.5, + "logps/rejected": -281.0, + "loss": 0.3719, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.478515625, + "rewards/margins": 2.2734375, + "rewards/rejected": -1.79296875, + "step": 878 + }, + { + "epoch": 0.9723451327433629, + "grad_norm": 15.952284812927246, + "learning_rate": 1.0860093376197642e-09, + "logits/chosen": -1.28125, + "logits/rejected": -1.05078125, + "logps/chosen": -260.0, + "logps/rejected": -289.0, + "loss": 0.3437, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.73828125, + "rewards/margins": 2.859375, + "rewards/rejected": -2.1171875, + "step": 879 + }, + { + "epoch": 0.9734513274336283, + "grad_norm": 13.334358215332031, + "learning_rate": 1.0041315738474055e-09, + "logits/chosen": -1.203125, + "logits/rejected": -1.0859375, + "logps/chosen": -261.5, + "logps/rejected": -312.0, + "loss": 0.2845, + "rewards/accuracies": 0.8359375, + "rewards/chosen": 0.869140625, + "rewards/margins": 3.125, + "rewards/rejected": -2.265625, + "step": 880 + }, + { + "epoch": 0.9745575221238938, + "grad_norm": 11.215107917785645, + "learning_rate": 9.254562663480458e-10, + "logits/chosen": -1.3125, + "logits/rejected": -1.2265625, + "logps/chosen": -241.5, + "logps/rejected": -287.0, + "loss": 0.2595, + "rewards/accuracies": 0.8828125, + "rewards/chosen": 1.03125, + "rewards/margins": 3.1484375, + "rewards/rejected": -2.125, + "step": 881 + }, + { + "epoch": 0.9756637168141593, + "grad_norm": 13.879293441772461, + "learning_rate": 8.499844270028755e-10, + "logits/chosen": -1.3046875, + "logits/rejected": -1.10546875, + "logps/chosen": -250.5, + "logps/rejected": -267.5, + "loss": 0.3143, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.494140625, + "rewards/margins": 2.84375, + "rewards/rejected": -2.3515625, + "step": 882 + }, + { + "epoch": 0.9767699115044248, + "grad_norm": 425.59161376953125, + "learning_rate": 7.777170264917365e-10, + "logits/chosen": -1.2421875, + "logits/rejected": -1.0703125, + "logps/chosen": -260.0, + "logps/rejected": -347.0, + "loss": 0.4257, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.646484375, + "rewards/margins": 2.890625, + "rewards/rejected": -2.2421875, + "step": 883 + }, + { + "epoch": 0.9778761061946902, + "grad_norm": 25.028003692626953, + "learning_rate": 7.086549942805498e-10, + "logits/chosen": -1.19921875, + "logits/rejected": -1.11328125, + "logps/chosen": -285.0, + "logps/rejected": -282.0, + "loss": 0.3772, + "rewards/accuracies": 0.7734375, + "rewards/chosen": 0.56640625, + "rewards/margins": 2.359375, + "rewards/rejected": -1.79296875, + "step": 884 + }, + { + "epoch": 0.9789823008849557, + "grad_norm": 15.059175491333008, + "learning_rate": 6.427992186095744e-10, + "logits/chosen": -1.28515625, + "logits/rejected": -1.19140625, + "logps/chosen": -228.0, + "logps/rejected": -271.0, + "loss": 0.3026, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.94921875, + "rewards/margins": 3.140625, + "rewards/rejected": -2.1953125, + "step": 885 + }, + { + "epoch": 0.9800884955752213, + "grad_norm": 12.722869873046875, + "learning_rate": 5.801505464817502e-10, + "logits/chosen": -1.171875, + "logits/rejected": -1.08984375, + "logps/chosen": -244.0, + "logps/rejected": -279.0, + "loss": 0.3066, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.61328125, + "rewards/margins": 2.6640625, + "rewards/rejected": -2.0546875, + "step": 886 + }, + { + "epoch": 0.9811946902654868, + "grad_norm": 13.964948654174805, + "learning_rate": 5.207097836519569e-10, + "logits/chosen": -1.2421875, + "logits/rejected": -1.1171875, + "logps/chosen": -249.5, + "logps/rejected": -287.0, + "loss": 0.3159, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.8046875, + "rewards/margins": 2.953125, + "rewards/rejected": -2.1484375, + "step": 887 + }, + { + "epoch": 0.9823008849557522, + "grad_norm": 13.418638229370117, + "learning_rate": 4.644776946165774e-10, + "logits/chosen": -1.2734375, + "logits/rejected": -1.1484375, + "logps/chosen": -246.0, + "logps/rejected": -253.5, + "loss": 0.3351, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.580078125, + "rewards/margins": 2.7734375, + "rewards/rejected": -2.1953125, + "step": 888 + }, + { + "epoch": 0.9834070796460177, + "grad_norm": 11.94414234161377, + "learning_rate": 4.114550026037278e-10, + "logits/chosen": -1.30078125, + "logits/rejected": -1.10546875, + "logps/chosen": -237.0, + "logps/rejected": -285.0, + "loss": 0.2559, + "rewards/accuracies": 0.8671875, + "rewards/chosen": 0.701171875, + "rewards/margins": 3.1484375, + "rewards/rejected": -2.453125, + "step": 889 + }, + { + "epoch": 0.9845132743362832, + "grad_norm": 14.505678176879883, + "learning_rate": 3.6164238956384876e-10, + "logits/chosen": -1.21484375, + "logits/rejected": -1.26953125, + "logps/chosen": -248.5, + "logps/rejected": -281.0, + "loss": 0.2998, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.876953125, + "rewards/margins": 2.875, + "rewards/rejected": -2.00390625, + "step": 890 + }, + { + "epoch": 0.9856194690265486, + "grad_norm": 12.155240058898926, + "learning_rate": 3.150404961611008e-10, + "logits/chosen": -1.234375, + "logits/rejected": -1.140625, + "logps/chosen": -240.0, + "logps/rejected": -276.0, + "loss": 0.2918, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.685546875, + "rewards/margins": 3.0078125, + "rewards/rejected": -2.3203125, + "step": 891 + }, + { + "epoch": 0.9867256637168141, + "grad_norm": 13.752731323242188, + "learning_rate": 2.716499217649271e-10, + "logits/chosen": -1.2109375, + "logits/rejected": -1.1640625, + "logps/chosen": -241.5, + "logps/rejected": -277.0, + "loss": 0.3461, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.638671875, + "rewards/margins": 2.6015625, + "rewards/rejected": -1.9609375, + "step": 892 + }, + { + "epoch": 0.9878318584070797, + "grad_norm": 14.712821960449219, + "learning_rate": 2.3147122444250323e-10, + "logits/chosen": -1.2265625, + "logits/rejected": -1.15234375, + "logps/chosen": -242.0, + "logps/rejected": -274.0, + "loss": 0.3957, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.4921875, + "rewards/margins": 2.5234375, + "rewards/rejected": -2.03125, + "step": 893 + }, + { + "epoch": 0.9889380530973452, + "grad_norm": 13.806950569152832, + "learning_rate": 1.9450492095149373e-10, + "logits/chosen": -1.27734375, + "logits/rejected": -1.06640625, + "logps/chosen": -251.0, + "logps/rejected": -282.0, + "loss": 0.3152, + "rewards/accuracies": 0.8203125, + "rewards/chosen": 0.84375, + "rewards/margins": 2.9921875, + "rewards/rejected": -2.1484375, + "step": 894 + }, + { + "epoch": 0.9900442477876106, + "grad_norm": 13.336440086364746, + "learning_rate": 1.607514867333626e-10, + "logits/chosen": -1.17578125, + "logits/rejected": -1.0546875, + "logps/chosen": -273.5, + "logps/rejected": -280.0, + "loss": 0.3012, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.646484375, + "rewards/margins": 2.7578125, + "rewards/rejected": -2.1015625, + "step": 895 + }, + { + "epoch": 0.9911504424778761, + "grad_norm": 14.591585159301758, + "learning_rate": 1.3021135590740583e-10, + "logits/chosen": -1.30078125, + "logits/rejected": -1.10546875, + "logps/chosen": -255.0, + "logps/rejected": -281.0, + "loss": 0.356, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.66015625, + "rewards/margins": 2.4921875, + "rewards/rejected": -1.83203125, + "step": 896 + }, + { + "epoch": 0.9922566371681416, + "grad_norm": 14.33768367767334, + "learning_rate": 1.028849212649785e-10, + "logits/chosen": -1.33203125, + "logits/rejected": -1.1875, + "logps/chosen": -272.0, + "logps/rejected": -264.0, + "loss": 0.3197, + "rewards/accuracies": 0.8046875, + "rewards/chosen": 0.716796875, + "rewards/margins": 2.765625, + "rewards/rejected": -2.046875, + "step": 897 + }, + { + "epoch": 0.9933628318584071, + "grad_norm": 14.789177894592285, + "learning_rate": 7.877253426458175e-11, + "logits/chosen": -1.2890625, + "logits/rejected": -1.125, + "logps/chosen": -253.5, + "logps/rejected": -296.0, + "loss": 0.3679, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.57421875, + "rewards/margins": 2.3984375, + "rewards/rejected": -1.82421875, + "step": 898 + }, + { + "epoch": 0.9944690265486725, + "grad_norm": 14.226619720458984, + "learning_rate": 5.7874505027283304e-11, + "logits/chosen": -1.265625, + "logits/rejected": -1.12890625, + "logps/chosen": -256.0, + "logps/rejected": -263.5, + "loss": 0.3177, + "rewards/accuracies": 0.8515625, + "rewards/chosen": 0.6015625, + "rewards/margins": 2.6796875, + "rewards/rejected": -2.0703125, + "step": 899 + }, + { + "epoch": 0.995575221238938, + "grad_norm": 14.491003036499023, + "learning_rate": 4.0191102332748364e-11, + "logits/chosen": -1.37109375, + "logits/rejected": -1.2421875, + "logps/chosen": -261.0, + "logps/rejected": -300.0, + "loss": 0.2955, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.892578125, + "rewards/margins": 2.9140625, + "rewards/rejected": -2.0234375, + "step": 900 + }, + { + "epoch": 0.995575221238938, + "eval_logits/chosen": -1.2664412260055542, + "eval_logits/rejected": -1.1579796075820923, + "eval_logps/chosen": -253.59701538085938, + "eval_logps/rejected": -280.3333435058594, + "eval_loss": 0.31904885172843933, + "eval_rewards/accuracies": 0.8163970708847046, + "eval_rewards/chosen": 0.7264896035194397, + "eval_rewards/margins": 2.841573476791382, + "eval_rewards/rejected": -2.1163711547851562, + "eval_runtime": 193.0253, + "eval_samples_per_second": 66.587, + "eval_steps_per_second": 1.041, + "step": 900 + }, + { + "epoch": 0.9966814159292036, + "grad_norm": 11.916271209716797, + "learning_rate": 2.5722553615770137e-11, + "logits/chosen": -1.31640625, + "logits/rejected": -1.11328125, + "logps/chosen": -246.0, + "logps/rejected": -269.5, + "loss": 0.2564, + "rewards/accuracies": 0.859375, + "rewards/chosen": 0.962890625, + "rewards/margins": 3.4609375, + "rewards/rejected": -2.4921875, + "step": 901 + }, + { + "epoch": 0.9977876106194691, + "grad_norm": 15.206621170043945, + "learning_rate": 1.4469044963355547e-11, + "logits/chosen": -1.2109375, + "logits/rejected": -1.08984375, + "logps/chosen": -250.5, + "logps/rejected": -298.0, + "loss": 0.3246, + "rewards/accuracies": 0.7890625, + "rewards/chosen": 0.677734375, + "rewards/margins": 2.6953125, + "rewards/rejected": -2.015625, + "step": 902 + }, + { + "epoch": 0.9988938053097345, + "grad_norm": 16.438447952270508, + "learning_rate": 6.430721112282711e-12, + "logits/chosen": -1.265625, + "logits/rejected": -1.19921875, + "logps/chosen": -255.5, + "logps/rejected": -294.0, + "loss": 0.4007, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.6181640625, + "rewards/margins": 2.5703125, + "rewards/rejected": -1.953125, + "step": 903 + }, + { + "epoch": 1.0, + "grad_norm": 13.59802532196045, + "learning_rate": 1.6076854473801027e-12, + "logits/chosen": -1.359375, + "logits/rejected": -1.16796875, + "logps/chosen": -269.0, + "logps/rejected": -293.0, + "loss": 0.2951, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6875, + "rewards/margins": 2.75, + "rewards/rejected": -2.0625, + "step": 904 + } + ], + "logging_steps": 1, + "max_steps": 904, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "train_dataloader_state_dict": null, + "trial_name": null, + "trial_params": null +}