diff --git "a/checkpoint-500/trainer_state.json" "b/checkpoint-500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-500/trainer_state.json" @@ -0,0 +1,9021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 54.7275276184082, + "learning_rate": 3.3333333333333334e-09, + "logps/chosen": -12.533590316772461, + "logps/rejected": -31.803932189941406, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 0.26103636622428894, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -12.533590316772461, + "ref_logps/rejected": -31.803932189941406, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 54.985111236572266, + "learning_rate": 6.666666666666667e-09, + "logps/chosen": -10.713068962097168, + "logps/rejected": -33.42286682128906, + "loss": 0.6931, + "losses/dpo": 0.6931471824645996, + "losses/sft": 0.2067307084798813, + "losses/total": 0.6931471824645996, + "ref_logps/chosen": -10.713068962097168, + "ref_logps/rejected": -33.42286682128906, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 63.79972839355469, + "learning_rate": 1e-08, + "logps/chosen": -18.444631576538086, + "logps/rejected": -40.561065673828125, + "loss": 0.6898, + "losses/dpo": 0.6951494216918945, + "losses/sft": 0.3127816319465637, + "losses/total": 0.6951494216918945, + "ref_logps/chosen": -18.396644592285156, + "ref_logps/rejected": -40.443275451660156, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004798633046448231, + "rewards/margins": 0.006980050355195999, + "rewards/rejected": -0.011778682470321655, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 62.5953254699707, + "learning_rate": 1.3333333333333334e-08, + "logps/chosen": -14.227697372436523, + "logps/rejected": -48.33661651611328, + "loss": 0.6971, + "losses/dpo": 0.6941574811935425, + "losses/sft": 0.3062475621700287, + "losses/total": 0.6941574811935425, + "ref_logps/chosen": -14.194951057434082, + "ref_logps/rejected": -48.378318786621094, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.003274601884186268, + "rewards/margins": -0.007445037364959717, + "rewards/rejected": 0.004170434549450874, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 55.13102722167969, + "learning_rate": 1.6666666666666667e-08, + "logps/chosen": -13.59489631652832, + "logps/rejected": -30.49202537536621, + "loss": 0.6868, + "losses/dpo": 0.681822657585144, + "losses/sft": 0.2918586730957031, + "losses/total": 0.681822657585144, + "ref_logps/chosen": -13.620017051696777, + "ref_logps/rejected": -30.386978149414062, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.002512148581445217, + "rewards/margins": 0.01301683858036995, + "rewards/rejected": -0.010504689998924732, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 63.41639709472656, + "learning_rate": 2e-08, + "logps/chosen": -21.938560485839844, + "logps/rejected": -45.47388458251953, + "loss": 0.6968, + "losses/dpo": 0.7057414054870605, + "losses/sft": 0.23792970180511475, + "losses/total": 0.7057414054870605, + "ref_logps/chosen": -21.881378173828125, + "ref_logps/rejected": -45.48707580566406, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0057183862663805485, + "rewards/margins": -0.007037466391921043, + "rewards/rejected": 0.0013190805912017822, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 78.54264068603516, + "learning_rate": 2.3333333333333334e-08, + "logps/chosen": -18.155752182006836, + "logps/rejected": -56.19256591796875, + "loss": 0.6961, + "losses/dpo": 0.6858953237533569, + "losses/sft": 0.36166518926620483, + "losses/total": 0.6858953237533569, + "ref_logps/chosen": -18.145042419433594, + "ref_logps/rejected": -56.23862838745117, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.001071083708666265, + "rewards/margins": -0.005677402019500732, + "rewards/rejected": 0.004606318660080433, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 67.39810180664062, + "learning_rate": 2.6666666666666667e-08, + "logps/chosen": -18.465206146240234, + "logps/rejected": -49.916664123535156, + "loss": 0.6982, + "losses/dpo": 0.6833201050758362, + "losses/sft": 0.32572177052497864, + "losses/total": 0.6833201050758362, + "ref_logps/chosen": -18.501081466674805, + "ref_logps/rejected": -50.047977447509766, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0035875202156603336, + "rewards/margins": -0.009543540887534618, + "rewards/rejected": 0.013131062500178814, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 53.30474090576172, + "learning_rate": 3e-08, + "logps/chosen": -11.304487228393555, + "logps/rejected": -35.22385787963867, + "loss": 0.6868, + "losses/dpo": 0.6948688626289368, + "losses/sft": 0.2951069474220276, + "losses/total": 0.6948688626289368, + "ref_logps/chosen": -11.316177368164062, + "ref_logps/rejected": -35.10341262817383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0011690347455441952, + "rewards/margins": 0.013213572092354298, + "rewards/rejected": -0.01204453781247139, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 62.466922760009766, + "learning_rate": 3.3333333333333334e-08, + "logps/chosen": -18.549427032470703, + "logps/rejected": -45.951873779296875, + "loss": 0.6818, + "losses/dpo": 0.6670930981636047, + "losses/sft": 0.2927955687046051, + "losses/total": 0.6670930981636047, + "ref_logps/chosen": -18.618999481201172, + "ref_logps/rejected": -45.7880973815918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006957197096198797, + "rewards/margins": 0.023334600031375885, + "rewards/rejected": -0.01637740060687065, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 52.64626693725586, + "learning_rate": 3.6666666666666664e-08, + "logps/chosen": -11.861335754394531, + "logps/rejected": -42.37664794921875, + "loss": 0.6953, + "losses/dpo": 0.6928779482841492, + "losses/sft": 0.2991476058959961, + "losses/total": 0.6928779482841492, + "ref_logps/chosen": -11.865274429321289, + "ref_logps/rejected": -42.42080307006836, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0003938704030588269, + "rewards/margins": -0.004021647851914167, + "rewards/rejected": 0.004415517672896385, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 60.31581497192383, + "learning_rate": 4e-08, + "logps/chosen": -10.655393600463867, + "logps/rejected": -45.220428466796875, + "loss": 0.6907, + "losses/dpo": 0.6967537999153137, + "losses/sft": 0.32602399587631226, + "losses/total": 0.6967537999153137, + "ref_logps/chosen": -10.646404266357422, + "ref_logps/rejected": -45.16096878051758, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0008988277986645699, + "rewards/margins": 0.005047045648097992, + "rewards/rejected": -0.005945873446762562, + "step": 12 + }, + { + "epoch": 0.03, + "grad_norm": 67.63394927978516, + "learning_rate": 4.333333333333333e-08, + "logps/chosen": -14.832448959350586, + "logps/rejected": -42.32299041748047, + "loss": 0.6898, + "losses/dpo": 0.686829149723053, + "losses/sft": 0.2940187454223633, + "losses/total": 0.686829149723053, + "ref_logps/chosen": -14.852863311767578, + "ref_logps/rejected": -42.27099609375, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0020414486061781645, + "rewards/margins": 0.007241221610456705, + "rewards/rejected": -0.005199772771447897, + "step": 13 + }, + { + "epoch": 0.03, + "grad_norm": 68.29341125488281, + "learning_rate": 4.666666666666667e-08, + "logps/chosen": -11.122224807739258, + "logps/rejected": -44.558685302734375, + "loss": 0.6936, + "losses/dpo": 0.6977905035018921, + "losses/sft": 0.3477708101272583, + "losses/total": 0.6977905035018921, + "ref_logps/chosen": -11.04557991027832, + "ref_logps/rejected": -44.489044189453125, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007664448581635952, + "rewards/margins": -0.0007005957886576653, + "rewards/rejected": -0.006963852792978287, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 64.83494567871094, + "learning_rate": 5e-08, + "logps/chosen": -20.65314483642578, + "logps/rejected": -48.80792236328125, + "loss": 0.6906, + "losses/dpo": 0.6923655867576599, + "losses/sft": 0.3701089918613434, + "losses/total": 0.6923655867576599, + "ref_logps/chosen": -20.716949462890625, + "ref_logps/rejected": -48.81908416748047, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006380443461239338, + "rewards/margins": 0.00526385335251689, + "rewards/rejected": 0.0011165902251377702, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 66.92870330810547, + "learning_rate": 5.3333333333333334e-08, + "logps/chosen": -11.5431489944458, + "logps/rejected": -49.763465881347656, + "loss": 0.6928, + "losses/dpo": 0.6806471347808838, + "losses/sft": 0.26024022698402405, + "losses/total": 0.6806471347808838, + "ref_logps/chosen": -11.589057922363281, + "ref_logps/rejected": -49.79829406738281, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004590884782373905, + "rewards/margins": 0.0011078307870775461, + "rewards/rejected": 0.0034830542281270027, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 71.3647689819336, + "learning_rate": 5.666666666666666e-08, + "logps/chosen": -18.725505828857422, + "logps/rejected": -53.86628341674805, + "loss": 0.6869, + "losses/dpo": 0.6923279762268066, + "losses/sft": 0.3113042712211609, + "losses/total": 0.6923279762268066, + "ref_logps/chosen": -18.76999282836914, + "ref_logps/rejected": -53.78404998779297, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0044487896375358105, + "rewards/margins": 0.012671994976699352, + "rewards/rejected": -0.008223205804824829, + "step": 17 + }, + { + "epoch": 0.04, + "grad_norm": 54.42089080810547, + "learning_rate": 6e-08, + "logps/chosen": -15.599297523498535, + "logps/rejected": -35.98277282714844, + "loss": 0.686, + "losses/dpo": 0.6924772262573242, + "losses/sft": 0.28760266304016113, + "losses/total": 0.6924772262573242, + "ref_logps/chosen": -15.625329971313477, + "ref_logps/rejected": -35.86233139038086, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.002603264059871435, + "rewards/margins": 0.014647157862782478, + "rewards/rejected": -0.012043893337249756, + "step": 18 + }, + { + "epoch": 0.04, + "grad_norm": 84.1619644165039, + "learning_rate": 6.333333333333333e-08, + "logps/chosen": -19.605751037597656, + "logps/rejected": -62.37677764892578, + "loss": 0.686, + "losses/dpo": 0.689681351184845, + "losses/sft": 0.29873067140579224, + "losses/total": 0.689681351184845, + "ref_logps/chosen": -19.670486450195312, + "ref_logps/rejected": -62.29419708251953, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.006473721005022526, + "rewards/margins": 0.014731885865330696, + "rewards/rejected": -0.008258162997663021, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 67.69758605957031, + "learning_rate": 6.666666666666667e-08, + "logps/chosen": -12.570611953735352, + "logps/rejected": -56.44734191894531, + "loss": 0.6914, + "losses/dpo": 0.6809213161468506, + "losses/sft": 0.17609833180904388, + "losses/total": 0.6809213161468506, + "ref_logps/chosen": -12.528483390808105, + "ref_logps/rejected": -56.36581039428711, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004212804604321718, + "rewards/margins": 0.003940396010875702, + "rewards/rejected": -0.008153200149536133, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 59.63766860961914, + "learning_rate": 7e-08, + "logps/chosen": -12.914083480834961, + "logps/rejected": -40.84098815917969, + "loss": 0.6877, + "losses/dpo": 0.6884068250656128, + "losses/sft": 0.2242521047592163, + "losses/total": 0.6884068250656128, + "ref_logps/chosen": -12.980051040649414, + "ref_logps/rejected": -40.794857025146484, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.006596784107387066, + "rewards/margins": 0.011209950782358646, + "rewards/rejected": -0.0046131666749715805, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 56.689208984375, + "learning_rate": 7.333333333333333e-08, + "logps/chosen": -13.277399063110352, + "logps/rejected": -42.286441802978516, + "loss": 0.6838, + "losses/dpo": 0.6860532760620117, + "losses/sft": 0.2654157280921936, + "losses/total": 0.6860532760620117, + "ref_logps/chosen": -13.287601470947266, + "ref_logps/rejected": -42.10773468017578, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0010203286074101925, + "rewards/margins": 0.018891172483563423, + "rewards/rejected": -0.017870843410491943, + "step": 22 + }, + { + "epoch": 0.05, + "grad_norm": 56.51724624633789, + "learning_rate": 7.666666666666665e-08, + "logps/chosen": -14.184118270874023, + "logps/rejected": -38.65294647216797, + "loss": 0.6937, + "losses/dpo": 0.7033101320266724, + "losses/sft": 0.22815537452697754, + "losses/total": 0.7033101320266724, + "ref_logps/chosen": -14.229755401611328, + "ref_logps/rejected": -38.708251953125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004563881549984217, + "rewards/margins": -0.0009667248232290149, + "rewards/rejected": 0.005530606489628553, + "step": 23 + }, + { + "epoch": 0.05, + "grad_norm": 54.5797119140625, + "learning_rate": 8e-08, + "logps/chosen": -13.569002151489258, + "logps/rejected": -36.74486541748047, + "loss": 0.6955, + "losses/dpo": 0.6941298246383667, + "losses/sft": 0.2619819641113281, + "losses/total": 0.6941298246383667, + "ref_logps/chosen": -13.519054412841797, + "ref_logps/rejected": -36.73900604248047, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.004994727671146393, + "rewards/margins": -0.004408624954521656, + "rewards/rejected": -0.0005861027166247368, + "step": 24 + }, + { + "epoch": 0.05, + "grad_norm": 60.37660217285156, + "learning_rate": 8.333333333333333e-08, + "logps/chosen": -11.180171012878418, + "logps/rejected": -45.01734161376953, + "loss": 0.6862, + "losses/dpo": 0.6880084872245789, + "losses/sft": 0.2977868318557739, + "losses/total": 0.6880084872245789, + "ref_logps/chosen": -11.221393585205078, + "ref_logps/rejected": -44.918087005615234, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.004122227430343628, + "rewards/margins": 0.014047539792954922, + "rewards/rejected": -0.009925312362611294, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 70.93061828613281, + "learning_rate": 8.666666666666666e-08, + "logps/chosen": -18.109363555908203, + "logps/rejected": -61.71974563598633, + "loss": 0.6808, + "losses/dpo": 0.6811847686767578, + "losses/sft": 0.24314402043819427, + "losses/total": 0.6811847686767578, + "ref_logps/chosen": -18.21930503845215, + "ref_logps/rejected": -61.57814025878906, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010994033887982368, + "rewards/margins": 0.025154881179332733, + "rewards/rejected": -0.01416084822267294, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 69.38543701171875, + "learning_rate": 9e-08, + "logps/chosen": -15.200861930847168, + "logps/rejected": -43.106510162353516, + "loss": 0.6763, + "losses/dpo": 0.6658411026000977, + "losses/sft": 0.25579920411109924, + "losses/total": 0.6658411026000977, + "ref_logps/chosen": -15.397557258605957, + "ref_logps/rejected": -42.95894241333008, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.019669612869620323, + "rewards/margins": 0.0344264879822731, + "rewards/rejected": -0.01475687325000763, + "step": 27 + }, + { + "epoch": 0.06, + "grad_norm": 49.01519012451172, + "learning_rate": 9.333333333333334e-08, + "logps/chosen": -10.879910469055176, + "logps/rejected": -34.23360061645508, + "loss": 0.6912, + "losses/dpo": 0.704246997833252, + "losses/sft": 0.2846185266971588, + "losses/total": 0.704246997833252, + "ref_logps/chosen": -10.923196792602539, + "ref_logps/rejected": -34.23560333251953, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0043286713771522045, + "rewards/margins": 0.004128447733819485, + "rewards/rejected": 0.00020022434182465076, + "step": 28 + }, + { + "epoch": 0.06, + "grad_norm": 55.75774002075195, + "learning_rate": 9.666666666666666e-08, + "logps/chosen": -13.540484428405762, + "logps/rejected": -37.22199249267578, + "loss": 0.6721, + "losses/dpo": 0.670184850692749, + "losses/sft": 0.24113543331623077, + "losses/total": 0.670184850692749, + "ref_logps/chosen": -13.689138412475586, + "ref_logps/rejected": -36.942596435546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.014865398406982422, + "rewards/margins": 0.042804695665836334, + "rewards/rejected": -0.027939295396208763, + "step": 29 + }, + { + "epoch": 0.06, + "grad_norm": 65.36017608642578, + "learning_rate": 1e-07, + "logps/chosen": -16.502321243286133, + "logps/rejected": -63.630531311035156, + "loss": 0.664, + "losses/dpo": 0.6740528345108032, + "losses/sft": 0.2862235903739929, + "losses/total": 0.6740528345108032, + "ref_logps/chosen": -16.616275787353516, + "ref_logps/rejected": -63.14226531982422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01139531098306179, + "rewards/margins": 0.060221925377845764, + "rewards/rejected": -0.048826612532138824, + "step": 30 + }, + { + "epoch": 0.06, + "grad_norm": 66.14002227783203, + "learning_rate": 1.0333333333333333e-07, + "logps/chosen": -11.186877250671387, + "logps/rejected": -48.64232635498047, + "loss": 0.6673, + "losses/dpo": 0.6822100281715393, + "losses/sft": 0.2335319221019745, + "losses/total": 0.6822100281715393, + "ref_logps/chosen": -11.297755241394043, + "ref_logps/rejected": -48.220863342285156, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.01108776405453682, + "rewards/margins": 0.05323418974876404, + "rewards/rejected": -0.04214642941951752, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 64.11575317382812, + "learning_rate": 1.0666666666666667e-07, + "logps/chosen": -14.923648834228516, + "logps/rejected": -47.85265350341797, + "loss": 0.6695, + "losses/dpo": 0.6915856599807739, + "losses/sft": 0.2507500648498535, + "losses/total": 0.6915856599807739, + "ref_logps/chosen": -15.00050163269043, + "ref_logps/rejected": -47.44062042236328, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0076853265054523945, + "rewards/margins": 0.04888825863599777, + "rewards/rejected": -0.04120292887091637, + "step": 32 + }, + { + "epoch": 0.07, + "grad_norm": 67.46519470214844, + "learning_rate": 1.0999999999999999e-07, + "logps/chosen": -15.70111083984375, + "logps/rejected": -51.965126037597656, + "loss": 0.6681, + "losses/dpo": 0.6695447564125061, + "losses/sft": 0.30792540311813354, + "losses/total": 0.6695447564125061, + "ref_logps/chosen": -15.807563781738281, + "ref_logps/rejected": -51.55677032470703, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.010645204223692417, + "rewards/margins": 0.05148132145404816, + "rewards/rejected": -0.040836118161678314, + "step": 33 + }, + { + "epoch": 0.07, + "grad_norm": 50.910823822021484, + "learning_rate": 1.1333333333333332e-07, + "logps/chosen": -11.442159652709961, + "logps/rejected": -32.383033752441406, + "loss": 0.6783, + "losses/dpo": 0.6662068367004395, + "losses/sft": 0.30792683362960815, + "losses/total": 0.6662068367004395, + "ref_logps/chosen": -11.554034233093262, + "ref_logps/rejected": -32.18949890136719, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.011187402531504631, + "rewards/margins": 0.03054075315594673, + "rewards/rejected": -0.01935334876179695, + "step": 34 + }, + { + "epoch": 0.07, + "grad_norm": 53.7436637878418, + "learning_rate": 1.1666666666666667e-07, + "logps/chosen": -14.998608589172363, + "logps/rejected": -31.25254249572754, + "loss": 0.6719, + "losses/dpo": 0.6867252588272095, + "losses/sft": 0.2634373605251312, + "losses/total": 0.6867252588272095, + "ref_logps/chosen": -15.171464920043945, + "ref_logps/rejected": -30.990324020385742, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.017285751178860664, + "rewards/margins": 0.04350760579109192, + "rewards/rejected": -0.026221856474876404, + "step": 35 + }, + { + "epoch": 0.07, + "grad_norm": 50.0301513671875, + "learning_rate": 1.2e-07, + "logps/chosen": -10.806026458740234, + "logps/rejected": -37.376033782958984, + "loss": 0.6672, + "losses/dpo": 0.6568068265914917, + "losses/sft": 0.25919607281684875, + "losses/total": 0.6568068265914917, + "ref_logps/chosen": -10.89212417602539, + "ref_logps/rejected": -36.92989730834961, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008609759621322155, + "rewards/margins": 0.05322342365980148, + "rewards/rejected": -0.044613663107156754, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 62.99734115600586, + "learning_rate": 1.2333333333333333e-07, + "logps/chosen": -9.172164916992188, + "logps/rejected": -48.006935119628906, + "loss": 0.6505, + "losses/dpo": 0.6659780144691467, + "losses/sft": 0.19942894577980042, + "losses/total": 0.6659780144691467, + "ref_logps/chosen": -9.280783653259277, + "ref_logps/rejected": -47.235198974609375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010861923918128014, + "rewards/margins": 0.08803565055131912, + "rewards/rejected": -0.07717373222112656, + "step": 37 + }, + { + "epoch": 0.08, + "grad_norm": 69.74348449707031, + "learning_rate": 1.2666666666666666e-07, + "logps/chosen": -20.615678787231445, + "logps/rejected": -54.96332931518555, + "loss": 0.6384, + "losses/dpo": 0.6584606766700745, + "losses/sft": 0.2419307678937912, + "losses/total": 0.6584606766700745, + "ref_logps/chosen": -21.108688354492188, + "ref_logps/rejected": -54.317012786865234, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04930093139410019, + "rewards/margins": 0.11393265426158905, + "rewards/rejected": -0.06463172286748886, + "step": 38 + }, + { + "epoch": 0.08, + "grad_norm": 72.3482894897461, + "learning_rate": 1.3e-07, + "logps/chosen": -13.5218505859375, + "logps/rejected": -59.38899230957031, + "loss": 0.612, + "losses/dpo": 0.6105036735534668, + "losses/sft": 0.34414657950401306, + "losses/total": 0.6105036735534668, + "ref_logps/chosen": -13.771303176879883, + "ref_logps/rejected": -57.91090393066406, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.024945255368947983, + "rewards/margins": 0.17275384068489075, + "rewards/rejected": -0.14780858159065247, + "step": 39 + }, + { + "epoch": 0.08, + "grad_norm": 66.04383087158203, + "learning_rate": 1.3333333333333334e-07, + "logps/chosen": -16.882469177246094, + "logps/rejected": -59.21009826660156, + "loss": 0.6179, + "losses/dpo": 0.6421835422515869, + "losses/sft": 0.3301094174385071, + "losses/total": 0.6421835422515869, + "ref_logps/chosen": -17.26021385192871, + "ref_logps/rejected": -57.98883056640625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.037774428725242615, + "rewards/margins": 0.15990111231803894, + "rewards/rejected": -0.12212669849395752, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 67.81195068359375, + "learning_rate": 1.3666666666666665e-07, + "logps/chosen": -14.069759368896484, + "logps/rejected": -53.27463912963867, + "loss": 0.6137, + "losses/dpo": 0.6028338670730591, + "losses/sft": 0.26799964904785156, + "losses/total": 0.6028338670730591, + "ref_logps/chosen": -14.432823181152344, + "ref_logps/rejected": -51.95939636230469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03630626201629639, + "rewards/margins": 0.16783034801483154, + "rewards/rejected": -0.13152408599853516, + "step": 41 + }, + { + "epoch": 0.08, + "grad_norm": 60.39308166503906, + "learning_rate": 1.4e-07, + "logps/chosen": -16.07038116455078, + "logps/rejected": -51.06721115112305, + "loss": 0.624, + "losses/dpo": 0.6596254110336304, + "losses/sft": 0.2904399037361145, + "losses/total": 0.6596254110336304, + "ref_logps/chosen": -16.394489288330078, + "ref_logps/rejected": -49.92795181274414, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03241092711687088, + "rewards/margins": 0.14633695781230927, + "rewards/rejected": -0.11392602324485779, + "step": 42 + }, + { + "epoch": 0.09, + "grad_norm": 55.89990997314453, + "learning_rate": 1.4333333333333335e-07, + "logps/chosen": -17.65834617614746, + "logps/rejected": -49.329383850097656, + "loss": 0.6355, + "losses/dpo": 0.6224067211151123, + "losses/sft": 0.2809451222419739, + "losses/total": 0.6224067211151123, + "ref_logps/chosen": -17.987916946411133, + "ref_logps/rejected": -48.46190643310547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03295706957578659, + "rewards/margins": 0.11970466375350952, + "rewards/rejected": -0.08674759417772293, + "step": 43 + }, + { + "epoch": 0.09, + "grad_norm": 51.306312561035156, + "learning_rate": 1.4666666666666666e-07, + "logps/chosen": -10.183408737182617, + "logps/rejected": -32.69266128540039, + "loss": 0.6226, + "losses/dpo": 0.5917978286743164, + "losses/sft": 0.25666502118110657, + "losses/total": 0.5917978286743164, + "ref_logps/chosen": -10.467521667480469, + "ref_logps/rejected": -31.477275848388672, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.028411362320184708, + "rewards/margins": 0.1499500572681427, + "rewards/rejected": -0.12153870612382889, + "step": 44 + }, + { + "epoch": 0.09, + "grad_norm": 66.90591430664062, + "learning_rate": 1.5e-07, + "logps/chosen": -14.379773139953613, + "logps/rejected": -65.27030944824219, + "loss": 0.5908, + "losses/dpo": 0.5828587412834167, + "losses/sft": 0.2970637381076813, + "losses/total": 0.5828587412834167, + "ref_logps/chosen": -14.763320922851562, + "ref_logps/rejected": -63.37145233154297, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03835476189851761, + "rewards/margins": 0.22824007272720337, + "rewards/rejected": -0.18988531827926636, + "step": 45 + }, + { + "epoch": 0.09, + "grad_norm": 52.453800201416016, + "learning_rate": 1.533333333333333e-07, + "logps/chosen": -13.01441764831543, + "logps/rejected": -42.7321662902832, + "loss": 0.6133, + "losses/dpo": 0.6588992476463318, + "losses/sft": 0.3094305992126465, + "losses/total": 0.6588992476463318, + "ref_logps/chosen": -13.52122688293457, + "ref_logps/rejected": -41.48785400390625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05068095400929451, + "rewards/margins": 0.17511233687400818, + "rewards/rejected": -0.12443137168884277, + "step": 46 + }, + { + "epoch": 0.09, + "grad_norm": 60.28200149536133, + "learning_rate": 1.5666666666666667e-07, + "logps/chosen": -11.062196731567383, + "logps/rejected": -43.15785217285156, + "loss": 0.5899, + "losses/dpo": 0.6494508385658264, + "losses/sft": 0.28766411542892456, + "losses/total": 0.6494508385658264, + "ref_logps/chosen": -11.557550430297852, + "ref_logps/rejected": -41.409461975097656, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04953545704483986, + "rewards/margins": 0.22437459230422974, + "rewards/rejected": -0.17483913898468018, + "step": 47 + }, + { + "epoch": 0.1, + "grad_norm": 44.662635803222656, + "learning_rate": 1.6e-07, + "logps/chosen": -13.756662368774414, + "logps/rejected": -31.407695770263672, + "loss": 0.6274, + "losses/dpo": 0.6444739103317261, + "losses/sft": 0.24791499972343445, + "losses/total": 0.6444739103317261, + "ref_logps/chosen": -14.214315414428711, + "ref_logps/rejected": -30.471614837646484, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04576535150408745, + "rewards/margins": 0.13937321305274963, + "rewards/rejected": -0.09360785037279129, + "step": 48 + }, + { + "epoch": 0.1, + "grad_norm": 51.2293701171875, + "learning_rate": 1.6333333333333331e-07, + "logps/chosen": -12.181818008422852, + "logps/rejected": -39.64811706542969, + "loss": 0.6029, + "losses/dpo": 0.5977880954742432, + "losses/sft": 0.24987655878067017, + "losses/total": 0.5977880954742432, + "ref_logps/chosen": -12.48669719696045, + "ref_logps/rejected": -37.98247528076172, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.030487842857837677, + "rewards/margins": 0.19705218076705933, + "rewards/rejected": -0.16656433045864105, + "step": 49 + }, + { + "epoch": 0.1, + "grad_norm": 61.242591857910156, + "learning_rate": 1.6666666666666665e-07, + "logps/chosen": -17.916460037231445, + "logps/rejected": -51.87417984008789, + "loss": 0.5833, + "losses/dpo": 0.5518717765808105, + "losses/sft": 0.334248423576355, + "losses/total": 0.5518717765808105, + "ref_logps/chosen": -18.505146026611328, + "ref_logps/rejected": -50.02173614501953, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.058868564665317535, + "rewards/margins": 0.24411310255527496, + "rewards/rejected": -0.18524454534053802, + "step": 50 + }, + { + "epoch": 0.1, + "grad_norm": 54.873069763183594, + "learning_rate": 1.7000000000000001e-07, + "logps/chosen": -15.96760082244873, + "logps/rejected": -45.260826110839844, + "loss": 0.5708, + "losses/dpo": 0.5711266994476318, + "losses/sft": 0.2405555695295334, + "losses/total": 0.5711266994476318, + "ref_logps/chosen": -16.65478515625, + "ref_logps/rejected": -43.300804138183594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06871844083070755, + "rewards/margins": 0.2647208869457245, + "rewards/rejected": -0.19600245356559753, + "step": 51 + }, + { + "epoch": 0.1, + "grad_norm": 61.774105072021484, + "learning_rate": 1.7333333333333332e-07, + "logps/chosen": -15.063488006591797, + "logps/rejected": -52.22322082519531, + "loss": 0.5067, + "losses/dpo": 0.5486783981323242, + "losses/sft": 0.2762540280818939, + "losses/total": 0.5486783981323242, + "ref_logps/chosen": -15.942670822143555, + "ref_logps/rejected": -48.783878326416016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08791828900575638, + "rewards/margins": 0.43185263872146606, + "rewards/rejected": -0.3439343571662903, + "step": 52 + }, + { + "epoch": 0.11, + "grad_norm": 51.12025833129883, + "learning_rate": 1.7666666666666666e-07, + "logps/chosen": -12.447774887084961, + "logps/rejected": -49.730525970458984, + "loss": 0.5292, + "losses/dpo": 0.5261654853820801, + "losses/sft": 0.3010826110839844, + "losses/total": 0.5261654853820801, + "ref_logps/chosen": -12.847427368164062, + "ref_logps/rejected": -46.31183624267578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.039965298026800156, + "rewards/margins": 0.38183388113975525, + "rewards/rejected": -0.3418685793876648, + "step": 53 + }, + { + "epoch": 0.11, + "grad_norm": 56.890716552734375, + "learning_rate": 1.8e-07, + "logps/chosen": -15.07343864440918, + "logps/rejected": -47.3963508605957, + "loss": 0.5003, + "losses/dpo": 0.4316698908805847, + "losses/sft": 0.25664612650871277, + "losses/total": 0.4316698908805847, + "ref_logps/chosen": -15.735147476196289, + "ref_logps/rejected": -43.51297378540039, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06617091596126556, + "rewards/margins": 0.45450854301452637, + "rewards/rejected": -0.3883376121520996, + "step": 54 + }, + { + "epoch": 0.11, + "grad_norm": 47.778656005859375, + "learning_rate": 1.833333333333333e-07, + "logps/chosen": -12.364097595214844, + "logps/rejected": -40.489898681640625, + "loss": 0.5156, + "losses/dpo": 0.5084203481674194, + "losses/sft": 0.19767522811889648, + "losses/total": 0.5084203481674194, + "ref_logps/chosen": -12.910322189331055, + "ref_logps/rejected": -36.8597412109375, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05462249368429184, + "rewards/margins": 0.41763830184936523, + "rewards/rejected": -0.3630157709121704, + "step": 55 + }, + { + "epoch": 0.11, + "grad_norm": 48.53829574584961, + "learning_rate": 1.8666666666666667e-07, + "logps/chosen": -15.324739456176758, + "logps/rejected": -55.346656799316406, + "loss": 0.4816, + "losses/dpo": 0.4806634485721588, + "losses/sft": 0.31916913390159607, + "losses/total": 0.4806634485721588, + "ref_logps/chosen": -16.457077026367188, + "ref_logps/rejected": -51.29027557373047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11323380470275879, + "rewards/margins": 0.5188711881637573, + "rewards/rejected": -0.4056374132633209, + "step": 56 + }, + { + "epoch": 0.11, + "grad_norm": 50.436275482177734, + "learning_rate": 1.8999999999999998e-07, + "logps/chosen": -18.092538833618164, + "logps/rejected": -51.386226654052734, + "loss": 0.4638, + "losses/dpo": 0.48723289370536804, + "losses/sft": 0.24147015810012817, + "losses/total": 0.48723289370536804, + "ref_logps/chosen": -19.495101928710938, + "ref_logps/rejected": -47.26172637939453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1402563750743866, + "rewards/margins": 0.5527061223983765, + "rewards/rejected": -0.41244977712631226, + "step": 57 + }, + { + "epoch": 0.12, + "grad_norm": 49.289188385009766, + "learning_rate": 1.9333333333333332e-07, + "logps/chosen": -19.1153621673584, + "logps/rejected": -59.98572540283203, + "loss": 0.4424, + "losses/dpo": 0.49887678027153015, + "losses/sft": 0.30951380729675293, + "losses/total": 0.49887678027153015, + "ref_logps/chosen": -20.371097564697266, + "ref_logps/rejected": -54.775394439697266, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12557338178157806, + "rewards/margins": 0.6466065049171448, + "rewards/rejected": -0.5210331082344055, + "step": 58 + }, + { + "epoch": 0.12, + "grad_norm": 45.76652526855469, + "learning_rate": 1.9666666666666665e-07, + "logps/chosen": -13.932234764099121, + "logps/rejected": -38.268592834472656, + "loss": 0.5087, + "losses/dpo": 0.42856013774871826, + "losses/sft": 0.2895386219024658, + "losses/total": 0.42856013774871826, + "ref_logps/chosen": -14.634007453918457, + "ref_logps/rejected": -34.70841979980469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07017731666564941, + "rewards/margins": 0.42619505524635315, + "rewards/rejected": -0.3560177683830261, + "step": 59 + }, + { + "epoch": 0.12, + "grad_norm": 44.962379455566406, + "learning_rate": 2e-07, + "logps/chosen": -15.4169282913208, + "logps/rejected": -39.44453811645508, + "loss": 0.5122, + "losses/dpo": 0.4593808650970459, + "losses/sft": 0.29917794466018677, + "losses/total": 0.4593808650970459, + "ref_logps/chosen": -16.19096565246582, + "ref_logps/rejected": -35.855220794677734, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.07740387320518494, + "rewards/margins": 0.4363355338573456, + "rewards/rejected": -0.35893166065216064, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 41.11608123779297, + "learning_rate": 2.0333333333333333e-07, + "logps/chosen": -14.037020683288574, + "logps/rejected": -50.51776123046875, + "loss": 0.4495, + "losses/dpo": 0.4573870301246643, + "losses/sft": 0.3381502628326416, + "losses/total": 0.4573870301246643, + "ref_logps/chosen": -14.76598834991455, + "ref_logps/rejected": -44.90858840942383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07289674878120422, + "rewards/margins": 0.633813738822937, + "rewards/rejected": -0.5609170198440552, + "step": 61 + }, + { + "epoch": 0.12, + "grad_norm": 45.58005142211914, + "learning_rate": 2.0666666666666666e-07, + "logps/chosen": -18.84115982055664, + "logps/rejected": -54.45577621459961, + "loss": 0.4451, + "losses/dpo": 0.4381011128425598, + "losses/sft": 0.26262062788009644, + "losses/total": 0.4381011128425598, + "ref_logps/chosen": -20.094820022583008, + "ref_logps/rejected": -49.11144256591797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12536601722240448, + "rewards/margins": 0.6597994565963745, + "rewards/rejected": -0.5344335436820984, + "step": 62 + }, + { + "epoch": 0.13, + "grad_norm": 37.02690505981445, + "learning_rate": 2.0999999999999997e-07, + "logps/chosen": -13.949782371520996, + "logps/rejected": -37.66564178466797, + "loss": 0.5103, + "losses/dpo": 0.4280434548854828, + "losses/sft": 0.24366047978401184, + "losses/total": 0.4280434548854828, + "ref_logps/chosen": -15.224845886230469, + "ref_logps/rejected": -34.20655059814453, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1275063157081604, + "rewards/margins": 0.4734152853488922, + "rewards/rejected": -0.3459089994430542, + "step": 63 + }, + { + "epoch": 0.13, + "grad_norm": 44.19191360473633, + "learning_rate": 2.1333333333333334e-07, + "logps/chosen": -9.342233657836914, + "logps/rejected": -40.536136627197266, + "loss": 0.4482, + "losses/dpo": 0.5243103504180908, + "losses/sft": 0.1963924765586853, + "losses/total": 0.5243103504180908, + "ref_logps/chosen": -9.877543449401855, + "ref_logps/rejected": -34.941558837890625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05353102833032608, + "rewards/margins": 0.6129887104034424, + "rewards/rejected": -0.5594576597213745, + "step": 64 + }, + { + "epoch": 0.13, + "grad_norm": 43.218048095703125, + "learning_rate": 2.1666666666666667e-07, + "logps/chosen": -24.74606704711914, + "logps/rejected": -65.12860107421875, + "loss": 0.4176, + "losses/dpo": 0.537351667881012, + "losses/sft": 0.13678902387619019, + "losses/total": 0.537351667881012, + "ref_logps/chosen": -26.11852264404297, + "ref_logps/rejected": -58.860374450683594, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13724543154239655, + "rewards/margins": 0.7640678286552429, + "rewards/rejected": -0.6268223524093628, + "step": 65 + }, + { + "epoch": 0.13, + "grad_norm": 39.85637283325195, + "learning_rate": 2.1999999999999998e-07, + "logps/chosen": -14.346301078796387, + "logps/rejected": -39.72654724121094, + "loss": 0.4504, + "losses/dpo": 0.5213490724563599, + "losses/sft": 0.2660483121871948, + "losses/total": 0.5213490724563599, + "ref_logps/chosen": -15.04425048828125, + "ref_logps/rejected": -33.927894592285156, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06979489326477051, + "rewards/margins": 0.6496601700782776, + "rewards/rejected": -0.5798653364181519, + "step": 66 + }, + { + "epoch": 0.13, + "grad_norm": 38.80109786987305, + "learning_rate": 2.2333333333333332e-07, + "logps/chosen": -11.50251293182373, + "logps/rejected": -40.8316535949707, + "loss": 0.4839, + "losses/dpo": 0.4850131869316101, + "losses/sft": 0.22737279534339905, + "losses/total": 0.4850131869316101, + "ref_logps/chosen": -11.721717834472656, + "ref_logps/rejected": -35.50334930419922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.021920526400208473, + "rewards/margins": 0.554750919342041, + "rewards/rejected": -0.5328303575515747, + "step": 67 + }, + { + "epoch": 0.14, + "grad_norm": 41.928707122802734, + "learning_rate": 2.2666666666666663e-07, + "logps/chosen": -15.948335647583008, + "logps/rejected": -58.71236801147461, + "loss": 0.3387, + "losses/dpo": 0.4127328097820282, + "losses/sft": 0.2764541506767273, + "losses/total": 0.4127328097820282, + "ref_logps/chosen": -17.22922134399414, + "ref_logps/rejected": -50.34320831298828, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12808847427368164, + "rewards/margins": 0.9650048613548279, + "rewards/rejected": -0.8369163870811462, + "step": 68 + }, + { + "epoch": 0.14, + "grad_norm": 37.16038131713867, + "learning_rate": 2.3e-07, + "logps/chosen": -13.122339248657227, + "logps/rejected": -48.92229461669922, + "loss": 0.4026, + "losses/dpo": 0.36682942509651184, + "losses/sft": 0.24337750673294067, + "losses/total": 0.36682942509651184, + "ref_logps/chosen": -13.790239334106445, + "ref_logps/rejected": -41.1927490234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.06679011881351471, + "rewards/margins": 0.8397451639175415, + "rewards/rejected": -0.772955060005188, + "step": 69 + }, + { + "epoch": 0.14, + "grad_norm": 43.80226135253906, + "learning_rate": 2.3333333333333333e-07, + "logps/chosen": -16.744388580322266, + "logps/rejected": -63.02336883544922, + "loss": 0.371, + "losses/dpo": 0.4458756148815155, + "losses/sft": 0.2844783663749695, + "losses/total": 0.4458756148815155, + "ref_logps/chosen": -17.103076934814453, + "ref_logps/rejected": -54.47646713256836, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.035868849605321884, + "rewards/margins": 0.8905590176582336, + "rewards/rejected": -0.8546901941299438, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 37.900611877441406, + "learning_rate": 2.3666666666666664e-07, + "logps/chosen": -15.185718536376953, + "logps/rejected": -57.733131408691406, + "loss": 0.3906, + "losses/dpo": 0.36287635564804077, + "losses/sft": 0.26010823249816895, + "losses/total": 0.36287635564804077, + "ref_logps/chosen": -15.726322174072266, + "ref_logps/rejected": -49.109825134277344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05406036227941513, + "rewards/margins": 0.9163906574249268, + "rewards/rejected": -0.8623303174972534, + "step": 71 + }, + { + "epoch": 0.14, + "grad_norm": 29.51009750366211, + "learning_rate": 2.4e-07, + "logps/chosen": -10.109313011169434, + "logps/rejected": -70.4901351928711, + "loss": 0.2932, + "losses/dpo": 0.3870830535888672, + "losses/sft": 0.26137688755989075, + "losses/total": 0.3870830535888672, + "ref_logps/chosen": -10.630828857421875, + "ref_logps/rejected": -57.545108795166016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05215153098106384, + "rewards/margins": 1.3466542959213257, + "rewards/rejected": -1.2945027351379395, + "step": 72 + }, + { + "epoch": 0.15, + "grad_norm": 35.36587905883789, + "learning_rate": 2.433333333333333e-07, + "logps/chosen": -14.218860626220703, + "logps/rejected": -51.722843170166016, + "loss": 0.2982, + "losses/dpo": 0.25439390540122986, + "losses/sft": 0.30609095096588135, + "losses/total": 0.25439390540122986, + "ref_logps/chosen": -14.53840446472168, + "ref_logps/rejected": -40.36719512939453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0319543182849884, + "rewards/margins": 1.1675193309783936, + "rewards/rejected": -1.1355650424957275, + "step": 73 + }, + { + "epoch": 0.15, + "grad_norm": 34.00502395629883, + "learning_rate": 2.4666666666666665e-07, + "logps/chosen": -14.362863540649414, + "logps/rejected": -57.53142166137695, + "loss": 0.2736, + "losses/dpo": 0.281907856464386, + "losses/sft": 0.2517828643321991, + "losses/total": 0.281907856464386, + "ref_logps/chosen": -14.883176803588867, + "ref_logps/rejected": -44.662784576416016, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052031371742486954, + "rewards/margins": 1.3388949632644653, + "rewards/rejected": -1.2868635654449463, + "step": 74 + }, + { + "epoch": 0.15, + "grad_norm": 33.48396301269531, + "learning_rate": 2.5e-07, + "logps/chosen": -15.098970413208008, + "logps/rejected": -64.7098159790039, + "loss": 0.2486, + "losses/dpo": 0.24968230724334717, + "losses/sft": 0.2866656184196472, + "losses/total": 0.24968230724334717, + "ref_logps/chosen": -15.477853775024414, + "ref_logps/rejected": -50.06144714355469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03788831830024719, + "rewards/margins": 1.50272536277771, + "rewards/rejected": -1.4648370742797852, + "step": 75 + }, + { + "epoch": 0.15, + "grad_norm": 37.191131591796875, + "learning_rate": 2.533333333333333e-07, + "logps/chosen": -14.241877555847168, + "logps/rejected": -69.87599182128906, + "loss": 0.2465, + "losses/dpo": 0.23917606472969055, + "losses/sft": 0.27395009994506836, + "losses/total": 0.23917606472969055, + "ref_logps/chosen": -12.860979080200195, + "ref_logps/rejected": -53.578670501708984, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13808982074260712, + "rewards/margins": 1.4916424751281738, + "rewards/rejected": -1.629732370376587, + "step": 76 + }, + { + "epoch": 0.15, + "grad_norm": 26.611677169799805, + "learning_rate": 2.5666666666666666e-07, + "logps/chosen": -14.91260814666748, + "logps/rejected": -80.22364044189453, + "loss": 0.2541, + "losses/dpo": 0.3971107602119446, + "losses/sft": 0.3323269486427307, + "losses/total": 0.3971107602119446, + "ref_logps/chosen": -13.36182975769043, + "ref_logps/rejected": -58.73191833496094, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15507787466049194, + "rewards/margins": 1.994094729423523, + "rewards/rejected": -2.149172782897949, + "step": 77 + }, + { + "epoch": 0.16, + "grad_norm": 28.33013343811035, + "learning_rate": 2.6e-07, + "logps/chosen": -15.90027141571045, + "logps/rejected": -66.85693359375, + "loss": 0.2514, + "losses/dpo": 0.18611454963684082, + "losses/sft": 0.28300029039382935, + "losses/total": 0.18611454963684082, + "ref_logps/chosen": -13.217735290527344, + "ref_logps/rejected": -49.22061538696289, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26825347542762756, + "rewards/margins": 1.495378851890564, + "rewards/rejected": -1.7636322975158691, + "step": 78 + }, + { + "epoch": 0.16, + "grad_norm": 37.32158279418945, + "learning_rate": 2.633333333333333e-07, + "logps/chosen": -15.713750839233398, + "logps/rejected": -53.1090087890625, + "loss": 0.3564, + "losses/dpo": 0.5421111583709717, + "losses/sft": 0.3495892286300659, + "losses/total": 0.5421111583709717, + "ref_logps/chosen": -13.422369003295898, + "ref_logps/rejected": -37.21599578857422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.2291383147239685, + "rewards/margins": 1.3601632118225098, + "rewards/rejected": -1.589301586151123, + "step": 79 + }, + { + "epoch": 0.16, + "grad_norm": 33.47981643676758, + "learning_rate": 2.6666666666666667e-07, + "logps/chosen": -16.27362060546875, + "logps/rejected": -57.29899978637695, + "loss": 0.2627, + "losses/dpo": 0.19374717772006989, + "losses/sft": 0.32004982233047485, + "losses/total": 0.19374717772006989, + "ref_logps/chosen": -13.620620727539062, + "ref_logps/rejected": -38.440185546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.26529988646507263, + "rewards/margins": 1.620581865310669, + "rewards/rejected": -1.8858816623687744, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 35.3527946472168, + "learning_rate": 2.7e-07, + "logps/chosen": -20.024314880371094, + "logps/rejected": -85.74209594726562, + "loss": 0.2311, + "losses/dpo": 0.19971030950546265, + "losses/sft": 0.2853155732154846, + "losses/total": 0.19971030950546265, + "ref_logps/chosen": -18.787288665771484, + "ref_logps/rejected": -65.08699035644531, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12370243668556213, + "rewards/margins": 1.9418089389801025, + "rewards/rejected": -2.0655112266540527, + "step": 81 + }, + { + "epoch": 0.16, + "grad_norm": 28.21786117553711, + "learning_rate": 2.733333333333333e-07, + "logps/chosen": -16.94407081604004, + "logps/rejected": -79.70083618164062, + "loss": 0.2106, + "losses/dpo": 0.22653785347938538, + "losses/sft": 0.31818974018096924, + "losses/total": 0.22653785347938538, + "ref_logps/chosen": -15.551984786987305, + "ref_logps/rejected": -55.47174072265625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13920865952968597, + "rewards/margins": 2.283700704574585, + "rewards/rejected": -2.4229092597961426, + "step": 82 + }, + { + "epoch": 0.17, + "grad_norm": 23.01938247680664, + "learning_rate": 2.766666666666667e-07, + "logps/chosen": -15.447221755981445, + "logps/rejected": -83.06011199951172, + "loss": 0.1544, + "losses/dpo": 0.13929356634616852, + "losses/sft": 0.33886247873306274, + "losses/total": 0.13929356634616852, + "ref_logps/chosen": -12.357421875, + "ref_logps/rejected": -56.69266891479492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3089800477027893, + "rewards/margins": 2.3277645111083984, + "rewards/rejected": -2.636744499206543, + "step": 83 + }, + { + "epoch": 0.17, + "grad_norm": 30.944183349609375, + "learning_rate": 2.8e-07, + "logps/chosen": -19.864492416381836, + "logps/rejected": -59.14112091064453, + "loss": 0.2418, + "losses/dpo": 0.33125755190849304, + "losses/sft": 0.23029811680316925, + "losses/total": 0.33125755190849304, + "ref_logps/chosen": -16.55842399597168, + "ref_logps/rejected": -39.68665313720703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3306068181991577, + "rewards/margins": 1.6148401498794556, + "rewards/rejected": -1.9454468488693237, + "step": 84 + }, + { + "epoch": 0.17, + "grad_norm": 26.384204864501953, + "learning_rate": 2.833333333333333e-07, + "logps/chosen": -16.929367065429688, + "logps/rejected": -87.05389404296875, + "loss": 0.1536, + "losses/dpo": 0.18227216601371765, + "losses/sft": 0.3527926206588745, + "losses/total": 0.18227216601371765, + "ref_logps/chosen": -14.183502197265625, + "ref_logps/rejected": -56.431129455566406, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27458658814430237, + "rewards/margins": 2.787689685821533, + "rewards/rejected": -3.0622763633728027, + "step": 85 + }, + { + "epoch": 0.17, + "grad_norm": 38.91211700439453, + "learning_rate": 2.866666666666667e-07, + "logps/chosen": -17.564605712890625, + "logps/rejected": -52.08045196533203, + "loss": 0.2764, + "losses/dpo": 0.1875544637441635, + "losses/sft": 0.3149993121623993, + "losses/total": 0.1875544637441635, + "ref_logps/chosen": -13.960559844970703, + "ref_logps/rejected": -32.2154426574707, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3604046702384949, + "rewards/margins": 1.626096487045288, + "rewards/rejected": -1.9865012168884277, + "step": 86 + }, + { + "epoch": 0.17, + "grad_norm": 54.64894485473633, + "learning_rate": 2.9e-07, + "logps/chosen": -24.219192504882812, + "logps/rejected": -59.99394989013672, + "loss": 0.3631, + "losses/dpo": 0.3719308376312256, + "losses/sft": 0.49110162258148193, + "losses/total": 0.3719308376312256, + "ref_logps/chosen": -15.149127960205078, + "ref_logps/rejected": -34.75643539428711, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9070063829421997, + "rewards/margins": 1.6167452335357666, + "rewards/rejected": -2.523751735687256, + "step": 87 + }, + { + "epoch": 0.18, + "grad_norm": 67.03410339355469, + "learning_rate": 2.933333333333333e-07, + "logps/chosen": -15.183414459228516, + "logps/rejected": -64.70655059814453, + "loss": 0.3215, + "losses/dpo": 0.4381785988807678, + "losses/sft": 0.40025150775909424, + "losses/total": 0.4381785988807678, + "ref_logps/chosen": -11.338159561157227, + "ref_logps/rejected": -38.85139083862305, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3845253884792328, + "rewards/margins": 2.200990676879883, + "rewards/rejected": -2.5855159759521484, + "step": 88 + }, + { + "epoch": 0.18, + "grad_norm": 37.209999084472656, + "learning_rate": 2.966666666666667e-07, + "logps/chosen": -20.264392852783203, + "logps/rejected": -64.114013671875, + "loss": 0.304, + "losses/dpo": 0.20819611847400665, + "losses/sft": 0.3846052885055542, + "losses/total": 0.20819611847400665, + "ref_logps/chosen": -13.015954971313477, + "ref_logps/rejected": -37.71721649169922, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7248438596725464, + "rewards/margins": 1.9148359298706055, + "rewards/rejected": -2.6396799087524414, + "step": 89 + }, + { + "epoch": 0.18, + "grad_norm": 28.591053009033203, + "learning_rate": 3e-07, + "logps/chosen": -25.06616973876953, + "logps/rejected": -81.93843841552734, + "loss": 0.2055, + "losses/dpo": 0.31712400913238525, + "losses/sft": 0.31778547167778015, + "losses/total": 0.31712400913238525, + "ref_logps/chosen": -21.52896499633789, + "ref_logps/rejected": -53.268577575683594, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3537205457687378, + "rewards/margins": 2.51326584815979, + "rewards/rejected": -2.8669862747192383, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 38.66221618652344, + "learning_rate": 3.033333333333333e-07, + "logps/chosen": -24.031814575195312, + "logps/rejected": -61.21632385253906, + "loss": 0.246, + "losses/dpo": 0.18292659521102905, + "losses/sft": 0.4320409297943115, + "losses/total": 0.18292659521102905, + "ref_logps/chosen": -17.206039428710938, + "ref_logps/rejected": -38.700233459472656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.68257737159729, + "rewards/margins": 1.5690321922302246, + "rewards/rejected": -2.2516093254089355, + "step": 91 + }, + { + "epoch": 0.18, + "grad_norm": 39.619354248046875, + "learning_rate": 3.066666666666666e-07, + "logps/chosen": -19.91756820678711, + "logps/rejected": -62.49524688720703, + "loss": 0.2167, + "losses/dpo": 0.22770985960960388, + "losses/sft": 0.4464987814426422, + "losses/total": 0.22770985960960388, + "ref_logps/chosen": -12.815778732299805, + "ref_logps/rejected": -34.96184539794922, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7101789712905884, + "rewards/margins": 2.043161392211914, + "rewards/rejected": -2.753340482711792, + "step": 92 + }, + { + "epoch": 0.19, + "grad_norm": 36.227333068847656, + "learning_rate": 3.1e-07, + "logps/chosen": -25.785503387451172, + "logps/rejected": -64.98928833007812, + "loss": 0.2481, + "losses/dpo": 0.220164492726326, + "losses/sft": 0.3677349090576172, + "losses/total": 0.220164492726326, + "ref_logps/chosen": -21.26690673828125, + "ref_logps/rejected": -37.80683898925781, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45185956358909607, + "rewards/margins": 2.266386032104492, + "rewards/rejected": -2.7182457447052, + "step": 93 + }, + { + "epoch": 0.19, + "grad_norm": 37.962623596191406, + "learning_rate": 3.1333333333333333e-07, + "logps/chosen": -24.163372039794922, + "logps/rejected": -66.34964752197266, + "loss": 0.2241, + "losses/dpo": 0.310378760099411, + "losses/sft": 0.4730556607246399, + "losses/total": 0.310378760099411, + "ref_logps/chosen": -14.319221496582031, + "ref_logps/rejected": -38.69602966308594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9844151139259338, + "rewards/margins": 1.7809470891952515, + "rewards/rejected": -2.76536226272583, + "step": 94 + }, + { + "epoch": 0.19, + "grad_norm": 18.532638549804688, + "learning_rate": 3.166666666666666e-07, + "logps/chosen": -20.102611541748047, + "logps/rejected": -83.15855407714844, + "loss": 0.1204, + "losses/dpo": 0.039735302329063416, + "losses/sft": 0.36929309368133545, + "losses/total": 0.039735302329063416, + "ref_logps/chosen": -12.35050106048584, + "ref_logps/rejected": -45.502262115478516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.775210976600647, + "rewards/margins": 2.9904184341430664, + "rewards/rejected": -3.765629291534424, + "step": 95 + }, + { + "epoch": 0.19, + "grad_norm": 47.28300476074219, + "learning_rate": 3.2e-07, + "logps/chosen": -30.591978073120117, + "logps/rejected": -79.0492172241211, + "loss": 0.2275, + "losses/dpo": 0.14131276309490204, + "losses/sft": 0.34366729855537415, + "losses/total": 0.14131276309490204, + "ref_logps/chosen": -20.450027465820312, + "ref_logps/rejected": -51.264705657958984, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0141950845718384, + "rewards/margins": 1.7642561197280884, + "rewards/rejected": -2.7784509658813477, + "step": 96 + }, + { + "epoch": 0.19, + "grad_norm": 22.053281784057617, + "learning_rate": 3.233333333333333e-07, + "logps/chosen": -23.116226196289062, + "logps/rejected": -105.54429626464844, + "loss": 0.0763, + "losses/dpo": 0.08986547589302063, + "losses/sft": 0.47947466373443604, + "losses/total": 0.08986547589302063, + "ref_logps/chosen": -15.21923542022705, + "ref_logps/rejected": -61.29276657104492, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7896990180015564, + "rewards/margins": 3.6354546546936035, + "rewards/rejected": -4.425153732299805, + "step": 97 + }, + { + "epoch": 0.2, + "grad_norm": 28.593852996826172, + "learning_rate": 3.2666666666666663e-07, + "logps/chosen": -17.118698120117188, + "logps/rejected": -88.55982971191406, + "loss": 0.1363, + "losses/dpo": 0.07273420691490173, + "losses/sft": 0.4799639582633972, + "losses/total": 0.07273420691490173, + "ref_logps/chosen": -12.235620498657227, + "ref_logps/rejected": -49.14242935180664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4883077144622803, + "rewards/margins": 3.453432559967041, + "rewards/rejected": -3.9417405128479004, + "step": 98 + }, + { + "epoch": 0.2, + "grad_norm": 28.878681182861328, + "learning_rate": 3.3e-07, + "logps/chosen": -24.96469497680664, + "logps/rejected": -89.02628326416016, + "loss": 0.1801, + "losses/dpo": 0.21059244871139526, + "losses/sft": 0.4723120331764221, + "losses/total": 0.21059244871139526, + "ref_logps/chosen": -15.973878860473633, + "ref_logps/rejected": -47.815399169921875, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8990815877914429, + "rewards/margins": 3.2220067977905273, + "rewards/rejected": -4.12108850479126, + "step": 99 + }, + { + "epoch": 0.2, + "grad_norm": 29.858488082885742, + "learning_rate": 3.333333333333333e-07, + "logps/chosen": -23.120346069335938, + "logps/rejected": -78.31523895263672, + "loss": 0.1001, + "losses/dpo": 0.2422274351119995, + "losses/sft": 0.527462899684906, + "losses/total": 0.2422274351119995, + "ref_logps/chosen": -14.651782035827637, + "ref_logps/rejected": -38.066070556640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8468563556671143, + "rewards/margins": 3.178060531616211, + "rewards/rejected": -4.024916648864746, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 40.085304260253906, + "learning_rate": 3.3666666666666664e-07, + "logps/chosen": -17.66061019897461, + "logps/rejected": -55.92891311645508, + "loss": 0.3245, + "losses/dpo": 0.38317328691482544, + "losses/sft": 0.43344035744667053, + "losses/total": 0.38317328691482544, + "ref_logps/chosen": -11.103376388549805, + "ref_logps/rejected": -33.681671142578125, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.655723512172699, + "rewards/margins": 1.5690006017684937, + "rewards/rejected": -2.224724054336548, + "step": 101 + }, + { + "epoch": 0.2, + "grad_norm": 37.162322998046875, + "learning_rate": 3.4000000000000003e-07, + "logps/chosen": -20.915874481201172, + "logps/rejected": -70.4129638671875, + "loss": 0.2303, + "losses/dpo": 0.40623629093170166, + "losses/sft": 0.2984255254268646, + "losses/total": 0.40623629093170166, + "ref_logps/chosen": -13.981635093688965, + "ref_logps/rejected": -39.540313720703125, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6934242248535156, + "rewards/margins": 2.393840789794922, + "rewards/rejected": -3.0872647762298584, + "step": 102 + }, + { + "epoch": 0.21, + "grad_norm": 52.18952178955078, + "learning_rate": 3.433333333333333e-07, + "logps/chosen": -23.943897247314453, + "logps/rejected": -86.97280883789062, + "loss": 0.2605, + "losses/dpo": 0.36269426345825195, + "losses/sft": 0.4030131697654724, + "losses/total": 0.36269426345825195, + "ref_logps/chosen": -14.55135726928711, + "ref_logps/rejected": -48.15530776977539, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9392539262771606, + "rewards/margins": 2.9424962997436523, + "rewards/rejected": -3.8817505836486816, + "step": 103 + }, + { + "epoch": 0.21, + "grad_norm": 27.642332077026367, + "learning_rate": 3.4666666666666665e-07, + "logps/chosen": -25.413158416748047, + "logps/rejected": -79.21366119384766, + "loss": 0.0897, + "losses/dpo": 0.05271019786596298, + "losses/sft": 0.547292172908783, + "losses/total": 0.05271019786596298, + "ref_logps/chosen": -17.499645233154297, + "ref_logps/rejected": -39.83184051513672, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7913513779640198, + "rewards/margins": 3.1468305587768555, + "rewards/rejected": -3.9381821155548096, + "step": 104 + }, + { + "epoch": 0.21, + "grad_norm": 26.87654685974121, + "learning_rate": 3.5e-07, + "logps/chosen": -20.214862823486328, + "logps/rejected": -87.76766967773438, + "loss": 0.1645, + "losses/dpo": 0.20824576914310455, + "losses/sft": 0.33288073539733887, + "losses/total": 0.20824576914310455, + "ref_logps/chosen": -11.795272827148438, + "ref_logps/rejected": -48.254093170166016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8419591188430786, + "rewards/margins": 3.10939884185791, + "rewards/rejected": -3.951357841491699, + "step": 105 + }, + { + "epoch": 0.21, + "grad_norm": 64.75567626953125, + "learning_rate": 3.533333333333333e-07, + "logps/chosen": -30.353635787963867, + "logps/rejected": -72.8525390625, + "loss": 0.3244, + "losses/dpo": 0.3965432345867157, + "losses/sft": 0.5763383507728577, + "losses/total": 0.3965432345867157, + "ref_logps/chosen": -15.887895584106445, + "ref_logps/rejected": -36.173828125, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.4465739727020264, + "rewards/margins": 2.221297025680542, + "rewards/rejected": -3.6678709983825684, + "step": 106 + }, + { + "epoch": 0.21, + "grad_norm": 70.32903289794922, + "learning_rate": 3.5666666666666666e-07, + "logps/chosen": -27.496435165405273, + "logps/rejected": -96.47198486328125, + "loss": 0.1491, + "losses/dpo": 0.14805351197719574, + "losses/sft": 0.48672235012054443, + "losses/total": 0.14805351197719574, + "ref_logps/chosen": -14.464235305786133, + "ref_logps/rejected": -50.544700622558594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3032200336456299, + "rewards/margins": 3.2895092964172363, + "rewards/rejected": -4.592729091644287, + "step": 107 + }, + { + "epoch": 0.22, + "grad_norm": 19.775211334228516, + "learning_rate": 3.6e-07, + "logps/chosen": -26.703758239746094, + "logps/rejected": -95.46380615234375, + "loss": 0.0968, + "losses/dpo": 0.046378664672374725, + "losses/sft": 0.40180331468582153, + "losses/total": 0.046378664672374725, + "ref_logps/chosen": -15.243803024291992, + "ref_logps/rejected": -47.9715576171875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1459956169128418, + "rewards/margins": 3.60322904586792, + "rewards/rejected": -4.749224662780762, + "step": 108 + }, + { + "epoch": 0.22, + "grad_norm": 34.58012390136719, + "learning_rate": 3.6333333333333333e-07, + "logps/chosen": -25.720474243164062, + "logps/rejected": -101.4200439453125, + "loss": 0.1469, + "losses/dpo": 0.16447117924690247, + "losses/sft": 0.49221622943878174, + "losses/total": 0.16447117924690247, + "ref_logps/chosen": -16.14147186279297, + "ref_logps/rejected": -54.09002685546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9579001665115356, + "rewards/margins": 3.7751007080078125, + "rewards/rejected": -4.733000755310059, + "step": 109 + }, + { + "epoch": 0.22, + "grad_norm": 40.38216018676758, + "learning_rate": 3.666666666666666e-07, + "logps/chosen": -26.010862350463867, + "logps/rejected": -78.31238555908203, + "loss": 0.2122, + "losses/dpo": 0.38288456201553345, + "losses/sft": 0.5134553909301758, + "losses/total": 0.38288456201553345, + "ref_logps/chosen": -12.394744873046875, + "ref_logps/rejected": -39.01166915893555, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3616118431091309, + "rewards/margins": 2.568459987640381, + "rewards/rejected": -3.9300713539123535, + "step": 110 + }, + { + "epoch": 0.22, + "grad_norm": 34.98234939575195, + "learning_rate": 3.7e-07, + "logps/chosen": -33.29746627807617, + "logps/rejected": -82.07467651367188, + "loss": 0.1525, + "losses/dpo": 0.1822669804096222, + "losses/sft": 0.49815496802330017, + "losses/total": 0.1822669804096222, + "ref_logps/chosen": -21.91498374938965, + "ref_logps/rejected": -41.01658248901367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1382479667663574, + "rewards/margins": 2.9675610065460205, + "rewards/rejected": -4.105809211730957, + "step": 111 + }, + { + "epoch": 0.22, + "grad_norm": 33.530418395996094, + "learning_rate": 3.7333333333333334e-07, + "logps/chosen": -23.67328643798828, + "logps/rejected": -92.13876342773438, + "loss": 0.0961, + "losses/dpo": 0.008127570152282715, + "losses/sft": 0.21549856662750244, + "losses/total": 0.008127570152282715, + "ref_logps/chosen": -13.349861145019531, + "ref_logps/rejected": -43.54875946044922, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0323424339294434, + "rewards/margins": 3.8266587257385254, + "rewards/rejected": -4.859001159667969, + "step": 112 + }, + { + "epoch": 0.23, + "grad_norm": 38.63556671142578, + "learning_rate": 3.766666666666666e-07, + "logps/chosen": -30.415813446044922, + "logps/rejected": -84.25430297851562, + "loss": 0.1936, + "losses/dpo": 0.12123291194438934, + "losses/sft": 0.20419417321681976, + "losses/total": 0.12123291194438934, + "ref_logps/chosen": -19.41822052001953, + "ref_logps/rejected": -45.32547378540039, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0997594594955444, + "rewards/margins": 2.793123483657837, + "rewards/rejected": -3.892883062362671, + "step": 113 + }, + { + "epoch": 0.23, + "grad_norm": 23.571645736694336, + "learning_rate": 3.7999999999999996e-07, + "logps/chosen": -26.51279067993164, + "logps/rejected": -94.08573913574219, + "loss": 0.1537, + "losses/dpo": 0.14313971996307373, + "losses/sft": 0.36151280999183655, + "losses/total": 0.14313971996307373, + "ref_logps/chosen": -18.325164794921875, + "ref_logps/rejected": -49.47245788574219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8187628984451294, + "rewards/margins": 3.6425652503967285, + "rewards/rejected": -4.461328029632568, + "step": 114 + }, + { + "epoch": 0.23, + "grad_norm": 23.784942626953125, + "learning_rate": 3.8333333333333335e-07, + "logps/chosen": -28.95541763305664, + "logps/rejected": -91.38735961914062, + "loss": 0.0878, + "losses/dpo": 0.0992375910282135, + "losses/sft": 0.2555049955844879, + "losses/total": 0.0992375910282135, + "ref_logps/chosen": -16.020353317260742, + "ref_logps/rejected": -43.40199279785156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2935062646865845, + "rewards/margins": 3.505030632019043, + "rewards/rejected": -4.798537254333496, + "step": 115 + }, + { + "epoch": 0.23, + "grad_norm": 67.53050994873047, + "learning_rate": 3.8666666666666664e-07, + "logps/chosen": -25.763362884521484, + "logps/rejected": -83.27964782714844, + "loss": 0.2907, + "losses/dpo": 0.1631295382976532, + "losses/sft": 0.5447544455528259, + "losses/total": 0.1631295382976532, + "ref_logps/chosen": -11.275564193725586, + "ref_logps/rejected": -42.068965911865234, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.448779582977295, + "rewards/margins": 2.672288179397583, + "rewards/rejected": -4.121068000793457, + "step": 116 + }, + { + "epoch": 0.23, + "grad_norm": 35.78622055053711, + "learning_rate": 3.8999999999999997e-07, + "logps/chosen": -24.083629608154297, + "logps/rejected": -84.81612396240234, + "loss": 0.1676, + "losses/dpo": 0.24396544694900513, + "losses/sft": 0.6446655988693237, + "losses/total": 0.24396544694900513, + "ref_logps/chosen": -12.8930025100708, + "ref_logps/rejected": -37.470420837402344, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1190627813339233, + "rewards/margins": 3.615507125854492, + "rewards/rejected": -4.734570503234863, + "step": 117 + }, + { + "epoch": 0.24, + "grad_norm": 24.79482650756836, + "learning_rate": 3.933333333333333e-07, + "logps/chosen": -24.10096549987793, + "logps/rejected": -91.7545394897461, + "loss": 0.1411, + "losses/dpo": 0.07791762053966522, + "losses/sft": 0.5899461507797241, + "losses/total": 0.07791762053966522, + "ref_logps/chosen": -13.302517890930176, + "ref_logps/rejected": -47.74324035644531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0798447132110596, + "rewards/margins": 3.3212857246398926, + "rewards/rejected": -4.401130199432373, + "step": 118 + }, + { + "epoch": 0.24, + "grad_norm": 35.7785530090332, + "learning_rate": 3.9666666666666665e-07, + "logps/chosen": -31.792556762695312, + "logps/rejected": -106.40116882324219, + "loss": 0.1401, + "losses/dpo": 0.4013972878456116, + "losses/sft": 0.6858751177787781, + "losses/total": 0.4013972878456116, + "ref_logps/chosen": -13.898950576782227, + "ref_logps/rejected": -49.06494140625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.789360523223877, + "rewards/margins": 3.9442622661590576, + "rewards/rejected": -5.7336225509643555, + "step": 119 + }, + { + "epoch": 0.24, + "grad_norm": 53.136749267578125, + "learning_rate": 4e-07, + "logps/chosen": -29.701318740844727, + "logps/rejected": -108.02490997314453, + "loss": 0.2095, + "losses/dpo": 0.6063670516014099, + "losses/sft": 0.5956507325172424, + "losses/total": 0.6063670516014099, + "ref_logps/chosen": -10.965313911437988, + "ref_logps/rejected": -52.60283279418945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8736006021499634, + "rewards/margins": 3.668607711791992, + "rewards/rejected": -5.542208194732666, + "step": 120 + }, + { + "epoch": 0.24, + "grad_norm": 32.23996353149414, + "learning_rate": 4.033333333333333e-07, + "logps/chosen": -33.582359313964844, + "logps/rejected": -113.1934814453125, + "loss": 0.1153, + "losses/dpo": 0.05961308628320694, + "losses/sft": 0.45472973585128784, + "losses/total": 0.05961308628320694, + "ref_logps/chosen": -18.625728607177734, + "ref_logps/rejected": -56.13653564453125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4956634044647217, + "rewards/margins": 4.210031509399414, + "rewards/rejected": -5.705695152282715, + "step": 121 + }, + { + "epoch": 0.24, + "grad_norm": 55.00533676147461, + "learning_rate": 4.0666666666666666e-07, + "logps/chosen": -26.204700469970703, + "logps/rejected": -103.64109802246094, + "loss": 0.1689, + "losses/dpo": 0.274069219827652, + "losses/sft": 0.598950207233429, + "losses/total": 0.274069219827652, + "ref_logps/chosen": -10.38711929321289, + "ref_logps/rejected": -48.82931137084961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5817580223083496, + "rewards/margins": 3.8994200229644775, + "rewards/rejected": -5.481178283691406, + "step": 122 + }, + { + "epoch": 0.25, + "grad_norm": 28.600406646728516, + "learning_rate": 4.0999999999999994e-07, + "logps/chosen": -27.372495651245117, + "logps/rejected": -88.63994598388672, + "loss": 0.1469, + "losses/dpo": 0.06027643382549286, + "losses/sft": 0.5721110105514526, + "losses/total": 0.06027643382549286, + "ref_logps/chosen": -10.918659210205078, + "ref_logps/rejected": -36.75074005126953, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.645383596420288, + "rewards/margins": 3.54353666305542, + "rewards/rejected": -5.188920497894287, + "step": 123 + }, + { + "epoch": 0.25, + "grad_norm": 80.9706802368164, + "learning_rate": 4.1333333333333333e-07, + "logps/chosen": -35.911651611328125, + "logps/rejected": -91.64453125, + "loss": 0.3585, + "losses/dpo": 0.6168155074119568, + "losses/sft": 0.8081568479537964, + "losses/total": 0.6168155074119568, + "ref_logps/chosen": -12.966390609741211, + "ref_logps/rejected": -45.097755432128906, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2945261001586914, + "rewards/margins": 2.3601508140563965, + "rewards/rejected": -4.654676914215088, + "step": 124 + }, + { + "epoch": 0.25, + "grad_norm": 58.33188247680664, + "learning_rate": 4.1666666666666667e-07, + "logps/chosen": -33.37420654296875, + "logps/rejected": -100.40385437011719, + "loss": 0.2055, + "losses/dpo": 0.35650166869163513, + "losses/sft": 0.651277482509613, + "losses/total": 0.35650166869163513, + "ref_logps/chosen": -14.553751945495605, + "ref_logps/rejected": -42.69612503051758, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8820456266403198, + "rewards/margins": 3.8887269496917725, + "rewards/rejected": -5.770772933959961, + "step": 125 + }, + { + "epoch": 0.25, + "grad_norm": 63.172340393066406, + "learning_rate": 4.1999999999999995e-07, + "logps/chosen": -34.846107482910156, + "logps/rejected": -123.60401153564453, + "loss": 0.2306, + "losses/dpo": 0.06216158717870712, + "losses/sft": 0.736182451248169, + "losses/total": 0.06216158717870712, + "ref_logps/chosen": -16.453075408935547, + "ref_logps/rejected": -64.68106842041016, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8393032550811768, + "rewards/margins": 4.052990913391113, + "rewards/rejected": -5.892294406890869, + "step": 126 + }, + { + "epoch": 0.25, + "grad_norm": 31.807851791381836, + "learning_rate": 4.2333333333333334e-07, + "logps/chosen": -30.268192291259766, + "logps/rejected": -88.67607116699219, + "loss": 0.1248, + "losses/dpo": 0.24264563620090485, + "losses/sft": 0.5180367827415466, + "losses/total": 0.24264563620090485, + "ref_logps/chosen": -17.16900062561035, + "ref_logps/rejected": -38.45530319213867, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3099192380905151, + "rewards/margins": 3.712158203125, + "rewards/rejected": -5.022077560424805, + "step": 127 + }, + { + "epoch": 0.26, + "grad_norm": 24.525039672851562, + "learning_rate": 4.266666666666667e-07, + "logps/chosen": -39.662452697753906, + "logps/rejected": -110.11405944824219, + "loss": 0.0984, + "losses/dpo": 0.17816494405269623, + "losses/sft": 0.6083031296730042, + "losses/total": 0.17816494405269623, + "ref_logps/chosen": -17.88237762451172, + "ref_logps/rejected": -45.97468566894531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1780076026916504, + "rewards/margins": 4.235930442810059, + "rewards/rejected": -6.413937568664551, + "step": 128 + }, + { + "epoch": 0.26, + "grad_norm": 36.77156448364258, + "learning_rate": 4.2999999999999996e-07, + "logps/chosen": -33.76113510131836, + "logps/rejected": -85.31007385253906, + "loss": 0.1854, + "losses/dpo": 0.09161588549613953, + "losses/sft": 0.4901028275489807, + "losses/total": 0.09161588549613953, + "ref_logps/chosen": -19.187625885009766, + "ref_logps/rejected": -41.31425857543945, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4573508501052856, + "rewards/margins": 2.942230463027954, + "rewards/rejected": -4.399581432342529, + "step": 129 + }, + { + "epoch": 0.26, + "grad_norm": 14.954230308532715, + "learning_rate": 4.3333333333333335e-07, + "logps/chosen": -26.31670379638672, + "logps/rejected": -95.92216491699219, + "loss": 0.0717, + "losses/dpo": 0.22551429271697998, + "losses/sft": 0.5335481762886047, + "losses/total": 0.22551429271697998, + "ref_logps/chosen": -14.930686950683594, + "ref_logps/rejected": -44.4586181640625, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1386016607284546, + "rewards/margins": 4.007754325866699, + "rewards/rejected": -5.146356105804443, + "step": 130 + }, + { + "epoch": 0.26, + "grad_norm": 18.751760482788086, + "learning_rate": 4.3666666666666663e-07, + "logps/chosen": -27.520706176757812, + "logps/rejected": -87.16500854492188, + "loss": 0.0965, + "losses/dpo": 0.062105268239974976, + "losses/sft": 0.48621320724487305, + "losses/total": 0.062105268239974976, + "ref_logps/chosen": -12.407279968261719, + "ref_logps/rejected": -39.634822845458984, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5113425254821777, + "rewards/margins": 3.241675853729248, + "rewards/rejected": -4.753018379211426, + "step": 131 + }, + { + "epoch": 0.26, + "grad_norm": 43.365806579589844, + "learning_rate": 4.3999999999999997e-07, + "logps/chosen": -30.080524444580078, + "logps/rejected": -100.35038757324219, + "loss": 0.1902, + "losses/dpo": 0.5356749296188354, + "losses/sft": 0.5842803716659546, + "losses/total": 0.5356749296188354, + "ref_logps/chosen": -12.293049812316895, + "ref_logps/rejected": -47.880836486816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.778747320175171, + "rewards/margins": 3.468207836151123, + "rewards/rejected": -5.246955394744873, + "step": 132 + }, + { + "epoch": 0.27, + "grad_norm": 27.982099533081055, + "learning_rate": 4.4333333333333336e-07, + "logps/chosen": -31.108388900756836, + "logps/rejected": -93.52197265625, + "loss": 0.0911, + "losses/dpo": 0.08737071603536606, + "losses/sft": 0.6428839564323425, + "losses/total": 0.08737071603536606, + "ref_logps/chosen": -15.797719955444336, + "ref_logps/rejected": -40.723243713378906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.531067132949829, + "rewards/margins": 3.7488057613372803, + "rewards/rejected": -5.279872417449951, + "step": 133 + }, + { + "epoch": 0.27, + "grad_norm": 40.31496047973633, + "learning_rate": 4.4666666666666664e-07, + "logps/chosen": -25.789161682128906, + "logps/rejected": -110.07024383544922, + "loss": 0.124, + "losses/dpo": 0.4045230448246002, + "losses/sft": 0.4008540213108063, + "losses/total": 0.4045230448246002, + "ref_logps/chosen": -15.012077331542969, + "ref_logps/rejected": -47.305419921875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.07770836353302, + "rewards/margins": 5.1987738609313965, + "rewards/rejected": -6.276482105255127, + "step": 134 + }, + { + "epoch": 0.27, + "grad_norm": 46.73427200317383, + "learning_rate": 4.5e-07, + "logps/chosen": -33.74063491821289, + "logps/rejected": -86.67448425292969, + "loss": 0.1774, + "losses/dpo": 0.23459814488887787, + "losses/sft": 0.5259028673171997, + "losses/total": 0.23459814488887787, + "ref_logps/chosen": -15.16489028930664, + "ref_logps/rejected": -37.2421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.857574462890625, + "rewards/margins": 3.085655689239502, + "rewards/rejected": -4.943229675292969, + "step": 135 + }, + { + "epoch": 0.27, + "grad_norm": 65.31625366210938, + "learning_rate": 4.5333333333333326e-07, + "logps/chosen": -23.63500213623047, + "logps/rejected": -81.20355224609375, + "loss": 0.134, + "losses/dpo": 0.3590734004974365, + "losses/sft": 0.5037756562232971, + "losses/total": 0.3590734004974365, + "ref_logps/chosen": -11.519001960754395, + "ref_logps/rejected": -35.56211853027344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2116000652313232, + "rewards/margins": 3.352543354034424, + "rewards/rejected": -4.564143180847168, + "step": 136 + }, + { + "epoch": 0.27, + "grad_norm": 26.6831111907959, + "learning_rate": 4.5666666666666665e-07, + "logps/chosen": -30.455238342285156, + "logps/rejected": -90.72950744628906, + "loss": 0.1094, + "losses/dpo": 0.08296354115009308, + "losses/sft": 0.6522700786590576, + "losses/total": 0.08296354115009308, + "ref_logps/chosen": -14.431440353393555, + "ref_logps/rejected": -37.58430480957031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6023797988891602, + "rewards/margins": 3.7121400833129883, + "rewards/rejected": -5.314520359039307, + "step": 137 + }, + { + "epoch": 0.28, + "grad_norm": 44.44517517089844, + "learning_rate": 4.6e-07, + "logps/chosen": -26.478111267089844, + "logps/rejected": -86.48759460449219, + "loss": 0.1435, + "losses/dpo": 0.25203052163124084, + "losses/sft": 0.5533711910247803, + "losses/total": 0.25203052163124084, + "ref_logps/chosen": -15.576150894165039, + "ref_logps/rejected": -38.519466400146484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.090196132659912, + "rewards/margins": 3.7066164016723633, + "rewards/rejected": -4.796812057495117, + "step": 138 + }, + { + "epoch": 0.28, + "grad_norm": 33.14357376098633, + "learning_rate": 4.633333333333333e-07, + "logps/chosen": -26.719112396240234, + "logps/rejected": -92.25952911376953, + "loss": 0.1509, + "losses/dpo": 0.11920268833637238, + "losses/sft": 0.6446678638458252, + "losses/total": 0.11920268833637238, + "ref_logps/chosen": -11.331499099731445, + "ref_logps/rejected": -35.657508850097656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5387613773345947, + "rewards/margins": 4.121440410614014, + "rewards/rejected": -5.6602020263671875, + "step": 139 + }, + { + "epoch": 0.28, + "grad_norm": 38.349788665771484, + "learning_rate": 4.6666666666666666e-07, + "logps/chosen": -26.575002670288086, + "logps/rejected": -93.16529083251953, + "loss": 0.1955, + "losses/dpo": 0.27706673741340637, + "losses/sft": 0.6116777658462524, + "losses/total": 0.27706673741340637, + "ref_logps/chosen": -13.48236083984375, + "ref_logps/rejected": -40.799232482910156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.309264063835144, + "rewards/margins": 3.9273412227630615, + "rewards/rejected": -5.236605167388916, + "step": 140 + }, + { + "epoch": 0.28, + "grad_norm": 19.368520736694336, + "learning_rate": 4.6999999999999995e-07, + "logps/chosen": -26.615325927734375, + "logps/rejected": -99.11043548583984, + "loss": 0.126, + "losses/dpo": 0.1754596084356308, + "losses/sft": 0.49534112215042114, + "losses/total": 0.1754596084356308, + "ref_logps/chosen": -13.058014869689941, + "ref_logps/rejected": -45.73929977416992, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3557310104370117, + "rewards/margins": 3.9813828468322754, + "rewards/rejected": -5.337113380432129, + "step": 141 + }, + { + "epoch": 0.28, + "grad_norm": 41.14610290527344, + "learning_rate": 4.733333333333333e-07, + "logps/chosen": -32.56314468383789, + "logps/rejected": -93.21540069580078, + "loss": 0.201, + "losses/dpo": 0.4042896628379822, + "losses/sft": 0.9554309844970703, + "losses/total": 0.4042896628379822, + "ref_logps/chosen": -12.147167205810547, + "ref_logps/rejected": -40.382930755615234, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.041597604751587, + "rewards/margins": 3.241649866104126, + "rewards/rejected": -5.283247470855713, + "step": 142 + }, + { + "epoch": 0.29, + "grad_norm": 51.96454620361328, + "learning_rate": 4.7666666666666667e-07, + "logps/chosen": -26.63742446899414, + "logps/rejected": -133.12020874023438, + "loss": 0.145, + "losses/dpo": 0.0019743088632822037, + "losses/sft": 0.6446032524108887, + "losses/total": 0.0019743088632822037, + "ref_logps/chosen": -10.237613677978516, + "ref_logps/rejected": -61.997535705566406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6399810314178467, + "rewards/margins": 5.472287178039551, + "rewards/rejected": -7.112268447875977, + "step": 143 + }, + { + "epoch": 0.29, + "grad_norm": 37.46514129638672, + "learning_rate": 4.8e-07, + "logps/chosen": -37.32117462158203, + "logps/rejected": -96.15669250488281, + "loss": 0.1903, + "losses/dpo": 0.11820630729198456, + "losses/sft": 0.6547752618789673, + "losses/total": 0.11820630729198456, + "ref_logps/chosen": -16.876934051513672, + "ref_logps/rejected": -38.996849060058594, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.044424057006836, + "rewards/margins": 3.6715593338012695, + "rewards/rejected": -5.7159833908081055, + "step": 144 + }, + { + "epoch": 0.29, + "grad_norm": 51.414974212646484, + "learning_rate": 4.833333333333333e-07, + "logps/chosen": -33.48571014404297, + "logps/rejected": -100.97714233398438, + "loss": 0.2538, + "losses/dpo": 0.30546796321868896, + "losses/sft": 0.6166024804115295, + "losses/total": 0.30546796321868896, + "ref_logps/chosen": -14.59931755065918, + "ref_logps/rejected": -47.38710021972656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.888639211654663, + "rewards/margins": 3.470364570617676, + "rewards/rejected": -5.359004020690918, + "step": 145 + }, + { + "epoch": 0.29, + "grad_norm": 45.47442626953125, + "learning_rate": 4.866666666666666e-07, + "logps/chosen": -40.079994201660156, + "logps/rejected": -128.56350708007812, + "loss": 0.1701, + "losses/dpo": 0.022961853072047234, + "losses/sft": 0.595543384552002, + "losses/total": 0.022961853072047234, + "ref_logps/chosen": -18.27761459350586, + "ref_logps/rejected": -60.02783966064453, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1802380084991455, + "rewards/margins": 4.673327445983887, + "rewards/rejected": -6.853565216064453, + "step": 146 + }, + { + "epoch": 0.29, + "grad_norm": 26.409053802490234, + "learning_rate": 4.9e-07, + "logps/chosen": -33.39720153808594, + "logps/rejected": -109.74790954589844, + "loss": 0.0791, + "losses/dpo": 0.15374058485031128, + "losses/sft": 0.4255408048629761, + "losses/total": 0.15374058485031128, + "ref_logps/chosen": -20.84080696105957, + "ref_logps/rejected": -52.04368591308594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2556393146514893, + "rewards/margins": 4.5147833824157715, + "rewards/rejected": -5.770422458648682, + "step": 147 + }, + { + "epoch": 0.3, + "grad_norm": 40.29155349731445, + "learning_rate": 4.933333333333333e-07, + "logps/chosen": -25.50351333618164, + "logps/rejected": -97.32110595703125, + "loss": 0.1404, + "losses/dpo": 0.05997714027762413, + "losses/sft": 0.4540533721446991, + "losses/total": 0.05997714027762413, + "ref_logps/chosen": -11.617637634277344, + "ref_logps/rejected": -42.73888397216797, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3885875940322876, + "rewards/margins": 4.069634914398193, + "rewards/rejected": -5.45822286605835, + "step": 148 + }, + { + "epoch": 0.3, + "grad_norm": 22.356138229370117, + "learning_rate": 4.966666666666666e-07, + "logps/chosen": -36.5714111328125, + "logps/rejected": -119.6854248046875, + "loss": 0.0701, + "losses/dpo": 0.095610611140728, + "losses/sft": 0.5715881586074829, + "losses/total": 0.095610611140728, + "ref_logps/chosen": -15.912175178527832, + "ref_logps/rejected": -53.39912414550781, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0659236907958984, + "rewards/margins": 4.562705993652344, + "rewards/rejected": -6.628629684448242, + "step": 149 + }, + { + "epoch": 0.3, + "grad_norm": 52.81399917602539, + "learning_rate": 5e-07, + "logps/chosen": -30.146623611450195, + "logps/rejected": -95.46763610839844, + "loss": 0.1872, + "losses/dpo": 0.35411715507507324, + "losses/sft": 0.7511922121047974, + "losses/total": 0.35411715507507324, + "ref_logps/chosen": -11.182028770446777, + "ref_logps/rejected": -40.55536651611328, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8964595794677734, + "rewards/margins": 3.5947678089141846, + "rewards/rejected": -5.491227149963379, + "step": 150 + }, + { + "epoch": 0.3, + "grad_norm": 51.27703857421875, + "learning_rate": 4.996296296296296e-07, + "logps/chosen": -26.03563690185547, + "logps/rejected": -85.95819091796875, + "loss": 0.1315, + "losses/dpo": 0.04389333724975586, + "losses/sft": 0.5486029386520386, + "losses/total": 0.04389333724975586, + "ref_logps/chosen": -10.959346771240234, + "ref_logps/rejected": -36.31953430175781, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5076290369033813, + "rewards/margins": 3.456235647201538, + "rewards/rejected": -4.963865280151367, + "step": 151 + }, + { + "epoch": 0.3, + "grad_norm": 28.76018714904785, + "learning_rate": 4.992592592592593e-07, + "logps/chosen": -28.23834991455078, + "logps/rejected": -113.56182098388672, + "loss": 0.1036, + "losses/dpo": 0.011073265224695206, + "losses/sft": 0.3654525876045227, + "losses/total": 0.011073265224695206, + "ref_logps/chosen": -16.015453338623047, + "ref_logps/rejected": -50.925968170166016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.222289800643921, + "rewards/margins": 5.041295528411865, + "rewards/rejected": -6.263585567474365, + "step": 152 + }, + { + "epoch": 0.31, + "grad_norm": 25.341352462768555, + "learning_rate": 4.988888888888889e-07, + "logps/chosen": -22.11452865600586, + "logps/rejected": -83.98039245605469, + "loss": 0.1078, + "losses/dpo": 0.18434298038482666, + "losses/sft": 0.3981594741344452, + "losses/total": 0.18434298038482666, + "ref_logps/chosen": -12.649343490600586, + "ref_logps/rejected": -36.79124450683594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9465184211730957, + "rewards/margins": 3.7723965644836426, + "rewards/rejected": -4.718914985656738, + "step": 153 + }, + { + "epoch": 0.31, + "grad_norm": 48.10272216796875, + "learning_rate": 4.985185185185185e-07, + "logps/chosen": -24.186851501464844, + "logps/rejected": -97.55790710449219, + "loss": 0.1649, + "losses/dpo": 0.20018045604228973, + "losses/sft": 0.4684436321258545, + "losses/total": 0.20018045604228973, + "ref_logps/chosen": -13.064205169677734, + "ref_logps/rejected": -51.07586669921875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1122647523880005, + "rewards/margins": 3.5359394550323486, + "rewards/rejected": -4.6482038497924805, + "step": 154 + }, + { + "epoch": 0.31, + "grad_norm": 40.72660827636719, + "learning_rate": 4.981481481481482e-07, + "logps/chosen": -25.070098876953125, + "logps/rejected": -96.81678009033203, + "loss": 0.1761, + "losses/dpo": 0.32067233324050903, + "losses/sft": 0.47111189365386963, + "losses/total": 0.32067233324050903, + "ref_logps/chosen": -13.66195011138916, + "ref_logps/rejected": -52.27348327636719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.140815019607544, + "rewards/margins": 3.3135147094726562, + "rewards/rejected": -4.454329967498779, + "step": 155 + }, + { + "epoch": 0.31, + "grad_norm": 60.72633743286133, + "learning_rate": 4.977777777777777e-07, + "logps/chosen": -25.260963439941406, + "logps/rejected": -120.26380920410156, + "loss": 0.1332, + "losses/dpo": 0.0010022318456321955, + "losses/sft": 0.4159192442893982, + "losses/total": 0.0010022318456321955, + "ref_logps/chosen": -11.787809371948242, + "ref_logps/rejected": -55.686058044433594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3473155498504639, + "rewards/margins": 5.11046028137207, + "rewards/rejected": -6.457776069641113, + "step": 156 + }, + { + "epoch": 0.31, + "grad_norm": 13.246103286743164, + "learning_rate": 4.974074074074074e-07, + "logps/chosen": -32.20502853393555, + "logps/rejected": -99.13482666015625, + "loss": 0.043, + "losses/dpo": 0.06591884046792984, + "losses/sft": 0.3959016799926758, + "losses/total": 0.06591884046792984, + "ref_logps/chosen": -18.83717155456543, + "ref_logps/rejected": -42.83684539794922, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3367857933044434, + "rewards/margins": 4.293012619018555, + "rewards/rejected": -5.629798412322998, + "step": 157 + }, + { + "epoch": 0.32, + "grad_norm": 51.342262268066406, + "learning_rate": 4.97037037037037e-07, + "logps/chosen": -34.90985107421875, + "logps/rejected": -100.9737548828125, + "loss": 0.1856, + "losses/dpo": 0.3706285357475281, + "losses/sft": 0.6656729578971863, + "losses/total": 0.3706285357475281, + "ref_logps/chosen": -14.48876953125, + "ref_logps/rejected": -45.89849853515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0421080589294434, + "rewards/margins": 3.4654178619384766, + "rewards/rejected": -5.50752592086792, + "step": 158 + }, + { + "epoch": 0.32, + "grad_norm": 27.331863403320312, + "learning_rate": 4.966666666666666e-07, + "logps/chosen": -26.92082405090332, + "logps/rejected": -89.95733642578125, + "loss": 0.1194, + "losses/dpo": 0.01959105022251606, + "losses/sft": 0.614719033241272, + "losses/total": 0.01959105022251606, + "ref_logps/chosen": -11.247827529907227, + "ref_logps/rejected": -35.469085693359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5672996044158936, + "rewards/margins": 3.8815250396728516, + "rewards/rejected": -5.448824882507324, + "step": 159 + }, + { + "epoch": 0.32, + "grad_norm": 29.60675048828125, + "learning_rate": 4.962962962962963e-07, + "logps/chosen": -28.120311737060547, + "logps/rejected": -109.587646484375, + "loss": 0.0785, + "losses/dpo": 0.07884176820516586, + "losses/sft": 0.4254249334335327, + "losses/total": 0.07884176820516586, + "ref_logps/chosen": -12.621431350708008, + "ref_logps/rejected": -48.657039642333984, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.549888014793396, + "rewards/margins": 4.543172836303711, + "rewards/rejected": -6.093061447143555, + "step": 160 + }, + { + "epoch": 0.32, + "grad_norm": 38.52204513549805, + "learning_rate": 4.959259259259259e-07, + "logps/chosen": -34.612762451171875, + "logps/rejected": -106.73829650878906, + "loss": 0.096, + "losses/dpo": 0.0721823126077652, + "losses/sft": 0.6061477065086365, + "losses/total": 0.0721823126077652, + "ref_logps/chosen": -17.091495513916016, + "ref_logps/rejected": -50.930519104003906, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.752126693725586, + "rewards/margins": 3.828650712966919, + "rewards/rejected": -5.580777645111084, + "step": 161 + }, + { + "epoch": 0.32, + "grad_norm": 17.7041072845459, + "learning_rate": 4.955555555555556e-07, + "logps/chosen": -36.67155075073242, + "logps/rejected": -109.15782165527344, + "loss": 0.0632, + "losses/dpo": 0.11136096715927124, + "losses/sft": 0.587780773639679, + "losses/total": 0.11136096715927124, + "ref_logps/chosen": -20.507972717285156, + "ref_logps/rejected": -48.61429977416992, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.616357684135437, + "rewards/margins": 4.437994003295898, + "rewards/rejected": -6.054351806640625, + "step": 162 + }, + { + "epoch": 0.33, + "grad_norm": 21.67244529724121, + "learning_rate": 4.951851851851851e-07, + "logps/chosen": -29.562240600585938, + "logps/rejected": -101.33702087402344, + "loss": 0.1004, + "losses/dpo": 0.010235275141894817, + "losses/sft": 0.5660809278488159, + "losses/total": 0.010235275141894817, + "ref_logps/chosen": -13.207432746887207, + "ref_logps/rejected": -45.143585205078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6354806423187256, + "rewards/margins": 3.983863115310669, + "rewards/rejected": -5.619344234466553, + "step": 163 + }, + { + "epoch": 0.33, + "grad_norm": 28.07857322692871, + "learning_rate": 4.948148148148148e-07, + "logps/chosen": -30.570457458496094, + "logps/rejected": -112.83328247070312, + "loss": 0.0717, + "losses/dpo": 0.06467992067337036, + "losses/sft": 0.45188504457473755, + "losses/total": 0.06467992067337036, + "ref_logps/chosen": -15.896978378295898, + "ref_logps/rejected": -52.206260681152344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4673478603363037, + "rewards/margins": 4.595355033874512, + "rewards/rejected": -6.0627031326293945, + "step": 164 + }, + { + "epoch": 0.33, + "grad_norm": 26.712987899780273, + "learning_rate": 4.944444444444445e-07, + "logps/chosen": -23.911460876464844, + "logps/rejected": -107.57379150390625, + "loss": 0.1045, + "losses/dpo": 0.15704016387462616, + "losses/sft": 0.585786759853363, + "losses/total": 0.15704016387462616, + "ref_logps/chosen": -13.260459899902344, + "ref_logps/rejected": -53.76809310913086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0651001930236816, + "rewards/margins": 4.315468788146973, + "rewards/rejected": -5.3805694580078125, + "step": 165 + }, + { + "epoch": 0.33, + "grad_norm": 16.83503532409668, + "learning_rate": 4.94074074074074e-07, + "logps/chosen": -40.63855743408203, + "logps/rejected": -105.12744903564453, + "loss": 0.0802, + "losses/dpo": 0.1741233468055725, + "losses/sft": 0.2261582911014557, + "losses/total": 0.1741233468055725, + "ref_logps/chosen": -24.93436050415039, + "ref_logps/rejected": -50.1585578918457, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5704195499420166, + "rewards/margins": 3.926469564437866, + "rewards/rejected": -5.496889114379883, + "step": 166 + }, + { + "epoch": 0.33, + "grad_norm": 43.83147430419922, + "learning_rate": 4.937037037037037e-07, + "logps/chosen": -30.703670501708984, + "logps/rejected": -97.72269439697266, + "loss": 0.1269, + "losses/dpo": 0.1549128293991089, + "losses/sft": 0.6281462907791138, + "losses/total": 0.1549128293991089, + "ref_logps/chosen": -13.337878227233887, + "ref_logps/rejected": -40.41243362426758, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.736579179763794, + "rewards/margins": 3.9944469928741455, + "rewards/rejected": -5.731026649475098, + "step": 167 + }, + { + "epoch": 0.34, + "grad_norm": 28.505626678466797, + "learning_rate": 4.933333333333333e-07, + "logps/chosen": -33.72929382324219, + "logps/rejected": -133.2057647705078, + "loss": 0.1079, + "losses/dpo": 0.04674118757247925, + "losses/sft": 0.49377191066741943, + "losses/total": 0.04674118757247925, + "ref_logps/chosen": -15.563924789428711, + "ref_logps/rejected": -61.7348518371582, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8165371417999268, + "rewards/margins": 5.330554962158203, + "rewards/rejected": -7.147091865539551, + "step": 168 + }, + { + "epoch": 0.34, + "grad_norm": 40.403419494628906, + "learning_rate": 4.929629629629629e-07, + "logps/chosen": -33.252532958984375, + "logps/rejected": -105.06487274169922, + "loss": 0.1667, + "losses/dpo": 0.2247321456670761, + "losses/sft": 0.6353814601898193, + "losses/total": 0.2247321456670761, + "ref_logps/chosen": -14.608097076416016, + "ref_logps/rejected": -41.214820861816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8644435405731201, + "rewards/margins": 4.520562171936035, + "rewards/rejected": -6.385005474090576, + "step": 169 + }, + { + "epoch": 0.34, + "grad_norm": 50.592559814453125, + "learning_rate": 4.925925925925926e-07, + "logps/chosen": -31.340618133544922, + "logps/rejected": -106.16822052001953, + "loss": 0.1412, + "losses/dpo": 0.23204943537712097, + "losses/sft": 0.6205928325653076, + "losses/total": 0.23204943537712097, + "ref_logps/chosen": -14.412965774536133, + "ref_logps/rejected": -41.494407653808594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.692765235900879, + "rewards/margins": 4.77461576461792, + "rewards/rejected": -6.467381477355957, + "step": 170 + }, + { + "epoch": 0.34, + "grad_norm": 34.329734802246094, + "learning_rate": 4.922222222222222e-07, + "logps/chosen": -33.6573371887207, + "logps/rejected": -77.05763244628906, + "loss": 0.1371, + "losses/dpo": 0.15634378790855408, + "losses/sft": 0.6816786527633667, + "losses/total": 0.15634378790855408, + "ref_logps/chosen": -15.983509063720703, + "ref_logps/rejected": -30.413066864013672, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7673827409744263, + "rewards/margins": 2.8970742225646973, + "rewards/rejected": -4.664457321166992, + "step": 171 + }, + { + "epoch": 0.34, + "grad_norm": 37.6192741394043, + "learning_rate": 4.918518518518519e-07, + "logps/chosen": -36.66886520385742, + "logps/rejected": -85.15867614746094, + "loss": 0.1536, + "losses/dpo": 0.21989810466766357, + "losses/sft": 0.596770703792572, + "losses/total": 0.21989810466766357, + "ref_logps/chosen": -20.18588638305664, + "ref_logps/rejected": -34.98530197143555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6482977867126465, + "rewards/margins": 3.3690390586853027, + "rewards/rejected": -5.017336845397949, + "step": 172 + }, + { + "epoch": 0.35, + "grad_norm": 25.663639068603516, + "learning_rate": 4.914814814814814e-07, + "logps/chosen": -35.823875427246094, + "logps/rejected": -106.20530700683594, + "loss": 0.0575, + "losses/dpo": 0.04412658512592316, + "losses/sft": 0.3898341655731201, + "losses/total": 0.04412658512592316, + "ref_logps/chosen": -21.750324249267578, + "ref_logps/rejected": -43.821144104003906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4073551893234253, + "rewards/margins": 4.831061363220215, + "rewards/rejected": -6.23841667175293, + "step": 173 + }, + { + "epoch": 0.35, + "grad_norm": 35.56593704223633, + "learning_rate": 4.91111111111111e-07, + "logps/chosen": -32.283058166503906, + "logps/rejected": -117.61260986328125, + "loss": 0.0963, + "losses/dpo": 0.033313900232315063, + "losses/sft": 0.6720283031463623, + "losses/total": 0.033313900232315063, + "ref_logps/chosen": -18.36054039001465, + "ref_logps/rejected": -54.971343994140625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.392251968383789, + "rewards/margins": 4.8718743324279785, + "rewards/rejected": -6.264126300811768, + "step": 174 + }, + { + "epoch": 0.35, + "grad_norm": 43.761077880859375, + "learning_rate": 4.907407407407407e-07, + "logps/chosen": -34.093841552734375, + "logps/rejected": -100.14768981933594, + "loss": 0.1455, + "losses/dpo": 0.13595961034297943, + "losses/sft": 0.5358244180679321, + "losses/total": 0.13595961034297943, + "ref_logps/chosen": -16.00359344482422, + "ref_logps/rejected": -42.43475341796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.809024691581726, + "rewards/margins": 3.962268829345703, + "rewards/rejected": -5.771293640136719, + "step": 175 + }, + { + "epoch": 0.35, + "grad_norm": 24.053037643432617, + "learning_rate": 4.903703703703703e-07, + "logps/chosen": -24.640159606933594, + "logps/rejected": -116.64846801757812, + "loss": 0.0634, + "losses/dpo": 0.15330170094966888, + "losses/sft": 0.496634840965271, + "losses/total": 0.15330170094966888, + "ref_logps/chosen": -12.316726684570312, + "ref_logps/rejected": -49.52324295043945, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2323435544967651, + "rewards/margins": 5.480178356170654, + "rewards/rejected": -6.712522506713867, + "step": 176 + }, + { + "epoch": 0.35, + "grad_norm": 18.6986141204834, + "learning_rate": 4.9e-07, + "logps/chosen": -25.20108413696289, + "logps/rejected": -98.72869873046875, + "loss": 0.0739, + "losses/dpo": 0.09845062345266342, + "losses/sft": 0.7194775342941284, + "losses/total": 0.09845062345266342, + "ref_logps/chosen": -11.484321594238281, + "ref_logps/rejected": -40.09552764892578, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3716762065887451, + "rewards/margins": 4.491641044616699, + "rewards/rejected": -5.863317489624023, + "step": 177 + }, + { + "epoch": 0.36, + "grad_norm": 25.084514617919922, + "learning_rate": 4.896296296296296e-07, + "logps/chosen": -28.90692138671875, + "logps/rejected": -109.8388671875, + "loss": 0.1246, + "losses/dpo": 0.1252589076757431, + "losses/sft": 0.48763686418533325, + "losses/total": 0.1252589076757431, + "ref_logps/chosen": -13.82424545288086, + "ref_logps/rejected": -44.95261001586914, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5082676410675049, + "rewards/margins": 4.980358123779297, + "rewards/rejected": -6.488625526428223, + "step": 178 + }, + { + "epoch": 0.36, + "grad_norm": 25.185590744018555, + "learning_rate": 4.892592592592592e-07, + "logps/chosen": -35.07394027709961, + "logps/rejected": -107.2217025756836, + "loss": 0.0935, + "losses/dpo": 0.06011039763689041, + "losses/sft": 0.6677021980285645, + "losses/total": 0.06011039763689041, + "ref_logps/chosen": -15.76345443725586, + "ref_logps/rejected": -44.287784576416016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9310487508773804, + "rewards/margins": 4.362342834472656, + "rewards/rejected": -6.293392181396484, + "step": 179 + }, + { + "epoch": 0.36, + "grad_norm": 33.190589904785156, + "learning_rate": 4.888888888888889e-07, + "logps/chosen": -27.996540069580078, + "logps/rejected": -84.33702087402344, + "loss": 0.151, + "losses/dpo": 0.2758825123310089, + "losses/sft": 0.4535401463508606, + "losses/total": 0.2758825123310089, + "ref_logps/chosen": -13.512284278869629, + "ref_logps/rejected": -37.30048751831055, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4484257698059082, + "rewards/margins": 3.25522780418396, + "rewards/rejected": -4.703653335571289, + "step": 180 + }, + { + "epoch": 0.36, + "grad_norm": 52.29859924316406, + "learning_rate": 4.885185185185185e-07, + "logps/chosen": -33.77317810058594, + "logps/rejected": -103.15384674072266, + "loss": 0.1567, + "losses/dpo": 0.11710235476493835, + "losses/sft": 0.45147156715393066, + "losses/total": 0.11710235476493835, + "ref_logps/chosen": -20.137229919433594, + "ref_logps/rejected": -41.28327178955078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3635950088500977, + "rewards/margins": 4.823462963104248, + "rewards/rejected": -6.187057971954346, + "step": 181 + }, + { + "epoch": 0.36, + "grad_norm": 70.58116149902344, + "learning_rate": 4.881481481481482e-07, + "logps/chosen": -38.54304504394531, + "logps/rejected": -84.55474090576172, + "loss": 0.2789, + "losses/dpo": 0.24431732296943665, + "losses/sft": 0.6288025975227356, + "losses/total": 0.24431732296943665, + "ref_logps/chosen": -17.36640167236328, + "ref_logps/rejected": -35.1982536315918, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1176645755767822, + "rewards/margins": 2.8179845809936523, + "rewards/rejected": -4.935649394989014, + "step": 182 + }, + { + "epoch": 0.37, + "grad_norm": 37.199989318847656, + "learning_rate": 4.877777777777777e-07, + "logps/chosen": -31.085803985595703, + "logps/rejected": -92.9924545288086, + "loss": 0.171, + "losses/dpo": 0.18612107634544373, + "losses/sft": 0.5101003646850586, + "losses/total": 0.18612107634544373, + "ref_logps/chosen": -13.339794158935547, + "ref_logps/rejected": -39.03697204589844, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7746011018753052, + "rewards/margins": 3.6209468841552734, + "rewards/rejected": -5.395547866821289, + "step": 183 + }, + { + "epoch": 0.37, + "grad_norm": 31.927581787109375, + "learning_rate": 4.874074074074073e-07, + "logps/chosen": -26.326370239257812, + "logps/rejected": -101.94358825683594, + "loss": 0.1419, + "losses/dpo": 0.18461284041404724, + "losses/sft": 0.7296566367149353, + "losses/total": 0.18461284041404724, + "ref_logps/chosen": -12.85525131225586, + "ref_logps/rejected": -45.30030822753906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.347111701965332, + "rewards/margins": 4.317215919494629, + "rewards/rejected": -5.664327621459961, + "step": 184 + }, + { + "epoch": 0.37, + "grad_norm": 37.610389709472656, + "learning_rate": 4.87037037037037e-07, + "logps/chosen": -25.972145080566406, + "logps/rejected": -91.00416564941406, + "loss": 0.1407, + "losses/dpo": 0.19254301488399506, + "losses/sft": 0.6111183166503906, + "losses/total": 0.19254301488399506, + "ref_logps/chosen": -13.008270263671875, + "ref_logps/rejected": -42.0411491394043, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2963876724243164, + "rewards/margins": 3.599914073944092, + "rewards/rejected": -4.896301746368408, + "step": 185 + }, + { + "epoch": 0.37, + "grad_norm": 46.32947540283203, + "learning_rate": 4.866666666666666e-07, + "logps/chosen": -30.488025665283203, + "logps/rejected": -84.3827896118164, + "loss": 0.2015, + "losses/dpo": 0.2830401360988617, + "losses/sft": 0.6334186792373657, + "losses/total": 0.2830401360988617, + "ref_logps/chosen": -13.258672714233398, + "ref_logps/rejected": -34.712615966796875, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.722935438156128, + "rewards/margins": 3.244082450866699, + "rewards/rejected": -4.967017650604248, + "step": 186 + }, + { + "epoch": 0.37, + "grad_norm": 43.799713134765625, + "learning_rate": 4.862962962962963e-07, + "logps/chosen": -38.89411163330078, + "logps/rejected": -103.1797103881836, + "loss": 0.1821, + "losses/dpo": 0.47093939781188965, + "losses/sft": 0.7258151769638062, + "losses/total": 0.47093939781188965, + "ref_logps/chosen": -16.819536209106445, + "ref_logps/rejected": -42.68255615234375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2074575424194336, + "rewards/margins": 3.8422579765319824, + "rewards/rejected": -6.049715518951416, + "step": 187 + }, + { + "epoch": 0.38, + "grad_norm": 50.88268280029297, + "learning_rate": 4.859259259259259e-07, + "logps/chosen": -35.74350357055664, + "logps/rejected": -76.9640121459961, + "loss": 0.2391, + "losses/dpo": 0.25321850180625916, + "losses/sft": 0.5408048033714294, + "losses/total": 0.25321850180625916, + "ref_logps/chosen": -17.249225616455078, + "ref_logps/rejected": -31.546180725097656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8494281768798828, + "rewards/margins": 2.6923556327819824, + "rewards/rejected": -4.541783809661865, + "step": 188 + }, + { + "epoch": 0.38, + "grad_norm": 30.711532592773438, + "learning_rate": 4.855555555555556e-07, + "logps/chosen": -34.859683990478516, + "logps/rejected": -98.89837646484375, + "loss": 0.1398, + "losses/dpo": 0.24823901057243347, + "losses/sft": 0.6124401688575745, + "losses/total": 0.24823901057243347, + "ref_logps/chosen": -14.467687606811523, + "ref_logps/rejected": -38.222923278808594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0391998291015625, + "rewards/margins": 4.028346061706543, + "rewards/rejected": -6.0675458908081055, + "step": 189 + }, + { + "epoch": 0.38, + "grad_norm": 62.155982971191406, + "learning_rate": 4.851851851851852e-07, + "logps/chosen": -35.63481140136719, + "logps/rejected": -111.28214263916016, + "loss": 0.2023, + "losses/dpo": 0.2405925989151001, + "losses/sft": 0.816167950630188, + "losses/total": 0.2405925989151001, + "ref_logps/chosen": -13.084310531616211, + "ref_logps/rejected": -44.44044494628906, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.255049705505371, + "rewards/margins": 4.4291205406188965, + "rewards/rejected": -6.684170722961426, + "step": 190 + }, + { + "epoch": 0.38, + "grad_norm": 20.9333553314209, + "learning_rate": 4.848148148148148e-07, + "logps/chosen": -28.955718994140625, + "logps/rejected": -106.60125732421875, + "loss": 0.1037, + "losses/dpo": 0.17893077433109283, + "losses/sft": 0.5490385890007019, + "losses/total": 0.17893077433109283, + "ref_logps/chosen": -14.494391441345215, + "ref_logps/rejected": -42.58512878417969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4461328983306885, + "rewards/margins": 4.955480098724365, + "rewards/rejected": -6.401612758636475, + "step": 191 + }, + { + "epoch": 0.38, + "grad_norm": 37.37415313720703, + "learning_rate": 4.844444444444445e-07, + "logps/chosen": -33.32417297363281, + "logps/rejected": -103.1170654296875, + "loss": 0.2031, + "losses/dpo": 0.3571177124977112, + "losses/sft": 0.5955436825752258, + "losses/total": 0.3571177124977112, + "ref_logps/chosen": -17.138912200927734, + "ref_logps/rejected": -44.96180725097656, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6185261011123657, + "rewards/margins": 4.1969990730285645, + "rewards/rejected": -5.815525531768799, + "step": 192 + }, + { + "epoch": 0.39, + "grad_norm": 17.647977828979492, + "learning_rate": 4.840740740740741e-07, + "logps/chosen": -27.43695068359375, + "logps/rejected": -98.47382354736328, + "loss": 0.0697, + "losses/dpo": 0.06315574049949646, + "losses/sft": 0.4831572473049164, + "losses/total": 0.06315574049949646, + "ref_logps/chosen": -13.393509864807129, + "ref_logps/rejected": -38.59257888793945, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.404344081878662, + "rewards/margins": 4.583780288696289, + "rewards/rejected": -5.988123893737793, + "step": 193 + }, + { + "epoch": 0.39, + "grad_norm": 46.62928771972656, + "learning_rate": 4.837037037037037e-07, + "logps/chosen": -29.626968383789062, + "logps/rejected": -91.1700439453125, + "loss": 0.2158, + "losses/dpo": 0.49264955520629883, + "losses/sft": 0.6701329350471497, + "losses/total": 0.49264955520629883, + "ref_logps/chosen": -11.046613693237305, + "ref_logps/rejected": -39.36475372314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8580358028411865, + "rewards/margins": 3.322493076324463, + "rewards/rejected": -5.18052864074707, + "step": 194 + }, + { + "epoch": 0.39, + "grad_norm": 18.034120559692383, + "learning_rate": 4.833333333333333e-07, + "logps/chosen": -29.973468780517578, + "logps/rejected": -105.07539367675781, + "loss": 0.0674, + "losses/dpo": 0.08304838836193085, + "losses/sft": 0.6566027402877808, + "losses/total": 0.08304838836193085, + "ref_logps/chosen": -14.975432395935059, + "ref_logps/rejected": -43.71247863769531, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4998037815093994, + "rewards/margins": 4.6364874839782715, + "rewards/rejected": -6.13629150390625, + "step": 195 + }, + { + "epoch": 0.39, + "grad_norm": 54.77147674560547, + "learning_rate": 4.829629629629629e-07, + "logps/chosen": -32.78771209716797, + "logps/rejected": -109.28285217285156, + "loss": 0.2141, + "losses/dpo": 0.3265150189399719, + "losses/sft": 0.8198443651199341, + "losses/total": 0.3265150189399719, + "ref_logps/chosen": -11.376585006713867, + "ref_logps/rejected": -46.43901062011719, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.141112804412842, + "rewards/margins": 4.143270492553711, + "rewards/rejected": -6.284383773803711, + "step": 196 + }, + { + "epoch": 0.39, + "grad_norm": 48.40336227416992, + "learning_rate": 4.825925925925926e-07, + "logps/chosen": -29.586572647094727, + "logps/rejected": -100.91725158691406, + "loss": 0.1325, + "losses/dpo": 0.1769840270280838, + "losses/sft": 0.528570294380188, + "losses/total": 0.1769840270280838, + "ref_logps/chosen": -10.439521789550781, + "ref_logps/rejected": -40.107704162597656, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9147053956985474, + "rewards/margins": 4.166248798370361, + "rewards/rejected": -6.080954551696777, + "step": 197 + }, + { + "epoch": 0.4, + "grad_norm": 19.345624923706055, + "learning_rate": 4.822222222222222e-07, + "logps/chosen": -33.93301773071289, + "logps/rejected": -112.19253540039062, + "loss": 0.0755, + "losses/dpo": 0.2622387409210205, + "losses/sft": 0.5382803678512573, + "losses/total": 0.2622387409210205, + "ref_logps/chosen": -15.570449829101562, + "ref_logps/rejected": -42.25779724121094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8362568616867065, + "rewards/margins": 5.157217025756836, + "rewards/rejected": -6.993474006652832, + "step": 198 + }, + { + "epoch": 0.4, + "grad_norm": 8.654232025146484, + "learning_rate": 4.818518518518519e-07, + "logps/chosen": -32.183082580566406, + "logps/rejected": -139.59701538085938, + "loss": 0.0178, + "losses/dpo": 0.0051182028837502, + "losses/sft": 0.29134008288383484, + "losses/total": 0.0051182028837502, + "ref_logps/chosen": -19.9312686920166, + "ref_logps/rejected": -55.7081298828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.225181221961975, + "rewards/margins": 7.163707733154297, + "rewards/rejected": -8.38888931274414, + "step": 199 + }, + { + "epoch": 0.4, + "grad_norm": 19.976350784301758, + "learning_rate": 4.814814814814814e-07, + "logps/chosen": -30.71693229675293, + "logps/rejected": -122.64790344238281, + "loss": 0.0996, + "losses/dpo": 0.04794596508145332, + "losses/sft": 0.655545175075531, + "losses/total": 0.04794596508145332, + "ref_logps/chosen": -16.344539642333984, + "ref_logps/rejected": -52.21672058105469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4372395277023315, + "rewards/margins": 5.605879783630371, + "rewards/rejected": -7.04311990737915, + "step": 200 + }, + { + "epoch": 0.4, + "grad_norm": 9.067500114440918, + "learning_rate": 4.81111111111111e-07, + "logps/chosen": -24.73886489868164, + "logps/rejected": -113.34480285644531, + "loss": 0.07, + "losses/dpo": 0.2013578712940216, + "losses/sft": 0.6681497693061829, + "losses/total": 0.2013578712940216, + "ref_logps/chosen": -10.942086219787598, + "ref_logps/rejected": -45.542938232421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3796777725219727, + "rewards/margins": 5.400509834289551, + "rewards/rejected": -6.780187606811523, + "step": 201 + }, + { + "epoch": 0.4, + "grad_norm": 20.155838012695312, + "learning_rate": 4.807407407407407e-07, + "logps/chosen": -34.64628219604492, + "logps/rejected": -109.43836212158203, + "loss": 0.0937, + "losses/dpo": 0.26625820994377136, + "losses/sft": 0.5927165150642395, + "losses/total": 0.26625820994377136, + "ref_logps/chosen": -14.607067108154297, + "ref_logps/rejected": -45.044769287109375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0039215087890625, + "rewards/margins": 4.4354376792907715, + "rewards/rejected": -6.439358711242676, + "step": 202 + }, + { + "epoch": 0.41, + "grad_norm": 45.13740158081055, + "learning_rate": 4.803703703703704e-07, + "logps/chosen": -34.342552185058594, + "logps/rejected": -115.29844665527344, + "loss": 0.16, + "losses/dpo": 0.19734561443328857, + "losses/sft": 0.9053852558135986, + "losses/total": 0.19734561443328857, + "ref_logps/chosen": -12.908370018005371, + "ref_logps/rejected": -47.28173065185547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.143418073654175, + "rewards/margins": 4.6582536697387695, + "rewards/rejected": -6.801671981811523, + "step": 203 + }, + { + "epoch": 0.41, + "grad_norm": 47.24686813354492, + "learning_rate": 4.8e-07, + "logps/chosen": -37.38880157470703, + "logps/rejected": -108.8377456665039, + "loss": 0.129, + "losses/dpo": 0.08070208877325058, + "losses/sft": 0.9097791314125061, + "losses/total": 0.08070208877325058, + "ref_logps/chosen": -15.263628959655762, + "ref_logps/rejected": -41.726402282714844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.212517261505127, + "rewards/margins": 4.498617172241211, + "rewards/rejected": -6.71113395690918, + "step": 204 + }, + { + "epoch": 0.41, + "grad_norm": 8.513498306274414, + "learning_rate": 4.796296296296296e-07, + "logps/chosen": -34.856285095214844, + "logps/rejected": -127.9891357421875, + "loss": 0.0201, + "losses/dpo": 0.0579579658806324, + "losses/sft": 0.6573350429534912, + "losses/total": 0.0579579658806324, + "ref_logps/chosen": -12.282885551452637, + "ref_logps/rejected": -46.556800842285156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2573397159576416, + "rewards/margins": 5.885893821716309, + "rewards/rejected": -8.143234252929688, + "step": 205 + }, + { + "epoch": 0.41, + "grad_norm": 35.727291107177734, + "learning_rate": 4.792592592592592e-07, + "logps/chosen": -36.47418975830078, + "logps/rejected": -120.59506225585938, + "loss": 0.106, + "losses/dpo": 0.17767032980918884, + "losses/sft": 0.5659396648406982, + "losses/total": 0.17767032980918884, + "ref_logps/chosen": -14.854631423950195, + "ref_logps/rejected": -54.54015350341797, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1619560718536377, + "rewards/margins": 4.443534851074219, + "rewards/rejected": -6.605490684509277, + "step": 206 + }, + { + "epoch": 0.41, + "grad_norm": 53.7916374206543, + "learning_rate": 4.788888888888889e-07, + "logps/chosen": -31.674482345581055, + "logps/rejected": -111.17448425292969, + "loss": 0.1663, + "losses/dpo": 0.4532434344291687, + "losses/sft": 0.7202601432800293, + "losses/total": 0.4532434344291687, + "ref_logps/chosen": -13.298295021057129, + "ref_logps/rejected": -40.8799934387207, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8376187086105347, + "rewards/margins": 5.191830635070801, + "rewards/rejected": -7.029449462890625, + "step": 207 + }, + { + "epoch": 0.42, + "grad_norm": 40.02408981323242, + "learning_rate": 4.785185185185185e-07, + "logps/chosen": -34.69915008544922, + "logps/rejected": -99.4919204711914, + "loss": 0.2046, + "losses/dpo": 0.6538295745849609, + "losses/sft": 0.660841703414917, + "losses/total": 0.6538295745849609, + "ref_logps/chosen": -14.765646934509277, + "ref_logps/rejected": -39.34600830078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9933503866195679, + "rewards/margins": 4.021241188049316, + "rewards/rejected": -6.014591693878174, + "step": 208 + }, + { + "epoch": 0.42, + "grad_norm": 61.37720489501953, + "learning_rate": 4.781481481481482e-07, + "logps/chosen": -47.08232879638672, + "logps/rejected": -118.26411437988281, + "loss": 0.1965, + "losses/dpo": 0.3133441209793091, + "losses/sft": 0.6915695667266846, + "losses/total": 0.3133441209793091, + "ref_logps/chosen": -16.830141067504883, + "ref_logps/rejected": -45.98204040527344, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.025218963623047, + "rewards/margins": 4.202988624572754, + "rewards/rejected": -7.228207588195801, + "step": 209 + }, + { + "epoch": 0.42, + "grad_norm": 31.95800018310547, + "learning_rate": 4.777777777777778e-07, + "logps/chosen": -36.95540237426758, + "logps/rejected": -123.79988098144531, + "loss": 0.0974, + "losses/dpo": 0.09307215362787247, + "losses/sft": 0.7482750415802002, + "losses/total": 0.09307215362787247, + "ref_logps/chosen": -14.497810363769531, + "ref_logps/rejected": -52.44733428955078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2457590103149414, + "rewards/margins": 4.889495849609375, + "rewards/rejected": -7.135254859924316, + "step": 210 + }, + { + "epoch": 0.42, + "grad_norm": 56.4405632019043, + "learning_rate": 4.774074074074073e-07, + "logps/chosen": -26.164018630981445, + "logps/rejected": -125.44146728515625, + "loss": 0.2563, + "losses/dpo": 0.01354515552520752, + "losses/sft": 0.4640093445777893, + "losses/total": 0.01354515552520752, + "ref_logps/chosen": -10.535161018371582, + "ref_logps/rejected": -49.800575256347656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5628857612609863, + "rewards/margins": 6.001203536987305, + "rewards/rejected": -7.564088821411133, + "step": 211 + }, + { + "epoch": 0.42, + "grad_norm": 36.40195846557617, + "learning_rate": 4.77037037037037e-07, + "logps/chosen": -29.295894622802734, + "logps/rejected": -79.89122009277344, + "loss": 0.1645, + "losses/dpo": 0.19777898490428925, + "losses/sft": 0.5077103972434998, + "losses/total": 0.19777898490428925, + "ref_logps/chosen": -11.7691068649292, + "ref_logps/rejected": -33.79277038574219, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.752678632736206, + "rewards/margins": 2.8571667671203613, + "rewards/rejected": -4.609845161437988, + "step": 212 + }, + { + "epoch": 0.43, + "grad_norm": 70.0155029296875, + "learning_rate": 4.7666666666666667e-07, + "logps/chosen": -30.167865753173828, + "logps/rejected": -88.53804016113281, + "loss": 0.2756, + "losses/dpo": 0.18790437281131744, + "losses/sft": 0.6706699132919312, + "losses/total": 0.18790437281131744, + "ref_logps/chosen": -11.840088844299316, + "ref_logps/rejected": -32.273094177246094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.832777500152588, + "rewards/margins": 3.7937171459198, + "rewards/rejected": -5.626494884490967, + "step": 213 + }, + { + "epoch": 0.43, + "grad_norm": 43.36481857299805, + "learning_rate": 4.7629629629629626e-07, + "logps/chosen": -34.01522445678711, + "logps/rejected": -145.6756591796875, + "loss": 0.1293, + "losses/dpo": 0.005147114396095276, + "losses/sft": 0.7248036861419678, + "losses/total": 0.005147114396095276, + "ref_logps/chosen": -14.497140884399414, + "ref_logps/rejected": -54.134281158447266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9518084526062012, + "rewards/margins": 7.202329635620117, + "rewards/rejected": -9.154138565063477, + "step": 214 + }, + { + "epoch": 0.43, + "grad_norm": 15.9217529296875, + "learning_rate": 4.759259259259259e-07, + "logps/chosen": -39.08689880371094, + "logps/rejected": -118.49710845947266, + "loss": 0.0408, + "losses/dpo": 0.05637574940919876, + "losses/sft": 0.6518598794937134, + "losses/total": 0.05637574940919876, + "ref_logps/chosen": -16.12924575805664, + "ref_logps/rejected": -45.19512939453125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.295764923095703, + "rewards/margins": 5.034433841705322, + "rewards/rejected": -7.330198287963867, + "step": 215 + }, + { + "epoch": 0.43, + "grad_norm": 28.412574768066406, + "learning_rate": 4.7555555555555554e-07, + "logps/chosen": -40.055641174316406, + "logps/rejected": -119.33856201171875, + "loss": 0.066, + "losses/dpo": 0.16118040680885315, + "losses/sft": 0.7415938377380371, + "losses/total": 0.16118040680885315, + "ref_logps/chosen": -18.70709991455078, + "ref_logps/rejected": -47.00398635864258, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1348538398742676, + "rewards/margins": 5.098602771759033, + "rewards/rejected": -7.233456611633301, + "step": 216 + }, + { + "epoch": 0.43, + "grad_norm": 30.004011154174805, + "learning_rate": 4.751851851851852e-07, + "logps/chosen": -31.2343692779541, + "logps/rejected": -152.04202270507812, + "loss": 0.0871, + "losses/dpo": 0.2913620173931122, + "losses/sft": 0.5769238471984863, + "losses/total": 0.2913620173931122, + "ref_logps/chosen": -11.397944450378418, + "ref_logps/rejected": -56.8441162109375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.983642339706421, + "rewards/margins": 7.536149024963379, + "rewards/rejected": -9.519791603088379, + "step": 217 + }, + { + "epoch": 0.44, + "grad_norm": 49.84681701660156, + "learning_rate": 4.7481481481481477e-07, + "logps/chosen": -38.713043212890625, + "logps/rejected": -169.45138549804688, + "loss": 0.0497, + "losses/dpo": 0.021128835156559944, + "losses/sft": 0.6783910989761353, + "losses/total": 0.021128835156559944, + "ref_logps/chosen": -13.662879943847656, + "ref_logps/rejected": -70.60210418701172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.505016326904297, + "rewards/margins": 7.379911422729492, + "rewards/rejected": -9.884927749633789, + "step": 218 + }, + { + "epoch": 0.44, + "grad_norm": 42.746803283691406, + "learning_rate": 4.744444444444444e-07, + "logps/chosen": -40.39366912841797, + "logps/rejected": -112.6432113647461, + "loss": 0.0885, + "losses/dpo": 0.092128686606884, + "losses/sft": 0.5247994661331177, + "losses/total": 0.092128686606884, + "ref_logps/chosen": -17.325380325317383, + "ref_logps/rejected": -42.825828552246094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3068289756774902, + "rewards/margins": 4.6749091148376465, + "rewards/rejected": -6.9817376136779785, + "step": 219 + }, + { + "epoch": 0.44, + "grad_norm": 20.136625289916992, + "learning_rate": 4.7407407407407405e-07, + "logps/chosen": -33.28211975097656, + "logps/rejected": -112.59181213378906, + "loss": 0.0883, + "losses/dpo": 0.20460954308509827, + "losses/sft": 0.5208683013916016, + "losses/total": 0.20460954308509827, + "ref_logps/chosen": -15.455129623413086, + "ref_logps/rejected": -40.72985076904297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7826988697052002, + "rewards/margins": 5.403497695922852, + "rewards/rejected": -7.186196327209473, + "step": 220 + }, + { + "epoch": 0.44, + "grad_norm": 66.03868103027344, + "learning_rate": 4.7370370370370364e-07, + "logps/chosen": -31.484960556030273, + "logps/rejected": -113.89936828613281, + "loss": 0.2053, + "losses/dpo": 0.5744110941886902, + "losses/sft": 0.8293743133544922, + "losses/total": 0.5744110941886902, + "ref_logps/chosen": -12.668027877807617, + "ref_logps/rejected": -46.156768798828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8816933631896973, + "rewards/margins": 4.892566680908203, + "rewards/rejected": -6.7742600440979, + "step": 221 + }, + { + "epoch": 0.44, + "grad_norm": 36.92087936401367, + "learning_rate": 4.733333333333333e-07, + "logps/chosen": -34.15308380126953, + "logps/rejected": -102.67060852050781, + "loss": 0.1643, + "losses/dpo": 0.04543168097734451, + "losses/sft": 0.6770851016044617, + "losses/total": 0.04543168097734451, + "ref_logps/chosen": -13.57850456237793, + "ref_logps/rejected": -39.29328918457031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.05745792388916, + "rewards/margins": 4.280274391174316, + "rewards/rejected": -6.337732315063477, + "step": 222 + }, + { + "epoch": 0.45, + "grad_norm": 47.12366485595703, + "learning_rate": 4.72962962962963e-07, + "logps/chosen": -31.432491302490234, + "logps/rejected": -99.98987579345703, + "loss": 0.1713, + "losses/dpo": 0.1538832187652588, + "losses/sft": 0.8228436708450317, + "losses/total": 0.1538832187652588, + "ref_logps/chosen": -12.841203689575195, + "ref_logps/rejected": -39.756378173828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8591285943984985, + "rewards/margins": 4.16422176361084, + "rewards/rejected": -6.023350238800049, + "step": 223 + }, + { + "epoch": 0.45, + "grad_norm": 74.00916290283203, + "learning_rate": 4.725925925925926e-07, + "logps/chosen": -33.925750732421875, + "logps/rejected": -107.38592529296875, + "loss": 0.2729, + "losses/dpo": 0.24903066456317902, + "losses/sft": 0.5291041135787964, + "losses/total": 0.24903066456317902, + "ref_logps/chosen": -17.715248107910156, + "ref_logps/rejected": -48.10115051269531, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.621050238609314, + "rewards/margins": 4.307427406311035, + "rewards/rejected": -5.9284772872924805, + "step": 224 + }, + { + "epoch": 0.45, + "grad_norm": 26.433197021484375, + "learning_rate": 4.722222222222222e-07, + "logps/chosen": -29.673873901367188, + "logps/rejected": -96.80340576171875, + "loss": 0.1643, + "losses/dpo": 0.0513819195330143, + "losses/sft": 0.5060651302337646, + "losses/total": 0.0513819195330143, + "ref_logps/chosen": -13.205547332763672, + "ref_logps/rejected": -39.28229522705078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6468327045440674, + "rewards/margins": 4.105278491973877, + "rewards/rejected": -5.752111434936523, + "step": 225 + }, + { + "epoch": 0.45, + "grad_norm": 24.875255584716797, + "learning_rate": 4.7185185185185185e-07, + "logps/chosen": -34.83883285522461, + "logps/rejected": -128.52703857421875, + "loss": 0.0889, + "losses/dpo": 0.26298192143440247, + "losses/sft": 0.489166259765625, + "losses/total": 0.26298192143440247, + "ref_logps/chosen": -18.28306770324707, + "ref_logps/rejected": -54.15491485595703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6555763483047485, + "rewards/margins": 5.78163480758667, + "rewards/rejected": -7.437211513519287, + "step": 226 + }, + { + "epoch": 0.45, + "grad_norm": 27.833566665649414, + "learning_rate": 4.714814814814815e-07, + "logps/chosen": -32.174381256103516, + "logps/rejected": -122.44319152832031, + "loss": 0.0825, + "losses/dpo": 0.08397988975048065, + "losses/sft": 0.7054194808006287, + "losses/total": 0.08397988975048065, + "ref_logps/chosen": -13.736897468566895, + "ref_logps/rejected": -52.05677032470703, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8437484502792358, + "rewards/margins": 5.194893836975098, + "rewards/rejected": -7.038642406463623, + "step": 227 + }, + { + "epoch": 0.46, + "grad_norm": 8.22507095336914, + "learning_rate": 4.711111111111111e-07, + "logps/chosen": -33.30576705932617, + "logps/rejected": -127.82069396972656, + "loss": 0.0227, + "losses/dpo": 0.049265846610069275, + "losses/sft": 0.48058852553367615, + "losses/total": 0.049265846610069275, + "ref_logps/chosen": -15.762434959411621, + "ref_logps/rejected": -50.94194030761719, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.75433349609375, + "rewards/margins": 5.933541297912598, + "rewards/rejected": -7.687874794006348, + "step": 228 + }, + { + "epoch": 0.46, + "grad_norm": 17.72207260131836, + "learning_rate": 4.707407407407407e-07, + "logps/chosen": -37.80146789550781, + "logps/rejected": -109.19346618652344, + "loss": 0.0558, + "losses/dpo": 0.10598674416542053, + "losses/sft": 0.5179128050804138, + "losses/total": 0.10598674416542053, + "ref_logps/chosen": -17.415359497070312, + "ref_logps/rejected": -44.45947265625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0386109352111816, + "rewards/margins": 4.434789657592773, + "rewards/rejected": -6.473400592803955, + "step": 229 + }, + { + "epoch": 0.46, + "grad_norm": 76.22272491455078, + "learning_rate": 4.7037037037037036e-07, + "logps/chosen": -37.97969436645508, + "logps/rejected": -98.01371002197266, + "loss": 0.303, + "losses/dpo": 0.26055893301963806, + "losses/sft": 0.7976289987564087, + "losses/total": 0.26055893301963806, + "ref_logps/chosen": -15.016735076904297, + "ref_logps/rejected": -39.44939422607422, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2962958812713623, + "rewards/margins": 3.560136318206787, + "rewards/rejected": -5.85643196105957, + "step": 230 + }, + { + "epoch": 0.46, + "grad_norm": 4.947128772735596, + "learning_rate": 4.6999999999999995e-07, + "logps/chosen": -32.68196487426758, + "logps/rejected": -122.32713317871094, + "loss": 0.0543, + "losses/dpo": 0.1880553811788559, + "losses/sft": 0.4257510006427765, + "losses/total": 0.1880553811788559, + "ref_logps/chosen": -17.893299102783203, + "ref_logps/rejected": -49.445945739746094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4788663387298584, + "rewards/margins": 5.809252738952637, + "rewards/rejected": -7.288119316101074, + "step": 231 + }, + { + "epoch": 0.46, + "grad_norm": 29.260194778442383, + "learning_rate": 4.696296296296296e-07, + "logps/chosen": -30.062938690185547, + "logps/rejected": -100.40599060058594, + "loss": 0.0955, + "losses/dpo": 0.04028013348579407, + "losses/sft": 0.6679549217224121, + "losses/total": 0.04028013348579407, + "ref_logps/chosen": -12.384693145751953, + "ref_logps/rejected": -39.80000305175781, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.767824411392212, + "rewards/margins": 4.292774200439453, + "rewards/rejected": -6.060598850250244, + "step": 232 + }, + { + "epoch": 0.47, + "grad_norm": 70.72667694091797, + "learning_rate": 4.6925925925925923e-07, + "logps/chosen": -37.28951644897461, + "logps/rejected": -87.13302612304688, + "loss": 0.4117, + "losses/dpo": 0.19169825315475464, + "losses/sft": 0.9827092885971069, + "losses/total": 0.19169825315475464, + "ref_logps/chosen": -11.481277465820312, + "ref_logps/rejected": -34.47808074951172, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.5808238983154297, + "rewards/margins": 2.6846706867218018, + "rewards/rejected": -5.265494346618652, + "step": 233 + }, + { + "epoch": 0.47, + "grad_norm": 38.08920669555664, + "learning_rate": 4.6888888888888887e-07, + "logps/chosen": -46.131221771240234, + "logps/rejected": -120.89785766601562, + "loss": 0.1526, + "losses/dpo": 0.06840162724256516, + "losses/sft": 0.5885744094848633, + "losses/total": 0.06840162724256516, + "ref_logps/chosen": -29.300039291381836, + "ref_logps/rejected": -57.00225830078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.683118224143982, + "rewards/margins": 4.706441402435303, + "rewards/rejected": -6.389559745788574, + "step": 234 + }, + { + "epoch": 0.47, + "grad_norm": 60.85316848754883, + "learning_rate": 4.6851851851851846e-07, + "logps/chosen": -36.02318572998047, + "logps/rejected": -101.59310913085938, + "loss": 0.2075, + "losses/dpo": 0.1142917200922966, + "losses/sft": 0.7939429879188538, + "losses/total": 0.1142917200922966, + "ref_logps/chosen": -14.844230651855469, + "ref_logps/rejected": -42.496620178222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1178956031799316, + "rewards/margins": 3.7917532920837402, + "rewards/rejected": -5.909648895263672, + "step": 235 + }, + { + "epoch": 0.47, + "grad_norm": 30.55718421936035, + "learning_rate": 4.681481481481481e-07, + "logps/chosen": -40.79773712158203, + "logps/rejected": -105.50750732421875, + "loss": 0.0906, + "losses/dpo": 0.08892350643873215, + "losses/sft": 0.33517754077911377, + "losses/total": 0.08892350643873215, + "ref_logps/chosen": -20.355552673339844, + "ref_logps/rejected": -45.50160217285156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0442183017730713, + "rewards/margins": 3.956371545791626, + "rewards/rejected": -6.0005903244018555, + "step": 236 + }, + { + "epoch": 0.47, + "grad_norm": 36.39009475708008, + "learning_rate": 4.677777777777778e-07, + "logps/chosen": -35.50791931152344, + "logps/rejected": -107.39195251464844, + "loss": 0.124, + "losses/dpo": 0.1986294537782669, + "losses/sft": 0.3821603059768677, + "losses/total": 0.1986294537782669, + "ref_logps/chosen": -16.365543365478516, + "ref_logps/rejected": -40.81610107421875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9142378568649292, + "rewards/margins": 4.74334716796875, + "rewards/rejected": -6.6575846672058105, + "step": 237 + }, + { + "epoch": 0.48, + "grad_norm": 34.70072555541992, + "learning_rate": 4.674074074074074e-07, + "logps/chosen": -31.894338607788086, + "logps/rejected": -115.76055145263672, + "loss": 0.0858, + "losses/dpo": 0.17275740206241608, + "losses/sft": 0.6638415455818176, + "losses/total": 0.17275740206241608, + "ref_logps/chosen": -14.566429138183594, + "ref_logps/rejected": -48.65589904785156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7327909469604492, + "rewards/margins": 4.97767448425293, + "rewards/rejected": -6.710465431213379, + "step": 238 + }, + { + "epoch": 0.48, + "grad_norm": 65.5306625366211, + "learning_rate": 4.67037037037037e-07, + "logps/chosen": -36.98841857910156, + "logps/rejected": -89.85264587402344, + "loss": 0.2781, + "losses/dpo": 0.0937236100435257, + "losses/sft": 0.5629141330718994, + "losses/total": 0.0937236100435257, + "ref_logps/chosen": -15.890052795410156, + "ref_logps/rejected": -37.154483795166016, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1098363399505615, + "rewards/margins": 3.159980058670044, + "rewards/rejected": -5.2698163986206055, + "step": 239 + }, + { + "epoch": 0.48, + "grad_norm": 29.891502380371094, + "learning_rate": 4.6666666666666666e-07, + "logps/chosen": -25.592954635620117, + "logps/rejected": -114.64250183105469, + "loss": 0.1173, + "losses/dpo": 0.15019918978214264, + "losses/sft": 0.5278570652008057, + "losses/total": 0.15019918978214264, + "ref_logps/chosen": -13.68138313293457, + "ref_logps/rejected": -52.11135482788086, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1911571025848389, + "rewards/margins": 5.061957359313965, + "rewards/rejected": -6.253114700317383, + "step": 240 + }, + { + "epoch": 0.48, + "grad_norm": 16.76541519165039, + "learning_rate": 4.662962962962963e-07, + "logps/chosen": -31.80239486694336, + "logps/rejected": -137.59100341796875, + "loss": 0.0861, + "losses/dpo": 0.23009899258613586, + "losses/sft": 0.5560140013694763, + "losses/total": 0.23009899258613586, + "ref_logps/chosen": -11.215553283691406, + "ref_logps/rejected": -52.46674346923828, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0586843490600586, + "rewards/margins": 6.453742027282715, + "rewards/rejected": -8.512426376342773, + "step": 241 + }, + { + "epoch": 0.48, + "grad_norm": 54.48352813720703, + "learning_rate": 4.659259259259259e-07, + "logps/chosen": -30.94244384765625, + "logps/rejected": -103.1988525390625, + "loss": 0.1839, + "losses/dpo": 0.4775644540786743, + "losses/sft": 0.7302254438400269, + "losses/total": 0.4775644540786743, + "ref_logps/chosen": -13.036834716796875, + "ref_logps/rejected": -44.61477279663086, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7905609607696533, + "rewards/margins": 4.067846775054932, + "rewards/rejected": -5.858407974243164, + "step": 242 + }, + { + "epoch": 0.49, + "grad_norm": 68.84845733642578, + "learning_rate": 4.6555555555555553e-07, + "logps/chosen": -30.0528564453125, + "logps/rejected": -136.130859375, + "loss": 0.1851, + "losses/dpo": 0.0996524840593338, + "losses/sft": 0.5073419213294983, + "losses/total": 0.0996524840593338, + "ref_logps/chosen": -13.253033638000488, + "ref_logps/rejected": -60.84919738769531, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.679982304573059, + "rewards/margins": 5.848184585571289, + "rewards/rejected": -7.5281662940979, + "step": 243 + }, + { + "epoch": 0.49, + "grad_norm": 50.93902587890625, + "learning_rate": 4.651851851851852e-07, + "logps/chosen": -33.61359405517578, + "logps/rejected": -108.6624984741211, + "loss": 0.1313, + "losses/dpo": 0.01921762339770794, + "losses/sft": 0.6857509613037109, + "losses/total": 0.01921762339770794, + "ref_logps/chosen": -15.684530258178711, + "ref_logps/rejected": -44.83380126953125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7929065227508545, + "rewards/margins": 4.589962959289551, + "rewards/rejected": -6.382869243621826, + "step": 244 + }, + { + "epoch": 0.49, + "grad_norm": 32.41175079345703, + "learning_rate": 4.6481481481481476e-07, + "logps/chosen": -30.212507247924805, + "logps/rejected": -133.505126953125, + "loss": 0.0789, + "losses/dpo": 0.2453601062297821, + "losses/sft": 0.7741209268569946, + "losses/total": 0.2453601062297821, + "ref_logps/chosen": -11.35911750793457, + "ref_logps/rejected": -49.58050537109375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8853390216827393, + "rewards/margins": 6.507124423980713, + "rewards/rejected": -8.392463684082031, + "step": 245 + }, + { + "epoch": 0.49, + "grad_norm": 29.210350036621094, + "learning_rate": 4.644444444444444e-07, + "logps/chosen": -32.476158142089844, + "logps/rejected": -92.0237045288086, + "loss": 0.0859, + "losses/dpo": 0.06182260811328888, + "losses/sft": 0.8990675806999207, + "losses/total": 0.06182260811328888, + "ref_logps/chosen": -12.002470970153809, + "ref_logps/rejected": -30.318471908569336, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0473690032958984, + "rewards/margins": 4.123154640197754, + "rewards/rejected": -6.170523643493652, + "step": 246 + }, + { + "epoch": 0.49, + "grad_norm": 40.80755615234375, + "learning_rate": 4.6407407407407404e-07, + "logps/chosen": -33.550479888916016, + "logps/rejected": -115.95787811279297, + "loss": 0.1212, + "losses/dpo": 0.10118857771158218, + "losses/sft": 0.5980912446975708, + "losses/total": 0.10118857771158218, + "ref_logps/chosen": -12.059894561767578, + "ref_logps/rejected": -49.10792922973633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1490585803985596, + "rewards/margins": 4.535937309265137, + "rewards/rejected": -6.684995651245117, + "step": 247 + }, + { + "epoch": 0.5, + "grad_norm": 46.593788146972656, + "learning_rate": 4.637037037037037e-07, + "logps/chosen": -40.190673828125, + "logps/rejected": -125.31559753417969, + "loss": 0.1325, + "losses/dpo": 0.12757886946201324, + "losses/sft": 0.7891998291015625, + "losses/total": 0.12757886946201324, + "ref_logps/chosen": -17.106029510498047, + "ref_logps/rejected": -51.4550895690918, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.308464288711548, + "rewards/margins": 5.077587127685547, + "rewards/rejected": -7.386051654815674, + "step": 248 + }, + { + "epoch": 0.5, + "grad_norm": 12.482865333557129, + "learning_rate": 4.633333333333333e-07, + "logps/chosen": -33.39018249511719, + "logps/rejected": -124.94376373291016, + "loss": 0.0251, + "losses/dpo": 0.0052696047350764275, + "losses/sft": 0.7853429317474365, + "losses/total": 0.0052696047350764275, + "ref_logps/chosen": -13.695304870605469, + "ref_logps/rejected": -46.375450134277344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9694875478744507, + "rewards/margins": 5.887343883514404, + "rewards/rejected": -7.856831073760986, + "step": 249 + }, + { + "epoch": 0.5, + "grad_norm": 13.024248123168945, + "learning_rate": 4.6296296296296297e-07, + "logps/chosen": -25.502864837646484, + "logps/rejected": -125.57063293457031, + "loss": 0.0692, + "losses/dpo": 0.015543513000011444, + "losses/sft": 0.41955021023750305, + "losses/total": 0.015543513000011444, + "ref_logps/chosen": -12.20850658416748, + "ref_logps/rejected": -49.56139373779297, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3294358253479004, + "rewards/margins": 6.271487236022949, + "rewards/rejected": -7.600924015045166, + "step": 250 + }, + { + "epoch": 0.5, + "grad_norm": 23.54766082763672, + "learning_rate": 4.625925925925926e-07, + "logps/chosen": -32.23238754272461, + "logps/rejected": -103.67035675048828, + "loss": 0.1256, + "losses/dpo": 0.06304627656936646, + "losses/sft": 0.591474175453186, + "losses/total": 0.06304627656936646, + "ref_logps/chosen": -11.873893737792969, + "ref_logps/rejected": -39.451812744140625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0358495712280273, + "rewards/margins": 4.386005401611328, + "rewards/rejected": -6.4218549728393555, + "step": 251 + }, + { + "epoch": 0.5, + "grad_norm": 13.067554473876953, + "learning_rate": 4.622222222222222e-07, + "logps/chosen": -36.21117401123047, + "logps/rejected": -123.28595733642578, + "loss": 0.0377, + "losses/dpo": 0.035405233502388, + "losses/sft": 0.7797837853431702, + "losses/total": 0.035405233502388, + "ref_logps/chosen": -16.308122634887695, + "ref_logps/rejected": -51.14239501953125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9903050661087036, + "rewards/margins": 5.224051475524902, + "rewards/rejected": -7.214356422424316, + "step": 252 + }, + { + "epoch": 0.51, + "grad_norm": 39.37320327758789, + "learning_rate": 4.6185185185185184e-07, + "logps/chosen": -39.30860900878906, + "logps/rejected": -93.77995300292969, + "loss": 0.2996, + "losses/dpo": 0.1254265010356903, + "losses/sft": 0.975360631942749, + "losses/total": 0.1254265010356903, + "ref_logps/chosen": -10.353572845458984, + "ref_logps/rejected": -28.20992088317871, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.895503520965576, + "rewards/margins": 3.661499500274658, + "rewards/rejected": -6.557003021240234, + "step": 253 + }, + { + "epoch": 0.51, + "grad_norm": 15.333211898803711, + "learning_rate": 4.614814814814815e-07, + "logps/chosen": -37.52141571044922, + "logps/rejected": -137.45230102539062, + "loss": 0.0751, + "losses/dpo": 0.0252683162689209, + "losses/sft": 0.4655612111091614, + "losses/total": 0.0252683162689209, + "ref_logps/chosen": -18.3541259765625, + "ref_logps/rejected": -56.88932800292969, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9167288541793823, + "rewards/margins": 6.139569282531738, + "rewards/rejected": -8.056299209594727, + "step": 254 + }, + { + "epoch": 0.51, + "grad_norm": 32.789756774902344, + "learning_rate": 4.611111111111111e-07, + "logps/chosen": -36.83077621459961, + "logps/rejected": -113.62246704101562, + "loss": 0.1548, + "losses/dpo": 0.2664481997489929, + "losses/sft": 0.7239383459091187, + "losses/total": 0.2664481997489929, + "ref_logps/chosen": -14.862010955810547, + "ref_logps/rejected": -41.03871536254883, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1968765258789062, + "rewards/margins": 5.061498641967773, + "rewards/rejected": -7.25837516784668, + "step": 255 + }, + { + "epoch": 0.51, + "grad_norm": 15.145865440368652, + "learning_rate": 4.607407407407407e-07, + "logps/chosen": -36.43583297729492, + "logps/rejected": -128.9272003173828, + "loss": 0.0594, + "losses/dpo": 0.15890918672084808, + "losses/sft": 1.0230506658554077, + "losses/total": 0.15890918672084808, + "ref_logps/chosen": -13.353048324584961, + "ref_logps/rejected": -46.61483383178711, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3082785606384277, + "rewards/margins": 5.9229583740234375, + "rewards/rejected": -8.231237411499023, + "step": 256 + }, + { + "epoch": 0.51, + "grad_norm": 34.71031951904297, + "learning_rate": 4.6037037037037035e-07, + "logps/chosen": -36.72929382324219, + "logps/rejected": -120.64111328125, + "loss": 0.1215, + "losses/dpo": 0.34539294242858887, + "losses/sft": 0.5950815677642822, + "losses/total": 0.34539294242858887, + "ref_logps/chosen": -14.87203311920166, + "ref_logps/rejected": -43.00688934326172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1857261657714844, + "rewards/margins": 5.577695846557617, + "rewards/rejected": -7.763422012329102, + "step": 257 + }, + { + "epoch": 0.52, + "grad_norm": 39.24991226196289, + "learning_rate": 4.6e-07, + "logps/chosen": -36.05840301513672, + "logps/rejected": -112.58522033691406, + "loss": 0.12, + "losses/dpo": 0.390174001455307, + "losses/sft": 0.3807469606399536, + "losses/total": 0.390174001455307, + "ref_logps/chosen": -15.537040710449219, + "ref_logps/rejected": -48.35340118408203, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0521364212036133, + "rewards/margins": 4.371045112609863, + "rewards/rejected": -6.423181533813477, + "step": 258 + }, + { + "epoch": 0.52, + "grad_norm": 34.78696060180664, + "learning_rate": 4.596296296296296e-07, + "logps/chosen": -30.690231323242188, + "logps/rejected": -98.33554077148438, + "loss": 0.1078, + "losses/dpo": 0.0833856463432312, + "losses/sft": 0.6299813985824585, + "losses/total": 0.0833856463432312, + "ref_logps/chosen": -11.812528610229492, + "ref_logps/rejected": -32.57080078125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.887770175933838, + "rewards/margins": 4.688704490661621, + "rewards/rejected": -6.576474666595459, + "step": 259 + }, + { + "epoch": 0.52, + "grad_norm": 13.130879402160645, + "learning_rate": 4.592592592592592e-07, + "logps/chosen": -30.18829345703125, + "logps/rejected": -131.82125854492188, + "loss": 0.0388, + "losses/dpo": 0.04645497351884842, + "losses/sft": 0.4061301350593567, + "losses/total": 0.04645497351884842, + "ref_logps/chosen": -13.528017044067383, + "ref_logps/rejected": -56.977508544921875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6660277843475342, + "rewards/margins": 5.818347930908203, + "rewards/rejected": -7.484375953674316, + "step": 260 + }, + { + "epoch": 0.52, + "grad_norm": 19.638797760009766, + "learning_rate": 4.5888888888888886e-07, + "logps/chosen": -28.61343765258789, + "logps/rejected": -122.08333587646484, + "loss": 0.063, + "losses/dpo": 0.039696015417575836, + "losses/sft": 0.5925369262695312, + "losses/total": 0.039696015417575836, + "ref_logps/chosen": -12.935312271118164, + "ref_logps/rejected": -47.2875862121582, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5678125619888306, + "rewards/margins": 5.911762237548828, + "rewards/rejected": -7.479574203491211, + "step": 261 + }, + { + "epoch": 0.52, + "grad_norm": 25.106760025024414, + "learning_rate": 4.5851851851851845e-07, + "logps/chosen": -30.96830177307129, + "logps/rejected": -123.52799987792969, + "loss": 0.129, + "losses/dpo": 0.1747172325849533, + "losses/sft": 0.6229246258735657, + "losses/total": 0.1747172325849533, + "ref_logps/chosen": -9.280638694763184, + "ref_logps/rejected": -40.720794677734375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1687662601470947, + "rewards/margins": 6.111954212188721, + "rewards/rejected": -8.280721664428711, + "step": 262 + }, + { + "epoch": 0.53, + "grad_norm": 16.804208755493164, + "learning_rate": 4.5814814814814814e-07, + "logps/chosen": -30.801437377929688, + "logps/rejected": -137.78640747070312, + "loss": 0.056, + "losses/dpo": 0.11071071773767471, + "losses/sft": 0.5817344188690186, + "losses/total": 0.11071071773767471, + "ref_logps/chosen": -12.325854301452637, + "ref_logps/rejected": -59.880088806152344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8475583791732788, + "rewards/margins": 5.943074703216553, + "rewards/rejected": -7.790633201599121, + "step": 263 + }, + { + "epoch": 0.53, + "grad_norm": 22.54884910583496, + "learning_rate": 4.577777777777778e-07, + "logps/chosen": -36.032554626464844, + "logps/rejected": -111.55353546142578, + "loss": 0.0648, + "losses/dpo": 0.08020076900720596, + "losses/sft": 0.5052534341812134, + "losses/total": 0.08020076900720596, + "ref_logps/chosen": -12.630363464355469, + "ref_logps/rejected": -38.44928741455078, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.340219259262085, + "rewards/margins": 4.970205307006836, + "rewards/rejected": -7.310424327850342, + "step": 264 + }, + { + "epoch": 0.53, + "grad_norm": 49.5006103515625, + "learning_rate": 4.574074074074074e-07, + "logps/chosen": -42.61256790161133, + "logps/rejected": -110.77767944335938, + "loss": 0.1175, + "losses/dpo": 0.14292128384113312, + "losses/sft": 0.8216965198516846, + "losses/total": 0.14292128384113312, + "ref_logps/chosen": -14.239264488220215, + "ref_logps/rejected": -39.324241638183594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8373303413391113, + "rewards/margins": 4.308013916015625, + "rewards/rejected": -7.145343780517578, + "step": 265 + }, + { + "epoch": 0.53, + "grad_norm": 39.606327056884766, + "learning_rate": 4.57037037037037e-07, + "logps/chosen": -36.93309020996094, + "logps/rejected": -108.97806549072266, + "loss": 0.111, + "losses/dpo": 0.30392947793006897, + "losses/sft": 0.7737802267074585, + "losses/total": 0.30392947793006897, + "ref_logps/chosen": -14.136423110961914, + "ref_logps/rejected": -38.00397491455078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2796666622161865, + "rewards/margins": 4.817742347717285, + "rewards/rejected": -7.097409725189209, + "step": 266 + }, + { + "epoch": 0.53, + "grad_norm": 41.95115661621094, + "learning_rate": 4.5666666666666665e-07, + "logps/chosen": -33.319175720214844, + "logps/rejected": -120.86522674560547, + "loss": 0.0801, + "losses/dpo": 0.25569969415664673, + "losses/sft": 0.8948369026184082, + "losses/total": 0.25569969415664673, + "ref_logps/chosen": -12.202211380004883, + "ref_logps/rejected": -40.148895263671875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.111696720123291, + "rewards/margins": 5.95993709564209, + "rewards/rejected": -8.071634292602539, + "step": 267 + }, + { + "epoch": 0.54, + "grad_norm": 13.912221908569336, + "learning_rate": 4.562962962962963e-07, + "logps/chosen": -46.565673828125, + "logps/rejected": -145.53082275390625, + "loss": 0.0639, + "losses/dpo": 0.2050100415945053, + "losses/sft": 0.6665656566619873, + "losses/total": 0.2050100415945053, + "ref_logps/chosen": -17.806560516357422, + "ref_logps/rejected": -48.13336181640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8759117126464844, + "rewards/margins": 6.863834857940674, + "rewards/rejected": -9.73974609375, + "step": 268 + }, + { + "epoch": 0.54, + "grad_norm": 50.010066986083984, + "learning_rate": 4.559259259259259e-07, + "logps/chosen": -34.211036682128906, + "logps/rejected": -83.97434997558594, + "loss": 0.1918, + "losses/dpo": 0.2276979237794876, + "losses/sft": 0.6309546828269958, + "losses/total": 0.2276979237794876, + "ref_logps/chosen": -13.882712364196777, + "ref_logps/rejected": -31.278932571411133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.032832622528076, + "rewards/margins": 3.2367098331451416, + "rewards/rejected": -5.269542217254639, + "step": 269 + }, + { + "epoch": 0.54, + "grad_norm": 53.363426208496094, + "learning_rate": 4.555555555555555e-07, + "logps/chosen": -45.16518020629883, + "logps/rejected": -133.35223388671875, + "loss": 0.1578, + "losses/dpo": 0.18790379166603088, + "losses/sft": 0.7288150787353516, + "losses/total": 0.18790379166603088, + "ref_logps/chosen": -15.43204402923584, + "ref_logps/rejected": -51.11394119262695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.973313808441162, + "rewards/margins": 5.250514030456543, + "rewards/rejected": -8.223828315734863, + "step": 270 + }, + { + "epoch": 0.54, + "grad_norm": 11.669504165649414, + "learning_rate": 4.5518518518518516e-07, + "logps/chosen": -45.2990837097168, + "logps/rejected": -153.83236694335938, + "loss": 0.0161, + "losses/dpo": 0.012812875211238861, + "losses/sft": 0.8085505962371826, + "losses/total": 0.012812875211238861, + "ref_logps/chosen": -16.569459915161133, + "ref_logps/rejected": -61.62638473510742, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.872962474822998, + "rewards/margins": 6.347635269165039, + "rewards/rejected": -9.220597267150879, + "step": 271 + }, + { + "epoch": 0.54, + "grad_norm": 101.09085845947266, + "learning_rate": 4.548148148148148e-07, + "logps/chosen": -43.06938171386719, + "logps/rejected": -112.60737609863281, + "loss": 0.2584, + "losses/dpo": 0.1160985678434372, + "losses/sft": 0.5782561302185059, + "losses/total": 0.1160985678434372, + "ref_logps/chosen": -15.254789352416992, + "ref_logps/rejected": -38.1031379699707, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.781459331512451, + "rewards/margins": 4.668964385986328, + "rewards/rejected": -7.450423717498779, + "step": 272 + }, + { + "epoch": 0.55, + "grad_norm": 50.457908630371094, + "learning_rate": 4.544444444444444e-07, + "logps/chosen": -41.965553283691406, + "logps/rejected": -151.30963134765625, + "loss": 0.1371, + "losses/dpo": 0.24365629255771637, + "losses/sft": 0.8020548820495605, + "losses/total": 0.24365629255771637, + "ref_logps/chosen": -17.38597869873047, + "ref_logps/rejected": -60.589176177978516, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4579577445983887, + "rewards/margins": 6.614086627960205, + "rewards/rejected": -9.072044372558594, + "step": 273 + }, + { + "epoch": 0.55, + "grad_norm": 43.55804443359375, + "learning_rate": 4.5407407407407403e-07, + "logps/chosen": -45.85125732421875, + "logps/rejected": -116.78541564941406, + "loss": 0.1379, + "losses/dpo": 0.307068407535553, + "losses/sft": 0.6947055459022522, + "losses/total": 0.307068407535553, + "ref_logps/chosen": -15.99758529663086, + "ref_logps/rejected": -41.6662712097168, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9853672981262207, + "rewards/margins": 4.526546478271484, + "rewards/rejected": -7.511913776397705, + "step": 274 + }, + { + "epoch": 0.55, + "grad_norm": 63.197059631347656, + "learning_rate": 4.537037037037037e-07, + "logps/chosen": -35.24736022949219, + "logps/rejected": -103.81106567382812, + "loss": 0.1881, + "losses/dpo": 0.09665709733963013, + "losses/sft": 0.39394059777259827, + "losses/total": 0.09665709733963013, + "ref_logps/chosen": -11.432135581970215, + "ref_logps/rejected": -36.34547424316406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3815226554870605, + "rewards/margins": 4.36503791809082, + "rewards/rejected": -6.746560096740723, + "step": 275 + }, + { + "epoch": 0.55, + "grad_norm": 37.735565185546875, + "learning_rate": 4.5333333333333326e-07, + "logps/chosen": -43.59982681274414, + "logps/rejected": -135.76087951660156, + "loss": 0.0668, + "losses/dpo": 0.10445442795753479, + "losses/sft": 0.5132284164428711, + "losses/total": 0.10445442795753479, + "ref_logps/chosen": -21.570680618286133, + "ref_logps/rejected": -52.23603820800781, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2029147148132324, + "rewards/margins": 6.149569511413574, + "rewards/rejected": -8.352483749389648, + "step": 276 + }, + { + "epoch": 0.55, + "grad_norm": 28.7652530670166, + "learning_rate": 4.5296296296296296e-07, + "logps/chosen": -36.430747985839844, + "logps/rejected": -103.90020751953125, + "loss": 0.14, + "losses/dpo": 0.31618010997772217, + "losses/sft": 1.035854697227478, + "losses/total": 0.31618010997772217, + "ref_logps/chosen": -11.691465377807617, + "ref_logps/rejected": -35.15990447998047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.473928451538086, + "rewards/margins": 4.400101661682129, + "rewards/rejected": -6.874030113220215, + "step": 277 + }, + { + "epoch": 0.56, + "grad_norm": 47.00922393798828, + "learning_rate": 4.525925925925926e-07, + "logps/chosen": -31.12659454345703, + "logps/rejected": -111.13185119628906, + "loss": 0.0839, + "losses/dpo": 0.0065039535984396935, + "losses/sft": 0.5701088309288025, + "losses/total": 0.0065039535984396935, + "ref_logps/chosen": -11.445125579833984, + "ref_logps/rejected": -39.33173370361328, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9681470394134521, + "rewards/margins": 5.211864948272705, + "rewards/rejected": -7.180011749267578, + "step": 278 + }, + { + "epoch": 0.56, + "grad_norm": 14.516613006591797, + "learning_rate": 4.5222222222222224e-07, + "logps/chosen": -47.697349548339844, + "logps/rejected": -147.38427734375, + "loss": 0.0261, + "losses/dpo": 0.08990654349327087, + "losses/sft": 1.041534423828125, + "losses/total": 0.08990654349327087, + "ref_logps/chosen": -16.217287063598633, + "ref_logps/rejected": -51.448463439941406, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1480064392089844, + "rewards/margins": 6.44557523727417, + "rewards/rejected": -9.593582153320312, + "step": 279 + }, + { + "epoch": 0.56, + "grad_norm": 29.43877601623535, + "learning_rate": 4.5185185185185183e-07, + "logps/chosen": -25.924114227294922, + "logps/rejected": -103.9954833984375, + "loss": 0.0962, + "losses/dpo": 0.17584168910980225, + "losses/sft": 0.3538723289966583, + "losses/total": 0.17584168910980225, + "ref_logps/chosen": -11.91704273223877, + "ref_logps/rejected": -38.9530029296875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4007071256637573, + "rewards/margins": 5.103541374206543, + "rewards/rejected": -6.50424861907959, + "step": 280 + }, + { + "epoch": 0.56, + "grad_norm": 32.56535339355469, + "learning_rate": 4.5148148148148147e-07, + "logps/chosen": -34.9146728515625, + "logps/rejected": -138.0476531982422, + "loss": 0.1337, + "losses/dpo": 0.19118238985538483, + "losses/sft": 0.7913627624511719, + "losses/total": 0.19118238985538483, + "ref_logps/chosen": -14.662369728088379, + "ref_logps/rejected": -48.69712448120117, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0252304077148438, + "rewards/margins": 6.909822463989258, + "rewards/rejected": -8.935052871704102, + "step": 281 + }, + { + "epoch": 0.56, + "grad_norm": 25.12720489501953, + "learning_rate": 4.511111111111111e-07, + "logps/chosen": -39.446475982666016, + "logps/rejected": -138.76202392578125, + "loss": 0.0765, + "losses/dpo": 0.04247686639428139, + "losses/sft": 0.618464469909668, + "losses/total": 0.04247686639428139, + "ref_logps/chosen": -17.198135375976562, + "ref_logps/rejected": -58.359657287597656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2248339653015137, + "rewards/margins": 5.815402984619141, + "rewards/rejected": -8.040237426757812, + "step": 282 + }, + { + "epoch": 0.57, + "grad_norm": 25.820880889892578, + "learning_rate": 4.507407407407407e-07, + "logps/chosen": -43.3232421875, + "logps/rejected": -140.09263610839844, + "loss": 0.0537, + "losses/dpo": 0.1348758488893509, + "losses/sft": 0.7474174499511719, + "losses/total": 0.1348758488893509, + "ref_logps/chosen": -17.421993255615234, + "ref_logps/rejected": -50.615196228027344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.59012508392334, + "rewards/margins": 6.357618808746338, + "rewards/rejected": -8.94774341583252, + "step": 283 + }, + { + "epoch": 0.57, + "grad_norm": 26.093074798583984, + "learning_rate": 4.5037037037037034e-07, + "logps/chosen": -33.18143081665039, + "logps/rejected": -129.96148681640625, + "loss": 0.0364, + "losses/dpo": 0.01221714448183775, + "losses/sft": 0.5038900971412659, + "losses/total": 0.01221714448183775, + "ref_logps/chosen": -11.697612762451172, + "ref_logps/rejected": -47.87187957763672, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1483817100524902, + "rewards/margins": 6.060579299926758, + "rewards/rejected": -8.208961486816406, + "step": 284 + }, + { + "epoch": 0.57, + "grad_norm": 47.605594635009766, + "learning_rate": 4.5e-07, + "logps/chosen": -43.6019172668457, + "logps/rejected": -107.98235321044922, + "loss": 0.1269, + "losses/dpo": 0.006264102179557085, + "losses/sft": 0.6233557462692261, + "losses/total": 0.006264102179557085, + "ref_logps/chosen": -19.681671142578125, + "ref_logps/rejected": -36.80232620239258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.392024278640747, + "rewards/margins": 4.725978851318359, + "rewards/rejected": -7.118002891540527, + "step": 285 + }, + { + "epoch": 0.57, + "grad_norm": 15.573431968688965, + "learning_rate": 4.496296296296296e-07, + "logps/chosen": -41.77413558959961, + "logps/rejected": -159.12973022460938, + "loss": 0.0299, + "losses/dpo": 0.10609380900859833, + "losses/sft": 1.0648298263549805, + "losses/total": 0.10609380900859833, + "ref_logps/chosen": -16.024517059326172, + "ref_logps/rejected": -60.03338623046875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5749616622924805, + "rewards/margins": 7.334672927856445, + "rewards/rejected": -9.909634590148926, + "step": 286 + }, + { + "epoch": 0.57, + "grad_norm": 27.241920471191406, + "learning_rate": 4.492592592592592e-07, + "logps/chosen": -41.994728088378906, + "logps/rejected": -149.73220825195312, + "loss": 0.0586, + "losses/dpo": 0.08027364313602448, + "losses/sft": 0.6191474795341492, + "losses/total": 0.08027364313602448, + "ref_logps/chosen": -15.291231155395508, + "ref_logps/rejected": -62.80726623535156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.670349597930908, + "rewards/margins": 6.0221452713012695, + "rewards/rejected": -8.69249439239502, + "step": 287 + }, + { + "epoch": 0.58, + "grad_norm": 76.77140808105469, + "learning_rate": 4.4888888888888885e-07, + "logps/chosen": -41.82225799560547, + "logps/rejected": -102.08113098144531, + "loss": 0.2931, + "losses/dpo": 0.3177294135093689, + "losses/sft": 0.9429760575294495, + "losses/total": 0.3177294135093689, + "ref_logps/chosen": -14.118833541870117, + "ref_logps/rejected": -30.601680755615234, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7703428268432617, + "rewards/margins": 4.377601623535156, + "rewards/rejected": -7.147944450378418, + "step": 288 + }, + { + "epoch": 0.58, + "grad_norm": 20.054603576660156, + "learning_rate": 4.4851851851851854e-07, + "logps/chosen": -40.90589904785156, + "logps/rejected": -129.79904174804688, + "loss": 0.0673, + "losses/dpo": 0.008846285752952099, + "losses/sft": 0.9324573278427124, + "losses/total": 0.008846285752952099, + "ref_logps/chosen": -13.950414657592773, + "ref_logps/rejected": -42.79026794433594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6955482959747314, + "rewards/margins": 6.005330562591553, + "rewards/rejected": -8.700878143310547, + "step": 289 + }, + { + "epoch": 0.58, + "grad_norm": 27.402971267700195, + "learning_rate": 4.4814814814814813e-07, + "logps/chosen": -40.17758560180664, + "logps/rejected": -131.27212524414062, + "loss": 0.0893, + "losses/dpo": 0.17474588751792908, + "losses/sft": 0.7338415384292603, + "losses/total": 0.17474588751792908, + "ref_logps/chosen": -12.303985595703125, + "ref_logps/rejected": -47.974098205566406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7873599529266357, + "rewards/margins": 5.542443752288818, + "rewards/rejected": -8.329803466796875, + "step": 290 + }, + { + "epoch": 0.58, + "grad_norm": 51.174476623535156, + "learning_rate": 4.4777777777777777e-07, + "logps/chosen": -37.67184066772461, + "logps/rejected": -130.61839294433594, + "loss": 0.0929, + "losses/dpo": 0.1532328724861145, + "losses/sft": 0.6164145469665527, + "losses/total": 0.1532328724861145, + "ref_logps/chosen": -13.739557266235352, + "ref_logps/rejected": -47.06127166748047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.39322829246521, + "rewards/margins": 5.962482929229736, + "rewards/rejected": -8.355711936950684, + "step": 291 + }, + { + "epoch": 0.58, + "grad_norm": 24.83441925048828, + "learning_rate": 4.474074074074074e-07, + "logps/chosen": -43.6513671875, + "logps/rejected": -114.49932098388672, + "loss": 0.1051, + "losses/dpo": 0.14195476472377777, + "losses/sft": 0.6711016893386841, + "losses/total": 0.14195476472377777, + "ref_logps/chosen": -18.276350021362305, + "ref_logps/rejected": -40.27934646606445, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.537501573562622, + "rewards/margins": 4.884495735168457, + "rewards/rejected": -7.4219970703125, + "step": 292 + }, + { + "epoch": 0.59, + "grad_norm": 54.920963287353516, + "learning_rate": 4.47037037037037e-07, + "logps/chosen": -39.11894989013672, + "logps/rejected": -102.25743103027344, + "loss": 0.1623, + "losses/dpo": 0.13864727318286896, + "losses/sft": 0.8034663200378418, + "losses/total": 0.13864727318286896, + "ref_logps/chosen": -16.265247344970703, + "ref_logps/rejected": -31.22001075744629, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.285369873046875, + "rewards/margins": 4.81837272644043, + "rewards/rejected": -7.103742599487305, + "step": 293 + }, + { + "epoch": 0.59, + "grad_norm": 42.93906021118164, + "learning_rate": 4.4666666666666664e-07, + "logps/chosen": -44.16669464111328, + "logps/rejected": -126.49778747558594, + "loss": 0.1054, + "losses/dpo": 0.008677210658788681, + "losses/sft": 0.7196841835975647, + "losses/total": 0.008677210658788681, + "ref_logps/chosen": -16.730802536010742, + "ref_logps/rejected": -46.01910400390625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.743589401245117, + "rewards/margins": 5.304279804229736, + "rewards/rejected": -8.047868728637695, + "step": 294 + }, + { + "epoch": 0.59, + "grad_norm": 19.96090316772461, + "learning_rate": 4.462962962962963e-07, + "logps/chosen": -37.56354522705078, + "logps/rejected": -116.57569122314453, + "loss": 0.079, + "losses/dpo": 0.13941551744937897, + "losses/sft": 1.1352434158325195, + "losses/total": 0.13941551744937897, + "ref_logps/chosen": -11.381913185119629, + "ref_logps/rejected": -40.20533752441406, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6181631088256836, + "rewards/margins": 5.018872261047363, + "rewards/rejected": -7.637035846710205, + "step": 295 + }, + { + "epoch": 0.59, + "grad_norm": 35.72517395019531, + "learning_rate": 4.459259259259259e-07, + "logps/chosen": -41.6621208190918, + "logps/rejected": -103.08749389648438, + "loss": 0.1509, + "losses/dpo": 0.28686463832855225, + "losses/sft": 0.8667877316474915, + "losses/total": 0.28686463832855225, + "ref_logps/chosen": -14.639177322387695, + "ref_logps/rejected": -33.67582702636719, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7022948265075684, + "rewards/margins": 4.238872051239014, + "rewards/rejected": -6.941166877746582, + "step": 296 + }, + { + "epoch": 0.59, + "grad_norm": 21.558837890625, + "learning_rate": 4.455555555555555e-07, + "logps/chosen": -40.06609344482422, + "logps/rejected": -143.51748657226562, + "loss": 0.068, + "losses/dpo": 0.05549360811710358, + "losses/sft": 0.6680642366409302, + "losses/total": 0.05549360811710358, + "ref_logps/chosen": -15.775091171264648, + "ref_logps/rejected": -57.9827766418457, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4291000366210938, + "rewards/margins": 6.124370574951172, + "rewards/rejected": -8.553470611572266, + "step": 297 + }, + { + "epoch": 0.6, + "grad_norm": 14.56146240234375, + "learning_rate": 4.4518518518518515e-07, + "logps/chosen": -34.31193542480469, + "logps/rejected": -137.0986328125, + "loss": 0.0469, + "losses/dpo": 0.09848415851593018, + "losses/sft": 0.5717564821243286, + "losses/total": 0.09848415851593018, + "ref_logps/chosen": -16.207962036132812, + "ref_logps/rejected": -54.10321044921875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8103973865509033, + "rewards/margins": 6.4891462326049805, + "rewards/rejected": -8.299543380737305, + "step": 298 + }, + { + "epoch": 0.6, + "grad_norm": 21.25967788696289, + "learning_rate": 4.448148148148148e-07, + "logps/chosen": -55.163639068603516, + "logps/rejected": -189.49459838867188, + "loss": 0.0382, + "losses/dpo": 0.01291961781680584, + "losses/sft": 0.5291265249252319, + "losses/total": 0.01291961781680584, + "ref_logps/chosen": -25.353343963623047, + "ref_logps/rejected": -74.54182434082031, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.981029510498047, + "rewards/margins": 8.514249801635742, + "rewards/rejected": -11.495279312133789, + "step": 299 + }, + { + "epoch": 0.6, + "grad_norm": 52.146820068359375, + "learning_rate": 4.444444444444444e-07, + "logps/chosen": -40.749237060546875, + "logps/rejected": -144.6281280517578, + "loss": 0.0857, + "losses/dpo": 0.03312437981367111, + "losses/sft": 0.7374606132507324, + "losses/total": 0.03312437981367111, + "ref_logps/chosen": -16.874542236328125, + "ref_logps/rejected": -57.76238250732422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.38746976852417, + "rewards/margins": 6.299104690551758, + "rewards/rejected": -8.686574935913086, + "step": 300 + }, + { + "epoch": 0.6, + "grad_norm": 40.0439567565918, + "learning_rate": 4.44074074074074e-07, + "logps/chosen": -35.04018020629883, + "logps/rejected": -151.41586303710938, + "loss": 0.1326, + "losses/dpo": 0.020077509805560112, + "losses/sft": 0.6906237006187439, + "losses/total": 0.020077509805560112, + "ref_logps/chosen": -12.195897102355957, + "ref_logps/rejected": -57.81694030761719, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.284428358078003, + "rewards/margins": 7.075465202331543, + "rewards/rejected": -9.359892845153809, + "step": 301 + }, + { + "epoch": 0.6, + "grad_norm": 31.01496696472168, + "learning_rate": 4.4370370370370367e-07, + "logps/chosen": -42.011356353759766, + "logps/rejected": -120.56248474121094, + "loss": 0.0772, + "losses/dpo": 0.14634078741073608, + "losses/sft": 1.0607680082321167, + "losses/total": 0.14634078741073608, + "ref_logps/chosen": -14.41528606414795, + "ref_logps/rejected": -45.77004623413086, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7596068382263184, + "rewards/margins": 4.7196364402771, + "rewards/rejected": -7.479243278503418, + "step": 302 + }, + { + "epoch": 0.61, + "grad_norm": 28.67228126525879, + "learning_rate": 4.4333333333333336e-07, + "logps/chosen": -46.753761291503906, + "logps/rejected": -114.69883728027344, + "loss": 0.1173, + "losses/dpo": 0.042167238891124725, + "losses/sft": 0.6821388006210327, + "losses/total": 0.042167238891124725, + "ref_logps/chosen": -17.26732635498047, + "ref_logps/rejected": -40.17786407470703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.948643684387207, + "rewards/margins": 4.503453254699707, + "rewards/rejected": -7.452096939086914, + "step": 303 + }, + { + "epoch": 0.61, + "grad_norm": 44.84244918823242, + "learning_rate": 4.4296296296296295e-07, + "logps/chosen": -39.67682647705078, + "logps/rejected": -121.08567810058594, + "loss": 0.1254, + "losses/dpo": 0.12976478040218353, + "losses/sft": 0.7743264436721802, + "losses/total": 0.12976478040218353, + "ref_logps/chosen": -14.958328247070312, + "ref_logps/rejected": -45.83427047729492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4718499183654785, + "rewards/margins": 5.053291320800781, + "rewards/rejected": -7.525140762329102, + "step": 304 + }, + { + "epoch": 0.61, + "grad_norm": 53.262367248535156, + "learning_rate": 4.425925925925926e-07, + "logps/chosen": -36.113426208496094, + "logps/rejected": -141.60494995117188, + "loss": 0.193, + "losses/dpo": 0.5892627835273743, + "losses/sft": 0.7533572316169739, + "losses/total": 0.5892627835273743, + "ref_logps/chosen": -13.810129165649414, + "ref_logps/rejected": -58.52567672729492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.230329990386963, + "rewards/margins": 6.077596664428711, + "rewards/rejected": -8.307926177978516, + "step": 305 + }, + { + "epoch": 0.61, + "grad_norm": 26.402587890625, + "learning_rate": 4.4222222222222223e-07, + "logps/chosen": -33.75492858886719, + "logps/rejected": -116.12544250488281, + "loss": 0.1265, + "losses/dpo": 0.22533872723579407, + "losses/sft": 0.7445718050003052, + "losses/total": 0.22533872723579407, + "ref_logps/chosen": -14.405969619750977, + "ref_logps/rejected": -45.15299987792969, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9348959922790527, + "rewards/margins": 5.162348747253418, + "rewards/rejected": -7.097244739532471, + "step": 306 + }, + { + "epoch": 0.61, + "grad_norm": 40.44252014160156, + "learning_rate": 4.418518518518518e-07, + "logps/chosen": -36.53474426269531, + "logps/rejected": -120.66770935058594, + "loss": 0.1622, + "losses/dpo": 0.1868322640657425, + "losses/sft": 0.7348847389221191, + "losses/total": 0.1868322640657425, + "ref_logps/chosen": -10.541549682617188, + "ref_logps/rejected": -43.96518325805664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5993194580078125, + "rewards/margins": 5.070933818817139, + "rewards/rejected": -7.670252799987793, + "step": 307 + }, + { + "epoch": 0.62, + "grad_norm": 33.434852600097656, + "learning_rate": 4.4148148148148146e-07, + "logps/chosen": -32.63652038574219, + "logps/rejected": -122.68383026123047, + "loss": 0.094, + "losses/dpo": 0.08705702424049377, + "losses/sft": 0.7313491106033325, + "losses/total": 0.08705702424049377, + "ref_logps/chosen": -13.012248039245605, + "ref_logps/rejected": -46.19256591796875, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9624271392822266, + "rewards/margins": 5.686699867248535, + "rewards/rejected": -7.649127006530762, + "step": 308 + }, + { + "epoch": 0.62, + "grad_norm": 61.25316619873047, + "learning_rate": 4.411111111111111e-07, + "logps/chosen": -36.29332733154297, + "logps/rejected": -89.23573303222656, + "loss": 0.1863, + "losses/dpo": 0.2902761995792389, + "losses/sft": 0.6693310141563416, + "losses/total": 0.2902761995792389, + "ref_logps/chosen": -13.322938919067383, + "ref_logps/rejected": -34.61989974975586, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2970387935638428, + "rewards/margins": 3.1645452976226807, + "rewards/rejected": -5.461584091186523, + "step": 309 + }, + { + "epoch": 0.62, + "grad_norm": 15.663551330566406, + "learning_rate": 4.4074074074074074e-07, + "logps/chosen": -39.670867919921875, + "logps/rejected": -113.9607925415039, + "loss": 0.0431, + "losses/dpo": 0.05196515470743179, + "losses/sft": 0.46948933601379395, + "losses/total": 0.05196515470743179, + "ref_logps/chosen": -17.334888458251953, + "ref_logps/rejected": -42.784278869628906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.233597755432129, + "rewards/margins": 4.8840532302856445, + "rewards/rejected": -7.11765193939209, + "step": 310 + }, + { + "epoch": 0.62, + "grad_norm": 52.19047164916992, + "learning_rate": 4.4037037037037033e-07, + "logps/chosen": -44.69192123413086, + "logps/rejected": -132.44076538085938, + "loss": 0.222, + "losses/dpo": 0.32939571142196655, + "losses/sft": 0.918161153793335, + "losses/total": 0.32939571142196655, + "ref_logps/chosen": -15.003911972045898, + "ref_logps/rejected": -56.22412872314453, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9688005447387695, + "rewards/margins": 4.652862548828125, + "rewards/rejected": -7.6216630935668945, + "step": 311 + }, + { + "epoch": 0.62, + "grad_norm": 26.470958709716797, + "learning_rate": 4.3999999999999997e-07, + "logps/chosen": -36.404624938964844, + "logps/rejected": -128.2510223388672, + "loss": 0.0642, + "losses/dpo": 0.0827181488275528, + "losses/sft": 0.6652327179908752, + "losses/total": 0.0827181488275528, + "ref_logps/chosen": -17.150863647460938, + "ref_logps/rejected": -50.41554260253906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9253758192062378, + "rewards/margins": 5.858171463012695, + "rewards/rejected": -7.783547401428223, + "step": 312 + }, + { + "epoch": 0.63, + "grad_norm": 68.88140869140625, + "learning_rate": 4.396296296296296e-07, + "logps/chosen": -48.03262710571289, + "logps/rejected": -126.91957092285156, + "loss": 0.2486, + "losses/dpo": 0.16818460822105408, + "losses/sft": 0.5306582450866699, + "losses/total": 0.16818460822105408, + "ref_logps/chosen": -19.19322967529297, + "ref_logps/rejected": -51.45057678222656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.883939743041992, + "rewards/margins": 4.662960052490234, + "rewards/rejected": -7.546899318695068, + "step": 313 + }, + { + "epoch": 0.63, + "grad_norm": 71.32614135742188, + "learning_rate": 4.392592592592592e-07, + "logps/chosen": -33.718597412109375, + "logps/rejected": -118.68995666503906, + "loss": 0.2059, + "losses/dpo": 0.5544869899749756, + "losses/sft": 1.09904944896698, + "losses/total": 0.5544869899749756, + "ref_logps/chosen": -12.754974365234375, + "ref_logps/rejected": -46.52629852294922, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0963621139526367, + "rewards/margins": 5.120002746582031, + "rewards/rejected": -7.216364860534668, + "step": 314 + }, + { + "epoch": 0.63, + "grad_norm": 45.111080169677734, + "learning_rate": 4.3888888888888884e-07, + "logps/chosen": -29.031845092773438, + "logps/rejected": -102.54756927490234, + "loss": 0.1605, + "losses/dpo": 0.19417327642440796, + "losses/sft": 0.4396215081214905, + "losses/total": 0.19417327642440796, + "ref_logps/chosen": -12.911455154418945, + "ref_logps/rejected": -41.638851165771484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6120388507843018, + "rewards/margins": 4.478832721710205, + "rewards/rejected": -6.090871810913086, + "step": 315 + }, + { + "epoch": 0.63, + "grad_norm": 44.88180923461914, + "learning_rate": 4.3851851851851853e-07, + "logps/chosen": -32.31817626953125, + "logps/rejected": -94.99343872070312, + "loss": 0.1621, + "losses/dpo": 0.2723809778690338, + "losses/sft": 0.5749231576919556, + "losses/total": 0.2723809778690338, + "ref_logps/chosen": -13.24195671081543, + "ref_logps/rejected": -34.43061065673828, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.907622218132019, + "rewards/margins": 4.148660659790039, + "rewards/rejected": -6.056282997131348, + "step": 316 + }, + { + "epoch": 0.63, + "grad_norm": 45.00884246826172, + "learning_rate": 4.381481481481482e-07, + "logps/chosen": -38.91361999511719, + "logps/rejected": -127.98786926269531, + "loss": 0.1514, + "losses/dpo": 0.0010936926119029522, + "losses/sft": 0.7295569777488708, + "losses/total": 0.0010936926119029522, + "ref_logps/chosen": -15.756044387817383, + "ref_logps/rejected": -47.62613296508789, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3157577514648438, + "rewards/margins": 5.720416069030762, + "rewards/rejected": -8.036173820495605, + "step": 317 + }, + { + "epoch": 0.64, + "grad_norm": 18.036989212036133, + "learning_rate": 4.3777777777777776e-07, + "logps/chosen": -33.98314666748047, + "logps/rejected": -131.54330444335938, + "loss": 0.0428, + "losses/dpo": 0.06015072390437126, + "losses/sft": 0.5692850947380066, + "losses/total": 0.06015072390437126, + "ref_logps/chosen": -17.438108444213867, + "ref_logps/rejected": -49.704689025878906, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6545041799545288, + "rewards/margins": 6.529358863830566, + "rewards/rejected": -8.183862686157227, + "step": 318 + }, + { + "epoch": 0.64, + "grad_norm": 39.62465286254883, + "learning_rate": 4.374074074074074e-07, + "logps/chosen": -39.50983810424805, + "logps/rejected": -133.33090209960938, + "loss": 0.0765, + "losses/dpo": 0.042518407106399536, + "losses/sft": 0.6064221858978271, + "losses/total": 0.042518407106399536, + "ref_logps/chosen": -16.553245544433594, + "ref_logps/rejected": -55.0023193359375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.295659065246582, + "rewards/margins": 5.537198066711426, + "rewards/rejected": -7.832857131958008, + "step": 319 + }, + { + "epoch": 0.64, + "grad_norm": 30.962528228759766, + "learning_rate": 4.3703703703703704e-07, + "logps/chosen": -34.99256134033203, + "logps/rejected": -144.94845581054688, + "loss": 0.0733, + "losses/dpo": 0.1906587928533554, + "losses/sft": 0.5711092948913574, + "losses/total": 0.1906587928533554, + "ref_logps/chosen": -14.728666305541992, + "ref_logps/rejected": -60.68986511230469, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0263895988464355, + "rewards/margins": 6.399470329284668, + "rewards/rejected": -8.425859451293945, + "step": 320 + }, + { + "epoch": 0.64, + "grad_norm": 16.429805755615234, + "learning_rate": 4.3666666666666663e-07, + "logps/chosen": -29.450841903686523, + "logps/rejected": -111.19835662841797, + "loss": 0.0472, + "losses/dpo": 0.032579537481069565, + "losses/sft": 0.5942339301109314, + "losses/total": 0.032579537481069565, + "ref_logps/chosen": -13.149361610412598, + "ref_logps/rejected": -44.32951736450195, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.630147933959961, + "rewards/margins": 5.056735992431641, + "rewards/rejected": -6.68688440322876, + "step": 321 + }, + { + "epoch": 0.64, + "grad_norm": 9.389930725097656, + "learning_rate": 4.362962962962963e-07, + "logps/chosen": -33.028564453125, + "logps/rejected": -126.03584289550781, + "loss": 0.024, + "losses/dpo": 0.03465582802891731, + "losses/sft": 0.531201958656311, + "losses/total": 0.03465582802891731, + "ref_logps/chosen": -14.504253387451172, + "ref_logps/rejected": -47.91209030151367, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.852430820465088, + "rewards/margins": 5.959944725036621, + "rewards/rejected": -7.812375068664551, + "step": 322 + }, + { + "epoch": 0.65, + "grad_norm": 59.17805480957031, + "learning_rate": 4.359259259259259e-07, + "logps/chosen": -35.84375, + "logps/rejected": -89.96308135986328, + "loss": 0.1587, + "losses/dpo": 0.45940184593200684, + "losses/sft": 0.693610668182373, + "losses/total": 0.45940184593200684, + "ref_logps/chosen": -13.855268478393555, + "ref_logps/rejected": -31.887981414794922, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.198847770690918, + "rewards/margins": 3.6086621284484863, + "rewards/rejected": -5.8075103759765625, + "step": 323 + }, + { + "epoch": 0.65, + "grad_norm": 35.07844543457031, + "learning_rate": 4.355555555555555e-07, + "logps/chosen": -37.46888732910156, + "logps/rejected": -111.22254943847656, + "loss": 0.1159, + "losses/dpo": 0.13895408809185028, + "losses/sft": 0.5949329733848572, + "losses/total": 0.13895408809185028, + "ref_logps/chosen": -16.702150344848633, + "ref_logps/rejected": -48.511962890625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0766735076904297, + "rewards/margins": 4.194385051727295, + "rewards/rejected": -6.271059036254883, + "step": 324 + }, + { + "epoch": 0.65, + "grad_norm": 18.968955993652344, + "learning_rate": 4.3518518518518514e-07, + "logps/chosen": -42.50941467285156, + "logps/rejected": -129.11965942382812, + "loss": 0.0579, + "losses/dpo": 0.06625945121049881, + "losses/sft": 0.6571711301803589, + "losses/total": 0.06625945121049881, + "ref_logps/chosen": -16.408336639404297, + "ref_logps/rejected": -47.11133575439453, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.610107421875, + "rewards/margins": 5.590725898742676, + "rewards/rejected": -8.200833320617676, + "step": 325 + }, + { + "epoch": 0.65, + "grad_norm": 21.00503158569336, + "learning_rate": 4.348148148148148e-07, + "logps/chosen": -33.02533721923828, + "logps/rejected": -137.7234344482422, + "loss": 0.0548, + "losses/dpo": 0.150486558675766, + "losses/sft": 0.5657316446304321, + "losses/total": 0.150486558675766, + "ref_logps/chosen": -16.13014030456543, + "ref_logps/rejected": -54.781654357910156, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6895192861557007, + "rewards/margins": 6.604659080505371, + "rewards/rejected": -8.294178009033203, + "step": 326 + }, + { + "epoch": 0.65, + "grad_norm": 9.425925254821777, + "learning_rate": 4.344444444444444e-07, + "logps/chosen": -29.731834411621094, + "logps/rejected": -135.8011474609375, + "loss": 0.0229, + "losses/dpo": 0.01692948304116726, + "losses/sft": 0.44572684168815613, + "losses/total": 0.01692948304116726, + "ref_logps/chosen": -14.26729965209961, + "ref_logps/rejected": -50.366058349609375, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5464532375335693, + "rewards/margins": 6.9970550537109375, + "rewards/rejected": -8.543508529663086, + "step": 327 + }, + { + "epoch": 0.66, + "grad_norm": 38.38417434692383, + "learning_rate": 4.34074074074074e-07, + "logps/chosen": -42.788551330566406, + "logps/rejected": -121.70774841308594, + "loss": 0.0896, + "losses/dpo": 0.19306769967079163, + "losses/sft": 0.448386549949646, + "losses/total": 0.19306769967079163, + "ref_logps/chosen": -21.298828125, + "ref_logps/rejected": -44.85980987548828, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1489720344543457, + "rewards/margins": 5.535821437835693, + "rewards/rejected": -7.684793472290039, + "step": 328 + }, + { + "epoch": 0.66, + "grad_norm": 74.86832427978516, + "learning_rate": 4.337037037037037e-07, + "logps/chosen": -37.2952766418457, + "logps/rejected": -93.89851379394531, + "loss": 0.2036, + "losses/dpo": 0.1406317800283432, + "losses/sft": 0.678645133972168, + "losses/total": 0.1406317800283432, + "ref_logps/chosen": -12.764579772949219, + "ref_logps/rejected": -35.51376724243164, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4530696868896484, + "rewards/margins": 3.3854055404663086, + "rewards/rejected": -5.838475227355957, + "step": 329 + }, + { + "epoch": 0.66, + "grad_norm": 101.73636627197266, + "learning_rate": 4.3333333333333335e-07, + "logps/chosen": -35.815696716308594, + "logps/rejected": -116.6551513671875, + "loss": 0.3297, + "losses/dpo": 0.45946767926216125, + "losses/sft": 0.6446714997291565, + "losses/total": 0.45946767926216125, + "ref_logps/chosen": -14.69547176361084, + "ref_logps/rejected": -42.31071472167969, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.112022876739502, + "rewards/margins": 5.322422027587891, + "rewards/rejected": -7.434444427490234, + "step": 330 + }, + { + "epoch": 0.66, + "grad_norm": 15.054662704467773, + "learning_rate": 4.3296296296296294e-07, + "logps/chosen": -31.542770385742188, + "logps/rejected": -104.85406494140625, + "loss": 0.049, + "losses/dpo": 0.014171997085213661, + "losses/sft": 0.7473487257957458, + "losses/total": 0.014171997085213661, + "ref_logps/chosen": -11.19998550415039, + "ref_logps/rejected": -35.349510192871094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.034278392791748, + "rewards/margins": 4.916176795959473, + "rewards/rejected": -6.950455188751221, + "step": 331 + }, + { + "epoch": 0.66, + "grad_norm": 34.530670166015625, + "learning_rate": 4.325925925925926e-07, + "logps/chosen": -36.74336242675781, + "logps/rejected": -105.05992126464844, + "loss": 0.0749, + "losses/dpo": 0.05851326882839203, + "losses/sft": 0.5923424959182739, + "losses/total": 0.05851326882839203, + "ref_logps/chosen": -14.333761215209961, + "ref_logps/rejected": -31.491056442260742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.240960121154785, + "rewards/margins": 5.115926742553711, + "rewards/rejected": -7.356886863708496, + "step": 332 + }, + { + "epoch": 0.67, + "grad_norm": 53.47503662109375, + "learning_rate": 4.322222222222222e-07, + "logps/chosen": -40.43827819824219, + "logps/rejected": -136.73532104492188, + "loss": 0.1446, + "losses/dpo": 0.0015527131035923958, + "losses/sft": 0.5699902772903442, + "losses/total": 0.0015527131035923958, + "ref_logps/chosen": -15.056352615356445, + "ref_logps/rejected": -47.89313507080078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5381927490234375, + "rewards/margins": 6.34602689743042, + "rewards/rejected": -8.884220123291016, + "step": 333 + }, + { + "epoch": 0.67, + "grad_norm": 20.74199867248535, + "learning_rate": 4.3185185185185186e-07, + "logps/chosen": -32.69434356689453, + "logps/rejected": -102.65889739990234, + "loss": 0.0676, + "losses/dpo": 0.1204390749335289, + "losses/sft": 0.6606833934783936, + "losses/total": 0.1204390749335289, + "ref_logps/chosen": -12.64737319946289, + "ref_logps/rejected": -35.22722625732422, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.004696846008301, + "rewards/margins": 4.738471031188965, + "rewards/rejected": -6.743167877197266, + "step": 334 + }, + { + "epoch": 0.67, + "grad_norm": 26.58478546142578, + "learning_rate": 4.3148148148148145e-07, + "logps/chosen": -35.19960021972656, + "logps/rejected": -93.163330078125, + "loss": 0.0759, + "losses/dpo": 0.14587007462978363, + "losses/sft": 0.7762695550918579, + "losses/total": 0.14587007462978363, + "ref_logps/chosen": -9.405826568603516, + "ref_logps/rejected": -29.789966583251953, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5793776512145996, + "rewards/margins": 3.7579588890075684, + "rewards/rejected": -6.337336540222168, + "step": 335 + }, + { + "epoch": 0.67, + "grad_norm": 26.999387741088867, + "learning_rate": 4.311111111111111e-07, + "logps/chosen": -30.46701431274414, + "logps/rejected": -122.60038757324219, + "loss": 0.0843, + "losses/dpo": 0.00098854408133775, + "losses/sft": 0.6043179631233215, + "losses/total": 0.00098854408133775, + "ref_logps/chosen": -13.824264526367188, + "ref_logps/rejected": -42.21704864501953, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6642749309539795, + "rewards/margins": 6.374058723449707, + "rewards/rejected": -8.038333892822266, + "step": 336 + }, + { + "epoch": 0.67, + "grad_norm": 14.741119384765625, + "learning_rate": 4.3074074074074073e-07, + "logps/chosen": -31.685304641723633, + "logps/rejected": -134.98548889160156, + "loss": 0.1284, + "losses/dpo": 0.107744500041008, + "losses/sft": 0.6917870044708252, + "losses/total": 0.107744500041008, + "ref_logps/chosen": -11.8071928024292, + "ref_logps/rejected": -49.07727813720703, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9878110885620117, + "rewards/margins": 6.603010654449463, + "rewards/rejected": -8.590821266174316, + "step": 337 + }, + { + "epoch": 0.68, + "grad_norm": 20.093679428100586, + "learning_rate": 4.303703703703703e-07, + "logps/chosen": -42.22148132324219, + "logps/rejected": -143.92608642578125, + "loss": 0.0437, + "losses/dpo": 0.0012397286482155323, + "losses/sft": 0.7648298740386963, + "losses/total": 0.0012397286482155323, + "ref_logps/chosen": -19.591176986694336, + "ref_logps/rejected": -57.07090759277344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2630302906036377, + "rewards/margins": 6.422488689422607, + "rewards/rejected": -8.685519218444824, + "step": 338 + }, + { + "epoch": 0.68, + "grad_norm": 40.627845764160156, + "learning_rate": 4.2999999999999996e-07, + "logps/chosen": -45.338165283203125, + "logps/rejected": -124.24771118164062, + "loss": 0.1153, + "losses/dpo": 0.03297574073076248, + "losses/sft": 0.7030767798423767, + "losses/total": 0.03297574073076248, + "ref_logps/chosen": -17.034236907958984, + "ref_logps/rejected": -46.402137756347656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.830392837524414, + "rewards/margins": 4.954164981842041, + "rewards/rejected": -7.784557819366455, + "step": 339 + }, + { + "epoch": 0.68, + "grad_norm": 21.781951904296875, + "learning_rate": 4.296296296296296e-07, + "logps/chosen": -38.99799728393555, + "logps/rejected": -129.36880493164062, + "loss": 0.0632, + "losses/dpo": 0.17519626021385193, + "losses/sft": 0.4823850393295288, + "losses/total": 0.17519626021385193, + "ref_logps/chosen": -19.250946044921875, + "ref_logps/rejected": -46.28950881958008, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9747052192687988, + "rewards/margins": 6.333225250244141, + "rewards/rejected": -8.307929992675781, + "step": 340 + }, + { + "epoch": 0.68, + "grad_norm": 33.246971130371094, + "learning_rate": 4.2925925925925924e-07, + "logps/chosen": -34.87786865234375, + "logps/rejected": -110.35307312011719, + "loss": 0.1453, + "losses/dpo": 0.2280084192752838, + "losses/sft": 0.822892427444458, + "losses/total": 0.2280084192752838, + "ref_logps/chosen": -10.642579078674316, + "ref_logps/rejected": -36.44561004638672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4235291481018066, + "rewards/margins": 4.967216968536377, + "rewards/rejected": -7.390746116638184, + "step": 341 + }, + { + "epoch": 0.68, + "grad_norm": 25.239110946655273, + "learning_rate": 4.2888888888888883e-07, + "logps/chosen": -34.88047790527344, + "logps/rejected": -112.80699157714844, + "loss": 0.073, + "losses/dpo": 0.13405922055244446, + "losses/sft": 0.5381667017936707, + "losses/total": 0.13405922055244446, + "ref_logps/chosen": -16.415294647216797, + "ref_logps/rejected": -45.48058319091797, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8465182781219482, + "rewards/margins": 4.886122226715088, + "rewards/rejected": -6.732640266418457, + "step": 342 + }, + { + "epoch": 0.69, + "grad_norm": 63.96232986450195, + "learning_rate": 4.285185185185185e-07, + "logps/chosen": -42.0777473449707, + "logps/rejected": -113.29154968261719, + "loss": 0.2174, + "losses/dpo": 0.3859434127807617, + "losses/sft": 0.7559604048728943, + "losses/total": 0.3859434127807617, + "ref_logps/chosen": -14.650555610656738, + "ref_logps/rejected": -45.057212829589844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7427194118499756, + "rewards/margins": 4.080714225769043, + "rewards/rejected": -6.8234333992004395, + "step": 343 + }, + { + "epoch": 0.69, + "grad_norm": 56.83890151977539, + "learning_rate": 4.2814814814814816e-07, + "logps/chosen": -36.53527069091797, + "logps/rejected": -108.47628784179688, + "loss": 0.103, + "losses/dpo": 0.31091389060020447, + "losses/sft": 0.9659160375595093, + "losses/total": 0.31091389060020447, + "ref_logps/chosen": -11.020648956298828, + "ref_logps/rejected": -31.206253051757812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.551462173461914, + "rewards/margins": 5.175541877746582, + "rewards/rejected": -7.72700309753418, + "step": 344 + }, + { + "epoch": 0.69, + "grad_norm": 58.79685974121094, + "learning_rate": 4.2777777777777775e-07, + "logps/chosen": -42.97200012207031, + "logps/rejected": -116.67851257324219, + "loss": 0.1638, + "losses/dpo": 0.14491723477840424, + "losses/sft": 0.57149738073349, + "losses/total": 0.14491723477840424, + "ref_logps/chosen": -16.25571060180664, + "ref_logps/rejected": -47.97196578979492, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6716291904449463, + "rewards/margins": 4.199025630950928, + "rewards/rejected": -6.870654582977295, + "step": 345 + }, + { + "epoch": 0.69, + "grad_norm": 35.571510314941406, + "learning_rate": 4.274074074074074e-07, + "logps/chosen": -43.3200798034668, + "logps/rejected": -131.97885131835938, + "loss": 0.0656, + "losses/dpo": 0.10765451937913895, + "losses/sft": 0.9277602434158325, + "losses/total": 0.10765451937913895, + "ref_logps/chosen": -16.17816925048828, + "ref_logps/rejected": -45.527259826660156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.71419095993042, + "rewards/margins": 5.930967330932617, + "rewards/rejected": -8.645158767700195, + "step": 346 + }, + { + "epoch": 0.69, + "grad_norm": 87.16278839111328, + "learning_rate": 4.2703703703703703e-07, + "logps/chosen": -45.16498947143555, + "logps/rejected": -122.59102630615234, + "loss": 0.1582, + "losses/dpo": 0.17615115642547607, + "losses/sft": 0.7514989376068115, + "losses/total": 0.17615115642547607, + "ref_logps/chosen": -14.824213027954102, + "ref_logps/rejected": -43.101112365722656, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0340776443481445, + "rewards/margins": 4.914913654327393, + "rewards/rejected": -7.948991298675537, + "step": 347 + }, + { + "epoch": 0.7, + "grad_norm": 14.873774528503418, + "learning_rate": 4.266666666666667e-07, + "logps/chosen": -37.2757568359375, + "logps/rejected": -136.62548828125, + "loss": 0.0518, + "losses/dpo": 0.0002141115692211315, + "losses/sft": 0.6635446548461914, + "losses/total": 0.0002141115692211315, + "ref_logps/chosen": -13.512396812438965, + "ref_logps/rejected": -46.763038635253906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.376336097717285, + "rewards/margins": 6.609910011291504, + "rewards/rejected": -8.986246109008789, + "step": 348 + }, + { + "epoch": 0.7, + "grad_norm": 46.89150619506836, + "learning_rate": 4.2629629629629626e-07, + "logps/chosen": -50.33883285522461, + "logps/rejected": -103.36832427978516, + "loss": 0.1483, + "losses/dpo": 0.06724968552589417, + "losses/sft": 0.8436750173568726, + "losses/total": 0.06724968552589417, + "ref_logps/chosen": -18.469154357910156, + "ref_logps/rejected": -33.844810485839844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1869678497314453, + "rewards/margins": 3.7653839588165283, + "rewards/rejected": -6.9523515701293945, + "step": 349 + }, + { + "epoch": 0.7, + "grad_norm": 22.70137596130371, + "learning_rate": 4.259259259259259e-07, + "logps/chosen": -36.81205749511719, + "logps/rejected": -128.5601806640625, + "loss": 0.0615, + "losses/dpo": 0.08877705037593842, + "losses/sft": 0.5782928466796875, + "losses/total": 0.08877705037593842, + "ref_logps/chosen": -18.262386322021484, + "ref_logps/rejected": -46.65271759033203, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8549673557281494, + "rewards/margins": 6.33577823638916, + "rewards/rejected": -8.190746307373047, + "step": 350 + }, + { + "epoch": 0.7, + "grad_norm": 31.05264663696289, + "learning_rate": 4.2555555555555555e-07, + "logps/chosen": -49.91789245605469, + "logps/rejected": -129.2419891357422, + "loss": 0.0952, + "losses/dpo": 0.199846088886261, + "losses/sft": 0.975861668586731, + "losses/total": 0.199846088886261, + "ref_logps/chosen": -13.545465469360352, + "ref_logps/rejected": -42.03595733642578, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.6372427940368652, + "rewards/margins": 5.083359718322754, + "rewards/rejected": -8.720602989196777, + "step": 351 + }, + { + "epoch": 0.7, + "grad_norm": 48.4364013671875, + "learning_rate": 4.2518518518518513e-07, + "logps/chosen": -38.42912673950195, + "logps/rejected": -151.83380126953125, + "loss": 0.086, + "losses/dpo": 0.02164197713136673, + "losses/sft": 0.6958895325660706, + "losses/total": 0.02164197713136673, + "ref_logps/chosen": -14.27855396270752, + "ref_logps/rejected": -56.75341033935547, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.415057420730591, + "rewards/margins": 7.092981338500977, + "rewards/rejected": -9.508038520812988, + "step": 352 + }, + { + "epoch": 0.71, + "grad_norm": 33.09815216064453, + "learning_rate": 4.248148148148148e-07, + "logps/chosen": -36.29337692260742, + "logps/rejected": -129.50125122070312, + "loss": 0.121, + "losses/dpo": 0.18076889216899872, + "losses/sft": 0.9632574915885925, + "losses/total": 0.18076889216899872, + "ref_logps/chosen": -11.399040222167969, + "ref_logps/rejected": -50.23072814941406, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4894332885742188, + "rewards/margins": 5.437620162963867, + "rewards/rejected": -7.927053451538086, + "step": 353 + }, + { + "epoch": 0.71, + "grad_norm": 50.44475555419922, + "learning_rate": 4.244444444444444e-07, + "logps/chosen": -36.57550811767578, + "logps/rejected": -105.3508071899414, + "loss": 0.158, + "losses/dpo": 0.01699255406856537, + "losses/sft": 0.6166819334030151, + "losses/total": 0.01699255406856537, + "ref_logps/chosen": -12.352455139160156, + "ref_logps/rejected": -37.243473052978516, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.422305107116699, + "rewards/margins": 4.388428688049316, + "rewards/rejected": -6.810733795166016, + "step": 354 + }, + { + "epoch": 0.71, + "grad_norm": 41.80162811279297, + "learning_rate": 4.24074074074074e-07, + "logps/chosen": -32.989688873291016, + "logps/rejected": -104.3983154296875, + "loss": 0.138, + "losses/dpo": 0.13521035015583038, + "losses/sft": 0.48757117986679077, + "losses/total": 0.13521035015583038, + "ref_logps/chosen": -15.936813354492188, + "ref_logps/rejected": -38.816070556640625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7052874565124512, + "rewards/margins": 4.852938175201416, + "rewards/rejected": -6.558225631713867, + "step": 355 + }, + { + "epoch": 0.71, + "grad_norm": 27.352943420410156, + "learning_rate": 4.237037037037037e-07, + "logps/chosen": -31.969324111938477, + "logps/rejected": -96.71903991699219, + "loss": 0.1022, + "losses/dpo": 0.20521797239780426, + "losses/sft": 0.7039509415626526, + "losses/total": 0.20521797239780426, + "ref_logps/chosen": -10.606605529785156, + "ref_logps/rejected": -35.394012451171875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1362719535827637, + "rewards/margins": 3.9962306022644043, + "rewards/rejected": -6.132502555847168, + "step": 356 + }, + { + "epoch": 0.71, + "grad_norm": 34.827945709228516, + "learning_rate": 4.2333333333333334e-07, + "logps/chosen": -39.3182373046875, + "logps/rejected": -122.38554382324219, + "loss": 0.0739, + "losses/dpo": 0.15058466792106628, + "losses/sft": 0.6575937271118164, + "losses/total": 0.15058466792106628, + "ref_logps/chosen": -14.849676132202148, + "ref_logps/rejected": -45.92652893066406, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4468560218811035, + "rewards/margins": 5.199045181274414, + "rewards/rejected": -7.645901203155518, + "step": 357 + }, + { + "epoch": 0.72, + "grad_norm": 35.0914421081543, + "learning_rate": 4.22962962962963e-07, + "logps/chosen": -34.46870422363281, + "logps/rejected": -106.36213684082031, + "loss": 0.0733, + "losses/dpo": 0.22453603148460388, + "losses/sft": 1.1330952644348145, + "losses/total": 0.22453603148460388, + "ref_logps/chosen": -12.364035606384277, + "ref_logps/rejected": -34.82964324951172, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2104668617248535, + "rewards/margins": 4.942782878875732, + "rewards/rejected": -7.153249740600586, + "step": 358 + }, + { + "epoch": 0.72, + "grad_norm": 26.42351531982422, + "learning_rate": 4.2259259259259257e-07, + "logps/chosen": -34.861663818359375, + "logps/rejected": -108.93109130859375, + "loss": 0.0815, + "losses/dpo": 0.04924841225147247, + "losses/sft": 0.89609694480896, + "losses/total": 0.04924841225147247, + "ref_logps/chosen": -13.483573913574219, + "ref_logps/rejected": -35.11107635498047, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1378092765808105, + "rewards/margins": 5.244193077087402, + "rewards/rejected": -7.382001876831055, + "step": 359 + }, + { + "epoch": 0.72, + "grad_norm": 78.83193969726562, + "learning_rate": 4.222222222222222e-07, + "logps/chosen": -37.34288787841797, + "logps/rejected": -124.1546630859375, + "loss": 0.1932, + "losses/dpo": 0.0520499125123024, + "losses/sft": 0.7105432748794556, + "losses/total": 0.0520499125123024, + "ref_logps/chosen": -16.80919647216797, + "ref_logps/rejected": -44.207786560058594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0533690452575684, + "rewards/margins": 5.941318511962891, + "rewards/rejected": -7.994687557220459, + "step": 360 + }, + { + "epoch": 0.72, + "grad_norm": 16.736604690551758, + "learning_rate": 4.2185185185185185e-07, + "logps/chosen": -43.460426330566406, + "logps/rejected": -139.9253387451172, + "loss": 0.0443, + "losses/dpo": 0.010741611942648888, + "losses/sft": 0.5808259844779968, + "losses/total": 0.010741611942648888, + "ref_logps/chosen": -18.60515785217285, + "ref_logps/rejected": -51.46515655517578, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4855270385742188, + "rewards/margins": 6.360491752624512, + "rewards/rejected": -8.846017837524414, + "step": 361 + }, + { + "epoch": 0.72, + "grad_norm": 47.535552978515625, + "learning_rate": 4.2148148148148144e-07, + "logps/chosen": -32.118614196777344, + "logps/rejected": -121.19420623779297, + "loss": 0.156, + "losses/dpo": 0.1825261265039444, + "losses/sft": 0.44307950139045715, + "losses/total": 0.1825261265039444, + "ref_logps/chosen": -10.199451446533203, + "ref_logps/rejected": -48.19236755371094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1919162273406982, + "rewards/margins": 5.108268737792969, + "rewards/rejected": -7.300185203552246, + "step": 362 + }, + { + "epoch": 0.73, + "grad_norm": 16.611141204833984, + "learning_rate": 4.211111111111111e-07, + "logps/chosen": -31.812637329101562, + "logps/rejected": -127.39079284667969, + "loss": 0.0425, + "losses/dpo": 0.002231371821835637, + "losses/sft": 0.6797657012939453, + "losses/total": 0.002231371821835637, + "ref_logps/chosen": -14.011062622070312, + "ref_logps/rejected": -46.2428092956543, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7801575660705566, + "rewards/margins": 6.334640979766846, + "rewards/rejected": -8.114798545837402, + "step": 363 + }, + { + "epoch": 0.73, + "grad_norm": 118.1374740600586, + "learning_rate": 4.207407407407407e-07, + "logps/chosen": -45.69947814941406, + "logps/rejected": -123.48897552490234, + "loss": 0.3475, + "losses/dpo": 0.0039491476491093636, + "losses/sft": 0.6805664300918579, + "losses/total": 0.0039491476491093636, + "ref_logps/chosen": -15.519075393676758, + "ref_logps/rejected": -47.35173034667969, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.018040657043457, + "rewards/margins": 4.5956830978393555, + "rewards/rejected": -7.613724231719971, + "step": 364 + }, + { + "epoch": 0.73, + "grad_norm": 24.093891143798828, + "learning_rate": 4.2037037037037036e-07, + "logps/chosen": -37.241790771484375, + "logps/rejected": -149.10354614257812, + "loss": 0.078, + "losses/dpo": 0.2201634645462036, + "losses/sft": 0.4000563323497772, + "losses/total": 0.2201634645462036, + "ref_logps/chosen": -18.728097915649414, + "ref_logps/rejected": -60.30243682861328, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8513693809509277, + "rewards/margins": 7.028740406036377, + "rewards/rejected": -8.880109786987305, + "step": 365 + }, + { + "epoch": 0.73, + "grad_norm": 31.15038299560547, + "learning_rate": 4.1999999999999995e-07, + "logps/chosen": -37.69719314575195, + "logps/rejected": -116.640869140625, + "loss": 0.0791, + "losses/dpo": 0.1046699583530426, + "losses/sft": 0.5402253270149231, + "losses/total": 0.1046699583530426, + "ref_logps/chosen": -14.103933334350586, + "ref_logps/rejected": -36.36824417114258, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.359325885772705, + "rewards/margins": 5.6679368019104, + "rewards/rejected": -8.027262687683105, + "step": 366 + }, + { + "epoch": 0.73, + "grad_norm": 62.569313049316406, + "learning_rate": 4.196296296296296e-07, + "logps/chosen": -38.47657775878906, + "logps/rejected": -107.96405792236328, + "loss": 0.2867, + "losses/dpo": 0.33012351393699646, + "losses/sft": 0.91560298204422, + "losses/total": 0.33012351393699646, + "ref_logps/chosen": -12.317329406738281, + "ref_logps/rejected": -38.52401351928711, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.615924835205078, + "rewards/margins": 4.328080177307129, + "rewards/rejected": -6.944005489349365, + "step": 367 + }, + { + "epoch": 0.74, + "grad_norm": 25.4249267578125, + "learning_rate": 4.1925925925925923e-07, + "logps/chosen": -41.08087921142578, + "logps/rejected": -132.00863647460938, + "loss": 0.093, + "losses/dpo": 0.175454780459404, + "losses/sft": 0.7735669612884521, + "losses/total": 0.175454780459404, + "ref_logps/chosen": -16.551727294921875, + "ref_logps/rejected": -49.65811538696289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4529147148132324, + "rewards/margins": 5.782136917114258, + "rewards/rejected": -8.235052108764648, + "step": 368 + }, + { + "epoch": 0.74, + "grad_norm": 38.16954040527344, + "learning_rate": 4.1888888888888887e-07, + "logps/chosen": -35.82998275756836, + "logps/rejected": -96.82465362548828, + "loss": 0.0917, + "losses/dpo": 0.021983064711093903, + "losses/sft": 0.570757269859314, + "losses/total": 0.021983064711093903, + "ref_logps/chosen": -13.760212898254395, + "ref_logps/rejected": -32.04032516479492, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.206976890563965, + "rewards/margins": 4.271455764770508, + "rewards/rejected": -6.478432655334473, + "step": 369 + }, + { + "epoch": 0.74, + "grad_norm": 18.851280212402344, + "learning_rate": 4.185185185185185e-07, + "logps/chosen": -44.81340026855469, + "logps/rejected": -119.28524780273438, + "loss": 0.0781, + "losses/dpo": 0.012161415070295334, + "losses/sft": 0.599815845489502, + "losses/total": 0.012161415070295334, + "ref_logps/chosen": -19.234413146972656, + "ref_logps/rejected": -46.1428108215332, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.557898998260498, + "rewards/margins": 4.756345748901367, + "rewards/rejected": -7.314244270324707, + "step": 370 + }, + { + "epoch": 0.74, + "grad_norm": 8.515548706054688, + "learning_rate": 4.1814814814814815e-07, + "logps/chosen": -36.610347747802734, + "logps/rejected": -104.79900360107422, + "loss": 0.0264, + "losses/dpo": 0.023754268884658813, + "losses/sft": 0.8289971947669983, + "losses/total": 0.023754268884658813, + "ref_logps/chosen": -13.560300827026367, + "ref_logps/rejected": -33.50470733642578, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.305004596710205, + "rewards/margins": 4.824424743652344, + "rewards/rejected": -7.129429817199707, + "step": 371 + }, + { + "epoch": 0.74, + "grad_norm": 54.51543426513672, + "learning_rate": 4.177777777777778e-07, + "logps/chosen": -40.711090087890625, + "logps/rejected": -114.20442962646484, + "loss": 0.1797, + "losses/dpo": 0.01912502571940422, + "losses/sft": 0.7507596015930176, + "losses/total": 0.01912502571940422, + "ref_logps/chosen": -8.806960105895996, + "ref_logps/rejected": -40.375144958496094, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.190412998199463, + "rewards/margins": 4.1925153732299805, + "rewards/rejected": -7.382928371429443, + "step": 372 + }, + { + "epoch": 0.75, + "grad_norm": 34.81773376464844, + "learning_rate": 4.174074074074074e-07, + "logps/chosen": -39.61103057861328, + "logps/rejected": -139.82696533203125, + "loss": 0.0626, + "losses/dpo": 0.0014186090556904674, + "losses/sft": 0.5715436339378357, + "losses/total": 0.0014186090556904674, + "ref_logps/chosen": -19.26919937133789, + "ref_logps/rejected": -52.185546875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0341830253601074, + "rewards/margins": 6.729958534240723, + "rewards/rejected": -8.764141082763672, + "step": 373 + }, + { + "epoch": 0.75, + "grad_norm": 61.708709716796875, + "learning_rate": 4.17037037037037e-07, + "logps/chosen": -38.32596969604492, + "logps/rejected": -139.75997924804688, + "loss": 0.1031, + "losses/dpo": 0.03926457092165947, + "losses/sft": 0.5558092594146729, + "losses/total": 0.03926457092165947, + "ref_logps/chosen": -14.84549617767334, + "ref_logps/rejected": -56.434959411621094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3480472564697266, + "rewards/margins": 5.984454154968262, + "rewards/rejected": -8.332502365112305, + "step": 374 + }, + { + "epoch": 0.75, + "grad_norm": 38.21398162841797, + "learning_rate": 4.1666666666666667e-07, + "logps/chosen": -29.691017150878906, + "logps/rejected": -132.10032653808594, + "loss": 0.0956, + "losses/dpo": 0.05562710016965866, + "losses/sft": 0.7517297267913818, + "losses/total": 0.05562710016965866, + "ref_logps/chosen": -9.127862930297852, + "ref_logps/rejected": -46.169281005859375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0563154220581055, + "rewards/margins": 6.5367889404296875, + "rewards/rejected": -8.593104362487793, + "step": 375 + }, + { + "epoch": 0.75, + "grad_norm": 39.852298736572266, + "learning_rate": 4.1629629629629625e-07, + "logps/chosen": -34.75245666503906, + "logps/rejected": -136.33731079101562, + "loss": 0.142, + "losses/dpo": 0.31215742230415344, + "losses/sft": 0.5526888370513916, + "losses/total": 0.31215742230415344, + "ref_logps/chosen": -15.972925186157227, + "ref_logps/rejected": -50.73971176147461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8779528141021729, + "rewards/margins": 6.681807518005371, + "rewards/rejected": -8.559760093688965, + "step": 376 + }, + { + "epoch": 0.75, + "grad_norm": 36.76629638671875, + "learning_rate": 4.159259259259259e-07, + "logps/chosen": -38.22616195678711, + "logps/rejected": -103.80357360839844, + "loss": 0.1058, + "losses/dpo": 0.12199509143829346, + "losses/sft": 0.79285728931427, + "losses/total": 0.12199509143829346, + "ref_logps/chosen": -13.101573944091797, + "ref_logps/rejected": -33.90134048461914, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5124588012695312, + "rewards/margins": 4.47776460647583, + "rewards/rejected": -6.9902238845825195, + "step": 377 + }, + { + "epoch": 0.76, + "grad_norm": 49.55006408691406, + "learning_rate": 4.1555555555555554e-07, + "logps/chosen": -37.08061218261719, + "logps/rejected": -144.79342651367188, + "loss": 0.1404, + "losses/dpo": 0.3125501871109009, + "losses/sft": 0.604672372341156, + "losses/total": 0.3125501871109009, + "ref_logps/chosen": -16.00560760498047, + "ref_logps/rejected": -55.147979736328125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1075005531311035, + "rewards/margins": 6.857043266296387, + "rewards/rejected": -8.964544296264648, + "step": 378 + }, + { + "epoch": 0.76, + "grad_norm": 19.39054298400879, + "learning_rate": 4.151851851851852e-07, + "logps/chosen": -33.039878845214844, + "logps/rejected": -144.43283081054688, + "loss": 0.0474, + "losses/dpo": 0.03315652906894684, + "losses/sft": 0.6851824522018433, + "losses/total": 0.03315652906894684, + "ref_logps/chosen": -11.591955184936523, + "ref_logps/rejected": -54.99343490600586, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.144792079925537, + "rewards/margins": 6.79914665222168, + "rewards/rejected": -8.943939208984375, + "step": 379 + }, + { + "epoch": 0.76, + "grad_norm": 14.704289436340332, + "learning_rate": 4.1481481481481476e-07, + "logps/chosen": -36.49052810668945, + "logps/rejected": -112.64339447021484, + "loss": 0.0508, + "losses/dpo": 0.05176647752523422, + "losses/sft": 0.7867259979248047, + "losses/total": 0.05176647752523422, + "ref_logps/chosen": -16.324478149414062, + "ref_logps/rejected": -38.75141906738281, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0166049003601074, + "rewards/margins": 5.372592926025391, + "rewards/rejected": -7.389198303222656, + "step": 380 + }, + { + "epoch": 0.76, + "grad_norm": 9.825971603393555, + "learning_rate": 4.144444444444444e-07, + "logps/chosen": -38.247528076171875, + "logps/rejected": -134.92242431640625, + "loss": 0.0183, + "losses/dpo": 0.037824541330337524, + "losses/sft": 0.624715268611908, + "losses/total": 0.037824541330337524, + "ref_logps/chosen": -16.953731536865234, + "ref_logps/rejected": -50.335994720458984, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1293797492980957, + "rewards/margins": 6.329263687133789, + "rewards/rejected": -8.458642959594727, + "step": 381 + }, + { + "epoch": 0.76, + "grad_norm": 9.921040534973145, + "learning_rate": 4.140740740740741e-07, + "logps/chosen": -41.63106155395508, + "logps/rejected": -138.38983154296875, + "loss": 0.0245, + "losses/dpo": 0.024639783427119255, + "losses/sft": 0.7088327407836914, + "losses/total": 0.024639783427119255, + "ref_logps/chosen": -16.87957763671875, + "ref_logps/rejected": -50.51683807373047, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4751484394073486, + "rewards/margins": 6.312151908874512, + "rewards/rejected": -8.787300109863281, + "step": 382 + }, + { + "epoch": 0.77, + "grad_norm": 11.32041072845459, + "learning_rate": 4.137037037037037e-07, + "logps/chosen": -41.30024337768555, + "logps/rejected": -127.23213958740234, + "loss": 0.0502, + "losses/dpo": 0.11755054444074631, + "losses/sft": 0.6314442157745361, + "losses/total": 0.11755054444074631, + "ref_logps/chosen": -22.129472732543945, + "ref_logps/rejected": -50.32065200805664, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.917076826095581, + "rewards/margins": 5.77407169342041, + "rewards/rejected": -7.69114875793457, + "step": 383 + }, + { + "epoch": 0.77, + "grad_norm": 50.16292953491211, + "learning_rate": 4.1333333333333333e-07, + "logps/chosen": -44.699562072753906, + "logps/rejected": -103.92740631103516, + "loss": 0.0995, + "losses/dpo": 0.205742746591568, + "losses/sft": 0.5135591626167297, + "losses/total": 0.205742746591568, + "ref_logps/chosen": -21.934329986572266, + "ref_logps/rejected": -38.492950439453125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2765228748321533, + "rewards/margins": 4.2669219970703125, + "rewards/rejected": -6.543445587158203, + "step": 384 + }, + { + "epoch": 0.77, + "grad_norm": 26.36185073852539, + "learning_rate": 4.1296296296296297e-07, + "logps/chosen": -48.22766876220703, + "logps/rejected": -134.123779296875, + "loss": 0.0517, + "losses/dpo": 0.058786191046237946, + "losses/sft": 0.8921412229537964, + "losses/total": 0.058786191046237946, + "ref_logps/chosen": -16.519601821899414, + "ref_logps/rejected": -44.94132995605469, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.170806884765625, + "rewards/margins": 5.747437953948975, + "rewards/rejected": -8.918245315551758, + "step": 385 + }, + { + "epoch": 0.77, + "grad_norm": 49.396728515625, + "learning_rate": 4.1259259259259256e-07, + "logps/chosen": -40.38239288330078, + "logps/rejected": -129.00726318359375, + "loss": 0.1603, + "losses/dpo": 0.26252609491348267, + "losses/sft": 0.7657068371772766, + "losses/total": 0.26252609491348267, + "ref_logps/chosen": -14.554398536682129, + "ref_logps/rejected": -49.01328659057617, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5827994346618652, + "rewards/margins": 5.416598320007324, + "rewards/rejected": -7.999397277832031, + "step": 386 + }, + { + "epoch": 0.77, + "grad_norm": 32.83975601196289, + "learning_rate": 4.122222222222222e-07, + "logps/chosen": -40.15536880493164, + "logps/rejected": -117.92662048339844, + "loss": 0.0849, + "losses/dpo": 0.2725093960762024, + "losses/sft": 0.75511634349823, + "losses/total": 0.2725093960762024, + "ref_logps/chosen": -16.408966064453125, + "ref_logps/rejected": -45.3177604675293, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.374640464782715, + "rewards/margins": 4.886244773864746, + "rewards/rejected": -7.260885238647461, + "step": 387 + }, + { + "epoch": 0.78, + "grad_norm": 55.663578033447266, + "learning_rate": 4.1185185185185184e-07, + "logps/chosen": -43.84164047241211, + "logps/rejected": -110.4112319946289, + "loss": 0.1738, + "losses/dpo": 0.023735491558909416, + "losses/sft": 0.7960777282714844, + "losses/total": 0.023735491558909416, + "ref_logps/chosen": -15.776308059692383, + "ref_logps/rejected": -38.48309326171875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8065333366394043, + "rewards/margins": 4.386280536651611, + "rewards/rejected": -7.192813873291016, + "step": 388 + }, + { + "epoch": 0.78, + "grad_norm": 46.071250915527344, + "learning_rate": 4.114814814814815e-07, + "logps/chosen": -41.7428092956543, + "logps/rejected": -143.06747436523438, + "loss": 0.1379, + "losses/dpo": 0.0022226774599403143, + "losses/sft": 0.5881319642066956, + "losses/total": 0.0022226774599403143, + "ref_logps/chosen": -15.068788528442383, + "ref_logps/rejected": -46.10564422607422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6674017906188965, + "rewards/margins": 7.028781414031982, + "rewards/rejected": -9.696184158325195, + "step": 389 + }, + { + "epoch": 0.78, + "grad_norm": 18.005714416503906, + "learning_rate": 4.1111111111111107e-07, + "logps/chosen": -47.57895278930664, + "logps/rejected": -116.93091583251953, + "loss": 0.0512, + "losses/dpo": 0.021306635811924934, + "losses/sft": 0.5592812895774841, + "losses/total": 0.021306635811924934, + "ref_logps/chosen": -20.549213409423828, + "ref_logps/rejected": -42.300048828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7029738426208496, + "rewards/margins": 4.760113716125488, + "rewards/rejected": -7.463088035583496, + "step": 390 + }, + { + "epoch": 0.78, + "grad_norm": 62.33810043334961, + "learning_rate": 4.107407407407407e-07, + "logps/chosen": -44.71644592285156, + "logps/rejected": -147.71343994140625, + "loss": 0.1605, + "losses/dpo": 0.025463107973337173, + "losses/sft": 0.6098695993423462, + "losses/total": 0.025463107973337173, + "ref_logps/chosen": -14.803157806396484, + "ref_logps/rejected": -50.16062927246094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.991328716278076, + "rewards/margins": 6.76395320892334, + "rewards/rejected": -9.755282402038574, + "step": 391 + }, + { + "epoch": 0.78, + "grad_norm": 24.61797523498535, + "learning_rate": 4.1037037037037035e-07, + "logps/chosen": -45.47924041748047, + "logps/rejected": -134.3936004638672, + "loss": 0.0449, + "losses/dpo": 0.1113649234175682, + "losses/sft": 0.7880243062973022, + "losses/total": 0.1113649234175682, + "ref_logps/chosen": -16.07276153564453, + "ref_logps/rejected": -39.296424865722656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.940647840499878, + "rewards/margins": 6.5690693855285645, + "rewards/rejected": -9.509716987609863, + "step": 392 + }, + { + "epoch": 0.79, + "grad_norm": 62.34648895263672, + "learning_rate": 4.0999999999999994e-07, + "logps/chosen": -53.482086181640625, + "logps/rejected": -143.56060791015625, + "loss": 0.1232, + "losses/dpo": 0.05823798477649689, + "losses/sft": 1.0682368278503418, + "losses/total": 0.05823798477649689, + "ref_logps/chosen": -18.055744171142578, + "ref_logps/rejected": -50.04603576660156, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5426340103149414, + "rewards/margins": 5.8088226318359375, + "rewards/rejected": -9.351457595825195, + "step": 393 + }, + { + "epoch": 0.79, + "grad_norm": 27.68257713317871, + "learning_rate": 4.096296296296296e-07, + "logps/chosen": -41.640113830566406, + "logps/rejected": -128.0128631591797, + "loss": 0.0461, + "losses/dpo": 0.052962690591812134, + "losses/sft": 0.46492961049079895, + "losses/total": 0.052962690591812134, + "ref_logps/chosen": -14.865592002868652, + "ref_logps/rejected": -43.59786605834961, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6774520874023438, + "rewards/margins": 5.764047622680664, + "rewards/rejected": -8.441499710083008, + "step": 394 + }, + { + "epoch": 0.79, + "grad_norm": 23.043331146240234, + "learning_rate": 4.092592592592593e-07, + "logps/chosen": -35.16297912597656, + "logps/rejected": -113.73015594482422, + "loss": 0.0636, + "losses/dpo": 0.05132364481687546, + "losses/sft": 0.5571942329406738, + "losses/total": 0.05132364481687546, + "ref_logps/chosen": -11.585695266723633, + "ref_logps/rejected": -32.01404571533203, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3577282428741455, + "rewards/margins": 5.813882827758789, + "rewards/rejected": -8.171610832214355, + "step": 395 + }, + { + "epoch": 0.79, + "grad_norm": 41.0941276550293, + "learning_rate": 4.088888888888889e-07, + "logps/chosen": -42.99498748779297, + "logps/rejected": -106.75965881347656, + "loss": 0.1797, + "losses/dpo": 0.40317678451538086, + "losses/sft": 0.5293663740158081, + "losses/total": 0.40317678451538086, + "ref_logps/chosen": -14.02493953704834, + "ref_logps/rejected": -32.46437454223633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.897005081176758, + "rewards/margins": 4.532524108886719, + "rewards/rejected": -7.429529190063477, + "step": 396 + }, + { + "epoch": 0.79, + "grad_norm": 35.72065734863281, + "learning_rate": 4.085185185185185e-07, + "logps/chosen": -54.67194366455078, + "logps/rejected": -155.37124633789062, + "loss": 0.0593, + "losses/dpo": 0.015326184220612049, + "losses/sft": 0.7019369602203369, + "losses/total": 0.015326184220612049, + "ref_logps/chosen": -20.30701446533203, + "ref_logps/rejected": -50.850345611572266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.436492919921875, + "rewards/margins": 7.015597343444824, + "rewards/rejected": -10.452091217041016, + "step": 397 + }, + { + "epoch": 0.8, + "grad_norm": 43.3508415222168, + "learning_rate": 4.0814814814814814e-07, + "logps/chosen": -47.85393524169922, + "logps/rejected": -113.46194458007812, + "loss": 0.1027, + "losses/dpo": 0.026841329410672188, + "losses/sft": 0.7813979387283325, + "losses/total": 0.026841329410672188, + "ref_logps/chosen": -14.83338737487793, + "ref_logps/rejected": -40.13924789428711, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.3020548820495605, + "rewards/margins": 4.030215263366699, + "rewards/rejected": -7.33227014541626, + "step": 398 + }, + { + "epoch": 0.8, + "grad_norm": 49.429466247558594, + "learning_rate": 4.077777777777778e-07, + "logps/chosen": -43.05474853515625, + "logps/rejected": -133.95877075195312, + "loss": 0.1155, + "losses/dpo": 0.2445557415485382, + "losses/sft": 0.6521463394165039, + "losses/total": 0.2445557415485382, + "ref_logps/chosen": -15.042274475097656, + "ref_logps/rejected": -43.79792404174805, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8012471199035645, + "rewards/margins": 6.214838027954102, + "rewards/rejected": -9.016085624694824, + "step": 399 + }, + { + "epoch": 0.8, + "grad_norm": 37.26084899902344, + "learning_rate": 4.0740740740740737e-07, + "logps/chosen": -38.936378479003906, + "logps/rejected": -123.11367797851562, + "loss": 0.097, + "losses/dpo": 0.2155800759792328, + "losses/sft": 0.5946239829063416, + "losses/total": 0.2155800759792328, + "ref_logps/chosen": -13.216875076293945, + "ref_logps/rejected": -42.703304290771484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5719504356384277, + "rewards/margins": 5.469087600708008, + "rewards/rejected": -8.041038513183594, + "step": 400 + }, + { + "epoch": 0.8, + "grad_norm": 31.475679397583008, + "learning_rate": 4.07037037037037e-07, + "logps/chosen": -34.12742614746094, + "logps/rejected": -130.51797485351562, + "loss": 0.0595, + "losses/dpo": 0.03444742411375046, + "losses/sft": 0.668463945388794, + "losses/total": 0.03444742411375046, + "ref_logps/chosen": -15.204010009765625, + "ref_logps/rejected": -47.96771240234375, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8923413753509521, + "rewards/margins": 6.36268424987793, + "rewards/rejected": -8.255025863647461, + "step": 401 + }, + { + "epoch": 0.8, + "grad_norm": 47.62376022338867, + "learning_rate": 4.0666666666666666e-07, + "logps/chosen": -38.50322723388672, + "logps/rejected": -122.90655517578125, + "loss": 0.1303, + "losses/dpo": 0.1284504532814026, + "losses/sft": 0.7032700777053833, + "losses/total": 0.1284504532814026, + "ref_logps/chosen": -14.979454040527344, + "ref_logps/rejected": -43.11996841430664, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.352377414703369, + "rewards/margins": 5.62628173828125, + "rewards/rejected": -7.978658676147461, + "step": 402 + }, + { + "epoch": 0.81, + "grad_norm": 23.952451705932617, + "learning_rate": 4.062962962962963e-07, + "logps/chosen": -45.23053741455078, + "logps/rejected": -132.68417358398438, + "loss": 0.0404, + "losses/dpo": 0.09107367694377899, + "losses/sft": 0.8164919018745422, + "losses/total": 0.09107367694377899, + "ref_logps/chosen": -15.182647705078125, + "ref_logps/rejected": -45.23405456542969, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.004788875579834, + "rewards/margins": 5.740222930908203, + "rewards/rejected": -8.745011329650879, + "step": 403 + }, + { + "epoch": 0.81, + "grad_norm": 53.22486114501953, + "learning_rate": 4.059259259259259e-07, + "logps/chosen": -42.65351867675781, + "logps/rejected": -136.69607543945312, + "loss": 0.1512, + "losses/dpo": 0.19927635788917542, + "losses/sft": 0.6840373277664185, + "losses/total": 0.19927635788917542, + "ref_logps/chosen": -18.133865356445312, + "ref_logps/rejected": -49.21907043457031, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.451965808868408, + "rewards/margins": 6.295734882354736, + "rewards/rejected": -8.747699737548828, + "step": 404 + }, + { + "epoch": 0.81, + "grad_norm": 51.80613327026367, + "learning_rate": 4.055555555555555e-07, + "logps/chosen": -36.814353942871094, + "logps/rejected": -134.43319702148438, + "loss": 0.0981, + "losses/dpo": 0.28013885021209717, + "losses/sft": 0.7207523584365845, + "losses/total": 0.28013885021209717, + "ref_logps/chosen": -11.887121200561523, + "ref_logps/rejected": -44.62443542480469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.492722988128662, + "rewards/margins": 6.488152503967285, + "rewards/rejected": -8.980875015258789, + "step": 405 + }, + { + "epoch": 0.81, + "grad_norm": 24.698190689086914, + "learning_rate": 4.0518518518518517e-07, + "logps/chosen": -31.342304229736328, + "logps/rejected": -114.50019836425781, + "loss": 0.0902, + "losses/dpo": 0.04047441482543945, + "losses/sft": 0.8049769401550293, + "losses/total": 0.04047441482543945, + "ref_logps/chosen": -12.826787948608398, + "ref_logps/rejected": -41.65668869018555, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8515514135360718, + "rewards/margins": 5.43280029296875, + "rewards/rejected": -7.284351348876953, + "step": 406 + }, + { + "epoch": 0.81, + "grad_norm": 45.53025436401367, + "learning_rate": 4.0481481481481475e-07, + "logps/chosen": -37.1544189453125, + "logps/rejected": -121.78252410888672, + "loss": 0.1076, + "losses/dpo": 0.32062214612960815, + "losses/sft": 0.5877598524093628, + "losses/total": 0.32062214612960815, + "ref_logps/chosen": -11.954411506652832, + "ref_logps/rejected": -42.16912841796875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.52000093460083, + "rewards/margins": 5.441339015960693, + "rewards/rejected": -7.961340427398682, + "step": 407 + }, + { + "epoch": 0.82, + "grad_norm": 17.013662338256836, + "learning_rate": 4.044444444444444e-07, + "logps/chosen": -36.11525344848633, + "logps/rejected": -126.46176147460938, + "loss": 0.1019, + "losses/dpo": 0.06139393895864487, + "losses/sft": 0.6258932948112488, + "losses/total": 0.06139393895864487, + "ref_logps/chosen": -12.489351272583008, + "ref_logps/rejected": -47.692588806152344, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3625903129577637, + "rewards/margins": 5.514326095581055, + "rewards/rejected": -7.87691593170166, + "step": 408 + }, + { + "epoch": 0.82, + "grad_norm": 13.054990768432617, + "learning_rate": 4.040740740740741e-07, + "logps/chosen": -39.20123291015625, + "logps/rejected": -159.31448364257812, + "loss": 0.0268, + "losses/dpo": 0.054594915360212326, + "losses/sft": 0.7585716843605042, + "losses/total": 0.054594915360212326, + "ref_logps/chosen": -17.77873992919922, + "ref_logps/rejected": -61.389122009277344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.14224910736084, + "rewards/margins": 7.650287628173828, + "rewards/rejected": -9.792536735534668, + "step": 409 + }, + { + "epoch": 0.82, + "grad_norm": 47.22980880737305, + "learning_rate": 4.0370370370370373e-07, + "logps/chosen": -47.70698928833008, + "logps/rejected": -131.38491821289062, + "loss": 0.0699, + "losses/dpo": 0.22821170091629028, + "losses/sft": 1.048500657081604, + "losses/total": 0.22821170091629028, + "ref_logps/chosen": -16.506330490112305, + "ref_logps/rejected": -40.986934661865234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.120065689086914, + "rewards/margins": 5.919732093811035, + "rewards/rejected": -9.03979778289795, + "step": 410 + }, + { + "epoch": 0.82, + "grad_norm": 22.45016860961914, + "learning_rate": 4.033333333333333e-07, + "logps/chosen": -40.05971908569336, + "logps/rejected": -125.94699096679688, + "loss": 0.1039, + "losses/dpo": 0.05524634197354317, + "losses/sft": 0.7447971105575562, + "losses/total": 0.05524634197354317, + "ref_logps/chosen": -13.532623291015625, + "ref_logps/rejected": -42.57023620605469, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.652709484100342, + "rewards/margins": 5.68496561050415, + "rewards/rejected": -8.337675094604492, + "step": 411 + }, + { + "epoch": 0.82, + "grad_norm": 36.30315399169922, + "learning_rate": 4.0296296296296296e-07, + "logps/chosen": -39.070838928222656, + "logps/rejected": -163.58050537109375, + "loss": 0.0916, + "losses/dpo": 0.21355107426643372, + "losses/sft": 0.7528839707374573, + "losses/total": 0.21355107426643372, + "ref_logps/chosen": -14.982392311096191, + "ref_logps/rejected": -60.20352554321289, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4088447093963623, + "rewards/margins": 7.928853988647461, + "rewards/rejected": -10.337697982788086, + "step": 412 + }, + { + "epoch": 0.83, + "grad_norm": 12.464835166931152, + "learning_rate": 4.025925925925926e-07, + "logps/chosen": -37.97372055053711, + "logps/rejected": -147.67945861816406, + "loss": 0.0455, + "losses/dpo": 0.14440147578716278, + "losses/sft": 1.0511807203292847, + "losses/total": 0.14440147578716278, + "ref_logps/chosen": -10.53607177734375, + "ref_logps/rejected": -51.0684814453125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.743765115737915, + "rewards/margins": 6.917333602905273, + "rewards/rejected": -9.66109848022461, + "step": 413 + }, + { + "epoch": 0.83, + "grad_norm": 34.10635757446289, + "learning_rate": 4.022222222222222e-07, + "logps/chosen": -48.30426025390625, + "logps/rejected": -106.06845092773438, + "loss": 0.0948, + "losses/dpo": 0.15190474689006805, + "losses/sft": 0.7456372976303101, + "losses/total": 0.15190474689006805, + "ref_logps/chosen": -17.072410583496094, + "ref_logps/rejected": -34.360252380371094, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1231849193573, + "rewards/margins": 4.047634601593018, + "rewards/rejected": -7.170819282531738, + "step": 414 + }, + { + "epoch": 0.83, + "grad_norm": 19.513986587524414, + "learning_rate": 4.0185185185185183e-07, + "logps/chosen": -34.9712028503418, + "logps/rejected": -150.7466278076172, + "loss": 0.066, + "losses/dpo": 0.1424039751291275, + "losses/sft": 0.4295719265937805, + "losses/total": 0.1424039751291275, + "ref_logps/chosen": -14.411608695983887, + "ref_logps/rejected": -58.60187530517578, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.055959701538086, + "rewards/margins": 7.158515930175781, + "rewards/rejected": -9.214475631713867, + "step": 415 + }, + { + "epoch": 0.83, + "grad_norm": 30.81915855407715, + "learning_rate": 4.0148148148148147e-07, + "logps/chosen": -40.00956726074219, + "logps/rejected": -146.01316833496094, + "loss": 0.141, + "losses/dpo": 0.2193525731563568, + "losses/sft": 0.7795408964157104, + "losses/total": 0.2193525731563568, + "ref_logps/chosen": -14.545209884643555, + "ref_logps/rejected": -53.822059631347656, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.546435832977295, + "rewards/margins": 6.672675132751465, + "rewards/rejected": -9.219110488891602, + "step": 416 + }, + { + "epoch": 0.83, + "grad_norm": 11.609371185302734, + "learning_rate": 4.0111111111111106e-07, + "logps/chosen": -38.845733642578125, + "logps/rejected": -145.8206024169922, + "loss": 0.0669, + "losses/dpo": 0.23528042435646057, + "losses/sft": 0.813218355178833, + "losses/total": 0.23528042435646057, + "ref_logps/chosen": -13.398239135742188, + "ref_logps/rejected": -48.828125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5447497367858887, + "rewards/margins": 7.154498100280762, + "rewards/rejected": -9.699247360229492, + "step": 417 + }, + { + "epoch": 0.84, + "grad_norm": 15.4019193649292, + "learning_rate": 4.007407407407407e-07, + "logps/chosen": -25.659170150756836, + "logps/rejected": -113.36529541015625, + "loss": 0.0456, + "losses/dpo": 0.07958737760782242, + "losses/sft": 0.48200511932373047, + "losses/total": 0.07958737760782242, + "ref_logps/chosen": -11.393777847290039, + "ref_logps/rejected": -39.325225830078125, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.426539421081543, + "rewards/margins": 5.977468490600586, + "rewards/rejected": -7.404007911682129, + "step": 418 + }, + { + "epoch": 0.84, + "grad_norm": 37.076507568359375, + "learning_rate": 4.0037037037037034e-07, + "logps/chosen": -33.09357833862305, + "logps/rejected": -125.79651641845703, + "loss": 0.1009, + "losses/dpo": 0.3316452205181122, + "losses/sft": 0.946098804473877, + "losses/total": 0.3316452205181122, + "ref_logps/chosen": -11.90180492401123, + "ref_logps/rejected": -47.329490661621094, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1191773414611816, + "rewards/margins": 5.727524757385254, + "rewards/rejected": -7.846702575683594, + "step": 419 + }, + { + "epoch": 0.84, + "grad_norm": 31.1119384765625, + "learning_rate": 4e-07, + "logps/chosen": -37.595184326171875, + "logps/rejected": -114.86734008789062, + "loss": 0.0765, + "losses/dpo": 0.15944091975688934, + "losses/sft": 1.0308585166931152, + "losses/total": 0.15944091975688934, + "ref_logps/chosen": -14.364852905273438, + "ref_logps/rejected": -39.427894592285156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.323032855987549, + "rewards/margins": 5.220911026000977, + "rewards/rejected": -7.543943881988525, + "step": 420 + }, + { + "epoch": 0.84, + "grad_norm": 31.826692581176758, + "learning_rate": 3.9962962962962957e-07, + "logps/chosen": -38.185302734375, + "logps/rejected": -102.858642578125, + "loss": 0.1069, + "losses/dpo": 0.007780781015753746, + "losses/sft": 0.9135001301765442, + "losses/total": 0.007780781015753746, + "ref_logps/chosen": -12.624923706054688, + "ref_logps/rejected": -36.459877014160156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5560381412506104, + "rewards/margins": 4.083838939666748, + "rewards/rejected": -6.6398773193359375, + "step": 421 + }, + { + "epoch": 0.84, + "grad_norm": 48.20242691040039, + "learning_rate": 3.9925925925925926e-07, + "logps/chosen": -39.689151763916016, + "logps/rejected": -107.40838623046875, + "loss": 0.1396, + "losses/dpo": 0.08528731763362885, + "losses/sft": 0.7178528308868408, + "losses/total": 0.08528731763362885, + "ref_logps/chosen": -14.831443786621094, + "ref_logps/rejected": -35.24195861816406, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4857709407806396, + "rewards/margins": 4.73087215423584, + "rewards/rejected": -7.2166428565979, + "step": 422 + }, + { + "epoch": 0.85, + "grad_norm": 35.273048400878906, + "learning_rate": 3.988888888888889e-07, + "logps/chosen": -35.35859680175781, + "logps/rejected": -95.65461730957031, + "loss": 0.1087, + "losses/dpo": 0.16164197027683258, + "losses/sft": 0.9079208374023438, + "losses/total": 0.16164197027683258, + "ref_logps/chosen": -11.853939056396484, + "ref_logps/rejected": -33.45928192138672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.350465774536133, + "rewards/margins": 3.8690683841705322, + "rewards/rejected": -6.219534397125244, + "step": 423 + }, + { + "epoch": 0.85, + "grad_norm": 20.826913833618164, + "learning_rate": 3.985185185185185e-07, + "logps/chosen": -33.33007049560547, + "logps/rejected": -121.75100708007812, + "loss": 0.0494, + "losses/dpo": 0.08566058427095413, + "losses/sft": 0.4007030129432678, + "losses/total": 0.08566058427095413, + "ref_logps/chosen": -18.737146377563477, + "ref_logps/rejected": -46.80057144165039, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4592924118041992, + "rewards/margins": 6.035750389099121, + "rewards/rejected": -7.4950432777404785, + "step": 424 + }, + { + "epoch": 0.85, + "grad_norm": 10.35861587524414, + "learning_rate": 3.9814814814814813e-07, + "logps/chosen": -34.687767028808594, + "logps/rejected": -121.10075378417969, + "loss": 0.0208, + "losses/dpo": 0.005229136906564236, + "losses/sft": 0.6532012224197388, + "losses/total": 0.005229136906564236, + "ref_logps/chosen": -13.203108787536621, + "ref_logps/rejected": -41.180877685546875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.148465871810913, + "rewards/margins": 5.843521595001221, + "rewards/rejected": -7.991987705230713, + "step": 425 + }, + { + "epoch": 0.85, + "grad_norm": 20.87963104248047, + "learning_rate": 3.977777777777778e-07, + "logps/chosen": -39.72007751464844, + "logps/rejected": -120.63939666748047, + "loss": 0.06, + "losses/dpo": 0.057065702974796295, + "losses/sft": 0.828675389289856, + "losses/total": 0.057065702974796295, + "ref_logps/chosen": -13.151759147644043, + "ref_logps/rejected": -41.24854278564453, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.656831979751587, + "rewards/margins": 5.282253265380859, + "rewards/rejected": -7.939085006713867, + "step": 426 + }, + { + "epoch": 0.85, + "grad_norm": 13.600708961486816, + "learning_rate": 3.974074074074074e-07, + "logps/chosen": -30.19253921508789, + "logps/rejected": -126.12801361083984, + "loss": 0.0383, + "losses/dpo": 0.0667053759098053, + "losses/sft": 0.6852974891662598, + "losses/total": 0.0667053759098053, + "ref_logps/chosen": -9.91547966003418, + "ref_logps/rejected": -44.596343994140625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.027705669403076, + "rewards/margins": 6.125460624694824, + "rewards/rejected": -8.153166770935059, + "step": 427 + }, + { + "epoch": 0.86, + "grad_norm": 18.37424659729004, + "learning_rate": 3.97037037037037e-07, + "logps/chosen": -38.085697174072266, + "logps/rejected": -116.51532745361328, + "loss": 0.0591, + "losses/dpo": 0.0367395393550396, + "losses/sft": 0.8513802289962769, + "losses/total": 0.0367395393550396, + "ref_logps/chosen": -12.653528213500977, + "ref_logps/rejected": -40.740966796875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5432169437408447, + "rewards/margins": 5.034219741821289, + "rewards/rejected": -7.577436447143555, + "step": 428 + }, + { + "epoch": 0.86, + "grad_norm": 86.92479705810547, + "learning_rate": 3.9666666666666665e-07, + "logps/chosen": -33.40900802612305, + "logps/rejected": -97.5163345336914, + "loss": 0.1676, + "losses/dpo": 0.06531870365142822, + "losses/sft": 0.718450665473938, + "losses/total": 0.06531870365142822, + "ref_logps/chosen": -12.761130332946777, + "ref_logps/rejected": -34.69020462036133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0647878646850586, + "rewards/margins": 4.217825412750244, + "rewards/rejected": -6.282613277435303, + "step": 429 + }, + { + "epoch": 0.86, + "grad_norm": 16.273277282714844, + "learning_rate": 3.962962962962963e-07, + "logps/chosen": -38.851871490478516, + "logps/rejected": -114.59307098388672, + "loss": 0.0732, + "losses/dpo": 0.1843334287405014, + "losses/sft": 0.5088679790496826, + "losses/total": 0.1843334287405014, + "ref_logps/chosen": -20.497560501098633, + "ref_logps/rejected": -39.37632751464844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8354310989379883, + "rewards/margins": 5.686243057250977, + "rewards/rejected": -7.521674156188965, + "step": 430 + }, + { + "epoch": 0.86, + "grad_norm": 23.66449737548828, + "learning_rate": 3.959259259259259e-07, + "logps/chosen": -36.00865936279297, + "logps/rejected": -99.69367218017578, + "loss": 0.0811, + "losses/dpo": 0.11141210049390793, + "losses/sft": 0.8997406959533691, + "losses/total": 0.11141210049390793, + "ref_logps/chosen": -11.728845596313477, + "ref_logps/rejected": -33.64956283569336, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.42798113822937, + "rewards/margins": 4.176429748535156, + "rewards/rejected": -6.604410648345947, + "step": 431 + }, + { + "epoch": 0.86, + "grad_norm": 22.341249465942383, + "learning_rate": 3.955555555555555e-07, + "logps/chosen": -46.45829391479492, + "logps/rejected": -127.15127563476562, + "loss": 0.0827, + "losses/dpo": 0.14688825607299805, + "losses/sft": 0.8811045289039612, + "losses/total": 0.14688825607299805, + "ref_logps/chosen": -14.368127822875977, + "ref_logps/rejected": -39.30841064453125, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.2090163230895996, + "rewards/margins": 5.57526969909668, + "rewards/rejected": -8.784286499023438, + "step": 432 + }, + { + "epoch": 0.87, + "grad_norm": 20.467079162597656, + "learning_rate": 3.9518518518518516e-07, + "logps/chosen": -40.27122497558594, + "logps/rejected": -133.2883758544922, + "loss": 0.0947, + "losses/dpo": 0.00427745096385479, + "losses/sft": 0.7645794749259949, + "losses/total": 0.00427745096385479, + "ref_logps/chosen": -16.05865478515625, + "ref_logps/rejected": -45.82427978515625, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4212567806243896, + "rewards/margins": 6.325152397155762, + "rewards/rejected": -8.74640941619873, + "step": 433 + }, + { + "epoch": 0.87, + "grad_norm": 43.505855560302734, + "learning_rate": 3.948148148148148e-07, + "logps/chosen": -41.396522521972656, + "logps/rejected": -124.23265838623047, + "loss": 0.1128, + "losses/dpo": 0.003313305089250207, + "losses/sft": 0.8131544589996338, + "losses/total": 0.003313305089250207, + "ref_logps/chosen": -12.796215057373047, + "ref_logps/rejected": -39.55033874511719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8600306510925293, + "rewards/margins": 5.60820198059082, + "rewards/rejected": -8.468232154846191, + "step": 434 + }, + { + "epoch": 0.87, + "grad_norm": 43.20351028442383, + "learning_rate": 3.9444444444444444e-07, + "logps/chosen": -39.572059631347656, + "logps/rejected": -113.74073028564453, + "loss": 0.1422, + "losses/dpo": 0.3530581593513489, + "losses/sft": 0.836111307144165, + "losses/total": 0.3530581593513489, + "ref_logps/chosen": -14.05323600769043, + "ref_logps/rejected": -39.58940505981445, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.55188250541687, + "rewards/margins": 4.863250732421875, + "rewards/rejected": -7.415132522583008, + "step": 435 + }, + { + "epoch": 0.87, + "grad_norm": 17.075193405151367, + "learning_rate": 3.940740740740741e-07, + "logps/chosen": -35.89055252075195, + "logps/rejected": -122.32846069335938, + "loss": 0.0407, + "losses/dpo": 0.09936435520648956, + "losses/sft": 0.6812224984169006, + "losses/total": 0.09936435520648956, + "ref_logps/chosen": -12.606057167053223, + "ref_logps/rejected": -42.71528625488281, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3284494876861572, + "rewards/margins": 5.632867813110352, + "rewards/rejected": -7.961318016052246, + "step": 436 + }, + { + "epoch": 0.87, + "grad_norm": 23.870943069458008, + "learning_rate": 3.937037037037037e-07, + "logps/chosen": -41.031890869140625, + "logps/rejected": -152.42169189453125, + "loss": 0.0415, + "losses/dpo": 0.001001848024316132, + "losses/sft": 0.504059910774231, + "losses/total": 0.001001848024316132, + "ref_logps/chosen": -17.22356414794922, + "ref_logps/rejected": -54.20398712158203, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3808324337005615, + "rewards/margins": 7.4409379959106445, + "rewards/rejected": -9.821769714355469, + "step": 437 + }, + { + "epoch": 0.88, + "grad_norm": 62.79549789428711, + "learning_rate": 3.933333333333333e-07, + "logps/chosen": -51.775909423828125, + "logps/rejected": -132.87208557128906, + "loss": 0.1194, + "losses/dpo": 0.025578390806913376, + "losses/sft": 0.8345500230789185, + "losses/total": 0.025578390806913376, + "ref_logps/chosen": -20.002838134765625, + "ref_logps/rejected": -48.348453521728516, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.17730712890625, + "rewards/margins": 5.275055885314941, + "rewards/rejected": -8.452363014221191, + "step": 438 + }, + { + "epoch": 0.88, + "grad_norm": 56.714908599853516, + "learning_rate": 3.9296296296296295e-07, + "logps/chosen": -46.454166412353516, + "logps/rejected": -162.05075073242188, + "loss": 0.1845, + "losses/dpo": 0.10782374441623688, + "losses/sft": 0.814045786857605, + "losses/total": 0.10782374441623688, + "ref_logps/chosen": -16.248411178588867, + "ref_logps/rejected": -61.597415924072266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.020575761795044, + "rewards/margins": 7.024758338928223, + "rewards/rejected": -10.045333862304688, + "step": 439 + }, + { + "epoch": 0.88, + "grad_norm": 55.224342346191406, + "learning_rate": 3.925925925925926e-07, + "logps/chosen": -39.01628875732422, + "logps/rejected": -111.15766143798828, + "loss": 0.1918, + "losses/dpo": 0.06872375309467316, + "losses/sft": 0.5836397409439087, + "losses/total": 0.06872375309467316, + "ref_logps/chosen": -10.98344898223877, + "ref_logps/rejected": -35.79201126098633, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.803284168243408, + "rewards/margins": 4.733281135559082, + "rewards/rejected": -7.536565780639648, + "step": 440 + }, + { + "epoch": 0.88, + "grad_norm": 77.9546890258789, + "learning_rate": 3.9222222222222223e-07, + "logps/chosen": -43.733333587646484, + "logps/rejected": -130.84396362304688, + "loss": 0.1259, + "losses/dpo": 0.02421252429485321, + "losses/sft": 0.8828197717666626, + "losses/total": 0.02421252429485321, + "ref_logps/chosen": -15.622417449951172, + "ref_logps/rejected": -42.70435333251953, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.811091899871826, + "rewards/margins": 6.002869606018066, + "rewards/rejected": -8.813961029052734, + "step": 441 + }, + { + "epoch": 0.88, + "grad_norm": 24.473661422729492, + "learning_rate": 3.918518518518518e-07, + "logps/chosen": -51.01894760131836, + "logps/rejected": -132.1776123046875, + "loss": 0.0618, + "losses/dpo": 0.05584227293729782, + "losses/sft": 0.6618070006370544, + "losses/total": 0.05584227293729782, + "ref_logps/chosen": -20.57441520690918, + "ref_logps/rejected": -45.225921630859375, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0444531440734863, + "rewards/margins": 5.6507158279418945, + "rewards/rejected": -8.695169448852539, + "step": 442 + }, + { + "epoch": 0.89, + "grad_norm": 28.41942024230957, + "learning_rate": 3.9148148148148146e-07, + "logps/chosen": -41.577335357666016, + "logps/rejected": -120.86549377441406, + "loss": 0.0793, + "losses/dpo": 0.02593686804175377, + "losses/sft": 0.5973250865936279, + "losses/total": 0.02593686804175377, + "ref_logps/chosen": -17.241764068603516, + "ref_logps/rejected": -46.92060089111328, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4335572719573975, + "rewards/margins": 4.960931777954102, + "rewards/rejected": -7.394489765167236, + "step": 443 + }, + { + "epoch": 0.89, + "grad_norm": 20.679649353027344, + "learning_rate": 3.911111111111111e-07, + "logps/chosen": -36.46300506591797, + "logps/rejected": -139.58978271484375, + "loss": 0.0482, + "losses/dpo": 0.06509046256542206, + "losses/sft": 0.7196379899978638, + "losses/total": 0.06509046256542206, + "ref_logps/chosen": -14.438896179199219, + "ref_logps/rejected": -48.49089050292969, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.202410936355591, + "rewards/margins": 6.907477855682373, + "rewards/rejected": -9.109889030456543, + "step": 444 + }, + { + "epoch": 0.89, + "grad_norm": 20.597949981689453, + "learning_rate": 3.907407407407407e-07, + "logps/chosen": -39.067344665527344, + "logps/rejected": -131.234619140625, + "loss": 0.0492, + "losses/dpo": 0.024714581668376923, + "losses/sft": 0.4357023239135742, + "losses/total": 0.024714581668376923, + "ref_logps/chosen": -17.182262420654297, + "ref_logps/rejected": -50.957862854003906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1885082721710205, + "rewards/margins": 5.839167594909668, + "rewards/rejected": -8.02767562866211, + "step": 445 + }, + { + "epoch": 0.89, + "grad_norm": 23.935184478759766, + "learning_rate": 3.9037037037037033e-07, + "logps/chosen": -37.2645378112793, + "logps/rejected": -108.57901000976562, + "loss": 0.07, + "losses/dpo": 0.08478090912103653, + "losses/sft": 0.6661969423294067, + "losses/total": 0.08478090912103653, + "ref_logps/chosen": -16.068418502807617, + "ref_logps/rejected": -35.516510009765625, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1196117401123047, + "rewards/margins": 5.186638832092285, + "rewards/rejected": -7.30625057220459, + "step": 446 + }, + { + "epoch": 0.89, + "grad_norm": 56.30306625366211, + "learning_rate": 3.8999999999999997e-07, + "logps/chosen": -40.75579833984375, + "logps/rejected": -103.54489135742188, + "loss": 0.1354, + "losses/dpo": 0.06459180265665054, + "losses/sft": 0.676565945148468, + "losses/total": 0.06459180265665054, + "ref_logps/chosen": -18.714557647705078, + "ref_logps/rejected": -38.181575775146484, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2041239738464355, + "rewards/margins": 4.332207679748535, + "rewards/rejected": -6.536331653594971, + "step": 447 + }, + { + "epoch": 0.9, + "grad_norm": 53.9810905456543, + "learning_rate": 3.8962962962962956e-07, + "logps/chosen": -44.43140411376953, + "logps/rejected": -120.11578369140625, + "loss": 0.205, + "losses/dpo": 0.2935711741447449, + "losses/sft": 0.7037935256958008, + "losses/total": 0.2935711741447449, + "ref_logps/chosen": -17.124698638916016, + "ref_logps/rejected": -42.24882125854492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.73067045211792, + "rewards/margins": 5.056025505065918, + "rewards/rejected": -7.786696434020996, + "step": 448 + }, + { + "epoch": 0.9, + "grad_norm": 21.460792541503906, + "learning_rate": 3.8925925925925925e-07, + "logps/chosen": -34.50408935546875, + "logps/rejected": -140.84860229492188, + "loss": 0.0972, + "losses/dpo": 0.0007553499890491366, + "losses/sft": 0.749845027923584, + "losses/total": 0.0007553499890491366, + "ref_logps/chosen": -13.832296371459961, + "ref_logps/rejected": -55.203399658203125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0671794414520264, + "rewards/margins": 6.497340202331543, + "rewards/rejected": -8.564519882202148, + "step": 449 + }, + { + "epoch": 0.9, + "grad_norm": 39.593719482421875, + "learning_rate": 3.888888888888889e-07, + "logps/chosen": -38.65271759033203, + "logps/rejected": -110.78114318847656, + "loss": 0.1592, + "losses/dpo": 0.2055157870054245, + "losses/sft": 0.3508860766887665, + "losses/total": 0.2055157870054245, + "ref_logps/chosen": -17.12994384765625, + "ref_logps/rejected": -39.235443115234375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1522774696350098, + "rewards/margins": 5.002292633056641, + "rewards/rejected": -7.154570579528809, + "step": 450 + }, + { + "epoch": 0.9, + "grad_norm": 12.109270095825195, + "learning_rate": 3.8851851851851854e-07, + "logps/chosen": -49.65500259399414, + "logps/rejected": -144.41647338867188, + "loss": 0.0443, + "losses/dpo": 0.11350575089454651, + "losses/sft": 0.6022939085960388, + "losses/total": 0.11350575089454651, + "ref_logps/chosen": -20.374874114990234, + "ref_logps/rejected": -49.809410095214844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.928013324737549, + "rewards/margins": 6.532693386077881, + "rewards/rejected": -9.46070671081543, + "step": 451 + }, + { + "epoch": 0.9, + "grad_norm": 11.183013916015625, + "learning_rate": 3.881481481481481e-07, + "logps/chosen": -36.81501007080078, + "logps/rejected": -180.78744506835938, + "loss": 0.032, + "losses/dpo": 0.01637108251452446, + "losses/sft": 0.5078365802764893, + "losses/total": 0.01637108251452446, + "ref_logps/chosen": -17.165584564208984, + "ref_logps/rejected": -79.67362213134766, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9649423360824585, + "rewards/margins": 8.146439552307129, + "rewards/rejected": -10.111382484436035, + "step": 452 + }, + { + "epoch": 0.91, + "grad_norm": 41.93173599243164, + "learning_rate": 3.8777777777777776e-07, + "logps/chosen": -46.13130569458008, + "logps/rejected": -105.22616577148438, + "loss": 0.1333, + "losses/dpo": 0.09395473450422287, + "losses/sft": 0.7680277228355408, + "losses/total": 0.09395473450422287, + "ref_logps/chosen": -16.89548110961914, + "ref_logps/rejected": -32.069122314453125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9235825538635254, + "rewards/margins": 4.392122268676758, + "rewards/rejected": -7.315704822540283, + "step": 453 + }, + { + "epoch": 0.91, + "grad_norm": 81.26608276367188, + "learning_rate": 3.874074074074074e-07, + "logps/chosen": -39.240081787109375, + "logps/rejected": -134.875244140625, + "loss": 0.3474, + "losses/dpo": 1.107574701309204, + "losses/sft": 0.8552225232124329, + "losses/total": 1.107574701309204, + "ref_logps/chosen": -12.673192977905273, + "ref_logps/rejected": -44.445648193359375, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.656689167022705, + "rewards/margins": 6.3862714767456055, + "rewards/rejected": -9.042960166931152, + "step": 454 + }, + { + "epoch": 0.91, + "grad_norm": 17.12772560119629, + "learning_rate": 3.87037037037037e-07, + "logps/chosen": -30.340044021606445, + "logps/rejected": -118.41051483154297, + "loss": 0.0557, + "losses/dpo": 0.06403327733278275, + "losses/sft": 0.8791080713272095, + "losses/total": 0.06403327733278275, + "ref_logps/chosen": -11.27092456817627, + "ref_logps/rejected": -44.00303649902344, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9069119691848755, + "rewards/margins": 5.533836364746094, + "rewards/rejected": -7.44074821472168, + "step": 455 + }, + { + "epoch": 0.91, + "grad_norm": 24.304636001586914, + "learning_rate": 3.8666666666666664e-07, + "logps/chosen": -31.13290023803711, + "logps/rejected": -126.29150390625, + "loss": 0.0812, + "losses/dpo": 0.08947796374559402, + "losses/sft": 0.5682837963104248, + "losses/total": 0.08947796374559402, + "ref_logps/chosen": -13.352134704589844, + "ref_logps/rejected": -42.56218338012695, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.778076410293579, + "rewards/margins": 6.594855308532715, + "rewards/rejected": -8.372931480407715, + "step": 456 + }, + { + "epoch": 0.91, + "grad_norm": 30.6898250579834, + "learning_rate": 3.862962962962963e-07, + "logps/chosen": -38.754295349121094, + "logps/rejected": -122.19239807128906, + "loss": 0.1225, + "losses/dpo": 0.27622735500335693, + "losses/sft": 0.6371285915374756, + "losses/total": 0.27622735500335693, + "ref_logps/chosen": -15.183916091918945, + "ref_logps/rejected": -41.27008819580078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3570375442504883, + "rewards/margins": 5.735192775726318, + "rewards/rejected": -8.092230796813965, + "step": 457 + }, + { + "epoch": 0.92, + "grad_norm": 23.223514556884766, + "learning_rate": 3.859259259259259e-07, + "logps/chosen": -41.43684387207031, + "logps/rejected": -117.0394287109375, + "loss": 0.0683, + "losses/dpo": 0.011316630057990551, + "losses/sft": 0.6864824891090393, + "losses/total": 0.011316630057990551, + "ref_logps/chosen": -18.469993591308594, + "ref_logps/rejected": -42.62580108642578, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.296685218811035, + "rewards/margins": 5.14467716217041, + "rewards/rejected": -7.441361427307129, + "step": 458 + }, + { + "epoch": 0.92, + "grad_norm": 27.830724716186523, + "learning_rate": 3.855555555555555e-07, + "logps/chosen": -42.79747009277344, + "logps/rejected": -110.60104370117188, + "loss": 0.0738, + "losses/dpo": 0.08957645297050476, + "losses/sft": 0.6658487319946289, + "losses/total": 0.08957645297050476, + "ref_logps/chosen": -16.33731460571289, + "ref_logps/rejected": -38.744163513183594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6460158824920654, + "rewards/margins": 4.539671897888184, + "rewards/rejected": -7.185688018798828, + "step": 459 + }, + { + "epoch": 0.92, + "grad_norm": 14.787923812866211, + "learning_rate": 3.8518518518518515e-07, + "logps/chosen": -36.30558776855469, + "logps/rejected": -123.15371704101562, + "loss": 0.0272, + "losses/dpo": 0.008614415302872658, + "losses/sft": 0.710669994354248, + "losses/total": 0.008614415302872658, + "ref_logps/chosen": -12.608972549438477, + "ref_logps/rejected": -41.72282409667969, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.369661808013916, + "rewards/margins": 5.773427963256836, + "rewards/rejected": -8.143089294433594, + "step": 460 + }, + { + "epoch": 0.92, + "grad_norm": 29.622129440307617, + "learning_rate": 3.8481481481481484e-07, + "logps/chosen": -43.93711471557617, + "logps/rejected": -134.57131958007812, + "loss": 0.0522, + "losses/dpo": 0.022071661427617073, + "losses/sft": 0.6957737803459167, + "losses/total": 0.022071661427617073, + "ref_logps/chosen": -19.354290008544922, + "ref_logps/rejected": -46.689414978027344, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.458282709121704, + "rewards/margins": 6.32990837097168, + "rewards/rejected": -8.788190841674805, + "step": 461 + }, + { + "epoch": 0.92, + "grad_norm": 39.33469009399414, + "learning_rate": 3.8444444444444443e-07, + "logps/chosen": -42.33057403564453, + "logps/rejected": -128.18800354003906, + "loss": 0.0846, + "losses/dpo": 0.07230532169342041, + "losses/sft": 0.7907838821411133, + "losses/total": 0.07230532169342041, + "ref_logps/chosen": -15.459431648254395, + "ref_logps/rejected": -44.470672607421875, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.687114715576172, + "rewards/margins": 5.684619426727295, + "rewards/rejected": -8.371733665466309, + "step": 462 + }, + { + "epoch": 0.93, + "grad_norm": 77.01020812988281, + "learning_rate": 3.8407407407407407e-07, + "logps/chosen": -41.150062561035156, + "logps/rejected": -97.16158294677734, + "loss": 0.2559, + "losses/dpo": 0.1757294088602066, + "losses/sft": 0.8417171239852905, + "losses/total": 0.1757294088602066, + "ref_logps/chosen": -12.81360912322998, + "ref_logps/rejected": -30.693498611450195, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.8336455821990967, + "rewards/margins": 3.8131628036499023, + "rewards/rejected": -6.646808624267578, + "step": 463 + }, + { + "epoch": 0.93, + "grad_norm": 44.11057662963867, + "learning_rate": 3.837037037037037e-07, + "logps/chosen": -39.26576232910156, + "logps/rejected": -128.0055694580078, + "loss": 0.0871, + "losses/dpo": 0.17416299879550934, + "losses/sft": 0.7194896936416626, + "losses/total": 0.17416299879550934, + "ref_logps/chosen": -14.974311828613281, + "ref_logps/rejected": -43.98797607421875, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.429144859313965, + "rewards/margins": 5.972614288330078, + "rewards/rejected": -8.40176010131836, + "step": 464 + }, + { + "epoch": 0.93, + "grad_norm": 12.82344913482666, + "learning_rate": 3.8333333333333335e-07, + "logps/chosen": -40.717891693115234, + "logps/rejected": -148.91168212890625, + "loss": 0.0333, + "losses/dpo": 0.040061675012111664, + "losses/sft": 0.8902965784072876, + "losses/total": 0.040061675012111664, + "ref_logps/chosen": -12.840675354003906, + "ref_logps/rejected": -51.16914367675781, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.787721633911133, + "rewards/margins": 6.986532211303711, + "rewards/rejected": -9.774253845214844, + "step": 465 + }, + { + "epoch": 0.93, + "grad_norm": 71.54187774658203, + "learning_rate": 3.8296296296296294e-07, + "logps/chosen": -44.93368148803711, + "logps/rejected": -149.85275268554688, + "loss": 0.1686, + "losses/dpo": 0.4314330816268921, + "losses/sft": 0.7240664958953857, + "losses/total": 0.4314330816268921, + "ref_logps/chosen": -13.75963020324707, + "ref_logps/rejected": -53.12620544433594, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1174049377441406, + "rewards/margins": 6.55525016784668, + "rewards/rejected": -9.67265510559082, + "step": 466 + }, + { + "epoch": 0.93, + "grad_norm": 22.28801727294922, + "learning_rate": 3.825925925925926e-07, + "logps/chosen": -39.607208251953125, + "logps/rejected": -104.81693267822266, + "loss": 0.0786, + "losses/dpo": 0.0375826358795166, + "losses/sft": 0.6283800601959229, + "losses/total": 0.0375826358795166, + "ref_logps/chosen": -18.35260772705078, + "ref_logps/rejected": -37.030540466308594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.125459671020508, + "rewards/margins": 4.653180122375488, + "rewards/rejected": -6.778639793395996, + "step": 467 + }, + { + "epoch": 0.94, + "grad_norm": 19.925273895263672, + "learning_rate": 3.822222222222222e-07, + "logps/chosen": -42.20486068725586, + "logps/rejected": -147.87164306640625, + "loss": 0.0363, + "losses/dpo": 0.10509399324655533, + "losses/sft": 0.804567813873291, + "losses/total": 0.10509399324655533, + "ref_logps/chosen": -13.097880363464355, + "ref_logps/rejected": -49.719268798828125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.910698413848877, + "rewards/margins": 6.904539108276367, + "rewards/rejected": -9.815237045288086, + "step": 468 + }, + { + "epoch": 0.94, + "grad_norm": 39.51319122314453, + "learning_rate": 3.818518518518518e-07, + "logps/chosen": -47.34339141845703, + "logps/rejected": -125.14842224121094, + "loss": 0.0805, + "losses/dpo": 0.09030229598283768, + "losses/sft": 0.8959075808525085, + "losses/total": 0.09030229598283768, + "ref_logps/chosen": -14.1100435256958, + "ref_logps/rejected": -38.99297332763672, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3233349323272705, + "rewards/margins": 5.292210578918457, + "rewards/rejected": -8.615545272827148, + "step": 469 + }, + { + "epoch": 0.94, + "grad_norm": 10.734367370605469, + "learning_rate": 3.8148148148148145e-07, + "logps/chosen": -59.496826171875, + "logps/rejected": -159.21438598632812, + "loss": 0.1082, + "losses/dpo": 0.3646402955055237, + "losses/sft": 0.7248314619064331, + "losses/total": 0.3646402955055237, + "ref_logps/chosen": -22.694992065429688, + "ref_logps/rejected": -59.701148986816406, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.680183172225952, + "rewards/margins": 6.271141529083252, + "rewards/rejected": -9.951324462890625, + "step": 470 + }, + { + "epoch": 0.94, + "grad_norm": 18.338951110839844, + "learning_rate": 3.811111111111111e-07, + "logps/chosen": -40.23529815673828, + "logps/rejected": -144.66590881347656, + "loss": 0.0466, + "losses/dpo": 0.00546817434951663, + "losses/sft": 1.0657308101654053, + "losses/total": 0.00546817434951663, + "ref_logps/chosen": -11.44991683959961, + "ref_logps/rejected": -45.38568115234375, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8785383701324463, + "rewards/margins": 7.049485206604004, + "rewards/rejected": -9.928024291992188, + "step": 471 + }, + { + "epoch": 0.94, + "grad_norm": 35.54021072387695, + "learning_rate": 3.8074074074074073e-07, + "logps/chosen": -42.15109634399414, + "logps/rejected": -151.8326416015625, + "loss": 0.0724, + "losses/dpo": 0.028490465134382248, + "losses/sft": 0.8861966133117676, + "losses/total": 0.028490465134382248, + "ref_logps/chosen": -13.033123970031738, + "ref_logps/rejected": -51.36277389526367, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.911797285079956, + "rewards/margins": 7.135190486907959, + "rewards/rejected": -10.046987533569336, + "step": 472 + }, + { + "epoch": 0.95, + "grad_norm": 56.16077423095703, + "learning_rate": 3.803703703703703e-07, + "logps/chosen": -47.64544677734375, + "logps/rejected": -105.49983215332031, + "loss": 0.2322, + "losses/dpo": 0.06201707571744919, + "losses/sft": 1.4952541589736938, + "losses/total": 0.06201707571744919, + "ref_logps/chosen": -10.078681945800781, + "ref_logps/rejected": -29.885452270507812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.75667667388916, + "rewards/margins": 3.804760694503784, + "rewards/rejected": -7.561437129974365, + "step": 473 + }, + { + "epoch": 0.95, + "grad_norm": 66.81434631347656, + "learning_rate": 3.7999999999999996e-07, + "logps/chosen": -47.747276306152344, + "logps/rejected": -119.25900268554688, + "loss": 0.1594, + "losses/dpo": 0.46635541319847107, + "losses/sft": 0.7916244268417358, + "losses/total": 0.46635541319847107, + "ref_logps/chosen": -18.893095016479492, + "ref_logps/rejected": -41.94963836669922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.885418176651001, + "rewards/margins": 4.845518112182617, + "rewards/rejected": -7.730936527252197, + "step": 474 + }, + { + "epoch": 0.95, + "grad_norm": 31.911237716674805, + "learning_rate": 3.7962962962962966e-07, + "logps/chosen": -38.076499938964844, + "logps/rejected": -132.6082763671875, + "loss": 0.0621, + "losses/dpo": 0.0007034945301711559, + "losses/sft": 0.6102277636528015, + "losses/total": 0.0007034945301711559, + "ref_logps/chosen": -9.867688179016113, + "ref_logps/rejected": -48.1339111328125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8208813667297363, + "rewards/margins": 5.626555442810059, + "rewards/rejected": -8.447437286376953, + "step": 475 + }, + { + "epoch": 0.95, + "grad_norm": 31.460481643676758, + "learning_rate": 3.7925925925925924e-07, + "logps/chosen": -36.52131271362305, + "logps/rejected": -137.44662475585938, + "loss": 0.0412, + "losses/dpo": 0.013979414477944374, + "losses/sft": 0.40546882152557373, + "losses/total": 0.013979414477944374, + "ref_logps/chosen": -12.335689544677734, + "ref_logps/rejected": -46.54832458496094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.418562412261963, + "rewards/margins": 6.671266555786133, + "rewards/rejected": -9.089829444885254, + "step": 476 + }, + { + "epoch": 0.95, + "grad_norm": 38.84501647949219, + "learning_rate": 3.788888888888889e-07, + "logps/chosen": -51.319183349609375, + "logps/rejected": -155.2894744873047, + "loss": 0.1021, + "losses/dpo": 0.23493897914886475, + "losses/sft": 0.7687029838562012, + "losses/total": 0.23493897914886475, + "ref_logps/chosen": -20.000171661376953, + "ref_logps/rejected": -53.499454498291016, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1319010257720947, + "rewards/margins": 7.04710054397583, + "rewards/rejected": -10.179001808166504, + "step": 477 + }, + { + "epoch": 0.96, + "grad_norm": 22.33247184753418, + "learning_rate": 3.785185185185185e-07, + "logps/chosen": -40.62896728515625, + "logps/rejected": -123.1474609375, + "loss": 0.0551, + "losses/dpo": 0.05076390877366066, + "losses/sft": 0.9945937991142273, + "losses/total": 0.05076390877366066, + "ref_logps/chosen": -11.130529403686523, + "ref_logps/rejected": -41.4739875793457, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9498438835144043, + "rewards/margins": 5.217503547668457, + "rewards/rejected": -8.167346954345703, + "step": 478 + }, + { + "epoch": 0.96, + "grad_norm": 44.30374526977539, + "learning_rate": 3.781481481481481e-07, + "logps/chosen": -42.91387176513672, + "logps/rejected": -134.16275024414062, + "loss": 0.0772, + "losses/dpo": 0.02010273188352585, + "losses/sft": 0.7935068607330322, + "losses/total": 0.02010273188352585, + "ref_logps/chosen": -13.567787170410156, + "ref_logps/rejected": -44.79924774169922, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9346084594726562, + "rewards/margins": 6.001742839813232, + "rewards/rejected": -8.936351776123047, + "step": 479 + }, + { + "epoch": 0.96, + "grad_norm": 18.497716903686523, + "learning_rate": 3.7777777777777775e-07, + "logps/chosen": -38.83562088012695, + "logps/rejected": -119.28911590576172, + "loss": 0.0535, + "losses/dpo": 0.09974934905767441, + "losses/sft": 0.7755734324455261, + "losses/total": 0.09974934905767441, + "ref_logps/chosen": -13.980093955993652, + "ref_logps/rejected": -43.400691986083984, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4855527877807617, + "rewards/margins": 5.10329008102417, + "rewards/rejected": -7.58884334564209, + "step": 480 + }, + { + "epoch": 0.96, + "grad_norm": 26.344017028808594, + "learning_rate": 3.774074074074074e-07, + "logps/chosen": -34.87080383300781, + "logps/rejected": -129.17410278320312, + "loss": 0.0709, + "losses/dpo": 0.09608794748783112, + "losses/sft": 0.6589397192001343, + "losses/total": 0.09608794748783112, + "ref_logps/chosen": -14.861343383789062, + "ref_logps/rejected": -49.386024475097656, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.000946044921875, + "rewards/margins": 5.977862358093262, + "rewards/rejected": -7.978808403015137, + "step": 481 + }, + { + "epoch": 0.96, + "grad_norm": 23.19811248779297, + "learning_rate": 3.7703703703703704e-07, + "logps/chosen": -38.46788787841797, + "logps/rejected": -129.5819549560547, + "loss": 0.0465, + "losses/dpo": 0.031357597559690475, + "losses/sft": 0.6037446856498718, + "losses/total": 0.031357597559690475, + "ref_logps/chosen": -17.20541000366211, + "ref_logps/rejected": -42.455162048339844, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1262478828430176, + "rewards/margins": 6.586432933807373, + "rewards/rejected": -8.71268081665039, + "step": 482 + }, + { + "epoch": 0.97, + "grad_norm": 58.5645751953125, + "learning_rate": 3.766666666666666e-07, + "logps/chosen": -36.00074005126953, + "logps/rejected": -140.50314331054688, + "loss": 0.093, + "losses/dpo": 5.722598507418297e-05, + "losses/sft": 0.7557867169380188, + "losses/total": 5.722598507418297e-05, + "ref_logps/chosen": -12.93038272857666, + "ref_logps/rejected": -47.007694244384766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.307035446166992, + "rewards/margins": 7.042509078979492, + "rewards/rejected": -9.349544525146484, + "step": 483 + }, + { + "epoch": 0.97, + "grad_norm": 8.557694435119629, + "learning_rate": 3.7629629629629627e-07, + "logps/chosen": -50.58363342285156, + "logps/rejected": -162.1007080078125, + "loss": 0.0152, + "losses/dpo": 0.043552931398153305, + "losses/sft": 0.7928053140640259, + "losses/total": 0.043552931398153305, + "ref_logps/chosen": -16.90772247314453, + "ref_logps/rejected": -52.684326171875, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.36759090423584, + "rewards/margins": 7.574047088623047, + "rewards/rejected": -10.941637992858887, + "step": 484 + }, + { + "epoch": 0.97, + "grad_norm": 45.088653564453125, + "learning_rate": 3.759259259259259e-07, + "logps/chosen": -39.90214538574219, + "logps/rejected": -137.9638671875, + "loss": 0.1535, + "losses/dpo": 0.1695178747177124, + "losses/sft": 0.8066875338554382, + "losses/total": 0.1695178747177124, + "ref_logps/chosen": -12.019525527954102, + "ref_logps/rejected": -50.03221130371094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.788262367248535, + "rewards/margins": 6.004901885986328, + "rewards/rejected": -8.79316520690918, + "step": 485 + }, + { + "epoch": 0.97, + "grad_norm": 60.69712829589844, + "learning_rate": 3.755555555555555e-07, + "logps/chosen": -45.3070182800293, + "logps/rejected": -136.23126220703125, + "loss": 0.1879, + "losses/dpo": 0.14317449927330017, + "losses/sft": 0.4515833258628845, + "losses/total": 0.14317449927330017, + "ref_logps/chosen": -18.59630012512207, + "ref_logps/rejected": -48.34473419189453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.671072006225586, + "rewards/margins": 6.117581367492676, + "rewards/rejected": -8.788653373718262, + "step": 486 + }, + { + "epoch": 0.97, + "grad_norm": 8.105025291442871, + "learning_rate": 3.7518518518518514e-07, + "logps/chosen": -42.688236236572266, + "logps/rejected": -125.89185333251953, + "loss": 0.0201, + "losses/dpo": 0.03883223608136177, + "losses/sft": 0.7434147596359253, + "losses/total": 0.03883223608136177, + "ref_logps/chosen": -18.851747512817383, + "ref_logps/rejected": -39.032981872558594, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3836491107940674, + "rewards/margins": 6.3022379875183105, + "rewards/rejected": -8.68588638305664, + "step": 487 + }, + { + "epoch": 0.98, + "grad_norm": 8.502668380737305, + "learning_rate": 3.7481481481481483e-07, + "logps/chosen": -35.2692756652832, + "logps/rejected": -142.82443237304688, + "loss": 0.0284, + "losses/dpo": 0.06037643551826477, + "losses/sft": 0.9963451623916626, + "losses/total": 0.06037643551826477, + "ref_logps/chosen": -11.326520919799805, + "ref_logps/rejected": -46.48693084716797, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.394275426864624, + "rewards/margins": 7.239475250244141, + "rewards/rejected": -9.633749961853027, + "step": 488 + }, + { + "epoch": 0.98, + "grad_norm": 20.321788787841797, + "learning_rate": 3.7444444444444447e-07, + "logps/chosen": -37.23273849487305, + "logps/rejected": -177.05303955078125, + "loss": 0.0364, + "losses/dpo": 0.021463543176651, + "losses/sft": 0.8575751781463623, + "losses/total": 0.021463543176651, + "ref_logps/chosen": -13.77434253692627, + "ref_logps/rejected": -58.2335205078125, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.345839738845825, + "rewards/margins": 9.536112785339355, + "rewards/rejected": -11.881952285766602, + "step": 489 + }, + { + "epoch": 0.98, + "grad_norm": 66.36872100830078, + "learning_rate": 3.7407407407407406e-07, + "logps/chosen": -46.914161682128906, + "logps/rejected": -101.332763671875, + "loss": 0.2092, + "losses/dpo": 0.4841746687889099, + "losses/sft": 0.9192337989807129, + "losses/total": 0.4841746687889099, + "ref_logps/chosen": -14.84537124633789, + "ref_logps/rejected": -30.192947387695312, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.206879138946533, + "rewards/margins": 3.9071030616760254, + "rewards/rejected": -7.1139817237854, + "step": 490 + }, + { + "epoch": 0.98, + "grad_norm": 17.200246810913086, + "learning_rate": 3.737037037037037e-07, + "logps/chosen": -39.54547119140625, + "logps/rejected": -117.11639404296875, + "loss": 0.0466, + "losses/dpo": 0.056069329380989075, + "losses/sft": 0.6392867565155029, + "losses/total": 0.056069329380989075, + "ref_logps/chosen": -15.112298011779785, + "ref_logps/rejected": -42.334571838378906, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4433178901672363, + "rewards/margins": 5.034865379333496, + "rewards/rejected": -7.478182792663574, + "step": 491 + }, + { + "epoch": 0.98, + "grad_norm": 55.23065185546875, + "learning_rate": 3.7333333333333334e-07, + "logps/chosen": -39.42041778564453, + "logps/rejected": -126.77442169189453, + "loss": 0.1834, + "losses/dpo": 0.4038184881210327, + "losses/sft": 0.6278300285339355, + "losses/total": 0.4038184881210327, + "ref_logps/chosen": -12.766145706176758, + "ref_logps/rejected": -43.821372985839844, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6654272079467773, + "rewards/margins": 5.62987756729126, + "rewards/rejected": -8.295305252075195, + "step": 492 + }, + { + "epoch": 0.99, + "grad_norm": 43.6976318359375, + "learning_rate": 3.7296296296296293e-07, + "logps/chosen": -37.804630279541016, + "logps/rejected": -143.37440490722656, + "loss": 0.0993, + "losses/dpo": 0.27842968702316284, + "losses/sft": 0.7042969465255737, + "losses/total": 0.27842968702316284, + "ref_logps/chosen": -15.15415096282959, + "ref_logps/rejected": -56.54274368286133, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.265048027038574, + "rewards/margins": 6.418117523193359, + "rewards/rejected": -8.68316650390625, + "step": 493 + }, + { + "epoch": 0.99, + "grad_norm": 20.171466827392578, + "learning_rate": 3.7259259259259257e-07, + "logps/chosen": -43.048973083496094, + "logps/rejected": -141.72682189941406, + "loss": 0.0436, + "losses/dpo": 0.044200535863637924, + "losses/sft": 0.6779994964599609, + "losses/total": 0.044200535863637924, + "ref_logps/chosen": -23.580673217773438, + "ref_logps/rejected": -57.72456359863281, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.946830153465271, + "rewards/margins": 6.453395366668701, + "rewards/rejected": -8.400225639343262, + "step": 494 + }, + { + "epoch": 0.99, + "grad_norm": 74.94364166259766, + "learning_rate": 3.722222222222222e-07, + "logps/chosen": -48.40394973754883, + "logps/rejected": -145.72518920898438, + "loss": 0.2522, + "losses/dpo": 0.19847548007965088, + "losses/sft": 1.0321438312530518, + "losses/total": 0.19847548007965088, + "ref_logps/chosen": -15.101470947265625, + "ref_logps/rejected": -49.36922836303711, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3302478790283203, + "rewards/margins": 6.305349826812744, + "rewards/rejected": -9.635598182678223, + "step": 495 + }, + { + "epoch": 0.99, + "grad_norm": 29.127410888671875, + "learning_rate": 3.7185185185185185e-07, + "logps/chosen": -41.526939392089844, + "logps/rejected": -146.7291259765625, + "loss": 0.0671, + "losses/dpo": 0.013721669092774391, + "losses/sft": 0.9741775989532471, + "losses/total": 0.013721669092774391, + "ref_logps/chosen": -16.26078224182129, + "ref_logps/rejected": -49.89807891845703, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.526616096496582, + "rewards/margins": 7.15648889541626, + "rewards/rejected": -9.68310546875, + "step": 496 + }, + { + "epoch": 0.99, + "grad_norm": 16.872493743896484, + "learning_rate": 3.7148148148148144e-07, + "logps/chosen": -39.59349060058594, + "logps/rejected": -137.2967987060547, + "loss": 0.045, + "losses/dpo": 0.1408008486032486, + "losses/sft": 1.041797399520874, + "losses/total": 0.1408008486032486, + "ref_logps/chosen": -14.02829360961914, + "ref_logps/rejected": -45.13361740112305, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5565197467803955, + "rewards/margins": 6.659798622131348, + "rewards/rejected": -9.216318130493164, + "step": 497 + }, + { + "epoch": 1.0, + "grad_norm": 50.911476135253906, + "learning_rate": 3.711111111111111e-07, + "logps/chosen": -38.06005096435547, + "logps/rejected": -123.10880279541016, + "loss": 0.1425, + "losses/dpo": 0.01334542315453291, + "losses/sft": 0.7792074680328369, + "losses/total": 0.01334542315453291, + "ref_logps/chosen": -14.236019134521484, + "ref_logps/rejected": -38.64054489135742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3824031352996826, + "rewards/margins": 6.064422607421875, + "rewards/rejected": -8.446825981140137, + "step": 498 + }, + { + "epoch": 1.0, + "grad_norm": 37.36387252807617, + "learning_rate": 3.707407407407407e-07, + "logps/chosen": -34.86865997314453, + "logps/rejected": -124.82080078125, + "loss": 0.148, + "losses/dpo": 0.02250811830163002, + "losses/sft": 0.4309656322002411, + "losses/total": 0.02250811830163002, + "ref_logps/chosen": -13.683731079101562, + "ref_logps/rejected": -43.348628997802734, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.118492603302002, + "rewards/margins": 6.0287251472473145, + "rewards/rejected": -8.147217750549316, + "step": 499 + }, + { + "epoch": 1.0, + "grad_norm": 16.937225341796875, + "learning_rate": 3.703703703703703e-07, + "logps/chosen": -40.25804901123047, + "logps/rejected": -125.63944244384766, + "loss": 0.0417, + "losses/dpo": 0.008113143965601921, + "losses/sft": 0.924973726272583, + "losses/total": 0.008113143965601921, + "ref_logps/chosen": -12.001253128051758, + "ref_logps/rejected": -40.38880920410156, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.8256797790527344, + "rewards/margins": 5.699383735656738, + "rewards/rejected": -8.525063514709473, + "step": 500 + } + ], + "logging_steps": 1.0, + "max_steps": 1500, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}