{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 636, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12, "learning_rate": 7.8125e-06, "logps/chosen": -122.16297149658203, "logps/rejected": -71.43323516845703, "loss": 0.4952, "losses/dpo": 0.4956728219985962, "losses/sft": 0.7316558957099915, "losses/total": 0.4956728219985962, "ref_logps/chosen": -127.74378204345703, "ref_logps/rejected": -70.59587860107422, "rewards/accuracies": 0.8070000410079956, "rewards/chosen": 0.5580801367759705, "rewards/margins": 0.6418154239654541, "rewards/rejected": -0.08373536914587021, "step": 25 }, { "epoch": 0.24, "learning_rate": 1.5625e-05, "logps/chosen": -94.26205444335938, "logps/rejected": -85.60575866699219, "loss": 0.0691, "losses/dpo": 0.07388682663440704, "losses/sft": 0.5650071501731873, "losses/total": 0.07388682663440704, "ref_logps/chosen": -128.54661560058594, "ref_logps/rejected": -72.49893951416016, "rewards/accuracies": 0.9929999709129333, "rewards/chosen": 3.428455114364624, "rewards/margins": 4.739137649536133, "rewards/rejected": -1.310682773590088, "step": 50 }, { "epoch": 0.35, "learning_rate": 1.9615384615384617e-05, "logps/chosen": -85.07345581054688, "logps/rejected": -101.97691345214844, "loss": 0.0179, "losses/dpo": 0.014726839028298855, "losses/sft": 0.5030468106269836, "losses/total": 0.014726839028298855, "ref_logps/chosen": -129.9876708984375, "ref_logps/rejected": -72.3249282836914, "rewards/accuracies": 0.9989999532699585, "rewards/chosen": 4.491419792175293, "rewards/margins": 7.45661735534668, "rewards/rejected": -2.9651970863342285, "step": 75 }, { "epoch": 0.47, "learning_rate": 1.8741258741258744e-05, "logps/chosen": -85.64691162109375, "logps/rejected": -110.90087890625, "loss": 0.0096, "losses/dpo": 0.012412017211318016, "losses/sft": 0.5199429988861084, "losses/total": 0.012412017211318016, "ref_logps/chosen": -130.2884979248047, "ref_logps/rejected": -71.44290924072266, "rewards/accuracies": 0.9984999299049377, "rewards/chosen": 4.464157581329346, "rewards/margins": 8.409955024719238, "rewards/rejected": -3.9457967281341553, "step": 100 }, { "epoch": 0.59, "learning_rate": 1.7867132867132868e-05, "logps/chosen": -82.34768676757812, "logps/rejected": -116.94005584716797, "loss": 0.0061, "losses/dpo": 0.008614934980869293, "losses/sft": 0.49562689661979675, "losses/total": 0.008614934980869293, "ref_logps/chosen": -128.71200561523438, "ref_logps/rejected": -71.86701202392578, "rewards/accuracies": 0.9994999766349792, "rewards/chosen": 4.636431694030762, "rewards/margins": 9.143735885620117, "rewards/rejected": -4.507305145263672, "step": 125 }, { "epoch": 0.71, "learning_rate": 1.6993006993006995e-05, "logps/chosen": -85.28910064697266, "logps/rejected": -123.17980194091797, "loss": 0.0053, "losses/dpo": 0.004700234159827232, "losses/sft": 0.5220319032669067, "losses/total": 0.004700234159827232, "ref_logps/chosen": -129.39625549316406, "ref_logps/rejected": -70.16360473632812, "rewards/accuracies": 0.9994999766349792, "rewards/chosen": 4.410714149475098, "rewards/margins": 9.712334632873535, "rewards/rejected": -5.301620006561279, "step": 150 }, { "epoch": 0.83, "learning_rate": 1.611888111888112e-05, "logps/chosen": -84.59432983398438, "logps/rejected": -131.1455535888672, "loss": 0.0051, "losses/dpo": 0.003301014890894294, "losses/sft": 0.5115602016448975, "losses/total": 0.003301014890894294, "ref_logps/chosen": -127.61747741699219, "ref_logps/rejected": -71.97355651855469, "rewards/accuracies": 0.9994999766349792, "rewards/chosen": 4.302317142486572, "rewards/margins": 10.219517707824707, "rewards/rejected": -5.917200088500977, "step": 175 }, { "epoch": 0.94, "learning_rate": 1.5244755244755244e-05, "logps/chosen": -88.2319564819336, "logps/rejected": -141.83016967773438, "loss": 0.0028, "losses/dpo": 0.002293643541634083, "losses/sft": 0.5383260846138, "losses/total": 0.002293643541634083, "ref_logps/chosen": -129.1661376953125, "ref_logps/rejected": -71.8288803100586, "rewards/accuracies": 1.0, "rewards/chosen": 4.093417167663574, "rewards/margins": 11.093545913696289, "rewards/rejected": -7.000128746032715, "step": 200 }, { "epoch": 1.06, "learning_rate": 1.4370629370629371e-05, "logps/chosen": -89.62419891357422, "logps/rejected": -153.9451904296875, "loss": 0.0033, "losses/dpo": 0.0030320805963128805, "losses/sft": 0.532666027545929, "losses/total": 0.0030320805963128805, "ref_logps/chosen": -128.41148376464844, "ref_logps/rejected": -71.97950744628906, "rewards/accuracies": 0.9989999532699585, "rewards/chosen": 3.878729820251465, "rewards/margins": 12.075300216674805, "rewards/rejected": -8.196569442749023, "step": 225 }, { "epoch": 1.18, "learning_rate": 1.3496503496503497e-05, "logps/chosen": -86.68380737304688, "logps/rejected": -156.0954132080078, "loss": 0.0013, "losses/dpo": 0.0008313562138937414, "losses/sft": 0.518293559551239, "losses/total": 0.0008313562138937414, "ref_logps/chosen": -128.29469299316406, "ref_logps/rejected": -71.77529907226562, "rewards/accuracies": 1.0, "rewards/chosen": 4.161087989807129, "rewards/margins": 12.593099594116211, "rewards/rejected": -8.432010650634766, "step": 250 }, { "epoch": 1.3, "learning_rate": 1.2622377622377624e-05, "logps/chosen": -88.83470916748047, "logps/rejected": -160.05836486816406, "loss": 0.0015, "losses/dpo": 0.0008700879407115281, "losses/sft": 0.5323516726493835, "losses/total": 0.0008700879407115281, "ref_logps/chosen": -128.95960998535156, "ref_logps/rejected": -70.340576171875, "rewards/accuracies": 0.9994999766349792, "rewards/chosen": 4.012491703033447, "rewards/margins": 12.984270095825195, "rewards/rejected": -8.971778869628906, "step": 275 }, { "epoch": 1.42, "learning_rate": 1.1748251748251748e-05, "logps/chosen": -90.86172485351562, "logps/rejected": -162.376953125, "loss": 0.0011, "losses/dpo": 0.00106943363789469, "losses/sft": 0.5586134195327759, "losses/total": 0.00106943363789469, "ref_logps/chosen": -129.39662170410156, "ref_logps/rejected": -71.82042694091797, "rewards/accuracies": 1.0, "rewards/chosen": 3.853489398956299, "rewards/margins": 12.909143447875977, "rewards/rejected": -9.055652618408203, "step": 300 }, { "epoch": 1.53, "learning_rate": 1.0874125874125875e-05, "logps/chosen": -93.8604507446289, "logps/rejected": -171.50653076171875, "loss": 0.0013, "losses/dpo": 0.0009554739226587117, "losses/sft": 0.5667473077774048, "losses/total": 0.0009554739226587117, "ref_logps/chosen": -128.62173461914062, "ref_logps/rejected": -72.07390594482422, "rewards/accuracies": 1.0, "rewards/chosen": 3.476128339767456, "rewards/margins": 13.419390678405762, "rewards/rejected": -9.943263053894043, "step": 325 }, { "epoch": 1.65, "learning_rate": 1e-05, "logps/chosen": -92.96914672851562, "logps/rejected": -173.7914276123047, "loss": 0.0009, "losses/dpo": 0.00044292627717368305, "losses/sft": 0.561470627784729, "losses/total": 0.00044292627717368305, "ref_logps/chosen": -128.7430877685547, "ref_logps/rejected": -72.57361602783203, "rewards/accuracies": 0.9994999766349792, "rewards/chosen": 3.577392578125, "rewards/margins": 13.699174880981445, "rewards/rejected": -10.121781349182129, "step": 350 }, { "epoch": 1.77, "learning_rate": 9.125874125874126e-06, "logps/chosen": -94.47010040283203, "logps/rejected": -175.3832550048828, "loss": 0.0006, "losses/dpo": 0.00038047495763748884, "losses/sft": 0.567641019821167, "losses/total": 0.00038047495763748884, "ref_logps/chosen": -128.09613037109375, "ref_logps/rejected": -71.36651611328125, "rewards/accuracies": 1.0, "rewards/chosen": 3.3626015186309814, "rewards/margins": 13.764276504516602, "rewards/rejected": -10.401673316955566, "step": 375 }, { "epoch": 1.89, "learning_rate": 8.251748251748254e-06, "logps/chosen": -96.90827941894531, "logps/rejected": -179.17970275878906, "loss": 0.0004, "losses/dpo": 0.0003745325666386634, "losses/sft": 0.5794407725334167, "losses/total": 0.0003745325666386634, "ref_logps/chosen": -129.79989624023438, "ref_logps/rejected": -71.4466323852539, "rewards/accuracies": 1.0, "rewards/chosen": 3.2891619205474854, "rewards/margins": 14.062468528747559, "rewards/rejected": -10.773306846618652, "step": 400 }, { "epoch": 2.0, "learning_rate": 7.377622377622379e-06, "logps/chosen": -96.15299987792969, "logps/rejected": -181.71612548828125, "loss": 0.001, "losses/dpo": 0.0020445636473596096, "losses/sft": 0.5634098052978516, "losses/total": 0.0020445636473596096, "ref_logps/chosen": -130.4124298095703, "ref_logps/rejected": -71.5147933959961, "rewards/accuracies": 1.0, "rewards/chosen": 3.4259424209594727, "rewards/margins": 14.446078300476074, "rewards/rejected": -11.020133972167969, "step": 425 }, { "epoch": 2.12, "learning_rate": 6.503496503496504e-06, "logps/chosen": -93.45982360839844, "logps/rejected": -177.76284790039062, "loss": 0.0003, "losses/dpo": 0.0001847467792686075, "losses/sft": 0.5676508545875549, "losses/total": 0.0001847467792686075, "ref_logps/chosen": -128.48521423339844, "ref_logps/rejected": -71.86593627929688, "rewards/accuracies": 1.0, "rewards/chosen": 3.5025393962860107, "rewards/margins": 14.092233657836914, "rewards/rejected": -10.589694023132324, "step": 450 }, { "epoch": 2.24, "learning_rate": 5.629370629370629e-06, "logps/chosen": -95.26840209960938, "logps/rejected": -181.77322387695312, "loss": 0.0003, "losses/dpo": 0.00029167634784244, "losses/sft": 0.5723408460617065, "losses/total": 0.00029167634784244, "ref_logps/chosen": -129.8194580078125, "ref_logps/rejected": -71.22503662109375, "rewards/accuracies": 1.0, "rewards/chosen": 3.455106496810913, "rewards/margins": 14.509923934936523, "rewards/rejected": -11.054819107055664, "step": 475 }, { "epoch": 2.36, "learning_rate": 4.755244755244756e-06, "logps/chosen": -96.92023468017578, "logps/rejected": -186.9994354248047, "loss": 0.0002, "losses/dpo": 0.00031228098669089377, "losses/sft": 0.5785849690437317, "losses/total": 0.00031228098669089377, "ref_logps/chosen": -129.39373779296875, "ref_logps/rejected": -72.0064468383789, "rewards/accuracies": 1.0, "rewards/chosen": 3.2473514080047607, "rewards/margins": 14.746650695800781, "rewards/rejected": -11.499299049377441, "step": 500 }, { "epoch": 2.48, "learning_rate": 3.881118881118881e-06, "logps/chosen": -96.56753540039062, "logps/rejected": -188.7633819580078, "loss": 0.0003, "losses/dpo": 0.00019269342010375112, "losses/sft": 0.5762569904327393, "losses/total": 0.00019269342010375112, "ref_logps/chosen": -128.0542449951172, "ref_logps/rejected": -71.33090209960938, "rewards/accuracies": 1.0, "rewards/chosen": 3.148669481277466, "rewards/margins": 14.891918182373047, "rewards/rejected": -11.743247985839844, "step": 525 }, { "epoch": 2.59, "learning_rate": 3.006993006993007e-06, "logps/chosen": -97.47007751464844, "logps/rejected": -189.46685791015625, "loss": 0.0002, "losses/dpo": 0.00025137903867289424, "losses/sft": 0.5845997333526611, "losses/total": 0.00025137903867289424, "ref_logps/chosen": -128.27481079101562, "ref_logps/rejected": -71.0475082397461, "rewards/accuracies": 1.0, "rewards/chosen": 3.0804734230041504, "rewards/margins": 14.922408103942871, "rewards/rejected": -11.841936111450195, "step": 550 }, { "epoch": 2.71, "learning_rate": 2.132867132867133e-06, "logps/chosen": -97.53392791748047, "logps/rejected": -190.3184356689453, "loss": 0.0003, "losses/dpo": 0.00022768642520532012, "losses/sft": 0.5818451642990112, "losses/total": 0.00022768642520532012, "ref_logps/chosen": -127.79641723632812, "ref_logps/rejected": -71.30667877197266, "rewards/accuracies": 1.0, "rewards/chosen": 3.0262484550476074, "rewards/margins": 14.927424430847168, "rewards/rejected": -11.901176452636719, "step": 575 }, { "epoch": 2.83, "learning_rate": 1.258741258741259e-06, "logps/chosen": -98.7781753540039, "logps/rejected": -194.68101501464844, "loss": 0.0002, "losses/dpo": 0.00027412467170506716, "losses/sft": 0.5883935689926147, "losses/total": 0.00027412467170506716, "ref_logps/chosen": -129.11810302734375, "ref_logps/rejected": -72.69854736328125, "rewards/accuracies": 1.0, "rewards/chosen": 3.0339913368225098, "rewards/margins": 15.232237815856934, "rewards/rejected": -12.198246955871582, "step": 600 }, { "epoch": 2.95, "learning_rate": 3.846153846153847e-07, "logps/chosen": -99.09100341796875, "logps/rejected": -193.7891387939453, "loss": 0.0002, "losses/dpo": 0.0002839878143277019, "losses/sft": 0.5941969752311707, "losses/total": 0.0002839878143277019, "ref_logps/chosen": -129.47601318359375, "ref_logps/rejected": -71.78772735595703, "rewards/accuracies": 1.0, "rewards/chosen": 3.0384998321533203, "rewards/margins": 15.238642692565918, "rewards/rejected": -12.200141906738281, "step": 625 }, { "epoch": 3.0, "step": 636, "total_flos": 0.0, "train_loss": 0.02455043116600528, "train_runtime": 18942.3632, "train_samples_per_second": 2.686, "train_steps_per_second": 0.034 } ], "logging_steps": 25, "max_steps": 636, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }