{ "best_global_step": 98, "best_metric": 0.14128435, "best_model_checkpoint": "./output_dpo/v0-20260226-085120/checkpoint-98", "epoch": 1.9861635220125786, "eval_steps": 50, "global_step": 98, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02012578616352201, "grad_norm": 1.4377635717391968, "learning_rate": 2e-05, "logits/chosen": -1.7360858917236328, "logits/rejected": -1.7113451957702637, "logps/chosen": -111.01881408691406, "logps/rejected": -147.11973571777344, "loss": 1.319612741470337, "memory(GiB)": 239.65, "nll_loss": 0.6264656782150269, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1, "train_speed(iter/s)": 0.004523 }, { "epoch": 0.10062893081761007, "grad_norm": 1.4525984525680542, "learning_rate": 0.0001, "logits/chosen": -1.6965384483337402, "logits/rejected": -1.681287407875061, "logps/chosen": -111.46014404296875, "logps/rejected": -143.75, "loss": 1.361119270324707, "memory(GiB)": 239.65, "nll_loss": 0.6818519830703735, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.04633765667676926, "rewards/margins": 0.02939797379076481, "rewards/rejected": 0.01693967543542385, "step": 5, "train_speed(iter/s)": 0.003936 }, { "epoch": 0.20125786163522014, "grad_norm": 1.0078742504119873, "learning_rate": 9.928848976574019e-05, "logits/chosen": -1.7403156757354736, "logits/rejected": -1.726575255393982, "logps/chosen": -92.17589569091797, "logps/rejected": -137.906005859375, "loss": 0.9127995491027832, "memory(GiB)": 284.39, "nll_loss": 0.5469792485237122, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 2.0497705936431885, "rewards/margins": 1.318472146987915, "rewards/rejected": 0.7312980890274048, "step": 10, "train_speed(iter/s)": 0.003899 }, { "epoch": 0.3018867924528302, "grad_norm": 1.1189488172531128, "learning_rate": 9.717420893549902e-05, "logits/chosen": -1.8927457332611084, "logits/rejected": -1.8742872476577759, "logps/chosen": -56.06190872192383, "logps/rejected": -129.63563537597656, "loss": 0.5759311199188233, "memory(GiB)": 284.39, "nll_loss": 0.3772023618221283, "rewards/accuracies": 0.96875, "rewards/chosen": 5.639416694641113, "rewards/margins": 3.9854512214660645, "rewards/rejected": 1.6539649963378906, "step": 15, "train_speed(iter/s)": 0.003905 }, { "epoch": 0.4025157232704403, "grad_norm": 2.065215826034546, "learning_rate": 9.371733080722911e-05, "logits/chosen": -2.0726945400238037, "logits/rejected": -2.0517024993896484, "logps/chosen": -38.782867431640625, "logps/rejected": -141.28872680664062, "loss": 0.34540715217590334, "memory(GiB)": 284.39, "nll_loss": 0.24602404236793518, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 7.165956020355225, "rewards/margins": 7.171680450439453, "rewards/rejected": -0.005724119953811169, "step": 20, "train_speed(iter/s)": 0.003816 }, { "epoch": 0.5031446540880503, "grad_norm": 0.9638963937759399, "learning_rate": 8.90162395476046e-05, "logits/chosen": -2.205498456954956, "logits/rejected": -2.182650089263916, "logps/chosen": -34.5748405456543, "logps/rejected": -168.0699462890625, "loss": 0.29475107192993166, "memory(GiB)": 284.39, "nll_loss": 0.2225954234600067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 7.66351842880249, "rewards/margins": 9.37935733795166, "rewards/rejected": -1.7158397436141968, "step": 25, "train_speed(iter/s)": 0.00375 }, { "epoch": 0.6037735849056604, "grad_norm": 0.5872039794921875, "learning_rate": 8.320473013836196e-05, "logits/chosen": -2.2474639415740967, "logits/rejected": -2.2216179370880127, "logps/chosen": -23.524024963378906, "logps/rejected": -159.84942626953125, "loss": 0.23147854804992676, "memory(GiB)": 284.39, "nll_loss": 0.18826261162757874, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 8.869623184204102, "rewards/margins": 10.225828170776367, "rewards/rejected": -1.3562055826187134, "step": 30, "train_speed(iter/s)": 0.003835 }, { "epoch": 0.7044025157232704, "grad_norm": 0.8212366700172424, "learning_rate": 7.644820051634812e-05, "logits/chosen": -2.2804150581359863, "logits/rejected": -2.2608768939971924, "logps/chosen": -20.996126174926758, "logps/rejected": -161.36029052734375, "loss": 0.1881607413291931, "memory(GiB)": 284.39, "nll_loss": 0.13474711775779724, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 9.420888900756836, "rewards/margins": 10.558382987976074, "rewards/rejected": -1.137495517730713, "step": 35, "train_speed(iter/s)": 0.003906 }, { "epoch": 0.8050314465408805, "grad_norm": 0.9303659200668335, "learning_rate": 6.89389442805288e-05, "logits/chosen": -2.2562363147735596, "logits/rejected": -2.2325804233551025, "logps/chosen": -26.601587295532227, "logps/rejected": -155.21389770507812, "loss": 0.21106297969818116, "memory(GiB)": 284.39, "nll_loss": 0.15431135892868042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 9.138971328735352, "rewards/margins": 9.48228931427002, "rewards/rejected": -0.34331730008125305, "step": 40, "train_speed(iter/s)": 0.003892 }, { "epoch": 0.9056603773584906, "grad_norm": 0.8759572505950928, "learning_rate": 6.0890677937442574e-05, "logits/chosen": -2.2504515647888184, "logits/rejected": -2.236832618713379, "logps/chosen": -24.932228088378906, "logps/rejected": -150.9632110595703, "loss": 0.21578831672668458, "memory(GiB)": 284.39, "nll_loss": 0.1573367863893509, "rewards/accuracies": 1.0, "rewards/chosen": 8.940786361694336, "rewards/margins": 9.429086685180664, "rewards/rejected": -0.488300621509552, "step": 45, "train_speed(iter/s)": 0.003836 }, { "epoch": 1.020125786163522, "grad_norm": 1.6238784790039062, "learning_rate": 5.2532458441935636e-05, "logits/chosen": -2.3447046279907227, "logits/rejected": -2.316112995147705, "logps/chosen": -17.97600746154785, "logps/rejected": -169.5856475830078, "loss": 0.1865710735321045, "memory(GiB)": 284.39, "nll_loss": 0.1159815713763237, "rewards/accuracies": 0.9767441749572754, "rewards/chosen": 9.204967498779297, "rewards/margins": 11.411535263061523, "rewards/rejected": -2.2065672874450684, "step": 50, "train_speed(iter/s)": 0.003798 }, { "epoch": 1.020125786163522, "eval_logits/chosen": -2.462606191635132, "eval_logits/rejected": -2.437251091003418, "eval_logps/chosen": -19.061992645263672, "eval_logps/rejected": -184.38104248046875, "eval_loss": 0.1830219328403473, "eval_nll_loss": 0.17293420433998108, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 8.904085159301758, "eval_rewards/margins": 12.307174682617188, "eval_rewards/rejected": -3.4030885696411133, "eval_runtime": 55.6446, "eval_samples_per_second": 0.288, "eval_steps_per_second": 0.144, "step": 50 }, { "epoch": 1.120754716981132, "grad_norm": 0.5176746249198914, "learning_rate": 4.410216414245771e-05, "logits/chosen": -2.3740134239196777, "logits/rejected": -2.3573694229125977, "logps/chosen": -26.2227840423584, "logps/rejected": -179.9822540283203, "loss": 0.19258421659469604, "memory(GiB)": 284.39, "nll_loss": 0.17000555992126465, "rewards/accuracies": 1.0, "rewards/chosen": 8.813023567199707, "rewards/margins": 11.842904090881348, "rewards/rejected": -3.029881715774536, "step": 55, "train_speed(iter/s)": 0.00372 }, { "epoch": 1.221383647798742, "grad_norm": 0.6022250056266785, "learning_rate": 3.58397246658848e-05, "logits/chosen": -2.4972939491271973, "logits/rejected": -2.4699082374572754, "logps/chosen": -14.000228881835938, "logps/rejected": -196.9097442626953, "loss": 0.10635790824890137, "memory(GiB)": 284.39, "nll_loss": 0.08761530369520187, "rewards/accuracies": 1.0, "rewards/chosen": 9.721292495727539, "rewards/margins": 14.493858337402344, "rewards/rejected": -4.772566795349121, "step": 60, "train_speed(iter/s)": 0.003743 }, { "epoch": 1.3220125786163521, "grad_norm": 0.2501760721206665, "learning_rate": 2.798029242211828e-05, "logits/chosen": -2.5347957611083984, "logits/rejected": -2.50445818901062, "logps/chosen": -23.887548446655273, "logps/rejected": -183.65591430664062, "loss": 0.18030774593353271, "memory(GiB)": 284.39, "nll_loss": 0.14212127029895782, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 9.50097370147705, "rewards/margins": 12.938058853149414, "rewards/rejected": -3.4370861053466797, "step": 65, "train_speed(iter/s)": 0.003757 }, { "epoch": 1.4226415094339622, "grad_norm": 0.42134493589401245, "learning_rate": 2.074755007023461e-05, "logits/chosen": -2.5006675720214844, "logits/rejected": -2.478884220123291, "logps/chosen": -12.177281379699707, "logps/rejected": -190.2030487060547, "loss": 0.09010829329490662, "memory(GiB)": 284.39, "nll_loss": 0.07332514226436615, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 10.052157402038574, "rewards/margins": 13.963111877441406, "rewards/rejected": -3.910953998565674, "step": 70, "train_speed(iter/s)": 0.003774 }, { "epoch": 1.5232704402515722, "grad_norm": 0.5933993458747864, "learning_rate": 1.434734441843899e-05, "logits/chosen": -2.502887487411499, "logits/rejected": -2.486396551132202, "logps/chosen": -18.57794189453125, "logps/rejected": -170.333740234375, "loss": 0.13938431739807128, "memory(GiB)": 284.39, "nll_loss": 0.11240720748901367, "rewards/accuracies": 1.0, "rewards/chosen": 9.312703132629395, "rewards/margins": 11.638362884521484, "rewards/rejected": -2.325660467147827, "step": 75, "train_speed(iter/s)": 0.00376 }, { "epoch": 1.6238993710691823, "grad_norm": 0.2634561061859131, "learning_rate": 8.961827939636196e-06, "logits/chosen": -2.5577776432037354, "logits/rejected": -2.5379796028137207, "logps/chosen": -16.603967666625977, "logps/rejected": -171.06466674804688, "loss": 0.10857141017913818, "memory(GiB)": 284.39, "nll_loss": 0.09158992022275925, "rewards/accuracies": 1.0, "rewards/chosen": 9.913006782531738, "rewards/margins": 12.598286628723145, "rewards/rejected": -2.685279369354248, "step": 80, "train_speed(iter/s)": 0.003778 }, { "epoch": 1.7245283018867923, "grad_norm": 0.3385748267173767, "learning_rate": 4.744274637483936e-06, "logits/chosen": -2.562164783477783, "logits/rejected": -2.5376689434051514, "logps/chosen": -14.094012260437012, "logps/rejected": -163.73416137695312, "loss": 0.11240246295928955, "memory(GiB)": 284.39, "nll_loss": 0.09068052470684052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 9.352496147155762, "rewards/margins": 11.771881103515625, "rewards/rejected": -2.419384479522705, "step": 85, "train_speed(iter/s)": 0.003777 }, { "epoch": 1.8251572327044026, "grad_norm": 0.3210693895816803, "learning_rate": 1.8147178055029579e-06, "logits/chosen": -2.602306842803955, "logits/rejected": -2.567457675933838, "logps/chosen": -17.956844329833984, "logps/rejected": -175.5157470703125, "loss": 0.11938213109970093, "memory(GiB)": 284.39, "nll_loss": 0.09758913516998291, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 9.622145652770996, "rewards/margins": 12.495707511901855, "rewards/rejected": -2.873561382293701, "step": 90, "train_speed(iter/s)": 0.003796 }, { "epoch": 1.9257861635220126, "grad_norm": 0.3329070210456848, "learning_rate": 2.5653383040524227e-07, "logits/chosen": -2.591177463531494, "logits/rejected": -2.568394422531128, "logps/chosen": -17.04227638244629, "logps/rejected": -188.1129913330078, "loss": 0.11813113689422608, "memory(GiB)": 284.39, "nll_loss": 0.10378739982843399, "rewards/accuracies": 1.0, "rewards/chosen": 9.477154731750488, "rewards/margins": 13.704524040222168, "rewards/rejected": -4.227367877960205, "step": 95, "train_speed(iter/s)": 0.003803 }, { "epoch": 1.9861635220125786, "eval_logits/chosen": -2.6464767456054688, "eval_logits/rejected": -2.6153650283813477, "eval_logps/chosen": -15.376700401306152, "eval_logps/rejected": -193.30332946777344, "eval_loss": 0.14128434658050537, "eval_nll_loss": 0.14024823904037476, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": 9.272613525390625, "eval_rewards/margins": 13.567930221557617, "eval_rewards/rejected": -4.295315742492676, "eval_runtime": 55.5933, "eval_samples_per_second": 0.288, "eval_steps_per_second": 0.144, "step": 98 } ], "logging_steps": 5, "max_steps": 98, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.261229460544324e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }