{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996332966629996, "eval_steps": 50, "global_step": 1363, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018335166850018333, "grad_norm": 107.5, "learning_rate": 4.911958914159941e-07, "logits/chosen": -1.3776370286941528, "logits/rejected": -1.2306550741195679, "logps/chosen": -411.12249755859375, "logps/rejected": -369.989990234375, "loss": 0.6907, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": 0.0027563476469367743, "rewards/margins": 0.00781173724681139, "rewards/rejected": -0.0050407410599291325, "step": 25 }, { "epoch": 0.03667033370003667, "grad_norm": 100.5, "learning_rate": 4.820249449743213e-07, "logits/chosen": -1.514649510383606, "logits/rejected": NaN, "logps/chosen": -430.3500061035156, "logps/rejected": -395.49749755859375, "loss": 0.6956, "rewards/accuracies": 0.24250000715255737, "rewards/chosen": 0.004258117638528347, "rewards/margins": 0.0015393065987154841, "rewards/rejected": 0.002720947377383709, "step": 50 }, { "epoch": 0.03667033370003667, "eval_logits/chosen": -1.534035086631775, "eval_logits/rejected": -1.435206413269043, "eval_logps/chosen": -377.2894592285156, "eval_logps/rejected": -350.399658203125, "eval_loss": 0.6936107873916626, "eval_rewards/accuracies": 0.23519736528396606, "eval_rewards/chosen": 0.0011091734049841762, "eval_rewards/margins": 0.005430823657661676, "eval_rewards/rejected": -0.00432586669921875, "eval_runtime": 183.6723, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.828, "step": 50 }, { "epoch": 0.05500550055005501, "grad_norm": 125.5, "learning_rate": 4.7285399853264857e-07, "logits/chosen": -1.5126913785934448, "logits/rejected": -1.361869215965271, "logps/chosen": -440.635009765625, "logps/rejected": -423.7550048828125, "loss": 0.7034, "rewards/accuracies": 0.1899999976158142, "rewards/chosen": 0.001637954730540514, "rewards/margins": -0.015832213684916496, "rewards/rejected": 0.01747741736471653, "step": 75 }, { "epoch": 0.07334066740007333, "grad_norm": 102.0, "learning_rate": 4.636830520909757e-07, "logits/chosen": -1.4500524997711182, "logits/rejected": -1.3146843910217285, "logps/chosen": -444.37249755859375, "logps/rejected": -417.4649963378906, "loss": 0.6997, "rewards/accuracies": 0.2549999952316284, "rewards/chosen": -0.0015260315267369151, "rewards/margins": -0.005664672702550888, "rewards/rejected": 0.004122619517147541, "step": 100 }, { "epoch": 0.07334066740007333, "eval_logits/chosen": -1.533652901649475, "eval_logits/rejected": -1.4345542192459106, "eval_logps/chosen": -377.2927551269531, "eval_logps/rejected": -350.2820739746094, "eval_loss": 0.6919476985931396, "eval_rewards/accuracies": 0.23766447603702545, "eval_rewards/chosen": 0.008811799809336662, "eval_rewards/margins": 0.007162897381931543, "eval_rewards/rejected": 0.0016679262043908238, "eval_runtime": 183.6267, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 100 }, { "epoch": 0.09167583425009168, "grad_norm": 101.0, "learning_rate": 4.54512105649303e-07, "logits/chosen": -1.429406762123108, "logits/rejected": NaN, "logps/chosen": -436.73748779296875, "logps/rejected": -403.6050109863281, "loss": 0.6898, "rewards/accuracies": 0.2549999952316284, "rewards/chosen": 0.01620025560259819, "rewards/margins": 0.012928924523293972, "rewards/rejected": 0.0032881165388971567, "step": 125 }, { "epoch": 0.11001100110011001, "grad_norm": 108.5, "learning_rate": 4.4534115920763023e-07, "logits/chosen": -1.4991015195846558, "logits/rejected": -1.323161005973816, "logps/chosen": -458.50750732421875, "logps/rejected": -410.1050109863281, "loss": 0.6872, "rewards/accuracies": 0.2775000035762787, "rewards/chosen": 0.017397155985236168, "rewards/margins": 0.019593505188822746, "rewards/rejected": -0.0021939086727797985, "step": 150 }, { "epoch": 0.11001100110011001, "eval_logits/chosen": -1.5319759845733643, "eval_logits/rejected": -1.432373046875, "eval_logps/chosen": -377.319091796875, "eval_logps/rejected": -350.2779541015625, "eval_loss": 0.6953898668289185, "eval_rewards/accuracies": 0.23026315867900848, "eval_rewards/chosen": 0.006703075487166643, "eval_rewards/margins": 0.002716064453125, "eval_rewards/rejected": 0.003982142545282841, "eval_runtime": 183.6328, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 150 }, { "epoch": 0.12834616795012835, "grad_norm": 107.0, "learning_rate": 4.3617021276595744e-07, "logits/chosen": NaN, "logits/rejected": -1.2460485696792603, "logps/chosen": -432.5924987792969, "logps/rejected": -419.864990234375, "loss": 0.6915, "rewards/accuracies": 0.2775000035762787, "rewards/chosen": 0.02058563195168972, "rewards/margins": 0.010975646786391735, "rewards/rejected": 0.009614868089556694, "step": 175 }, { "epoch": 0.14668133480014667, "grad_norm": 108.5, "learning_rate": 4.2699926632428464e-07, "logits/chosen": -1.5161709785461426, "logits/rejected": -1.2818182706832886, "logps/chosen": -443.9725036621094, "logps/rejected": -402.0425109863281, "loss": 0.6887, "rewards/accuracies": 0.2574999928474426, "rewards/chosen": 0.023589782416820526, "rewards/margins": 0.015720978379249573, "rewards/rejected": 0.007876587100327015, "step": 200 }, { "epoch": 0.14668133480014667, "eval_logits/chosen": -1.529746651649475, "eval_logits/rejected": -1.430788516998291, "eval_logps/chosen": -377.256591796875, "eval_logps/rejected": -350.24835205078125, "eval_loss": 0.6939330697059631, "eval_rewards/accuracies": 0.23273026943206787, "eval_rewards/chosen": 0.010709461756050587, "eval_rewards/margins": 0.003307191887870431, "eval_rewards/rejected": 0.007396697998046875, "eval_runtime": 183.6754, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.828, "step": 200 }, { "epoch": 0.16501650165016502, "grad_norm": 88.5, "learning_rate": 4.1782831988261185e-07, "logits/chosen": -1.4257241487503052, "logits/rejected": -1.1836668252944946, "logps/chosen": -410.3500061035156, "logps/rejected": -390.2325134277344, "loss": 0.7044, "rewards/accuracies": 0.23000000417232513, "rewards/chosen": -0.004284057766199112, "rewards/margins": -0.016582336276769638, "rewards/rejected": 0.012289123609662056, "step": 225 }, { "epoch": 0.18335166850018336, "grad_norm": 96.5, "learning_rate": 4.086573734409391e-07, "logits/chosen": -1.4944552183151245, "logits/rejected": -1.2437607049942017, "logps/chosen": -431.1650085449219, "logps/rejected": -407.6449890136719, "loss": 0.692, "rewards/accuracies": 0.2574999928474426, "rewards/chosen": 0.02587219327688217, "rewards/margins": 0.011192931793630123, "rewards/rejected": 0.01466323807835579, "step": 250 }, { "epoch": 0.18335166850018336, "eval_logits/chosen": -1.5293899774551392, "eval_logits/rejected": -1.4297887086868286, "eval_logps/chosen": -377.2006530761719, "eval_logps/rejected": -350.2894592285156, "eval_loss": 0.6944616436958313, "eval_rewards/accuracies": 0.23026315867900848, "eval_rewards/chosen": 0.012256572023034096, "eval_rewards/margins": 0.004177294205874205, "eval_rewards/rejected": 0.00807877629995346, "eval_runtime": 183.6704, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.828, "step": 250 }, { "epoch": 0.20168683535020168, "grad_norm": 113.0, "learning_rate": 3.994864269992663e-07, "logits/chosen": -1.4493054151535034, "logits/rejected": -1.3403005599975586, "logps/chosen": -430.05999755859375, "logps/rejected": -410.04998779296875, "loss": 0.699, "rewards/accuracies": 0.2775000035762787, "rewards/chosen": 0.016153564676642418, "rewards/margins": -0.0009202575893141329, "rewards/rejected": 0.01705078035593033, "step": 275 }, { "epoch": 0.22002200220022003, "grad_norm": 130.0, "learning_rate": 3.903154805575935e-07, "logits/chosen": -1.3159887790679932, "logits/rejected": NaN, "logps/chosen": -428.38751220703125, "logps/rejected": -422.7925109863281, "loss": 0.6976, "rewards/accuracies": 0.24250000715255737, "rewards/chosen": 0.01615051180124283, "rewards/margins": -0.00038818360189907253, "rewards/rejected": 0.016550598666071892, "step": 300 }, { "epoch": 0.22002200220022003, "eval_logits/chosen": -1.528660774230957, "eval_logits/rejected": -1.4295405149459839, "eval_logps/chosen": -377.2154541015625, "eval_logps/rejected": -350.28125, "eval_loss": 0.6940748691558838, "eval_rewards/accuracies": 0.2409539520740509, "eval_rewards/chosen": 0.010566108860075474, "eval_rewards/margins": 0.005987267941236496, "eval_rewards/rejected": 0.00458752503618598, "eval_runtime": 183.6126, "eval_samples_per_second": 6.601, "eval_steps_per_second": 0.828, "step": 300 }, { "epoch": 0.23835716905023835, "grad_norm": 84.5, "learning_rate": 3.811445341159207e-07, "logits/chosen": -1.5210723876953125, "logits/rejected": -1.3567346334457397, "logps/chosen": -407.4012451171875, "logps/rejected": -392.4549865722656, "loss": 0.6985, "rewards/accuracies": 0.25999999046325684, "rewards/chosen": 0.013754882849752903, "rewards/margins": -0.0055097960866987705, "rewards/rejected": 0.01927383430302143, "step": 325 }, { "epoch": 0.2566923359002567, "grad_norm": 119.5, "learning_rate": 3.7197358767424797e-07, "logits/chosen": NaN, "logits/rejected": -1.246942162513733, "logps/chosen": -436.572509765625, "logps/rejected": -423.2974853515625, "loss": 0.6926, "rewards/accuracies": 0.25999999046325684, "rewards/chosen": 0.013691024854779243, "rewards/margins": 0.0036587524227797985, "rewards/rejected": 0.01002708449959755, "step": 350 }, { "epoch": 0.2566923359002567, "eval_logits/chosen": -1.5272730588912964, "eval_logits/rejected": -1.4281728267669678, "eval_logps/chosen": -377.25, "eval_logps/rejected": -350.162841796875, "eval_loss": 0.6958668231964111, "eval_rewards/accuracies": 0.22615131735801697, "eval_rewards/chosen": 0.013866023160517216, "eval_rewards/margins": 0.00107443961314857, "eval_rewards/rejected": 0.012793390080332756, "eval_runtime": 183.7079, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 350 }, { "epoch": 0.27502750275027504, "grad_norm": 107.5, "learning_rate": 3.6280264123257523e-07, "logits/chosen": -1.4459222555160522, "logits/rejected": -1.3630120754241943, "logps/chosen": -455.4324951171875, "logps/rejected": -415.572509765625, "loss": 0.6992, "rewards/accuracies": 0.2150000035762787, "rewards/chosen": 0.016205139458179474, "rewards/margins": -0.0074514769949018955, "rewards/rejected": 0.02364654466509819, "step": 375 }, { "epoch": 0.29336266960029334, "grad_norm": 111.5, "learning_rate": 3.536316947909024e-07, "logits/chosen": -1.4444499015808105, "logits/rejected": -1.2767553329467773, "logps/chosen": -425.5450134277344, "logps/rejected": -379.00250244140625, "loss": 0.6962, "rewards/accuracies": 0.24500000476837158, "rewards/chosen": 0.01742446981370449, "rewards/margins": -0.0001217651370097883, "rewards/rejected": 0.01754501275718212, "step": 400 }, { "epoch": 0.29336266960029334, "eval_logits/chosen": -1.5267494916915894, "eval_logits/rejected": -1.4275885820388794, "eval_logps/chosen": -377.2302551269531, "eval_logps/rejected": -350.2368469238281, "eval_loss": 0.6963567733764648, "eval_rewards/accuracies": 0.23848684132099152, "eval_rewards/chosen": 0.012201008386909962, "eval_rewards/margins": 0.0003203341912012547, "eval_rewards/rejected": 0.011873997747898102, "eval_runtime": 183.6474, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 400 }, { "epoch": 0.3116978364503117, "grad_norm": 90.5, "learning_rate": 3.4446074834922964e-07, "logits/chosen": NaN, "logits/rejected": -1.3805227279663086, "logps/chosen": -404.4849853515625, "logps/rejected": -390.4962463378906, "loss": 0.6861, "rewards/accuracies": 0.27250000834465027, "rewards/chosen": 0.022739257663488388, "rewards/margins": 0.018987426534295082, "rewards/rejected": 0.003755493089556694, "step": 425 }, { "epoch": 0.33003300330033003, "grad_norm": 108.0, "learning_rate": 3.3528980190755684e-07, "logits/chosen": -1.461000919342041, "logits/rejected": -1.2780396938323975, "logps/chosen": -414.50750732421875, "logps/rejected": -408.3500061035156, "loss": 0.6926, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.018767546862363815, "rewards/margins": 0.005626831203699112, "rewards/rejected": 0.01314392127096653, "step": 450 }, { "epoch": 0.33003300330033003, "eval_logits/chosen": -1.5271477699279785, "eval_logits/rejected": -1.4275585412979126, "eval_logps/chosen": -377.1217041015625, "eval_logps/rejected": -350.1554260253906, "eval_loss": 0.6922184228897095, "eval_rewards/accuracies": 0.24671052396297455, "eval_rewards/chosen": 0.018702909350395203, "eval_rewards/margins": 0.007914392277598381, "eval_rewards/rejected": 0.010786859318614006, "eval_runtime": 183.7247, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 450 }, { "epoch": 0.3483681701503484, "grad_norm": 125.5, "learning_rate": 3.261188554658841e-07, "logits/chosen": -1.432356595993042, "logits/rejected": -1.2240395545959473, "logps/chosen": -438.5174865722656, "logps/rejected": -409.4549865722656, "loss": 0.7048, "rewards/accuracies": 0.2199999988079071, "rewards/chosen": 0.011979827657341957, "rewards/margins": -0.015614014118909836, "rewards/rejected": 0.02758941613137722, "step": 475 }, { "epoch": 0.3667033370003667, "grad_norm": 100.0, "learning_rate": 3.1694790902421125e-07, "logits/chosen": -1.4630835056304932, "logits/rejected": -1.3660200834274292, "logps/chosen": -431.2174987792969, "logps/rejected": -405.39373779296875, "loss": 0.6841, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.02687728963792324, "rewards/margins": 0.02634170465171337, "rewards/rejected": 0.0005162048619240522, "step": 500 }, { "epoch": 0.3667033370003667, "eval_logits/chosen": -1.5262771844863892, "eval_logits/rejected": -1.4270143508911133, "eval_logps/chosen": -377.21875, "eval_logps/rejected": -350.1620178222656, "eval_loss": 0.6944873929023743, "eval_rewards/accuracies": 0.23848684132099152, "eval_rewards/chosen": 0.014546644873917103, "eval_rewards/margins": 0.0022586018312722445, "eval_rewards/rejected": 0.012288796715438366, "eval_runtime": 183.6962, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 500 }, { "epoch": 0.385038503850385, "grad_norm": 114.5, "learning_rate": 3.077769625825385e-07, "logits/chosen": -1.4934594631195068, "logits/rejected": -1.316256046295166, "logps/chosen": -446.3924865722656, "logps/rejected": -413.6081237792969, "loss": 0.6988, "rewards/accuracies": 0.2524999976158142, "rewards/chosen": 0.017985381186008453, "rewards/margins": -0.003291168250143528, "rewards/rejected": 0.021276244893670082, "step": 525 }, { "epoch": 0.40337367070040336, "grad_norm": 88.5, "learning_rate": 2.986060161408657e-07, "logits/chosen": -1.3428008556365967, "logits/rejected": -1.264622688293457, "logps/chosen": -425.7149963378906, "logps/rejected": -417.7699890136719, "loss": 0.6939, "rewards/accuracies": 0.26499998569488525, "rewards/chosen": 0.021398009732365608, "rewards/margins": 0.007883605547249317, "rewards/rejected": 0.013519592583179474, "step": 550 }, { "epoch": 0.40337367070040336, "eval_logits/chosen": -1.5262964963912964, "eval_logits/rejected": -1.4273858070373535, "eval_logps/chosen": -377.16119384765625, "eval_logps/rejected": -350.23272705078125, "eval_loss": 0.6917800903320312, "eval_rewards/accuracies": 0.24588815867900848, "eval_rewards/chosen": 0.015943175181746483, "eval_rewards/margins": 0.008131027221679688, "eval_rewards/rejected": 0.007824345491826534, "eval_runtime": 183.6867, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 550 }, { "epoch": 0.4217088375504217, "grad_norm": 95.0, "learning_rate": 2.8943506969919296e-07, "logits/chosen": -1.4449292421340942, "logits/rejected": -1.2489904165267944, "logps/chosen": -434.9649963378906, "logps/rejected": -409.8125, "loss": 0.6943, "rewards/accuracies": 0.24250000715255737, "rewards/chosen": 0.02709350548684597, "rewards/margins": 0.005654601845890284, "rewards/rejected": 0.021434325724840164, "step": 575 }, { "epoch": 0.44004400440044006, "grad_norm": 137.0, "learning_rate": 2.8026412325752017e-07, "logits/chosen": -1.4576478004455566, "logits/rejected": -1.312269926071167, "logps/chosen": -417.552490234375, "logps/rejected": -382.11248779296875, "loss": 0.6943, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.02476959303021431, "rewards/margins": 0.0036778259091079235, "rewards/rejected": 0.021088866516947746, "step": 600 }, { "epoch": 0.44004400440044006, "eval_logits/chosen": -1.5249665975570679, "eval_logits/rejected": -1.4268261194229126, "eval_logps/chosen": -377.1414489746094, "eval_logps/rejected": -350.2212219238281, "eval_loss": 0.6920637488365173, "eval_rewards/accuracies": 0.2319078892469406, "eval_rewards/chosen": 0.018822118639945984, "eval_rewards/margins": 0.007972114719450474, "eval_rewards/rejected": 0.010848095640540123, "eval_runtime": 183.6918, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 600 }, { "epoch": 0.4583791712504584, "grad_norm": 113.0, "learning_rate": 2.7109317681584737e-07, "logits/chosen": -1.3854376077651978, "logits/rejected": NaN, "logps/chosen": -430.0950012207031, "logps/rejected": -378.32000732421875, "loss": 0.699, "rewards/accuracies": 0.22750000655651093, "rewards/chosen": 0.0017354583833366632, "rewards/margins": -0.00892486609518528, "rewards/rejected": 0.010651321150362492, "step": 625 }, { "epoch": 0.4767143381004767, "grad_norm": 90.5, "learning_rate": 2.619222303741746e-07, "logits/chosen": -1.5457714796066284, "logits/rejected": -1.3327239751815796, "logps/chosen": -443.5487365722656, "logps/rejected": -417.052490234375, "loss": 0.6899, "rewards/accuracies": 0.2574999928474426, "rewards/chosen": 0.02349899336695671, "rewards/margins": 0.01311645470559597, "rewards/rejected": 0.010387726128101349, "step": 650 }, { "epoch": 0.4767143381004767, "eval_logits/chosen": -1.5264860391616821, "eval_logits/rejected": -1.4273176193237305, "eval_logps/chosen": -377.1990051269531, "eval_logps/rejected": -350.26397705078125, "eval_loss": 0.6901170611381531, "eval_rewards/accuracies": 0.25986841320991516, "eval_rewards/chosen": 0.019354568794369698, "eval_rewards/margins": 0.013522299006581306, "eval_rewards/rejected": 0.005830463487654924, "eval_runtime": 183.7079, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 650 }, { "epoch": 0.49504950495049505, "grad_norm": 95.0, "learning_rate": 2.5275128393250183e-07, "logits/chosen": -1.4289679527282715, "logits/rejected": -1.2180871963500977, "logps/chosen": -425.9100036621094, "logps/rejected": -406.896240234375, "loss": 0.6923, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.030851593241095543, "rewards/margins": 0.008693389594554901, "rewards/rejected": 0.022180786356329918, "step": 675 }, { "epoch": 0.5133846718005134, "grad_norm": 112.0, "learning_rate": 2.4358033749082904e-07, "logits/chosen": -1.4609838724136353, "logits/rejected": -1.2298834323883057, "logps/chosen": -420.2749938964844, "logps/rejected": -415.197509765625, "loss": 0.6966, "rewards/accuracies": 0.26499998569488525, "rewards/chosen": 0.026063384488224983, "rewards/margins": 0.0011651611421257257, "rewards/rejected": 0.024905700236558914, "step": 700 }, { "epoch": 0.5133846718005134, "eval_logits/chosen": -1.5253231525421143, "eval_logits/rejected": -1.4264132976531982, "eval_logps/chosen": -377.131591796875, "eval_logps/rejected": -350.2458801269531, "eval_loss": 0.6932626962661743, "eval_rewards/accuracies": 0.2368421107530594, "eval_rewards/chosen": 0.019523821771144867, "eval_rewards/margins": 0.005818919278681278, "eval_rewards/rejected": 0.01370515301823616, "eval_runtime": 183.7031, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 700 }, { "epoch": 0.5317198386505317, "grad_norm": 88.0, "learning_rate": 2.3440939104915627e-07, "logits/chosen": -1.414352536201477, "logits/rejected": -1.2296130657196045, "logps/chosen": -406.614990234375, "logps/rejected": -388.9599914550781, "loss": 0.706, "rewards/accuracies": 0.23999999463558197, "rewards/chosen": 0.014584961347281933, "rewards/margins": -0.01819046027958393, "rewards/rejected": 0.032747648656368256, "step": 725 }, { "epoch": 0.5500550055005501, "grad_norm": 117.5, "learning_rate": 2.2523844460748347e-07, "logits/chosen": -1.6487542390823364, "logits/rejected": -1.505617380142212, "logps/chosen": -415.32000732421875, "logps/rejected": -388.822509765625, "loss": 0.6909, "rewards/accuracies": 0.27000001072883606, "rewards/chosen": 0.027718810364603996, "rewards/margins": 0.00900314375758171, "rewards/rejected": 0.01870529167354107, "step": 750 }, { "epoch": 0.5500550055005501, "eval_logits/chosen": -1.5252236127853394, "eval_logits/rejected": -1.4261868000030518, "eval_logps/chosen": -377.18585205078125, "eval_logps/rejected": -350.2220458984375, "eval_loss": 0.6894466876983643, "eval_rewards/accuracies": 0.24259868264198303, "eval_rewards/chosen": 0.019582247361540794, "eval_rewards/margins": 0.012999284081161022, "eval_rewards/rejected": 0.006589788943529129, "eval_runtime": 183.6443, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 750 }, { "epoch": 0.5683901723505684, "grad_norm": 100.5, "learning_rate": 2.160674981658107e-07, "logits/chosen": NaN, "logits/rejected": -1.316674828529358, "logps/chosen": -418.80499267578125, "logps/rejected": -402.65875244140625, "loss": 0.6904, "rewards/accuracies": 0.2574999928474426, "rewards/chosen": 0.03401367366313934, "rewards/margins": 0.013011474162340164, "rewards/rejected": 0.020986633375287056, "step": 775 }, { "epoch": 0.5867253392005867, "grad_norm": 88.5, "learning_rate": 2.0689655172413793e-07, "logits/chosen": -1.4807385206222534, "logits/rejected": -1.2883676290512085, "logps/chosen": -425.5325012207031, "logps/rejected": -403.2650146484375, "loss": 0.6893, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.02227478101849556, "rewards/margins": 0.012271422892808914, "rewards/rejected": 0.010012512095272541, "step": 800 }, { "epoch": 0.5867253392005867, "eval_logits/chosen": -1.5252236127853394, "eval_logits/rejected": -1.4258739948272705, "eval_logps/chosen": -377.2467041015625, "eval_logps/rejected": -350.1842041015625, "eval_loss": 0.6948097348213196, "eval_rewards/accuracies": 0.23355263471603394, "eval_rewards/chosen": 0.014182441867887974, "eval_rewards/margins": 0.002087994711473584, "eval_rewards/rejected": 0.012089378200471401, "eval_runtime": 183.6876, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 800 }, { "epoch": 0.605060506050605, "grad_norm": 80.5, "learning_rate": 1.9772560528246516e-07, "logits/chosen": -1.494598388671875, "logits/rejected": NaN, "logps/chosen": -427.4649963378906, "logps/rejected": -406.42498779296875, "loss": 0.6926, "rewards/accuracies": 0.23499999940395355, "rewards/chosen": 0.018895873799920082, "rewards/margins": 0.010480347089469433, "rewards/rejected": 0.008425445295870304, "step": 825 }, { "epoch": 0.6233956729006234, "grad_norm": 114.0, "learning_rate": 1.8855465884079237e-07, "logits/chosen": -1.546240210533142, "logits/rejected": -1.399204134941101, "logps/chosen": -416.7099914550781, "logps/rejected": -391.4775085449219, "loss": 0.6964, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.011256103403866291, "rewards/margins": 0.0009084320045076311, "rewards/rejected": 0.010347671806812286, "step": 850 }, { "epoch": 0.6233956729006234, "eval_logits/chosen": -1.524696707725525, "eval_logits/rejected": -1.4260004758834839, "eval_logps/chosen": -377.21875, "eval_logps/rejected": -350.2689208984375, "eval_loss": 0.6896400451660156, "eval_rewards/accuracies": 0.25740131735801697, "eval_rewards/chosen": 0.0202172938734293, "eval_rewards/margins": 0.012124011293053627, "eval_rewards/rejected": 0.008097749203443527, "eval_runtime": 183.7166, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 850 }, { "epoch": 0.6417308397506417, "grad_norm": 109.0, "learning_rate": 1.793837123991196e-07, "logits/chosen": -1.505387544631958, "logits/rejected": -1.3573604822158813, "logps/chosen": -424.7099914550781, "logps/rejected": -410.5162353515625, "loss": 0.7019, "rewards/accuracies": 0.23999999463558197, "rewards/chosen": 0.009785156697034836, "rewards/margins": -0.007974395528435707, "rewards/rejected": 0.017765656113624573, "step": 875 }, { "epoch": 0.6600660066006601, "grad_norm": 102.5, "learning_rate": 1.702127659574468e-07, "logits/chosen": -1.519402265548706, "logits/rejected": -1.3906365633010864, "logps/chosen": -425.7699890136719, "logps/rejected": -410.8399963378906, "loss": 0.6951, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.021494140848517418, "rewards/margins": 0.003243102924898267, "rewards/rejected": 0.018245697021484375, "step": 900 }, { "epoch": 0.6600660066006601, "eval_logits/chosen": -1.5251914262771606, "eval_logits/rejected": -1.42595636844635, "eval_logps/chosen": -377.2302551269531, "eval_logps/rejected": -350.2039489746094, "eval_loss": 0.6908003091812134, "eval_rewards/accuracies": 0.24424342811107635, "eval_rewards/chosen": 0.01913321577012539, "eval_rewards/margins": 0.010163256898522377, "eval_rewards/rejected": 0.008959268219769001, "eval_runtime": 183.6568, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.828, "step": 900 }, { "epoch": 0.6784011734506784, "grad_norm": 100.0, "learning_rate": 1.6104181951577403e-07, "logits/chosen": -1.548006534576416, "logits/rejected": -1.3452630043029785, "logps/chosen": -442.739990234375, "logps/rejected": -419.04217529296875, "loss": 0.696, "rewards/accuracies": 0.24500000476837158, "rewards/chosen": 0.027564391493797302, "rewards/margins": 0.0037604523822665215, "rewards/rejected": 0.0237899012863636, "step": 925 }, { "epoch": 0.6967363403006968, "grad_norm": 103.0, "learning_rate": 1.5187087307410123e-07, "logits/chosen": -1.4493930339813232, "logits/rejected": NaN, "logps/chosen": -446.68499755859375, "logps/rejected": -405.67999267578125, "loss": 0.6963, "rewards/accuracies": 0.24250000715255737, "rewards/chosen": 0.028537597507238388, "rewards/margins": 0.0007598876836709678, "rewards/rejected": 0.027764510363340378, "step": 950 }, { "epoch": 0.6967363403006968, "eval_logits/chosen": -1.5243659019470215, "eval_logits/rejected": -1.4253950119018555, "eval_logps/chosen": -377.1759948730469, "eval_logps/rejected": -350.2154541015625, "eval_loss": 0.693004846572876, "eval_rewards/accuracies": 0.24013157188892365, "eval_rewards/chosen": 0.018045425415039062, "eval_rewards/margins": 0.00674940412864089, "eval_rewards/rejected": 0.011289797723293304, "eval_runtime": 183.7309, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 950 }, { "epoch": 0.7150715071507151, "grad_norm": 98.5, "learning_rate": 1.4269992663242846e-07, "logits/chosen": -1.6109237670898438, "logits/rejected": -1.3813133239746094, "logps/chosen": -421.8275146484375, "logps/rejected": -401.74249267578125, "loss": 0.6906, "rewards/accuracies": 0.22750000655651093, "rewards/chosen": 0.019305266439914703, "rewards/margins": 0.009662169963121414, "rewards/rejected": 0.009637146256864071, "step": 975 }, { "epoch": 0.7334066740007334, "grad_norm": 87.5, "learning_rate": 1.3352898019075567e-07, "logits/chosen": -1.4307568073272705, "logits/rejected": -1.2897155284881592, "logps/chosen": -428.614990234375, "logps/rejected": -416.0874938964844, "loss": 0.6913, "rewards/accuracies": 0.26750001311302185, "rewards/chosen": 0.027062682434916496, "rewards/margins": 0.010381927713751793, "rewards/rejected": 0.016668854281306267, "step": 1000 }, { "epoch": 0.7334066740007334, "eval_logits/chosen": -1.5244911909103394, "eval_logits/rejected": -1.4257001876831055, "eval_logps/chosen": -377.1414489746094, "eval_logps/rejected": -350.1036071777344, "eval_loss": 0.6934689283370972, "eval_rewards/accuracies": 0.22944079339504242, "eval_rewards/chosen": 0.023268749937415123, "eval_rewards/margins": 0.004975419491529465, "eval_rewards/rejected": 0.018304072320461273, "eval_runtime": 183.7804, "eval_samples_per_second": 6.595, "eval_steps_per_second": 0.827, "step": 1000 }, { "epoch": 0.7517418408507518, "grad_norm": 108.5, "learning_rate": 1.243580337490829e-07, "logits/chosen": -1.545839786529541, "logits/rejected": -1.3644452095031738, "logps/chosen": -423.6575012207031, "logps/rejected": -404.2225036621094, "loss": 0.6925, "rewards/accuracies": 0.27250000834465027, "rewards/chosen": 0.030840760096907616, "rewards/margins": 0.007886047475039959, "rewards/rejected": 0.02297058142721653, "step": 1025 }, { "epoch": 0.77007700770077, "grad_norm": 84.5, "learning_rate": 1.1518708730741012e-07, "logits/chosen": NaN, "logits/rejected": -1.2828707695007324, "logps/chosen": -411.5050048828125, "logps/rejected": -384.7875061035156, "loss": 0.6972, "rewards/accuracies": 0.26499998569488525, "rewards/chosen": 0.010768127627670765, "rewards/margins": -0.001839599572122097, "rewards/rejected": 0.012620086781680584, "step": 1050 }, { "epoch": 0.77007700770077, "eval_logits/chosen": -1.5248092412948608, "eval_logits/rejected": -1.4259984493255615, "eval_logps/chosen": -377.1019592285156, "eval_logps/rejected": -350.2105407714844, "eval_loss": 0.6938557028770447, "eval_rewards/accuracies": 0.25082236528396606, "eval_rewards/chosen": 0.020134273916482925, "eval_rewards/margins": 0.004903642926365137, "eval_rewards/rejected": 0.015223653987050056, "eval_runtime": 183.6531, "eval_samples_per_second": 6.599, "eval_steps_per_second": 0.828, "step": 1050 }, { "epoch": 0.7884121745507884, "grad_norm": 105.0, "learning_rate": 1.0601614086573733e-07, "logits/chosen": -1.4120160341262817, "logits/rejected": -1.2870084047317505, "logps/chosen": -426.989990234375, "logps/rejected": -409.2774963378906, "loss": 0.6944, "rewards/accuracies": 0.27000001072883606, "rewards/chosen": 0.02048644982278347, "rewards/margins": 0.004820861853659153, "rewards/rejected": 0.01568496786057949, "step": 1075 }, { "epoch": 0.8067473414008067, "grad_norm": 97.5, "learning_rate": 9.684519442406455e-08, "logits/chosen": -1.5207568407058716, "logits/rejected": -1.271683931350708, "logps/chosen": -412.1675109863281, "logps/rejected": -380.9200134277344, "loss": 0.7016, "rewards/accuracies": 0.25999999046325684, "rewards/chosen": 0.016521912068128586, "rewards/margins": -0.011280059814453125, "rewards/rejected": 0.02781723067164421, "step": 1100 }, { "epoch": 0.8067473414008067, "eval_logits/chosen": -1.524436593055725, "eval_logits/rejected": -1.4248079061508179, "eval_logps/chosen": -377.15130615234375, "eval_logps/rejected": -350.2006530761719, "eval_loss": 0.6940104365348816, "eval_rewards/accuracies": 0.22861842811107635, "eval_rewards/chosen": 0.018003061413764954, "eval_rewards/margins": 0.002626720117405057, "eval_rewards/rejected": 0.01539022009819746, "eval_runtime": 183.6449, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 1100 }, { "epoch": 0.8250825082508251, "grad_norm": 105.0, "learning_rate": 8.767424798239178e-08, "logits/chosen": -1.534997582435608, "logits/rejected": NaN, "logps/chosen": -407.0662536621094, "logps/rejected": -381.228759765625, "loss": 0.6907, "rewards/accuracies": 0.2574999928474426, "rewards/chosen": 0.02077072113752365, "rewards/margins": 0.011860962025821209, "rewards/rejected": 0.008914489299058914, "step": 1125 }, { "epoch": 0.8434176751008434, "grad_norm": 118.5, "learning_rate": 7.850330154071901e-08, "logits/chosen": -1.448728084564209, "logits/rejected": -1.2549041509628296, "logps/chosen": -424.74749755859375, "logps/rejected": -409.44500732421875, "loss": 0.6998, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": 0.016925200819969177, "rewards/margins": -0.007094268687069416, "rewards/rejected": 0.02402496337890625, "step": 1150 }, { "epoch": 0.8434176751008434, "eval_logits/chosen": -1.5245393514633179, "eval_logits/rejected": -1.4257924556732178, "eval_logps/chosen": -377.2730407714844, "eval_logps/rejected": -350.22039794921875, "eval_loss": 0.6929017305374146, "eval_rewards/accuracies": 0.23026315867900848, "eval_rewards/chosen": 0.01627480424940586, "eval_rewards/margins": 0.005090211518108845, "eval_rewards/rejected": 0.011195835657417774, "eval_runtime": 183.7208, "eval_samples_per_second": 6.597, "eval_steps_per_second": 0.827, "step": 1150 }, { "epoch": 0.8617528419508618, "grad_norm": 94.0, "learning_rate": 6.933235509904623e-08, "logits/chosen": -1.643198847770691, "logits/rejected": -1.4097143411636353, "logps/chosen": -416.32061767578125, "logps/rejected": -420.0243835449219, "loss": 0.701, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.019934996962547302, "rewards/margins": -0.004024200607091188, "rewards/rejected": 0.0239674374461174, "step": 1175 }, { "epoch": 0.8800880088008801, "grad_norm": 113.0, "learning_rate": 6.016140865737343e-08, "logits/chosen": -1.4947460889816284, "logits/rejected": -1.2584409713745117, "logps/chosen": -406.197509765625, "logps/rejected": -374.1000061035156, "loss": 0.6912, "rewards/accuracies": 0.29249998927116394, "rewards/chosen": 0.023181457072496414, "rewards/margins": 0.013177642598748207, "rewards/rejected": 0.010010071098804474, "step": 1200 }, { "epoch": 0.8800880088008801, "eval_logits/chosen": -1.5244590044021606, "eval_logits/rejected": -1.4259113073349, "eval_logps/chosen": -377.118408203125, "eval_logps/rejected": -350.1998291015625, "eval_loss": 0.6906456351280212, "eval_rewards/accuracies": 0.2409539520740509, "eval_rewards/chosen": 0.019954681396484375, "eval_rewards/margins": 0.011732828803360462, "eval_rewards/rejected": 0.008238466456532478, "eval_runtime": 183.8199, "eval_samples_per_second": 6.593, "eval_steps_per_second": 0.827, "step": 1200 }, { "epoch": 0.8984231756508985, "grad_norm": 108.0, "learning_rate": 5.0990462215700656e-08, "logits/chosen": -1.4646357297897339, "logits/rejected": -1.2686426639556885, "logps/chosen": -419.42498779296875, "logps/rejected": -413.2925109863281, "loss": 0.6974, "rewards/accuracies": 0.2175000011920929, "rewards/chosen": 0.01217727642506361, "rewards/margins": -0.0036750794388353825, "rewards/rejected": 0.015859374776482582, "step": 1225 }, { "epoch": 0.9167583425009168, "grad_norm": 125.0, "learning_rate": 4.181951577402787e-08, "logits/chosen": -1.3645446300506592, "logits/rejected": -1.2459040880203247, "logps/chosen": -433.9324951171875, "logps/rejected": -397.8374938964844, "loss": 0.6948, "rewards/accuracies": 0.30250000953674316, "rewards/chosen": 0.023784179240465164, "rewards/margins": 0.004344787448644638, "rewards/rejected": 0.019432831555604935, "step": 1250 }, { "epoch": 0.9167583425009168, "eval_logits/chosen": -1.5248445272445679, "eval_logits/rejected": -1.4260269403457642, "eval_logps/chosen": -377.15460205078125, "eval_logps/rejected": -350.25494384765625, "eval_loss": 0.6929919719696045, "eval_rewards/accuracies": 0.24259868264198303, "eval_rewards/chosen": 0.0216668788343668, "eval_rewards/margins": 0.0056954436004161835, "eval_rewards/rejected": 0.01596139557659626, "eval_runtime": 183.7346, "eval_samples_per_second": 6.596, "eval_steps_per_second": 0.827, "step": 1250 }, { "epoch": 0.935093509350935, "grad_norm": 103.0, "learning_rate": 3.26485693323551e-08, "logits/chosen": -1.543642520904541, "logits/rejected": -1.4146960973739624, "logps/chosen": -432.8800048828125, "logps/rejected": -407.4825134277344, "loss": 0.6917, "rewards/accuracies": 0.27000001072883606, "rewards/chosen": 0.02718307450413704, "rewards/margins": 0.008900909684598446, "rewards/rejected": 0.01827133260667324, "step": 1275 }, { "epoch": 0.9534286762009534, "grad_norm": 107.0, "learning_rate": 2.3477622890682317e-08, "logits/chosen": -1.4744700193405151, "logits/rejected": NaN, "logps/chosen": -432.228759765625, "logps/rejected": -398.2674865722656, "loss": 0.6905, "rewards/accuracies": 0.2775000035762787, "rewards/chosen": 0.029524916782975197, "rewards/margins": 0.01431709248572588, "rewards/rejected": 0.015177459456026554, "step": 1300 }, { "epoch": 0.9534286762009534, "eval_logits/chosen": -1.52471923828125, "eval_logits/rejected": -1.4258816242218018, "eval_logps/chosen": -377.1759948730469, "eval_logps/rejected": -350.13568115234375, "eval_loss": 0.6930950880050659, "eval_rewards/accuracies": 0.2253289520740509, "eval_rewards/chosen": 0.019946148619055748, "eval_rewards/margins": 0.005028975661844015, "eval_rewards/rejected": 0.014918653294444084, "eval_runtime": 183.6858, "eval_samples_per_second": 6.598, "eval_steps_per_second": 0.827, "step": 1300 }, { "epoch": 0.9717638430509717, "grad_norm": 100.0, "learning_rate": 1.4306676449009536e-08, "logits/chosen": -1.454483985900879, "logits/rejected": -1.3722683191299438, "logps/chosen": -422.8074951171875, "logps/rejected": -396.38250732421875, "loss": 0.6941, "rewards/accuracies": 0.2549999952316284, "rewards/chosen": 0.028248444199562073, "rewards/margins": 0.004119873046875, "rewards/rejected": 0.024119414389133453, "step": 1325 }, { "epoch": 0.9900990099009901, "grad_norm": 113.0, "learning_rate": 5.135730007336757e-09, "logits/chosen": NaN, "logits/rejected": -1.327661395072937, "logps/chosen": -423.99749755859375, "logps/rejected": -390.8299865722656, "loss": 0.6953, "rewards/accuracies": 0.2775000035762787, "rewards/chosen": 0.024477539584040642, "rewards/margins": 0.002653961069881916, "rewards/rejected": 0.021812591701745987, "step": 1350 }, { "epoch": 0.9900990099009901, "eval_logits/chosen": -1.524070382118225, "eval_logits/rejected": -1.4259716272354126, "eval_logps/chosen": -377.162841796875, "eval_logps/rejected": -350.21380615234375, "eval_loss": 0.6933400630950928, "eval_rewards/accuracies": 0.24259868264198303, "eval_rewards/chosen": 0.02030799351632595, "eval_rewards/margins": 0.005902240052819252, "eval_rewards/rejected": 0.01440710760653019, "eval_runtime": 183.6314, "eval_samples_per_second": 6.6, "eval_steps_per_second": 0.828, "step": 1350 } ], "logging_steps": 25, "max_steps": 1363, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }