{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9499796665311102, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032533550223668155, "grad_norm": 103.71580505371094, "kl": 0.0, "learning_rate": 6.493506493506494e-09, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -158.7671875, "logps/rejected": -235.8359375, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 10 }, { "epoch": 0.06506710044733631, "grad_norm": 94.47801208496094, "kl": 0.05097656324505806, "learning_rate": 2.1103896103896103e-08, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -158.2953125, "logps/rejected": -232.075, "loss": 0.498, "rewards/chosen": 0.009334802627563477, "rewards/margins": 0.012342309951782227, "rewards/rejected": -0.00300750732421875, "step": 20 }, { "epoch": 0.09760065067100447, "grad_norm": 143.15447998046875, "kl": 0.592968761920929, "learning_rate": 3.733766233766234e-08, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -156.840625, "logps/rejected": -231.65, "loss": 0.471, "rewards/chosen": 0.14929332733154296, "rewards/margins": 0.1863230228424072, "rewards/rejected": -0.037029695510864255, "step": 30 }, { "epoch": 0.13013420089467262, "grad_norm": 81.36482238769531, "kl": 1.6533203125, "learning_rate": 5.3571428571428564e-08, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -151.7375, "logps/rejected": -231.0375, "loss": 0.3677, "rewards/chosen": 0.6113525390625, "rewards/margins": 0.9034156799316406, "rewards/rejected": -0.29206314086914065, "step": 40 }, { "epoch": 0.16266775111834078, "grad_norm": 76.27654266357422, "kl": 0.14501953125, "learning_rate": 6.98051948051948e-08, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -141.165625, "logps/rejected": -251.2015625, "loss": 0.1578, "rewards/chosen": 
1.392755126953125, "rewards/margins": 2.920855712890625, "rewards/rejected": -1.5281005859375, "step": 50 }, { "epoch": 0.19520130134200894, "grad_norm": 3.925260066986084, "kl": 0.0025390624068677425, "learning_rate": 8.603896103896104e-08, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -127.53515625, "logps/rejected": -274.090625, "loss": 0.0543, "rewards/chosen": 2.5130126953125, "rewards/margins": 5.8057373046875, "rewards/rejected": -3.292724609375, "step": 60 }, { "epoch": 0.2277348515656771, "grad_norm": 0.3158203661441803, "kl": 0.0, "learning_rate": 1.0227272727272728e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -100.13359375, "logps/rejected": -305.453125, "loss": 0.0025, "rewards/chosen": 4.975732421875, "rewards/margins": 10.637548828124999, "rewards/rejected": -5.66181640625, "step": 70 }, { "epoch": 0.26026840178934524, "grad_norm": 0.01107876282185316, "kl": 0.0, "learning_rate": 1.1850649350649349e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -75.421875, "logps/rejected": -338.096875, "loss": 0.0007, "rewards/chosen": 6.189892578125, "rewards/margins": 13.384521484375, "rewards/rejected": -7.19462890625, "step": 80 }, { "epoch": 0.2928019520130134, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.3474025974025975e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -56.57421875, "logps/rejected": -360.18125, "loss": 0.0015, "rewards/chosen": 8.42177734375, "rewards/margins": 18.254101562499997, "rewards/rejected": -9.83232421875, "step": 90 }, { "epoch": 0.32533550223668156, "grad_norm": 0.0, "kl": 0.15000000596046448, "learning_rate": 1.5097402597402597e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.366015625, "logps/rejected": -357.228125, "loss": 0.0, "rewards/chosen": 8.46904296875, "rewards/margins": 17.947460937499997, "rewards/rejected": -9.47841796875, "step": 100 }, { "epoch": 0.3578690524603497, "grad_norm": 0.0, "kl": 0.26777344942092896, 
"learning_rate": 1.6720779220779217e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.25625, "logps/rejected": -355.371875, "loss": 0.0, "rewards/chosen": 9.096875, "rewards/margins": 19.25791015625, "rewards/rejected": -10.16103515625, "step": 110 }, { "epoch": 0.3904026026840179, "grad_norm": 0.0, "kl": 0.934863269329071, "learning_rate": 1.8344155844155843e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -47.00859375, "logps/rejected": -353.48125, "loss": 0.0002, "rewards/chosen": 8.9310546875, "rewards/margins": 17.91005859375, "rewards/rejected": -8.97900390625, "step": 120 }, { "epoch": 0.42293615290768605, "grad_norm": 0.029346637427806854, "kl": 3.103515625, "learning_rate": 1.9967532467532466e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -46.458203125, "logps/rejected": -350.59375, "loss": 0.0, "rewards/chosen": 8.2421875, "rewards/margins": 17.04013671875, "rewards/rejected": -8.79794921875, "step": 130 }, { "epoch": 0.4554697031313542, "grad_norm": 0.0, "kl": 0.69140625, "learning_rate": 2.159090909090909e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -47.403125, "logps/rejected": -356.59375, "loss": 0.0001, "rewards/chosen": 8.01259765625, "rewards/margins": 18.42900390625, "rewards/rejected": -10.41640625, "step": 140 }, { "epoch": 0.48800325335502237, "grad_norm": 0.0, "kl": 0.16386719048023224, "learning_rate": 2.3214285714285714e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -47.1765625, "logps/rejected": -367.81875, "loss": 0.0, "rewards/chosen": 8.698828125, "rewards/margins": 18.972265625, "rewards/rejected": -10.2734375, "step": 150 }, { "epoch": 0.5205368035786905, "grad_norm": 0.0938071757555008, "kl": 3.0655274391174316, "learning_rate": 2.483766233766234e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -47.46953125, "logps/rejected": -350.990625, "loss": 0.0001, "rewards/chosen": 8.87724609375, "rewards/margins": 
17.825390624999997, "rewards/rejected": -8.94814453125, "step": 160 }, { "epoch": 0.5530703538023587, "grad_norm": 0.0, "kl": 0.19960936903953552, "learning_rate": 2.6461038961038964e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.409765625, "logps/rejected": -372.10625, "loss": 0.0, "rewards/chosen": 8.11689453125, "rewards/margins": 19.501953125, "rewards/rejected": -11.38505859375, "step": 170 }, { "epoch": 0.5856039040260268, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.8084415584415584e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.136328125, "logps/rejected": -387.340625, "loss": 0.0, "rewards/chosen": 8.95927734375, "rewards/margins": 20.40771484375, "rewards/rejected": -11.4484375, "step": 180 }, { "epoch": 0.618137454249695, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.9707792207792204e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.2359375, "logps/rejected": -391.128125, "loss": 0.0001, "rewards/chosen": 8.42666015625, "rewards/margins": 20.3193359375, "rewards/rejected": -11.89267578125, "step": 190 }, { "epoch": 0.6506710044733631, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.133116883116883e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.67421875, "logps/rejected": -390.3875, "loss": 0.0001, "rewards/chosen": 8.5646484375, "rewards/margins": 20.190722656250003, "rewards/rejected": -11.62607421875, "step": 200 }, { "epoch": 0.6832045546970313, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.295454545454545e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.19453125, "logps/rejected": -393.453125, "loss": 0.0, "rewards/chosen": 8.9052734375, "rewards/margins": 21.5578125, "rewards/rejected": -12.6525390625, "step": 210 }, { "epoch": 0.7157381049206994, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.457792207792208e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.93125, "logps/rejected": -401.3375, "loss": 0.0, 
"rewards/chosen": 9.0279296875, "rewards/margins": 22.509765625, "rewards/rejected": -13.4818359375, "step": 220 }, { "epoch": 0.7482716551443677, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.62012987012987e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.351171875, "logps/rejected": -399.60625, "loss": 0.0, "rewards/chosen": 8.4427734375, "rewards/margins": 22.73662109375, "rewards/rejected": -14.29384765625, "step": 230 }, { "epoch": 0.7808052053680358, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.782467532467532e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.937109375, "logps/rejected": -398.54375, "loss": 0.0, "rewards/chosen": 9.748828125, "rewards/margins": 22.75693359375, "rewards/rejected": -13.00810546875, "step": 240 }, { "epoch": 0.813338755591704, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 3.9448051948051946e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.4765625, "logps/rejected": -406.259375, "loss": 0.0, "rewards/chosen": 7.9296875, "rewards/margins": 21.64990234375, "rewards/rejected": -13.72021484375, "step": 250 }, { "epoch": 0.8458723058153721, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.1071428571428566e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -50.065234375, "logps/rejected": -397.978125, "loss": 0.0, "rewards/chosen": 8.79658203125, "rewards/margins": 22.427539062500003, "rewards/rejected": -13.63095703125, "step": 260 }, { "epoch": 0.8784058560390403, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.2694805194805197e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.828515625, "logps/rejected": -395.05, "loss": 0.0, "rewards/chosen": 8.353515625, "rewards/margins": 19.6958984375, "rewards/rejected": -11.3423828125, "step": 270 }, { "epoch": 0.9109394062627084, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.4318181818181817e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.0359375, 
"logps/rejected": -401.634375, "loss": 0.0, "rewards/chosen": 8.53701171875, "rewards/margins": 20.25283203125, "rewards/rejected": -11.7158203125, "step": 280 }, { "epoch": 0.9434729564863765, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.5941558441558437e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.019140625, "logps/rejected": -393.425, "loss": 0.0, "rewards/chosen": 8.6474609375, "rewards/margins": 20.7501953125, "rewards/rejected": -12.102734375, "step": 290 }, { "epoch": 0.9760065067100447, "grad_norm": 0.016830716282129288, "kl": 0.0, "learning_rate": 4.756493506493506e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.725, "logps/rejected": -394.653125, "loss": 0.0, "rewards/chosen": 8.255859375, "rewards/margins": 21.61572265625, "rewards/rejected": -13.35986328125, "step": 300 }, { "epoch": 1.0065067100447336, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.918831168831168e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.37416666666667, "logps/rejected": -396.87, "loss": 0.0, "rewards/chosen": 7.7615625, "rewards/margins": 19.706875, "rewards/rejected": -11.9453125, "step": 310 }, { "epoch": 1.0390402602684017, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.999959861406242e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.408203125, "logps/rejected": -403.834375, "loss": 0.0, "rewards/chosen": 9.22265625, "rewards/margins": 22.9748046875, "rewards/rejected": -13.7521484375, "step": 320 }, { "epoch": 1.07157381049207, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.999638760389452e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.765625, "logps/rejected": -396.6625, "loss": 0.0, "rewards/chosen": 9.08193359375, "rewards/margins": 19.86611328125, "rewards/rejected": -10.7841796875, "step": 330 }, { "epoch": 1.1041073607157381, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.998996599598879e-07, "logits/chosen": null, "logits/rejected": null, 
"logps/chosen": -48.26953125, "logps/rejected": -393.775, "loss": 0.0, "rewards/chosen": 8.9400390625, "rewards/margins": 20.629199218750003, "rewards/rejected": -11.68916015625, "step": 340 }, { "epoch": 1.1366409109394062, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.998033461515242e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.61484375, "logps/rejected": -396.7875, "loss": 0.0, "rewards/chosen": 9.119140625, "rewards/margins": 22.4150390625, "rewards/rejected": -13.2958984375, "step": 350 }, { "epoch": 1.1691744611630743, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.996749469846372e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.3609375, "logps/rejected": -396.115625, "loss": 0.0, "rewards/chosen": 8.59248046875, "rewards/margins": 22.016796875, "rewards/rejected": -13.42431640625, "step": 360 }, { "epoch": 1.2017080113867427, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.995144789511329e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.6953125, "logps/rejected": -401.90625, "loss": 0.0, "rewards/chosen": 8.97890625, "rewards/margins": 22.325878906249997, "rewards/rejected": -13.34697265625, "step": 370 }, { "epoch": 1.2342415616104108, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.993219626619219e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.890625, "logps/rejected": -401.4625, "loss": 0.0, "rewards/chosen": 8.95244140625, "rewards/margins": 22.37314453125, "rewards/rejected": -13.420703125, "step": 380 }, { "epoch": 1.2667751118340789, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.990974228442717e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.7734375, "logps/rejected": -403.28125, "loss": 0.0, "rewards/chosen": 8.423828125, "rewards/margins": 20.5177734375, "rewards/rejected": -12.0939453125, "step": 390 }, { "epoch": 1.299308662057747, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.988408883386307e-07, "logits/chosen": null, 
"logits/rejected": null, "logps/chosen": -48.671875, "logps/rejected": -399.890625, "loss": 0.0, "rewards/chosen": 8.93525390625, "rewards/margins": 21.9240234375, "rewards/rejected": -12.98876953125, "step": 400 }, { "epoch": 1.331842212281415, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.985523920949242e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.679296875, "logps/rejected": -402.096875, "loss": 0.0, "rewards/chosen": 8.42080078125, "rewards/margins": 20.993359374999997, "rewards/rejected": -12.57255859375, "step": 410 }, { "epoch": 1.3643757625050834, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.982319711683221e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.55546875, "logps/rejected": -395.29375, "loss": 0.0, "rewards/chosen": 9.683203125, "rewards/margins": 22.695703125, "rewards/rejected": -13.0125, "step": 420 }, { "epoch": 1.3969093127287515, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.978796667144791e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.65859375, "logps/rejected": -403.265625, "loss": 0.0, "rewards/chosen": 8.24130859375, "rewards/margins": 21.69130859375, "rewards/rejected": -13.45, "step": 430 }, { "epoch": 1.4294428629524196, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.974955239842493e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.275390625, "logps/rejected": -410.9125, "loss": 0.0, "rewards/chosen": 8.18759765625, "rewards/margins": 21.62451171875, "rewards/rejected": -13.4369140625, "step": 440 }, { "epoch": 1.461976413176088, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.970795923178733e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.339453125, "logps/rejected": -405.471875, "loss": 0.0, "rewards/chosen": 7.85322265625, "rewards/margins": 21.57744140625, "rewards/rejected": -13.72421875, "step": 450 }, { "epoch": 1.494509963399756, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.966319251386412e-07, 
"logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.5921875, "logps/rejected": -401.665625, "loss": 0.0, "rewards/chosen": 8.530078125, "rewards/margins": 21.6490234375, "rewards/rejected": -13.1189453125, "step": 460 }, { "epoch": 1.5270435136234242, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.961525799460308e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.540625, "logps/rejected": -408.703125, "loss": 0.0, "rewards/chosen": 8.6064453125, "rewards/margins": 22.0228515625, "rewards/rejected": -13.41640625, "step": 470 }, { "epoch": 1.5595770638470923, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.956416183083221e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.923046875, "logps/rejected": -407.4, "loss": 0.0, "rewards/chosen": 8.303515625, "rewards/margins": 23.2646484375, "rewards/rejected": -14.9611328125, "step": 480 }, { "epoch": 1.5921106140707604, "grad_norm": 0.0061869011260569096, "kl": 0.0, "learning_rate": 4.950991058546892e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.971484375, "logps/rejected": -404.6, "loss": 0.0, "rewards/chosen": 8.97802734375, "rewards/margins": 21.27333984375, "rewards/rejected": -12.2953125, "step": 490 }, { "epoch": 1.6246441642944287, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.945251122667714e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.584765625, "logps/rejected": -411.175, "loss": 0.0, "rewards/chosen": 8.23486328125, "rewards/margins": 21.76025390625, "rewards/rejected": -13.525390625, "step": 500 }, { "epoch": 1.6571777145180968, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.93919711269722e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.616015625, "logps/rejected": -403.053125, "loss": 0.0, "rewards/chosen": 8.54775390625, "rewards/margins": 21.637109375, "rewards/rejected": -13.08935546875, "step": 510 }, { "epoch": 1.689711264741765, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 
4.932829806227398e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.595703125, "logps/rejected": -401.678125, "loss": 0.0, "rewards/chosen": 9.25576171875, "rewards/margins": 22.1841796875, "rewards/rejected": -12.92841796875, "step": 520 }, { "epoch": 1.7222448149654332, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.926150021090812e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.584765625, "logps/rejected": -408.21875, "loss": 0.0, "rewards/chosen": 9.191796875, "rewards/margins": 23.53251953125, "rewards/rejected": -14.34072265625, "step": 530 }, { "epoch": 1.7547783651891011, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.919158615255555e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.476953125, "logps/rejected": -402.609375, "loss": 0.0, "rewards/chosen": 8.7083984375, "rewards/margins": 22.474316406249997, "rewards/rejected": -13.76591796875, "step": 540 }, { "epoch": 1.7873119154127695, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.911856486715056e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.780078125, "logps/rejected": -407.21875, "loss": 0.0, "rewards/chosen": 9.62373046875, "rewards/margins": 23.37060546875, "rewards/rejected": -13.746875, "step": 550 }, { "epoch": 1.8198454656364376, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.904244573372733e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.086328125, "logps/rejected": -407.178125, "loss": 0.0, "rewards/chosen": 8.03115234375, "rewards/margins": 22.18642578125, "rewards/rejected": -14.1552734375, "step": 560 }, { "epoch": 1.8523790158601057, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.896323852921527e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -49.037890625, "logps/rejected": -406.178125, "loss": 0.0, "rewards/chosen": 8.99189453125, "rewards/margins": 22.690234375000003, "rewards/rejected": -13.69833984375, "step": 570 }, { "epoch": 1.884912566083774, 
"grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.888095342718329e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.119140625, "logps/rejected": -405.759375, "loss": 0.0, "rewards/chosen": 8.451171875, "rewards/margins": 21.19541015625, "rewards/rejected": -12.74423828125, "step": 580 }, { "epoch": 1.9174461163074419, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.879560099653306e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.169921875, "logps/rejected": -406.915625, "loss": 0.0, "rewards/chosen": 8.71015625, "rewards/margins": 20.17109375, "rewards/rejected": -11.4609375, "step": 590 }, { "epoch": 1.9499796665311102, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 4.870719220014149e-07, "logits/chosen": null, "logits/rejected": null, "logps/chosen": -48.301953125, "logps/rejected": -401.065625, "loss": 0.0, "rewards/chosen": 8.30107421875, "rewards/margins": 21.400097656249997, "rewards/rejected": -13.0990234375, "step": 600 } ], "logging_steps": 10, "max_steps": 3080, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }