{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.578059071729957, "eval_steps": 500, "global_step": 580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.33755274261603374, "grad_norm": 4.021343231201172, "learning_rate": 8.620689655172415e-07, "logits/chosen": -2.3571667671203613, "logits/rejected": -2.3413684368133545, "logps/chosen": -66.12004852294922, "logps/rejected": -75.67423248291016, "loss": 0.6916, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.002384235616773367, "rewards/margins": 0.0006812589708715677, "rewards/rejected": 0.00170297606382519, "step": 10 }, { "epoch": 0.6751054852320675, "grad_norm": 4.169963359832764, "learning_rate": 1.724137931034483e-06, "logits/chosen": -2.2970731258392334, "logits/rejected": -2.3126332759857178, "logps/chosen": -75.44676208496094, "logps/rejected": -91.96070861816406, "loss": 0.695, "rewards/accuracies": 0.4375, "rewards/chosen": -0.004610966891050339, "rewards/margins": -0.0035776779986917973, "rewards/rejected": -0.0010332881938666105, "step": 20 }, { "epoch": 1.0126582278481013, "grad_norm": 3.956296682357788, "learning_rate": 2.5862068965517246e-06, "logits/chosen": -2.323883056640625, "logits/rejected": -2.360252857208252, "logps/chosen": -82.34379577636719, "logps/rejected": -76.72650146484375, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0038711726665496826, "rewards/margins": -0.002081885002553463, "rewards/rejected": -0.001789287431165576, "step": 30 }, { "epoch": 1.350210970464135, "grad_norm": 3.930093765258789, "learning_rate": 3.448275862068966e-06, "logits/chosen": -2.3154587745666504, "logits/rejected": -2.3474326133728027, "logps/chosen": -65.43012237548828, "logps/rejected": -73.15928649902344, "loss": 0.691, "rewards/accuracies": 0.4375, "rewards/chosen": 0.005101156421005726, "rewards/margins": 0.008008326403796673, "rewards/rejected": -0.0029071702156215906, "step": 40 }, { "epoch": 1.6877637130801688, "grad_norm": 4.285252094268799, "learning_rate": 4.310344827586207e-06, "logits/chosen": -2.3710691928863525, "logits/rejected": -2.371490478515625, "logps/chosen": -71.6757583618164, "logps/rejected": -70.73356628417969, "loss": 0.6889, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0005036738002672791, "rewards/margins": 0.01000114344060421, "rewards/rejected": -0.009497471153736115, "step": 50 }, { "epoch": 2.0253164556962027, "grad_norm": 4.272525787353516, "learning_rate": 4.999818897894192e-06, "logits/chosen": -2.3313004970550537, "logits/rejected": -2.3659963607788086, "logps/chosen": -76.61670684814453, "logps/rejected": -83.35383605957031, "loss": 0.6826, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0012120052706450224, "rewards/margins": 0.0215681791305542, "rewards/rejected": -0.020356174558401108, "step": 60 }, { "epoch": 2.3628691983122363, "grad_norm": 3.9699583053588867, "learning_rate": 4.9934830787948756e-06, "logits/chosen": -2.335836172103882, "logits/rejected": -2.346723794937134, "logps/chosen": -77.20027923583984, "logps/rejected": -76.7348403930664, "loss": 0.668, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.01467165071517229, "rewards/margins": 0.05109705403447151, "rewards/rejected": -0.036425404250621796, "step": 70 }, { "epoch": 2.70042194092827, "grad_norm": 4.201504230499268, "learning_rate": 4.978118375700895e-06, "logits/chosen": -2.3483119010925293, "logits/rejected": -2.314948320388794, "logps/chosen": -86.64659881591797, "logps/rejected": -73.74983215332031, "loss": 0.6543, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015483448281884193, "rewards/margins": 0.07252755016088486, "rewards/rejected": -0.057044100016355515, "step": 80 }, { "epoch": 3.037974683544304, "grad_norm": 4.158086776733398, "learning_rate": 4.953780424089803e-06, "logits/chosen": -2.353440523147583, "logits/rejected": -2.3548269271850586, "logps/chosen": -66.6883773803711, "logps/rejected": -70.85858917236328, "loss": 0.6434, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.027489716187119484, "rewards/margins": 0.11177561432123184, "rewards/rejected": -0.08428589999675751, "step": 90 }, { "epoch": 3.3755274261603376, "grad_norm": 4.082221031188965, "learning_rate": 4.920557351506409e-06, "logits/chosen": -2.341866970062256, "logits/rejected": -2.364499092102051, "logps/chosen": -129.14761352539062, "logps/rejected": -73.15742492675781, "loss": 0.6058, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4784974157810211, "rewards/margins": 0.6413174867630005, "rewards/rejected": -0.16281995177268982, "step": 100 }, { "epoch": 3.7130801687763713, "grad_norm": 4.832098007202148, "learning_rate": 4.878569458453592e-06, "logits/chosen": -2.345391035079956, "logits/rejected": -2.348513603210449, "logps/chosen": -77.12025451660156, "logps/rejected": -87.58562469482422, "loss": 0.583, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.005122403614223003, "rewards/margins": 0.2564489245414734, "rewards/rejected": -0.26157131791114807, "step": 110 }, { "epoch": 4.050632911392405, "grad_norm": 4.12258243560791, "learning_rate": 4.827968782785062e-06, "logits/chosen": -2.373936176300049, "logits/rejected": -2.4137730598449707, "logps/chosen": -75.44515228271484, "logps/rejected": -93.63223266601562, "loss": 0.5739, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.009799259714782238, "rewards/margins": 0.2614334523677826, "rewards/rejected": -0.2712326943874359, "step": 120 }, { "epoch": 4.3881856540084385, "grad_norm": 4.568253993988037, "learning_rate": 4.7689385491773934e-06, "logits/chosen": -2.3857738971710205, "logits/rejected": -2.3939366340637207, "logps/chosen": -75.08316802978516, "logps/rejected": -85.54856872558594, "loss": 0.5264, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.057138361036777496, "rewards/margins": 0.4834250807762146, "rewards/rejected": -0.5405634641647339, "step": 130 }, { "epoch": 4.725738396624473, "grad_norm": 5.059521675109863, "learning_rate": 4.70169250567482e-06, "logits/chosen": -2.403332471847534, "logits/rejected": -2.4016706943511963, "logps/chosen": -77.38340759277344, "logps/rejected": -77.5801010131836, "loss": 0.5097, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.15228016674518585, "rewards/margins": 0.39201101660728455, "rewards/rejected": -0.5442911982536316, "step": 140 }, { "epoch": 5.063291139240507, "grad_norm": 4.922734260559082, "learning_rate": 4.626474149709127e-06, "logits/chosen": -2.400696039199829, "logits/rejected": -2.3924992084503174, "logps/chosen": -65.39146423339844, "logps/rejected": -72.46720886230469, "loss": 0.5004, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16622374951839447, "rewards/margins": 0.5306353569030762, "rewards/rejected": -0.6968590617179871, "step": 150 }, { "epoch": 5.40084388185654, "grad_norm": 4.739672660827637, "learning_rate": 4.54355584639723e-06, "logits/chosen": -2.426323413848877, "logits/rejected": -2.4180703163146973, "logps/chosen": -89.33406066894531, "logps/rejected": -86.6529541015625, "loss": 0.4472, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.2703530192375183, "rewards/margins": 0.723961353302002, "rewards/rejected": -0.9943143725395203, "step": 160 }, { "epoch": 5.738396624472574, "grad_norm": 5.088452339172363, "learning_rate": 4.45323784230908e-06, "logits/chosen": -2.371295690536499, "logits/rejected": -2.4007980823516846, "logps/chosen": -74.7086410522461, "logps/rejected": -110.96791076660156, "loss": 0.4337, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.19489821791648865, "rewards/margins": 0.838965117931366, "rewards/rejected": -1.0338633060455322, "step": 170 }, { "epoch": 6.075949367088608, "grad_norm": 4.700001239776611, "learning_rate": 4.355847178277025e-06, "logits/chosen": -2.408874750137329, "logits/rejected": -2.433443546295166, "logps/chosen": -73.986083984375, "logps/rejected": -92.54002380371094, "loss": 0.4201, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3620097041130066, "rewards/margins": 0.8090314865112305, "rewards/rejected": -1.1710412502288818, "step": 180 }, { "epoch": 6.413502109704641, "grad_norm": 5.745143890380859, "learning_rate": 4.2517365051833564e-06, "logits/chosen": -2.413661479949951, "logits/rejected": -2.4319045543670654, "logps/chosen": -61.93586349487305, "logps/rejected": -87.93816375732422, "loss": 0.3822, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.3493840992450714, "rewards/margins": 1.0286495685577393, "rewards/rejected": -1.3780337572097778, "step": 190 }, { "epoch": 6.751054852320675, "grad_norm": 14.167346000671387, "learning_rate": 4.141282807014034e-06, "logits/chosen": -2.376776695251465, "logits/rejected": -2.386939287185669, "logps/chosen": -78.03498840332031, "logps/rejected": -103.80584716796875, "loss": 0.3533, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.5522706508636475, "rewards/margins": 1.1880146265029907, "rewards/rejected": -1.7402851581573486, "step": 200 }, { "epoch": 7.0886075949367084, "grad_norm": 5.238346576690674, "learning_rate": 4.024886035802432e-06, "logits/chosen": -2.399014711380005, "logits/rejected": -2.4000418186187744, "logps/chosen": -91.64801025390625, "logps/rejected": -88.77143096923828, "loss": 0.3512, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6410180330276489, "rewards/margins": 1.2458090782165527, "rewards/rejected": -1.8868271112442017, "step": 210 }, { "epoch": 7.4261603375527425, "grad_norm": 4.895415306091309, "learning_rate": 3.9029676634059565e-06, "logits/chosen": -2.38824725151062, "logits/rejected": -2.390937328338623, "logps/chosen": -76.24591064453125, "logps/rejected": -109.47352600097656, "loss": 0.3182, "rewards/accuracies": 0.9375, "rewards/chosen": -0.6207951307296753, "rewards/margins": 1.5345582962036133, "rewards/rejected": -2.155353546142578, "step": 220 }, { "epoch": 7.763713080168777, "grad_norm": 7.413693904876709, "learning_rate": 3.7759691553595214e-06, "logits/chosen": -2.399376153945923, "logits/rejected": -2.3986117839813232, "logps/chosen": -90.97004699707031, "logps/rejected": -98.67308807373047, "loss": 0.2989, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7282214760780334, "rewards/margins": 1.5992071628570557, "rewards/rejected": -2.3274283409118652, "step": 230 }, { "epoch": 8.10126582278481, "grad_norm": 4.8047990798950195, "learning_rate": 3.6443503723320837e-06, "logits/chosen": -2.3746752738952637, "logits/rejected": -2.3904380798339844, "logps/chosen": -81.03557586669922, "logps/rejected": -98.62934112548828, "loss": 0.2891, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0007418394088745, "rewards/margins": 1.6062211990356445, "rewards/rejected": -2.6069629192352295, "step": 240 }, { "epoch": 8.438818565400844, "grad_norm": 5.075074672698975, "learning_rate": 3.508587904974522e-06, "logits/chosen": -2.3420817852020264, "logits/rejected": -2.3538601398468018, "logps/chosen": -86.0930404663086, "logps/rejected": -113.33358001708984, "loss": 0.2451, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8170707821846008, "rewards/margins": 1.765887975692749, "rewards/rejected": -2.582958698272705, "step": 250 }, { "epoch": 8.776371308016877, "grad_norm": 5.683614730834961, "learning_rate": 3.3691733481883693e-06, "logits/chosen": -2.3793487548828125, "logits/rejected": -2.376232862472534, "logps/chosen": -82.44889831542969, "logps/rejected": -100.43474578857422, "loss": 0.2561, "rewards/accuracies": 0.9375, "rewards/chosen": -1.052356481552124, "rewards/margins": 1.7894868850708008, "rewards/rejected": -2.841843605041504, "step": 260 }, { "epoch": 9.113924050632912, "grad_norm": 5.043191909790039, "learning_rate": 3.226611521064278e-06, "logits/chosen": -2.3715972900390625, "logits/rejected": -2.3901712894439697, "logps/chosen": -78.08219909667969, "logps/rejected": -101.30404663085938, "loss": 0.2135, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.124599575996399, "rewards/margins": 1.955810546875, "rewards/rejected": -3.0804100036621094, "step": 270 }, { "epoch": 9.451476793248945, "grad_norm": 4.911313533782959, "learning_rate": 3.0814186389357765e-06, "logits/chosen": -2.365330696105957, "logits/rejected": -2.3815040588378906, "logps/chosen": -74.75053405761719, "logps/rejected": -96.59932708740234, "loss": 0.2016, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2124760150909424, "rewards/margins": 2.1574602127075195, "rewards/rejected": -3.369936466217041, "step": 280 }, { "epoch": 9.789029535864978, "grad_norm": 5.809771537780762, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -2.349513292312622, "logits/rejected": -2.362490653991699, "logps/chosen": -82.00188446044922, "logps/rejected": -115.06053161621094, "loss": 0.1873, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5614944696426392, "rewards/margins": 2.464742660522461, "rewards/rejected": -4.0262370109558105, "step": 290 }, { "epoch": 10.126582278481013, "grad_norm": 5.956116676330566, "learning_rate": 2.785250302445062e-06, "logits/chosen": -2.3490607738494873, "logits/rejected": -2.3738341331481934, "logps/chosen": -96.7169418334961, "logps/rejected": -125.81837463378906, "loss": 0.1794, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.4949434995651245, "rewards/margins": 2.48805832862854, "rewards/rejected": -3.983001708984375, "step": 300 }, { "epoch": 10.464135021097047, "grad_norm": 5.315141201019287, "learning_rate": 2.6353472714635443e-06, "logits/chosen": -2.3448591232299805, "logits/rejected": -2.3330416679382324, "logps/chosen": -77.31942749023438, "logps/rejected": -108.85356140136719, "loss": 0.1444, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4829410314559937, "rewards/margins": 2.6632163524627686, "rewards/rejected": -4.146157264709473, "step": 310 }, { "epoch": 10.80168776371308, "grad_norm": 5.832997798919678, "learning_rate": 2.4849541490017868e-06, "logits/chosen": -2.3063769340515137, "logits/rejected": -2.31026029586792, "logps/chosen": -89.78933715820312, "logps/rejected": -124.50910949707031, "loss": 0.1507, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8659601211547852, "rewards/margins": 2.7945058345794678, "rewards/rejected": -4.660466194152832, "step": 320 }, { "epoch": 11.139240506329115, "grad_norm": 5.006984233856201, "learning_rate": 2.3346155074564712e-06, "logits/chosen": -2.3069465160369873, "logits/rejected": -2.3103697299957275, "logps/chosen": -94.30183410644531, "logps/rejected": -111.3757553100586, "loss": 0.1389, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8566234111785889, "rewards/margins": 2.650669574737549, "rewards/rejected": -4.507293701171875, "step": 330 }, { "epoch": 11.476793248945148, "grad_norm": 5.461178302764893, "learning_rate": 2.184875721949277e-06, "logits/chosen": -2.307982921600342, "logits/rejected": -2.2805938720703125, "logps/chosen": -95.00460815429688, "logps/rejected": -130.14364624023438, "loss": 0.1192, "rewards/accuracies": 1.0, "rewards/chosen": -2.095489025115967, "rewards/margins": 3.2359142303466797, "rewards/rejected": -5.331402778625488, "step": 340 }, { "epoch": 11.814345991561181, "grad_norm": 5.430806636810303, "learning_rate": 2.0362769991485514e-06, "logits/chosen": -2.305732488632202, "logits/rejected": -2.325230360031128, "logps/chosen": -85.6950454711914, "logps/rejected": -129.57199096679688, "loss": 0.1168, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.912523865699768, "rewards/margins": 3.0515921115875244, "rewards/rejected": -4.964116096496582, "step": 350 }, { "epoch": 12.151898734177216, "grad_norm": 4.317180156707764, "learning_rate": 1.8893574139429226e-06, "logits/chosen": -2.29160737991333, "logits/rejected": -2.3020012378692627, "logps/chosen": -88.52080535888672, "logps/rejected": -132.19122314453125, "loss": 0.0974, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.399042844772339, "rewards/margins": 3.41259503364563, "rewards/rejected": -5.811637878417969, "step": 360 }, { "epoch": 12.48945147679325, "grad_norm": 4.4959611892700195, "learning_rate": 1.744648961076068e-06, "logits/chosen": -2.2502753734588623, "logits/rejected": -2.26035737991333, "logps/chosen": -100.68859100341797, "logps/rejected": -137.6667938232422, "loss": 0.084, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.559737205505371, "rewards/margins": 3.8113670349121094, "rewards/rejected": -6.3711042404174805, "step": 370 }, { "epoch": 12.827004219409282, "grad_norm": 5.346553802490234, "learning_rate": 1.602675628797636e-06, "logits/chosen": -2.2998809814453125, "logits/rejected": -2.2750256061553955, "logps/chosen": -100.53385925292969, "logps/rejected": -129.20062255859375, "loss": 0.0921, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7617571353912354, "rewards/margins": 3.2644877433776855, "rewards/rejected": -6.0262451171875, "step": 380 }, { "epoch": 13.164556962025316, "grad_norm": 4.991717338562012, "learning_rate": 1.4639515015056205e-06, "logits/chosen": -2.249573230743408, "logits/rejected": -2.265824556350708, "logps/chosen": -104.0100326538086, "logps/rejected": -141.9222869873047, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": -2.6467862129211426, "rewards/margins": 3.3599770069122314, "rewards/rejected": -6.006763458251953, "step": 390 }, { "epoch": 13.50210970464135, "grad_norm": 5.525568962097168, "learning_rate": 1.328978898250525e-06, "logits/chosen": -2.304884672164917, "logits/rejected": -2.2955222129821777, "logps/chosen": -99.25491333007812, "logps/rejected": -142.86819458007812, "loss": 0.0758, "rewards/accuracies": 1.0, "rewards/chosen": -2.675879955291748, "rewards/margins": 3.701845645904541, "rewards/rejected": -6.377726078033447, "step": 400 }, { "epoch": 13.839662447257384, "grad_norm": 4.815173149108887, "learning_rate": 1.198246553841744e-06, "logits/chosen": -2.295896053314209, "logits/rejected": -2.3182671070098877, "logps/chosen": -90.5086669921875, "logps/rejected": -123.36918640136719, "loss": 0.0631, "rewards/accuracies": 1.0, "rewards/chosen": -2.5161099433898926, "rewards/margins": 3.6973776817321777, "rewards/rejected": -6.21348762512207, "step": 410 }, { "epoch": 14.177215189873417, "grad_norm": 3.638730525970459, "learning_rate": 1.0722278491423998e-06, "logits/chosen": -2.2437102794647217, "logits/rejected": -2.259737014770508, "logps/chosen": -100.89008331298828, "logps/rejected": -145.32876586914062, "loss": 0.0618, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0369720458984375, "rewards/margins": 4.202728271484375, "rewards/rejected": -7.2397003173828125, "step": 420 }, { "epoch": 14.514767932489452, "grad_norm": 3.4701929092407227, "learning_rate": 9.513790969606926e-07, "logits/chosen": -2.235750913619995, "logits/rejected": -2.2566628456115723, "logps/chosen": -113.2632827758789, "logps/rejected": -151.08389282226562, "loss": 0.0575, "rewards/accuracies": 1.0, "rewards/chosen": -3.1561920642852783, "rewards/margins": 3.824761152267456, "rewards/rejected": -6.980954170227051, "step": 430 }, { "epoch": 14.852320675105485, "grad_norm": 3.5095152854919434, "learning_rate": 8.361378897445643e-07, "logits/chosen": -2.22627592086792, "logits/rejected": -2.257497787475586, "logps/chosen": -114.46488952636719, "logps/rejected": -151.49237060546875, "loss": 0.0614, "rewards/accuracies": 1.0, "rewards/chosen": -3.645853042602539, "rewards/margins": 4.079274654388428, "rewards/rejected": -7.725128173828125, "step": 440 }, { "epoch": 15.189873417721518, "grad_norm": 4.185005187988281, "learning_rate": 7.269215150626391e-07, "logits/chosen": -2.2187466621398926, "logits/rejected": -2.2320122718811035, "logps/chosen": -113.44010925292969, "logps/rejected": -154.74827575683594, "loss": 0.0552, "rewards/accuracies": 1.0, "rewards/chosen": -3.342592239379883, "rewards/margins": 4.421627998352051, "rewards/rejected": -7.764220237731934, "step": 450 }, { "epoch": 15.527426160337553, "grad_norm": 3.698930263519287, "learning_rate": 6.241254446089942e-07, "logits/chosen": -2.194887638092041, "logits/rejected": -2.2225141525268555, "logps/chosen": -106.3698501586914, "logps/rejected": -159.8157501220703, "loss": 0.0494, "rewards/accuracies": 1.0, "rewards/chosen": -3.6747384071350098, "rewards/margins": 4.638253211975098, "rewards/rejected": -8.31299114227295, "step": 460 }, { "epoch": 15.864978902953586, "grad_norm": 4.312787055969238, "learning_rate": 5.281219022030423e-07, "logits/chosen": -2.215421438217163, "logits/rejected": -2.2375741004943848, "logps/chosen": -98.69608306884766, "logps/rejected": -165.375, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": -3.6675498485565186, "rewards/margins": 4.56928825378418, "rewards/rejected": -8.236838340759277, "step": 470 }, { "epoch": 16.20253164556962, "grad_norm": 3.4455654621124268, "learning_rate": 4.392585159698087e-07, "logits/chosen": -2.20900297164917, "logits/rejected": -2.214944362640381, "logps/chosen": -92.8326416015625, "logps/rejected": -133.35049438476562, "loss": 0.0518, "rewards/accuracies": 1.0, "rewards/chosen": -3.2708263397216797, "rewards/margins": 4.095736980438232, "rewards/rejected": -7.366563320159912, "step": 480 }, { "epoch": 16.540084388185655, "grad_norm": 4.105606555938721, "learning_rate": 3.578570595810274e-07, "logits/chosen": -2.1989166736602783, "logits/rejected": -2.2026238441467285, "logps/chosen": -124.64225006103516, "logps/rejected": -160.11962890625, "loss": 0.0486, "rewards/accuracies": 1.0, "rewards/chosen": -3.9146201610565186, "rewards/margins": 4.5453619956970215, "rewards/rejected": -8.459983825683594, "step": 490 }, { "epoch": 16.877637130801688, "grad_norm": 3.0307629108428955, "learning_rate": 2.8421228711503127e-07, "logits/chosen": -2.2583324909210205, "logits/rejected": -2.264362335205078, "logps/chosen": -109.14697265625, "logps/rejected": -153.9482421875, "loss": 0.0416, "rewards/accuracies": 1.0, "rewards/chosen": -4.014981269836426, "rewards/margins": 4.060934066772461, "rewards/rejected": -8.075915336608887, "step": 500 }, { "epoch": 17.21518987341772, "grad_norm": 7.35497522354126, "learning_rate": 2.1859086575439225e-07, "logits/chosen": -2.2129454612731934, "logits/rejected": -2.2381744384765625, "logps/chosen": -105.6734848022461, "logps/rejected": -155.99111938476562, "loss": 0.0448, "rewards/accuracies": 1.0, "rewards/chosen": -3.604158878326416, "rewards/margins": 4.801852703094482, "rewards/rejected": -8.406011581420898, "step": 510 }, { "epoch": 17.552742616033754, "grad_norm": 3.5699377059936523, "learning_rate": 1.6123041018599766e-07, "logits/chosen": -2.195000410079956, "logits/rejected": -2.2445883750915527, "logps/chosen": -116.91619873046875, "logps/rejected": -181.08287048339844, "loss": 0.0348, "rewards/accuracies": 1.0, "rewards/chosen": -4.4316792488098145, "rewards/margins": 4.899362087249756, "rewards/rejected": -9.33104133605957, "step": 520 }, { "epoch": 17.890295358649787, "grad_norm": 4.2693352699279785, "learning_rate": 1.1233862220001168e-07, "logits/chosen": -2.172609806060791, "logits/rejected": -2.196518898010254, "logps/chosen": -117.672119140625, "logps/rejected": -155.81735229492188, "loss": 0.0494, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.591332197189331, "rewards/margins": 4.197774410247803, "rewards/rejected": -7.789106845855713, "step": 530 }, { "epoch": 18.227848101265824, "grad_norm": 3.4541165828704834, "learning_rate": 7.209253860320897e-08, "logits/chosen": -2.206496000289917, "logits/rejected": -2.2164080142974854, "logps/chosen": -114.9886474609375, "logps/rejected": -154.38595581054688, "loss": 0.0396, "rewards/accuracies": 1.0, "rewards/chosen": -4.102456092834473, "rewards/margins": 4.425411701202393, "rewards/rejected": -8.527867317199707, "step": 540 }, { "epoch": 18.565400843881857, "grad_norm": 3.7231645584106445, "learning_rate": 4.063789016999331e-08, "logits/chosen": -2.2224667072296143, "logits/rejected": -2.248166561126709, "logps/chosen": -100.99537658691406, "logps/rejected": -148.0446319580078, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -3.5166335105895996, "rewards/margins": 4.642723560333252, "rewards/rejected": -8.159357070922852, "step": 550 }, { "epoch": 18.90295358649789, "grad_norm": 3.382079601287842, "learning_rate": 1.808857395232788e-08, "logits/chosen": -2.1830036640167236, "logits/rejected": -2.205293893814087, "logps/chosen": -110.93768310546875, "logps/rejected": -149.43922424316406, "loss": 0.0378, "rewards/accuracies": 1.0, "rewards/chosen": -3.975515842437744, "rewards/margins": 4.391709327697754, "rewards/rejected": -8.36722469329834, "step": 560 }, { "epoch": 19.240506329113924, "grad_norm": 3.477522850036621, "learning_rate": 4.526240859345499e-09, "logits/chosen": -2.1929779052734375, "logits/rejected": -2.1888599395751953, "logps/chosen": -112.3604507446289, "logps/rejected": -159.12904357910156, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -4.224286079406738, "rewards/margins": 4.6413350105285645, "rewards/rejected": -8.865621566772461, "step": 570 }, { "epoch": 19.578059071729957, "grad_norm": 2.8677260875701904, "learning_rate": 0.0, "logits/chosen": -2.2252745628356934, "logits/rejected": -2.255455732345581, "logps/chosen": -98.1314468383789, "logps/rejected": -156.11318969726562, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -3.2994728088378906, "rewards/margins": 4.715044975280762, "rewards/rejected": -8.014516830444336, "step": 580 }, { "epoch": 19.578059071729957, "step": 580, "total_flos": 2.1485149125974426e+18, "train_loss": 0.2753299895545532, "train_runtime": 4556.676, "train_samples_per_second": 8.322, "train_steps_per_second": 0.127 } ], "logging_steps": 10, "max_steps": 580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1485149125974426e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }