{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.502092050209207, "eval_steps": 500, "global_step": 580, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.33472803347280333, "grad_norm": 3.9892160892486572, "learning_rate": 8.620689655172415e-07, "logits/chosen": -2.315223217010498, "logits/rejected": -2.3654401302337646, "logps/chosen": -65.86729431152344, "logps/rejected": -77.53572845458984, "loss": 0.6929, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0023138518445193768, "rewards/margins": -0.001122759305872023, "rewards/rejected": 0.0034366101026535034, "step": 10 }, { "epoch": 0.6694560669456067, "grad_norm": 3.5659756660461426, "learning_rate": 1.724137931034483e-06, "logits/chosen": -2.341399669647217, "logits/rejected": -2.3567094802856445, "logps/chosen": -66.60242462158203, "logps/rejected": -69.70094299316406, "loss": 0.6929, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0013719359412789345, "rewards/margins": -0.0035000313073396683, "rewards/rejected": 0.002128095831722021, "step": 20 }, { "epoch": 1.00418410041841, "grad_norm": 4.912586688995361, "learning_rate": 2.5862068965517246e-06, "logits/chosen": -2.3429622650146484, "logits/rejected": -2.3658394813537598, "logps/chosen": -71.6301040649414, "logps/rejected": -78.41346740722656, "loss": 0.6938, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.003577103139832616, "rewards/margins": 0.00785654503852129, "rewards/rejected": -0.004279441200196743, "step": 30 }, { "epoch": 1.3389121338912133, "grad_norm": 4.810107707977295, "learning_rate": 3.448275862068966e-06, "logits/chosen": -2.3610458374023438, "logits/rejected": -2.3885395526885986, "logps/chosen": -66.8291244506836, "logps/rejected": -62.15415573120117, "loss": 0.6893, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -6.734435737598687e-05, "rewards/margins": 0.006865750066936016, "rewards/rejected": -0.0069330958649516106, "step": 40 }, { "epoch": 1.6736401673640167, "grad_norm": 4.670071125030518, "learning_rate": 4.310344827586207e-06, "logits/chosen": -2.304999351501465, "logits/rejected": -2.335301399230957, "logps/chosen": -75.09913635253906, "logps/rejected": -77.72399139404297, "loss": 0.6878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003225918160751462, "rewards/margins": 0.010454346425831318, "rewards/rejected": -0.007228427566587925, "step": 50 }, { "epoch": 2.00836820083682, "grad_norm": 4.2342000007629395, "learning_rate": 4.999818897894192e-06, "logits/chosen": -2.363574504852295, "logits/rejected": -2.363882064819336, "logps/chosen": -62.84125900268555, "logps/rejected": -61.92932891845703, "loss": 0.6855, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0032769464887678623, "rewards/margins": 0.02090405486524105, "rewards/rejected": -0.024181004613637924, "step": 60 }, { "epoch": 2.3430962343096233, "grad_norm": 4.369245529174805, "learning_rate": 4.9934830787948756e-06, "logits/chosen": -2.378016948699951, "logits/rejected": -2.373137950897217, "logps/chosen": -74.67327880859375, "logps/rejected": -69.20399475097656, "loss": 0.668, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.0003526444488670677, "rewards/margins": 0.04865006357431412, "rewards/rejected": -0.04900271072983742, "step": 70 }, { "epoch": 2.6778242677824267, "grad_norm": 4.444687366485596, "learning_rate": 4.978118375700895e-06, "logits/chosen": -2.3403103351593018, "logits/rejected": -2.370321273803711, "logps/chosen": -77.29728698730469, "logps/rejected": -85.79756164550781, "loss": 0.6566, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.0051120575517416, "rewards/margins": 0.09415190666913986, "rewards/rejected": -0.08903985470533371, "step": 80 }, { "epoch": 3.01255230125523, "grad_norm": 4.876573085784912, "learning_rate": 4.953780424089803e-06, "logits/chosen": -2.3614611625671387, "logits/rejected": -2.385697841644287, "logps/chosen": -73.22442626953125, "logps/rejected": -82.25682067871094, "loss": 0.645, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.016868198290467262, "rewards/margins": 0.10679063946008682, "rewards/rejected": -0.12365883588790894, "step": 90 }, { "epoch": 3.3472803347280333, "grad_norm": 4.355966567993164, "learning_rate": 4.920557351506409e-06, "logits/chosen": -2.323256254196167, "logits/rejected": -2.341057300567627, "logps/chosen": -78.37105560302734, "logps/rejected": -86.8406982421875, "loss": 0.6072, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.015012519434094429, "rewards/margins": 0.20561759173870087, "rewards/rejected": -0.22063009440898895, "step": 100 }, { "epoch": 3.7698744769874475, "grad_norm": 4.361391067504883, "learning_rate": 4.878569458453592e-06, "logits/chosen": -2.3163838386535645, "logits/rejected": -2.3566031455993652, "logps/chosen": -83.33145904541016, "logps/rejected": -96.48517608642578, "loss": 0.5908, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.08870697021484375, "rewards/margins": 0.24879300594329834, "rewards/rejected": -0.3374999761581421, "step": 110 }, { "epoch": 4.104602510460251, "grad_norm": 4.315061569213867, "learning_rate": 4.827968782785062e-06, "logits/chosen": -2.3728129863739014, "logits/rejected": -2.3889667987823486, "logps/chosen": -73.0484619140625, "logps/rejected": -73.4913558959961, "loss": 0.5783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0605628564953804, "rewards/margins": 0.2945060133934021, "rewards/rejected": -0.3550689220428467, "step": 120 }, { "epoch": 4.439330543933054, "grad_norm": 4.438860893249512, "learning_rate": 4.7689385491773934e-06, "logits/chosen": -2.3526523113250732, "logits/rejected": -2.364795684814453, "logps/chosen": -67.69630432128906, "logps/rejected": -84.85731506347656, "loss": 0.5338, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1054786667227745, "rewards/margins": 0.4161924421787262, "rewards/rejected": -0.5216711759567261, "step": 130 }, { "epoch": 4.7740585774058575, "grad_norm": 4.5405473709106445, "learning_rate": 4.70169250567482e-06, "logits/chosen": -2.3756489753723145, "logits/rejected": -2.374919891357422, "logps/chosen": -68.5466079711914, "logps/rejected": -76.15412902832031, "loss": 0.5215, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.16213981807231903, "rewards/margins": 0.47565969824790955, "rewards/rejected": -0.6377995610237122, "step": 140 }, { "epoch": 5.108786610878661, "grad_norm": 4.596691608428955, "learning_rate": 4.626474149709127e-06, "logits/chosen": -2.428659439086914, "logits/rejected": -2.4141571521759033, "logps/chosen": -78.08479309082031, "logps/rejected": -68.3617172241211, "loss": 0.5019, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.20662447810173035, "rewards/margins": 0.4026559889316559, "rewards/rejected": -0.6092804670333862, "step": 150 }, { "epoch": 5.443514644351464, "grad_norm": 4.364648818969727, "learning_rate": 4.54355584639723e-06, "logits/chosen": -2.408982992172241, "logits/rejected": -2.4170727729797363, "logps/chosen": -81.3556900024414, "logps/rejected": -86.85897064208984, "loss": 0.4586, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.23941104114055634, "rewards/margins": 0.675674319267273, "rewards/rejected": -0.9150853157043457, "step": 160 }, { "epoch": 5.7782426778242675, "grad_norm": 5.241800308227539, "learning_rate": 4.45323784230908e-06, "logits/chosen": -2.4194908142089844, "logits/rejected": -2.4498963356018066, "logps/chosen": -62.32392120361328, "logps/rejected": -76.39479064941406, "loss": 0.4442, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.26662638783454895, "rewards/margins": 0.6662653088569641, "rewards/rejected": -0.9328916668891907, "step": 170 }, { "epoch": 6.112970711297071, "grad_norm": 4.73954439163208, "learning_rate": 4.355847178277025e-06, "logits/chosen": -2.4365036487579346, "logits/rejected": -2.435439348220825, "logps/chosen": -73.06513977050781, "logps/rejected": -81.04569244384766, "loss": 0.4355, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.37924182415008545, "rewards/margins": 0.7773979902267456, "rewards/rejected": -1.156639814376831, "step": 180 }, { "epoch": 6.447698744769874, "grad_norm": 5.250921726226807, "learning_rate": 4.2517365051833564e-06, "logits/chosen": -2.387922525405884, "logits/rejected": -2.3835678100585938, "logps/chosen": -64.85784912109375, "logps/rejected": -90.08439636230469, "loss": 0.3719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.42228370904922485, "rewards/margins": 1.0562283992767334, "rewards/rejected": -1.478512167930603, "step": 190 }, { "epoch": 6.7824267782426775, "grad_norm": 5.088508129119873, "learning_rate": 4.141282807014034e-06, "logits/chosen": -2.376319169998169, "logits/rejected": -2.3985953330993652, "logps/chosen": -70.64585876464844, "logps/rejected": -89.17048645019531, "loss": 0.3829, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5233972072601318, "rewards/margins": 1.1063960790634155, "rewards/rejected": -1.629793405532837, "step": 200 }, { "epoch": 7.117154811715481, "grad_norm": 4.6062092781066895, "learning_rate": 4.024886035802432e-06, "logits/chosen": -2.371851682662964, "logits/rejected": -2.3844287395477295, "logps/chosen": -74.63328552246094, "logps/rejected": -97.81452178955078, "loss": 0.3522, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6278538703918457, "rewards/margins": 1.2317354679107666, "rewards/rejected": -1.8595889806747437, "step": 210 }, { "epoch": 7.451882845188284, "grad_norm": 5.105669021606445, "learning_rate": 3.9029676634059565e-06, "logits/chosen": -2.4011385440826416, "logits/rejected": -2.4039382934570312, "logps/chosen": -75.92952728271484, "logps/rejected": -78.41490936279297, "loss": 0.3219, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.39645594358444214, "rewards/margins": 1.2095177173614502, "rewards/rejected": -1.6059738397598267, "step": 220 }, { "epoch": 7.786610878661088, "grad_norm": 6.292915344238281, "learning_rate": 3.7759691553595214e-06, "logits/chosen": -2.3707780838012695, "logits/rejected": -2.377169609069824, "logps/chosen": -88.07064056396484, "logps/rejected": -108.6225814819336, "loss": 0.3041, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9827474355697632, "rewards/margins": 1.3651618957519531, "rewards/rejected": -2.3479092121124268, "step": 230 }, { "epoch": 8.121338912133892, "grad_norm": 5.0669097900390625, "learning_rate": 3.6443503723320837e-06, "logits/chosen": -2.3608062267303467, "logits/rejected": -2.3792402744293213, "logps/chosen": -72.83047485351562, "logps/rejected": -91.09341430664062, "loss": 0.3065, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9334943890571594, "rewards/margins": 1.3210034370422363, "rewards/rejected": -2.25449800491333, "step": 240 }, { "epoch": 8.456066945606695, "grad_norm": 5.0598931312561035, "learning_rate": 3.508587904974522e-06, "logits/chosen": -2.324855327606201, "logits/rejected": -2.364541530609131, "logps/chosen": -90.57644653320312, "logps/rejected": -106.41752624511719, "loss": 0.2498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8531273007392883, "rewards/margins": 1.8315904140472412, "rewards/rejected": -2.684717893600464, "step": 250 }, { "epoch": 8.790794979079498, "grad_norm": 6.120776653289795, "learning_rate": 3.3691733481883693e-06, "logits/chosen": -2.3436760902404785, "logits/rejected": -2.3720099925994873, "logps/chosen": -86.95789337158203, "logps/rejected": -102.34903717041016, "loss": 0.2532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1573801040649414, "rewards/margins": 1.7690637111663818, "rewards/rejected": -2.9264438152313232, "step": 260 }, { "epoch": 9.125523012552302, "grad_norm": 4.666015625, "learning_rate": 3.226611521064278e-06, "logits/chosen": -2.3132309913635254, "logits/rejected": -2.309297800064087, "logps/chosen": -78.139404296875, "logps/rejected": -99.09760284423828, "loss": 0.2314, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0649818181991577, "rewards/margins": 1.8774713277816772, "rewards/rejected": -2.942453384399414, "step": 270 }, { "epoch": 9.460251046025105, "grad_norm": 8.85567855834961, "learning_rate": 3.0814186389357765e-06, "logits/chosen": -2.3629987239837646, "logits/rejected": -2.385927200317383, "logps/chosen": -91.09283447265625, "logps/rejected": -102.37603759765625, "loss": 0.2142, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.5404099225997925, "rewards/margins": 2.121422290802002, "rewards/rejected": -3.661832094192505, "step": 280 }, { "epoch": 9.794979079497908, "grad_norm": 5.228074550628662, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -2.356905221939087, "logits/rejected": -2.3635311126708984, "logps/chosen": -91.65778350830078, "logps/rejected": -117.89949035644531, "loss": 0.1881, "rewards/accuracies": 1.0, "rewards/chosen": -1.6620346307754517, "rewards/margins": 2.1766200065612793, "rewards/rejected": -3.8386547565460205, "step": 290 }, { "epoch": 10.129707112970712, "grad_norm": 5.115809440612793, "learning_rate": 2.785250302445062e-06, "logits/chosen": -2.2903695106506348, "logits/rejected": -2.2926692962646484, "logps/chosen": -104.5173110961914, "logps/rejected": -123.13216400146484, "loss": 0.1798, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7224146127700806, "rewards/margins": 2.3892369270324707, "rewards/rejected": -4.111651420593262, "step": 300 }, { "epoch": 10.464435146443515, "grad_norm": 5.882064342498779, "learning_rate": 2.6353472714635443e-06, "logits/chosen": -2.2836384773254395, "logits/rejected": -2.2969231605529785, "logps/chosen": -88.8235855102539, "logps/rejected": -119.67433166503906, "loss": 0.1558, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6937462091445923, "rewards/margins": 2.4059743881225586, "rewards/rejected": -4.0997209548950195, "step": 310 }, { "epoch": 10.799163179916318, "grad_norm": 6.9003376960754395, "learning_rate": 2.4849541490017868e-06, "logits/chosen": -2.289567232131958, "logits/rejected": -2.3216423988342285, "logps/chosen": -90.58432006835938, "logps/rejected": -118.13006591796875, "loss": 0.1538, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6574989557266235, "rewards/margins": 2.9354054927825928, "rewards/rejected": -4.592904567718506, "step": 320 }, { "epoch": 11.133891213389122, "grad_norm": 4.916522979736328, "learning_rate": 2.3346155074564712e-06, "logits/chosen": -2.2699310779571533, "logits/rejected": -2.3001017570495605, "logps/chosen": -100.2576675415039, "logps/rejected": -133.8759307861328, "loss": 0.1373, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.174388885498047, "rewards/margins": 3.038696050643921, "rewards/rejected": -5.213086128234863, "step": 330 }, { "epoch": 11.468619246861925, "grad_norm": 6.739722728729248, "learning_rate": 2.184875721949277e-06, "logits/chosen": -2.2740581035614014, "logits/rejected": -2.315854549407959, "logps/chosen": -83.28224182128906, "logps/rejected": -107.7516098022461, "loss": 0.1257, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.777440071105957, "rewards/margins": 2.704913377761841, "rewards/rejected": -4.482353687286377, "step": 340 }, { "epoch": 11.803347280334728, "grad_norm": 4.988001823425293, "learning_rate": 2.0362769991485514e-06, "logits/chosen": -2.2616047859191895, "logits/rejected": -2.2596449851989746, "logps/chosen": -107.07649230957031, "logps/rejected": -139.80697631835938, "loss": 0.1184, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.618734359741211, "rewards/margins": 3.291966199874878, "rewards/rejected": -5.910700798034668, "step": 350 }, { "epoch": 12.138075313807532, "grad_norm": 4.956677436828613, "learning_rate": 1.8893574139429226e-06, "logits/chosen": -2.233889102935791, "logits/rejected": -2.2601330280303955, "logps/chosen": -95.82877349853516, "logps/rejected": -138.9019775390625, "loss": 0.1106, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5194194316864014, "rewards/margins": 3.470710039138794, "rewards/rejected": -5.990128993988037, "step": 360 }, { "epoch": 12.472803347280335, "grad_norm": 4.895273208618164, "learning_rate": 1.744648961076068e-06, "logits/chosen": -2.2324471473693848, "logits/rejected": -2.233158588409424, "logps/chosen": -117.90779113769531, "logps/rejected": -141.53753662109375, "loss": 0.0907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7019529342651367, "rewards/margins": 3.4567368030548096, "rewards/rejected": -6.158689975738525, "step": 370 }, { "epoch": 12.807531380753138, "grad_norm": 5.789585590362549, "learning_rate": 1.602675628797636e-06, "logits/chosen": -2.2296676635742188, "logits/rejected": -2.2535061836242676, "logps/chosen": -117.69709777832031, "logps/rejected": -150.61538696289062, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": -3.4674232006073, "rewards/margins": 3.8480961322784424, "rewards/rejected": -7.3155198097229, "step": 380 }, { "epoch": 13.142259414225942, "grad_norm": 4.082385540008545, "learning_rate": 1.4639515015056205e-06, "logits/chosen": -2.232024908065796, "logits/rejected": -2.235680103302002, "logps/chosen": -96.60597229003906, "logps/rejected": -130.7404022216797, "loss": 0.0876, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.816174268722534, "rewards/margins": 3.2225749492645264, "rewards/rejected": -6.038748741149902, "step": 390 }, { "epoch": 13.476987447698745, "grad_norm": 4.423525333404541, "learning_rate": 1.328978898250525e-06, "logits/chosen": -2.2275261878967285, "logits/rejected": -2.2222421169281006, "logps/chosen": -107.16130065917969, "logps/rejected": -148.48500061035156, "loss": 0.0662, "rewards/accuracies": 1.0, "rewards/chosen": -3.10858154296875, "rewards/margins": 3.9508070945739746, "rewards/rejected": -7.059388637542725, "step": 400 }, { "epoch": 13.811715481171548, "grad_norm": 3.721898078918457, "learning_rate": 1.198246553841744e-06, "logits/chosen": -2.2333359718322754, "logits/rejected": -2.2442851066589355, "logps/chosen": -104.8399429321289, "logps/rejected": -137.98049926757812, "loss": 0.0808, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3306171894073486, "rewards/margins": 3.471170425415039, "rewards/rejected": -6.80178689956665, "step": 410 }, { "epoch": 14.146443514644352, "grad_norm": 4.411396026611328, "learning_rate": 1.0722278491423998e-06, "logits/chosen": -2.2033934593200684, "logits/rejected": -2.206735610961914, "logps/chosen": -122.04057312011719, "logps/rejected": -139.2510528564453, "loss": 0.0651, "rewards/accuracies": 1.0, "rewards/chosen": -3.4570648670196533, "rewards/margins": 3.5551300048828125, "rewards/rejected": -7.012194633483887, "step": 420 }, { "epoch": 14.481171548117155, "grad_norm": 4.514885902404785, "learning_rate": 9.513790969606926e-07, "logits/chosen": -2.1915841102600098, "logits/rejected": -2.23836088180542, "logps/chosen": -111.24171447753906, "logps/rejected": -159.8766326904297, "loss": 0.0609, "rewards/accuracies": 1.0, "rewards/chosen": -3.719008207321167, "rewards/margins": 4.095301628112793, "rewards/rejected": -7.814309597015381, "step": 430 }, { "epoch": 14.815899581589958, "grad_norm": 6.274470329284668, "learning_rate": 8.361378897445643e-07, "logits/chosen": -2.2278056144714355, "logits/rejected": -2.2360167503356934, "logps/chosen": -95.31124877929688, "logps/rejected": -136.5842742919922, "loss": 0.0624, "rewards/accuracies": 1.0, "rewards/chosen": -3.1528682708740234, "rewards/margins": 4.095580577850342, "rewards/rejected": -7.248448848724365, "step": 440 }, { "epoch": 15.150627615062762, "grad_norm": 4.49701452255249, "learning_rate": 7.269215150626391e-07, "logits/chosen": -2.196305513381958, "logits/rejected": -2.2363815307617188, "logps/chosen": -101.97003173828125, "logps/rejected": -151.15646362304688, "loss": 0.0513, "rewards/accuracies": 1.0, "rewards/chosen": -3.562624454498291, "rewards/margins": 4.104978084564209, "rewards/rejected": -7.667603492736816, "step": 450 }, { "epoch": 15.485355648535565, "grad_norm": 4.746140956878662, "learning_rate": 6.241254446089942e-07, "logits/chosen": -2.1973156929016113, "logits/rejected": -2.217236042022705, "logps/chosen": -108.36579895019531, "logps/rejected": -146.45358276367188, "loss": 0.0588, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8717312812805176, "rewards/margins": 3.9130451679229736, "rewards/rejected": -7.784776210784912, "step": 460 }, { "epoch": 15.820083682008368, "grad_norm": 2.910703182220459, "learning_rate": 5.281219022030423e-07, "logits/chosen": -2.1933655738830566, "logits/rejected": -2.193134307861328, "logps/chosen": -125.05366516113281, "logps/rejected": -158.47085571289062, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": -3.9004642963409424, "rewards/margins": 4.252579689025879, "rewards/rejected": -8.153043746948242, "step": 470 }, { "epoch": 16.15481171548117, "grad_norm": 2.814772367477417, "learning_rate": 4.392585159698087e-07, "logits/chosen": -2.1886072158813477, "logits/rejected": -2.1937201023101807, "logps/chosen": -113.6917724609375, "logps/rejected": -160.83851623535156, "loss": 0.0443, "rewards/accuracies": 1.0, "rewards/chosen": -3.779496669769287, "rewards/margins": 4.261423587799072, "rewards/rejected": -8.04092025756836, "step": 480 }, { "epoch": 16.489539748953973, "grad_norm": 3.579289197921753, "learning_rate": 3.578570595810274e-07, "logits/chosen": -2.19553542137146, "logits/rejected": -2.1956517696380615, "logps/chosen": -110.0953140258789, "logps/rejected": -165.99652099609375, "loss": 0.0483, "rewards/accuracies": 1.0, "rewards/chosen": -3.7303287982940674, "rewards/margins": 4.63196325302124, "rewards/rejected": -8.36229133605957, "step": 490 }, { "epoch": 16.824267782426777, "grad_norm": 4.428997039794922, "learning_rate": 2.8421228711503127e-07, "logits/chosen": -2.1704812049865723, "logits/rejected": -2.183809280395508, "logps/chosen": -99.66941833496094, "logps/rejected": -152.3459930419922, "loss": 0.0468, "rewards/accuracies": 1.0, "rewards/chosen": -3.7393298149108887, "rewards/margins": 4.549952030181885, "rewards/rejected": -8.289281845092773, "step": 500 }, { "epoch": 17.15899581589958, "grad_norm": 3.5141501426696777, "learning_rate": 2.1859086575439225e-07, "logits/chosen": -2.114220380783081, "logits/rejected": -2.1453700065612793, "logps/chosen": -119.66983795166016, "logps/rejected": -161.91326904296875, "loss": 0.0398, "rewards/accuracies": 1.0, "rewards/chosen": -4.162126064300537, "rewards/margins": 4.615090370178223, "rewards/rejected": -8.777216911315918, "step": 510 }, { "epoch": 17.493723849372383, "grad_norm": 3.1655192375183105, "learning_rate": 1.6123041018599766e-07, "logits/chosen": -2.1598916053771973, "logits/rejected": -2.151259660720825, "logps/chosen": -112.63690185546875, "logps/rejected": -166.2643280029297, "loss": 0.0436, "rewards/accuracies": 1.0, "rewards/chosen": -3.9895179271698, "rewards/margins": 4.71376895904541, "rewards/rejected": -8.703287124633789, "step": 520 }, { "epoch": 17.828451882845187, "grad_norm": 3.882448673248291, "learning_rate": 1.1233862220001168e-07, "logits/chosen": -2.1259069442749023, "logits/rejected": -2.1679906845092773, "logps/chosen": -125.42464447021484, "logps/rejected": -172.642822265625, "loss": 0.0477, "rewards/accuracies": 1.0, "rewards/chosen": -4.517868995666504, "rewards/margins": 4.534079551696777, "rewards/rejected": -9.051949501037598, "step": 530 }, { "epoch": 18.16317991631799, "grad_norm": 4.275852203369141, "learning_rate": 7.209253860320897e-08, "logits/chosen": -2.1740193367004395, "logits/rejected": -2.1897895336151123, "logps/chosen": -133.6866455078125, "logps/rejected": -160.42288208007812, "loss": 0.0408, "rewards/accuracies": 1.0, "rewards/chosen": -4.688433647155762, "rewards/margins": 4.159676551818848, "rewards/rejected": -8.848111152648926, "step": 540 }, { "epoch": 18.497907949790793, "grad_norm": 3.586958646774292, "learning_rate": 4.063789016999331e-08, "logits/chosen": -2.157022476196289, "logits/rejected": -2.179140567779541, "logps/chosen": -122.80704498291016, "logps/rejected": -170.018798828125, "loss": 0.0423, "rewards/accuracies": 1.0, "rewards/chosen": -4.441340923309326, "rewards/margins": 4.718876838684082, "rewards/rejected": -9.16021728515625, "step": 550 }, { "epoch": 18.8326359832636, "grad_norm": 2.9948108196258545, "learning_rate": 1.808857395232788e-08, "logits/chosen": -2.1356325149536133, "logits/rejected": -2.1427738666534424, "logps/chosen": -112.40225982666016, "logps/rejected": -166.0186767578125, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": -4.3510541915893555, "rewards/margins": 4.859889030456543, "rewards/rejected": -9.210943222045898, "step": 560 }, { "epoch": 19.1673640167364, "grad_norm": 3.9700310230255127, "learning_rate": 4.526240859345499e-09, "logits/chosen": -2.1602721214294434, "logits/rejected": -2.168781042098999, "logps/chosen": -125.03184509277344, "logps/rejected": -174.86129760742188, "loss": 0.041, "rewards/accuracies": 1.0, "rewards/chosen": -4.168228626251221, "rewards/margins": 4.846875190734863, "rewards/rejected": -9.015104293823242, "step": 570 }, { "epoch": 19.502092050209207, "grad_norm": 3.226668119430542, "learning_rate": 0.0, "logits/chosen": -2.183656692504883, "logits/rejected": -2.190368175506592, "logps/chosen": -107.56755065917969, "logps/rejected": -153.33026123046875, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.3311662673950195, "rewards/margins": 4.147943019866943, "rewards/rejected": -8.479108810424805, "step": 580 }, { "epoch": 19.502092050209207, "step": 580, "total_flos": 2.1306294447112192e+18, "train_loss": 0.1631378454381022, "train_runtime": 3724.909, "train_samples_per_second": 10.245, "train_steps_per_second": 0.156 } ], "logging_steps": 10, "max_steps": 580, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1306294447112192e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }