{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 2471, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004046944556859571, "grad_norm": 3260930.195416938, "learning_rate": 2.0161290322580643e-09, "logits/chosen": -2.216688871383667, "logits/rejected": -2.1725575923919678, "logps/chosen": -62.37783432006836, "logps/rejected": -57.61228561401367, "loss": 137728.9531, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.004046944556859571, "grad_norm": 3951641.9256235515, "learning_rate": 2.0161290322580644e-08, "logits/chosen": -2.3231096267700195, "logits/rejected": -2.3050363063812256, "logps/chosen": -109.29280090332031, "logps/rejected": -105.20187377929688, "loss": 128824.3056, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -8.181909652194008e-05, "rewards/margins": -8.144730236381292e-05, "rewards/rejected": -3.7179981404733553e-07, "step": 10 }, { "epoch": 0.008093889113719142, "grad_norm": 3636837.085798033, "learning_rate": 4.032258064516129e-08, "logits/chosen": -2.3102259635925293, "logits/rejected": -2.3181633949279785, "logps/chosen": -102.9901351928711, "logps/rejected": -103.0818099975586, "loss": 128439.1625, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.3925286970770685e-06, "rewards/margins": -7.394707154162461e-06, "rewards/rejected": 4.00217959395377e-06, "step": 20 }, { "epoch": 0.012140833670578713, "grad_norm": 4189016.609602417, "learning_rate": 6.048387096774194e-08, "logits/chosen": -2.2731196880340576, "logits/rejected": -2.261061191558838, "logps/chosen": -104.67350769042969, "logps/rejected": -116.59749603271484, "loss": 124740.475, "rewards/accuracies": 0.5, "rewards/chosen": -0.00028529245173558593, "rewards/margins": 2.6274694391759112e-05, "rewards/rejected": -0.00031156709883362055, "step": 30 }, { "epoch": 0.016187778227438283, "grad_norm": 3141568.670348898, "learning_rate": 8.064516129032257e-08, "logits/chosen": -2.3156943321228027, "logits/rejected": -2.294349193572998, "logps/chosen": -129.86062622070312, "logps/rejected": -117.5326156616211, "loss": 131411.2, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00011367227125447243, "rewards/margins": -1.9382667233003303e-05, "rewards/rejected": -9.428960038349032e-05, "step": 40 }, { "epoch": 0.020234722784297856, "grad_norm": 4300244.422627452, "learning_rate": 1.0080645161290321e-07, "logits/chosen": -2.271444320678711, "logits/rejected": -2.2707998752593994, "logps/chosen": -107.74246978759766, "logps/rejected": -112.56591796875, "loss": 128522.9375, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0001433270808774978, "rewards/margins": 8.896701183402911e-05, "rewards/rejected": -0.00023229411453939974, "step": 50 }, { "epoch": 0.024281667341157425, "grad_norm": 4087404.2243083506, "learning_rate": 1.2096774193548387e-07, "logits/chosen": -2.2509658336639404, "logits/rejected": -2.235924005508423, "logps/chosen": -98.1602783203125, "logps/rejected": -97.8387222290039, "loss": 134684.625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.988773187302286e-06, "rewards/margins": 0.0002523847797419876, "rewards/rejected": -0.00025637357612140477, "step": 60 }, { "epoch": 0.028328611898016998, "grad_norm": 3240131.8848123536, "learning_rate": 1.4112903225806453e-07, "logits/chosen": -2.3215599060058594, "logits/rejected": -2.3164916038513184, "logps/chosen": -113.9156265258789, "logps/rejected": -114.72650146484375, "loss": 127554.8875, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0005882935947738588, "rewards/margins": -0.0001803641061997041, "rewards/rejected": 0.000768657773733139, "step": 70 }, { "epoch": 0.03237555645487657, "grad_norm": 4463707.543855141, "learning_rate": 1.6129032258064515e-07, "logits/chosen": -2.197829246520996, "logits/rejected": -2.2096786499023438, "logps/chosen": -99.81291198730469, "logps/rejected": -96.83836364746094, "loss": 129532.875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 3.9735146856401116e-05, "rewards/margins": 0.00038970523746684194, "rewards/rejected": -0.0003499701269902289, "step": 80 }, { "epoch": 0.036422501011736136, "grad_norm": 5504977.483755038, "learning_rate": 1.814516129032258e-07, "logits/chosen": -2.2197558879852295, "logits/rejected": -2.200068712234497, "logps/chosen": -112.21453857421875, "logps/rejected": -110.07649993896484, "loss": 132607.275, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00027725097606889904, "rewards/margins": 0.00037192669697105885, "rewards/rejected": -0.0006491777021437883, "step": 90 }, { "epoch": 0.04046944556859571, "grad_norm": 3275423.408726695, "learning_rate": 2.0161290322580642e-07, "logits/chosen": -2.2803494930267334, "logits/rejected": -2.277498245239258, "logps/chosen": -118.47029876708984, "logps/rejected": -121.81834411621094, "loss": 129364.775, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00027665990637615323, "rewards/margins": 0.0005457916995510459, "rewards/rejected": -0.00026913188048638403, "step": 100 }, { "epoch": 0.04451639012545528, "grad_norm": 3537595.5413078354, "learning_rate": 2.2177419354838707e-07, "logits/chosen": -2.2598938941955566, "logits/rejected": -2.243565797805786, "logps/chosen": -123.01219177246094, "logps/rejected": -127.5718994140625, "loss": 128605.475, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.001143028261139989, "rewards/margins": 0.0005904460558667779, "rewards/rejected": -0.0017334744334220886, "step": 110 }, { "epoch": 0.04856333468231485, "grad_norm": 4700408.411362441, "learning_rate": 2.4193548387096775e-07, "logits/chosen": -2.189763307571411, "logits/rejected": -2.230834484100342, "logps/chosen": -111.58349609375, "logps/rejected": -116.3633041381836, "loss": 132832.7375, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0004227511235512793, "rewards/margins": -0.00036064969026483595, "rewards/rejected": -6.210146966623142e-05, "step": 120 }, { "epoch": 0.052610279239174426, "grad_norm": 4839768.05175113, "learning_rate": 2.6209677419354835e-07, "logits/chosen": -2.172719717025757, "logits/rejected": -2.154069423675537, "logps/chosen": -131.54049682617188, "logps/rejected": -127.31382751464844, "loss": 126528.7375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0012601370690390468, "rewards/margins": 0.0014206544728949666, "rewards/rejected": -0.0026807915419340134, "step": 130 }, { "epoch": 0.056657223796033995, "grad_norm": 4462123.98520813, "learning_rate": 2.8225806451612905e-07, "logits/chosen": -2.2781708240509033, "logits/rejected": -2.2529187202453613, "logps/chosen": -109.3487319946289, "logps/rejected": -108.7385025024414, "loss": 128939.875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0024039470590651035, "rewards/margins": 0.0019583911634981632, "rewards/rejected": -0.004362338222563267, "step": 140 }, { "epoch": 0.060704168352893564, "grad_norm": 4413918.737440498, "learning_rate": 3.0241935483870965e-07, "logits/chosen": -2.0262560844421387, "logits/rejected": -2.0333077907562256, "logps/chosen": -115.6955337524414, "logps/rejected": -129.337890625, "loss": 125950.125, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -3.777583970077103e-06, "rewards/margins": -0.00044618500396609306, "rewards/rejected": 0.0004424075596034527, "step": 150 }, { "epoch": 0.06475111290975313, "grad_norm": 4956705.44191673, "learning_rate": 3.225806451612903e-07, "logits/chosen": -2.127880096435547, "logits/rejected": -2.081531524658203, "logps/chosen": -115.7586669921875, "logps/rejected": -115.57160949707031, "loss": 127159.3875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0029864097014069557, "rewards/margins": 0.0028749522753059864, "rewards/rejected": -0.005861361511051655, "step": 160 }, { "epoch": 0.0687980574666127, "grad_norm": 5249631.129700843, "learning_rate": 3.4274193548387095e-07, "logits/chosen": -1.924232840538025, "logits/rejected": -1.9467108249664307, "logps/chosen": -130.4487762451172, "logps/rejected": -133.85560607910156, "loss": 125375.3875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008038095198571682, "rewards/margins": 0.002187486505135894, "rewards/rejected": -0.01022558193653822, "step": 170 }, { "epoch": 0.07284500202347227, "grad_norm": 4122924.0724422527, "learning_rate": 3.629032258064516e-07, "logits/chosen": -2.045342445373535, "logits/rejected": -2.0415282249450684, "logps/chosen": -118.37638854980469, "logps/rejected": -112.38387298583984, "loss": 126785.075, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.006844646297395229, "rewards/margins": 0.0005232656258158386, "rewards/rejected": -0.00736791267991066, "step": 180 }, { "epoch": 0.07689194658033185, "grad_norm": 4041640.919843376, "learning_rate": 3.8306451612903225e-07, "logits/chosen": -2.0350680351257324, "logits/rejected": -2.0383336544036865, "logps/chosen": -96.37462615966797, "logps/rejected": -109.77508544921875, "loss": 123590.025, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005209728144109249, "rewards/margins": 0.0012681197840720415, "rewards/rejected": -0.006477847695350647, "step": 190 }, { "epoch": 0.08093889113719142, "grad_norm": 5009394.271837804, "learning_rate": 4.0322580645161285e-07, "logits/chosen": -2.0026402473449707, "logits/rejected": -1.9795843362808228, "logps/chosen": -111.41777038574219, "logps/rejected": -113.9148178100586, "loss": 126448.05, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.010595456697046757, "rewards/margins": 0.003854970680549741, "rewards/rejected": -0.01445042621344328, "step": 200 }, { "epoch": 0.08498583569405099, "grad_norm": 5619272.0636549145, "learning_rate": 4.2338709677419355e-07, "logits/chosen": -2.1407713890075684, "logits/rejected": -2.1545071601867676, "logps/chosen": -110.00148010253906, "logps/rejected": -112.6539077758789, "loss": 128766.15, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009294772520661354, "rewards/margins": 0.0027922452427446842, "rewards/rejected": -0.012087016366422176, "step": 210 }, { "epoch": 0.08903278025091056, "grad_norm": 4415731.805933493, "learning_rate": 4.4354838709677415e-07, "logits/chosen": -2.3430728912353516, "logits/rejected": -2.3087539672851562, "logps/chosen": -131.23556518554688, "logps/rejected": -134.5377655029297, "loss": 132823.0125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007352087646722794, "rewards/margins": 0.004128883592784405, "rewards/rejected": -0.011480971239507198, "step": 220 }, { "epoch": 0.09307972480777013, "grad_norm": 5355155.642181672, "learning_rate": 4.637096774193548e-07, "logits/chosen": -2.242619752883911, "logits/rejected": -2.2341275215148926, "logps/chosen": -127.72953033447266, "logps/rejected": -131.52122497558594, "loss": 126450.9125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01162335742264986, "rewards/margins": 0.005030062980949879, "rewards/rejected": -0.01665342040359974, "step": 230 }, { "epoch": 0.0971266693646297, "grad_norm": 4222976.278932744, "learning_rate": 4.838709677419355e-07, "logits/chosen": -2.2030246257781982, "logits/rejected": -2.2015702724456787, "logps/chosen": -108.4349365234375, "logps/rejected": -110.720703125, "loss": 133174.225, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.009825309738516808, "rewards/margins": 0.003020837437361479, "rewards/rejected": -0.012846146710216999, "step": 240 }, { "epoch": 0.10117361392148927, "grad_norm": 5332420.53657513, "learning_rate": 4.995501574448943e-07, "logits/chosen": -2.1023497581481934, "logits/rejected": -2.1095871925354004, "logps/chosen": -110.6066665649414, "logps/rejected": -117.8967056274414, "loss": 127655.45, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006971948780119419, "rewards/margins": 0.0035569421015679836, "rewards/rejected": -0.010528890416026115, "step": 250 }, { "epoch": 0.10522055847834885, "grad_norm": 5679375.003121498, "learning_rate": 4.973009446693657e-07, "logits/chosen": -2.199481964111328, "logits/rejected": -2.18941330909729, "logps/chosen": -117.3616943359375, "logps/rejected": -118.54112243652344, "loss": 132409.6875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.013199470937252045, "rewards/margins": 0.001887517748400569, "rewards/rejected": -0.015086987987160683, "step": 260 }, { "epoch": 0.10926750303520842, "grad_norm": 4610431.437626582, "learning_rate": 4.950517318938372e-07, "logits/chosen": -2.3225109577178955, "logits/rejected": -2.3390707969665527, "logps/chosen": -124.8027572631836, "logps/rejected": -129.42922973632812, "loss": 125030.1375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.012582078576087952, "rewards/margins": 0.0034210742451250553, "rewards/rejected": -0.01600315235555172, "step": 270 }, { "epoch": 0.11331444759206799, "grad_norm": 6556255.599818757, "learning_rate": 4.928025191183086e-07, "logits/chosen": -2.1859679222106934, "logits/rejected": -2.2081589698791504, "logps/chosen": -114.65242767333984, "logps/rejected": -124.5443344116211, "loss": 124704.9, "rewards/accuracies": 0.5625, "rewards/chosen": -0.016057247295975685, "rewards/margins": 0.002059857128188014, "rewards/rejected": -0.018117103725671768, "step": 280 }, { "epoch": 0.11736139214892756, "grad_norm": 6805409.426599127, "learning_rate": 4.9055330634278e-07, "logits/chosen": -2.219308853149414, "logits/rejected": -2.214458465576172, "logps/chosen": -134.7147979736328, "logps/rejected": -142.11083984375, "loss": 127160.525, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.009862681850790977, "rewards/margins": 0.005548264365643263, "rewards/rejected": -0.015410944819450378, "step": 290 }, { "epoch": 0.12140833670578713, "grad_norm": 6068819.6586095495, "learning_rate": 4.883040935672515e-07, "logits/chosen": -2.258293390274048, "logits/rejected": -2.228738307952881, "logps/chosen": -132.33441162109375, "logps/rejected": -141.69737243652344, "loss": 128142.7625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008503363467752934, "rewards/margins": 0.007645074278116226, "rewards/rejected": -0.016148436814546585, "step": 300 }, { "epoch": 0.1254552812626467, "grad_norm": 5267448.920762056, "learning_rate": 4.860548807917229e-07, "logits/chosen": -2.2171027660369873, "logits/rejected": -2.2104790210723877, "logps/chosen": -125.05142974853516, "logps/rejected": -133.34071350097656, "loss": 125674.1, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.019455790519714355, "rewards/margins": 0.0074443453922867775, "rewards/rejected": -0.026900136843323708, "step": 310 }, { "epoch": 0.12950222581950627, "grad_norm": 6667685.083680488, "learning_rate": 4.838056680161944e-07, "logits/chosen": -2.1860244274139404, "logits/rejected": -2.2035775184631348, "logps/chosen": -122.4665756225586, "logps/rejected": -132.46490478515625, "loss": 125480.4125, "rewards/accuracies": 0.625, "rewards/chosen": -0.017699861899018288, "rewards/margins": 0.006000404246151447, "rewards/rejected": -0.02370026707649231, "step": 320 }, { "epoch": 0.13354917037636585, "grad_norm": 6064623.088294091, "learning_rate": 4.815564552406658e-07, "logits/chosen": -2.0421011447906494, "logits/rejected": -2.057572603225708, "logps/chosen": -134.10183715820312, "logps/rejected": -144.3116912841797, "loss": 124604.7875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.019064148887991905, "rewards/margins": 0.006390860769897699, "rewards/rejected": -0.025455012917518616, "step": 330 }, { "epoch": 0.1375961149332254, "grad_norm": 12560788.46443257, "learning_rate": 4.793072424651372e-07, "logits/chosen": -1.9278684854507446, "logits/rejected": -1.909166693687439, "logps/chosen": -146.60585021972656, "logps/rejected": -166.07937622070312, "loss": 140379.8375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0366508811712265, "rewards/margins": 0.013624541461467743, "rewards/rejected": -0.050275422632694244, "step": 340 }, { "epoch": 0.141643059490085, "grad_norm": 6487628.638191885, "learning_rate": 4.770580296896087e-07, "logits/chosen": -2.11842679977417, "logits/rejected": -2.1011595726013184, "logps/chosen": -119.56195068359375, "logps/rejected": -136.84823608398438, "loss": 130511.3625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0215927604585886, "rewards/margins": 0.007697033230215311, "rewards/rejected": -0.029289793223142624, "step": 350 }, { "epoch": 0.14569000404694454, "grad_norm": 4799050.3946148325, "learning_rate": 4.7480881691408005e-07, "logits/chosen": -2.0867960453033447, "logits/rejected": -2.082698345184326, "logps/chosen": -128.99429321289062, "logps/rejected": -130.89785766601562, "loss": 127926.0, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.016279883682727814, "rewards/margins": 0.0010558776557445526, "rewards/rejected": -0.017335761338472366, "step": 360 }, { "epoch": 0.14973694860380413, "grad_norm": 5249060.947314388, "learning_rate": 4.725596041385515e-07, "logits/chosen": -2.1251657009124756, "logits/rejected": -2.1052744388580322, "logps/chosen": -121.3799819946289, "logps/rejected": -121.27701568603516, "loss": 131676.2375, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.017586207017302513, "rewards/margins": 0.003813033225014806, "rewards/rejected": -0.0213992390781641, "step": 370 }, { "epoch": 0.1537838931606637, "grad_norm": 5293517.066249103, "learning_rate": 4.7031039136302294e-07, "logits/chosen": -2.15531587600708, "logits/rejected": -2.153560161590576, "logps/chosen": -159.96005249023438, "logps/rejected": -153.87448120117188, "loss": 121504.05, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.017819028347730637, "rewards/margins": 0.008313515223562717, "rewards/rejected": -0.02613254450261593, "step": 380 }, { "epoch": 0.15783083771752326, "grad_norm": 5270002.835932803, "learning_rate": 4.6806117858749433e-07, "logits/chosen": -2.1870741844177246, "logits/rejected": -2.171494483947754, "logps/chosen": -148.86929321289062, "logps/rejected": -164.97915649414062, "loss": 129892.05, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02027943730354309, "rewards/margins": 0.0176930520683527, "rewards/rejected": -0.03797249123454094, "step": 390 }, { "epoch": 0.16187778227438285, "grad_norm": 4898003.511863262, "learning_rate": 4.658119658119658e-07, "logits/chosen": -2.1435184478759766, "logits/rejected": -2.148679256439209, "logps/chosen": -128.7902069091797, "logps/rejected": -139.18150329589844, "loss": 122692.925, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.018103724345564842, "rewards/margins": 0.006611344870179892, "rewards/rejected": -0.02471506968140602, "step": 400 }, { "epoch": 0.1659247268312424, "grad_norm": 4183644.18979808, "learning_rate": 4.635627530364372e-07, "logits/chosen": -2.150381565093994, "logits/rejected": -2.154317617416382, "logps/chosen": -108.93717193603516, "logps/rejected": -118.07032775878906, "loss": 126758.0375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.021253790706396103, "rewards/margins": 0.003508577588945627, "rewards/rejected": -0.024762369692325592, "step": 410 }, { "epoch": 0.16997167138810199, "grad_norm": 5593714.467836783, "learning_rate": 4.6131354026090867e-07, "logits/chosen": -2.180170774459839, "logits/rejected": -2.1523594856262207, "logps/chosen": -126.38621520996094, "logps/rejected": -136.35755920410156, "loss": 121196.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.01847546175122261, "rewards/margins": 0.0067595853470265865, "rewards/rejected": -0.025235047563910484, "step": 420 }, { "epoch": 0.17401861594496154, "grad_norm": 3566616.7901411816, "learning_rate": 4.590643274853801e-07, "logits/chosen": -2.120450258255005, "logits/rejected": -2.150542974472046, "logps/chosen": -137.63836669921875, "logps/rejected": -141.17825317382812, "loss": 132284.5875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02999441884458065, "rewards/margins": 0.0025890106335282326, "rewards/rejected": -0.03258342668414116, "step": 430 }, { "epoch": 0.17806556050182112, "grad_norm": 6039791.29116107, "learning_rate": 4.568151147098515e-07, "logits/chosen": -2.2097067832946777, "logits/rejected": -2.1825873851776123, "logps/chosen": -127.94209289550781, "logps/rejected": -137.39776611328125, "loss": 128589.475, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023606717586517334, "rewards/margins": 0.009609794244170189, "rewards/rejected": -0.03321651369333267, "step": 440 }, { "epoch": 0.1821125050586807, "grad_norm": 6343148.886392033, "learning_rate": 4.54565901934323e-07, "logits/chosen": -2.1717894077301025, "logits/rejected": -2.2131998538970947, "logps/chosen": -129.89688110351562, "logps/rejected": -145.33839416503906, "loss": 124381.275, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020499037578701973, "rewards/margins": 0.013838306069374084, "rewards/rejected": -0.03433733806014061, "step": 450 }, { "epoch": 0.18615944961554026, "grad_norm": 4981408.5070092585, "learning_rate": 4.523166891587944e-07, "logits/chosen": -2.2632086277008057, "logits/rejected": -2.306267738342285, "logps/chosen": -163.80706787109375, "logps/rejected": -155.72915649414062, "loss": 158881.6375, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.03987263888120651, "rewards/margins": -0.008330432698130608, "rewards/rejected": -0.03154221177101135, "step": 460 }, { "epoch": 0.19020639417239985, "grad_norm": 6186406.38261752, "learning_rate": 4.500674763832658e-07, "logits/chosen": -2.4067013263702393, "logits/rejected": -2.4073116779327393, "logps/chosen": -123.8814697265625, "logps/rejected": -133.23178100585938, "loss": 129765.4625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01776200719177723, "rewards/margins": 0.006926923058927059, "rewards/rejected": -0.024688933044672012, "step": 470 }, { "epoch": 0.1942533387292594, "grad_norm": 7924184.670127909, "learning_rate": 4.478182636077373e-07, "logits/chosen": -2.4064009189605713, "logits/rejected": -2.3933303356170654, "logps/chosen": -120.53520202636719, "logps/rejected": -124.30986022949219, "loss": 127188.5875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013866530731320381, "rewards/margins": 0.0045671057887375355, "rewards/rejected": -0.01843363419175148, "step": 480 }, { "epoch": 0.19830028328611898, "grad_norm": 6796881.168124855, "learning_rate": 4.455690508322087e-07, "logits/chosen": -2.35581636428833, "logits/rejected": -2.276433229446411, "logps/chosen": -113.40742492675781, "logps/rejected": -126.89019775390625, "loss": 122585.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.015190203674137592, "rewards/margins": 0.010421663522720337, "rewards/rejected": -0.025611868128180504, "step": 490 }, { "epoch": 0.20234722784297854, "grad_norm": 9409785.188721178, "learning_rate": 4.433198380566802e-07, "logits/chosen": -2.200453519821167, "logits/rejected": -2.2011332511901855, "logps/chosen": -156.01809692382812, "logps/rejected": -169.92514038085938, "loss": 129704.3, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025881418958306313, "rewards/margins": 0.010276483371853828, "rewards/rejected": -0.03615789860486984, "step": 500 }, { "epoch": 0.20639417239983812, "grad_norm": 5757712.5781175345, "learning_rate": 4.410706252811516e-07, "logits/chosen": -2.127547025680542, "logits/rejected": -2.1388392448425293, "logps/chosen": -130.27249145507812, "logps/rejected": -145.90647888183594, "loss": 123361.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03281703591346741, "rewards/margins": 0.009785661473870277, "rewards/rejected": -0.042602695524692535, "step": 510 }, { "epoch": 0.2104411169566977, "grad_norm": 5742087.523014036, "learning_rate": 4.3882141250562297e-07, "logits/chosen": -2.2757978439331055, "logits/rejected": -2.2460601329803467, "logps/chosen": -153.6471710205078, "logps/rejected": -165.54989624023438, "loss": 127158.9125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02743702568113804, "rewards/margins": 0.017125947400927544, "rewards/rejected": -0.04456297308206558, "step": 520 }, { "epoch": 0.21448806151355726, "grad_norm": 6000988.402036818, "learning_rate": 4.3657219973009447e-07, "logits/chosen": -2.14945387840271, "logits/rejected": -2.160613775253296, "logps/chosen": -152.8687286376953, "logps/rejected": -157.02215576171875, "loss": 130855.4125, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.03712679445743561, "rewards/margins": -0.002356339478865266, "rewards/rejected": -0.03477045148611069, "step": 530 }, { "epoch": 0.21853500607041684, "grad_norm": 7039581.88958706, "learning_rate": 4.3432298695456586e-07, "logits/chosen": -2.1952900886535645, "logits/rejected": -2.125767946243286, "logps/chosen": -121.56607818603516, "logps/rejected": -136.8863983154297, "loss": 124032.45, "rewards/accuracies": 0.6875, "rewards/chosen": -0.021455224603414536, "rewards/margins": 0.01183997467160225, "rewards/rejected": -0.03329520300030708, "step": 540 }, { "epoch": 0.2225819506272764, "grad_norm": 6851510.087607766, "learning_rate": 4.3207377417903736e-07, "logits/chosen": -2.3099186420440674, "logits/rejected": -2.2750840187072754, "logps/chosen": -133.94058227539062, "logps/rejected": -165.82687377929688, "loss": 127159.35, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02089579775929451, "rewards/margins": 0.011938202194869518, "rewards/rejected": -0.0328340008854866, "step": 550 }, { "epoch": 0.22662889518413598, "grad_norm": 7651455.0301742535, "learning_rate": 4.2982456140350876e-07, "logits/chosen": -2.281270980834961, "logits/rejected": -2.291888475418091, "logps/chosen": -139.83163452148438, "logps/rejected": -141.5286865234375, "loss": 130547.225, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02193923108279705, "rewards/margins": 0.0074460976757109165, "rewards/rejected": -0.02938532829284668, "step": 560 }, { "epoch": 0.23067583974099554, "grad_norm": 4842418.287253727, "learning_rate": 4.2757534862798015e-07, "logits/chosen": -2.28908634185791, "logits/rejected": -2.2613823413848877, "logps/chosen": -130.56756591796875, "logps/rejected": -136.48858642578125, "loss": 129810.7, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018774276599287987, "rewards/margins": 0.012544331140816212, "rewards/rejected": -0.03131860867142677, "step": 570 }, { "epoch": 0.23472278429785512, "grad_norm": 5753286.585250832, "learning_rate": 4.2532613585245165e-07, "logits/chosen": -2.3290882110595703, "logits/rejected": -2.2913310527801514, "logps/chosen": -128.60073852539062, "logps/rejected": -144.4147491455078, "loss": 125407.5625, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.022057032212615013, "rewards/margins": 0.013317006640136242, "rewards/rejected": -0.03537403792142868, "step": 580 }, { "epoch": 0.2387697288547147, "grad_norm": 6854533.186683347, "learning_rate": 4.2307692307692304e-07, "logits/chosen": -2.1821513175964355, "logits/rejected": -2.227368116378784, "logps/chosen": -132.9744873046875, "logps/rejected": -143.91380310058594, "loss": 119907.075, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.024536501616239548, "rewards/margins": 0.00792471133172512, "rewards/rejected": -0.03246121481060982, "step": 590 }, { "epoch": 0.24281667341157426, "grad_norm": 7000163.800918494, "learning_rate": 4.208277103013945e-07, "logits/chosen": -2.2966506481170654, "logits/rejected": -2.274991989135742, "logps/chosen": -140.1864776611328, "logps/rejected": -142.9268798828125, "loss": 129494.7625, "rewards/accuracies": 0.625, "rewards/chosen": -0.026311520487070084, "rewards/margins": 0.005165449343621731, "rewards/rejected": -0.03147696703672409, "step": 600 }, { "epoch": 0.24686361796843384, "grad_norm": 5155538.44716785, "learning_rate": 4.1857849752586593e-07, "logits/chosen": -2.2126269340515137, "logits/rejected": -2.2339818477630615, "logps/chosen": -143.7578125, "logps/rejected": -148.81027221679688, "loss": 131088.325, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.022633636370301247, "rewards/margins": 0.005649174097925425, "rewards/rejected": -0.028282811865210533, "step": 610 }, { "epoch": 0.2509105625252934, "grad_norm": 6494761.148749808, "learning_rate": 4.1632928475033733e-07, "logits/chosen": -2.2412619590759277, "logits/rejected": -2.215108633041382, "logps/chosen": -133.82061767578125, "logps/rejected": -144.2487030029297, "loss": 127834.35, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.023778211325407028, "rewards/margins": 0.008780455216765404, "rewards/rejected": -0.03255866840481758, "step": 620 }, { "epoch": 0.254957507082153, "grad_norm": 6581411.527197339, "learning_rate": 4.140800719748088e-07, "logits/chosen": -2.3006882667541504, "logits/rejected": -2.279165744781494, "logps/chosen": -127.95011901855469, "logps/rejected": -144.5232696533203, "loss": 128899.5125, "rewards/accuracies": 0.625, "rewards/chosen": -0.017114771530032158, "rewards/margins": 0.012585528194904327, "rewards/rejected": -0.029700294137001038, "step": 630 }, { "epoch": 0.25900445163901253, "grad_norm": 6993144.620436077, "learning_rate": 4.118308591992802e-07, "logits/chosen": -2.288159132003784, "logits/rejected": -2.27152681350708, "logps/chosen": -116.51515197753906, "logps/rejected": -134.83572387695312, "loss": 122510.6375, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.014996351674199104, "rewards/margins": 0.018196506425738335, "rewards/rejected": -0.03319285809993744, "step": 640 }, { "epoch": 0.2630513961958721, "grad_norm": 5352708.94864355, "learning_rate": 4.0958164642375167e-07, "logits/chosen": -2.33659029006958, "logits/rejected": -2.3185806274414062, "logps/chosen": -143.27899169921875, "logps/rejected": -154.21240234375, "loss": 128047.15, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.022664163261651993, "rewards/margins": 0.016272926703095436, "rewards/rejected": -0.03893708810210228, "step": 650 }, { "epoch": 0.2670983407527317, "grad_norm": 5853928.770602061, "learning_rate": 4.073324336482231e-07, "logits/chosen": -2.2209713459014893, "logits/rejected": -2.197364091873169, "logps/chosen": -154.97152709960938, "logps/rejected": -164.9137725830078, "loss": 126285.6125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02928345464169979, "rewards/margins": 0.017678027972579002, "rewards/rejected": -0.046961478888988495, "step": 660 }, { "epoch": 0.27114528530959126, "grad_norm": 5468563.620033422, "learning_rate": 4.0508322087269456e-07, "logits/chosen": -2.368302822113037, "logits/rejected": -2.359222888946533, "logps/chosen": -138.3487091064453, "logps/rejected": -131.19773864746094, "loss": 135010.325, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.022097600623965263, "rewards/margins": -0.0010355912381783128, "rewards/rejected": -0.021062009036540985, "step": 670 }, { "epoch": 0.2751922298664508, "grad_norm": 5145007.282508669, "learning_rate": 4.02834008097166e-07, "logits/chosen": -2.2279224395751953, "logits/rejected": -2.227818250656128, "logps/chosen": -151.80599975585938, "logps/rejected": -155.39369201660156, "loss": 124851.875, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023444540798664093, "rewards/margins": 0.006362411193549633, "rewards/rejected": -0.0298069529235363, "step": 680 }, { "epoch": 0.2792391744233104, "grad_norm": 5800338.778716969, "learning_rate": 4.005847953216374e-07, "logits/chosen": -2.3348867893218994, "logits/rejected": -2.3266310691833496, "logps/chosen": -125.41386413574219, "logps/rejected": -131.49343872070312, "loss": 127372.8125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021091241389513016, "rewards/margins": 0.00724734365940094, "rewards/rejected": -0.028338585048913956, "step": 690 }, { "epoch": 0.28328611898017, "grad_norm": 8105894.218684362, "learning_rate": 3.9833558254610884e-07, "logits/chosen": -2.309593677520752, "logits/rejected": -2.2981934547424316, "logps/chosen": -132.08596801757812, "logps/rejected": -137.70201110839844, "loss": 124781.725, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.021190345287322998, "rewards/margins": 0.0066665285266935825, "rewards/rejected": -0.027856875211000443, "step": 700 }, { "epoch": 0.28733306353702953, "grad_norm": 5039380.169629858, "learning_rate": 3.960863697705803e-07, "logits/chosen": -2.315074920654297, "logits/rejected": -2.3194656372070312, "logps/chosen": -147.24713134765625, "logps/rejected": -158.99636840820312, "loss": 128105.9625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.018308859318494797, "rewards/margins": 0.00702436501160264, "rewards/rejected": -0.0253332257270813, "step": 710 }, { "epoch": 0.2913800080938891, "grad_norm": 6770507.385238732, "learning_rate": 3.9383715699505173e-07, "logits/chosen": -2.3582499027252197, "logits/rejected": -2.307143211364746, "logps/chosen": -141.00454711914062, "logps/rejected": -145.4442901611328, "loss": 128073.85, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02170463278889656, "rewards/margins": 0.004128533415496349, "rewards/rejected": -0.025833168998360634, "step": 720 }, { "epoch": 0.2954269526507487, "grad_norm": 6843599.0452563455, "learning_rate": 3.9158794421952313e-07, "logits/chosen": -2.2773690223693848, "logits/rejected": -2.2705273628234863, "logps/chosen": -127.78352355957031, "logps/rejected": -128.8694305419922, "loss": 133363.3375, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.019749607890844345, "rewards/margins": 0.0013956364709883928, "rewards/rejected": -0.02114524319767952, "step": 730 }, { "epoch": 0.29947389720760825, "grad_norm": 6414207.014030725, "learning_rate": 3.893387314439946e-07, "logits/chosen": -2.2219457626342773, "logits/rejected": -2.1614620685577393, "logps/chosen": -138.95530700683594, "logps/rejected": -159.24916076660156, "loss": 125832.575, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01657973788678646, "rewards/margins": 0.015021143481135368, "rewards/rejected": -0.03160088509321213, "step": 740 }, { "epoch": 0.3035208417644678, "grad_norm": 6251391.785537995, "learning_rate": 3.87089518668466e-07, "logits/chosen": -2.216029167175293, "logits/rejected": -2.2095859050750732, "logps/chosen": -139.25477600097656, "logps/rejected": -146.38613891601562, "loss": 126431.6625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02343796379864216, "rewards/margins": 0.012653100304305553, "rewards/rejected": -0.03609105944633484, "step": 750 }, { "epoch": 0.3075677863213274, "grad_norm": 5534957.115190962, "learning_rate": 3.8484030589293747e-07, "logits/chosen": -2.2073702812194824, "logits/rejected": -2.209057569503784, "logps/chosen": -130.53199768066406, "logps/rejected": -137.91250610351562, "loss": 127669.2, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.01518569327890873, "rewards/margins": 0.007037720177322626, "rewards/rejected": -0.02222341299057007, "step": 760 }, { "epoch": 0.311614730878187, "grad_norm": 4890109.310049175, "learning_rate": 3.825910931174089e-07, "logits/chosen": -2.225956678390503, "logits/rejected": -2.210540294647217, "logps/chosen": -127.26595306396484, "logps/rejected": -133.6049041748047, "loss": 124534.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02195551246404648, "rewards/margins": 0.006126697175204754, "rewards/rejected": -0.02808220684528351, "step": 770 }, { "epoch": 0.31566167543504653, "grad_norm": 6427608.185533696, "learning_rate": 3.803418803418803e-07, "logits/chosen": -2.2634310722351074, "logits/rejected": -2.245199203491211, "logps/chosen": -137.40240478515625, "logps/rejected": -143.7775115966797, "loss": 129704.1875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.022509312257170677, "rewards/margins": 0.003963272087275982, "rewards/rejected": -0.026472587138414383, "step": 780 }, { "epoch": 0.3197086199919061, "grad_norm": 6730619.873094239, "learning_rate": 3.7809266756635175e-07, "logits/chosen": -2.1104772090911865, "logits/rejected": -2.0919671058654785, "logps/chosen": -125.5869369506836, "logps/rejected": -133.48800659179688, "loss": 125677.675, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.024079788476228714, "rewards/margins": 0.007324723992496729, "rewards/rejected": -0.0314045175909996, "step": 790 }, { "epoch": 0.3237555645487657, "grad_norm": 6156066.531026818, "learning_rate": 3.758434547908232e-07, "logits/chosen": -2.213543176651001, "logits/rejected": -2.1960647106170654, "logps/chosen": -145.46665954589844, "logps/rejected": -159.2154541015625, "loss": 121552.525, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.021815134212374687, "rewards/margins": 0.014243106357753277, "rewards/rejected": -0.03605823963880539, "step": 800 }, { "epoch": 0.32780250910562525, "grad_norm": 6503545.886073305, "learning_rate": 3.735942420152946e-07, "logits/chosen": -2.120095729827881, "logits/rejected": -2.0986738204956055, "logps/chosen": -134.55508422851562, "logps/rejected": -152.37815856933594, "loss": 122828.6875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.025746628642082214, "rewards/margins": 0.01750759594142437, "rewards/rejected": -0.043254222720861435, "step": 810 }, { "epoch": 0.3318494536624848, "grad_norm": 5263993.227861122, "learning_rate": 3.713450292397661e-07, "logits/chosen": -2.236570358276367, "logits/rejected": -2.216663360595703, "logps/chosen": -137.65792846679688, "logps/rejected": -137.9815673828125, "loss": 125940.1375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.025968383997678757, "rewards/margins": 0.009893245995044708, "rewards/rejected": -0.03586163371801376, "step": 820 }, { "epoch": 0.3358963982193444, "grad_norm": 5564470.498348531, "learning_rate": 3.690958164642375e-07, "logits/chosen": -2.2721188068389893, "logits/rejected": -2.2634165287017822, "logps/chosen": -146.41432189941406, "logps/rejected": -148.6261749267578, "loss": 130783.825, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.028409641236066818, "rewards/margins": 0.013621616177260876, "rewards/rejected": -0.04203125834465027, "step": 830 }, { "epoch": 0.33994334277620397, "grad_norm": 4256533.420167086, "learning_rate": 3.66846603688709e-07, "logits/chosen": -2.355905532836914, "logits/rejected": -2.3262717723846436, "logps/chosen": -135.9013671875, "logps/rejected": -144.34478759765625, "loss": 126088.525, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02681833505630493, "rewards/margins": 0.009647052735090256, "rewards/rejected": -0.03646538779139519, "step": 840 }, { "epoch": 0.3439902873330635, "grad_norm": 6179484.090448809, "learning_rate": 3.645973909131804e-07, "logits/chosen": -2.2327308654785156, "logits/rejected": -2.194852828979492, "logps/chosen": -131.39376831054688, "logps/rejected": -155.78671264648438, "loss": 125825.075, "rewards/accuracies": 0.625, "rewards/chosen": -0.02463443949818611, "rewards/margins": 0.01199124101549387, "rewards/rejected": -0.03662567585706711, "step": 850 }, { "epoch": 0.3480372318899231, "grad_norm": 5448182.802456733, "learning_rate": 3.6234817813765177e-07, "logits/chosen": -2.2509052753448486, "logits/rejected": -2.2193102836608887, "logps/chosen": -131.55270385742188, "logps/rejected": -144.63186645507812, "loss": 130804.4625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019251421093940735, "rewards/margins": 0.008828094229102135, "rewards/rejected": -0.02807951346039772, "step": 860 }, { "epoch": 0.3520841764467827, "grad_norm": 4837603.177346373, "learning_rate": 3.6009896536212327e-07, "logits/chosen": -2.433258056640625, "logits/rejected": -2.404008150100708, "logps/chosen": -135.0418243408203, "logps/rejected": -134.2267303466797, "loss": 122885.925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.017599385231733322, "rewards/margins": 0.0028229092713445425, "rewards/rejected": -0.02042229473590851, "step": 870 }, { "epoch": 0.35613112100364225, "grad_norm": 5692190.523993364, "learning_rate": 3.5784975258659466e-07, "logits/chosen": -2.372664213180542, "logits/rejected": -2.4038546085357666, "logps/chosen": -145.62425231933594, "logps/rejected": -161.4815216064453, "loss": 125917.475, "rewards/accuracies": 0.625, "rewards/chosen": -0.021967049688100815, "rewards/margins": 0.006153530441224575, "rewards/rejected": -0.028120581060647964, "step": 880 }, { "epoch": 0.3601780655605018, "grad_norm": 5143465.092976311, "learning_rate": 3.5560053981106616e-07, "logits/chosen": -2.4298062324523926, "logits/rejected": -2.441378116607666, "logps/chosen": -114.91922760009766, "logps/rejected": -128.6441192626953, "loss": 125689.425, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.02063891664147377, "rewards/margins": 0.0056837559677660465, "rewards/rejected": -0.02632267400622368, "step": 890 }, { "epoch": 0.3642250101173614, "grad_norm": 7348686.998660731, "learning_rate": 3.5335132703553755e-07, "logits/chosen": -2.3470609188079834, "logits/rejected": -2.338306427001953, "logps/chosen": -142.22169494628906, "logps/rejected": -155.0847625732422, "loss": 127013.675, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.021488003432750702, "rewards/margins": 0.00833697896450758, "rewards/rejected": -0.029824981465935707, "step": 900 }, { "epoch": 0.36827195467422097, "grad_norm": 5455491.748505866, "learning_rate": 3.5110211426000895e-07, "logits/chosen": -2.328141689300537, "logits/rejected": -2.300947666168213, "logps/chosen": -144.3502655029297, "logps/rejected": -160.5428466796875, "loss": 132699.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.019702186807990074, "rewards/margins": 0.012580236420035362, "rewards/rejected": -0.032282426953315735, "step": 910 }, { "epoch": 0.3723188992310805, "grad_norm": 5918454.321642784, "learning_rate": 3.4885290148448044e-07, "logits/chosen": -2.2618203163146973, "logits/rejected": -2.273591995239258, "logps/chosen": -140.4850616455078, "logps/rejected": -144.3483123779297, "loss": 126713.925, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02276991680264473, "rewards/margins": 0.008334951475262642, "rewards/rejected": -0.03110486827790737, "step": 920 }, { "epoch": 0.3763658437879401, "grad_norm": 7045240.388394526, "learning_rate": 3.4660368870895184e-07, "logits/chosen": -2.3371381759643555, "logits/rejected": -2.3148632049560547, "logps/chosen": -141.95742797851562, "logps/rejected": -160.81312561035156, "loss": 124856.1375, "rewards/accuracies": 0.75, "rewards/chosen": -0.02805008552968502, "rewards/margins": 0.014153921976685524, "rewards/rejected": -0.042204007506370544, "step": 930 }, { "epoch": 0.3804127883447997, "grad_norm": 5526632.5094240755, "learning_rate": 3.443544759334233e-07, "logits/chosen": -2.3289644718170166, "logits/rejected": -2.3078227043151855, "logps/chosen": -151.3744354248047, "logps/rejected": -153.30511474609375, "loss": 126556.475, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02584829553961754, "rewards/margins": 0.005777581594884396, "rewards/rejected": -0.03162587806582451, "step": 940 }, { "epoch": 0.38445973290165925, "grad_norm": 6075148.892704811, "learning_rate": 3.4210526315789473e-07, "logits/chosen": -2.2021899223327637, "logits/rejected": -2.199693441390991, "logps/chosen": -126.38993072509766, "logps/rejected": -135.55516052246094, "loss": 130061.4375, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02466515824198723, "rewards/margins": 0.0069196284748613834, "rewards/rejected": -0.03158479183912277, "step": 950 }, { "epoch": 0.3885066774585188, "grad_norm": 5994722.402682892, "learning_rate": 3.398560503823661e-07, "logits/chosen": -2.375749349594116, "logits/rejected": -2.350696086883545, "logps/chosen": -135.23800659179688, "logps/rejected": -143.55755615234375, "loss": 130424.2625, "rewards/accuracies": 0.625, "rewards/chosen": -0.020815346390008926, "rewards/margins": 0.009448934346437454, "rewards/rejected": -0.03026428259909153, "step": 960 }, { "epoch": 0.3925536220153784, "grad_norm": 6955347.020210707, "learning_rate": 3.376068376068376e-07, "logits/chosen": -2.41917085647583, "logits/rejected": -2.3552684783935547, "logps/chosen": -133.61973571777344, "logps/rejected": -150.07192993164062, "loss": 126116.675, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.023608971387147903, "rewards/margins": 0.013977563008666039, "rewards/rejected": -0.03758653253316879, "step": 970 }, { "epoch": 0.39660056657223797, "grad_norm": 7026469.492711891, "learning_rate": 3.35357624831309e-07, "logits/chosen": -2.472712993621826, "logits/rejected": -2.4403810501098633, "logps/chosen": -144.36697387695312, "logps/rejected": -160.89016723632812, "loss": 125593.175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0269068144261837, "rewards/margins": 0.013755050487816334, "rewards/rejected": -0.04066186398267746, "step": 980 }, { "epoch": 0.4006475111290975, "grad_norm": 5279874.300128242, "learning_rate": 3.3310841205578046e-07, "logits/chosen": -2.3706583976745605, "logits/rejected": -2.362694263458252, "logps/chosen": -130.1677703857422, "logps/rejected": -150.29214477539062, "loss": 122425.7125, "rewards/accuracies": 0.625, "rewards/chosen": -0.02780333161354065, "rewards/margins": 0.010212745517492294, "rewards/rejected": -0.038016077131032944, "step": 990 }, { "epoch": 0.4046944556859571, "grad_norm": 7346421.9033947745, "learning_rate": 3.308591992802519e-07, "logits/chosen": -2.3910489082336426, "logits/rejected": -2.360917091369629, "logps/chosen": -134.7192840576172, "logps/rejected": -145.80410766601562, "loss": 120740.6875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02175028808414936, "rewards/margins": 0.012434590607881546, "rewards/rejected": -0.03418487682938576, "step": 1000 }, { "epoch": 0.4087414002428167, "grad_norm": 6266220.786790677, "learning_rate": 3.286099865047233e-07, "logits/chosen": -2.258577585220337, "logits/rejected": -2.278409957885742, "logps/chosen": -134.9305877685547, "logps/rejected": -154.0025177001953, "loss": 127529.4875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.020327283069491386, "rewards/margins": 0.010549711063504219, "rewards/rejected": -0.030876994132995605, "step": 1010 }, { "epoch": 0.41278834479967624, "grad_norm": 6569607.008702802, "learning_rate": 3.263607737291948e-07, "logits/chosen": -2.2700555324554443, "logits/rejected": -2.2413737773895264, "logps/chosen": -145.36404418945312, "logps/rejected": -159.6646270751953, "loss": 129882.5125, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.020192014053463936, "rewards/margins": 0.008731147274374962, "rewards/rejected": -0.0289231576025486, "step": 1020 }, { "epoch": 0.4168352893565358, "grad_norm": 6604946.805112461, "learning_rate": 3.241115609536662e-07, "logits/chosen": -2.2907137870788574, "logits/rejected": -2.2590463161468506, "logps/chosen": -148.82757568359375, "logps/rejected": -158.55458068847656, "loss": 123561.0125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02791331335902214, "rewards/margins": 0.01067260093986988, "rewards/rejected": -0.03858591616153717, "step": 1030 }, { "epoch": 0.4208822339133954, "grad_norm": 7819744.1215137215, "learning_rate": 3.2186234817813764e-07, "logits/chosen": -2.3501906394958496, "logits/rejected": -2.382286548614502, "logps/chosen": -145.73556518554688, "logps/rejected": -145.95733642578125, "loss": 125984.175, "rewards/accuracies": 0.625, "rewards/chosen": -0.024780739098787308, "rewards/margins": 0.01095888763666153, "rewards/rejected": -0.03573962673544884, "step": 1040 }, { "epoch": 0.42492917847025496, "grad_norm": 8329276.874833419, "learning_rate": 3.196131354026091e-07, "logits/chosen": -2.3430895805358887, "logits/rejected": -2.2940633296966553, "logps/chosen": -156.47262573242188, "logps/rejected": -172.1248321533203, "loss": 127542.5875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.026906628161668777, "rewards/margins": 0.02299944870173931, "rewards/rejected": -0.04990607872605324, "step": 1050 }, { "epoch": 0.4289761230271145, "grad_norm": 5411023.24022027, "learning_rate": 3.1736392262708053e-07, "logits/chosen": -2.350010633468628, "logits/rejected": -2.348132610321045, "logps/chosen": -134.76171875, "logps/rejected": -165.0662384033203, "loss": 124288.825, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.026082511991262436, "rewards/margins": 0.02408730424940586, "rewards/rejected": -0.05016981437802315, "step": 1060 }, { "epoch": 0.4330230675839741, "grad_norm": 6438956.528553201, "learning_rate": 3.151147098515519e-07, "logits/chosen": -2.403751850128174, "logits/rejected": -2.391162395477295, "logps/chosen": -133.4490966796875, "logps/rejected": -145.66363525390625, "loss": 122699.675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023318186402320862, "rewards/margins": 0.010946491733193398, "rewards/rejected": -0.03426467627286911, "step": 1070 }, { "epoch": 0.4370700121408337, "grad_norm": 5922234.372544115, "learning_rate": 3.1286549707602337e-07, "logits/chosen": -2.2476916313171387, "logits/rejected": -2.2207980155944824, "logps/chosen": -142.42433166503906, "logps/rejected": -152.08865356445312, "loss": 123837.95, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.028287163004279137, "rewards/margins": 0.015818050131201744, "rewards/rejected": -0.04410521313548088, "step": 1080 }, { "epoch": 0.44111695669769324, "grad_norm": 5883039.362660401, "learning_rate": 3.106162843004948e-07, "logits/chosen": -2.3883793354034424, "logits/rejected": -2.3448328971862793, "logps/chosen": -135.72998046875, "logps/rejected": -153.7415008544922, "loss": 124484.025, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.026858652010560036, "rewards/margins": 0.02032136358320713, "rewards/rejected": -0.04718000814318657, "step": 1090 }, { "epoch": 0.4451639012545528, "grad_norm": 7158679.357876272, "learning_rate": 3.0836707152496626e-07, "logits/chosen": -2.3638851642608643, "logits/rejected": -2.3225362300872803, "logps/chosen": -145.95896911621094, "logps/rejected": -169.71176147460938, "loss": 130674.3625, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.025300731882452965, "rewards/margins": 0.016643613576889038, "rewards/rejected": -0.04194434732198715, "step": 1100 }, { "epoch": 0.4492108458114124, "grad_norm": 7065276.414180166, "learning_rate": 3.061178587494377e-07, "logits/chosen": -2.3456435203552246, "logits/rejected": -2.3152847290039062, "logps/chosen": -126.73854064941406, "logps/rejected": -143.69662475585938, "loss": 127769.775, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01884400099515915, "rewards/margins": 0.015868009999394417, "rewards/rejected": -0.03471200913190842, "step": 1110 }, { "epoch": 0.45325779036827196, "grad_norm": 8872861.336008936, "learning_rate": 3.038686459739091e-07, "logits/chosen": -2.3893070220947266, "logits/rejected": -2.379615068435669, "logps/chosen": -135.2264404296875, "logps/rejected": -147.5668487548828, "loss": 121978.65, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025070184841752052, "rewards/margins": 0.01242685504257679, "rewards/rejected": -0.037497036159038544, "step": 1120 }, { "epoch": 0.4573047349251315, "grad_norm": 4362461.84620477, "learning_rate": 3.0161943319838055e-07, "logits/chosen": -2.3373289108276367, "logits/rejected": -2.3283610343933105, "logps/chosen": -113.62736511230469, "logps/rejected": -132.5222930908203, "loss": 122763.6, "rewards/accuracies": 0.625, "rewards/chosen": -0.027866637334227562, "rewards/margins": 0.010607337579131126, "rewards/rejected": -0.03847397491335869, "step": 1130 }, { "epoch": 0.4613516794819911, "grad_norm": 6441902.5854437305, "learning_rate": 2.99370220422852e-07, "logits/chosen": -2.4195449352264404, "logits/rejected": -2.421095848083496, "logps/chosen": -138.25782775878906, "logps/rejected": -152.2244110107422, "loss": 128506.9875, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.01849151961505413, "rewards/margins": 0.007775165140628815, "rewards/rejected": -0.026266688480973244, "step": 1140 }, { "epoch": 0.4653986240388507, "grad_norm": 7047614.500405596, "learning_rate": 2.971210076473234e-07, "logits/chosen": -2.4957115650177, "logits/rejected": -2.4509148597717285, "logps/chosen": -137.063720703125, "logps/rejected": -144.99172973632812, "loss": 121503.125, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.018852662295103073, "rewards/margins": 0.013833269476890564, "rewards/rejected": -0.03268593177199364, "step": 1150 }, { "epoch": 0.46944556859571024, "grad_norm": 6473613.10465131, "learning_rate": 2.948717948717949e-07, "logits/chosen": -2.508885145187378, "logits/rejected": -2.4511702060699463, "logps/chosen": -144.10775756835938, "logps/rejected": -154.82540893554688, "loss": 129245.7125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.025161290541291237, "rewards/margins": 0.008113402873277664, "rewards/rejected": -0.03327469527721405, "step": 1160 }, { "epoch": 0.4734925131525698, "grad_norm": 6225189.741747939, "learning_rate": 2.926225820962663e-07, "logits/chosen": -2.5162465572357178, "logits/rejected": -2.526261568069458, "logps/chosen": -134.27059936523438, "logps/rejected": -153.3767852783203, "loss": 129228.5, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.020399171859025955, "rewards/margins": 0.01304579060524702, "rewards/rejected": -0.03344495967030525, "step": 1170 }, { "epoch": 0.4775394577094294, "grad_norm": 6686211.632899741, "learning_rate": 2.903733693207377e-07, "logits/chosen": -2.500845432281494, "logits/rejected": -2.4776079654693604, "logps/chosen": -139.28172302246094, "logps/rejected": -162.630126953125, "loss": 127296.0, "rewards/accuracies": 0.6875, "rewards/chosen": -0.021778758615255356, "rewards/margins": 0.016411086544394493, "rewards/rejected": -0.038189847022295, "step": 1180 }, { "epoch": 0.48158640226628896, "grad_norm": 7327493.262833852, "learning_rate": 2.8812415654520917e-07, "logits/chosen": -2.5000369548797607, "logits/rejected": -2.4850993156433105, "logps/chosen": -133.94406127929688, "logps/rejected": -149.57473754882812, "loss": 130669.6, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.02554786205291748, "rewards/margins": 0.0159430094063282, "rewards/rejected": -0.04149087145924568, "step": 1190 }, { "epoch": 0.4856333468231485, "grad_norm": 8827001.924529044, "learning_rate": 2.8587494376968056e-07, "logits/chosen": -2.411595582962036, "logits/rejected": -2.4169204235076904, "logps/chosen": -130.80325317382812, "logps/rejected": -142.50013732910156, "loss": 121415.625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.019139764830470085, "rewards/margins": 0.018100781366229057, "rewards/rejected": -0.03724054619669914, "step": 1200 }, { "epoch": 0.48968029138000807, "grad_norm": 5557571.235890019, "learning_rate": 2.8362573099415206e-07, "logits/chosen": -2.538846015930176, "logits/rejected": -2.502953052520752, "logps/chosen": -134.97543334960938, "logps/rejected": -142.17556762695312, "loss": 118867.85, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.013044399209320545, "rewards/margins": 0.011013238690793514, "rewards/rejected": -0.02405763790011406, "step": 1210 }, { "epoch": 0.4937272359368677, "grad_norm": 6914411.411931411, "learning_rate": 2.8137651821862346e-07, "logits/chosen": -2.389519691467285, "logits/rejected": -2.3551812171936035, "logps/chosen": -138.05276489257812, "logps/rejected": -160.35946655273438, "loss": 127666.15, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.025238817557692528, "rewards/margins": 0.02308265119791031, "rewards/rejected": -0.048321474343538284, "step": 1220 }, { "epoch": 0.49777418049372724, "grad_norm": 5609340.931669485, "learning_rate": 2.7912730544309496e-07, "logits/chosen": -2.4456872940063477, "logits/rejected": -2.410588026046753, "logps/chosen": -152.31521606445312, "logps/rejected": -166.41482543945312, "loss": 126962.65, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029917621985077858, "rewards/margins": 0.013386559672653675, "rewards/rejected": -0.04330417513847351, "step": 1230 }, { "epoch": 0.5018211250505868, "grad_norm": 5540227.088123434, "learning_rate": 2.7687809266756635e-07, "logits/chosen": -2.3677735328674316, "logits/rejected": -2.354952335357666, "logps/chosen": -126.54608154296875, "logps/rejected": -145.7926483154297, "loss": 127253.85, "rewards/accuracies": 0.6875, "rewards/chosen": -0.018986444920301437, "rewards/margins": 0.016626928001642227, "rewards/rejected": -0.035613369196653366, "step": 1240 }, { "epoch": 0.5058680696074463, "grad_norm": 8562833.744693786, "learning_rate": 2.7462887989203774e-07, "logits/chosen": -2.344916820526123, "logits/rejected": -2.312051296234131, "logps/chosen": -138.89639282226562, "logps/rejected": -144.3848876953125, "loss": 134452.375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02468470111489296, "rewards/margins": 0.008161008358001709, "rewards/rejected": -0.03284571319818497, "step": 1250 }, { "epoch": 0.509915014164306, "grad_norm": 5502511.320429619, "learning_rate": 2.7237966711650924e-07, "logits/chosen": -2.283324718475342, "logits/rejected": -2.2585034370422363, "logps/chosen": -142.3020782470703, "logps/rejected": -157.03909301757812, "loss": 126080.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02461249753832817, "rewards/margins": 0.02032613940536976, "rewards/rejected": -0.04493863508105278, "step": 1260 }, { "epoch": 0.5139619587211656, "grad_norm": 8682050.76660753, "learning_rate": 2.7013045434098063e-07, "logits/chosen": -2.275059223175049, "logits/rejected": -2.2415084838867188, "logps/chosen": -140.72640991210938, "logps/rejected": -159.70445251464844, "loss": 128182.575, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.033078595995903015, "rewards/margins": 0.01954609341919422, "rewards/rejected": -0.052624695003032684, "step": 1270 }, { "epoch": 0.5180089032780251, "grad_norm": 9710402.98065158, "learning_rate": 2.678812415654521e-07, "logits/chosen": -2.2827441692352295, "logits/rejected": -2.2365641593933105, "logps/chosen": -159.8201446533203, "logps/rejected": -167.04348754882812, "loss": 126161.6, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0369146391749382, "rewards/margins": 0.009988631121814251, "rewards/rejected": -0.04690327122807503, "step": 1280 }, { "epoch": 0.5220558478348847, "grad_norm": 5687965.997013683, "learning_rate": 2.656320287899235e-07, "logits/chosen": -2.440713405609131, "logits/rejected": -2.420994281768799, "logps/chosen": -141.1729278564453, "logps/rejected": -146.4890899658203, "loss": 120775.6875, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02869614027440548, "rewards/margins": 0.01065666601061821, "rewards/rejected": -0.03935280814766884, "step": 1290 }, { "epoch": 0.5261027923917442, "grad_norm": 6948542.641971354, "learning_rate": 2.633828160143949e-07, "logits/chosen": -2.462017774581909, "logits/rejected": -2.479309558868408, "logps/chosen": -148.1350860595703, "logps/rejected": -158.99916076660156, "loss": 127668.2, "rewards/accuracies": 0.625, "rewards/chosen": -0.020126869902014732, "rewards/margins": 0.011567593552172184, "rewards/rejected": -0.03169446438550949, "step": 1300 }, { "epoch": 0.5301497369486038, "grad_norm": 8950839.210890554, "learning_rate": 2.611336032388664e-07, "logits/chosen": -2.379216432571411, "logits/rejected": -2.349857807159424, "logps/chosen": -158.7649688720703, "logps/rejected": -158.4715118408203, "loss": 133855.0375, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.030759122222661972, "rewards/margins": 0.004436601884663105, "rewards/rejected": -0.0351957306265831, "step": 1310 }, { "epoch": 0.5341966815054634, "grad_norm": 5080965.438845941, "learning_rate": 2.588843904633378e-07, "logits/chosen": -2.4408226013183594, "logits/rejected": -2.4230995178222656, "logps/chosen": -122.5213394165039, "logps/rejected": -136.46041870117188, "loss": 125633.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.022939473390579224, "rewards/margins": 0.01324677187949419, "rewards/rejected": -0.03618624433875084, "step": 1320 }, { "epoch": 0.5382436260623229, "grad_norm": 5182504.597846507, "learning_rate": 2.5663517768780926e-07, "logits/chosen": -2.505174160003662, "logits/rejected": -2.483182191848755, "logps/chosen": -142.41513061523438, "logps/rejected": -153.15907287597656, "loss": 123496.5125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027721602469682693, "rewards/margins": 0.015740955248475075, "rewards/rejected": -0.04346255585551262, "step": 1330 }, { "epoch": 0.5422905706191825, "grad_norm": 7128845.279886402, "learning_rate": 2.543859649122807e-07, "logits/chosen": -2.478231430053711, "logits/rejected": -2.448133945465088, "logps/chosen": -135.66380310058594, "logps/rejected": -153.63766479492188, "loss": 126797.3625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.02139298990368843, "rewards/margins": 0.010669348761439323, "rewards/rejected": -0.0320623405277729, "step": 1340 }, { "epoch": 0.5463375151760421, "grad_norm": 6676875.1940184785, "learning_rate": 2.521367521367521e-07, "logits/chosen": -2.4742424488067627, "logits/rejected": -2.4604861736297607, "logps/chosen": -117.7722396850586, "logps/rejected": -130.4296875, "loss": 125562.3, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01471949927508831, "rewards/margins": 0.010869570076465607, "rewards/rejected": -0.025589067488908768, "step": 1350 }, { "epoch": 0.5503844597329016, "grad_norm": 5238344.574016525, "learning_rate": 2.4988753936122354e-07, "logits/chosen": -2.404810905456543, "logits/rejected": -2.386918544769287, "logps/chosen": -129.4068603515625, "logps/rejected": -142.20303344726562, "loss": 121443.45, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.014744667336344719, "rewards/margins": 0.01344493217766285, "rewards/rejected": -0.02818959951400757, "step": 1360 }, { "epoch": 0.5544314042897612, "grad_norm": 6640943.9300566595, "learning_rate": 2.47638326585695e-07, "logits/chosen": -2.3393630981445312, "logits/rejected": -2.3323869705200195, "logps/chosen": -132.86630249023438, "logps/rejected": -144.26113891601562, "loss": 127953.15, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.026765987277030945, "rewards/margins": 0.01323648076504469, "rewards/rejected": -0.04000247269868851, "step": 1370 }, { "epoch": 0.5584783488466208, "grad_norm": 11500911.034452418, "learning_rate": 2.4538911381016643e-07, "logits/chosen": -2.2898106575012207, "logits/rejected": -2.346161127090454, "logps/chosen": -147.8282928466797, "logps/rejected": -163.23367309570312, "loss": 116464.0875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.018574833869934082, "rewards/margins": 0.024278491735458374, "rewards/rejected": -0.042853325605392456, "step": 1380 }, { "epoch": 0.5625252934034803, "grad_norm": 5617383.1061038, "learning_rate": 2.431399010346379e-07, "logits/chosen": -2.4383928775787354, "logits/rejected": -2.447169780731201, "logps/chosen": -125.36312103271484, "logps/rejected": -135.82142639160156, "loss": 127916.95, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018465066328644753, "rewards/margins": 0.007541149854660034, "rewards/rejected": -0.026006218045949936, "step": 1390 }, { "epoch": 0.56657223796034, "grad_norm": 7455055.133065385, "learning_rate": 2.408906882591093e-07, "logits/chosen": -2.351076126098633, "logits/rejected": -2.386265993118286, "logps/chosen": -138.0536346435547, "logps/rejected": -153.21102905273438, "loss": 126554.1, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02320500835776329, "rewards/margins": 0.008944050408899784, "rewards/rejected": -0.0321490578353405, "step": 1400 }, { "epoch": 0.5706191825171996, "grad_norm": 5233092.811472115, "learning_rate": 2.386414754835807e-07, "logits/chosen": -2.378037929534912, "logits/rejected": -2.3643617630004883, "logps/chosen": -160.79556274414062, "logps/rejected": -167.969970703125, "loss": 121087.4625, "rewards/accuracies": 0.625, "rewards/chosen": -0.021083252504467964, "rewards/margins": 0.008589145727455616, "rewards/rejected": -0.029672399163246155, "step": 1410 }, { "epoch": 0.5746661270740591, "grad_norm": 8648367.034730982, "learning_rate": 2.363922627080522e-07, "logits/chosen": -2.4605114459991455, "logits/rejected": -2.432900905609131, "logps/chosen": -145.24966430664062, "logps/rejected": -169.27865600585938, "loss": 127293.625, "rewards/accuracies": 0.75, "rewards/chosen": -0.028744569048285484, "rewards/margins": 0.01984976790845394, "rewards/rejected": -0.04859434440732002, "step": 1420 }, { "epoch": 0.5787130716309187, "grad_norm": 6917627.189571037, "learning_rate": 2.3414304993252359e-07, "logits/chosen": -2.415008783340454, "logits/rejected": -2.390291213989258, "logps/chosen": -118.60847473144531, "logps/rejected": -137.45223999023438, "loss": 126428.6625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021594971418380737, "rewards/margins": 0.010223200544714928, "rewards/rejected": -0.031818170100450516, "step": 1430 }, { "epoch": 0.5827600161877782, "grad_norm": 7112073.222735109, "learning_rate": 2.3189383715699503e-07, "logits/chosen": -2.361323595046997, "logits/rejected": -2.359731674194336, "logps/chosen": -136.58473205566406, "logps/rejected": -162.69363403320312, "loss": 126602.9125, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02461722306907177, "rewards/margins": 0.010918731801211834, "rewards/rejected": -0.03553595766425133, "step": 1440 }, { "epoch": 0.5868069607446378, "grad_norm": 5694863.468784067, "learning_rate": 2.2964462438146648e-07, "logits/chosen": -2.4715747833251953, "logits/rejected": -2.4460928440093994, "logps/chosen": -139.74227905273438, "logps/rejected": -143.89366149902344, "loss": 124850.7625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.019559044390916824, "rewards/margins": 0.007027704268693924, "rewards/rejected": -0.0265867467969656, "step": 1450 }, { "epoch": 0.5908539053014974, "grad_norm": 6596951.588588771, "learning_rate": 2.2739541160593792e-07, "logits/chosen": -2.405214786529541, "logits/rejected": -2.376192569732666, "logps/chosen": -132.67037963867188, "logps/rejected": -152.36544799804688, "loss": 129629.9375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.019801389425992966, "rewards/margins": 0.02020254358649254, "rewards/rejected": -0.040003933012485504, "step": 1460 }, { "epoch": 0.5949008498583569, "grad_norm": 8604502.436566744, "learning_rate": 2.2514619883040934e-07, "logits/chosen": -2.4290502071380615, "logits/rejected": -2.4105029106140137, "logps/chosen": -138.2572784423828, "logps/rejected": -157.66226196289062, "loss": 126705.45, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.023596590384840965, "rewards/margins": 0.01850738190114498, "rewards/rejected": -0.042103976011276245, "step": 1470 }, { "epoch": 0.5989477944152165, "grad_norm": 6476200.112160567, "learning_rate": 2.2289698605488076e-07, "logits/chosen": -2.405041217803955, "logits/rejected": -2.343621253967285, "logps/chosen": -135.63980102539062, "logps/rejected": -157.7008056640625, "loss": 124231.4625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.025133201852440834, "rewards/margins": 0.02330349013209343, "rewards/rejected": -0.048436690121889114, "step": 1480 }, { "epoch": 0.6029947389720761, "grad_norm": 6131917.113665815, "learning_rate": 2.206477732793522e-07, "logits/chosen": -2.413145065307617, "logits/rejected": -2.4113070964813232, "logps/chosen": -131.548583984375, "logps/rejected": -143.9263153076172, "loss": 123647.9125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024967512115836143, "rewards/margins": 0.014087630435824394, "rewards/rejected": -0.03905514255166054, "step": 1490 }, { "epoch": 0.6070416835289356, "grad_norm": 6175564.2334703235, "learning_rate": 2.1839856050382366e-07, "logits/chosen": -2.4009850025177, "logits/rejected": -2.3908042907714844, "logps/chosen": -138.0840606689453, "logps/rejected": -150.7329864501953, "loss": 128591.4875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02667851746082306, "rewards/margins": 0.0038915693294256926, "rewards/rejected": -0.03057008981704712, "step": 1500 }, { "epoch": 0.6110886280857952, "grad_norm": 7212353.914117165, "learning_rate": 2.161493477282951e-07, "logits/chosen": -2.405449628829956, "logits/rejected": -2.3872292041778564, "logps/chosen": -123.17295837402344, "logps/rejected": -143.22288513183594, "loss": 130148.7, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.01992269791662693, "rewards/margins": 0.015165319666266441, "rewards/rejected": -0.03508801758289337, "step": 1510 }, { "epoch": 0.6151355726426548, "grad_norm": 9046114.113444956, "learning_rate": 2.1390013495276652e-07, "logits/chosen": -2.405348777770996, "logits/rejected": -2.4298527240753174, "logps/chosen": -147.79513549804688, "logps/rejected": -168.75872802734375, "loss": 127527.1, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.026391273364424706, "rewards/margins": 0.009268445894122124, "rewards/rejected": -0.03565971553325653, "step": 1520 }, { "epoch": 0.6191825171995143, "grad_norm": 6130066.161443972, "learning_rate": 2.1165092217723797e-07, "logits/chosen": -2.3494181632995605, "logits/rejected": -2.3200573921203613, "logps/chosen": -131.47329711914062, "logps/rejected": -151.9276885986328, "loss": 121959.575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02297963574528694, "rewards/margins": 0.016111956909298897, "rewards/rejected": -0.03909159451723099, "step": 1530 }, { "epoch": 0.623229461756374, "grad_norm": 6650864.453901279, "learning_rate": 2.0940170940170939e-07, "logits/chosen": -2.3809990882873535, "logits/rejected": -2.3723671436309814, "logps/chosen": -156.38284301757812, "logps/rejected": -171.9077606201172, "loss": 122674.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.02390839345753193, "rewards/margins": 0.019179565832018852, "rewards/rejected": -0.04308795928955078, "step": 1540 }, { "epoch": 0.6272764063132336, "grad_norm": 5183894.402422156, "learning_rate": 2.0715249662618083e-07, "logits/chosen": -2.4737048149108887, "logits/rejected": -2.446381092071533, "logps/chosen": -145.76119995117188, "logps/rejected": -165.5288848876953, "loss": 125087.2875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.02032746747136116, "rewards/margins": 0.009251989424228668, "rewards/rejected": -0.029579460620880127, "step": 1550 }, { "epoch": 0.6313233508700931, "grad_norm": 6969624.578927646, "learning_rate": 2.0490328385065225e-07, "logits/chosen": -2.405435800552368, "logits/rejected": -2.4140048027038574, "logps/chosen": -119.48077392578125, "logps/rejected": -130.77532958984375, "loss": 125878.4625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.018460571765899658, "rewards/margins": 0.010110612958669662, "rewards/rejected": -0.02857118286192417, "step": 1560 }, { "epoch": 0.6353702954269527, "grad_norm": 5800628.746003852, "learning_rate": 2.026540710751237e-07, "logits/chosen": -2.366516351699829, "logits/rejected": -2.3698983192443848, "logps/chosen": -147.12881469726562, "logps/rejected": -146.3048553466797, "loss": 129275.3375, "rewards/accuracies": 0.625, "rewards/chosen": -0.01768730953335762, "rewards/margins": 0.011509931646287441, "rewards/rejected": -0.029197242110967636, "step": 1570 }, { "epoch": 0.6394172399838122, "grad_norm": 14918875.434130527, "learning_rate": 2.0040485829959514e-07, "logits/chosen": -2.4734253883361816, "logits/rejected": -2.4595344066619873, "logps/chosen": -125.6633529663086, "logps/rejected": -142.96157836914062, "loss": 123910.425, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.018165288493037224, "rewards/margins": 0.015782013535499573, "rewards/rejected": -0.033947303891181946, "step": 1580 }, { "epoch": 0.6434641845406718, "grad_norm": 5025901.152056148, "learning_rate": 1.981556455240666e-07, "logits/chosen": -2.4370574951171875, "logits/rejected": -2.4227161407470703, "logps/chosen": -141.5856475830078, "logps/rejected": -164.05941772460938, "loss": 128733.9125, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021845245733857155, "rewards/margins": 0.011800579726696014, "rewards/rejected": -0.03364582732319832, "step": 1590 }, { "epoch": 0.6475111290975314, "grad_norm": 5965330.851620259, "learning_rate": 1.9590643274853798e-07, "logits/chosen": -2.398038387298584, "logits/rejected": -2.37715482711792, "logps/chosen": -118.133544921875, "logps/rejected": -130.4759063720703, "loss": 123004.3375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01905783638358116, "rewards/margins": 0.011883154511451721, "rewards/rejected": -0.030940990895032883, "step": 1600 }, { "epoch": 0.6515580736543909, "grad_norm": 6299921.629356138, "learning_rate": 1.9365721997300943e-07, "logits/chosen": -2.3423843383789062, "logits/rejected": -2.2989087104797363, "logps/chosen": -115.65093994140625, "logps/rejected": -138.21340942382812, "loss": 127277.8375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02753433585166931, "rewards/margins": 0.014595555141568184, "rewards/rejected": -0.042129892855882645, "step": 1610 }, { "epoch": 0.6556050182112505, "grad_norm": 7576070.098325265, "learning_rate": 1.9140800719748088e-07, "logits/chosen": -2.320422649383545, "logits/rejected": -2.290821075439453, "logps/chosen": -117.01118469238281, "logps/rejected": -125.79508972167969, "loss": 124367.8875, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.01931951195001602, "rewards/margins": 0.006707245949655771, "rewards/rejected": -0.026026759296655655, "step": 1620 }, { "epoch": 0.6596519627681101, "grad_norm": 6162429.013600917, "learning_rate": 1.8915879442195232e-07, "logits/chosen": -2.334224224090576, "logits/rejected": -2.3541088104248047, "logps/chosen": -136.18832397460938, "logps/rejected": -151.46710205078125, "loss": 122585.7625, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.019429903477430344, "rewards/margins": 0.011553862132132053, "rewards/rejected": -0.03098376654088497, "step": 1630 }, { "epoch": 0.6636989073249696, "grad_norm": 4988734.781634246, "learning_rate": 1.8690958164642374e-07, "logits/chosen": -2.4520297050476074, "logits/rejected": -2.42329478263855, "logps/chosen": -144.2743682861328, "logps/rejected": -155.20095825195312, "loss": 124995.575, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.016223471611738205, "rewards/margins": 0.015287751331925392, "rewards/rejected": -0.031511224806308746, "step": 1640 }, { "epoch": 0.6677458518818292, "grad_norm": 6629567.008457434, "learning_rate": 1.8466036887089516e-07, "logits/chosen": -2.345116376876831, "logits/rejected": -2.348301887512207, "logps/chosen": -129.6711883544922, "logps/rejected": -151.17837524414062, "loss": 122800.6625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.020190779119729996, "rewards/margins": 0.0185395535081625, "rewards/rejected": -0.03873033449053764, "step": 1650 }, { "epoch": 0.6717927964386888, "grad_norm": 5601907.507778279, "learning_rate": 1.824111560953666e-07, "logits/chosen": -2.2657582759857178, "logits/rejected": -2.260693311691284, "logps/chosen": -128.01144409179688, "logps/rejected": -155.50888061523438, "loss": 124625.45, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.027610983699560165, "rewards/margins": 0.020689968019723892, "rewards/rejected": -0.04830095171928406, "step": 1660 }, { "epoch": 0.6758397409955483, "grad_norm": 5845172.102872888, "learning_rate": 1.8016194331983805e-07, "logits/chosen": -2.289998769760132, "logits/rejected": -2.2984931468963623, "logps/chosen": -120.5185775756836, "logps/rejected": -140.23655700683594, "loss": 125251.7625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.029199788346886635, "rewards/margins": 0.014481378719210625, "rewards/rejected": -0.04368116706609726, "step": 1670 }, { "epoch": 0.6798866855524079, "grad_norm": 6195370.996741281, "learning_rate": 1.779127305443095e-07, "logits/chosen": -2.3538708686828613, "logits/rejected": -2.334139347076416, "logps/chosen": -136.63980102539062, "logps/rejected": -144.39056396484375, "loss": 129559.5375, "rewards/accuracies": 0.5, "rewards/chosen": -0.030268553644418716, "rewards/margins": 0.0073168775998055935, "rewards/rejected": -0.03758542984724045, "step": 1680 }, { "epoch": 0.6839336301092676, "grad_norm": 8790702.611604873, "learning_rate": 1.7566351776878092e-07, "logits/chosen": -2.3637337684631348, "logits/rejected": -2.3632633686065674, "logps/chosen": -129.40554809570312, "logps/rejected": -147.02755737304688, "loss": 128803.0875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.023326028138399124, "rewards/margins": 0.011800029315054417, "rewards/rejected": -0.03512606397271156, "step": 1690 }, { "epoch": 0.687980574666127, "grad_norm": 8047500.436527203, "learning_rate": 1.7341430499325237e-07, "logits/chosen": -2.2449162006378174, "logits/rejected": -2.254812717437744, "logps/chosen": -128.66867065429688, "logps/rejected": -134.86846923828125, "loss": 130052.75, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.021912669762969017, "rewards/margins": 0.007869280874729156, "rewards/rejected": -0.029781952500343323, "step": 1700 }, { "epoch": 0.6920275192229867, "grad_norm": 7961693.976507484, "learning_rate": 1.7116509221772378e-07, "logits/chosen": -2.308650255203247, "logits/rejected": -2.325552463531494, "logps/chosen": -118.10685729980469, "logps/rejected": -132.20530700683594, "loss": 125613.825, "rewards/accuracies": 0.625, "rewards/chosen": -0.022883836179971695, "rewards/margins": 0.00968220829963684, "rewards/rejected": -0.032566044479608536, "step": 1710 }, { "epoch": 0.6960744637798462, "grad_norm": 8983241.448290937, "learning_rate": 1.6891587944219523e-07, "logits/chosen": -2.3228538036346436, "logits/rejected": -2.295989990234375, "logps/chosen": -136.067138671875, "logps/rejected": -149.92922973632812, "loss": 124795.0875, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.01805921457707882, "rewards/margins": 0.018675491213798523, "rewards/rejected": -0.03673470392823219, "step": 1720 }, { "epoch": 0.7001214083367058, "grad_norm": 6598233.046729883, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.321946144104004, "logits/rejected": -2.2634482383728027, "logps/chosen": -156.81881713867188, "logps/rejected": -175.48133850097656, "loss": 124281.6625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.023901356384158134, "rewards/margins": 0.020154178142547607, "rewards/rejected": -0.04405553638935089, "step": 1730 }, { "epoch": 0.7041683528935654, "grad_norm": 7393573.408673971, "learning_rate": 1.644174538911381e-07, "logits/chosen": -2.157721996307373, "logits/rejected": -2.1384575366973877, "logps/chosen": -156.61431884765625, "logps/rejected": -169.68267822265625, "loss": 123967.125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.033402346074581146, "rewards/margins": 0.017039528116583824, "rewards/rejected": -0.05044187977910042, "step": 1740 }, { "epoch": 0.7082152974504249, "grad_norm": 8650281.232154809, "learning_rate": 1.6216824111560954e-07, "logits/chosen": -2.3099396228790283, "logits/rejected": -2.314627170562744, "logps/chosen": -140.70175170898438, "logps/rejected": -165.3170928955078, "loss": 125535.2875, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02742253616452217, "rewards/margins": 0.015254299156367779, "rewards/rejected": -0.04267684370279312, "step": 1750 }, { "epoch": 0.7122622420072845, "grad_norm": 6208948.317347317, "learning_rate": 1.5991902834008096e-07, "logits/chosen": -2.3783583641052246, "logits/rejected": -2.362631320953369, "logps/chosen": -148.7383270263672, "logps/rejected": -162.62327575683594, "loss": 121080.075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.014549913816154003, "rewards/margins": 0.013842826709151268, "rewards/rejected": -0.028392743319272995, "step": 1760 }, { "epoch": 0.7163091865641441, "grad_norm": 7739071.113911002, "learning_rate": 1.5766981556455238e-07, "logits/chosen": -2.299868583679199, "logits/rejected": -2.2598750591278076, "logps/chosen": -162.82052612304688, "logps/rejected": -184.3058319091797, "loss": 122385.7125, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.030940508469939232, "rewards/margins": 0.023769445717334747, "rewards/rejected": -0.05470995977520943, "step": 1770 }, { "epoch": 0.7203561311210036, "grad_norm": 6707665.892655141, "learning_rate": 1.5542060278902383e-07, "logits/chosen": -2.3239502906799316, "logits/rejected": -2.3085806369781494, "logps/chosen": -139.06484985351562, "logps/rejected": -157.50460815429688, "loss": 115194.475, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03321670740842819, "rewards/margins": 0.021816464141011238, "rewards/rejected": -0.05503316968679428, "step": 1780 }, { "epoch": 0.7244030756778632, "grad_norm": 7475588.891135754, "learning_rate": 1.5317139001349527e-07, "logits/chosen": -2.380169630050659, "logits/rejected": -2.3587822914123535, "logps/chosen": -134.81069946289062, "logps/rejected": -149.78839111328125, "loss": 135028.0125, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.03387707471847534, "rewards/margins": 0.011253075674176216, "rewards/rejected": -0.04513014853000641, "step": 1790 }, { "epoch": 0.7284500202347228, "grad_norm": 6224311.633435066, "learning_rate": 1.5092217723796672e-07, "logits/chosen": -2.4899191856384277, "logits/rejected": -2.461540460586548, "logps/chosen": -139.72994995117188, "logps/rejected": -154.91757202148438, "loss": 127101.55, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.022551989182829857, "rewards/margins": 0.019368382170796394, "rewards/rejected": -0.04192037135362625, "step": 1800 }, { "epoch": 0.7324969647915823, "grad_norm": 6407363.569135414, "learning_rate": 1.4867296446243814e-07, "logits/chosen": -2.457529067993164, "logits/rejected": -2.4312427043914795, "logps/chosen": -171.8442840576172, "logps/rejected": -170.40664672851562, "loss": 126581.2375, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.02567433752119541, "rewards/margins": 0.008936228230595589, "rewards/rejected": -0.0346105620265007, "step": 1810 }, { "epoch": 0.7365439093484419, "grad_norm": 5335773.687286384, "learning_rate": 1.4642375168690956e-07, "logits/chosen": -2.442826986312866, "logits/rejected": -2.424445867538452, "logps/chosen": -130.82366943359375, "logps/rejected": -150.00717163085938, "loss": 121689.35, "rewards/accuracies": 0.625, "rewards/chosen": -0.020857717841863632, "rewards/margins": 0.011433606036007404, "rewards/rejected": -0.03229131922125816, "step": 1820 }, { "epoch": 0.7405908539053015, "grad_norm": 5919606.114162943, "learning_rate": 1.44174538911381e-07, "logits/chosen": -2.4367711544036865, "logits/rejected": -2.4152512550354004, "logps/chosen": -116.6092758178711, "logps/rejected": -137.42446899414062, "loss": 124829.175, "rewards/accuracies": 0.6875, "rewards/chosen": -0.018279392272233963, "rewards/margins": 0.017404617741703987, "rewards/rejected": -0.0356840081512928, "step": 1830 }, { "epoch": 0.744637798462161, "grad_norm": 4526671.180016859, "learning_rate": 1.4192532613585245e-07, "logits/chosen": -2.3979544639587402, "logits/rejected": -2.3597800731658936, "logps/chosen": -135.9434814453125, "logps/rejected": -138.03778076171875, "loss": 129111.95, "rewards/accuracies": 0.5625, "rewards/chosen": -0.024272512644529343, "rewards/margins": 0.012156413868069649, "rewards/rejected": -0.03642892464995384, "step": 1840 }, { "epoch": 0.7486847430190207, "grad_norm": 7139221.010538934, "learning_rate": 1.396761133603239e-07, "logits/chosen": -2.4428467750549316, "logits/rejected": -2.428190231323242, "logps/chosen": -123.2089614868164, "logps/rejected": -138.09390258789062, "loss": 128958.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.018383827060461044, "rewards/margins": 0.010701683349907398, "rewards/rejected": -0.029085511341691017, "step": 1850 }, { "epoch": 0.7527316875758802, "grad_norm": 7046675.547216455, "learning_rate": 1.3742690058479532e-07, "logits/chosen": -2.4591715335845947, "logits/rejected": -2.426462411880493, "logps/chosen": -133.52520751953125, "logps/rejected": -138.1920623779297, "loss": 130433.475, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.014568162150681019, "rewards/margins": 0.013440297916531563, "rewards/rejected": -0.028008460998535156, "step": 1860 }, { "epoch": 0.7567786321327398, "grad_norm": 6183615.802176011, "learning_rate": 1.3517768780926674e-07, "logits/chosen": -2.4390716552734375, "logits/rejected": -2.3869736194610596, "logps/chosen": -127.33221435546875, "logps/rejected": -149.60716247558594, "loss": 126095.0375, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.020692896097898483, "rewards/margins": 0.015832407400012016, "rewards/rejected": -0.03652530163526535, "step": 1870 }, { "epoch": 0.7608255766895994, "grad_norm": 5139359.676607012, "learning_rate": 1.3292847503373818e-07, "logits/chosen": -2.4137744903564453, "logits/rejected": -2.4111902713775635, "logps/chosen": -138.07791137695312, "logps/rejected": -150.23367309570312, "loss": 122845.4375, "rewards/accuracies": 0.625, "rewards/chosen": -0.015661675482988358, "rewards/margins": 0.011621621437370777, "rewards/rejected": -0.02728329598903656, "step": 1880 }, { "epoch": 0.7648725212464589, "grad_norm": 5436765.081995142, "learning_rate": 1.3067926225820963e-07, "logits/chosen": -2.2953848838806152, "logits/rejected": -2.261265754699707, "logps/chosen": -131.72573852539062, "logps/rejected": -158.7117156982422, "loss": 120437.1625, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.018344033509492874, "rewards/margins": 0.025271952152252197, "rewards/rejected": -0.04361598566174507, "step": 1890 }, { "epoch": 0.7689194658033185, "grad_norm": 7773151.683082246, "learning_rate": 1.2843004948268105e-07, "logits/chosen": -2.229933023452759, "logits/rejected": -2.166466474533081, "logps/chosen": -147.3013153076172, "logps/rejected": -160.14205932617188, "loss": 130466.3875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024005400016903877, "rewards/margins": 0.014198745600879192, "rewards/rejected": -0.038204144686460495, "step": 1900 }, { "epoch": 0.7729664103601781, "grad_norm": 6242141.939931843, "learning_rate": 1.261808367071525e-07, "logits/chosen": -2.2624001502990723, "logits/rejected": -2.229830503463745, "logps/chosen": -138.10633850097656, "logps/rejected": -152.7989044189453, "loss": 127404.4375, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02552894689142704, "rewards/margins": 0.008202909491956234, "rewards/rejected": -0.033731859177351, "step": 1910 }, { "epoch": 0.7770133549170376, "grad_norm": 6920375.30724732, "learning_rate": 1.2393162393162394e-07, "logits/chosen": -2.350060224533081, "logits/rejected": -2.3308169841766357, "logps/chosen": -132.56320190429688, "logps/rejected": -153.30557250976562, "loss": 126830.1, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.029025813564658165, "rewards/margins": 0.019299551844596863, "rewards/rejected": -0.04832536727190018, "step": 1920 }, { "epoch": 0.7810602994738972, "grad_norm": 6671009.085790114, "learning_rate": 1.2168241115609536e-07, "logits/chosen": -2.2904415130615234, "logits/rejected": -2.329463481903076, "logps/chosen": -141.00816345214844, "logps/rejected": -143.0104217529297, "loss": 129713.6875, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.03294295817613602, "rewards/margins": 0.0017857927596196532, "rewards/rejected": -0.034728746861219406, "step": 1930 }, { "epoch": 0.7851072440307568, "grad_norm": 6655872.215382386, "learning_rate": 1.194331983805668e-07, "logits/chosen": -2.3119730949401855, "logits/rejected": -2.2890148162841797, "logps/chosen": -131.12327575683594, "logps/rejected": -148.35281372070312, "loss": 126911.35, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.024319607764482498, "rewards/margins": 0.012184834107756615, "rewards/rejected": -0.036504440009593964, "step": 1940 }, { "epoch": 0.7891541885876163, "grad_norm": 7036617.58409423, "learning_rate": 1.1718398560503823e-07, "logits/chosen": -2.3747105598449707, "logits/rejected": -2.3655359745025635, "logps/chosen": -127.3541259765625, "logps/rejected": -142.77593994140625, "loss": 125537.975, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.017514357343316078, "rewards/margins": 0.013664362952113152, "rewards/rejected": -0.03117872215807438, "step": 1950 }, { "epoch": 0.7932011331444759, "grad_norm": 6573765.081555526, "learning_rate": 1.1493477282950967e-07, "logits/chosen": -2.412942409515381, "logits/rejected": -2.390746593475342, "logps/chosen": -134.2810821533203, "logps/rejected": -158.460693359375, "loss": 123726.75, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.021586496382951736, "rewards/margins": 0.017149869352579117, "rewards/rejected": -0.03873636573553085, "step": 1960 }, { "epoch": 0.7972480777013355, "grad_norm": 7038673.672056439, "learning_rate": 1.1268556005398109e-07, "logits/chosen": -2.370753765106201, "logits/rejected": -2.3683507442474365, "logps/chosen": -124.46659851074219, "logps/rejected": -131.4204559326172, "loss": 126820.75, "rewards/accuracies": 0.625, "rewards/chosen": -0.024847570806741714, "rewards/margins": 0.011085819453001022, "rewards/rejected": -0.03593338653445244, "step": 1970 }, { "epoch": 0.801295022258195, "grad_norm": 6451758.008444387, "learning_rate": 1.1043634727845254e-07, "logits/chosen": -2.325690984725952, "logits/rejected": -2.336920976638794, "logps/chosen": -122.97123718261719, "logps/rejected": -147.48297119140625, "loss": 123985.45, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.016694985330104828, "rewards/margins": 0.016988877207040787, "rewards/rejected": -0.033683862537145615, "step": 1980 }, { "epoch": 0.8053419668150547, "grad_norm": 9482913.906109469, "learning_rate": 1.0818713450292397e-07, "logits/chosen": -2.2602345943450928, "logits/rejected": -2.243213653564453, "logps/chosen": -122.68096923828125, "logps/rejected": -138.34278869628906, "loss": 124861.1125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.028803208842873573, "rewards/margins": 0.015811622142791748, "rewards/rejected": -0.04461482912302017, "step": 1990 }, { "epoch": 0.8093889113719142, "grad_norm": 6927553.6188276345, "learning_rate": 1.059379217273954e-07, "logits/chosen": -2.3502438068389893, "logits/rejected": -2.337284564971924, "logps/chosen": -132.24378967285156, "logps/rejected": -149.02110290527344, "loss": 125569.4625, "rewards/accuracies": 0.625, "rewards/chosen": -0.018480569124221802, "rewards/margins": 0.016136765480041504, "rewards/rejected": -0.034617334604263306, "step": 2000 }, { "epoch": 0.8134358559287738, "grad_norm": 6611093.761936569, "learning_rate": 1.0368870895186684e-07, "logits/chosen": -2.3684864044189453, "logits/rejected": -2.324704170227051, "logps/chosen": -135.81961059570312, "logps/rejected": -160.5324249267578, "loss": 121162.075, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012911828234791756, "rewards/margins": 0.030595939606428146, "rewards/rejected": -0.04350776970386505, "step": 2010 }, { "epoch": 0.8174828004856334, "grad_norm": 6723882.38264995, "learning_rate": 1.0143949617633828e-07, "logits/chosen": -2.2761006355285645, "logits/rejected": -2.2606966495513916, "logps/chosen": -119.88040924072266, "logps/rejected": -145.41346740722656, "loss": 129622.7125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025995198637247086, "rewards/margins": 0.019534587860107422, "rewards/rejected": -0.045529790222644806, "step": 2020 }, { "epoch": 0.8215297450424929, "grad_norm": 8229250.060941711, "learning_rate": 9.919028340080972e-08, "logits/chosen": -2.3351616859436035, "logits/rejected": -2.280089855194092, "logps/chosen": -138.04751586914062, "logps/rejected": -154.4039306640625, "loss": 121636.6375, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02659204974770546, "rewards/margins": 0.022092049941420555, "rewards/rejected": -0.048684097826480865, "step": 2030 }, { "epoch": 0.8255766895993525, "grad_norm": 8360918.973606626, "learning_rate": 9.694107062528115e-08, "logits/chosen": -2.302302837371826, "logits/rejected": -2.3009400367736816, "logps/chosen": -133.8302459716797, "logps/rejected": -153.5994110107422, "loss": 124760.0625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.020709946751594543, "rewards/margins": 0.014815042726695538, "rewards/rejected": -0.03552498668432236, "step": 2040 }, { "epoch": 0.8296236341562121, "grad_norm": 8136804.681969547, "learning_rate": 9.46918578497526e-08, "logits/chosen": -2.325496196746826, "logits/rejected": -2.3160691261291504, "logps/chosen": -133.07785034179688, "logps/rejected": -157.1102294921875, "loss": 122905.6875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.022938497364521027, "rewards/margins": 0.02051146700978279, "rewards/rejected": -0.04344996064901352, "step": 2050 }, { "epoch": 0.8336705787130716, "grad_norm": 5880924.756183454, "learning_rate": 9.244264507422401e-08, "logits/chosen": -2.247741460800171, "logits/rejected": -2.2601161003112793, "logps/chosen": -138.5823974609375, "logps/rejected": -150.9891357421875, "loss": 122247.55, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.021218325942754745, "rewards/margins": 0.013738051056861877, "rewards/rejected": -0.034956373274326324, "step": 2060 }, { "epoch": 0.8377175232699312, "grad_norm": 6479318.093365777, "learning_rate": 9.019343229869546e-08, "logits/chosen": -2.287973403930664, "logits/rejected": -2.27508282661438, "logps/chosen": -148.0543975830078, "logps/rejected": -174.75563049316406, "loss": 122681.5375, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.028557494282722473, "rewards/margins": 0.02148330584168434, "rewards/rejected": -0.050040800124406815, "step": 2070 }, { "epoch": 0.8417644678267908, "grad_norm": 7530654.549282354, "learning_rate": 8.794421952316688e-08, "logits/chosen": -2.3192243576049805, "logits/rejected": -2.301488161087036, "logps/chosen": -140.3570556640625, "logps/rejected": -144.66439819335938, "loss": 127493.5, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.027281736955046654, "rewards/margins": 0.011399330571293831, "rewards/rejected": -0.03868107125163078, "step": 2080 }, { "epoch": 0.8458114123836503, "grad_norm": 5545588.639352677, "learning_rate": 8.569500674763833e-08, "logits/chosen": -2.3623504638671875, "logits/rejected": -2.327298641204834, "logps/chosen": -125.07554626464844, "logps/rejected": -162.51771545410156, "loss": 122307.35, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.023772019892930984, "rewards/margins": 0.024702411144971848, "rewards/rejected": -0.048474427312612534, "step": 2090 }, { "epoch": 0.8498583569405099, "grad_norm": 7677881.561277626, "learning_rate": 8.344579397210976e-08, "logits/chosen": -2.400023937225342, "logits/rejected": -2.398374557495117, "logps/chosen": -143.50096130371094, "logps/rejected": -154.2194061279297, "loss": 126753.125, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03331100195646286, "rewards/margins": 0.007773646619170904, "rewards/rejected": -0.0410846471786499, "step": 2100 }, { "epoch": 0.8539053014973695, "grad_norm": 9639697.081950434, "learning_rate": 8.119658119658119e-08, "logits/chosen": -2.2588629722595215, "logits/rejected": -2.2181789875030518, "logps/chosen": -136.6796417236328, "logps/rejected": -171.16500854492188, "loss": 127603.3375, "rewards/accuracies": 0.75, "rewards/chosen": -0.03126269578933716, "rewards/margins": 0.025771383196115494, "rewards/rejected": -0.057034075260162354, "step": 2110 }, { "epoch": 0.857952246054229, "grad_norm": 5314294.806910229, "learning_rate": 7.894736842105262e-08, "logits/chosen": -2.486797332763672, "logits/rejected": -2.4730780124664307, "logps/chosen": -146.88571166992188, "logps/rejected": -158.010986328125, "loss": 125283.2125, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.020792517811059952, "rewards/margins": 0.017872992902994156, "rewards/rejected": -0.03866551071405411, "step": 2120 }, { "epoch": 0.8619991906110887, "grad_norm": 8035986.499722224, "learning_rate": 7.669815564552407e-08, "logits/chosen": -2.421731472015381, "logits/rejected": -2.425063371658325, "logps/chosen": -116.5892562866211, "logps/rejected": -132.42050170898438, "loss": 125860.925, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.021887045353651047, "rewards/margins": 0.014476152136921883, "rewards/rejected": -0.03636319935321808, "step": 2130 }, { "epoch": 0.8660461351679482, "grad_norm": 6251020.741893531, "learning_rate": 7.444894286999549e-08, "logits/chosen": -2.364879608154297, "logits/rejected": -2.311974048614502, "logps/chosen": -120.81207275390625, "logps/rejected": -145.0894317626953, "loss": 119764.95, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.02619818225502968, "rewards/margins": 0.023076878860592842, "rewards/rejected": -0.04927505925297737, "step": 2140 }, { "epoch": 0.8700930797248078, "grad_norm": 6301369.537300293, "learning_rate": 7.219973009446694e-08, "logits/chosen": -2.379647970199585, "logits/rejected": -2.3476970195770264, "logps/chosen": -134.33572387695312, "logps/rejected": -155.9193115234375, "loss": 118915.75, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.019399352371692657, "rewards/margins": 0.016515102237462997, "rewards/rejected": -0.03591445833444595, "step": 2150 }, { "epoch": 0.8741400242816674, "grad_norm": 6200780.785181898, "learning_rate": 6.995051731893837e-08, "logits/chosen": -2.4005587100982666, "logits/rejected": -2.381075382232666, "logps/chosen": -134.69631958007812, "logps/rejected": -142.3704071044922, "loss": 122057.2, "rewards/accuracies": 0.5625, "rewards/chosen": -0.024357806891202927, "rewards/margins": 0.01147081982344389, "rewards/rejected": -0.03582862392067909, "step": 2160 }, { "epoch": 0.8781869688385269, "grad_norm": 7707200.943905766, "learning_rate": 6.77013045434098e-08, "logits/chosen": -2.1691622734069824, "logits/rejected": -2.1518099308013916, "logps/chosen": -133.11085510253906, "logps/rejected": -150.32522583007812, "loss": 124932.0875, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.02330431155860424, "rewards/margins": 0.0101194242015481, "rewards/rejected": -0.033423732966184616, "step": 2170 }, { "epoch": 0.8822339133953865, "grad_norm": 6068434.174209909, "learning_rate": 6.545209176788123e-08, "logits/chosen": -2.2563586235046387, "logits/rejected": -2.249168872833252, "logps/chosen": -126.6658935546875, "logps/rejected": -150.95640563964844, "loss": 125160.825, "rewards/accuracies": 0.6875, "rewards/chosen": -0.015440529212355614, "rewards/margins": 0.015524588525295258, "rewards/rejected": -0.030965115875005722, "step": 2180 }, { "epoch": 0.8862808579522461, "grad_norm": 7537201.72606691, "learning_rate": 6.320287899235267e-08, "logits/chosen": -2.364108085632324, "logits/rejected": -2.3574013710021973, "logps/chosen": -127.92137145996094, "logps/rejected": -143.64276123046875, "loss": 128988.85, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.015308101661503315, "rewards/margins": 0.012061825022101402, "rewards/rejected": -0.027369925752282143, "step": 2190 }, { "epoch": 0.8903278025091056, "grad_norm": 20144650.289658338, "learning_rate": 6.095366621682411e-08, "logits/chosen": -2.3388938903808594, "logits/rejected": -2.309027910232544, "logps/chosen": -131.75338745117188, "logps/rejected": -147.02613830566406, "loss": 131861.55, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.033888086676597595, "rewards/margins": 0.011972433887422085, "rewards/rejected": -0.045860521495342255, "step": 2200 }, { "epoch": 0.8943747470659652, "grad_norm": 7275705.0028550355, "learning_rate": 5.8704453441295546e-08, "logits/chosen": -2.4196584224700928, "logits/rejected": -2.4083645343780518, "logps/chosen": -137.82431030273438, "logps/rejected": -152.37692260742188, "loss": 125663.2125, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.01876234821975231, "rewards/margins": 0.0189601369202137, "rewards/rejected": -0.03772248700261116, "step": 2210 }, { "epoch": 0.8984216916228248, "grad_norm": 7203116.155779993, "learning_rate": 5.645524066576698e-08, "logits/chosen": -2.4381861686706543, "logits/rejected": -2.403198003768921, "logps/chosen": -131.7117156982422, "logps/rejected": -142.67185974121094, "loss": 123239.9375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.01855655573308468, "rewards/margins": 0.015223322436213493, "rewards/rejected": -0.03377988189458847, "step": 2220 }, { "epoch": 0.9024686361796843, "grad_norm": 5199237.236235915, "learning_rate": 5.420602789023841e-08, "logits/chosen": -2.3613171577453613, "logits/rejected": -2.2851357460021973, "logps/chosen": -153.357177734375, "logps/rejected": -159.00067138671875, "loss": 123367.6125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.024445852264761925, "rewards/margins": 0.012765263207256794, "rewards/rejected": -0.037211112678050995, "step": 2230 }, { "epoch": 0.9065155807365439, "grad_norm": 8322118.880155748, "learning_rate": 5.1956815114709844e-08, "logits/chosen": -2.4463276863098145, "logits/rejected": -2.4448184967041016, "logps/chosen": -166.32937622070312, "logps/rejected": -170.3524932861328, "loss": 127037.6, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.034686215221881866, "rewards/margins": 0.006833164487034082, "rewards/rejected": -0.04151938110589981, "step": 2240 }, { "epoch": 0.9105625252934035, "grad_norm": 7843111.5615214445, "learning_rate": 4.9707602339181284e-08, "logits/chosen": -2.406442165374756, "logits/rejected": -2.3739068508148193, "logps/chosen": -135.46450805664062, "logps/rejected": -151.98318481445312, "loss": 119829.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.027578959241509438, "rewards/margins": 0.019721323624253273, "rewards/rejected": -0.04730028659105301, "step": 2250 }, { "epoch": 0.914609469850263, "grad_norm": 6508134.384511007, "learning_rate": 4.745838956365272e-08, "logits/chosen": -2.374009609222412, "logits/rejected": -2.330867290496826, "logps/chosen": -147.81002807617188, "logps/rejected": -150.0068817138672, "loss": 123565.525, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.030267197638750076, "rewards/margins": 0.008453629910945892, "rewards/rejected": -0.03872082382440567, "step": 2260 }, { "epoch": 0.9186564144071226, "grad_norm": 4876699.168643324, "learning_rate": 4.5209176788124156e-08, "logits/chosen": -2.4444994926452637, "logits/rejected": -2.370756149291992, "logps/chosen": -148.59286499023438, "logps/rejected": -159.95957946777344, "loss": 121402.275, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.027829691767692566, "rewards/margins": 0.01744781993329525, "rewards/rejected": -0.045277513563632965, "step": 2270 }, { "epoch": 0.9227033589639821, "grad_norm": 7528661.562948466, "learning_rate": 4.2959964012595596e-08, "logits/chosen": -2.421567678451538, "logits/rejected": -2.4027442932128906, "logps/chosen": -138.11740112304688, "logps/rejected": -148.74745178222656, "loss": 125506.075, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.025351068004965782, "rewards/margins": 0.01246053259819746, "rewards/rejected": -0.037811603397130966, "step": 2280 }, { "epoch": 0.9267503035208418, "grad_norm": 5869282.00575302, "learning_rate": 4.071075123706703e-08, "logits/chosen": -2.344552993774414, "logits/rejected": -2.309044122695923, "logps/chosen": -135.99095153808594, "logps/rejected": -162.00845336914062, "loss": 119300.3375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.01940598525106907, "rewards/margins": 0.02190936915576458, "rewards/rejected": -0.04131535068154335, "step": 2290 }, { "epoch": 0.9307972480777014, "grad_norm": 6508908.496606901, "learning_rate": 3.846153846153846e-08, "logits/chosen": -2.2711312770843506, "logits/rejected": -2.235738754272461, "logps/chosen": -156.75537109375, "logps/rejected": -162.16207885742188, "loss": 121451.2625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0278861615806818, "rewards/margins": 0.010201343335211277, "rewards/rejected": -0.038087502121925354, "step": 2300 }, { "epoch": 0.9348441926345609, "grad_norm": 8865730.632557675, "learning_rate": 3.6212325686009894e-08, "logits/chosen": -2.324096202850342, "logits/rejected": -2.2920069694519043, "logps/chosen": -117.9946060180664, "logps/rejected": -133.47108459472656, "loss": 122291.0625, "rewards/accuracies": 0.6875, "rewards/chosen": -0.022024232894182205, "rewards/margins": 0.014470313675701618, "rewards/rejected": -0.03649454563856125, "step": 2310 }, { "epoch": 0.9388911371914205, "grad_norm": 9971453.387611723, "learning_rate": 3.3963112910481334e-08, "logits/chosen": -2.409850597381592, "logits/rejected": -2.3329081535339355, "logps/chosen": -146.290283203125, "logps/rejected": -182.57904052734375, "loss": 120819.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.024361763149499893, "rewards/margins": 0.03630813583731651, "rewards/rejected": -0.060669898986816406, "step": 2320 }, { "epoch": 0.9429380817482801, "grad_norm": 6070350.046980263, "learning_rate": 3.1713900134952766e-08, "logits/chosen": -2.34000301361084, "logits/rejected": -2.3207154273986816, "logps/chosen": -135.71852111816406, "logps/rejected": -158.15370178222656, "loss": 127869.825, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.026078131049871445, "rewards/margins": 0.018510058522224426, "rewards/rejected": -0.04458818584680557, "step": 2330 }, { "epoch": 0.9469850263051396, "grad_norm": 8157133.764333476, "learning_rate": 2.94646873594242e-08, "logits/chosen": -2.4080350399017334, "logits/rejected": -2.3741536140441895, "logps/chosen": -139.84112548828125, "logps/rejected": -175.55917358398438, "loss": 126005.05, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.029435906559228897, "rewards/margins": 0.02673642337322235, "rewards/rejected": -0.05617233365774155, "step": 2340 }, { "epoch": 0.9510319708619992, "grad_norm": 8440293.351897202, "learning_rate": 2.7215474583895635e-08, "logits/chosen": -2.4163994789123535, "logits/rejected": -2.384222984313965, "logps/chosen": -158.38975524902344, "logps/rejected": -164.0137176513672, "loss": 123603.3875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03101927414536476, "rewards/margins": 0.011726012453436852, "rewards/rejected": -0.042745284736156464, "step": 2350 }, { "epoch": 0.9550789154188588, "grad_norm": 6375292.788715957, "learning_rate": 2.496626180836707e-08, "logits/chosen": -2.2860450744628906, "logits/rejected": -2.28193998336792, "logps/chosen": -137.55361938476562, "logps/rejected": -162.74652099609375, "loss": 125693.3125, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.024988356977701187, "rewards/margins": 0.023718636482954025, "rewards/rejected": -0.04870699718594551, "step": 2360 }, { "epoch": 0.9591258599757183, "grad_norm": 9472423.8957588, "learning_rate": 2.2717049032838504e-08, "logits/chosen": -2.359046459197998, "logits/rejected": -2.3519136905670166, "logps/chosen": -135.88697814941406, "logps/rejected": -158.06321716308594, "loss": 127380.25, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.027975231409072876, "rewards/margins": 0.01904093287885189, "rewards/rejected": -0.04701615869998932, "step": 2370 }, { "epoch": 0.9631728045325779, "grad_norm": 6950351.310431428, "learning_rate": 2.046783625730994e-08, "logits/chosen": -2.278303623199463, "logits/rejected": -2.271866798400879, "logps/chosen": -147.02255249023438, "logps/rejected": -162.41444396972656, "loss": 131236.1875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.03206818923354149, "rewards/margins": 0.013242989778518677, "rewards/rejected": -0.045311179012060165, "step": 2380 }, { "epoch": 0.9672197490894374, "grad_norm": 6190616.100642323, "learning_rate": 1.8218623481781373e-08, "logits/chosen": -2.3274073600769043, "logits/rejected": -2.2292959690093994, "logps/chosen": -152.0672149658203, "logps/rejected": -174.8033905029297, "loss": 124131.975, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.027064388617873192, "rewards/margins": 0.015537412837147713, "rewards/rejected": -0.042601801455020905, "step": 2390 }, { "epoch": 0.971266693646297, "grad_norm": 8140978.954292232, "learning_rate": 1.5969410706252813e-08, "logits/chosen": -2.3674769401550293, "logits/rejected": -2.3571083545684814, "logps/chosen": -144.0354461669922, "logps/rejected": -160.3311004638672, "loss": 125102.2375, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02302565984427929, "rewards/margins": 0.0166311115026474, "rewards/rejected": -0.03965677320957184, "step": 2400 }, { "epoch": 0.9753136382031566, "grad_norm": 8862305.552745355, "learning_rate": 1.3720197930724246e-08, "logits/chosen": -2.178356647491455, "logits/rejected": -2.179384708404541, "logps/chosen": -143.9452362060547, "logps/rejected": -151.89974975585938, "loss": 123180.975, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.035528309643268585, "rewards/margins": 0.007553645875304937, "rewards/rejected": -0.04308196157217026, "step": 2410 }, { "epoch": 0.9793605827600161, "grad_norm": 4848306.613269352, "learning_rate": 1.1470985155195682e-08, "logits/chosen": -2.402296781539917, "logits/rejected": -2.3765056133270264, "logps/chosen": -125.8743896484375, "logps/rejected": -145.01162719726562, "loss": 122925.2125, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.017237985506653786, "rewards/margins": 0.019860463216900826, "rewards/rejected": -0.03709845244884491, "step": 2420 }, { "epoch": 0.9834075273168758, "grad_norm": 5809904.408249709, "learning_rate": 9.221772379667116e-09, "logits/chosen": -2.4065396785736084, "logits/rejected": -2.3716368675231934, "logps/chosen": -143.05075073242188, "logps/rejected": -167.95664978027344, "loss": 124604.825, "rewards/accuracies": 0.625, "rewards/chosen": -0.029827838763594627, "rewards/margins": 0.02450350485742092, "rewards/rejected": -0.05433133989572525, "step": 2430 }, { "epoch": 0.9874544718737354, "grad_norm": 7148593.283376818, "learning_rate": 6.972559604138551e-09, "logits/chosen": -2.3499531745910645, "logits/rejected": -2.3520779609680176, "logps/chosen": -130.91500854492188, "logps/rejected": -159.89820861816406, "loss": 119113.2625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.026112830266356468, "rewards/margins": 0.027585214003920555, "rewards/rejected": -0.05369805172085762, "step": 2440 }, { "epoch": 0.9915014164305949, "grad_norm": 7332505.899956737, "learning_rate": 4.723346828609986e-09, "logits/chosen": -2.3807873725891113, "logits/rejected": -2.3282299041748047, "logps/chosen": -138.15525817871094, "logps/rejected": -150.75531005859375, "loss": 124786.675, "rewards/accuracies": 0.625, "rewards/chosen": -0.02242584154009819, "rewards/margins": 0.013798736035823822, "rewards/rejected": -0.03622458130121231, "step": 2450 }, { "epoch": 0.9955483609874545, "grad_norm": 5408793.194556523, "learning_rate": 2.474134053081421e-09, "logits/chosen": -2.305051803588867, "logits/rejected": -2.2709367275238037, "logps/chosen": -127.0162124633789, "logps/rejected": -154.36273193359375, "loss": 125528.575, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02522462233901024, "rewards/margins": 0.017697211354970932, "rewards/rejected": -0.04292182996869087, "step": 2460 }, { "epoch": 0.9995953055443141, "grad_norm": 6547393.597919743, "learning_rate": 2.249212775528565e-10, "logits/chosen": -2.3911020755767822, "logits/rejected": -2.3864622116088867, "logps/chosen": -147.30072021484375, "logps/rejected": -168.179443359375, "loss": 121667.85, "rewards/accuracies": 0.625, "rewards/chosen": -0.027930116280913353, "rewards/margins": 0.010017314925789833, "rewards/rejected": -0.037947431206703186, "step": 2470 } ], "logging_steps": 10, "max_steps": 2471, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }