diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3753 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2471, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004046944556859571, + "grad_norm": 3260930.195416938, + "learning_rate": 2.0161290322580643e-09, + "logits/chosen": -2.216688871383667, + "logits/rejected": -2.1725575923919678, + "logps/chosen": -62.37783432006836, + "logps/rejected": -57.61228561401367, + "loss": 137728.9531, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.004046944556859571, + "grad_norm": 3951641.9256235515, + "learning_rate": 2.0161290322580644e-08, + "logits/chosen": -2.3231096267700195, + "logits/rejected": -2.3050363063812256, + "logps/chosen": -109.29280090332031, + "logps/rejected": -105.20187377929688, + "loss": 128824.3056, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -8.181909652194008e-05, + "rewards/margins": -8.144730236381292e-05, + "rewards/rejected": -3.7179981404733553e-07, + "step": 10 + }, + { + "epoch": 0.008093889113719142, + "grad_norm": 3636837.085798033, + "learning_rate": 4.032258064516129e-08, + "logits/chosen": -2.3102259635925293, + "logits/rejected": -2.3181633949279785, + "logps/chosen": -102.9901351928711, + "logps/rejected": -103.0818099975586, + "loss": 128439.1625, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -3.3925286970770685e-06, + "rewards/margins": -7.394707154162461e-06, + "rewards/rejected": 4.00217959395377e-06, + "step": 20 + }, + { + "epoch": 0.012140833670578713, + "grad_norm": 4189016.609602417, + "learning_rate": 6.048387096774194e-08, + "logits/chosen": -2.2731196880340576, + "logits/rejected": -2.261061191558838, + "logps/chosen": -104.67350769042969, + "logps/rejected": -116.59749603271484, + "loss": 124740.475, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00028529245173558593, + "rewards/margins": 2.6274694391759112e-05, + "rewards/rejected": -0.00031156709883362055, + "step": 30 + }, + { + "epoch": 0.016187778227438283, + "grad_norm": 3141568.670348898, + "learning_rate": 8.064516129032257e-08, + "logits/chosen": -2.3156943321228027, + "logits/rejected": -2.294349193572998, + "logps/chosen": -129.86062622070312, + "logps/rejected": -117.5326156616211, + "loss": 131411.2, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00011367227125447243, + "rewards/margins": -1.9382667233003303e-05, + "rewards/rejected": -9.428960038349032e-05, + "step": 40 + }, + { + "epoch": 0.020234722784297856, + "grad_norm": 4300244.422627452, + "learning_rate": 1.0080645161290321e-07, + "logits/chosen": -2.271444320678711, + "logits/rejected": -2.2707998752593994, + "logps/chosen": -107.74246978759766, + "logps/rejected": -112.56591796875, + "loss": 128522.9375, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.0001433270808774978, + "rewards/margins": 8.896701183402911e-05, + "rewards/rejected": -0.00023229411453939974, + "step": 50 + }, + { + "epoch": 0.024281667341157425, + "grad_norm": 4087404.2243083506, + "learning_rate": 1.2096774193548387e-07, + "logits/chosen": -2.2509658336639404, + "logits/rejected": -2.235924005508423, + "logps/chosen": -98.1602783203125, + "logps/rejected": -97.8387222290039, + "loss": 134684.625, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -3.988773187302286e-06, + "rewards/margins": 0.0002523847797419876, + "rewards/rejected": -0.00025637357612140477, + "step": 60 + }, + { + "epoch": 0.028328611898016998, + "grad_norm": 3240131.8848123536, + "learning_rate": 1.4112903225806453e-07, + "logits/chosen": -2.3215599060058594, + "logits/rejected": -2.3164916038513184, + "logps/chosen": -113.9156265258789, + "logps/rejected": -114.72650146484375, + "loss": 127554.8875, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0005882935947738588, + "rewards/margins": -0.0001803641061997041, + "rewards/rejected": 0.000768657773733139, + "step": 70 + }, + { + "epoch": 0.03237555645487657, + "grad_norm": 4463707.543855141, + "learning_rate": 1.6129032258064515e-07, + "logits/chosen": -2.197829246520996, + "logits/rejected": -2.2096786499023438, + "logps/chosen": -99.81291198730469, + "logps/rejected": -96.83836364746094, + "loss": 129532.875, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 3.9735146856401116e-05, + "rewards/margins": 0.00038970523746684194, + "rewards/rejected": -0.0003499701269902289, + "step": 80 + }, + { + "epoch": 0.036422501011736136, + "grad_norm": 5504977.483755038, + "learning_rate": 1.814516129032258e-07, + "logits/chosen": -2.2197558879852295, + "logits/rejected": -2.200068712234497, + "logps/chosen": -112.21453857421875, + "logps/rejected": -110.07649993896484, + "loss": 132607.275, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.00027725097606889904, + "rewards/margins": 0.00037192669697105885, + "rewards/rejected": -0.0006491777021437883, + "step": 90 + }, + { + "epoch": 0.04046944556859571, + "grad_norm": 3275423.408726695, + "learning_rate": 2.0161290322580642e-07, + "logits/chosen": -2.2803494930267334, + "logits/rejected": -2.277498245239258, + "logps/chosen": -118.47029876708984, + "logps/rejected": -121.81834411621094, + "loss": 129364.775, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.00027665990637615323, + "rewards/margins": 0.0005457916995510459, + "rewards/rejected": -0.00026913188048638403, + "step": 100 + }, + { + "epoch": 0.04451639012545528, + "grad_norm": 3537595.5413078354, + "learning_rate": 2.2177419354838707e-07, + "logits/chosen": -2.2598938941955566, + "logits/rejected": -2.243565797805786, + "logps/chosen": -123.01219177246094, + "logps/rejected": -127.5718994140625, + "loss": 128605.475, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.001143028261139989, + "rewards/margins": 0.0005904460558667779, + "rewards/rejected": -0.0017334744334220886, + "step": 110 + }, + { + "epoch": 0.04856333468231485, + "grad_norm": 4700408.411362441, + "learning_rate": 2.4193548387096775e-07, + "logits/chosen": -2.189763307571411, + "logits/rejected": -2.230834484100342, + "logps/chosen": -111.58349609375, + "logps/rejected": -116.3633041381836, + "loss": 132832.7375, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.0004227511235512793, + "rewards/margins": -0.00036064969026483595, + "rewards/rejected": -6.210146966623142e-05, + "step": 120 + }, + { + "epoch": 0.052610279239174426, + "grad_norm": 4839768.05175113, + "learning_rate": 2.6209677419354835e-07, + "logits/chosen": -2.172719717025757, + "logits/rejected": -2.154069423675537, + "logps/chosen": -131.54049682617188, + "logps/rejected": -127.31382751464844, + "loss": 126528.7375, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0012601370690390468, + "rewards/margins": 0.0014206544728949666, + "rewards/rejected": -0.0026807915419340134, + "step": 130 + }, + { + "epoch": 0.056657223796033995, + "grad_norm": 4462123.98520813, + "learning_rate": 2.8225806451612905e-07, + "logits/chosen": -2.2781708240509033, + "logits/rejected": -2.2529187202453613, + "logps/chosen": -109.3487319946289, + "logps/rejected": -108.7385025024414, + "loss": 128939.875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.0024039470590651035, + "rewards/margins": 0.0019583911634981632, + "rewards/rejected": -0.004362338222563267, + "step": 140 + }, + { + "epoch": 0.060704168352893564, + "grad_norm": 4413918.737440498, + "learning_rate": 3.0241935483870965e-07, + "logits/chosen": -2.0262560844421387, + "logits/rejected": -2.0333077907562256, + "logps/chosen": -115.6955337524414, + "logps/rejected": -129.337890625, + "loss": 125950.125, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": -3.777583970077103e-06, + "rewards/margins": -0.00044618500396609306, + "rewards/rejected": 0.0004424075596034527, + "step": 150 + }, + { + "epoch": 0.06475111290975313, + "grad_norm": 4956705.44191673, + "learning_rate": 3.225806451612903e-07, + "logits/chosen": -2.127880096435547, + "logits/rejected": -2.081531524658203, + "logps/chosen": -115.7586669921875, + "logps/rejected": -115.57160949707031, + "loss": 127159.3875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0029864097014069557, + "rewards/margins": 0.0028749522753059864, + "rewards/rejected": -0.005861361511051655, + "step": 160 + }, + { + "epoch": 0.0687980574666127, + "grad_norm": 5249631.129700843, + "learning_rate": 3.4274193548387095e-07, + "logits/chosen": -1.924232840538025, + "logits/rejected": -1.9467108249664307, + "logps/chosen": -130.4487762451172, + "logps/rejected": -133.85560607910156, + "loss": 125375.3875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.008038095198571682, + "rewards/margins": 0.002187486505135894, + "rewards/rejected": -0.01022558193653822, + "step": 170 + }, + { + "epoch": 0.07284500202347227, + "grad_norm": 4122924.0724422527, + "learning_rate": 3.629032258064516e-07, + "logits/chosen": -2.045342445373535, + "logits/rejected": -2.0415282249450684, + "logps/chosen": -118.37638854980469, + "logps/rejected": -112.38387298583984, + "loss": 126785.075, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.006844646297395229, + "rewards/margins": 0.0005232656258158386, + "rewards/rejected": -0.00736791267991066, + "step": 180 + }, + { + "epoch": 0.07689194658033185, + "grad_norm": 4041640.919843376, + "learning_rate": 3.8306451612903225e-07, + "logits/chosen": -2.0350680351257324, + "logits/rejected": -2.0383336544036865, + "logps/chosen": -96.37462615966797, + "logps/rejected": -109.77508544921875, + "loss": 123590.025, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.005209728144109249, + "rewards/margins": 0.0012681197840720415, + "rewards/rejected": -0.006477847695350647, + "step": 190 + }, + { + "epoch": 0.08093889113719142, + "grad_norm": 5009394.271837804, + "learning_rate": 4.0322580645161285e-07, + "logits/chosen": -2.0026402473449707, + "logits/rejected": -1.9795843362808228, + "logps/chosen": -111.41777038574219, + "logps/rejected": -113.9148178100586, + "loss": 126448.05, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.010595456697046757, + "rewards/margins": 0.003854970680549741, + "rewards/rejected": -0.01445042621344328, + "step": 200 + }, + { + "epoch": 0.08498583569405099, + "grad_norm": 5619272.0636549145, + "learning_rate": 4.2338709677419355e-07, + "logits/chosen": -2.1407713890075684, + "logits/rejected": -2.1545071601867676, + "logps/chosen": -110.00148010253906, + "logps/rejected": -112.6539077758789, + "loss": 128766.15, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.009294772520661354, + "rewards/margins": 0.0027922452427446842, + "rewards/rejected": -0.012087016366422176, + "step": 210 + }, + { + "epoch": 0.08903278025091056, + "grad_norm": 4415731.805933493, + "learning_rate": 4.4354838709677415e-07, + "logits/chosen": -2.3430728912353516, + "logits/rejected": -2.3087539672851562, + "logps/chosen": -131.23556518554688, + "logps/rejected": -134.5377655029297, + "loss": 132823.0125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007352087646722794, + "rewards/margins": 0.004128883592784405, + "rewards/rejected": -0.011480971239507198, + "step": 220 + }, + { + "epoch": 0.09307972480777013, + "grad_norm": 5355155.642181672, + "learning_rate": 4.637096774193548e-07, + "logits/chosen": -2.242619752883911, + "logits/rejected": -2.2341275215148926, + "logps/chosen": -127.72953033447266, + "logps/rejected": -131.52122497558594, + "loss": 126450.9125, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01162335742264986, + "rewards/margins": 0.005030062980949879, + "rewards/rejected": -0.01665342040359974, + "step": 230 + }, + { + "epoch": 0.0971266693646297, + "grad_norm": 4222976.278932744, + "learning_rate": 4.838709677419355e-07, + "logits/chosen": -2.2030246257781982, + "logits/rejected": -2.2015702724456787, + "logps/chosen": -108.4349365234375, + "logps/rejected": -110.720703125, + "loss": 133174.225, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.009825309738516808, + "rewards/margins": 0.003020837437361479, + "rewards/rejected": -0.012846146710216999, + "step": 240 + }, + { + "epoch": 0.10117361392148927, + "grad_norm": 5332420.53657513, + "learning_rate": 4.995501574448943e-07, + "logits/chosen": -2.1023497581481934, + "logits/rejected": -2.1095871925354004, + "logps/chosen": -110.6066665649414, + "logps/rejected": -117.8967056274414, + "loss": 127655.45, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.006971948780119419, + "rewards/margins": 0.0035569421015679836, + "rewards/rejected": -0.010528890416026115, + "step": 250 + }, + { + "epoch": 0.10522055847834885, + "grad_norm": 5679375.003121498, + "learning_rate": 4.973009446693657e-07, + "logits/chosen": -2.199481964111328, + "logits/rejected": -2.18941330909729, + "logps/chosen": -117.3616943359375, + "logps/rejected": -118.54112243652344, + "loss": 132409.6875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.013199470937252045, + "rewards/margins": 0.001887517748400569, + "rewards/rejected": -0.015086987987160683, + "step": 260 + }, + { + "epoch": 0.10926750303520842, + "grad_norm": 4610431.437626582, + "learning_rate": 4.950517318938372e-07, + "logits/chosen": -2.3225109577178955, + "logits/rejected": -2.3390707969665527, + "logps/chosen": -124.8027572631836, + "logps/rejected": -129.42922973632812, + "loss": 125030.1375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.012582078576087952, + "rewards/margins": 0.0034210742451250553, + "rewards/rejected": -0.01600315235555172, + "step": 270 + }, + { + "epoch": 0.11331444759206799, + "grad_norm": 6556255.599818757, + "learning_rate": 4.928025191183086e-07, + "logits/chosen": -2.1859679222106934, + "logits/rejected": -2.2081589698791504, + "logps/chosen": -114.65242767333984, + "logps/rejected": -124.5443344116211, + "loss": 124704.9, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.016057247295975685, + "rewards/margins": 0.002059857128188014, + "rewards/rejected": -0.018117103725671768, + "step": 280 + }, + { + "epoch": 0.11736139214892756, + "grad_norm": 6805409.426599127, + "learning_rate": 4.9055330634278e-07, + "logits/chosen": -2.219308853149414, + "logits/rejected": -2.214458465576172, + "logps/chosen": -134.7147979736328, + "logps/rejected": -142.11083984375, + "loss": 127160.525, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.009862681850790977, + "rewards/margins": 0.005548264365643263, + "rewards/rejected": -0.015410944819450378, + "step": 290 + }, + { + "epoch": 0.12140833670578713, + "grad_norm": 6068819.6586095495, + "learning_rate": 4.883040935672515e-07, + "logits/chosen": -2.258293390274048, + "logits/rejected": -2.228738307952881, + "logps/chosen": -132.33441162109375, + "logps/rejected": -141.69737243652344, + "loss": 128142.7625, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.008503363467752934, + "rewards/margins": 0.007645074278116226, + "rewards/rejected": -0.016148436814546585, + "step": 300 + }, + { + "epoch": 0.1254552812626467, + "grad_norm": 5267448.920762056, + "learning_rate": 4.860548807917229e-07, + "logits/chosen": -2.2171027660369873, + "logits/rejected": -2.2104790210723877, + "logps/chosen": -125.05142974853516, + "logps/rejected": -133.34071350097656, + "loss": 125674.1, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.019455790519714355, + "rewards/margins": 0.0074443453922867775, + "rewards/rejected": -0.026900136843323708, + "step": 310 + }, + { + "epoch": 0.12950222581950627, + "grad_norm": 6667685.083680488, + "learning_rate": 4.838056680161944e-07, + "logits/chosen": -2.1860244274139404, + "logits/rejected": -2.2035775184631348, + "logps/chosen": -122.4665756225586, + "logps/rejected": -132.46490478515625, + "loss": 125480.4125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017699861899018288, + "rewards/margins": 0.006000404246151447, + "rewards/rejected": -0.02370026707649231, + "step": 320 + }, + { + "epoch": 0.13354917037636585, + "grad_norm": 6064623.088294091, + "learning_rate": 4.815564552406658e-07, + "logits/chosen": -2.0421011447906494, + "logits/rejected": -2.057572603225708, + "logps/chosen": -134.10183715820312, + "logps/rejected": -144.3116912841797, + "loss": 124604.7875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.019064148887991905, + "rewards/margins": 0.006390860769897699, + "rewards/rejected": -0.025455012917518616, + "step": 330 + }, + { + "epoch": 0.1375961149332254, + "grad_norm": 12560788.46443257, + "learning_rate": 4.793072424651372e-07, + "logits/chosen": -1.9278684854507446, + "logits/rejected": -1.909166693687439, + "logps/chosen": -146.60585021972656, + "logps/rejected": -166.07937622070312, + "loss": 140379.8375, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0366508811712265, + "rewards/margins": 0.013624541461467743, + "rewards/rejected": -0.050275422632694244, + "step": 340 + }, + { + "epoch": 0.141643059490085, + "grad_norm": 6487628.638191885, + "learning_rate": 4.770580296896087e-07, + "logits/chosen": -2.11842679977417, + "logits/rejected": -2.1011595726013184, + "logps/chosen": -119.56195068359375, + "logps/rejected": -136.84823608398438, + "loss": 130511.3625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.0215927604585886, + "rewards/margins": 0.007697033230215311, + "rewards/rejected": -0.029289793223142624, + "step": 350 + }, + { + "epoch": 0.14569000404694454, + "grad_norm": 4799050.3946148325, + "learning_rate": 4.7480881691408005e-07, + "logits/chosen": -2.0867960453033447, + "logits/rejected": -2.082698345184326, + "logps/chosen": -128.99429321289062, + "logps/rejected": -130.89785766601562, + "loss": 127926.0, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.016279883682727814, + "rewards/margins": 0.0010558776557445526, + "rewards/rejected": -0.017335761338472366, + "step": 360 + }, + { + "epoch": 0.14973694860380413, + "grad_norm": 5249060.947314388, + "learning_rate": 4.725596041385515e-07, + "logits/chosen": -2.1251657009124756, + "logits/rejected": -2.1052744388580322, + "logps/chosen": -121.3799819946289, + "logps/rejected": -121.27701568603516, + "loss": 131676.2375, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.017586207017302513, + "rewards/margins": 0.003813033225014806, + "rewards/rejected": -0.0213992390781641, + "step": 370 + }, + { + "epoch": 0.1537838931606637, + "grad_norm": 5293517.066249103, + "learning_rate": 4.7031039136302294e-07, + "logits/chosen": -2.15531587600708, + "logits/rejected": -2.153560161590576, + "logps/chosen": -159.96005249023438, + "logps/rejected": -153.87448120117188, + "loss": 121504.05, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.017819028347730637, + "rewards/margins": 0.008313515223562717, + "rewards/rejected": -0.02613254450261593, + "step": 380 + }, + { + "epoch": 0.15783083771752326, + "grad_norm": 5270002.835932803, + "learning_rate": 4.6806117858749433e-07, + "logits/chosen": -2.1870741844177246, + "logits/rejected": -2.171494483947754, + "logps/chosen": -148.86929321289062, + "logps/rejected": -164.97915649414062, + "loss": 129892.05, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.02027943730354309, + "rewards/margins": 0.0176930520683527, + "rewards/rejected": -0.03797249123454094, + "step": 390 + }, + { + "epoch": 0.16187778227438285, + "grad_norm": 4898003.511863262, + "learning_rate": 4.658119658119658e-07, + "logits/chosen": -2.1435184478759766, + "logits/rejected": -2.148679256439209, + "logps/chosen": -128.7902069091797, + "logps/rejected": -139.18150329589844, + "loss": 122692.925, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.018103724345564842, + "rewards/margins": 0.006611344870179892, + "rewards/rejected": -0.02471506968140602, + "step": 400 + }, + { + "epoch": 0.1659247268312424, + "grad_norm": 4183644.18979808, + "learning_rate": 4.635627530364372e-07, + "logits/chosen": -2.150381565093994, + "logits/rejected": -2.154317617416382, + "logps/chosen": -108.93717193603516, + "logps/rejected": -118.07032775878906, + "loss": 126758.0375, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.021253790706396103, + "rewards/margins": 0.003508577588945627, + "rewards/rejected": -0.024762369692325592, + "step": 410 + }, + { + "epoch": 0.16997167138810199, + "grad_norm": 5593714.467836783, + "learning_rate": 4.6131354026090867e-07, + "logits/chosen": -2.180170774459839, + "logits/rejected": -2.1523594856262207, + "logps/chosen": -126.38621520996094, + "logps/rejected": -136.35755920410156, + "loss": 121196.2, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01847546175122261, + "rewards/margins": 0.0067595853470265865, + "rewards/rejected": -0.025235047563910484, + "step": 420 + }, + { + "epoch": 0.17401861594496154, + "grad_norm": 3566616.7901411816, + "learning_rate": 4.590643274853801e-07, + "logits/chosen": -2.120450258255005, + "logits/rejected": -2.150542974472046, + "logps/chosen": -137.63836669921875, + "logps/rejected": -141.17825317382812, + "loss": 132284.5875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.02999441884458065, + "rewards/margins": 0.0025890106335282326, + "rewards/rejected": -0.03258342668414116, + "step": 430 + }, + { + "epoch": 0.17806556050182112, + "grad_norm": 6039791.29116107, + "learning_rate": 4.568151147098515e-07, + "logits/chosen": -2.2097067832946777, + "logits/rejected": -2.1825873851776123, + "logps/chosen": -127.94209289550781, + "logps/rejected": -137.39776611328125, + "loss": 128589.475, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.023606717586517334, + "rewards/margins": 0.009609794244170189, + "rewards/rejected": -0.03321651369333267, + "step": 440 + }, + { + "epoch": 0.1821125050586807, + "grad_norm": 6343148.886392033, + "learning_rate": 4.54565901934323e-07, + "logits/chosen": -2.1717894077301025, + "logits/rejected": -2.2131998538970947, + "logps/chosen": -129.89688110351562, + "logps/rejected": -145.33839416503906, + "loss": 124381.275, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.020499037578701973, + "rewards/margins": 0.013838306069374084, + "rewards/rejected": -0.03433733806014061, + "step": 450 + }, + { + "epoch": 0.18615944961554026, + "grad_norm": 4981408.5070092585, + "learning_rate": 4.523166891587944e-07, + "logits/chosen": -2.2632086277008057, + "logits/rejected": -2.306267738342285, + "logps/chosen": -163.80706787109375, + "logps/rejected": -155.72915649414062, + "loss": 158881.6375, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.03987263888120651, + "rewards/margins": -0.008330432698130608, + "rewards/rejected": -0.03154221177101135, + "step": 460 + }, + { + "epoch": 0.19020639417239985, + "grad_norm": 6186406.38261752, + "learning_rate": 4.500674763832658e-07, + "logits/chosen": -2.4067013263702393, + "logits/rejected": -2.4073116779327393, + "logps/chosen": -123.8814697265625, + "logps/rejected": -133.23178100585938, + "loss": 129765.4625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01776200719177723, + "rewards/margins": 0.006926923058927059, + "rewards/rejected": -0.024688933044672012, + "step": 470 + }, + { + "epoch": 0.1942533387292594, + "grad_norm": 7924184.670127909, + "learning_rate": 4.478182636077373e-07, + "logits/chosen": -2.4064009189605713, + "logits/rejected": -2.3933303356170654, + "logps/chosen": -120.53520202636719, + "logps/rejected": -124.30986022949219, + "loss": 127188.5875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013866530731320381, + "rewards/margins": 0.0045671057887375355, + "rewards/rejected": -0.01843363419175148, + "step": 480 + }, + { + "epoch": 0.19830028328611898, + "grad_norm": 6796881.168124855, + "learning_rate": 4.455690508322087e-07, + "logits/chosen": -2.35581636428833, + "logits/rejected": -2.276433229446411, + "logps/chosen": -113.40742492675781, + "logps/rejected": -126.89019775390625, + "loss": 122585.6875, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.015190203674137592, + "rewards/margins": 0.010421663522720337, + "rewards/rejected": -0.025611868128180504, + "step": 490 + }, + { + "epoch": 0.20234722784297854, + "grad_norm": 9409785.188721178, + "learning_rate": 4.433198380566802e-07, + "logits/chosen": -2.200453519821167, + "logits/rejected": -2.2011332511901855, + "logps/chosen": -156.01809692382812, + "logps/rejected": -169.92514038085938, + "loss": 129704.3, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.025881418958306313, + "rewards/margins": 0.010276483371853828, + "rewards/rejected": -0.03615789860486984, + "step": 500 + }, + { + "epoch": 0.20639417239983812, + "grad_norm": 5757712.5781175345, + "learning_rate": 4.410706252811516e-07, + "logits/chosen": -2.127547025680542, + "logits/rejected": -2.1388392448425293, + "logps/chosen": -130.27249145507812, + "logps/rejected": -145.90647888183594, + "loss": 123361.8125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.03281703591346741, + "rewards/margins": 0.009785661473870277, + "rewards/rejected": -0.042602695524692535, + "step": 510 + }, + { + "epoch": 0.2104411169566977, + "grad_norm": 5742087.523014036, + "learning_rate": 4.3882141250562297e-07, + "logits/chosen": -2.2757978439331055, + "logits/rejected": -2.2460601329803467, + "logps/chosen": -153.6471710205078, + "logps/rejected": -165.54989624023438, + "loss": 127158.9125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02743702568113804, + "rewards/margins": 0.017125947400927544, + "rewards/rejected": -0.04456297308206558, + "step": 520 + }, + { + "epoch": 0.21448806151355726, + "grad_norm": 6000988.402036818, + "learning_rate": 4.3657219973009447e-07, + "logits/chosen": -2.14945387840271, + "logits/rejected": -2.160613775253296, + "logps/chosen": -152.8687286376953, + "logps/rejected": -157.02215576171875, + "loss": 130855.4125, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.03712679445743561, + "rewards/margins": -0.002356339478865266, + "rewards/rejected": -0.03477045148611069, + "step": 530 + }, + { + "epoch": 0.21853500607041684, + "grad_norm": 7039581.88958706, + "learning_rate": 4.3432298695456586e-07, + "logits/chosen": -2.1952900886535645, + "logits/rejected": -2.125767946243286, + "logps/chosen": -121.56607818603516, + "logps/rejected": -136.8863983154297, + "loss": 124032.45, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.021455224603414536, + "rewards/margins": 0.01183997467160225, + "rewards/rejected": -0.03329520300030708, + "step": 540 + }, + { + "epoch": 0.2225819506272764, + "grad_norm": 6851510.087607766, + "learning_rate": 4.3207377417903736e-07, + "logits/chosen": -2.3099186420440674, + "logits/rejected": -2.2750840187072754, + "logps/chosen": -133.94058227539062, + "logps/rejected": -165.82687377929688, + "loss": 127159.35, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02089579775929451, + "rewards/margins": 0.011938202194869518, + "rewards/rejected": -0.0328340008854866, + "step": 550 + }, + { + "epoch": 0.22662889518413598, + "grad_norm": 7651455.0301742535, + "learning_rate": 4.2982456140350876e-07, + "logits/chosen": -2.281270980834961, + "logits/rejected": -2.291888475418091, + "logps/chosen": -139.83163452148438, + "logps/rejected": -141.5286865234375, + "loss": 130547.225, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.02193923108279705, + "rewards/margins": 0.0074460976757109165, + "rewards/rejected": -0.02938532829284668, + "step": 560 + }, + { + "epoch": 0.23067583974099554, + "grad_norm": 4842418.287253727, + "learning_rate": 4.2757534862798015e-07, + "logits/chosen": -2.28908634185791, + "logits/rejected": -2.2613823413848877, + "logps/chosen": -130.56756591796875, + "logps/rejected": -136.48858642578125, + "loss": 129810.7, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.018774276599287987, + "rewards/margins": 0.012544331140816212, + "rewards/rejected": -0.03131860867142677, + "step": 570 + }, + { + "epoch": 0.23472278429785512, + "grad_norm": 5753286.585250832, + "learning_rate": 4.2532613585245165e-07, + "logits/chosen": -2.3290882110595703, + "logits/rejected": -2.2913310527801514, + "logps/chosen": -128.60073852539062, + "logps/rejected": -144.4147491455078, + "loss": 125407.5625, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.022057032212615013, + "rewards/margins": 0.013317006640136242, + "rewards/rejected": -0.03537403792142868, + "step": 580 + }, + { + "epoch": 0.2387697288547147, + "grad_norm": 6854533.186683347, + "learning_rate": 4.2307692307692304e-07, + "logits/chosen": -2.1821513175964355, + "logits/rejected": -2.227368116378784, + "logps/chosen": -132.9744873046875, + "logps/rejected": -143.91380310058594, + "loss": 119907.075, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.024536501616239548, + "rewards/margins": 0.00792471133172512, + "rewards/rejected": -0.03246121481060982, + "step": 590 + }, + { + "epoch": 0.24281667341157426, + "grad_norm": 7000163.800918494, + "learning_rate": 4.208277103013945e-07, + "logits/chosen": -2.2966506481170654, + "logits/rejected": -2.274991989135742, + "logps/chosen": -140.1864776611328, + "logps/rejected": -142.9268798828125, + "loss": 129494.7625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.026311520487070084, + "rewards/margins": 0.005165449343621731, + "rewards/rejected": -0.03147696703672409, + "step": 600 + }, + { + "epoch": 0.24686361796843384, + "grad_norm": 5155538.44716785, + "learning_rate": 4.1857849752586593e-07, + "logits/chosen": -2.2126269340515137, + "logits/rejected": -2.2339818477630615, + "logps/chosen": -143.7578125, + "logps/rejected": -148.81027221679688, + "loss": 131088.325, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.022633636370301247, + "rewards/margins": 0.005649174097925425, + "rewards/rejected": -0.028282811865210533, + "step": 610 + }, + { + "epoch": 0.2509105625252934, + "grad_norm": 6494761.148749808, + "learning_rate": 4.1632928475033733e-07, + "logits/chosen": -2.2412619590759277, + "logits/rejected": -2.215108633041382, + "logps/chosen": -133.82061767578125, + "logps/rejected": -144.2487030029297, + "loss": 127834.35, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.023778211325407028, + "rewards/margins": 0.008780455216765404, + "rewards/rejected": -0.03255866840481758, + "step": 620 + }, + { + "epoch": 0.254957507082153, + "grad_norm": 6581411.527197339, + "learning_rate": 4.140800719748088e-07, + "logits/chosen": -2.3006882667541504, + "logits/rejected": -2.279165744781494, + "logps/chosen": -127.95011901855469, + "logps/rejected": -144.5232696533203, + "loss": 128899.5125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017114771530032158, + "rewards/margins": 0.012585528194904327, + "rewards/rejected": -0.029700294137001038, + "step": 630 + }, + { + "epoch": 0.25900445163901253, + "grad_norm": 6993144.620436077, + "learning_rate": 4.118308591992802e-07, + "logits/chosen": -2.288159132003784, + "logits/rejected": -2.27152681350708, + "logps/chosen": -116.51515197753906, + "logps/rejected": -134.83572387695312, + "loss": 122510.6375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014996351674199104, + "rewards/margins": 0.018196506425738335, + "rewards/rejected": -0.03319285809993744, + "step": 640 + }, + { + "epoch": 0.2630513961958721, + "grad_norm": 5352708.94864355, + "learning_rate": 4.0958164642375167e-07, + "logits/chosen": -2.33659029006958, + "logits/rejected": -2.3185806274414062, + "logps/chosen": -143.27899169921875, + "logps/rejected": -154.21240234375, + "loss": 128047.15, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.022664163261651993, + "rewards/margins": 0.016272926703095436, + "rewards/rejected": -0.03893708810210228, + "step": 650 + }, + { + "epoch": 0.2670983407527317, + "grad_norm": 5853928.770602061, + "learning_rate": 4.073324336482231e-07, + "logits/chosen": -2.2209713459014893, + "logits/rejected": -2.197364091873169, + "logps/chosen": -154.97152709960938, + "logps/rejected": -164.9137725830078, + "loss": 126285.6125, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02928345464169979, + "rewards/margins": 0.017678027972579002, + "rewards/rejected": -0.046961478888988495, + "step": 660 + }, + { + "epoch": 0.27114528530959126, + "grad_norm": 5468563.620033422, + "learning_rate": 4.0508322087269456e-07, + "logits/chosen": -2.368302822113037, + "logits/rejected": -2.359222888946533, + "logps/chosen": -138.3487091064453, + "logps/rejected": -131.19773864746094, + "loss": 135010.325, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.022097600623965263, + "rewards/margins": -0.0010355912381783128, + "rewards/rejected": -0.021062009036540985, + "step": 670 + }, + { + "epoch": 0.2751922298664508, + "grad_norm": 5145007.282508669, + "learning_rate": 4.02834008097166e-07, + "logits/chosen": -2.2279224395751953, + "logits/rejected": -2.227818250656128, + "logps/chosen": -151.80599975585938, + "logps/rejected": -155.39369201660156, + "loss": 124851.875, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.023444540798664093, + "rewards/margins": 0.006362411193549633, + "rewards/rejected": -0.0298069529235363, + "step": 680 + }, + { + "epoch": 0.2792391744233104, + "grad_norm": 5800338.778716969, + "learning_rate": 4.005847953216374e-07, + "logits/chosen": -2.3348867893218994, + "logits/rejected": -2.3266310691833496, + "logps/chosen": -125.41386413574219, + "logps/rejected": -131.49343872070312, + "loss": 127372.8125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021091241389513016, + "rewards/margins": 0.00724734365940094, + "rewards/rejected": -0.028338585048913956, + "step": 690 + }, + { + "epoch": 0.28328611898017, + "grad_norm": 8105894.218684362, + "learning_rate": 3.9833558254610884e-07, + "logits/chosen": -2.309593677520752, + "logits/rejected": -2.2981934547424316, + "logps/chosen": -132.08596801757812, + "logps/rejected": -137.70201110839844, + "loss": 124781.725, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.021190345287322998, + "rewards/margins": 0.0066665285266935825, + "rewards/rejected": -0.027856875211000443, + "step": 700 + }, + { + "epoch": 0.28733306353702953, + "grad_norm": 5039380.169629858, + "learning_rate": 3.960863697705803e-07, + "logits/chosen": -2.315074920654297, + "logits/rejected": -2.3194656372070312, + "logps/chosen": -147.24713134765625, + "logps/rejected": -158.99636840820312, + "loss": 128105.9625, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.018308859318494797, + "rewards/margins": 0.00702436501160264, + "rewards/rejected": -0.0253332257270813, + "step": 710 + }, + { + "epoch": 0.2913800080938891, + "grad_norm": 6770507.385238732, + "learning_rate": 3.9383715699505173e-07, + "logits/chosen": -2.3582499027252197, + "logits/rejected": -2.307143211364746, + "logps/chosen": -141.00454711914062, + "logps/rejected": -145.4442901611328, + "loss": 128073.85, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02170463278889656, + "rewards/margins": 0.004128533415496349, + "rewards/rejected": -0.025833168998360634, + "step": 720 + }, + { + "epoch": 0.2954269526507487, + "grad_norm": 6843599.0452563455, + "learning_rate": 3.9158794421952313e-07, + "logits/chosen": -2.2773690223693848, + "logits/rejected": -2.2705273628234863, + "logps/chosen": -127.78352355957031, + "logps/rejected": -128.8694305419922, + "loss": 133363.3375, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.019749607890844345, + "rewards/margins": 0.0013956364709883928, + "rewards/rejected": -0.02114524319767952, + "step": 730 + }, + { + "epoch": 0.29947389720760825, + "grad_norm": 6414207.014030725, + "learning_rate": 3.893387314439946e-07, + "logits/chosen": -2.2219457626342773, + "logits/rejected": -2.1614620685577393, + "logps/chosen": -138.95530700683594, + "logps/rejected": -159.24916076660156, + "loss": 125832.575, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01657973788678646, + "rewards/margins": 0.015021143481135368, + "rewards/rejected": -0.03160088509321213, + "step": 740 + }, + { + "epoch": 0.3035208417644678, + "grad_norm": 6251391.785537995, + "learning_rate": 3.87089518668466e-07, + "logits/chosen": -2.216029167175293, + "logits/rejected": -2.2095859050750732, + "logps/chosen": -139.25477600097656, + "logps/rejected": -146.38613891601562, + "loss": 126431.6625, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02343796379864216, + "rewards/margins": 0.012653100304305553, + "rewards/rejected": -0.03609105944633484, + "step": 750 + }, + { + "epoch": 0.3075677863213274, + "grad_norm": 5534957.115190962, + "learning_rate": 3.8484030589293747e-07, + "logits/chosen": -2.2073702812194824, + "logits/rejected": -2.209057569503784, + "logps/chosen": -130.53199768066406, + "logps/rejected": -137.91250610351562, + "loss": 127669.2, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.01518569327890873, + "rewards/margins": 0.007037720177322626, + "rewards/rejected": -0.02222341299057007, + "step": 760 + }, + { + "epoch": 0.311614730878187, + "grad_norm": 4890109.310049175, + "learning_rate": 3.825910931174089e-07, + "logits/chosen": -2.225956678390503, + "logits/rejected": -2.210540294647217, + "logps/chosen": -127.26595306396484, + "logps/rejected": -133.6049041748047, + "loss": 124534.6875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02195551246404648, + "rewards/margins": 0.006126697175204754, + "rewards/rejected": -0.02808220684528351, + "step": 770 + }, + { + "epoch": 0.31566167543504653, + "grad_norm": 6427608.185533696, + "learning_rate": 3.803418803418803e-07, + "logits/chosen": -2.2634310722351074, + "logits/rejected": -2.245199203491211, + "logps/chosen": -137.40240478515625, + "logps/rejected": -143.7775115966797, + "loss": 129704.1875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.022509312257170677, + "rewards/margins": 0.003963272087275982, + "rewards/rejected": -0.026472587138414383, + "step": 780 + }, + { + "epoch": 0.3197086199919061, + "grad_norm": 6730619.873094239, + "learning_rate": 3.7809266756635175e-07, + "logits/chosen": -2.1104772090911865, + "logits/rejected": -2.0919671058654785, + "logps/chosen": -125.5869369506836, + "logps/rejected": -133.48800659179688, + "loss": 125677.675, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.024079788476228714, + "rewards/margins": 0.007324723992496729, + "rewards/rejected": -0.0314045175909996, + "step": 790 + }, + { + "epoch": 0.3237555645487657, + "grad_norm": 6156066.531026818, + "learning_rate": 3.758434547908232e-07, + "logits/chosen": -2.213543176651001, + "logits/rejected": -2.1960647106170654, + "logps/chosen": -145.46665954589844, + "logps/rejected": -159.2154541015625, + "loss": 121552.525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.021815134212374687, + "rewards/margins": 0.014243106357753277, + "rewards/rejected": -0.03605823963880539, + "step": 800 + }, + { + "epoch": 0.32780250910562525, + "grad_norm": 6503545.886073305, + "learning_rate": 3.735942420152946e-07, + "logits/chosen": -2.120095729827881, + "logits/rejected": -2.0986738204956055, + "logps/chosen": -134.55508422851562, + "logps/rejected": -152.37815856933594, + "loss": 122828.6875, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.025746628642082214, + "rewards/margins": 0.01750759594142437, + "rewards/rejected": -0.043254222720861435, + "step": 810 + }, + { + "epoch": 0.3318494536624848, + "grad_norm": 5263993.227861122, + "learning_rate": 3.713450292397661e-07, + "logits/chosen": -2.236570358276367, + "logits/rejected": -2.216663360595703, + "logps/chosen": -137.65792846679688, + "logps/rejected": -137.9815673828125, + "loss": 125940.1375, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.025968383997678757, + "rewards/margins": 0.009893245995044708, + "rewards/rejected": -0.03586163371801376, + "step": 820 + }, + { + "epoch": 0.3358963982193444, + "grad_norm": 5564470.498348531, + "learning_rate": 3.690958164642375e-07, + "logits/chosen": -2.2721188068389893, + "logits/rejected": -2.2634165287017822, + "logps/chosen": -146.41432189941406, + "logps/rejected": -148.6261749267578, + "loss": 130783.825, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.028409641236066818, + "rewards/margins": 0.013621616177260876, + "rewards/rejected": -0.04203125834465027, + "step": 830 + }, + { + "epoch": 0.33994334277620397, + "grad_norm": 4256533.420167086, + "learning_rate": 3.66846603688709e-07, + "logits/chosen": -2.355905532836914, + "logits/rejected": -2.3262717723846436, + "logps/chosen": -135.9013671875, + "logps/rejected": -144.34478759765625, + "loss": 126088.525, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02681833505630493, + "rewards/margins": 0.009647052735090256, + "rewards/rejected": -0.03646538779139519, + "step": 840 + }, + { + "epoch": 0.3439902873330635, + "grad_norm": 6179484.090448809, + "learning_rate": 3.645973909131804e-07, + "logits/chosen": -2.2327308654785156, + "logits/rejected": -2.194852828979492, + "logps/chosen": -131.39376831054688, + "logps/rejected": -155.78671264648438, + "loss": 125825.075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02463443949818611, + "rewards/margins": 0.01199124101549387, + "rewards/rejected": -0.03662567585706711, + "step": 850 + }, + { + "epoch": 0.3480372318899231, + "grad_norm": 5448182.802456733, + "learning_rate": 3.6234817813765177e-07, + "logits/chosen": -2.2509052753448486, + "logits/rejected": -2.2193102836608887, + "logps/chosen": -131.55270385742188, + "logps/rejected": -144.63186645507812, + "loss": 130804.4625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019251421093940735, + "rewards/margins": 0.008828094229102135, + "rewards/rejected": -0.02807951346039772, + "step": 860 + }, + { + "epoch": 0.3520841764467827, + "grad_norm": 4837603.177346373, + "learning_rate": 3.6009896536212327e-07, + "logits/chosen": -2.433258056640625, + "logits/rejected": -2.404008150100708, + "logps/chosen": -135.0418243408203, + "logps/rejected": -134.2267303466797, + "loss": 122885.925, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.017599385231733322, + "rewards/margins": 0.0028229092713445425, + "rewards/rejected": -0.02042229473590851, + "step": 870 + }, + { + "epoch": 0.35613112100364225, + "grad_norm": 5692190.523993364, + "learning_rate": 3.5784975258659466e-07, + "logits/chosen": -2.372664213180542, + "logits/rejected": -2.4038546085357666, + "logps/chosen": -145.62425231933594, + "logps/rejected": -161.4815216064453, + "loss": 125917.475, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021967049688100815, + "rewards/margins": 0.006153530441224575, + "rewards/rejected": -0.028120581060647964, + "step": 880 + }, + { + "epoch": 0.3601780655605018, + "grad_norm": 5143465.092976311, + "learning_rate": 3.5560053981106616e-07, + "logits/chosen": -2.4298062324523926, + "logits/rejected": -2.441378116607666, + "logps/chosen": -114.91922760009766, + "logps/rejected": -128.6441192626953, + "loss": 125689.425, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.02063891664147377, + "rewards/margins": 0.0056837559677660465, + "rewards/rejected": -0.02632267400622368, + "step": 890 + }, + { + "epoch": 0.3642250101173614, + "grad_norm": 7348686.998660731, + "learning_rate": 3.5335132703553755e-07, + "logits/chosen": -2.3470609188079834, + "logits/rejected": -2.338306427001953, + "logps/chosen": -142.22169494628906, + "logps/rejected": -155.0847625732422, + "loss": 127013.675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.021488003432750702, + "rewards/margins": 0.00833697896450758, + "rewards/rejected": -0.029824981465935707, + "step": 900 + }, + { + "epoch": 0.36827195467422097, + "grad_norm": 5455491.748505866, + "learning_rate": 3.5110211426000895e-07, + "logits/chosen": -2.328141689300537, + "logits/rejected": -2.300947666168213, + "logps/chosen": -144.3502655029297, + "logps/rejected": -160.5428466796875, + "loss": 132699.2, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.019702186807990074, + "rewards/margins": 0.012580236420035362, + "rewards/rejected": -0.032282426953315735, + "step": 910 + }, + { + "epoch": 0.3723188992310805, + "grad_norm": 5918454.321642784, + "learning_rate": 3.4885290148448044e-07, + "logits/chosen": -2.2618203163146973, + "logits/rejected": -2.273591995239258, + "logps/chosen": -140.4850616455078, + "logps/rejected": -144.3483123779297, + "loss": 126713.925, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02276991680264473, + "rewards/margins": 0.008334951475262642, + "rewards/rejected": -0.03110486827790737, + "step": 920 + }, + { + "epoch": 0.3763658437879401, + "grad_norm": 7045240.388394526, + "learning_rate": 3.4660368870895184e-07, + "logits/chosen": -2.3371381759643555, + "logits/rejected": -2.3148632049560547, + "logps/chosen": -141.95742797851562, + "logps/rejected": -160.81312561035156, + "loss": 124856.1375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02805008552968502, + "rewards/margins": 0.014153921976685524, + "rewards/rejected": -0.042204007506370544, + "step": 930 + }, + { + "epoch": 0.3804127883447997, + "grad_norm": 5526632.5094240755, + "learning_rate": 3.443544759334233e-07, + "logits/chosen": -2.3289644718170166, + "logits/rejected": -2.3078227043151855, + "logps/chosen": -151.3744354248047, + "logps/rejected": -153.30511474609375, + "loss": 126556.475, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02584829553961754, + "rewards/margins": 0.005777581594884396, + "rewards/rejected": -0.03162587806582451, + "step": 940 + }, + { + "epoch": 0.38445973290165925, + "grad_norm": 6075148.892704811, + "learning_rate": 3.4210526315789473e-07, + "logits/chosen": -2.2021899223327637, + "logits/rejected": -2.199693441390991, + "logps/chosen": -126.38993072509766, + "logps/rejected": -135.55516052246094, + "loss": 130061.4375, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02466515824198723, + "rewards/margins": 0.0069196284748613834, + "rewards/rejected": -0.03158479183912277, + "step": 950 + }, + { + "epoch": 0.3885066774585188, + "grad_norm": 5994722.402682892, + "learning_rate": 3.398560503823661e-07, + "logits/chosen": -2.375749349594116, + "logits/rejected": -2.350696086883545, + "logps/chosen": -135.23800659179688, + "logps/rejected": -143.55755615234375, + "loss": 130424.2625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020815346390008926, + "rewards/margins": 0.009448934346437454, + "rewards/rejected": -0.03026428259909153, + "step": 960 + }, + { + "epoch": 0.3925536220153784, + "grad_norm": 6955347.020210707, + "learning_rate": 3.376068376068376e-07, + "logits/chosen": -2.41917085647583, + "logits/rejected": -2.3552684783935547, + "logps/chosen": -133.61973571777344, + "logps/rejected": -150.07192993164062, + "loss": 126116.675, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.023608971387147903, + "rewards/margins": 0.013977563008666039, + "rewards/rejected": -0.03758653253316879, + "step": 970 + }, + { + "epoch": 0.39660056657223797, + "grad_norm": 7026469.492711891, + "learning_rate": 3.35357624831309e-07, + "logits/chosen": -2.472712993621826, + "logits/rejected": -2.4403810501098633, + "logps/chosen": -144.36697387695312, + "logps/rejected": -160.89016723632812, + "loss": 125593.175, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0269068144261837, + "rewards/margins": 0.013755050487816334, + "rewards/rejected": -0.04066186398267746, + "step": 980 + }, + { + "epoch": 0.4006475111290975, + "grad_norm": 5279874.300128242, + "learning_rate": 3.3310841205578046e-07, + "logits/chosen": -2.3706583976745605, + "logits/rejected": -2.362694263458252, + "logps/chosen": -130.1677703857422, + "logps/rejected": -150.29214477539062, + "loss": 122425.7125, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02780333161354065, + "rewards/margins": 0.010212745517492294, + "rewards/rejected": -0.038016077131032944, + "step": 990 + }, + { + "epoch": 0.4046944556859571, + "grad_norm": 7346421.9033947745, + "learning_rate": 3.308591992802519e-07, + "logits/chosen": -2.3910489082336426, + "logits/rejected": -2.360917091369629, + "logps/chosen": -134.7192840576172, + "logps/rejected": -145.80410766601562, + "loss": 120740.6875, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02175028808414936, + "rewards/margins": 0.012434590607881546, + "rewards/rejected": -0.03418487682938576, + "step": 1000 + }, + { + "epoch": 0.4087414002428167, + "grad_norm": 6266220.786790677, + "learning_rate": 3.286099865047233e-07, + "logits/chosen": -2.258577585220337, + "logits/rejected": -2.278409957885742, + "logps/chosen": -134.9305877685547, + "logps/rejected": -154.0025177001953, + "loss": 127529.4875, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.020327283069491386, + "rewards/margins": 0.010549711063504219, + "rewards/rejected": -0.030876994132995605, + "step": 1010 + }, + { + "epoch": 0.41278834479967624, + "grad_norm": 6569607.008702802, + "learning_rate": 3.263607737291948e-07, + "logits/chosen": -2.2700555324554443, + "logits/rejected": -2.2413737773895264, + "logps/chosen": -145.36404418945312, + "logps/rejected": -159.6646270751953, + "loss": 129882.5125, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.020192014053463936, + "rewards/margins": 0.008731147274374962, + "rewards/rejected": -0.0289231576025486, + "step": 1020 + }, + { + "epoch": 0.4168352893565358, + "grad_norm": 6604946.805112461, + "learning_rate": 3.241115609536662e-07, + "logits/chosen": -2.2907137870788574, + "logits/rejected": -2.2590463161468506, + "logps/chosen": -148.82757568359375, + "logps/rejected": -158.55458068847656, + "loss": 123561.0125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02791331335902214, + "rewards/margins": 0.01067260093986988, + "rewards/rejected": -0.03858591616153717, + "step": 1030 + }, + { + "epoch": 0.4208822339133954, + "grad_norm": 7819744.1215137215, + "learning_rate": 3.2186234817813764e-07, + "logits/chosen": -2.3501906394958496, + "logits/rejected": -2.382286548614502, + "logps/chosen": -145.73556518554688, + "logps/rejected": -145.95733642578125, + "loss": 125984.175, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024780739098787308, + "rewards/margins": 0.01095888763666153, + "rewards/rejected": -0.03573962673544884, + "step": 1040 + }, + { + "epoch": 0.42492917847025496, + "grad_norm": 8329276.874833419, + "learning_rate": 3.196131354026091e-07, + "logits/chosen": -2.3430895805358887, + "logits/rejected": -2.2940633296966553, + "logps/chosen": -156.47262573242188, + "logps/rejected": -172.1248321533203, + "loss": 127542.5875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.026906628161668777, + "rewards/margins": 0.02299944870173931, + "rewards/rejected": -0.04990607872605324, + "step": 1050 + }, + { + "epoch": 0.4289761230271145, + "grad_norm": 5411023.24022027, + "learning_rate": 3.1736392262708053e-07, + "logits/chosen": -2.350010633468628, + "logits/rejected": -2.348132610321045, + "logps/chosen": -134.76171875, + "logps/rejected": -165.0662384033203, + "loss": 124288.825, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.026082511991262436, + "rewards/margins": 0.02408730424940586, + "rewards/rejected": -0.05016981437802315, + "step": 1060 + }, + { + "epoch": 0.4330230675839741, + "grad_norm": 6438956.528553201, + "learning_rate": 3.151147098515519e-07, + "logits/chosen": -2.403751850128174, + "logits/rejected": -2.391162395477295, + "logps/chosen": -133.4490966796875, + "logps/rejected": -145.66363525390625, + "loss": 122699.675, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023318186402320862, + "rewards/margins": 0.010946491733193398, + "rewards/rejected": -0.03426467627286911, + "step": 1070 + }, + { + "epoch": 0.4370700121408337, + "grad_norm": 5922234.372544115, + "learning_rate": 3.1286549707602337e-07, + "logits/chosen": -2.2476916313171387, + "logits/rejected": -2.2207980155944824, + "logps/chosen": -142.42433166503906, + "logps/rejected": -152.08865356445312, + "loss": 123837.95, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.028287163004279137, + "rewards/margins": 0.015818050131201744, + "rewards/rejected": -0.04410521313548088, + "step": 1080 + }, + { + "epoch": 0.44111695669769324, + "grad_norm": 5883039.362660401, + "learning_rate": 3.106162843004948e-07, + "logits/chosen": -2.3883793354034424, + "logits/rejected": -2.3448328971862793, + "logps/chosen": -135.72998046875, + "logps/rejected": -153.7415008544922, + "loss": 124484.025, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.026858652010560036, + "rewards/margins": 0.02032136358320713, + "rewards/rejected": -0.04718000814318657, + "step": 1090 + }, + { + "epoch": 0.4451639012545528, + "grad_norm": 7158679.357876272, + "learning_rate": 3.0836707152496626e-07, + "logits/chosen": -2.3638851642608643, + "logits/rejected": -2.3225362300872803, + "logps/chosen": -145.95896911621094, + "logps/rejected": -169.71176147460938, + "loss": 130674.3625, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.025300731882452965, + "rewards/margins": 0.016643613576889038, + "rewards/rejected": -0.04194434732198715, + "step": 1100 + }, + { + "epoch": 0.4492108458114124, + "grad_norm": 7065276.414180166, + "learning_rate": 3.061178587494377e-07, + "logits/chosen": -2.3456435203552246, + "logits/rejected": -2.3152847290039062, + "logps/chosen": -126.73854064941406, + "logps/rejected": -143.69662475585938, + "loss": 127769.775, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.01884400099515915, + "rewards/margins": 0.015868009999394417, + "rewards/rejected": -0.03471200913190842, + "step": 1110 + }, + { + "epoch": 0.45325779036827196, + "grad_norm": 8872861.336008936, + "learning_rate": 3.038686459739091e-07, + "logits/chosen": -2.3893070220947266, + "logits/rejected": -2.379615068435669, + "logps/chosen": -135.2264404296875, + "logps/rejected": -147.5668487548828, + "loss": 121978.65, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.025070184841752052, + "rewards/margins": 0.01242685504257679, + "rewards/rejected": -0.037497036159038544, + "step": 1120 + }, + { + "epoch": 0.4573047349251315, + "grad_norm": 4362461.84620477, + "learning_rate": 3.0161943319838055e-07, + "logits/chosen": -2.3373289108276367, + "logits/rejected": -2.3283610343933105, + "logps/chosen": -113.62736511230469, + "logps/rejected": -132.5222930908203, + "loss": 122763.6, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027866637334227562, + "rewards/margins": 0.010607337579131126, + "rewards/rejected": -0.03847397491335869, + "step": 1130 + }, + { + "epoch": 0.4613516794819911, + "grad_norm": 6441902.5854437305, + "learning_rate": 2.99370220422852e-07, + "logits/chosen": -2.4195449352264404, + "logits/rejected": -2.421095848083496, + "logps/chosen": -138.25782775878906, + "logps/rejected": -152.2244110107422, + "loss": 128506.9875, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.01849151961505413, + "rewards/margins": 0.007775165140628815, + "rewards/rejected": -0.026266688480973244, + "step": 1140 + }, + { + "epoch": 0.4653986240388507, + "grad_norm": 7047614.500405596, + "learning_rate": 2.971210076473234e-07, + "logits/chosen": -2.4957115650177, + "logits/rejected": -2.4509148597717285, + "logps/chosen": -137.063720703125, + "logps/rejected": -144.99172973632812, + "loss": 121503.125, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.018852662295103073, + "rewards/margins": 0.013833269476890564, + "rewards/rejected": -0.03268593177199364, + "step": 1150 + }, + { + "epoch": 0.46944556859571024, + "grad_norm": 6473613.10465131, + "learning_rate": 2.948717948717949e-07, + "logits/chosen": -2.508885145187378, + "logits/rejected": -2.4511702060699463, + "logps/chosen": -144.10775756835938, + "logps/rejected": -154.82540893554688, + "loss": 129245.7125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.025161290541291237, + "rewards/margins": 0.008113402873277664, + "rewards/rejected": -0.03327469527721405, + "step": 1160 + }, + { + "epoch": 0.4734925131525698, + "grad_norm": 6225189.741747939, + "learning_rate": 2.926225820962663e-07, + "logits/chosen": -2.5162465572357178, + "logits/rejected": -2.526261568069458, + "logps/chosen": -134.27059936523438, + "logps/rejected": -153.3767852783203, + "loss": 129228.5, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.020399171859025955, + "rewards/margins": 0.01304579060524702, + "rewards/rejected": -0.03344495967030525, + "step": 1170 + }, + { + "epoch": 0.4775394577094294, + "grad_norm": 6686211.632899741, + "learning_rate": 2.903733693207377e-07, + "logits/chosen": -2.500845432281494, + "logits/rejected": -2.4776079654693604, + "logps/chosen": -139.28172302246094, + "logps/rejected": -162.630126953125, + "loss": 127296.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.021778758615255356, + "rewards/margins": 0.016411086544394493, + "rewards/rejected": -0.038189847022295, + "step": 1180 + }, + { + "epoch": 0.48158640226628896, + "grad_norm": 7327493.262833852, + "learning_rate": 2.8812415654520917e-07, + "logits/chosen": -2.5000369548797607, + "logits/rejected": -2.4850993156433105, + "logps/chosen": -133.94406127929688, + "logps/rejected": -149.57473754882812, + "loss": 130669.6, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.02554786205291748, + "rewards/margins": 0.0159430094063282, + "rewards/rejected": -0.04149087145924568, + "step": 1190 + }, + { + "epoch": 0.4856333468231485, + "grad_norm": 8827001.924529044, + "learning_rate": 2.8587494376968056e-07, + "logits/chosen": -2.411595582962036, + "logits/rejected": -2.4169204235076904, + "logps/chosen": -130.80325317382812, + "logps/rejected": -142.50013732910156, + "loss": 121415.625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.019139764830470085, + "rewards/margins": 0.018100781366229057, + "rewards/rejected": -0.03724054619669914, + "step": 1200 + }, + { + "epoch": 0.48968029138000807, + "grad_norm": 5557571.235890019, + "learning_rate": 2.8362573099415206e-07, + "logits/chosen": -2.538846015930176, + "logits/rejected": -2.502953052520752, + "logps/chosen": -134.97543334960938, + "logps/rejected": -142.17556762695312, + "loss": 118867.85, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.013044399209320545, + "rewards/margins": 0.011013238690793514, + "rewards/rejected": -0.02405763790011406, + "step": 1210 + }, + { + "epoch": 0.4937272359368677, + "grad_norm": 6914411.411931411, + "learning_rate": 2.8137651821862346e-07, + "logits/chosen": -2.389519691467285, + "logits/rejected": -2.3551812171936035, + "logps/chosen": -138.05276489257812, + "logps/rejected": -160.35946655273438, + "loss": 127666.15, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.025238817557692528, + "rewards/margins": 0.02308265119791031, + "rewards/rejected": -0.048321474343538284, + "step": 1220 + }, + { + "epoch": 0.49777418049372724, + "grad_norm": 5609340.931669485, + "learning_rate": 2.7912730544309496e-07, + "logits/chosen": -2.4456872940063477, + "logits/rejected": -2.410588026046753, + "logps/chosen": -152.31521606445312, + "logps/rejected": -166.41482543945312, + "loss": 126962.65, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.029917621985077858, + "rewards/margins": 0.013386559672653675, + "rewards/rejected": -0.04330417513847351, + "step": 1230 + }, + { + "epoch": 0.5018211250505868, + "grad_norm": 5540227.088123434, + "learning_rate": 2.7687809266756635e-07, + "logits/chosen": -2.3677735328674316, + "logits/rejected": -2.354952335357666, + "logps/chosen": -126.54608154296875, + "logps/rejected": -145.7926483154297, + "loss": 127253.85, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.018986444920301437, + "rewards/margins": 0.016626928001642227, + "rewards/rejected": -0.035613369196653366, + "step": 1240 + }, + { + "epoch": 0.5058680696074463, + "grad_norm": 8562833.744693786, + "learning_rate": 2.7462887989203774e-07, + "logits/chosen": -2.344916820526123, + "logits/rejected": -2.312051296234131, + "logps/chosen": -138.89639282226562, + "logps/rejected": -144.3848876953125, + "loss": 134452.375, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02468470111489296, + "rewards/margins": 0.008161008358001709, + "rewards/rejected": -0.03284571319818497, + "step": 1250 + }, + { + "epoch": 0.509915014164306, + "grad_norm": 5502511.320429619, + "learning_rate": 2.7237966711650924e-07, + "logits/chosen": -2.283324718475342, + "logits/rejected": -2.2585034370422363, + "logps/chosen": -142.3020782470703, + "logps/rejected": -157.03909301757812, + "loss": 126080.5375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02461249753832817, + "rewards/margins": 0.02032613940536976, + "rewards/rejected": -0.04493863508105278, + "step": 1260 + }, + { + "epoch": 0.5139619587211656, + "grad_norm": 8682050.76660753, + "learning_rate": 2.7013045434098063e-07, + "logits/chosen": -2.275059223175049, + "logits/rejected": -2.2415084838867188, + "logps/chosen": -140.72640991210938, + "logps/rejected": -159.70445251464844, + "loss": 128182.575, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.033078595995903015, + "rewards/margins": 0.01954609341919422, + "rewards/rejected": -0.052624695003032684, + "step": 1270 + }, + { + "epoch": 0.5180089032780251, + "grad_norm": 9710402.98065158, + "learning_rate": 2.678812415654521e-07, + "logits/chosen": -2.2827441692352295, + "logits/rejected": -2.2365641593933105, + "logps/chosen": -159.8201446533203, + "logps/rejected": -167.04348754882812, + "loss": 126161.6, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0369146391749382, + "rewards/margins": 0.009988631121814251, + "rewards/rejected": -0.04690327122807503, + "step": 1280 + }, + { + "epoch": 0.5220558478348847, + "grad_norm": 5687965.997013683, + "learning_rate": 2.656320287899235e-07, + "logits/chosen": -2.440713405609131, + "logits/rejected": -2.420994281768799, + "logps/chosen": -141.1729278564453, + "logps/rejected": -146.4890899658203, + "loss": 120775.6875, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.02869614027440548, + "rewards/margins": 0.01065666601061821, + "rewards/rejected": -0.03935280814766884, + "step": 1290 + }, + { + "epoch": 0.5261027923917442, + "grad_norm": 6948542.641971354, + "learning_rate": 2.633828160143949e-07, + "logits/chosen": -2.462017774581909, + "logits/rejected": -2.479309558868408, + "logps/chosen": -148.1350860595703, + "logps/rejected": -158.99916076660156, + "loss": 127668.2, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020126869902014732, + "rewards/margins": 0.011567593552172184, + "rewards/rejected": -0.03169446438550949, + "step": 1300 + }, + { + "epoch": 0.5301497369486038, + "grad_norm": 8950839.210890554, + "learning_rate": 2.611336032388664e-07, + "logits/chosen": -2.379216432571411, + "logits/rejected": -2.349857807159424, + "logps/chosen": -158.7649688720703, + "logps/rejected": -158.4715118408203, + "loss": 133855.0375, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.030759122222661972, + "rewards/margins": 0.004436601884663105, + "rewards/rejected": -0.0351957306265831, + "step": 1310 + }, + { + "epoch": 0.5341966815054634, + "grad_norm": 5080965.438845941, + "learning_rate": 2.588843904633378e-07, + "logits/chosen": -2.4408226013183594, + "logits/rejected": -2.4230995178222656, + "logps/chosen": -122.5213394165039, + "logps/rejected": -136.46041870117188, + "loss": 125633.5875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.022939473390579224, + "rewards/margins": 0.01324677187949419, + "rewards/rejected": -0.03618624433875084, + "step": 1320 + }, + { + "epoch": 0.5382436260623229, + "grad_norm": 5182504.597846507, + "learning_rate": 2.5663517768780926e-07, + "logits/chosen": -2.505174160003662, + "logits/rejected": -2.483182191848755, + "logps/chosen": -142.41513061523438, + "logps/rejected": -153.15907287597656, + "loss": 123496.5125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027721602469682693, + "rewards/margins": 0.015740955248475075, + "rewards/rejected": -0.04346255585551262, + "step": 1330 + }, + { + "epoch": 0.5422905706191825, + "grad_norm": 7128845.279886402, + "learning_rate": 2.543859649122807e-07, + "logits/chosen": -2.478231430053711, + "logits/rejected": -2.448133945465088, + "logps/chosen": -135.66380310058594, + "logps/rejected": -153.63766479492188, + "loss": 126797.3625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.02139298990368843, + "rewards/margins": 0.010669348761439323, + "rewards/rejected": -0.0320623405277729, + "step": 1340 + }, + { + "epoch": 0.5463375151760421, + "grad_norm": 6676875.1940184785, + "learning_rate": 2.521367521367521e-07, + "logits/chosen": -2.4742424488067627, + "logits/rejected": -2.4604861736297607, + "logps/chosen": -117.7722396850586, + "logps/rejected": -130.4296875, + "loss": 125562.3, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.01471949927508831, + "rewards/margins": 0.010869570076465607, + "rewards/rejected": -0.025589067488908768, + "step": 1350 + }, + { + "epoch": 0.5503844597329016, + "grad_norm": 5238344.574016525, + "learning_rate": 2.4988753936122354e-07, + "logits/chosen": -2.404810905456543, + "logits/rejected": -2.386918544769287, + "logps/chosen": -129.4068603515625, + "logps/rejected": -142.20303344726562, + "loss": 121443.45, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.014744667336344719, + "rewards/margins": 0.01344493217766285, + "rewards/rejected": -0.02818959951400757, + "step": 1360 + }, + { + "epoch": 0.5544314042897612, + "grad_norm": 6640943.9300566595, + "learning_rate": 2.47638326585695e-07, + "logits/chosen": -2.3393630981445312, + "logits/rejected": -2.3323869705200195, + "logps/chosen": -132.86630249023438, + "logps/rejected": -144.26113891601562, + "loss": 127953.15, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.026765987277030945, + "rewards/margins": 0.01323648076504469, + "rewards/rejected": -0.04000247269868851, + "step": 1370 + }, + { + "epoch": 0.5584783488466208, + "grad_norm": 11500911.034452418, + "learning_rate": 2.4538911381016643e-07, + "logits/chosen": -2.2898106575012207, + "logits/rejected": -2.346161127090454, + "logps/chosen": -147.8282928466797, + "logps/rejected": -163.23367309570312, + "loss": 116464.0875, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.018574833869934082, + "rewards/margins": 0.024278491735458374, + "rewards/rejected": -0.042853325605392456, + "step": 1380 + }, + { + "epoch": 0.5625252934034803, + "grad_norm": 5617383.1061038, + "learning_rate": 2.431399010346379e-07, + "logits/chosen": -2.4383928775787354, + "logits/rejected": -2.447169780731201, + "logps/chosen": -125.36312103271484, + "logps/rejected": -135.82142639160156, + "loss": 127916.95, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.018465066328644753, + "rewards/margins": 0.007541149854660034, + "rewards/rejected": -0.026006218045949936, + "step": 1390 + }, + { + "epoch": 0.56657223796034, + "grad_norm": 7455055.133065385, + "learning_rate": 2.408906882591093e-07, + "logits/chosen": -2.351076126098633, + "logits/rejected": -2.386265993118286, + "logps/chosen": -138.0536346435547, + "logps/rejected": -153.21102905273438, + "loss": 126554.1, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02320500835776329, + "rewards/margins": 0.008944050408899784, + "rewards/rejected": -0.0321490578353405, + "step": 1400 + }, + { + "epoch": 0.5706191825171996, + "grad_norm": 5233092.811472115, + "learning_rate": 2.386414754835807e-07, + "logits/chosen": -2.378037929534912, + "logits/rejected": -2.3643617630004883, + "logps/chosen": -160.79556274414062, + "logps/rejected": -167.969970703125, + "loss": 121087.4625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021083252504467964, + "rewards/margins": 0.008589145727455616, + "rewards/rejected": -0.029672399163246155, + "step": 1410 + }, + { + "epoch": 0.5746661270740591, + "grad_norm": 8648367.034730982, + "learning_rate": 2.363922627080522e-07, + "logits/chosen": -2.4605114459991455, + "logits/rejected": -2.432900905609131, + "logps/chosen": -145.24966430664062, + "logps/rejected": -169.27865600585938, + "loss": 127293.625, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.028744569048285484, + "rewards/margins": 0.01984976790845394, + "rewards/rejected": -0.04859434440732002, + "step": 1420 + }, + { + "epoch": 0.5787130716309187, + "grad_norm": 6917627.189571037, + "learning_rate": 2.3414304993252359e-07, + "logits/chosen": -2.415008783340454, + "logits/rejected": -2.390291213989258, + "logps/chosen": -118.60847473144531, + "logps/rejected": -137.45223999023438, + "loss": 126428.6625, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.021594971418380737, + "rewards/margins": 0.010223200544714928, + "rewards/rejected": -0.031818170100450516, + "step": 1430 + }, + { + "epoch": 0.5827600161877782, + "grad_norm": 7112073.222735109, + "learning_rate": 2.3189383715699503e-07, + "logits/chosen": -2.361323595046997, + "logits/rejected": -2.359731674194336, + "logps/chosen": -136.58473205566406, + "logps/rejected": -162.69363403320312, + "loss": 126602.9125, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.02461722306907177, + "rewards/margins": 0.010918731801211834, + "rewards/rejected": -0.03553595766425133, + "step": 1440 + }, + { + "epoch": 0.5868069607446378, + "grad_norm": 5694863.468784067, + "learning_rate": 2.2964462438146648e-07, + "logits/chosen": -2.4715747833251953, + "logits/rejected": -2.4460928440093994, + "logps/chosen": -139.74227905273438, + "logps/rejected": -143.89366149902344, + "loss": 124850.7625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.019559044390916824, + "rewards/margins": 0.007027704268693924, + "rewards/rejected": -0.0265867467969656, + "step": 1450 + }, + { + "epoch": 0.5908539053014974, + "grad_norm": 6596951.588588771, + "learning_rate": 2.2739541160593792e-07, + "logits/chosen": -2.405214786529541, + "logits/rejected": -2.376192569732666, + "logps/chosen": -132.67037963867188, + "logps/rejected": -152.36544799804688, + "loss": 129629.9375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.019801389425992966, + "rewards/margins": 0.02020254358649254, + "rewards/rejected": -0.040003933012485504, + "step": 1460 + }, + { + "epoch": 0.5949008498583569, + "grad_norm": 8604502.436566744, + "learning_rate": 2.2514619883040934e-07, + "logits/chosen": -2.4290502071380615, + "logits/rejected": -2.4105029106140137, + "logps/chosen": -138.2572784423828, + "logps/rejected": -157.66226196289062, + "loss": 126705.45, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.023596590384840965, + "rewards/margins": 0.01850738190114498, + "rewards/rejected": -0.042103976011276245, + "step": 1470 + }, + { + "epoch": 0.5989477944152165, + "grad_norm": 6476200.112160567, + "learning_rate": 2.2289698605488076e-07, + "logits/chosen": -2.405041217803955, + "logits/rejected": -2.343621253967285, + "logps/chosen": -135.63980102539062, + "logps/rejected": -157.7008056640625, + "loss": 124231.4625, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.025133201852440834, + "rewards/margins": 0.02330349013209343, + "rewards/rejected": -0.048436690121889114, + "step": 1480 + }, + { + "epoch": 0.6029947389720761, + "grad_norm": 6131917.113665815, + "learning_rate": 2.206477732793522e-07, + "logits/chosen": -2.413145065307617, + "logits/rejected": -2.4113070964813232, + "logps/chosen": -131.548583984375, + "logps/rejected": -143.9263153076172, + "loss": 123647.9125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024967512115836143, + "rewards/margins": 0.014087630435824394, + "rewards/rejected": -0.03905514255166054, + "step": 1490 + }, + { + "epoch": 0.6070416835289356, + "grad_norm": 6175564.2334703235, + "learning_rate": 2.1839856050382366e-07, + "logits/chosen": -2.4009850025177, + "logits/rejected": -2.3908042907714844, + "logps/chosen": -138.0840606689453, + "logps/rejected": -150.7329864501953, + "loss": 128591.4875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.02667851746082306, + "rewards/margins": 0.0038915693294256926, + "rewards/rejected": -0.03057008981704712, + "step": 1500 + }, + { + "epoch": 0.6110886280857952, + "grad_norm": 7212353.914117165, + "learning_rate": 2.161493477282951e-07, + "logits/chosen": -2.405449628829956, + "logits/rejected": -2.3872292041778564, + "logps/chosen": -123.17295837402344, + "logps/rejected": -143.22288513183594, + "loss": 130148.7, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.01992269791662693, + "rewards/margins": 0.015165319666266441, + "rewards/rejected": -0.03508801758289337, + "step": 1510 + }, + { + "epoch": 0.6151355726426548, + "grad_norm": 9046114.113444956, + "learning_rate": 2.1390013495276652e-07, + "logits/chosen": -2.405348777770996, + "logits/rejected": -2.4298527240753174, + "logps/chosen": -147.79513549804688, + "logps/rejected": -168.75872802734375, + "loss": 127527.1, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.026391273364424706, + "rewards/margins": 0.009268445894122124, + "rewards/rejected": -0.03565971553325653, + "step": 1520 + }, + { + "epoch": 0.6191825171995143, + "grad_norm": 6130066.161443972, + "learning_rate": 2.1165092217723797e-07, + "logits/chosen": -2.3494181632995605, + "logits/rejected": -2.3200573921203613, + "logps/chosen": -131.47329711914062, + "logps/rejected": -151.9276885986328, + "loss": 121959.575, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02297963574528694, + "rewards/margins": 0.016111956909298897, + "rewards/rejected": -0.03909159451723099, + "step": 1530 + }, + { + "epoch": 0.623229461756374, + "grad_norm": 6650864.453901279, + "learning_rate": 2.0940170940170939e-07, + "logits/chosen": -2.3809990882873535, + "logits/rejected": -2.3723671436309814, + "logps/chosen": -156.38284301757812, + "logps/rejected": -171.9077606201172, + "loss": 122674.425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.02390839345753193, + "rewards/margins": 0.019179565832018852, + "rewards/rejected": -0.04308795928955078, + "step": 1540 + }, + { + "epoch": 0.6272764063132336, + "grad_norm": 5183894.402422156, + "learning_rate": 2.0715249662618083e-07, + "logits/chosen": -2.4737048149108887, + "logits/rejected": -2.446381092071533, + "logps/chosen": -145.76119995117188, + "logps/rejected": -165.5288848876953, + "loss": 125087.2875, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.02032746747136116, + "rewards/margins": 0.009251989424228668, + "rewards/rejected": -0.029579460620880127, + "step": 1550 + }, + { + "epoch": 0.6313233508700931, + "grad_norm": 6969624.578927646, + "learning_rate": 2.0490328385065225e-07, + "logits/chosen": -2.405435800552368, + "logits/rejected": -2.4140048027038574, + "logps/chosen": -119.48077392578125, + "logps/rejected": -130.77532958984375, + "loss": 125878.4625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.018460571765899658, + "rewards/margins": 0.010110612958669662, + "rewards/rejected": -0.02857118286192417, + "step": 1560 + }, + { + "epoch": 0.6353702954269527, + "grad_norm": 5800628.746003852, + "learning_rate": 2.026540710751237e-07, + "logits/chosen": -2.366516351699829, + "logits/rejected": -2.3698983192443848, + "logps/chosen": -147.12881469726562, + "logps/rejected": -146.3048553466797, + "loss": 129275.3375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01768730953335762, + "rewards/margins": 0.011509931646287441, + "rewards/rejected": -0.029197242110967636, + "step": 1570 + }, + { + "epoch": 0.6394172399838122, + "grad_norm": 14918875.434130527, + "learning_rate": 2.0040485829959514e-07, + "logits/chosen": -2.4734253883361816, + "logits/rejected": -2.4595344066619873, + "logps/chosen": -125.6633529663086, + "logps/rejected": -142.96157836914062, + "loss": 123910.425, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.018165288493037224, + "rewards/margins": 0.015782013535499573, + "rewards/rejected": -0.033947303891181946, + "step": 1580 + }, + { + "epoch": 0.6434641845406718, + "grad_norm": 5025901.152056148, + "learning_rate": 1.981556455240666e-07, + "logits/chosen": -2.4370574951171875, + "logits/rejected": -2.4227161407470703, + "logps/chosen": -141.5856475830078, + "logps/rejected": -164.05941772460938, + "loss": 128733.9125, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021845245733857155, + "rewards/margins": 0.011800579726696014, + "rewards/rejected": -0.03364582732319832, + "step": 1590 + }, + { + "epoch": 0.6475111290975314, + "grad_norm": 5965330.851620259, + "learning_rate": 1.9590643274853798e-07, + "logits/chosen": -2.398038387298584, + "logits/rejected": -2.37715482711792, + "logps/chosen": -118.133544921875, + "logps/rejected": -130.4759063720703, + "loss": 123004.3375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.01905783638358116, + "rewards/margins": 0.011883154511451721, + "rewards/rejected": -0.030940990895032883, + "step": 1600 + }, + { + "epoch": 0.6515580736543909, + "grad_norm": 6299921.629356138, + "learning_rate": 1.9365721997300943e-07, + "logits/chosen": -2.3423843383789062, + "logits/rejected": -2.2989087104797363, + "logps/chosen": -115.65093994140625, + "logps/rejected": -138.21340942382812, + "loss": 127277.8375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02753433585166931, + "rewards/margins": 0.014595555141568184, + "rewards/rejected": -0.042129892855882645, + "step": 1610 + }, + { + "epoch": 0.6556050182112505, + "grad_norm": 7576070.098325265, + "learning_rate": 1.9140800719748088e-07, + "logits/chosen": -2.320422649383545, + "logits/rejected": -2.290821075439453, + "logps/chosen": -117.01118469238281, + "logps/rejected": -125.79508972167969, + "loss": 124367.8875, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.01931951195001602, + "rewards/margins": 0.006707245949655771, + "rewards/rejected": -0.026026759296655655, + "step": 1620 + }, + { + "epoch": 0.6596519627681101, + "grad_norm": 6162429.013600917, + "learning_rate": 1.8915879442195232e-07, + "logits/chosen": -2.334224224090576, + "logits/rejected": -2.3541088104248047, + "logps/chosen": -136.18832397460938, + "logps/rejected": -151.46710205078125, + "loss": 122585.7625, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.019429903477430344, + "rewards/margins": 0.011553862132132053, + "rewards/rejected": -0.03098376654088497, + "step": 1630 + }, + { + "epoch": 0.6636989073249696, + "grad_norm": 4988734.781634246, + "learning_rate": 1.8690958164642374e-07, + "logits/chosen": -2.4520297050476074, + "logits/rejected": -2.42329478263855, + "logps/chosen": -144.2743682861328, + "logps/rejected": -155.20095825195312, + "loss": 124995.575, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.016223471611738205, + "rewards/margins": 0.015287751331925392, + "rewards/rejected": -0.031511224806308746, + "step": 1640 + }, + { + "epoch": 0.6677458518818292, + "grad_norm": 6629567.008457434, + "learning_rate": 1.8466036887089516e-07, + "logits/chosen": -2.345116376876831, + "logits/rejected": -2.348301887512207, + "logps/chosen": -129.6711883544922, + "logps/rejected": -151.17837524414062, + "loss": 122800.6625, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.020190779119729996, + "rewards/margins": 0.0185395535081625, + "rewards/rejected": -0.03873033449053764, + "step": 1650 + }, + { + "epoch": 0.6717927964386888, + "grad_norm": 5601907.507778279, + "learning_rate": 1.824111560953666e-07, + "logits/chosen": -2.2657582759857178, + "logits/rejected": -2.260693311691284, + "logps/chosen": -128.01144409179688, + "logps/rejected": -155.50888061523438, + "loss": 124625.45, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.027610983699560165, + "rewards/margins": 0.020689968019723892, + "rewards/rejected": -0.04830095171928406, + "step": 1660 + }, + { + "epoch": 0.6758397409955483, + "grad_norm": 5845172.102872888, + "learning_rate": 1.8016194331983805e-07, + "logits/chosen": -2.289998769760132, + "logits/rejected": -2.2984931468963623, + "logps/chosen": -120.5185775756836, + "logps/rejected": -140.23655700683594, + "loss": 125251.7625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.029199788346886635, + "rewards/margins": 0.014481378719210625, + "rewards/rejected": -0.04368116706609726, + "step": 1670 + }, + { + "epoch": 0.6798866855524079, + "grad_norm": 6195370.996741281, + "learning_rate": 1.779127305443095e-07, + "logits/chosen": -2.3538708686828613, + "logits/rejected": -2.334139347076416, + "logps/chosen": -136.63980102539062, + "logps/rejected": -144.39056396484375, + "loss": 129559.5375, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.030268553644418716, + "rewards/margins": 0.0073168775998055935, + "rewards/rejected": -0.03758542984724045, + "step": 1680 + }, + { + "epoch": 0.6839336301092676, + "grad_norm": 8790702.611604873, + "learning_rate": 1.7566351776878092e-07, + "logits/chosen": -2.3637337684631348, + "logits/rejected": -2.3632633686065674, + "logps/chosen": -129.40554809570312, + "logps/rejected": -147.02755737304688, + "loss": 128803.0875, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.023326028138399124, + "rewards/margins": 0.011800029315054417, + "rewards/rejected": -0.03512606397271156, + "step": 1690 + }, + { + "epoch": 0.687980574666127, + "grad_norm": 8047500.436527203, + "learning_rate": 1.7341430499325237e-07, + "logits/chosen": -2.2449162006378174, + "logits/rejected": -2.254812717437744, + "logps/chosen": -128.66867065429688, + "logps/rejected": -134.86846923828125, + "loss": 130052.75, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.021912669762969017, + "rewards/margins": 0.007869280874729156, + "rewards/rejected": -0.029781952500343323, + "step": 1700 + }, + { + "epoch": 0.6920275192229867, + "grad_norm": 7961693.976507484, + "learning_rate": 1.7116509221772378e-07, + "logits/chosen": -2.308650255203247, + "logits/rejected": -2.325552463531494, + "logps/chosen": -118.10685729980469, + "logps/rejected": -132.20530700683594, + "loss": 125613.825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022883836179971695, + "rewards/margins": 0.00968220829963684, + "rewards/rejected": -0.032566044479608536, + "step": 1710 + }, + { + "epoch": 0.6960744637798462, + "grad_norm": 8983241.448290937, + "learning_rate": 1.6891587944219523e-07, + "logits/chosen": -2.3228538036346436, + "logits/rejected": -2.295989990234375, + "logps/chosen": -136.067138671875, + "logps/rejected": -149.92922973632812, + "loss": 124795.0875, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.01805921457707882, + "rewards/margins": 0.018675491213798523, + "rewards/rejected": -0.03673470392823219, + "step": 1720 + }, + { + "epoch": 0.7001214083367058, + "grad_norm": 6598233.046729883, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -2.321946144104004, + "logits/rejected": -2.2634482383728027, + "logps/chosen": -156.81881713867188, + "logps/rejected": -175.48133850097656, + "loss": 124281.6625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.023901356384158134, + "rewards/margins": 0.020154178142547607, + "rewards/rejected": -0.04405553638935089, + "step": 1730 + }, + { + "epoch": 0.7041683528935654, + "grad_norm": 7393573.408673971, + "learning_rate": 1.644174538911381e-07, + "logits/chosen": -2.157721996307373, + "logits/rejected": -2.1384575366973877, + "logps/chosen": -156.61431884765625, + "logps/rejected": -169.68267822265625, + "loss": 123967.125, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.033402346074581146, + "rewards/margins": 0.017039528116583824, + "rewards/rejected": -0.05044187977910042, + "step": 1740 + }, + { + "epoch": 0.7082152974504249, + "grad_norm": 8650281.232154809, + "learning_rate": 1.6216824111560954e-07, + "logits/chosen": -2.3099396228790283, + "logits/rejected": -2.314627170562744, + "logps/chosen": -140.70175170898438, + "logps/rejected": -165.3170928955078, + "loss": 125535.2875, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02742253616452217, + "rewards/margins": 0.015254299156367779, + "rewards/rejected": -0.04267684370279312, + "step": 1750 + }, + { + "epoch": 0.7122622420072845, + "grad_norm": 6208948.317347317, + "learning_rate": 1.5991902834008096e-07, + "logits/chosen": -2.3783583641052246, + "logits/rejected": -2.362631320953369, + "logps/chosen": -148.7383270263672, + "logps/rejected": -162.62327575683594, + "loss": 121080.075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.014549913816154003, + "rewards/margins": 0.013842826709151268, + "rewards/rejected": -0.028392743319272995, + "step": 1760 + }, + { + "epoch": 0.7163091865641441, + "grad_norm": 7739071.113911002, + "learning_rate": 1.5766981556455238e-07, + "logits/chosen": -2.299868583679199, + "logits/rejected": -2.2598750591278076, + "logps/chosen": -162.82052612304688, + "logps/rejected": -184.3058319091797, + "loss": 122385.7125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030940508469939232, + "rewards/margins": 0.023769445717334747, + "rewards/rejected": -0.05470995977520943, + "step": 1770 + }, + { + "epoch": 0.7203561311210036, + "grad_norm": 6707665.892655141, + "learning_rate": 1.5542060278902383e-07, + "logits/chosen": -2.3239502906799316, + "logits/rejected": -2.3085806369781494, + "logps/chosen": -139.06484985351562, + "logps/rejected": -157.50460815429688, + "loss": 115194.475, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03321670740842819, + "rewards/margins": 0.021816464141011238, + "rewards/rejected": -0.05503316968679428, + "step": 1780 + }, + { + "epoch": 0.7244030756778632, + "grad_norm": 7475588.891135754, + "learning_rate": 1.5317139001349527e-07, + "logits/chosen": -2.380169630050659, + "logits/rejected": -2.3587822914123535, + "logps/chosen": -134.81069946289062, + "logps/rejected": -149.78839111328125, + "loss": 135028.0125, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.03387707471847534, + "rewards/margins": 0.011253075674176216, + "rewards/rejected": -0.04513014853000641, + "step": 1790 + }, + { + "epoch": 0.7284500202347228, + "grad_norm": 6224311.633435066, + "learning_rate": 1.5092217723796672e-07, + "logits/chosen": -2.4899191856384277, + "logits/rejected": -2.461540460586548, + "logps/chosen": -139.72994995117188, + "logps/rejected": -154.91757202148438, + "loss": 127101.55, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.022551989182829857, + "rewards/margins": 0.019368382170796394, + "rewards/rejected": -0.04192037135362625, + "step": 1800 + }, + { + "epoch": 0.7324969647915823, + "grad_norm": 6407363.569135414, + "learning_rate": 1.4867296446243814e-07, + "logits/chosen": -2.457529067993164, + "logits/rejected": -2.4312427043914795, + "logps/chosen": -171.8442840576172, + "logps/rejected": -170.40664672851562, + "loss": 126581.2375, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.02567433752119541, + "rewards/margins": 0.008936228230595589, + "rewards/rejected": -0.0346105620265007, + "step": 1810 + }, + { + "epoch": 0.7365439093484419, + "grad_norm": 5335773.687286384, + "learning_rate": 1.4642375168690956e-07, + "logits/chosen": -2.442826986312866, + "logits/rejected": -2.424445867538452, + "logps/chosen": -130.82366943359375, + "logps/rejected": -150.00717163085938, + "loss": 121689.35, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020857717841863632, + "rewards/margins": 0.011433606036007404, + "rewards/rejected": -0.03229131922125816, + "step": 1820 + }, + { + "epoch": 0.7405908539053015, + "grad_norm": 5919606.114162943, + "learning_rate": 1.44174538911381e-07, + "logits/chosen": -2.4367711544036865, + "logits/rejected": -2.4152512550354004, + "logps/chosen": -116.6092758178711, + "logps/rejected": -137.42446899414062, + "loss": 124829.175, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.018279392272233963, + "rewards/margins": 0.017404617741703987, + "rewards/rejected": -0.0356840081512928, + "step": 1830 + }, + { + "epoch": 0.744637798462161, + "grad_norm": 4526671.180016859, + "learning_rate": 1.4192532613585245e-07, + "logits/chosen": -2.3979544639587402, + "logits/rejected": -2.3597800731658936, + "logps/chosen": -135.9434814453125, + "logps/rejected": -138.03778076171875, + "loss": 129111.95, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024272512644529343, + "rewards/margins": 0.012156413868069649, + "rewards/rejected": -0.03642892464995384, + "step": 1840 + }, + { + "epoch": 0.7486847430190207, + "grad_norm": 7139221.010538934, + "learning_rate": 1.396761133603239e-07, + "logits/chosen": -2.4428467750549316, + "logits/rejected": -2.428190231323242, + "logps/chosen": -123.2089614868164, + "logps/rejected": -138.09390258789062, + "loss": 128958.6625, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.018383827060461044, + "rewards/margins": 0.010701683349907398, + "rewards/rejected": -0.029085511341691017, + "step": 1850 + }, + { + "epoch": 0.7527316875758802, + "grad_norm": 7046675.547216455, + "learning_rate": 1.3742690058479532e-07, + "logits/chosen": -2.4591715335845947, + "logits/rejected": -2.426462411880493, + "logps/chosen": -133.52520751953125, + "logps/rejected": -138.1920623779297, + "loss": 130433.475, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.014568162150681019, + "rewards/margins": 0.013440297916531563, + "rewards/rejected": -0.028008460998535156, + "step": 1860 + }, + { + "epoch": 0.7567786321327398, + "grad_norm": 6183615.802176011, + "learning_rate": 1.3517768780926674e-07, + "logits/chosen": -2.4390716552734375, + "logits/rejected": -2.3869736194610596, + "logps/chosen": -127.33221435546875, + "logps/rejected": -149.60716247558594, + "loss": 126095.0375, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.020692896097898483, + "rewards/margins": 0.015832407400012016, + "rewards/rejected": -0.03652530163526535, + "step": 1870 + }, + { + "epoch": 0.7608255766895994, + "grad_norm": 5139359.676607012, + "learning_rate": 1.3292847503373818e-07, + "logits/chosen": -2.4137744903564453, + "logits/rejected": -2.4111902713775635, + "logps/chosen": -138.07791137695312, + "logps/rejected": -150.23367309570312, + "loss": 122845.4375, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015661675482988358, + "rewards/margins": 0.011621621437370777, + "rewards/rejected": -0.02728329598903656, + "step": 1880 + }, + { + "epoch": 0.7648725212464589, + "grad_norm": 5436765.081995142, + "learning_rate": 1.3067926225820963e-07, + "logits/chosen": -2.2953848838806152, + "logits/rejected": -2.261265754699707, + "logps/chosen": -131.72573852539062, + "logps/rejected": -158.7117156982422, + "loss": 120437.1625, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.018344033509492874, + "rewards/margins": 0.025271952152252197, + "rewards/rejected": -0.04361598566174507, + "step": 1890 + }, + { + "epoch": 0.7689194658033185, + "grad_norm": 7773151.683082246, + "learning_rate": 1.2843004948268105e-07, + "logits/chosen": -2.229933023452759, + "logits/rejected": -2.166466474533081, + "logps/chosen": -147.3013153076172, + "logps/rejected": -160.14205932617188, + "loss": 130466.3875, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024005400016903877, + "rewards/margins": 0.014198745600879192, + "rewards/rejected": -0.038204144686460495, + "step": 1900 + }, + { + "epoch": 0.7729664103601781, + "grad_norm": 6242141.939931843, + "learning_rate": 1.261808367071525e-07, + "logits/chosen": -2.2624001502990723, + "logits/rejected": -2.229830503463745, + "logps/chosen": -138.10633850097656, + "logps/rejected": -152.7989044189453, + "loss": 127404.4375, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02552894689142704, + "rewards/margins": 0.008202909491956234, + "rewards/rejected": -0.033731859177351, + "step": 1910 + }, + { + "epoch": 0.7770133549170376, + "grad_norm": 6920375.30724732, + "learning_rate": 1.2393162393162394e-07, + "logits/chosen": -2.350060224533081, + "logits/rejected": -2.3308169841766357, + "logps/chosen": -132.56320190429688, + "logps/rejected": -153.30557250976562, + "loss": 126830.1, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.029025813564658165, + "rewards/margins": 0.019299551844596863, + "rewards/rejected": -0.04832536727190018, + "step": 1920 + }, + { + "epoch": 0.7810602994738972, + "grad_norm": 6671009.085790114, + "learning_rate": 1.2168241115609536e-07, + "logits/chosen": -2.2904415130615234, + "logits/rejected": -2.329463481903076, + "logps/chosen": -141.00816345214844, + "logps/rejected": -143.0104217529297, + "loss": 129713.6875, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.03294295817613602, + "rewards/margins": 0.0017857927596196532, + "rewards/rejected": -0.034728746861219406, + "step": 1930 + }, + { + "epoch": 0.7851072440307568, + "grad_norm": 6655872.215382386, + "learning_rate": 1.194331983805668e-07, + "logits/chosen": -2.3119730949401855, + "logits/rejected": -2.2890148162841797, + "logps/chosen": -131.12327575683594, + "logps/rejected": -148.35281372070312, + "loss": 126911.35, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.024319607764482498, + "rewards/margins": 0.012184834107756615, + "rewards/rejected": -0.036504440009593964, + "step": 1940 + }, + { + "epoch": 0.7891541885876163, + "grad_norm": 7036617.58409423, + "learning_rate": 1.1718398560503823e-07, + "logits/chosen": -2.3747105598449707, + "logits/rejected": -2.3655359745025635, + "logps/chosen": -127.3541259765625, + "logps/rejected": -142.77593994140625, + "loss": 125537.975, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.017514357343316078, + "rewards/margins": 0.013664362952113152, + "rewards/rejected": -0.03117872215807438, + "step": 1950 + }, + { + "epoch": 0.7932011331444759, + "grad_norm": 6573765.081555526, + "learning_rate": 1.1493477282950967e-07, + "logits/chosen": -2.412942409515381, + "logits/rejected": -2.390746593475342, + "logps/chosen": -134.2810821533203, + "logps/rejected": -158.460693359375, + "loss": 123726.75, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.021586496382951736, + "rewards/margins": 0.017149869352579117, + "rewards/rejected": -0.03873636573553085, + "step": 1960 + }, + { + "epoch": 0.7972480777013355, + "grad_norm": 7038673.672056439, + "learning_rate": 1.1268556005398109e-07, + "logits/chosen": -2.370753765106201, + "logits/rejected": -2.3683507442474365, + "logps/chosen": -124.46659851074219, + "logps/rejected": -131.4204559326172, + "loss": 126820.75, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024847570806741714, + "rewards/margins": 0.011085819453001022, + "rewards/rejected": -0.03593338653445244, + "step": 1970 + }, + { + "epoch": 0.801295022258195, + "grad_norm": 6451758.008444387, + "learning_rate": 1.1043634727845254e-07, + "logits/chosen": -2.325690984725952, + "logits/rejected": -2.336920976638794, + "logps/chosen": -122.97123718261719, + "logps/rejected": -147.48297119140625, + "loss": 123985.45, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.016694985330104828, + "rewards/margins": 0.016988877207040787, + "rewards/rejected": -0.033683862537145615, + "step": 1980 + }, + { + "epoch": 0.8053419668150547, + "grad_norm": 9482913.906109469, + "learning_rate": 1.0818713450292397e-07, + "logits/chosen": -2.2602345943450928, + "logits/rejected": -2.243213653564453, + "logps/chosen": -122.68096923828125, + "logps/rejected": -138.34278869628906, + "loss": 124861.1125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.028803208842873573, + "rewards/margins": 0.015811622142791748, + "rewards/rejected": -0.04461482912302017, + "step": 1990 + }, + { + "epoch": 0.8093889113719142, + "grad_norm": 6927553.6188276345, + "learning_rate": 1.059379217273954e-07, + "logits/chosen": -2.3502438068389893, + "logits/rejected": -2.337284564971924, + "logps/chosen": -132.24378967285156, + "logps/rejected": -149.02110290527344, + "loss": 125569.4625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018480569124221802, + "rewards/margins": 0.016136765480041504, + "rewards/rejected": -0.034617334604263306, + "step": 2000 + }, + { + "epoch": 0.8134358559287738, + "grad_norm": 6611093.761936569, + "learning_rate": 1.0368870895186684e-07, + "logits/chosen": -2.3684864044189453, + "logits/rejected": -2.324704170227051, + "logps/chosen": -135.81961059570312, + "logps/rejected": -160.5324249267578, + "loss": 121162.075, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012911828234791756, + "rewards/margins": 0.030595939606428146, + "rewards/rejected": -0.04350776970386505, + "step": 2010 + }, + { + "epoch": 0.8174828004856334, + "grad_norm": 6723882.38264995, + "learning_rate": 1.0143949617633828e-07, + "logits/chosen": -2.2761006355285645, + "logits/rejected": -2.2606966495513916, + "logps/chosen": -119.88040924072266, + "logps/rejected": -145.41346740722656, + "loss": 129622.7125, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.025995198637247086, + "rewards/margins": 0.019534587860107422, + "rewards/rejected": -0.045529790222644806, + "step": 2020 + }, + { + "epoch": 0.8215297450424929, + "grad_norm": 8229250.060941711, + "learning_rate": 9.919028340080972e-08, + "logits/chosen": -2.3351616859436035, + "logits/rejected": -2.280089855194092, + "logps/chosen": -138.04751586914062, + "logps/rejected": -154.4039306640625, + "loss": 121636.6375, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.02659204974770546, + "rewards/margins": 0.022092049941420555, + "rewards/rejected": -0.048684097826480865, + "step": 2030 + }, + { + "epoch": 0.8255766895993525, + "grad_norm": 8360918.973606626, + "learning_rate": 9.694107062528115e-08, + "logits/chosen": -2.302302837371826, + "logits/rejected": -2.3009400367736816, + "logps/chosen": -133.8302459716797, + "logps/rejected": -153.5994110107422, + "loss": 124760.0625, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.020709946751594543, + "rewards/margins": 0.014815042726695538, + "rewards/rejected": -0.03552498668432236, + "step": 2040 + }, + { + "epoch": 0.8296236341562121, + "grad_norm": 8136804.681969547, + "learning_rate": 9.46918578497526e-08, + "logits/chosen": -2.325496196746826, + "logits/rejected": -2.3160691261291504, + "logps/chosen": -133.07785034179688, + "logps/rejected": -157.1102294921875, + "loss": 122905.6875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.022938497364521027, + "rewards/margins": 0.02051146700978279, + "rewards/rejected": -0.04344996064901352, + "step": 2050 + }, + { + "epoch": 0.8336705787130716, + "grad_norm": 5880924.756183454, + "learning_rate": 9.244264507422401e-08, + "logits/chosen": -2.247741460800171, + "logits/rejected": -2.2601161003112793, + "logps/chosen": -138.5823974609375, + "logps/rejected": -150.9891357421875, + "loss": 122247.55, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.021218325942754745, + "rewards/margins": 0.013738051056861877, + "rewards/rejected": -0.034956373274326324, + "step": 2060 + }, + { + "epoch": 0.8377175232699312, + "grad_norm": 6479318.093365777, + "learning_rate": 9.019343229869546e-08, + "logits/chosen": -2.287973403930664, + "logits/rejected": -2.27508282661438, + "logps/chosen": -148.0543975830078, + "logps/rejected": -174.75563049316406, + "loss": 122681.5375, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.028557494282722473, + "rewards/margins": 0.02148330584168434, + "rewards/rejected": -0.050040800124406815, + "step": 2070 + }, + { + "epoch": 0.8417644678267908, + "grad_norm": 7530654.549282354, + "learning_rate": 8.794421952316688e-08, + "logits/chosen": -2.3192243576049805, + "logits/rejected": -2.301488161087036, + "logps/chosen": -140.3570556640625, + "logps/rejected": -144.66439819335938, + "loss": 127493.5, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.027281736955046654, + "rewards/margins": 0.011399330571293831, + "rewards/rejected": -0.03868107125163078, + "step": 2080 + }, + { + "epoch": 0.8458114123836503, + "grad_norm": 5545588.639352677, + "learning_rate": 8.569500674763833e-08, + "logits/chosen": -2.3623504638671875, + "logits/rejected": -2.327298641204834, + "logps/chosen": -125.07554626464844, + "logps/rejected": -162.51771545410156, + "loss": 122307.35, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.023772019892930984, + "rewards/margins": 0.024702411144971848, + "rewards/rejected": -0.048474427312612534, + "step": 2090 + }, + { + "epoch": 0.8498583569405099, + "grad_norm": 7677881.561277626, + "learning_rate": 8.344579397210976e-08, + "logits/chosen": -2.400023937225342, + "logits/rejected": -2.398374557495117, + "logps/chosen": -143.50096130371094, + "logps/rejected": -154.2194061279297, + "loss": 126753.125, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03331100195646286, + "rewards/margins": 0.007773646619170904, + "rewards/rejected": -0.0410846471786499, + "step": 2100 + }, + { + "epoch": 0.8539053014973695, + "grad_norm": 9639697.081950434, + "learning_rate": 8.119658119658119e-08, + "logits/chosen": -2.2588629722595215, + "logits/rejected": -2.2181789875030518, + "logps/chosen": -136.6796417236328, + "logps/rejected": -171.16500854492188, + "loss": 127603.3375, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03126269578933716, + "rewards/margins": 0.025771383196115494, + "rewards/rejected": -0.057034075260162354, + "step": 2110 + }, + { + "epoch": 0.857952246054229, + "grad_norm": 5314294.806910229, + "learning_rate": 7.894736842105262e-08, + "logits/chosen": -2.486797332763672, + "logits/rejected": -2.4730780124664307, + "logps/chosen": -146.88571166992188, + "logps/rejected": -158.010986328125, + "loss": 125283.2125, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.020792517811059952, + "rewards/margins": 0.017872992902994156, + "rewards/rejected": -0.03866551071405411, + "step": 2120 + }, + { + "epoch": 0.8619991906110887, + "grad_norm": 8035986.499722224, + "learning_rate": 7.669815564552407e-08, + "logits/chosen": -2.421731472015381, + "logits/rejected": -2.425063371658325, + "logps/chosen": -116.5892562866211, + "logps/rejected": -132.42050170898438, + "loss": 125860.925, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.021887045353651047, + "rewards/margins": 0.014476152136921883, + "rewards/rejected": -0.03636319935321808, + "step": 2130 + }, + { + "epoch": 0.8660461351679482, + "grad_norm": 6251020.741893531, + "learning_rate": 7.444894286999549e-08, + "logits/chosen": -2.364879608154297, + "logits/rejected": -2.311974048614502, + "logps/chosen": -120.81207275390625, + "logps/rejected": -145.0894317626953, + "loss": 119764.95, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.02619818225502968, + "rewards/margins": 0.023076878860592842, + "rewards/rejected": -0.04927505925297737, + "step": 2140 + }, + { + "epoch": 0.8700930797248078, + "grad_norm": 6301369.537300293, + "learning_rate": 7.219973009446694e-08, + "logits/chosen": -2.379647970199585, + "logits/rejected": -2.3476970195770264, + "logps/chosen": -134.33572387695312, + "logps/rejected": -155.9193115234375, + "loss": 118915.75, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.019399352371692657, + "rewards/margins": 0.016515102237462997, + "rewards/rejected": -0.03591445833444595, + "step": 2150 + }, + { + "epoch": 0.8741400242816674, + "grad_norm": 6200780.785181898, + "learning_rate": 6.995051731893837e-08, + "logits/chosen": -2.4005587100982666, + "logits/rejected": -2.381075382232666, + "logps/chosen": -134.69631958007812, + "logps/rejected": -142.3704071044922, + "loss": 122057.2, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.024357806891202927, + "rewards/margins": 0.01147081982344389, + "rewards/rejected": -0.03582862392067909, + "step": 2160 + }, + { + "epoch": 0.8781869688385269, + "grad_norm": 7707200.943905766, + "learning_rate": 6.77013045434098e-08, + "logits/chosen": -2.1691622734069824, + "logits/rejected": -2.1518099308013916, + "logps/chosen": -133.11085510253906, + "logps/rejected": -150.32522583007812, + "loss": 124932.0875, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.02330431155860424, + "rewards/margins": 0.0101194242015481, + "rewards/rejected": -0.033423732966184616, + "step": 2170 + }, + { + "epoch": 0.8822339133953865, + "grad_norm": 6068434.174209909, + "learning_rate": 6.545209176788123e-08, + "logits/chosen": -2.2563586235046387, + "logits/rejected": -2.249168872833252, + "logps/chosen": -126.6658935546875, + "logps/rejected": -150.95640563964844, + "loss": 125160.825, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.015440529212355614, + "rewards/margins": 0.015524588525295258, + "rewards/rejected": -0.030965115875005722, + "step": 2180 + }, + { + "epoch": 0.8862808579522461, + "grad_norm": 7537201.72606691, + "learning_rate": 6.320287899235267e-08, + "logits/chosen": -2.364108085632324, + "logits/rejected": -2.3574013710021973, + "logps/chosen": -127.92137145996094, + "logps/rejected": -143.64276123046875, + "loss": 128988.85, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.015308101661503315, + "rewards/margins": 0.012061825022101402, + "rewards/rejected": -0.027369925752282143, + "step": 2190 + }, + { + "epoch": 0.8903278025091056, + "grad_norm": 20144650.289658338, + "learning_rate": 6.095366621682411e-08, + "logits/chosen": -2.3388938903808594, + "logits/rejected": -2.309027910232544, + "logps/chosen": -131.75338745117188, + "logps/rejected": -147.02613830566406, + "loss": 131861.55, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.033888086676597595, + "rewards/margins": 0.011972433887422085, + "rewards/rejected": -0.045860521495342255, + "step": 2200 + }, + { + "epoch": 0.8943747470659652, + "grad_norm": 7275705.0028550355, + "learning_rate": 5.8704453441295546e-08, + "logits/chosen": -2.4196584224700928, + "logits/rejected": -2.4083645343780518, + "logps/chosen": -137.82431030273438, + "logps/rejected": -152.37692260742188, + "loss": 125663.2125, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.01876234821975231, + "rewards/margins": 0.0189601369202137, + "rewards/rejected": -0.03772248700261116, + "step": 2210 + }, + { + "epoch": 0.8984216916228248, + "grad_norm": 7203116.155779993, + "learning_rate": 5.645524066576698e-08, + "logits/chosen": -2.4381861686706543, + "logits/rejected": -2.403198003768921, + "logps/chosen": -131.7117156982422, + "logps/rejected": -142.67185974121094, + "loss": 123239.9375, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.01855655573308468, + "rewards/margins": 0.015223322436213493, + "rewards/rejected": -0.03377988189458847, + "step": 2220 + }, + { + "epoch": 0.9024686361796843, + "grad_norm": 5199237.236235915, + "learning_rate": 5.420602789023841e-08, + "logits/chosen": -2.3613171577453613, + "logits/rejected": -2.2851357460021973, + "logps/chosen": -153.357177734375, + "logps/rejected": -159.00067138671875, + "loss": 123367.6125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.024445852264761925, + "rewards/margins": 0.012765263207256794, + "rewards/rejected": -0.037211112678050995, + "step": 2230 + }, + { + "epoch": 0.9065155807365439, + "grad_norm": 8322118.880155748, + "learning_rate": 5.1956815114709844e-08, + "logits/chosen": -2.4463276863098145, + "logits/rejected": -2.4448184967041016, + "logps/chosen": -166.32937622070312, + "logps/rejected": -170.3524932861328, + "loss": 127037.6, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.034686215221881866, + "rewards/margins": 0.006833164487034082, + "rewards/rejected": -0.04151938110589981, + "step": 2240 + }, + { + "epoch": 0.9105625252934035, + "grad_norm": 7843111.5615214445, + "learning_rate": 4.9707602339181284e-08, + "logits/chosen": -2.406442165374756, + "logits/rejected": -2.3739068508148193, + "logps/chosen": -135.46450805664062, + "logps/rejected": -151.98318481445312, + "loss": 119829.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.027578959241509438, + "rewards/margins": 0.019721323624253273, + "rewards/rejected": -0.04730028659105301, + "step": 2250 + }, + { + "epoch": 0.914609469850263, + "grad_norm": 6508134.384511007, + "learning_rate": 4.745838956365272e-08, + "logits/chosen": -2.374009609222412, + "logits/rejected": -2.330867290496826, + "logps/chosen": -147.81002807617188, + "logps/rejected": -150.0068817138672, + "loss": 123565.525, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.030267197638750076, + "rewards/margins": 0.008453629910945892, + "rewards/rejected": -0.03872082382440567, + "step": 2260 + }, + { + "epoch": 0.9186564144071226, + "grad_norm": 4876699.168643324, + "learning_rate": 4.5209176788124156e-08, + "logits/chosen": -2.4444994926452637, + "logits/rejected": -2.370756149291992, + "logps/chosen": -148.59286499023438, + "logps/rejected": -159.95957946777344, + "loss": 121402.275, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.027829691767692566, + "rewards/margins": 0.01744781993329525, + "rewards/rejected": -0.045277513563632965, + "step": 2270 + }, + { + "epoch": 0.9227033589639821, + "grad_norm": 7528661.562948466, + "learning_rate": 4.2959964012595596e-08, + "logits/chosen": -2.421567678451538, + "logits/rejected": -2.4027442932128906, + "logps/chosen": -138.11740112304688, + "logps/rejected": -148.74745178222656, + "loss": 125506.075, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.025351068004965782, + "rewards/margins": 0.01246053259819746, + "rewards/rejected": -0.037811603397130966, + "step": 2280 + }, + { + "epoch": 0.9267503035208418, + "grad_norm": 5869282.00575302, + "learning_rate": 4.071075123706703e-08, + "logits/chosen": -2.344552993774414, + "logits/rejected": -2.309044122695923, + "logps/chosen": -135.99095153808594, + "logps/rejected": -162.00845336914062, + "loss": 119300.3375, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.01940598525106907, + "rewards/margins": 0.02190936915576458, + "rewards/rejected": -0.04131535068154335, + "step": 2290 + }, + { + "epoch": 0.9307972480777014, + "grad_norm": 6508908.496606901, + "learning_rate": 3.846153846153846e-08, + "logits/chosen": -2.2711312770843506, + "logits/rejected": -2.235738754272461, + "logps/chosen": -156.75537109375, + "logps/rejected": -162.16207885742188, + "loss": 121451.2625, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0278861615806818, + "rewards/margins": 0.010201343335211277, + "rewards/rejected": -0.038087502121925354, + "step": 2300 + }, + { + "epoch": 0.9348441926345609, + "grad_norm": 8865730.632557675, + "learning_rate": 3.6212325686009894e-08, + "logits/chosen": -2.324096202850342, + "logits/rejected": -2.2920069694519043, + "logps/chosen": -117.9946060180664, + "logps/rejected": -133.47108459472656, + "loss": 122291.0625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.022024232894182205, + "rewards/margins": 0.014470313675701618, + "rewards/rejected": -0.03649454563856125, + "step": 2310 + }, + { + "epoch": 0.9388911371914205, + "grad_norm": 9971453.387611723, + "learning_rate": 3.3963112910481334e-08, + "logits/chosen": -2.409850597381592, + "logits/rejected": -2.3329081535339355, + "logps/chosen": -146.290283203125, + "logps/rejected": -182.57904052734375, + "loss": 120819.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.024361763149499893, + "rewards/margins": 0.03630813583731651, + "rewards/rejected": -0.060669898986816406, + "step": 2320 + }, + { + "epoch": 0.9429380817482801, + "grad_norm": 6070350.046980263, + "learning_rate": 3.1713900134952766e-08, + "logits/chosen": -2.34000301361084, + "logits/rejected": -2.3207154273986816, + "logps/chosen": -135.71852111816406, + "logps/rejected": -158.15370178222656, + "loss": 127869.825, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.026078131049871445, + "rewards/margins": 0.018510058522224426, + "rewards/rejected": -0.04458818584680557, + "step": 2330 + }, + { + "epoch": 0.9469850263051396, + "grad_norm": 8157133.764333476, + "learning_rate": 2.94646873594242e-08, + "logits/chosen": -2.4080350399017334, + "logits/rejected": -2.3741536140441895, + "logps/chosen": -139.84112548828125, + "logps/rejected": -175.55917358398438, + "loss": 126005.05, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.029435906559228897, + "rewards/margins": 0.02673642337322235, + "rewards/rejected": -0.05617233365774155, + "step": 2340 + }, + { + "epoch": 0.9510319708619992, + "grad_norm": 8440293.351897202, + "learning_rate": 2.7215474583895635e-08, + "logits/chosen": -2.4163994789123535, + "logits/rejected": -2.384222984313965, + "logps/chosen": -158.38975524902344, + "logps/rejected": -164.0137176513672, + "loss": 123603.3875, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03101927414536476, + "rewards/margins": 0.011726012453436852, + "rewards/rejected": -0.042745284736156464, + "step": 2350 + }, + { + "epoch": 0.9550789154188588, + "grad_norm": 6375292.788715957, + "learning_rate": 2.496626180836707e-08, + "logits/chosen": -2.2860450744628906, + "logits/rejected": -2.28193998336792, + "logps/chosen": -137.55361938476562, + "logps/rejected": -162.74652099609375, + "loss": 125693.3125, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.024988356977701187, + "rewards/margins": 0.023718636482954025, + "rewards/rejected": -0.04870699718594551, + "step": 2360 + }, + { + "epoch": 0.9591258599757183, + "grad_norm": 9472423.8957588, + "learning_rate": 2.2717049032838504e-08, + "logits/chosen": -2.359046459197998, + "logits/rejected": -2.3519136905670166, + "logps/chosen": -135.88697814941406, + "logps/rejected": -158.06321716308594, + "loss": 127380.25, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.027975231409072876, + "rewards/margins": 0.01904093287885189, + "rewards/rejected": -0.04701615869998932, + "step": 2370 + }, + { + "epoch": 0.9631728045325779, + "grad_norm": 6950351.310431428, + "learning_rate": 2.046783625730994e-08, + "logits/chosen": -2.278303623199463, + "logits/rejected": -2.271866798400879, + "logps/chosen": -147.02255249023438, + "logps/rejected": -162.41444396972656, + "loss": 131236.1875, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.03206818923354149, + "rewards/margins": 0.013242989778518677, + "rewards/rejected": -0.045311179012060165, + "step": 2380 + }, + { + "epoch": 0.9672197490894374, + "grad_norm": 6190616.100642323, + "learning_rate": 1.8218623481781373e-08, + "logits/chosen": -2.3274073600769043, + "logits/rejected": -2.2292959690093994, + "logps/chosen": -152.0672149658203, + "logps/rejected": -174.8033905029297, + "loss": 124131.975, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.027064388617873192, + "rewards/margins": 0.015537412837147713, + "rewards/rejected": -0.042601801455020905, + "step": 2390 + }, + { + "epoch": 0.971266693646297, + "grad_norm": 8140978.954292232, + "learning_rate": 1.5969410706252813e-08, + "logits/chosen": -2.3674769401550293, + "logits/rejected": -2.3571083545684814, + "logps/chosen": -144.0354461669922, + "logps/rejected": -160.3311004638672, + "loss": 125102.2375, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02302565984427929, + "rewards/margins": 0.0166311115026474, + "rewards/rejected": -0.03965677320957184, + "step": 2400 + }, + { + "epoch": 0.9753136382031566, + "grad_norm": 8862305.552745355, + "learning_rate": 1.3720197930724246e-08, + "logits/chosen": -2.178356647491455, + "logits/rejected": -2.179384708404541, + "logps/chosen": -143.9452362060547, + "logps/rejected": -151.89974975585938, + "loss": 123180.975, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.035528309643268585, + "rewards/margins": 0.007553645875304937, + "rewards/rejected": -0.04308196157217026, + "step": 2410 + }, + { + "epoch": 0.9793605827600161, + "grad_norm": 4848306.613269352, + "learning_rate": 1.1470985155195682e-08, + "logits/chosen": -2.402296781539917, + "logits/rejected": -2.3765056133270264, + "logps/chosen": -125.8743896484375, + "logps/rejected": -145.01162719726562, + "loss": 122925.2125, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.017237985506653786, + "rewards/margins": 0.019860463216900826, + "rewards/rejected": -0.03709845244884491, + "step": 2420 + }, + { + "epoch": 0.9834075273168758, + "grad_norm": 5809904.408249709, + "learning_rate": 9.221772379667116e-09, + "logits/chosen": -2.4065396785736084, + "logits/rejected": -2.3716368675231934, + "logps/chosen": -143.05075073242188, + "logps/rejected": -167.95664978027344, + "loss": 124604.825, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029827838763594627, + "rewards/margins": 0.02450350485742092, + "rewards/rejected": -0.05433133989572525, + "step": 2430 + }, + { + "epoch": 0.9874544718737354, + "grad_norm": 7148593.283376818, + "learning_rate": 6.972559604138551e-09, + "logits/chosen": -2.3499531745910645, + "logits/rejected": -2.3520779609680176, + "logps/chosen": -130.91500854492188, + "logps/rejected": -159.89820861816406, + "loss": 119113.2625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.026112830266356468, + "rewards/margins": 0.027585214003920555, + "rewards/rejected": -0.05369805172085762, + "step": 2440 + }, + { + "epoch": 0.9915014164305949, + "grad_norm": 7332505.899956737, + "learning_rate": 4.723346828609986e-09, + "logits/chosen": -2.3807873725891113, + "logits/rejected": -2.3282299041748047, + "logps/chosen": -138.15525817871094, + "logps/rejected": -150.75531005859375, + "loss": 124786.675, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02242584154009819, + "rewards/margins": 0.013798736035823822, + "rewards/rejected": -0.03622458130121231, + "step": 2450 + }, + { + "epoch": 0.9955483609874545, + "grad_norm": 5408793.194556523, + "learning_rate": 2.474134053081421e-09, + "logits/chosen": -2.305051803588867, + "logits/rejected": -2.2709367275238037, + "logps/chosen": -127.0162124633789, + "logps/rejected": -154.36273193359375, + "loss": 125528.575, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.02522462233901024, + "rewards/margins": 0.017697211354970932, + "rewards/rejected": -0.04292182996869087, + "step": 2460 + }, + { + "epoch": 0.9995953055443141, + "grad_norm": 6547393.597919743, + "learning_rate": 2.249212775528565e-10, + "logits/chosen": -2.3911020755767822, + "logits/rejected": -2.3864622116088867, + "logps/chosen": -147.30072021484375, + "logps/rejected": -168.179443359375, + "loss": 121667.85, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027930116280913353, + "rewards/margins": 0.010017314925789833, + "rewards/rejected": -0.037947431206703186, + "step": 2470 + } + ], + "logging_steps": 10, + "max_steps": 2471, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}