{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9871794871794872, "eval_steps": 500, "global_step": 699, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021367521367521368, "grad_norm": 16.125, "learning_rate": 1.7556090538745385e-06, "logits/chosen": -3.5722389221191406, "logits/rejected": -3.5034377574920654, "logps/chosen": -41.095054626464844, "logps/rejected": -79.83882141113281, "loss": 0.6922, "rewards/accuracies": 0.3500000238418579, "rewards/chosen": 0.0014523781137540936, "rewards/margins": 0.001936142100021243, "rewards/rejected": -0.0004837641608901322, "step": 5 }, { "epoch": 0.042735042735042736, "grad_norm": 16.75, "learning_rate": 3.950120371217711e-06, "logits/chosen": -3.586623430252075, "logits/rejected": -3.506187915802002, "logps/chosen": -40.016441345214844, "logps/rejected": -78.24286651611328, "loss": 0.6796, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.002693606074899435, "rewards/margins": 0.027483653277158737, "rewards/rejected": -0.03017725981771946, "step": 10 }, { "epoch": 0.0641025641025641, "grad_norm": 14.75, "learning_rate": 6.144631688560886e-06, "logits/chosen": -3.5991673469543457, "logits/rejected": -3.50789213180542, "logps/chosen": -39.836097717285156, "logps/rejected": -81.42815399169922, "loss": 0.6306, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -0.006502463016659021, "rewards/margins": 0.1313387155532837, "rewards/rejected": -0.13784119486808777, "step": 15 }, { "epoch": 0.08547008547008547, "grad_norm": 13.625, "learning_rate": 8.339143005904057e-06, "logits/chosen": -3.5530943870544434, "logits/rejected": -3.492051601409912, "logps/chosen": -39.75938034057617, "logps/rejected": -80.72886657714844, "loss": 0.5434, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": -0.015247734263539314, "rewards/margins": 0.33922019600868225, "rewards/rejected": -0.3544679284095764, "step": 20 }, { "epoch": 0.10683760683760683, "grad_norm": 11.1875, "learning_rate": 1.0533654323247232e-05, "logits/chosen": -3.512582302093506, "logits/rejected": -3.4719951152801514, "logps/chosen": -39.996891021728516, "logps/rejected": -85.54742431640625, "loss": 0.3961, "rewards/accuracies": 0.98333340883255, "rewards/chosen": -0.04132762551307678, "rewards/margins": 0.7773466110229492, "rewards/rejected": -0.8186742067337036, "step": 25 }, { "epoch": 0.1282051282051282, "grad_norm": 7.5625, "learning_rate": 1.2728165640590407e-05, "logits/chosen": -3.4401345252990723, "logits/rejected": -3.4451375007629395, "logps/chosen": -42.438926696777344, "logps/rejected": -93.06343078613281, "loss": 0.2584, "rewards/accuracies": 0.9958332777023315, "rewards/chosen": -0.13156814873218536, "rewards/margins": 1.4302090406417847, "rewards/rejected": -1.561777114868164, "step": 30 }, { "epoch": 0.14957264957264957, "grad_norm": 4.28125, "learning_rate": 1.4922676957933578e-05, "logits/chosen": -3.220163345336914, "logits/rejected": -3.2657477855682373, "logps/chosen": -44.191307067871094, "logps/rejected": -108.76808166503906, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": -0.4397502541542053, "rewards/margins": 2.7580618858337402, "rewards/rejected": -3.19781231880188, "step": 35 }, { "epoch": 0.17094017094017094, "grad_norm": 2.0625, "learning_rate": 1.5360556888469565e-05, "logits/chosen": -2.7749853134155273, "logits/rejected": -2.807795286178589, "logps/chosen": -52.53513717651367, "logps/rejected": -143.27357482910156, "loss": 0.0426, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -1.2650867700576782, "rewards/margins": 5.236789226531982, "rewards/rejected": -6.501875877380371, "step": 40 }, { "epoch": 0.19230769230769232, "grad_norm": 1.71875, "learning_rate": 1.535640428282884e-05, "logits/chosen": -2.3302321434020996, "logits/rejected": -2.289696216583252, "logps/chosen": -51.40728759765625, "logps/rejected": -162.12364196777344, "loss": 0.0168, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.279262900352478, "rewards/margins": 7.274726867675781, "rewards/rejected": -8.553990364074707, "step": 45 }, { "epoch": 0.21367521367521367, "grad_norm": 6.5, "learning_rate": 1.5349059809872097e-05, "logits/chosen": -1.9786951541900635, "logits/rejected": -1.9170547723770142, "logps/chosen": -57.05157470703125, "logps/rejected": -180.14271545410156, "loss": 0.0191, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.664361596107483, "rewards/margins": 8.5733003616333, "rewards/rejected": -10.237661361694336, "step": 50 }, { "epoch": 0.23504273504273504, "grad_norm": 0.67578125, "learning_rate": 1.5338527542732884e-05, "logits/chosen": -1.976489782333374, "logits/rejected": -1.909419059753418, "logps/chosen": -58.60235595703125, "logps/rejected": -196.4000701904297, "loss": 0.0097, "rewards/accuracies": 0.9958332777023315, "rewards/chosen": -1.8706943988800049, "rewards/margins": 9.873316764831543, "rewards/rejected": -11.744011878967285, "step": 55 }, { "epoch": 0.2564102564102564, "grad_norm": 3.234375, "learning_rate": 1.532481332244717e-05, "logits/chosen": -1.9295637607574463, "logits/rejected": -1.817983627319336, "logps/chosen": -57.39198684692383, "logps/rejected": -199.2140350341797, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8252334594726562, "rewards/margins": 10.293539047241211, "rewards/rejected": -12.11877155303955, "step": 60 }, { "epoch": 0.2777777777777778, "grad_norm": 9.125, "learning_rate": 1.5307924754713968e-05, "logits/chosen": -1.9402471780776978, "logits/rejected": -1.8663572072982788, "logps/chosen": -59.858306884765625, "logps/rejected": -196.60189819335938, "loss": 0.0108, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9207541942596436, "rewards/margins": 9.978109359741211, "rewards/rejected": -11.89886474609375, "step": 65 }, { "epoch": 0.29914529914529914, "grad_norm": 0.298828125, "learning_rate": 1.528787120567736e-05, "logits/chosen": -2.046313762664795, "logits/rejected": -1.9556039571762085, "logps/chosen": -54.13606643676758, "logps/rejected": -192.9593963623047, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.4830687046051025, "rewards/margins": 10.119759559631348, "rewards/rejected": -11.602827072143555, "step": 70 }, { "epoch": 0.32051282051282054, "grad_norm": 0.400390625, "learning_rate": 1.526466379673215e-05, "logits/chosen": -2.0463297367095947, "logits/rejected": -1.9465014934539795, "logps/chosen": -53.60888671875, "logps/rejected": -200.31202697753906, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.4517700672149658, "rewards/margins": 10.894501686096191, "rewards/rejected": -12.346272468566895, "step": 75 }, { "epoch": 0.3418803418803419, "grad_norm": 14.0, "learning_rate": 1.5238315398356126e-05, "logits/chosen": -1.9893850088119507, "logits/rejected": -1.8696527481079102, "logps/chosen": -55.793907165527344, "logps/rejected": -205.0140838623047, "loss": 0.0099, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.6301085948944092, "rewards/margins": 11.256315231323242, "rewards/rejected": -12.88642406463623, "step": 80 }, { "epoch": 0.36324786324786323, "grad_norm": 0.040283203125, "learning_rate": 1.5208840622972272e-05, "logits/chosen": -1.9942238330841064, "logits/rejected": -1.8389371633529663, "logps/chosen": -60.305450439453125, "logps/rejected": -209.89016723632812, "loss": 0.0228, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.8958022594451904, "rewards/margins": 11.26887321472168, "rewards/rejected": -13.164674758911133, "step": 85 }, { "epoch": 0.38461538461538464, "grad_norm": 0.68359375, "learning_rate": 1.5176255816844948e-05, "logits/chosen": -1.958398461341858, "logits/rejected": -1.7746648788452148, "logps/chosen": -53.41706466674805, "logps/rejected": -208.37869262695312, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -1.439720869064331, "rewards/margins": 11.646267890930176, "rewards/rejected": -13.08598804473877, "step": 90 }, { "epoch": 0.405982905982906, "grad_norm": 0.67578125, "learning_rate": 1.5140579051014502e-05, "logits/chosen": -1.9176127910614014, "logits/rejected": -1.6993322372436523, "logps/chosen": -57.2091178894043, "logps/rejected": -224.92782592773438, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.716392159461975, "rewards/margins": 12.696627616882324, "rewards/rejected": -14.413020133972168, "step": 95 }, { "epoch": 0.42735042735042733, "grad_norm": 0.3203125, "learning_rate": 1.5101830111275334e-05, "logits/chosen": -1.874871015548706, "logits/rejected": -1.682807207107544, "logps/chosen": -63.0499267578125, "logps/rejected": -215.8572235107422, "loss": 0.0118, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.3144705295562744, "rewards/margins": 11.472696304321289, "rewards/rejected": -13.787165641784668, "step": 100 }, { "epoch": 0.44871794871794873, "grad_norm": 4.21875, "learning_rate": 1.5060030487203004e-05, "logits/chosen": -1.8294957876205444, "logits/rejected": -1.583496332168579, "logps/chosen": -67.67689514160156, "logps/rejected": -231.562744140625, "loss": 0.006, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.7576889991760254, "rewards/margins": 12.489922523498535, "rewards/rejected": -15.247611999511719, "step": 105 }, { "epoch": 0.4700854700854701, "grad_norm": 0.1015625, "learning_rate": 1.501520336023643e-05, "logits/chosen": -1.7654807567596436, "logits/rejected": -1.4742016792297363, "logps/chosen": -66.52511596679688, "logps/rejected": -227.6389923095703, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -2.638422966003418, "rewards/margins": 12.420553207397461, "rewards/rejected": -15.058975219726562, "step": 110 }, { "epoch": 0.49145299145299143, "grad_norm": 1.7421875, "learning_rate": 1.4967373590821828e-05, "logits/chosen": -1.7195736169815063, "logits/rejected": -1.4602675437927246, "logps/chosen": -63.84660720825195, "logps/rejected": -229.90017700195312, "loss": 0.0106, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -2.3248353004455566, "rewards/margins": 12.83845329284668, "rewards/rejected": -15.163289070129395, "step": 115 }, { "epoch": 0.5128205128205128, "grad_norm": 0.134765625, "learning_rate": 1.491656770462546e-05, "logits/chosen": -1.617491364479065, "logits/rejected": -1.2875694036483765, "logps/chosen": -59.133209228515625, "logps/rejected": -232.18191528320312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.9275062084197998, "rewards/margins": 13.423171997070312, "rewards/rejected": -15.350679397583008, "step": 120 }, { "epoch": 0.5341880341880342, "grad_norm": 11.0, "learning_rate": 1.4862813877822923e-05, "logits/chosen": -1.6853389739990234, "logits/rejected": -1.3519870042800903, "logps/chosen": -58.672515869140625, "logps/rejected": -232.4235382080078, "loss": 0.0091, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.8344595432281494, "rewards/margins": 13.702409744262695, "rewards/rejected": -15.53686809539795, "step": 125 }, { "epoch": 0.5555555555555556, "grad_norm": 0.18359375, "learning_rate": 1.4806141921473063e-05, "logits/chosen": -1.711216926574707, "logits/rejected": -1.331209421157837, "logps/chosen": -58.75749969482422, "logps/rejected": -241.5647430419922, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -1.8602139949798584, "rewards/margins": 14.427963256835938, "rewards/rejected": -16.288179397583008, "step": 130 }, { "epoch": 0.5769230769230769, "grad_norm": 0.0157470703125, "learning_rate": 1.4746583264985202e-05, "logits/chosen": -1.73836350440979, "logits/rejected": -1.358798623085022, "logps/chosen": -57.656578063964844, "logps/rejected": -245.814697265625, "loss": 0.0054, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.727595329284668, "rewards/margins": 14.943672180175781, "rewards/rejected": -16.671268463134766, "step": 135 }, { "epoch": 0.5982905982905983, "grad_norm": 0.017578125, "learning_rate": 1.468417093868888e-05, "logits/chosen": -1.7839100360870361, "logits/rejected": -1.4424632787704468, "logps/chosen": -55.03651809692383, "logps/rejected": -240.14572143554688, "loss": 0.0047, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.5889627933502197, "rewards/margins": 14.578252792358398, "rewards/rejected": -16.16721534729004, "step": 140 }, { "epoch": 0.6196581196581197, "grad_norm": 0.06884765625, "learning_rate": 1.4618939555515721e-05, "logits/chosen": -1.7428079843521118, "logits/rejected": -1.4061057567596436, "logps/chosen": -59.45949172973633, "logps/rejected": -239.75048828125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.9328705072402954, "rewards/margins": 14.28913402557373, "rewards/rejected": -16.222003936767578, "step": 145 }, { "epoch": 0.6410256410256411, "grad_norm": 0.00933837890625, "learning_rate": 1.455092529180363e-05, "logits/chosen": -1.7827781438827515, "logits/rejected": -1.4337228536605835, "logps/chosen": -57.58295440673828, "logps/rejected": -243.2423553466797, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.8888952732086182, "rewards/margins": 14.729533195495605, "rewards/rejected": -16.618427276611328, "step": 150 }, { "epoch": 0.6623931623931624, "grad_norm": 1.1171875, "learning_rate": 1.4480165867233946e-05, "logits/chosen": -1.7574710845947266, "logits/rejected": -1.4313023090362549, "logps/chosen": -61.97917938232422, "logps/rejected": -249.78890991210938, "loss": 0.0062, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.226684093475342, "rewards/margins": 14.859643936157227, "rewards/rejected": -17.086326599121094, "step": 155 }, { "epoch": 0.6837606837606838, "grad_norm": 2.59375, "learning_rate": 1.440670052391267e-05, "logits/chosen": -1.776049017906189, "logits/rejected": -1.4136337041854858, "logps/chosen": -58.725502014160156, "logps/rejected": -239.2755126953125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -2.0192863941192627, "rewards/margins": 14.30653190612793, "rewards/rejected": -16.32581901550293, "step": 160 }, { "epoch": 0.7051282051282052, "grad_norm": 1.7734375, "learning_rate": 1.4330570004607398e-05, "logits/chosen": -1.8287827968597412, "logits/rejected": -1.4543850421905518, "logps/chosen": -57.99534225463867, "logps/rejected": -249.4701690673828, "loss": 0.0035, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.8924716711044312, "rewards/margins": 15.258901596069336, "rewards/rejected": -17.15137481689453, "step": 165 }, { "epoch": 0.7264957264957265, "grad_norm": 1.359375, "learning_rate": 1.4251816530151986e-05, "logits/chosen": -1.7740707397460938, "logits/rejected": -1.4325586557388306, "logps/chosen": -60.86432647705078, "logps/rejected": -249.397216796875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -2.059633255004883, "rewards/margins": 15.068583488464355, "rewards/rejected": -17.128215789794922, "step": 170 }, { "epoch": 0.7478632478632479, "grad_norm": 0.060546875, "learning_rate": 1.4170483776031526e-05, "logits/chosen": -1.7101682424545288, "logits/rejected": -1.3118056058883667, "logps/chosen": -60.89410400390625, "logps/rejected": -252.2559814453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.1261119842529297, "rewards/margins": 15.250958442687988, "rewards/rejected": -17.377071380615234, "step": 175 }, { "epoch": 0.7692307692307693, "grad_norm": 0.031494140625, "learning_rate": 1.4086616848160574e-05, "logits/chosen": -1.6894880533218384, "logits/rejected": -1.2855875492095947, "logps/chosen": -66.87565612792969, "logps/rejected": -245.99557495117188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.7102391719818115, "rewards/margins": 14.112167358398438, "rewards/rejected": -16.822406768798828, "step": 180 }, { "epoch": 0.7905982905982906, "grad_norm": 0.09912109375, "learning_rate": 1.4000262257868096e-05, "logits/chosen": -1.6548315286636353, "logits/rejected": -1.192333459854126, "logps/chosen": -64.77526092529297, "logps/rejected": -249.92196655273438, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -2.546926975250244, "rewards/margins": 14.788670539855957, "rewards/rejected": -17.33559799194336, "step": 185 }, { "epoch": 0.811965811965812, "grad_norm": 1.1953125, "learning_rate": 1.3911467896102994e-05, "logits/chosen": -1.6022329330444336, "logits/rejected": -1.1850754022598267, "logps/chosen": -60.396514892578125, "logps/rejected": -247.5742645263672, "loss": 0.0054, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.1046481132507324, "rewards/margins": 15.000350952148438, "rewards/rejected": -17.104999542236328, "step": 190 }, { "epoch": 0.8333333333333334, "grad_norm": 0.10400390625, "learning_rate": 1.3820283006874503e-05, "logits/chosen": -1.563820719718933, "logits/rejected": -1.137289047241211, "logps/chosen": -65.57173156738281, "logps/rejected": -252.38955688476562, "loss": 0.0085, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.3488571643829346, "rewards/margins": 15.110745429992676, "rewards/rejected": -17.45960235595703, "step": 195 }, { "epoch": 0.8547008547008547, "grad_norm": 0.1279296875, "learning_rate": 1.372675815994221e-05, "logits/chosen": -1.4945417642593384, "logits/rejected": -1.038334846496582, "logps/chosen": -54.2162971496582, "logps/rejected": -254.0222625732422, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.6657600402832031, "rewards/margins": 16.079242706298828, "rewards/rejected": -17.745004653930664, "step": 200 }, { "epoch": 0.8760683760683761, "grad_norm": 14.625, "learning_rate": 1.3630945222770829e-05, "logits/chosen": -1.5167819261550903, "logits/rejected": -1.0430529117584229, "logps/chosen": -60.5411262512207, "logps/rejected": -259.63555908203125, "loss": 0.0192, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -2.0362796783447266, "rewards/margins": 16.097753524780273, "rewards/rejected": -18.134033203125, "step": 205 }, { "epoch": 0.8974358974358975, "grad_norm": 0.2119140625, "learning_rate": 1.3532897331765301e-05, "logits/chosen": -1.5572597980499268, "logits/rejected": -1.053264856338501, "logps/chosen": -59.34284591674805, "logps/rejected": -260.30206298828125, "loss": 0.0151, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9360477924346924, "rewards/margins": 16.303897857666016, "rewards/rejected": -18.239948272705078, "step": 210 }, { "epoch": 0.9188034188034188, "grad_norm": 0.0849609375, "learning_rate": 1.3432668862802134e-05, "logits/chosen": -1.4950945377349854, "logits/rejected": -1.014696478843689, "logps/chosen": -57.9535026550293, "logps/rejected": -256.272705078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.7511317729949951, "rewards/margins": 15.973749160766602, "rewards/rejected": -17.724878311157227, "step": 215 }, { "epoch": 0.9401709401709402, "grad_norm": 0.08642578125, "learning_rate": 1.3330315401073371e-05, "logits/chosen": -1.5073899030685425, "logits/rejected": -1.0442817211151123, "logps/chosen": -59.66225051879883, "logps/rejected": -249.38534545898438, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.872859001159668, "rewards/margins": 15.302103042602539, "rewards/rejected": -17.17496109008789, "step": 220 }, { "epoch": 0.9615384615384616, "grad_norm": 2.625, "learning_rate": 1.3225893710259887e-05, "logits/chosen": -1.3290693759918213, "logits/rejected": -0.8046108484268188, "logps/chosen": -60.98704147338867, "logps/rejected": -252.48495483398438, "loss": 0.0065, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.179378032684326, "rewards/margins": 15.35717487335205, "rewards/rejected": -17.53655433654785, "step": 225 }, { "epoch": 0.9829059829059829, "grad_norm": 1.359375, "learning_rate": 1.3119461701051105e-05, "logits/chosen": -1.4031749963760376, "logits/rejected": -0.8651553988456726, "logps/chosen": -57.99671173095703, "logps/rejected": -251.2525634765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.0349326133728027, "rewards/margins": 15.621841430664062, "rewards/rejected": -17.656774520874023, "step": 230 }, { "epoch": 0.9957264957264957, "eval_logits/chosen": -1.4502625465393066, "eval_logits/rejected": -0.9043333530426025, "eval_logps/chosen": -61.05683517456055, "eval_logps/rejected": -259.54730224609375, "eval_loss": 0.0003871396475005895, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.1613101959228516, "eval_rewards/margins": 15.991730690002441, "eval_rewards/rejected": -18.15304183959961, "eval_runtime": 9.8475, "eval_samples_per_second": 20.31, "eval_steps_per_second": 20.31, "step": 233 }, { "epoch": 1.0042735042735043, "grad_norm": 0.0341796875, "learning_rate": 1.3011078399028605e-05, "logits/chosen": -1.430901288986206, "logits/rejected": -0.9248638153076172, "logps/chosen": -60.4477424621582, "logps/rejected": -265.5071716308594, "loss": 0.0036, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.0871260166168213, "rewards/margins": 16.610048294067383, "rewards/rejected": -18.697174072265625, "step": 235 }, { "epoch": 1.0256410256410255, "grad_norm": 0.058349609375, "learning_rate": 1.2900803911931431e-05, "logits/chosen": -1.4457504749298096, "logits/rejected": -0.9034906625747681, "logps/chosen": -59.68601608276367, "logps/rejected": -263.2565612792969, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.9931907653808594, "rewards/margins": 16.479150772094727, "rewards/rejected": -18.472341537475586, "step": 240 }, { "epoch": 1.047008547008547, "grad_norm": 0.037841796875, "learning_rate": 1.2788699396321252e-05, "logits/chosen": -1.39047110080719, "logits/rejected": -0.8637332916259766, "logps/chosen": -57.831581115722656, "logps/rejected": -255.6790771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.925370454788208, "rewards/margins": 16.076345443725586, "rewards/rejected": -18.00171661376953, "step": 245 }, { "epoch": 1.0683760683760684, "grad_norm": 0.0002689361572265625, "learning_rate": 1.2674827023665853e-05, "logits/chosen": -1.4293615818023682, "logits/rejected": -0.9234841465950012, "logps/chosen": -61.868263244628906, "logps/rejected": -270.99920654296875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.230020046234131, "rewards/margins": 17.106487274169922, "rewards/rejected": -19.336505889892578, "step": 250 }, { "epoch": 1.0897435897435896, "grad_norm": 0.08154296875, "learning_rate": 1.255924994585978e-05, "logits/chosen": -1.4079844951629639, "logits/rejected": -0.8683494329452515, "logps/chosen": -63.99333953857422, "logps/rejected": -266.74462890625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.376952886581421, "rewards/margins": 16.51467514038086, "rewards/rejected": -18.89162826538086, "step": 255 }, { "epoch": 1.1111111111111112, "grad_norm": 0.00147247314453125, "learning_rate": 1.2442032260201255e-05, "logits/chosen": -1.400887131690979, "logits/rejected": -0.890865683555603, "logps/chosen": -62.627723693847656, "logps/rejected": -263.9581604003906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.308192729949951, "rewards/margins": 16.26297950744629, "rewards/rejected": -18.571170806884766, "step": 260 }, { "epoch": 1.1324786324786325, "grad_norm": 0.1982421875, "learning_rate": 1.2323238973844796e-05, "logits/chosen": -1.438955545425415, "logits/rejected": -0.9066799283027649, "logps/chosen": -63.32421875, "logps/rejected": -273.82049560546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.295706272125244, "rewards/margins": 17.04709243774414, "rewards/rejected": -19.342798233032227, "step": 265 }, { "epoch": 1.1538461538461537, "grad_norm": 0.011962890625, "learning_rate": 1.2202935967749212e-05, "logits/chosen": -1.3738555908203125, "logits/rejected": -0.8493305444717407, "logps/chosen": -67.02043151855469, "logps/rejected": -268.3296813964844, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.584259510040283, "rewards/margins": 16.276723861694336, "rewards/rejected": -18.86098289489746, "step": 270 }, { "epoch": 1.1752136752136753, "grad_norm": 1.765625, "learning_rate": 1.2081189960141038e-05, "logits/chosen": -1.424109697341919, "logits/rejected": -0.8860370516777039, "logps/chosen": -61.854644775390625, "logps/rejected": -267.197021484375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.1506810188293457, "rewards/margins": 16.72945213317871, "rewards/rejected": -18.880136489868164, "step": 275 }, { "epoch": 1.1965811965811965, "grad_norm": 0.185546875, "learning_rate": 1.1958068469513604e-05, "logits/chosen": -1.4285290241241455, "logits/rejected": -0.9055356979370117, "logps/chosen": -65.09664916992188, "logps/rejected": -277.3038635253906, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.459376096725464, "rewards/margins": 17.357646942138672, "rewards/rejected": -19.817026138305664, "step": 280 }, { "epoch": 1.217948717948718, "grad_norm": 0.0059814453125, "learning_rate": 1.1833639777182316e-05, "logits/chosen": -1.306983232498169, "logits/rejected": -0.7688174247741699, "logps/chosen": -59.734840393066406, "logps/rejected": -272.2923889160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.1238536834716797, "rewards/margins": 17.397279739379883, "rewards/rejected": -19.52113151550293, "step": 285 }, { "epoch": 1.2393162393162394, "grad_norm": 0.004547119140625, "learning_rate": 1.170797288941685e-05, "logits/chosen": -1.2420094013214111, "logits/rejected": -0.7252348065376282, "logps/chosen": -66.8678207397461, "logps/rejected": -281.2005920410156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.6819348335266113, "rewards/margins": 17.403133392333984, "rewards/rejected": -20.085067749023438, "step": 290 }, { "epoch": 1.2606837606837606, "grad_norm": 0.8828125, "learning_rate": 1.1581137499171342e-05, "logits/chosen": -1.2711966037750244, "logits/rejected": -0.7393882870674133, "logps/chosen": -67.3682861328125, "logps/rejected": -272.0533142089844, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -2.7543585300445557, "rewards/margins": 16.66292953491211, "rewards/rejected": -19.41728973388672, "step": 295 }, { "epoch": 1.282051282051282, "grad_norm": 0.01080322265625, "learning_rate": 1.145320394743371e-05, "logits/chosen": -1.2599390745162964, "logits/rejected": -0.7466105222702026, "logps/chosen": -65.01288604736328, "logps/rejected": -266.981689453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.5523085594177246, "rewards/margins": 16.560453414916992, "rewards/rejected": -19.112764358520508, "step": 300 }, { "epoch": 1.3034188034188035, "grad_norm": 0.005706787109375, "learning_rate": 1.1324243184215622e-05, "logits/chosen": -1.268808364868164, "logits/rejected": -0.7630107998847961, "logps/chosen": -67.40531921386719, "logps/rejected": -282.67205810546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.892889976501465, "rewards/margins": 17.570592880249023, "rewards/rejected": -20.463483810424805, "step": 305 }, { "epoch": 1.3247863247863247, "grad_norm": 0.00238037109375, "learning_rate": 1.1194326729204686e-05, "logits/chosen": -1.2974001169204712, "logits/rejected": -0.7796735167503357, "logps/chosen": -67.35159301757812, "logps/rejected": -279.50341796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.8483831882476807, "rewards/margins": 17.4374942779541, "rewards/rejected": -20.285879135131836, "step": 310 }, { "epoch": 1.3461538461538463, "grad_norm": 0.021484375, "learning_rate": 1.1063526632100717e-05, "logits/chosen": -1.2823902368545532, "logits/rejected": -0.8006687164306641, "logps/chosen": -71.19620513916016, "logps/rejected": -273.39569091796875, "loss": 0.0031, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.098231077194214, "rewards/margins": 16.550390243530273, "rewards/rejected": -19.648624420166016, "step": 315 }, { "epoch": 1.3675213675213675, "grad_norm": 0.022216796875, "learning_rate": 1.0931915432658055e-05, "logits/chosen": -1.2740647792816162, "logits/rejected": -0.7717633843421936, "logps/chosen": -68.88563537597656, "logps/rejected": -277.2671813964844, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.923701763153076, "rewards/margins": 17.007097244262695, "rewards/rejected": -19.930797576904297, "step": 320 }, { "epoch": 1.3888888888888888, "grad_norm": 0.048095703125, "learning_rate": 1.0799566120456133e-05, "logits/chosen": -1.2586907148361206, "logits/rejected": -0.7510126829147339, "logps/chosen": -71.87751770019531, "logps/rejected": -275.9109191894531, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.1944289207458496, "rewards/margins": 16.62671661376953, "rewards/rejected": -19.82114601135254, "step": 325 }, { "epoch": 1.4102564102564101, "grad_norm": 0.0052490234375, "learning_rate": 1.066655209442054e-05, "logits/chosen": -1.280989646911621, "logits/rejected": -0.772638201713562, "logps/chosen": -71.73796081542969, "logps/rejected": -279.8167419433594, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -3.172919511795044, "rewards/margins": 16.89200210571289, "rewards/rejected": -20.064922332763672, "step": 330 }, { "epoch": 1.4316239316239316, "grad_norm": 0.007720947265625, "learning_rate": 1.0532947122117101e-05, "logits/chosen": -1.27340567111969, "logits/rejected": -0.7604951858520508, "logps/chosen": -69.45366668701172, "logps/rejected": -278.88409423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9620535373687744, "rewards/margins": 17.169063568115234, "rewards/rejected": -20.13111686706543, "step": 335 }, { "epoch": 1.452991452991453, "grad_norm": 0.046630859375, "learning_rate": 1.0398825298841499e-05, "logits/chosen": -1.2809860706329346, "logits/rejected": -0.838448166847229, "logps/chosen": -72.61773681640625, "logps/rejected": -281.2303466796875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.1478874683380127, "rewards/margins": 17.092044830322266, "rewards/rejected": -20.239933013916016, "step": 340 }, { "epoch": 1.4743589743589745, "grad_norm": 0.01025390625, "learning_rate": 1.0264261006527144e-05, "logits/chosen": -1.301695704460144, "logits/rejected": -0.8668543100357056, "logps/chosen": -69.15229797363281, "logps/rejected": -270.77130126953125, "loss": 0.0035, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.9070522785186768, "rewards/margins": 16.549983978271484, "rewards/rejected": -19.457035064697266, "step": 345 }, { "epoch": 1.4957264957264957, "grad_norm": 0.5625, "learning_rate": 1.0129328872494075e-05, "logits/chosen": -1.385507345199585, "logits/rejected": -0.9304911494255066, "logps/chosen": -70.08064270019531, "logps/rejected": -279.90045166015625, "loss": 0.003, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.0408215522766113, "rewards/margins": 17.223588943481445, "rewards/rejected": -20.2644100189209, "step": 350 }, { "epoch": 1.517094017094017, "grad_norm": 0.12890625, "learning_rate": 9.994103728061786e-06, "logits/chosen": -1.3539297580718994, "logits/rejected": -0.8995206952095032, "logps/chosen": -71.8431625366211, "logps/rejected": -278.0020446777344, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.1649577617645264, "rewards/margins": 16.83130645751953, "rewards/rejected": -19.99626350402832, "step": 355 }, { "epoch": 1.5384615384615383, "grad_norm": 0.006134033203125, "learning_rate": 9.858660567048902e-06, "logits/chosen": -1.3628873825073242, "logits/rejected": -0.8617550730705261, "logps/chosen": -73.44374084472656, "logps/rejected": -291.49237060546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3345115184783936, "rewards/margins": 17.942880630493164, "rewards/rejected": -21.277393341064453, "step": 360 }, { "epoch": 1.5598290598290598, "grad_norm": 0.036376953125, "learning_rate": 9.72307450418274e-06, "logits/chosen": -1.3687984943389893, "logits/rejected": -0.8952552676200867, "logps/chosen": -68.8270492553711, "logps/rejected": -276.6328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.877126932144165, "rewards/margins": 17.08051872253418, "rewards/rejected": -19.957645416259766, "step": 365 }, { "epoch": 1.5811965811965814, "grad_norm": 0.400390625, "learning_rate": 9.587420733441835e-06, "logits/chosen": -1.3641754388809204, "logits/rejected": -0.9082571864128113, "logps/chosen": -67.42304992675781, "logps/rejected": -281.8648986816406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.7612674236297607, "rewards/margins": 17.61087989807129, "rewards/rejected": -20.372146606445312, "step": 370 }, { "epoch": 1.6025641025641026, "grad_norm": 0.000423431396484375, "learning_rate": 9.45177448635447e-06, "logits/chosen": -1.3942601680755615, "logits/rejected": -0.8580430746078491, "logps/chosen": -67.60791015625, "logps/rejected": -284.3832092285156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.805607795715332, "rewards/margins": 18.005619049072266, "rewards/rejected": -20.811227798461914, "step": 375 }, { "epoch": 1.623931623931624, "grad_norm": 0.0255126953125, "learning_rate": 9.316210990276434e-06, "logits/chosen": -1.3189040422439575, "logits/rejected": -0.8662185668945312, "logps/chosen": -67.1756362915039, "logps/rejected": -272.69500732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.7223994731903076, "rewards/margins": 16.855377197265625, "rewards/rejected": -19.577777862548828, "step": 380 }, { "epoch": 1.6452991452991452, "grad_norm": 0.04638671875, "learning_rate": 9.18080542667105e-06, "logits/chosen": -1.3573819398880005, "logits/rejected": -0.8372514843940735, "logps/chosen": -68.33662414550781, "logps/rejected": -291.3061218261719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.794102191925049, "rewards/margins": 18.328411102294922, "rewards/rejected": -21.122512817382812, "step": 385 }, { "epoch": 1.6666666666666665, "grad_norm": 0.000885009765625, "learning_rate": 9.045632889414686e-06, "logits/chosen": -1.345085859298706, "logits/rejected": -0.8467508554458618, "logps/chosen": -66.54302978515625, "logps/rejected": -283.5255126953125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.698514461517334, "rewards/margins": 17.84862518310547, "rewards/rejected": -20.54714012145996, "step": 390 }, { "epoch": 1.688034188034188, "grad_norm": 0.02734375, "learning_rate": 8.910768343150828e-06, "logits/chosen": -1.3259168863296509, "logits/rejected": -0.855597198009491, "logps/chosen": -69.09947204589844, "logps/rejected": -284.14801025390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0127596855163574, "rewards/margins": 17.691822052001953, "rewards/rejected": -20.704580307006836, "step": 395 }, { "epoch": 1.7094017094017095, "grad_norm": 0.0032806396484375, "learning_rate": 8.77628658171581e-06, "logits/chosen": -1.3521082401275635, "logits/rejected": -0.896456241607666, "logps/chosen": -67.49749755859375, "logps/rejected": -278.388916015625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.764543056488037, "rewards/margins": 17.227275848388672, "rewards/rejected": -19.991817474365234, "step": 400 }, { "epoch": 1.7307692307692308, "grad_norm": 0.0830078125, "learning_rate": 8.642262186659298e-06, "logits/chosen": -1.311095952987671, "logits/rejected": -0.8420788049697876, "logps/chosen": -68.78193664550781, "logps/rejected": -280.7834777832031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.862732172012329, "rewards/margins": 17.331756591796875, "rewards/rejected": -20.194486618041992, "step": 405 }, { "epoch": 1.7521367521367521, "grad_norm": 0.00640869140625, "learning_rate": 8.508769485882487e-06, "logits/chosen": -1.3232362270355225, "logits/rejected": -0.8540644645690918, "logps/chosen": -74.96504974365234, "logps/rejected": -283.9762268066406, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.3426380157470703, "rewards/margins": 17.237430572509766, "rewards/rejected": -20.580068588256836, "step": 410 }, { "epoch": 1.7735042735042734, "grad_norm": 0.91796875, "learning_rate": 8.375882512416969e-06, "logits/chosen": -1.2893245220184326, "logits/rejected": -0.7882084846496582, "logps/chosen": -72.9918212890625, "logps/rejected": -286.07623291015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.248950481414795, "rewards/margins": 17.547632217407227, "rewards/rejected": -20.796581268310547, "step": 415 }, { "epoch": 1.7948717948717947, "grad_norm": 0.019287109375, "learning_rate": 8.243674963367137e-06, "logits/chosen": -1.3166277408599854, "logits/rejected": -0.8026930093765259, "logps/chosen": -76.18501281738281, "logps/rejected": -281.90447998046875, "loss": 0.0032, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.600315570831299, "rewards/margins": 16.638935089111328, "rewards/rejected": -20.239248275756836, "step": 420 }, { "epoch": 1.8162393162393162, "grad_norm": 0.0012359619140625, "learning_rate": 8.11222015903888e-06, "logits/chosen": -1.3423035144805908, "logits/rejected": -0.8135835528373718, "logps/chosen": -78.0848159790039, "logps/rejected": -287.83135986328125, "loss": 0.0047, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.862813949584961, "rewards/margins": 17.16622543334961, "rewards/rejected": -21.029037475585938, "step": 425 }, { "epoch": 1.8376068376068377, "grad_norm": 0.037353515625, "learning_rate": 7.981591002277265e-06, "logits/chosen": -1.3140472173690796, "logits/rejected": -0.8106688261032104, "logps/chosen": -77.36860656738281, "logps/rejected": -281.99664306640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.733330488204956, "rewards/margins": 16.830041885375977, "rewards/rejected": -20.563371658325195, "step": 430 }, { "epoch": 1.858974358974359, "grad_norm": 0.03515625, "learning_rate": 7.851859938035712e-06, "logits/chosen": -1.304713487625122, "logits/rejected": -0.7914744019508362, "logps/chosen": -78.41984558105469, "logps/rejected": -291.4969177246094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7893600463867188, "rewards/margins": 17.505069732666016, "rewards/rejected": -21.294429779052734, "step": 435 }, { "epoch": 1.8803418803418803, "grad_norm": 0.03271484375, "learning_rate": 7.723098913199118e-06, "logits/chosen": -1.3396222591400146, "logits/rejected": -0.834884524345398, "logps/chosen": -75.61878967285156, "logps/rejected": -278.4013366699219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.660414218902588, "rewards/margins": 16.659198760986328, "rewards/rejected": -20.31961441040039, "step": 440 }, { "epoch": 1.9017094017094016, "grad_norm": 0.015625, "learning_rate": 7.595379336683204e-06, "logits/chosen": -1.3091070652008057, "logits/rejected": -0.7569972276687622, "logps/chosen": -72.38371276855469, "logps/rejected": -287.745849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.400171995162964, "rewards/margins": 17.77309226989746, "rewards/rejected": -21.17326545715332, "step": 445 }, { "epoch": 1.9230769230769231, "grad_norm": 0.004638671875, "learning_rate": 7.468772039832218e-06, "logits/chosen": -1.2781813144683838, "logits/rejected": -0.7406023740768433, "logps/chosen": -68.62843322753906, "logps/rejected": -279.81103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0305323600769043, "rewards/margins": 17.346649169921875, "rewards/rejected": -20.37718391418457, "step": 450 }, { "epoch": 1.9444444444444444, "grad_norm": 0.0026092529296875, "learning_rate": 7.3433472371369404e-06, "logits/chosen": -1.349867820739746, "logits/rejected": -0.8363698720932007, "logps/chosen": -74.56883239746094, "logps/rejected": -285.27227783203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.506099224090576, "rewards/margins": 17.23251724243164, "rewards/rejected": -20.738616943359375, "step": 455 }, { "epoch": 1.965811965811966, "grad_norm": 0.00165557861328125, "learning_rate": 7.219174487294784e-06, "logits/chosen": -1.3465303182601929, "logits/rejected": -0.8472278714179993, "logps/chosen": -71.95396423339844, "logps/rejected": -288.6927795410156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.223040819168091, "rewards/margins": 17.83417510986328, "rewards/rejected": -21.05721664428711, "step": 460 }, { "epoch": 1.9871794871794872, "grad_norm": 0.003509521484375, "learning_rate": 7.0963226546336e-06, "logits/chosen": -1.3585379123687744, "logits/rejected": -0.8536975979804993, "logps/chosen": -72.73930358886719, "logps/rejected": -283.37579345703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.333233594894409, "rewards/margins": 17.287670135498047, "rewards/rejected": -20.62090492248535, "step": 465 }, { "epoch": 1.9914529914529915, "eval_logits/chosen": -1.3740124702453613, "eval_logits/rejected": -0.8455994129180908, "eval_logps/chosen": -73.04570007324219, "eval_logps/rejected": -284.90460205078125, "eval_loss": 0.00021937819838058203, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.360196590423584, "eval_rewards/margins": 17.328575134277344, "eval_rewards/rejected": -20.688772201538086, "eval_runtime": 9.5669, "eval_samples_per_second": 20.905, "eval_steps_per_second": 20.905, "step": 466 }, { "epoch": 2.0085470085470085, "grad_norm": 0.038818359375, "learning_rate": 6.974859870920561e-06, "logits/chosen": -1.2795295715332031, "logits/rejected": -0.8111523389816284, "logps/chosen": -75.71898651123047, "logps/rejected": -279.89599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6148743629455566, "rewards/margins": 16.733488082885742, "rewards/rejected": -20.348360061645508, "step": 470 }, { "epoch": 2.02991452991453, "grad_norm": 0.006195068359375, "learning_rate": 6.8548534975773135e-06, "logits/chosen": -1.3317922353744507, "logits/rejected": -0.8281890153884888, "logps/chosen": -75.41677856445312, "logps/rejected": -286.36761474609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4711709022521973, "rewards/margins": 17.307971954345703, "rewards/rejected": -20.779144287109375, "step": 475 }, { "epoch": 2.051282051282051, "grad_norm": 0.341796875, "learning_rate": 6.736370088322359e-06, "logits/chosen": -1.3174855709075928, "logits/rejected": -0.7978845238685608, "logps/chosen": -74.10897064208984, "logps/rejected": -283.9443664550781, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.4749629497528076, "rewards/margins": 17.323604583740234, "rewards/rejected": -20.798566818237305, "step": 480 }, { "epoch": 2.072649572649573, "grad_norm": 0.007293701171875, "learning_rate": 6.619475352261356e-06, "logits/chosen": -1.3000952005386353, "logits/rejected": -0.8089855909347534, "logps/chosen": -78.87946319580078, "logps/rejected": -287.0957336425781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6867222785949707, "rewards/margins": 17.182767868041992, "rewards/rejected": -20.869489669799805, "step": 485 }, { "epoch": 2.094017094017094, "grad_norm": 0.002105712890625, "learning_rate": 6.504234117445857e-06, "logits/chosen": -1.3139859437942505, "logits/rejected": -0.8154487609863281, "logps/chosen": -74.31788635253906, "logps/rejected": -284.235107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5273959636688232, "rewards/margins": 17.246944427490234, "rewards/rejected": -20.774341583251953, "step": 490 }, { "epoch": 2.1153846153846154, "grad_norm": 0.0240478515625, "learning_rate": 6.39071029492065e-06, "logits/chosen": -1.2831732034683228, "logits/rejected": -0.7532753348350525, "logps/chosen": -73.73322296142578, "logps/rejected": -282.9715270996094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.508289337158203, "rewards/margins": 17.217966079711914, "rewards/rejected": -20.726253509521484, "step": 495 }, { "epoch": 2.1367521367521367, "grad_norm": 0.162109375, "learning_rate": 6.2789668432796535e-06, "logits/chosen": -1.2966052293777466, "logits/rejected": -0.8182178735733032, "logps/chosen": -75.21055603027344, "logps/rejected": -284.99566650390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.6024773120880127, "rewards/margins": 17.332172393798828, "rewards/rejected": -20.934650421142578, "step": 500 }, { "epoch": 2.1367521367521367, "eval_logits/chosen": -1.3704440593719482, "eval_logits/rejected": -0.8410933017730713, "eval_logps/chosen": -73.29509735107422, "eval_logps/rejected": -285.1749572753906, "eval_loss": 0.00022948597325012088, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.385136365890503, "eval_rewards/margins": 17.330673217773438, "eval_rewards/rejected": -20.715810775756836, "eval_runtime": 9.5382, "eval_samples_per_second": 20.968, "eval_steps_per_second": 20.968, "step": 500 }, { "epoch": 2.158119658119658, "grad_norm": 0.009033203125, "learning_rate": 6.16906573375004e-06, "logits/chosen": -1.3252205848693848, "logits/rejected": -0.8390571475028992, "logps/chosen": -74.536376953125, "logps/rejected": -282.44195556640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4960930347442627, "rewards/margins": 17.042308807373047, "rewards/rejected": -20.538402557373047, "step": 505 }, { "epoch": 2.1794871794871793, "grad_norm": 0.00933837890625, "learning_rate": 6.061067915823923e-06, "logits/chosen": -1.2685729265213013, "logits/rejected": -0.7679704427719116, "logps/chosen": -72.36498260498047, "logps/rejected": -282.66351318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.318829298019409, "rewards/margins": 17.183773040771484, "rewards/rejected": -20.50260353088379, "step": 510 }, { "epoch": 2.200854700854701, "grad_norm": 0.0196533203125, "learning_rate": 5.955033283456711e-06, "logits/chosen": -1.2974504232406616, "logits/rejected": -0.7774112820625305, "logps/chosen": -78.15269470214844, "logps/rejected": -293.7340393066406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7473702430725098, "rewards/margins": 17.710281372070312, "rewards/rejected": -21.457651138305664, "step": 515 }, { "epoch": 2.2222222222222223, "grad_norm": 0.0023040771484375, "learning_rate": 5.8510206418507914e-06, "logits/chosen": -1.3559068441390991, "logits/rejected": -0.8591842651367188, "logps/chosen": -77.19640350341797, "logps/rejected": -300.4443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6723015308380127, "rewards/margins": 18.35979461669922, "rewards/rejected": -22.03209686279297, "step": 520 }, { "epoch": 2.2435897435897436, "grad_norm": 0.01385498046875, "learning_rate": 5.749087674843095e-06, "logits/chosen": -1.2999016046524048, "logits/rejected": -0.8241308927536011, "logps/chosen": -70.76306915283203, "logps/rejected": -283.5411376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.232194423675537, "rewards/margins": 17.38874626159668, "rewards/rejected": -20.620941162109375, "step": 525 }, { "epoch": 2.264957264957265, "grad_norm": 0.0264892578125, "learning_rate": 5.649290912914482e-06, "logits/chosen": -1.3043696880340576, "logits/rejected": -0.8295344114303589, "logps/chosen": -79.15299224853516, "logps/rejected": -298.5386962890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7782886028289795, "rewards/margins": 17.993247985839844, "rewards/rejected": -21.771535873413086, "step": 530 }, { "epoch": 2.286324786324786, "grad_norm": 0.04931640625, "learning_rate": 5.5516857018388144e-06, "logits/chosen": -1.355273962020874, "logits/rejected": -0.8746377229690552, "logps/chosen": -74.15048217773438, "logps/rejected": -281.2100524902344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.4310333728790283, "rewards/margins": 16.96903419494629, "rewards/rejected": -20.400066375732422, "step": 535 }, { "epoch": 2.3076923076923075, "grad_norm": 0.01397705078125, "learning_rate": 5.456326171989005e-06, "logits/chosen": -1.3123310804367065, "logits/rejected": -0.840388298034668, "logps/chosen": -71.68992614746094, "logps/rejected": -300.37091064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.202878475189209, "rewards/margins": 18.644420623779297, "rewards/rejected": -21.84729766845703, "step": 540 }, { "epoch": 2.3290598290598292, "grad_norm": 0.0027618408203125, "learning_rate": 5.363265208317156e-06, "logits/chosen": -1.2788275480270386, "logits/rejected": -0.8199743032455444, "logps/chosen": -73.48957824707031, "logps/rejected": -281.1923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.415186643600464, "rewards/margins": 17.106266021728516, "rewards/rejected": -20.52145004272461, "step": 545 }, { "epoch": 2.3504273504273505, "grad_norm": 0.01055908203125, "learning_rate": 5.272554421025347e-06, "logits/chosen": -1.3188756704330444, "logits/rejected": -0.8151782751083374, "logps/chosen": -74.64764404296875, "logps/rejected": -291.92108154296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5020947456359863, "rewards/margins": 17.861783981323242, "rewards/rejected": -21.36387825012207, "step": 550 }, { "epoch": 2.371794871794872, "grad_norm": 0.05078125, "learning_rate": 5.184244116943411e-06, "logits/chosen": -1.3126680850982666, "logits/rejected": -0.8074380159378052, "logps/chosen": -73.89201354980469, "logps/rejected": -285.258056640625, "loss": 0.003, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.3727481365203857, "rewards/margins": 17.37813377380371, "rewards/rejected": -20.75088119506836, "step": 555 }, { "epoch": 2.393162393162393, "grad_norm": 0.00604248046875, "learning_rate": 5.098383271629512e-06, "logits/chosen": -1.3314543962478638, "logits/rejected": -0.8163145184516907, "logps/chosen": -73.50102233886719, "logps/rejected": -279.52532958984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.414262294769287, "rewards/margins": 16.943660736083984, "rewards/rejected": -20.357921600341797, "step": 560 }, { "epoch": 2.4145299145299144, "grad_norm": 0.00311279296875, "learning_rate": 5.015019502209056e-06, "logits/chosen": -1.3196806907653809, "logits/rejected": -0.8105131387710571, "logps/chosen": -72.18685913085938, "logps/rejected": -275.95123291015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.361246109008789, "rewards/margins": 16.643529891967773, "rewards/rejected": -20.004776000976562, "step": 565 }, { "epoch": 2.435897435897436, "grad_norm": 0.000823974609375, "learning_rate": 4.934199040966955e-06, "logits/chosen": -1.3401740789413452, "logits/rejected": -0.8449984788894653, "logps/chosen": -73.99894714355469, "logps/rejected": -279.31915283203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.482757568359375, "rewards/margins": 16.8636531829834, "rewards/rejected": -20.346412658691406, "step": 570 }, { "epoch": 2.4572649572649574, "grad_norm": 0.03271484375, "learning_rate": 4.855966709707881e-06, "logits/chosen": -1.308977484703064, "logits/rejected": -0.8370776176452637, "logps/chosen": -77.53469848632812, "logps/rejected": -285.58367919921875, "loss": 0.0029, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.7107481956481934, "rewards/margins": 17.121841430664062, "rewards/rejected": -20.832592010498047, "step": 575 }, { "epoch": 2.4786324786324787, "grad_norm": 0.0299072265625, "learning_rate": 4.780365894898799e-06, "logits/chosen": -1.3271667957305908, "logits/rejected": -0.8259018063545227, "logps/chosen": -74.68269348144531, "logps/rejected": -287.12078857421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4662697315216064, "rewards/margins": 17.42196273803711, "rewards/rejected": -20.888233184814453, "step": 580 }, { "epoch": 2.5, "grad_norm": 0.00921630859375, "learning_rate": 4.7074385236074684e-06, "logits/chosen": -1.3541457653045654, "logits/rejected": -0.8319869041442871, "logps/chosen": -78.34286499023438, "logps/rejected": -294.329833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6850643157958984, "rewards/margins": 17.795787811279297, "rewards/rejected": -21.480854034423828, "step": 585 }, { "epoch": 2.5213675213675213, "grad_norm": 0.052001953125, "learning_rate": 4.63722504025034e-06, "logits/chosen": -1.3320066928863525, "logits/rejected": -0.8415569067001343, "logps/chosen": -71.98558044433594, "logps/rejected": -284.99346923828125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -3.249178647994995, "rewards/margins": 17.486406326293945, "rewards/rejected": -20.735586166381836, "step": 590 }, { "epoch": 2.5427350427350426, "grad_norm": 0.01025390625, "learning_rate": 4.569764384162676e-06, "logits/chosen": -1.3463108539581299, "logits/rejected": -0.8353781700134277, "logps/chosen": -68.05410766601562, "logps/rejected": -284.91461181640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9681944847106934, "rewards/margins": 17.779176712036133, "rewards/rejected": -20.74736976623535, "step": 595 }, { "epoch": 2.564102564102564, "grad_norm": 0.01422119140625, "learning_rate": 4.50509396800341e-06, "logits/chosen": -1.2894313335418701, "logits/rejected": -0.7797183990478516, "logps/chosen": -72.90419006347656, "logps/rejected": -285.7866516113281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3224644660949707, "rewards/margins": 17.589801788330078, "rewards/rejected": -20.91226577758789, "step": 600 }, { "epoch": 2.5854700854700856, "grad_norm": 0.0247802734375, "learning_rate": 4.443249657006627e-06, "logits/chosen": -1.2982523441314697, "logits/rejected": -0.7844825983047485, "logps/chosen": -70.4168472290039, "logps/rejected": -290.3528747558594, "loss": 0.0029, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.1497464179992676, "rewards/margins": 18.14370346069336, "rewards/rejected": -21.293447494506836, "step": 605 }, { "epoch": 2.606837606837607, "grad_norm": 0.0164794921875, "learning_rate": 4.384265749091266e-06, "logits/chosen": -1.2762781381607056, "logits/rejected": -0.7862453460693359, "logps/chosen": -78.71661376953125, "logps/rejected": -288.5256042480469, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7849228382110596, "rewards/margins": 17.296167373657227, "rewards/rejected": -21.081090927124023, "step": 610 }, { "epoch": 2.628205128205128, "grad_norm": 0.00482177734375, "learning_rate": 4.328174955840002e-06, "logits/chosen": -1.2989494800567627, "logits/rejected": -0.791740357875824, "logps/chosen": -68.19273376464844, "logps/rejected": -283.67010498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0757412910461426, "rewards/margins": 17.688796997070312, "rewards/rejected": -20.764535903930664, "step": 615 }, { "epoch": 2.6495726495726495, "grad_norm": 0.0026092529296875, "learning_rate": 4.275008384357902e-06, "logits/chosen": -1.3389320373535156, "logits/rejected": -0.8387205004692078, "logps/chosen": -72.53665924072266, "logps/rejected": -283.5371398925781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2912750244140625, "rewards/margins": 17.413835525512695, "rewards/rejected": -20.705108642578125, "step": 620 }, { "epoch": 2.6709401709401708, "grad_norm": 0.0028533935546875, "learning_rate": 4.224795520020898e-06, "logits/chosen": -1.2840917110443115, "logits/rejected": -0.7634187936782837, "logps/chosen": -75.63284301757812, "logps/rejected": -282.2013244628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4395980834960938, "rewards/margins": 16.99862289428711, "rewards/rejected": -20.438220977783203, "step": 625 }, { "epoch": 2.6923076923076925, "grad_norm": 0.01446533203125, "learning_rate": 4.177564210123634e-06, "logits/chosen": -1.32615327835083, "logits/rejected": -0.8317953944206238, "logps/chosen": -72.77333068847656, "logps/rejected": -291.9281921386719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.310988664627075, "rewards/margins": 17.982282638549805, "rewards/rejected": -21.293270111083984, "step": 630 }, { "epoch": 2.713675213675214, "grad_norm": 0.00921630859375, "learning_rate": 4.133340648435789e-06, "logits/chosen": -1.3119795322418213, "logits/rejected": -0.7846705913543701, "logps/chosen": -74.24435424804688, "logps/rejected": -289.03887939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4849464893341064, "rewards/margins": 17.662567138671875, "rewards/rejected": -21.147512435913086, "step": 635 }, { "epoch": 2.735042735042735, "grad_norm": 0.005828857421875, "learning_rate": 4.092149360675402e-06, "logits/chosen": -1.2881155014038086, "logits/rejected": -0.8033782243728638, "logps/chosen": -79.22930908203125, "logps/rejected": -294.1476745605469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7118358612060547, "rewards/margins": 17.629985809326172, "rewards/rejected": -21.341821670532227, "step": 640 }, { "epoch": 2.7564102564102564, "grad_norm": 0.019775390625, "learning_rate": 4.054013190907282e-06, "logits/chosen": -1.2686903476715088, "logits/rejected": -0.7805891633033752, "logps/chosen": -70.22049713134766, "logps/rejected": -284.9203796386719, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1786301136016846, "rewards/margins": 17.73773956298828, "rewards/rejected": -20.916370391845703, "step": 645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.004241943359375, "learning_rate": 4.018953288874035e-06, "logits/chosen": -1.3032779693603516, "logits/rejected": -0.8233755230903625, "logps/chosen": -74.59503936767578, "logps/rejected": -288.961669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4693355560302734, "rewards/margins": 17.675161361694336, "rewards/rejected": -21.14449691772461, "step": 650 }, { "epoch": 2.799145299145299, "grad_norm": 0.0040283203125, "learning_rate": 3.9869890982667385e-06, "logits/chosen": -1.310773491859436, "logits/rejected": -0.7524069547653198, "logps/chosen": -72.80381774902344, "logps/rejected": -290.0320739746094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.345860004425049, "rewards/margins": 17.688016891479492, "rewards/rejected": -21.033876419067383, "step": 655 }, { "epoch": 2.8205128205128203, "grad_norm": 0.0189208984375, "learning_rate": 3.9581383459417625e-06, "logits/chosen": -1.291512370109558, "logits/rejected": -0.790591299533844, "logps/chosen": -78.11724853515625, "logps/rejected": -297.84283447265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7806007862091064, "rewards/margins": 18.05144500732422, "rewards/rejected": -21.832046508789062, "step": 660 }, { "epoch": 2.841880341880342, "grad_norm": 0.00494384765625, "learning_rate": 3.932417032089722e-06, "logits/chosen": -1.3292133808135986, "logits/rejected": -0.8189595937728882, "logps/chosen": -76.23522186279297, "logps/rejected": -293.2325744628906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.59516978263855, "rewards/margins": 17.823274612426758, "rewards/rejected": -21.418445587158203, "step": 665 }, { "epoch": 2.8632478632478633, "grad_norm": 0.0029754638671875, "learning_rate": 3.909839421362017e-06, "logits/chosen": -1.2779964208602905, "logits/rejected": -0.7794166803359985, "logps/chosen": -74.2072982788086, "logps/rejected": -289.36956787109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.46783185005188, "rewards/margins": 17.554330825805664, "rewards/rejected": -21.02216148376465, "step": 670 }, { "epoch": 2.8846153846153846, "grad_norm": 0.038818359375, "learning_rate": 3.890418034959871e-06, "logits/chosen": -1.2737759351730347, "logits/rejected": -0.7384223937988281, "logps/chosen": -73.76658630371094, "logps/rejected": -286.06195068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.443812847137451, "rewards/margins": 17.49923324584961, "rewards/rejected": -20.943044662475586, "step": 675 }, { "epoch": 2.905982905982906, "grad_norm": 0.06884765625, "learning_rate": 3.874163643690263e-06, "logits/chosen": -1.255707025527954, "logits/rejected": -0.7339369654655457, "logps/chosen": -80.28028869628906, "logps/rejected": -291.90179443359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9126086235046387, "rewards/margins": 17.456329345703125, "rewards/rejected": -21.368938446044922, "step": 680 }, { "epoch": 2.9273504273504276, "grad_norm": 0.04296875, "learning_rate": 3.861085261992599e-06, "logits/chosen": -1.306028127670288, "logits/rejected": -0.8490394353866577, "logps/chosen": -78.53582763671875, "logps/rejected": -289.38330078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.830167770385742, "rewards/margins": 17.217695236206055, "rewards/rejected": -21.047863006591797, "step": 685 }, { "epoch": 2.948717948717949, "grad_norm": 0.12255859375, "learning_rate": 3.851190142939442e-06, "logits/chosen": -1.2999136447906494, "logits/rejected": -0.8062965273857117, "logps/chosen": -72.80134582519531, "logps/rejected": -288.02532958984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2750275135040283, "rewards/margins": 17.672157287597656, "rewards/rejected": -20.94718360900879, "step": 690 }, { "epoch": 2.97008547008547, "grad_norm": 0.001434326171875, "learning_rate": 3.844483774214069e-06, "logits/chosen": -1.2856634855270386, "logits/rejected": -0.7375695705413818, "logps/chosen": -72.40345764160156, "logps/rejected": -288.3551940917969, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.333986759185791, "rewards/margins": 17.785888671875, "rewards/rejected": -21.119874954223633, "step": 695 }, { "epoch": 2.9871794871794872, "eval_logits/chosen": -1.3524901866912842, "eval_logits/rejected": -0.8189607262611389, "eval_logps/chosen": -73.21478271484375, "eval_logps/rejected": -285.69122314453125, "eval_loss": 0.00020370040147099644, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.377105474472046, "eval_rewards/margins": 17.390329360961914, "eval_rewards/rejected": -20.767436981201172, "eval_runtime": 9.5425, "eval_samples_per_second": 20.959, "eval_steps_per_second": 20.959, "step": 699 } ], "logging_steps": 5, "max_steps": 702, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }