EricLabile's picture
Model save
b9c1930 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9976019184652278,
"eval_steps": 500,
"global_step": 208,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 912.9860992431641,
"epoch": 0.004796163069544364,
"grad_norm": 0.12673589773378027,
"kl": 0.0,
"learning_rate": 4.7619047619047613e-08,
"loss": 0.0232,
"reward": 0.6874999850988388,
"reward_std": 0.3423890396952629,
"rewards/accuracy_reward": 0.2500000037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 883.1944580078125,
"epoch": 0.009592326139088728,
"grad_norm": 0.116076838638148,
"kl": 0.0,
"learning_rate": 9.523809523809523e-08,
"loss": 0.0136,
"reward": 0.6423611119389534,
"reward_std": 0.3120992071926594,
"rewards/accuracy_reward": 0.243055559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3993055522441864,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 851.4652862548828,
"epoch": 0.014388489208633094,
"grad_norm": 0.12982224030488893,
"kl": 2.41696834564209e-05,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0211,
"reward": 0.7118055671453476,
"reward_std": 0.3277251161634922,
"rewards/accuracy_reward": 0.2708333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4409722238779068,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 877.0138854980469,
"epoch": 0.019184652278177457,
"grad_norm": 0.12194604446708072,
"kl": 2.086162567138672e-05,
"learning_rate": 1.9047619047619045e-07,
"loss": 0.0285,
"reward": 0.6909722238779068,
"reward_std": 0.31570543721318245,
"rewards/accuracy_reward": 0.2847222248092294,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.40625,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 884.4236297607422,
"epoch": 0.023980815347721823,
"grad_norm": 0.13261607584021887,
"kl": 3.56137752532959e-05,
"learning_rate": 2.3809523809523806e-07,
"loss": 0.0312,
"reward": 0.626736119389534,
"reward_std": 0.2724486030638218,
"rewards/accuracy_reward": 0.23611111007630825,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3906250074505806,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 825.4166870117188,
"epoch": 0.02877697841726619,
"grad_norm": 0.12101637274777749,
"kl": 2.5272369384765625e-05,
"learning_rate": 2.857142857142857e-07,
"loss": 0.022,
"reward": 0.817708358168602,
"reward_std": 0.3249164782464504,
"rewards/accuracy_reward": 0.361111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4565972313284874,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 898.826416015625,
"epoch": 0.03357314148681055,
"grad_norm": 0.13410726618357155,
"kl": 3.2961368560791016e-05,
"learning_rate": 3.333333333333333e-07,
"loss": 0.017,
"reward": 0.6961805522441864,
"reward_std": 0.281472560018301,
"rewards/accuracy_reward": 0.2916666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4045138880610466,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 868.3472290039062,
"epoch": 0.03836930455635491,
"grad_norm": 0.14428783414562169,
"kl": 3.2767653465270996e-05,
"learning_rate": 3.809523809523809e-07,
"loss": 0.0364,
"reward": 0.7777777761220932,
"reward_std": 0.38908588513731956,
"rewards/accuracy_reward": 0.34722222574055195,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.430555559694767,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 922.3125152587891,
"epoch": 0.04316546762589928,
"grad_norm": 0.14202941889032566,
"kl": 3.263354301452637e-05,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0361,
"reward": 0.524305559694767,
"reward_std": 0.23212899640202522,
"rewards/accuracy_reward": 0.15972222574055195,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3645833432674408,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 914.1597290039062,
"epoch": 0.047961630695443645,
"grad_norm": 0.12987837517343923,
"kl": 3.269314765930176e-05,
"learning_rate": 4.761904761904761e-07,
"loss": 0.0316,
"reward": 0.5885416567325592,
"reward_std": 0.2799038216471672,
"rewards/accuracy_reward": 0.19444444822147489,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3940972238779068,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 935.5347137451172,
"epoch": 0.05275779376498801,
"grad_norm": 0.12123557233064179,
"kl": 2.6702880859375e-05,
"learning_rate": 5.238095238095238e-07,
"loss": 0.0268,
"reward": 0.579861119389534,
"reward_std": 0.2995072081685066,
"rewards/accuracy_reward": 0.20833333395421505,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3715277761220932,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 902.8472290039062,
"epoch": 0.05755395683453238,
"grad_norm": 0.1263644550467175,
"kl": 1.638941466808319e-05,
"learning_rate": 5.714285714285714e-07,
"loss": 0.0182,
"reward": 0.5815972238779068,
"reward_std": 0.26795749366283417,
"rewards/accuracy_reward": 0.18055555410683155,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.401041679084301,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 910.9583435058594,
"epoch": 0.06235011990407674,
"grad_norm": 0.13348819214381502,
"kl": 3.2275915145874023e-05,
"learning_rate": 6.19047619047619e-07,
"loss": 0.033,
"reward": 0.5833333507180214,
"reward_std": 0.27443326637148857,
"rewards/accuracy_reward": 0.18750000186264515,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3958333432674408,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 837.5764007568359,
"epoch": 0.0671462829736211,
"grad_norm": 0.12534297846344097,
"kl": 2.5704503059387207e-05,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0398,
"reward": 0.6614583432674408,
"reward_std": 0.25699039548635483,
"rewards/accuracy_reward": 0.2291666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4322916716337204,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 870.8958435058594,
"epoch": 0.07194244604316546,
"grad_norm": 0.1178768584334791,
"kl": 1.9222497940063477e-05,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0223,
"reward": 0.6701388955116272,
"reward_std": 0.25698356330394745,
"rewards/accuracy_reward": 0.25694444589316845,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4131944477558136,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 855.7222137451172,
"epoch": 0.07673860911270983,
"grad_norm": 0.15574941119385063,
"kl": 2.9087066650390625e-05,
"learning_rate": 7.619047619047618e-07,
"loss": 0.0337,
"reward": 0.7760416865348816,
"reward_std": 0.4277946427464485,
"rewards/accuracy_reward": 0.3194444440305233,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4565972313284874,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 869.6944427490234,
"epoch": 0.0815347721822542,
"grad_norm": 0.1258833956177519,
"kl": 3.331899642944336e-05,
"learning_rate": 8.095238095238095e-07,
"loss": 0.0129,
"reward": 0.6701388955116272,
"reward_std": 0.30363673344254494,
"rewards/accuracy_reward": 0.2847222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3854166716337204,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 874.4097137451172,
"epoch": 0.08633093525179857,
"grad_norm": 0.1315962441453753,
"kl": 2.0717590814456344e-05,
"learning_rate": 8.57142857142857e-07,
"loss": 0.0301,
"reward": 0.6163194552063942,
"reward_std": 0.2553598415106535,
"rewards/accuracy_reward": 0.21527777705341578,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4010416716337204,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 815.0555572509766,
"epoch": 0.09112709832134293,
"grad_norm": 0.14298620074456062,
"kl": 3.075599670410156e-05,
"learning_rate": 9.047619047619047e-07,
"loss": 0.0405,
"reward": 0.763888880610466,
"reward_std": 0.2983681969344616,
"rewards/accuracy_reward": 0.3263888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375000074505806,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 862.4861145019531,
"epoch": 0.09592326139088729,
"grad_norm": 0.13958983684341053,
"kl": 3.007054328918457e-05,
"learning_rate": 9.523809523809522e-07,
"loss": 0.0179,
"reward": 0.6788194626569748,
"reward_std": 0.2541828490793705,
"rewards/accuracy_reward": 0.24305555783212185,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4357638880610466,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 891.1597290039062,
"epoch": 0.10071942446043165,
"grad_norm": 0.14460083396808562,
"kl": 5.367398262023926e-05,
"learning_rate": 1e-06,
"loss": 0.0206,
"reward": 0.6215277761220932,
"reward_std": 0.2927175499498844,
"rewards/accuracy_reward": 0.22222222574055195,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.399305559694767,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 859.0,
"epoch": 0.10551558752997602,
"grad_norm": 0.13266204171177207,
"kl": 7.867813110351562e-05,
"learning_rate": 9.999364977905849e-07,
"loss": 0.0212,
"reward": 0.7864583432674408,
"reward_std": 0.329488068819046,
"rewards/accuracy_reward": 0.3472222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4392361119389534,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 841.5069427490234,
"epoch": 0.11031175059952038,
"grad_norm": 0.1368070463424762,
"kl": 0.00011658668518066406,
"learning_rate": 9.99746009084698e-07,
"loss": 0.0389,
"reward": 0.7725694477558136,
"reward_std": 0.3147674612700939,
"rewards/accuracy_reward": 0.3402777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.432291679084301,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 887.5139007568359,
"epoch": 0.11510791366906475,
"grad_norm": 0.1458860079179632,
"kl": 0.00013267993927001953,
"learning_rate": 9.994285876443557e-07,
"loss": 0.0341,
"reward": 0.626736119389534,
"reward_std": 0.2819124907255173,
"rewards/accuracy_reward": 0.2361111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3906250074505806,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 873.8611145019531,
"epoch": 0.11990407673860912,
"grad_norm": 0.130318186151234,
"kl": 0.00016427040100097656,
"learning_rate": 9.989843230560593e-07,
"loss": 0.0384,
"reward": 0.6493055671453476,
"reward_std": 0.28573132678866386,
"rewards/accuracy_reward": 0.22222222574055195,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4270833358168602,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 820.6111145019531,
"epoch": 0.12470023980815348,
"grad_norm": 0.1353493867926934,
"kl": 0.00028514862060546875,
"learning_rate": 9.984133407055104e-07,
"loss": 0.0057,
"reward": 0.7204861342906952,
"reward_std": 0.27508755773305893,
"rewards/accuracy_reward": 0.2777777798473835,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4427083358168602,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 903.1041717529297,
"epoch": 0.12949640287769784,
"grad_norm": 0.13147174706101974,
"kl": 0.0002446174621582031,
"learning_rate": 9.97715801742224e-07,
"loss": 0.043,
"reward": 0.6232638880610466,
"reward_std": 0.2895628921687603,
"rewards/accuracy_reward": 0.2361111156642437,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3871527835726738,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 870.1736145019531,
"epoch": 0.1342925659472422,
"grad_norm": 0.12627875749194337,
"kl": 0.0002665519714355469,
"learning_rate": 9.968919030340457e-07,
"loss": 0.0277,
"reward": 0.756944477558136,
"reward_std": 0.3117631673812866,
"rewards/accuracy_reward": 0.3194444477558136,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 896.8680725097656,
"epoch": 0.13908872901678657,
"grad_norm": 0.13847019160477245,
"kl": 0.00043702125549316406,
"learning_rate": 9.959418771115903e-07,
"loss": 0.0286,
"reward": 0.5954861268401146,
"reward_std": 0.268420971930027,
"rewards/accuracy_reward": 0.180555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.414930559694767,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 802.3194580078125,
"epoch": 0.14388489208633093,
"grad_norm": 0.14967482022344253,
"kl": 0.0006508827209472656,
"learning_rate": 9.948659921026139e-07,
"loss": 0.0318,
"reward": 0.8125000149011612,
"reward_std": 0.4278785213828087,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4583333283662796,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 794.6875,
"epoch": 0.1486810551558753,
"grad_norm": 0.17944099601126384,
"kl": 0.0007243156433105469,
"learning_rate": 9.936645516563387e-07,
"loss": 0.0583,
"reward": 0.8663194477558136,
"reward_std": 0.35557055473327637,
"rewards/accuracy_reward": 0.4027777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4635416716337204,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 880.7361145019531,
"epoch": 0.15347721822541965,
"grad_norm": 0.12117489678150584,
"kl": 0.0006771087646484375,
"learning_rate": 9.923378948577558e-07,
"loss": 0.0401,
"reward": 0.6406250074505806,
"reward_std": 0.26150013506412506,
"rewards/accuracy_reward": 0.23611111473292112,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4045138880610466,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 834.5555725097656,
"epoch": 0.15827338129496402,
"grad_norm": 0.1537599572741959,
"kl": 0.00096893310546875,
"learning_rate": 9.908863961319219e-07,
"loss": 0.0342,
"reward": 0.861111119389534,
"reward_std": 0.37204235792160034,
"rewards/accuracy_reward": 0.3958333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4652777835726738,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 850.9791870117188,
"epoch": 0.1630695443645084,
"grad_norm": 0.1605541412094071,
"kl": 0.0012502670288085938,
"learning_rate": 9.893104651382861e-07,
"loss": 0.055,
"reward": 0.8003472238779068,
"reward_std": 0.3042585700750351,
"rewards/accuracy_reward": 0.3819444514811039,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4184027835726738,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 851.0069580078125,
"epoch": 0.16786570743405277,
"grad_norm": 0.13513924603619376,
"kl": 0.001659393310546875,
"learning_rate": 9.876105466550707e-07,
"loss": 0.0509,
"reward": 0.7881944477558136,
"reward_std": 0.3145363964140415,
"rewards/accuracy_reward": 0.3680555671453476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4201388880610466,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 878.7361297607422,
"epoch": 0.17266187050359713,
"grad_norm": 0.1542251680807794,
"kl": 0.0016040802001953125,
"learning_rate": 9.857871204537401e-07,
"loss": 0.0544,
"reward": 0.6944444477558136,
"reward_std": 0.2541184388101101,
"rewards/accuracy_reward": 0.305555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3888888955116272,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 830.6805572509766,
"epoch": 0.1774580335731415,
"grad_norm": 0.13514283713713865,
"kl": 0.0020580291748046875,
"learning_rate": 9.838407011635942e-07,
"loss": 0.0246,
"reward": 0.8350694626569748,
"reward_std": 0.29664015769958496,
"rewards/accuracy_reward": 0.3958333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4392361119389534,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 896.7152862548828,
"epoch": 0.18225419664268586,
"grad_norm": 0.12350709906557038,
"kl": 0.0016126632690429688,
"learning_rate": 9.817718381265238e-07,
"loss": 0.0437,
"reward": 0.6128472238779068,
"reward_std": 0.31635782122612,
"rewards/accuracy_reward": 0.2291666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3836805522441864,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 864.9444580078125,
"epoch": 0.18705035971223022,
"grad_norm": 0.12426066518400752,
"kl": 0.0020294189453125,
"learning_rate": 9.795811152419678e-07,
"loss": 0.0301,
"reward": 0.7291666716337204,
"reward_std": 0.24946986511349678,
"rewards/accuracy_reward": 0.3333333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.3958333358168602,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 800.3333282470703,
"epoch": 0.19184652278177458,
"grad_norm": 0.15357763591255594,
"kl": 0.0022525787353515625,
"learning_rate": 9.772691508021193e-07,
"loss": 0.042,
"reward": 0.8281250149011612,
"reward_std": 0.310004822909832,
"rewards/accuracy_reward": 0.3888888992369175,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4392361044883728,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 818.4653015136719,
"epoch": 0.19664268585131894,
"grad_norm": 0.14942346890583016,
"kl": 0.0027675628662109375,
"learning_rate": 9.748365973174227e-07,
"loss": 0.0492,
"reward": 0.8402778059244156,
"reward_std": 0.292511161416769,
"rewards/accuracy_reward": 0.375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4652777835726738,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 809.9514007568359,
"epoch": 0.2014388489208633,
"grad_norm": 0.15897417290331764,
"kl": 0.003749847412109375,
"learning_rate": 9.722841413324149e-07,
"loss": 0.0459,
"reward": 0.8593750149011612,
"reward_std": 0.3366158865392208,
"rewards/accuracy_reward": 0.4027777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4565972238779068,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 729.5972290039062,
"epoch": 0.20623501199040767,
"grad_norm": 0.176025608996368,
"kl": 0.002838134765625,
"learning_rate": 9.6961250323196e-07,
"loss": 0.0243,
"reward": 1.1562499850988388,
"reward_std": 0.33967938274145126,
"rewards/accuracy_reward": 0.6319444552063942,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5243055522441864,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 850.7222290039062,
"epoch": 0.21103117505995203,
"grad_norm": 0.11868611450690505,
"kl": 0.003307342529296875,
"learning_rate": 9.668224370379346e-07,
"loss": 0.0277,
"reward": 0.8246527910232544,
"reward_std": 0.2909863740205765,
"rewards/accuracy_reward": 0.4027777910232544,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4218750074505806,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 758.6111297607422,
"epoch": 0.2158273381294964,
"grad_norm": 0.15818031203002028,
"kl": 0.003475189208984375,
"learning_rate": 9.639147301964175e-07,
"loss": 0.0287,
"reward": 0.9652777910232544,
"reward_std": 0.301775723695755,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4861111119389534,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 838.8055725097656,
"epoch": 0.22062350119904076,
"grad_norm": 0.16002912577528097,
"kl": 0.004032135009765625,
"learning_rate": 9.608902033554475e-07,
"loss": 0.0429,
"reward": 0.7517361342906952,
"reward_std": 0.34651144593954086,
"rewards/accuracy_reward": 0.305555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.446180559694767,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 843.9861297607422,
"epoch": 0.22541966426858512,
"grad_norm": 0.1540903493857499,
"kl": 0.003780364990234375,
"learning_rate": 9.577497101334103e-07,
"loss": 0.0317,
"reward": 0.8159722238779068,
"reward_std": 0.3663570396602154,
"rewards/accuracy_reward": 0.3611111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.454861119389534,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 865.0486145019531,
"epoch": 0.2302158273381295,
"grad_norm": 0.14872925893794725,
"kl": 0.004302978515625,
"learning_rate": 9.544941368781208e-07,
"loss": 0.0514,
"reward": 0.7083333432674408,
"reward_std": 0.39198317378759384,
"rewards/accuracy_reward": 0.2777777798473835,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.430555559694767,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 808.4028015136719,
"epoch": 0.23501199040767387,
"grad_norm": 0.13499682062737456,
"kl": 0.004261016845703125,
"learning_rate": 9.51124402416666e-07,
"loss": 0.0296,
"reward": 0.8680555671453476,
"reward_std": 0.265322033315897,
"rewards/accuracy_reward": 0.4305555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 853.8611145019531,
"epoch": 0.23980815347721823,
"grad_norm": 0.16386389384814085,
"kl": 0.00469207763671875,
"learning_rate": 9.476414577960834e-07,
"loss": 0.0508,
"reward": 0.7951389104127884,
"reward_std": 0.33591291680932045,
"rewards/accuracy_reward": 0.3611111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4340277835726738,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 770.3402862548828,
"epoch": 0.2446043165467626,
"grad_norm": 0.17896254581630255,
"kl": 0.0062255859375,
"learning_rate": 9.440462860149451e-07,
"loss": 0.0483,
"reward": 0.8229166716337204,
"reward_std": 0.36454326659440994,
"rewards/accuracy_reward": 0.3749999962747097,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4479166716337204,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 839.0208435058594,
"epoch": 0.24940047961630696,
"grad_norm": 0.13648789584357146,
"kl": 0.00505828857421875,
"learning_rate": 9.403399017459234e-07,
"loss": 0.0323,
"reward": 0.8489583283662796,
"reward_std": 0.2952596992254257,
"rewards/accuracy_reward": 0.3958333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.453125,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 804.8263854980469,
"epoch": 0.2541966426858513,
"grad_norm": 0.1523526638090565,
"kl": 0.0061492919921875,
"learning_rate": 9.365233510494185e-07,
"loss": 0.0435,
"reward": 0.892361119389534,
"reward_std": 0.32454150170087814,
"rewards/accuracy_reward": 0.444444440305233,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4479166716337204,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 798.3125,
"epoch": 0.2589928057553957,
"grad_norm": 0.18724044885869282,
"kl": 0.0067901611328125,
"learning_rate": 9.325977110783263e-07,
"loss": 0.0222,
"reward": 0.9270833432674408,
"reward_std": 0.335986964404583,
"rewards/accuracy_reward": 0.4444444552063942,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.482638880610466,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 796.0347290039062,
"epoch": 0.2637889688249401,
"grad_norm": 0.15753263206218102,
"kl": 0.00702667236328125,
"learning_rate": 9.285640897740315e-07,
"loss": 0.0554,
"reward": 0.878472238779068,
"reward_std": 0.3221370540559292,
"rewards/accuracy_reward": 0.4097222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.46875,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 807.6458435058594,
"epoch": 0.2685851318944844,
"grad_norm": 0.1525585503261879,
"kl": 0.0070648193359375,
"learning_rate": 9.244236255537107e-07,
"loss": 0.0488,
"reward": 0.8350694477558136,
"reward_std": 0.25923068448901176,
"rewards/accuracy_reward": 0.3680555522441864,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4670139029622078,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 814.4583435058594,
"epoch": 0.2733812949640288,
"grad_norm": 0.15792440768758637,
"kl": 0.0050201416015625,
"learning_rate": 9.20177486989035e-07,
"loss": 0.0434,
"reward": 0.878472238779068,
"reward_std": 0.31662074103951454,
"rewards/accuracy_reward": 0.4166666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.461805559694767,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 816.2708435058594,
"epoch": 0.27817745803357313,
"grad_norm": 0.16159815697797508,
"kl": 0.0063934326171875,
"learning_rate": 9.158268724763614e-07,
"loss": 0.0424,
"reward": 0.894097238779068,
"reward_std": 0.3126923553645611,
"rewards/accuracy_reward": 0.4236111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.470486119389534,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 876.4583435058594,
"epoch": 0.2829736211031175,
"grad_norm": 0.1270858164567956,
"kl": 0.0079803466796875,
"learning_rate": 9.113730098985075e-07,
"loss": 0.0267,
"reward": 0.78125,
"reward_std": 0.2495138719677925,
"rewards/accuracy_reward": 0.3611111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4201388880610466,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 842.1111297607422,
"epoch": 0.28776978417266186,
"grad_norm": 0.14390666559593665,
"kl": 0.0057373046875,
"learning_rate": 9.068171562782021e-07,
"loss": 0.0467,
"reward": 0.8940972536802292,
"reward_std": 0.3231881149113178,
"rewards/accuracy_reward": 0.4305555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.463541679084301,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 816.5902862548828,
"epoch": 0.29256594724220625,
"grad_norm": 0.16728826541039396,
"kl": 0.00667572021484375,
"learning_rate": 9.021605974233152e-07,
"loss": 0.0724,
"reward": 0.989583358168602,
"reward_std": 0.36507341638207436,
"rewards/accuracy_reward": 0.4861111044883728,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5034722313284874,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 828.9861145019531,
"epoch": 0.2973621103117506,
"grad_norm": 0.15432237348385633,
"kl": 0.00737762451171875,
"learning_rate": 8.974046475639604e-07,
"loss": 0.0447,
"reward": 0.925347238779068,
"reward_std": 0.3722820319235325,
"rewards/accuracy_reward": 0.4513888880610466,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4739583432674408,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 810.2569427490234,
"epoch": 0.302158273381295,
"grad_norm": 0.1856747664961947,
"kl": 0.00745391845703125,
"learning_rate": 8.925506489815772e-07,
"loss": 0.0687,
"reward": 0.895833358168602,
"reward_std": 0.29615509510040283,
"rewards/accuracy_reward": 0.430555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4652777835726738,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 861.1111297607422,
"epoch": 0.3069544364508393,
"grad_norm": 0.13202082976554858,
"kl": 0.00617218017578125,
"learning_rate": 8.875999716300968e-07,
"loss": 0.0299,
"reward": 0.8020833432674408,
"reward_std": 0.3038054183125496,
"rewards/accuracy_reward": 0.3819444440305233,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4201388955116272,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 858.9097290039062,
"epoch": 0.3117505995203837,
"grad_norm": 0.152190266317737,
"kl": 0.00725555419921875,
"learning_rate": 8.825540127492965e-07,
"loss": 0.0571,
"reward": 0.7847222238779068,
"reward_std": 0.3564433120191097,
"rewards/accuracy_reward": 0.3472222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4375,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 798.2152862548828,
"epoch": 0.31654676258992803,
"grad_norm": 0.16383126534952586,
"kl": 0.00787353515625,
"learning_rate": 8.774141964704546e-07,
"loss": 0.0431,
"reward": 0.8836805671453476,
"reward_std": 0.29356446862220764,
"rewards/accuracy_reward": 0.4027777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4809027761220932,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 809.1597290039062,
"epoch": 0.3213429256594724,
"grad_norm": 0.17427922859293266,
"kl": 0.00984954833984375,
"learning_rate": 8.721819734144135e-07,
"loss": 0.0541,
"reward": 0.9930555671453476,
"reward_std": 0.36635252088308334,
"rewards/accuracy_reward": 0.4583333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5347222313284874,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 738.5833435058594,
"epoch": 0.3261390887290168,
"grad_norm": 0.16012047020291365,
"kl": 0.009185791015625,
"learning_rate": 8.668588202821706e-07,
"loss": 0.039,
"reward": 1.0850694477558136,
"reward_std": 0.23961883038282394,
"rewards/accuracy_reward": 0.5416666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5434027910232544,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 832.7847442626953,
"epoch": 0.33093525179856115,
"grad_norm": 0.15979089643431796,
"kl": 0.0091400146484375,
"learning_rate": 8.614462394381026e-07,
"loss": 0.0613,
"reward": 0.9340277761220932,
"reward_std": 0.3319687321782112,
"rewards/accuracy_reward": 0.4513888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4826388880610466,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 731.4444427490234,
"epoch": 0.33573141486810554,
"grad_norm": 0.2080530430054881,
"kl": 0.01006317138671875,
"learning_rate": 8.559457584859535e-07,
"loss": 0.0441,
"reward": 1.0954861342906952,
"reward_std": 0.42393119633197784,
"rewards/accuracy_reward": 0.5138888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5815972238779068,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 804.1528015136719,
"epoch": 0.3405275779376499,
"grad_norm": 0.16129162539469918,
"kl": 0.008331298828125,
"learning_rate": 8.503589298376931e-07,
"loss": 0.0347,
"reward": 0.9513888955116272,
"reward_std": 0.37669622898101807,
"rewards/accuracy_reward": 0.4583333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4930555671453476,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 851.0902862548828,
"epoch": 0.34532374100719426,
"grad_norm": 0.1421650821283864,
"kl": 0.00882720947265625,
"learning_rate": 8.446873302753783e-07,
"loss": 0.0403,
"reward": 0.892361119389534,
"reward_std": 0.2742934599518776,
"rewards/accuracy_reward": 0.423611119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4687500074505806,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 834.8611145019531,
"epoch": 0.3501199040767386,
"grad_norm": 0.14158911723554238,
"kl": 0.00821685791015625,
"learning_rate": 8.389325605061341e-07,
"loss": 0.0319,
"reward": 0.9305555820465088,
"reward_std": 0.2332368977367878,
"rewards/accuracy_reward": 0.4583333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4722222238779068,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 872.9097442626953,
"epoch": 0.354916067146283,
"grad_norm": 0.1521349105449586,
"kl": 0.0106048583984375,
"learning_rate": 8.330962447103829e-07,
"loss": 0.0301,
"reward": 0.8125000149011612,
"reward_std": 0.35327186062932014,
"rewards/accuracy_reward": 0.3402777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4722222164273262,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 727.3472290039062,
"epoch": 0.3597122302158273,
"grad_norm": 0.1648475576874547,
"kl": 0.0107269287109375,
"learning_rate": 8.271800300834486e-07,
"loss": 0.0719,
"reward": 1.1545138657093048,
"reward_std": 0.3374630883336067,
"rewards/accuracy_reward": 0.5763888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.578125,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 754.5485992431641,
"epoch": 0.3645083932853717,
"grad_norm": 0.15678609737006508,
"kl": 0.0106353759765625,
"learning_rate": 8.211855863706654e-07,
"loss": 0.0206,
"reward": 1.1302083283662796,
"reward_std": 0.3273175358772278,
"rewards/accuracy_reward": 0.5625000149011612,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5677083358168602,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 758.7639007568359,
"epoch": 0.36930455635491605,
"grad_norm": 0.14059085263342075,
"kl": 0.011688232421875,
"learning_rate": 8.151146053961217e-07,
"loss": 0.0247,
"reward": 1.038194477558136,
"reward_std": 0.24932898953557014,
"rewards/accuracy_reward": 0.5138888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5243055447936058,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 762.6527709960938,
"epoch": 0.37410071942446044,
"grad_norm": 0.16861004490817355,
"kl": 0.011260986328125,
"learning_rate": 8.089688005851745e-07,
"loss": 0.0374,
"reward": 1.09375,
"reward_std": 0.362262312322855,
"rewards/accuracy_reward": 0.5416666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5520833432674408,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 836.3055725097656,
"epoch": 0.37889688249400477,
"grad_norm": 0.13259854891508993,
"kl": 0.009613037109375,
"learning_rate": 8.02749906480864e-07,
"loss": 0.0224,
"reward": 0.9062500149011612,
"reward_std": 0.2787036634981632,
"rewards/accuracy_reward": 0.4097222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4965277835726738,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 779.7361297607422,
"epoch": 0.38369304556354916,
"grad_norm": 0.16767324138234957,
"kl": 0.0133819580078125,
"learning_rate": 7.964596782543716e-07,
"loss": 0.0572,
"reward": 1.0520833432674408,
"reward_std": 0.28914331272244453,
"rewards/accuracy_reward": 0.5277777910232544,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.524305559694767,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 767.7708435058594,
"epoch": 0.38848920863309355,
"grad_norm": 0.16351300117480583,
"kl": 0.01165771484375,
"learning_rate": 7.900998912096527e-07,
"loss": 0.0307,
"reward": 0.9444444477558136,
"reward_std": 0.3029831796884537,
"rewards/accuracy_reward": 0.4166666567325592,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5277777835726738,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 780.3541717529297,
"epoch": 0.3932853717026379,
"grad_norm": 0.1546912166828873,
"kl": 0.013702392578125,
"learning_rate": 7.836723402823913e-07,
"loss": 0.0369,
"reward": 1.0538194626569748,
"reward_std": 0.3243863359093666,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5538194477558136,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 808.9375,
"epoch": 0.3980815347721823,
"grad_norm": 0.14595312337300426,
"kl": 0.012359619140625,
"learning_rate": 7.771788395334094e-07,
"loss": 0.0399,
"reward": 0.918402761220932,
"reward_std": 0.2620566040277481,
"rewards/accuracy_reward": 0.416666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5017361119389534,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 767.1805419921875,
"epoch": 0.4028776978417266,
"grad_norm": 0.1382082777095037,
"kl": 0.0124969482421875,
"learning_rate": 7.706212216366819e-07,
"loss": 0.0237,
"reward": 0.9461805671453476,
"reward_std": 0.2770383469760418,
"rewards/accuracy_reward": 0.4444444514811039,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.501736119389534,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 773.5,
"epoch": 0.407673860911271,
"grad_norm": 0.17673710255712385,
"kl": 0.0130767822265625,
"learning_rate": 7.640013373620979e-07,
"loss": 0.0526,
"reward": 1.0694444626569748,
"reward_std": 0.36386215686798096,
"rewards/accuracy_reward": 0.548611119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5208333432674408,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 709.4583435058594,
"epoch": 0.41247002398081534,
"grad_norm": 0.2107698032340365,
"kl": 0.0170440673828125,
"learning_rate": 7.573210550531125e-07,
"loss": 0.0816,
"reward": 1.222222238779068,
"reward_std": 0.4683116003870964,
"rewards/accuracy_reward": 0.6041666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6180555671453476,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 834.0694427490234,
"epoch": 0.4172661870503597,
"grad_norm": 0.13070064074785792,
"kl": 0.012725830078125,
"learning_rate": 7.505822600994423e-07,
"loss": 0.0331,
"reward": 0.8090277761220932,
"reward_std": 0.2008717618882656,
"rewards/accuracy_reward": 0.3541666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.454861119389534,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 741.8402862548828,
"epoch": 0.42206235011990406,
"grad_norm": 0.20183514921907542,
"kl": 0.01544189453125,
"learning_rate": 7.437868544049463e-07,
"loss": 0.0421,
"reward": 0.8489583432674408,
"reward_std": 0.2780023626983166,
"rewards/accuracy_reward": 0.3472222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.501736119389534,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 826.2777862548828,
"epoch": 0.42685851318944845,
"grad_norm": 0.15298818073279,
"kl": 0.013885498046875,
"learning_rate": 7.36936755850849e-07,
"loss": 0.054,
"reward": 0.8472222238779068,
"reward_std": 0.22455434128642082,
"rewards/accuracy_reward": 0.3750000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4722222313284874,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 796.0555419921875,
"epoch": 0.4316546762589928,
"grad_norm": 0.18224911397155316,
"kl": 0.0146942138671875,
"learning_rate": 7.300338977544519e-07,
"loss": 0.0238,
"reward": 0.9600694477558136,
"reward_std": 0.36052028089761734,
"rewards/accuracy_reward": 0.4375000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5225694477558136,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 816.0208587646484,
"epoch": 0.4364508393285372,
"grad_norm": 0.19516033120759366,
"kl": 0.0160675048828125,
"learning_rate": 7.230802283234904e-07,
"loss": 0.0525,
"reward": 0.989583358168602,
"reward_std": 0.3490638807415962,
"rewards/accuracy_reward": 0.472222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5173611044883728,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 801.7291870117188,
"epoch": 0.4412470023980815,
"grad_norm": 0.14966073132580007,
"kl": 0.0144195556640625,
"learning_rate": 7.160777101062865e-07,
"loss": 0.0341,
"reward": 1.0225694626569748,
"reward_std": 0.337300319224596,
"rewards/accuracy_reward": 0.4791666567325592,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5434027910232544,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 834.2222290039062,
"epoch": 0.4460431654676259,
"grad_norm": 0.13423539328804068,
"kl": 0.0140380859375,
"learning_rate": 7.090283194378542e-07,
"loss": 0.0035,
"reward": 0.921875,
"reward_std": 0.259520523250103,
"rewards/accuracy_reward": 0.423611119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.4982638880610466,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 832.9444427490234,
"epoch": 0.45083932853717024,
"grad_norm": 0.16645556279200993,
"kl": 0.013580322265625,
"learning_rate": 7.019340458821159e-07,
"loss": 0.0388,
"reward": 0.9652777910232544,
"reward_std": 0.29097262397408485,
"rewards/accuracy_reward": 0.4652777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5000000074505806,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 760.0764007568359,
"epoch": 0.4556354916067146,
"grad_norm": 0.15120637517973337,
"kl": 0.01580810546875,
"learning_rate": 6.947968916703826e-07,
"loss": 0.0373,
"reward": 1.015625,
"reward_std": 0.2590954527258873,
"rewards/accuracy_reward": 0.486111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5295138880610466,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 765.4791564941406,
"epoch": 0.460431654676259,
"grad_norm": 0.16253895506587696,
"kl": 0.0163421630859375,
"learning_rate": 6.876188711362603e-07,
"loss": 0.0583,
"reward": 0.9513889104127884,
"reward_std": 0.3330418989062309,
"rewards/accuracy_reward": 0.4236111044883728,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5277777910232544,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 775.6736297607422,
"epoch": 0.46522781774580335,
"grad_norm": 0.20268144913502836,
"kl": 0.0174560546875,
"learning_rate": 6.80402010147141e-07,
"loss": 0.0393,
"reward": 1.017361119389534,
"reward_std": 0.3649497255682945,
"rewards/accuracy_reward": 0.4652777798473835,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5520833358168602,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 766.9652709960938,
"epoch": 0.47002398081534774,
"grad_norm": 0.1728775404147991,
"kl": 0.015045166015625,
"learning_rate": 6.731483455324374e-07,
"loss": 0.0282,
"reward": 1.1076389104127884,
"reward_std": 0.3713233917951584,
"rewards/accuracy_reward": 0.5347222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5729166716337204,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 781.7222442626953,
"epoch": 0.4748201438848921,
"grad_norm": 0.1996452779202398,
"kl": 0.01593017578125,
"learning_rate": 6.658599245087241e-07,
"loss": 0.0927,
"reward": 1.1597222536802292,
"reward_std": 0.3544151149690151,
"rewards/accuracy_reward": 0.5833333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5763888955116272,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 823.1527862548828,
"epoch": 0.47961630695443647,
"grad_norm": 0.14748523873535388,
"kl": 0.0178680419921875,
"learning_rate": 6.585388041019487e-07,
"loss": 0.0367,
"reward": 1.0312500149011612,
"reward_std": 0.27015675604343414,
"rewards/accuracy_reward": 0.4861111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5451388880610466,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 772.1944427490234,
"epoch": 0.4844124700239808,
"grad_norm": 0.5160967541046672,
"kl": 0.0177764892578125,
"learning_rate": 6.511870505668725e-07,
"loss": 0.0517,
"reward": 1.0746527761220932,
"reward_std": 0.32491182163357735,
"rewards/accuracy_reward": 0.5138888880610466,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5607638955116272,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 813.8680572509766,
"epoch": 0.4892086330935252,
"grad_norm": 0.13567321274406932,
"kl": 0.017913818359375,
"learning_rate": 6.438067388039064e-07,
"loss": 0.039,
"reward": 1.0138888955116272,
"reward_std": 0.2365904077887535,
"rewards/accuracy_reward": 0.4722222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5416666641831398,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 792.1250152587891,
"epoch": 0.4940047961630695,
"grad_norm": 0.16686045898429722,
"kl": 0.017578125,
"learning_rate": 6.36399951773509e-07,
"loss": 0.0349,
"reward": 1.052083358168602,
"reward_std": 0.3173811621963978,
"rewards/accuracy_reward": 0.5138888880610466,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5381944552063942,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 792.9583435058594,
"epoch": 0.4988009592326139,
"grad_norm": 0.16673634023147957,
"kl": 0.019927978515625,
"learning_rate": 6.289687799083072e-07,
"loss": 0.0385,
"reward": 0.954861119389534,
"reward_std": 0.34330564737319946,
"rewards/accuracy_reward": 0.4444444477558136,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.510416679084301,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 838.2152862548828,
"epoch": 0.5035971223021583,
"grad_norm": 0.19765705391364718,
"kl": 0.018402099609375,
"learning_rate": 6.2151532052311e-07,
"loss": 0.0526,
"reward": 0.9461805671453476,
"reward_std": 0.380074605345726,
"rewards/accuracy_reward": 0.4305555522441864,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5156250074505806,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 848.3958282470703,
"epoch": 0.5083932853717026,
"grad_norm": 0.16771910068496884,
"kl": 0.018890380859375,
"learning_rate": 6.140416772229784e-07,
"loss": 0.0449,
"reward": 0.8819444477558136,
"reward_std": 0.36511222273111343,
"rewards/accuracy_reward": 0.3819444552063942,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 763.1527862548828,
"epoch": 0.513189448441247,
"grad_norm": 0.1914924709345533,
"kl": 0.022308349609375,
"learning_rate": 6.065499593095208e-07,
"loss": 0.0358,
"reward": 1.0451388955116272,
"reward_std": 0.3345286548137665,
"rewards/accuracy_reward": 0.4722222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5729166716337204,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 795.0000152587891,
"epoch": 0.5179856115107914,
"grad_norm": 0.23442272993782337,
"kl": 0.02197265625,
"learning_rate": 5.990422811855812e-07,
"loss": 0.0786,
"reward": 0.9982638955116272,
"reward_std": 0.38987091183662415,
"rewards/accuracy_reward": 0.4375000037252903,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5607639029622078,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 801.8402862548828,
"epoch": 0.5227817745803357,
"grad_norm": 0.21980101064995708,
"kl": 0.02423095703125,
"learning_rate": 5.915207617584858e-07,
"loss": 0.0335,
"reward": 0.9427083432674408,
"reward_std": 0.36763929575681686,
"rewards/accuracy_reward": 0.4305555522441864,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5121527835726738,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 743.0069427490234,
"epoch": 0.5275779376498801,
"grad_norm": 0.20707736821413145,
"kl": 0.025299072265625,
"learning_rate": 5.839875238420205e-07,
"loss": 0.0706,
"reward": 1.1406249850988388,
"reward_std": 0.29043491929769516,
"rewards/accuracy_reward": 0.555555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.585069440305233,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 804.6111145019531,
"epoch": 0.5323741007194245,
"grad_norm": 0.17161222800354356,
"kl": 0.021148681640625,
"learning_rate": 5.764446935573041e-07,
"loss": 0.0426,
"reward": 0.9861110895872116,
"reward_std": 0.33307311683893204,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.548611119389534,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 778.125,
"epoch": 0.5371702637889688,
"grad_norm": 0.21031410356763766,
"kl": 0.0242919921875,
"learning_rate": 5.688943997327288e-07,
"loss": 0.05,
"reward": 0.9496527910232544,
"reward_std": 0.2749031111598015,
"rewards/accuracy_reward": 0.4027777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5468750074505806,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 767.6458435058594,
"epoch": 0.5419664268585132,
"grad_norm": 0.282246872790042,
"kl": 0.02569580078125,
"learning_rate": 5.613387733031375e-07,
"loss": 0.0988,
"reward": 1.1336805522441864,
"reward_std": 0.35546836256980896,
"rewards/accuracy_reward": 0.5486111268401146,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5850694477558136,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 727.4097137451172,
"epoch": 0.5467625899280576,
"grad_norm": 0.2177323521611779,
"kl": 0.0255126953125,
"learning_rate": 5.53779946708405e-07,
"loss": 0.047,
"reward": 1.1145833283662796,
"reward_std": 0.28548414260149,
"rewards/accuracy_reward": 0.5277777798473835,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.586805559694767,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 762.1875152587891,
"epoch": 0.5515587529976019,
"grad_norm": 0.2228573500540112,
"kl": 0.023681640625,
"learning_rate": 5.462200532915951e-07,
"loss": 0.0526,
"reward": 1.0694444626569748,
"reward_std": 0.3976950142532587,
"rewards/accuracy_reward": 0.4722222350537777,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5972222313284874,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 821.8263854980469,
"epoch": 0.5563549160671463,
"grad_norm": 0.22544332116422824,
"kl": 0.02313232421875,
"learning_rate": 5.386612266968625e-07,
"loss": 0.0808,
"reward": 0.9774305522441864,
"reward_std": 0.3071533590555191,
"rewards/accuracy_reward": 0.3958333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5815972238779068,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 732.5486145019531,
"epoch": 0.5611510791366906,
"grad_norm": 0.24884339296220886,
"kl": 0.026092529296875,
"learning_rate": 5.311056002672712e-07,
"loss": 0.0805,
"reward": 1.2187500298023224,
"reward_std": 0.33359793573617935,
"rewards/accuracy_reward": 0.5902777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6284722238779068,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 759.3264007568359,
"epoch": 0.565947242206235,
"grad_norm": 0.20316735987226078,
"kl": 0.029693603515625,
"learning_rate": 5.235553064426961e-07,
"loss": 0.0398,
"reward": 1.1354166865348816,
"reward_std": 0.29686928167939186,
"rewards/accuracy_reward": 0.506944440305233,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.628472238779068,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 727.5,
"epoch": 0.5707434052757794,
"grad_norm": 0.29440880209457704,
"kl": 0.0303955078125,
"learning_rate": 5.160124761579795e-07,
"loss": 0.0572,
"reward": 1.2673611044883728,
"reward_std": 0.3973044380545616,
"rewards/accuracy_reward": 0.6180555671453476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6493055820465088,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 746.875,
"epoch": 0.5755395683453237,
"grad_norm": 0.2292130236322372,
"kl": 0.033233642578125,
"learning_rate": 5.084792382415141e-07,
"loss": 0.0549,
"reward": 1.0781250149011612,
"reward_std": 0.3586086630821228,
"rewards/accuracy_reward": 0.4791666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5989583432674408,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 846.0,
"epoch": 0.580335731414868,
"grad_norm": 0.2898186862467139,
"kl": 0.031280517578125,
"learning_rate": 5.009577188144188e-07,
"loss": 0.092,
"reward": 0.9079861044883728,
"reward_std": 0.3724118545651436,
"rewards/accuracy_reward": 0.3541666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5538194552063942,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 858.1944580078125,
"epoch": 0.5851318944844125,
"grad_norm": 0.2766016571596048,
"kl": 0.03631591796875,
"learning_rate": 4.93450040690479e-07,
"loss": 0.0593,
"reward": 0.9548611044883728,
"reward_std": 0.43608929216861725,
"rewards/accuracy_reward": 0.4027777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5520833283662796,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 788.2152862548828,
"epoch": 0.5899280575539568,
"grad_norm": 0.26705241094954724,
"kl": 0.0374755859375,
"learning_rate": 4.859583227770217e-07,
"loss": 0.0683,
"reward": 1.0520833879709244,
"reward_std": 0.2720135301351547,
"rewards/accuracy_reward": 0.4791666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5729166567325592,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 772.5069427490234,
"epoch": 0.5947242206235012,
"grad_norm": 0.2671582354543242,
"kl": 0.035980224609375,
"learning_rate": 4.784846794768901e-07,
"loss": 0.0564,
"reward": 1.1267361044883728,
"reward_std": 0.3838435262441635,
"rewards/accuracy_reward": 0.5277777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5989583283662796,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 807.1597442626953,
"epoch": 0.5995203836930456,
"grad_norm": 0.28942117017345487,
"kl": 0.03851318359375,
"learning_rate": 4.7103122009169283e-07,
"loss": 0.0337,
"reward": 1.0086805671453476,
"reward_std": 0.31841161847114563,
"rewards/accuracy_reward": 0.4513888880610466,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5572916716337204,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 794.0694580078125,
"epoch": 0.60431654676259,
"grad_norm": 0.29337843160217614,
"kl": 0.034027099609375,
"learning_rate": 4.63600048226491e-07,
"loss": 0.0638,
"reward": 1.0694444626569748,
"reward_std": 0.37363580614328384,
"rewards/accuracy_reward": 0.479166679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5902777910232544,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 682.5347290039062,
"epoch": 0.6091127098321343,
"grad_norm": 0.2980289320595657,
"kl": 0.046630859375,
"learning_rate": 4.5619326119609346e-07,
"loss": 0.0582,
"reward": 1.0850694626569748,
"reward_std": 0.3542333133518696,
"rewards/accuracy_reward": 0.4583333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.626736119389534,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 810.3055572509766,
"epoch": 0.6139088729016786,
"grad_norm": 0.4115921669385449,
"kl": 0.039794921875,
"learning_rate": 4.4881294943312756e-07,
"loss": 0.1129,
"reward": 0.9635416716337204,
"reward_std": 0.4346095398068428,
"rewards/accuracy_reward": 0.3958333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5677083283662796,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 796.3958282470703,
"epoch": 0.6187050359712231,
"grad_norm": 0.28690441517101206,
"kl": 0.03887939453125,
"learning_rate": 4.414611958980512e-07,
"loss": 0.0596,
"reward": 1.104166641831398,
"reward_std": 0.32108214125037193,
"rewards/accuracy_reward": 0.5138888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5902777910232544,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 704.2986145019531,
"epoch": 0.6235011990407674,
"grad_norm": 0.39366843219844283,
"kl": 0.045166015625,
"learning_rate": 4.3414007549127594e-07,
"loss": 0.0469,
"reward": 1.1545138955116272,
"reward_std": 0.362628273665905,
"rewards/accuracy_reward": 0.5208333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.633680559694767,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 792.5555572509766,
"epoch": 0.6282973621103117,
"grad_norm": 0.3049060746508426,
"kl": 0.044677734375,
"learning_rate": 4.268516544675628e-07,
"loss": 0.0332,
"reward": 1.0902777910232544,
"reward_std": 0.381888784468174,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5902777761220932,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 760.2291717529297,
"epoch": 0.6330935251798561,
"grad_norm": 0.5199022261670415,
"kl": 0.05535888671875,
"learning_rate": 4.195979898528589e-07,
"loss": 0.0576,
"reward": 1.131944477558136,
"reward_std": 0.45905186980962753,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6319444477558136,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 736.3541717529297,
"epoch": 0.6378896882494005,
"grad_norm": 0.4002608571694072,
"kl": 0.0509033203125,
"learning_rate": 4.1238112886373967e-07,
"loss": 0.0692,
"reward": 1.2430555820465088,
"reward_std": 0.43104151636362076,
"rewards/accuracy_reward": 0.5902777910232544,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6527777910232544,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 750.2986145019531,
"epoch": 0.6426858513189448,
"grad_norm": 0.47064244094639573,
"kl": 0.05303955078125,
"learning_rate": 4.0520310832961747e-07,
"loss": 0.0578,
"reward": 1.2552083730697632,
"reward_std": 0.3141016773879528,
"rewards/accuracy_reward": 0.604166679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6510416716337204,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 700.3611297607422,
"epoch": 0.6474820143884892,
"grad_norm": 0.5155783092634934,
"kl": 0.04730224609375,
"learning_rate": 3.980659541178841e-07,
"loss": 0.0851,
"reward": 1.1597222089767456,
"reward_std": 0.33885327726602554,
"rewards/accuracy_reward": 0.5625000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5972222238779068,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 733.5833435058594,
"epoch": 0.6522781774580336,
"grad_norm": 0.3556569979579297,
"kl": 0.0576171875,
"learning_rate": 3.909716805621458e-07,
"loss": 0.0298,
"reward": 1.1493055522441864,
"reward_std": 0.3029083050787449,
"rewards/accuracy_reward": 0.5208333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.628472238779068,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 707.9791717529297,
"epoch": 0.657074340527578,
"grad_norm": 0.529609284338842,
"kl": 0.06280517578125,
"learning_rate": 3.8392228989371357e-07,
"loss": 0.1004,
"reward": 1.0902777910232544,
"reward_std": 0.3477436378598213,
"rewards/accuracy_reward": 0.4722222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6180555522441864,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 735.5902938842773,
"epoch": 0.6618705035971223,
"grad_norm": 0.5906533852176611,
"kl": 0.06390380859375,
"learning_rate": 3.7691977167650947e-07,
"loss": 0.0947,
"reward": 1.2638888955116272,
"reward_std": 0.37084779888391495,
"rewards/accuracy_reward": 0.5972222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6666666865348816,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 787.9236145019531,
"epoch": 0.6666666666666666,
"grad_norm": 0.3518724040292374,
"kl": 0.05731201171875,
"learning_rate": 3.6996610224554815e-07,
"loss": 0.035,
"reward": 1.1510416865348816,
"reward_std": 0.39138108491897583,
"rewards/accuracy_reward": 0.5208333358168602,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6302083432674408,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 753.4236145019531,
"epoch": 0.6714628297362111,
"grad_norm": 0.5518379170803371,
"kl": 0.0836181640625,
"learning_rate": 3.630632441491511e-07,
"loss": 0.0206,
"reward": 1.1197917014360428,
"reward_std": 0.33788175135850906,
"rewards/accuracy_reward": 0.4930555745959282,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6267361044883728,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 847.3541717529297,
"epoch": 0.6762589928057554,
"grad_norm": 0.7174313824432543,
"kl": 0.08111572265625,
"learning_rate": 3.562131455950538e-07,
"loss": 0.075,
"reward": 0.940972238779068,
"reward_std": 0.39178355410695076,
"rewards/accuracy_reward": 0.3819444477558136,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5590277835726738,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 823.6180572509766,
"epoch": 0.6810551558752997,
"grad_norm": 0.8192916163585466,
"kl": 0.09405517578125,
"learning_rate": 3.4941773990055777e-07,
"loss": 0.0704,
"reward": 0.8750000149011612,
"reward_std": 0.40220723301172256,
"rewards/accuracy_reward": 0.3402777761220932,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.534722238779068,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 741.7569580078125,
"epoch": 0.6858513189448441,
"grad_norm": 0.6199201515779919,
"kl": 0.0787353515625,
"learning_rate": 3.426789449468873e-07,
"loss": 0.0473,
"reward": 1.1718749850988388,
"reward_std": 0.3498489521443844,
"rewards/accuracy_reward": 0.5486111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6232638955116272,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 832.3055572509766,
"epoch": 0.6906474820143885,
"grad_norm": 0.9765675758790994,
"kl": 0.08935546875,
"learning_rate": 3.359986626379022e-07,
"loss": 0.0842,
"reward": 0.984375,
"reward_std": 0.48340315371751785,
"rewards/accuracy_reward": 0.423611119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5607639029622078,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 741.2639007568359,
"epoch": 0.6954436450839329,
"grad_norm": 0.6256951971346841,
"kl": 0.0902099609375,
"learning_rate": 3.293787783633182e-07,
"loss": 0.0524,
"reward": 1.092013880610466,
"reward_std": 0.35471441224217415,
"rewards/accuracy_reward": 0.4861111268401146,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6059027761220932,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 749.1250152587891,
"epoch": 0.7002398081534772,
"grad_norm": 0.4746047065439084,
"kl": 0.0960693359375,
"learning_rate": 3.2282116046659064e-07,
"loss": 0.0216,
"reward": 1.1197916567325592,
"reward_std": 0.3484783172607422,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6197916716337204,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 728.75,
"epoch": 0.7050359712230215,
"grad_norm": 0.4212278612821504,
"kl": 0.1041259765625,
"learning_rate": 3.163276597176087e-07,
"loss": 0.0352,
"reward": 1.3003472089767456,
"reward_std": 0.366548266261816,
"rewards/accuracy_reward": 0.6388888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.661458358168602,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 729.4166717529297,
"epoch": 0.709832134292566,
"grad_norm": 0.5703953599283403,
"kl": 0.1136474609375,
"learning_rate": 3.099001087903473e-07,
"loss": 0.0144,
"reward": 1.1718750298023224,
"reward_std": 0.44783008843660355,
"rewards/accuracy_reward": 0.5625000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 807.3610992431641,
"epoch": 0.7146282973621103,
"grad_norm": 0.6651812784116454,
"kl": 0.12939453125,
"learning_rate": 3.0354032174562863e-07,
"loss": 0.0654,
"reward": 1.0920139104127884,
"reward_std": 0.3492956757545471,
"rewards/accuracy_reward": 0.4791666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6128472313284874,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 786.3194732666016,
"epoch": 0.7194244604316546,
"grad_norm": 0.689721941763239,
"kl": 0.1400146484375,
"learning_rate": 2.97250093519136e-07,
"loss": 0.0635,
"reward": 1.1111111342906952,
"reward_std": 0.3203607201576233,
"rewards/accuracy_reward": 0.486111119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6250000149011612,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 792.2152709960938,
"epoch": 0.7242206235011991,
"grad_norm": 1.2387475983297178,
"kl": 0.1351318359375,
"learning_rate": 2.910311994148255e-07,
"loss": 0.071,
"reward": 1.09375,
"reward_std": 0.36406850814819336,
"rewards/accuracy_reward": 0.4930555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6006944477558136,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 767.8333435058594,
"epoch": 0.7290167865707434,
"grad_norm": 0.7047137897175717,
"kl": 0.1422119140625,
"learning_rate": 2.848853946038782e-07,
"loss": 0.0384,
"reward": 1.0711805522441864,
"reward_std": 0.2421913631260395,
"rewards/accuracy_reward": 0.4791666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5920139104127884,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 749.2708435058594,
"epoch": 0.7338129496402878,
"grad_norm": 1.144909232594251,
"kl": 0.1258544921875,
"learning_rate": 2.7881441362933464e-07,
"loss": 0.066,
"reward": 1.045138880610466,
"reward_std": 0.3445451110601425,
"rewards/accuracy_reward": 0.430555559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6145833432674408,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 723.8055572509766,
"epoch": 0.7386091127098321,
"grad_norm": 0.9998994389867698,
"kl": 0.1502685546875,
"learning_rate": 2.7281996991655145e-07,
"loss": 0.0722,
"reward": 1.1649305671453476,
"reward_std": 0.4142308458685875,
"rewards/accuracy_reward": 0.5555555671453476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.609375,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 712.1736297607422,
"epoch": 0.7434052757793765,
"grad_norm": 0.8719467247990521,
"kl": 0.1572265625,
"learning_rate": 2.669037552896172e-07,
"loss": 0.0362,
"reward": 1.1753472536802292,
"reward_std": 0.4305378869175911,
"rewards/accuracy_reward": 0.5277777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6475694477558136,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 792.1458282470703,
"epoch": 0.7482014388489209,
"grad_norm": 1.5075628106035353,
"kl": 0.19970703125,
"learning_rate": 2.6106743949386586e-07,
"loss": 0.0657,
"reward": 1.0902777910232544,
"reward_std": 0.39609089493751526,
"rewards/accuracy_reward": 0.4652777910232544,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6250000149011612,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 720.9791717529297,
"epoch": 0.7529976019184652,
"grad_norm": 1.1801204435963506,
"kl": 0.14697265625,
"learning_rate": 2.553126697246217e-07,
"loss": 0.0499,
"reward": 1.1493055820465088,
"reward_std": 0.40563249588012695,
"rewards/accuracy_reward": 0.541666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6076388955116272,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 708.5486297607422,
"epoch": 0.7577937649880095,
"grad_norm": 1.3449351919779642,
"kl": 0.1728515625,
"learning_rate": 2.49641070162307e-07,
"loss": 0.0543,
"reward": 1.2118056118488312,
"reward_std": 0.3700602427124977,
"rewards/accuracy_reward": 0.5833333507180214,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6284722238779068,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 757.7916870117188,
"epoch": 0.762589928057554,
"grad_norm": 1.141025370807687,
"kl": 0.19677734375,
"learning_rate": 2.440542415140466e-07,
"loss": 0.0881,
"reward": 1.1232638955116272,
"reward_std": 0.4029542878270149,
"rewards/accuracy_reward": 0.493055559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6302083432674408,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 677.4652862548828,
"epoch": 0.7673860911270983,
"grad_norm": 1.6300168405182138,
"kl": 0.2421875,
"learning_rate": 2.3855376056189737e-07,
"loss": 0.058,
"reward": 1.3194444477558136,
"reward_std": 0.44138168543577194,
"rewards/accuracy_reward": 0.604166679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7152777761220932,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 664.5694580078125,
"epoch": 0.7721822541966427,
"grad_norm": 1.5733647270439453,
"kl": 0.20703125,
"learning_rate": 2.3314117971782945e-07,
"loss": 0.0652,
"reward": 1.1788194477558136,
"reward_std": 0.3714512586593628,
"rewards/accuracy_reward": 0.541666679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6371527910232544,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 769.9722290039062,
"epoch": 0.7769784172661871,
"grad_norm": 1.533175643219152,
"kl": 0.23095703125,
"learning_rate": 2.2781802658558635e-07,
"loss": 0.0533,
"reward": 0.984375,
"reward_std": 0.39164651185274124,
"rewards/accuracy_reward": 0.4027777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.581597238779068,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 733.8125,
"epoch": 0.7817745803357314,
"grad_norm": 0.9142708944559201,
"kl": 0.22607421875,
"learning_rate": 2.2258580352954552e-07,
"loss": 0.0356,
"reward": 1.1076388955116272,
"reward_std": 0.32901762425899506,
"rewards/accuracy_reward": 0.472222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6354166716337204,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 743.4930572509766,
"epoch": 0.7865707434052758,
"grad_norm": 1.564329334465899,
"kl": 0.3662109375,
"learning_rate": 2.1744598725070347e-07,
"loss": 0.0512,
"reward": 1.0538194477558136,
"reward_std": 0.28839075565338135,
"rewards/accuracy_reward": 0.4652777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.5885416716337204,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 739.3611145019531,
"epoch": 0.7913669064748201,
"grad_norm": 1.1882320571551739,
"kl": 0.28759765625,
"learning_rate": 2.1240002836990328e-07,
"loss": 0.0243,
"reward": 1.1527777910232544,
"reward_std": 0.3735358491539955,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6736111044883728,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 706.4166717529297,
"epoch": 0.7961630695443646,
"grad_norm": 1.201792642717349,
"kl": 0.32080078125,
"learning_rate": 2.0744935101842275e-07,
"loss": 0.0349,
"reward": 1.1701389253139496,
"reward_std": 0.3558007851243019,
"rewards/accuracy_reward": 0.4930555671453476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6770833283662796,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 669.9444580078125,
"epoch": 0.8009592326139089,
"grad_norm": 1.415131555543672,
"kl": 0.339111328125,
"learning_rate": 2.025953524360396e-07,
"loss": 0.0588,
"reward": 1.2465277910232544,
"reward_std": 0.3056763559579849,
"rewards/accuracy_reward": 0.6250000074505806,
"rewards/format_reward": 0.0069444444961845875,
"rewards/tag_count_reward": 0.6145833358168602,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 690.5277862548828,
"epoch": 0.8057553956834532,
"grad_norm": 1.8031709952858856,
"kl": 0.342041015625,
"learning_rate": 1.9783940257668473e-07,
"loss": 0.1108,
"reward": 1.1909722089767456,
"reward_std": 0.42986829578876495,
"rewards/accuracy_reward": 0.5416666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6493055671453476,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 731.9027862548828,
"epoch": 0.8105515587529976,
"grad_norm": 1.54016513318198,
"kl": 0.34375,
"learning_rate": 1.9318284372179783e-07,
"loss": 0.0829,
"reward": 1.0902778059244156,
"reward_std": 0.3709410950541496,
"rewards/accuracy_reward": 0.4791666641831398,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.611111119389534,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 678.4305572509766,
"epoch": 0.815347721822542,
"grad_norm": 1.2889679479137333,
"kl": 0.310791015625,
"learning_rate": 1.8862699010149265e-07,
"loss": 0.0637,
"reward": 1.1493055820465088,
"reward_std": 0.4024455025792122,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6493055671453476,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 666.5764007568359,
"epoch": 0.8201438848920863,
"grad_norm": 1.2493403412894837,
"kl": 0.37060546875,
"learning_rate": 1.8417312752363842e-07,
"loss": 0.0292,
"reward": 1.2986111342906952,
"reward_std": 0.39357686042785645,
"rewards/accuracy_reward": 0.5972222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7013888955116272,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 680.5,
"epoch": 0.8249400479616307,
"grad_norm": 1.3917142124264024,
"kl": 0.289794921875,
"learning_rate": 1.7982251301096496e-07,
"loss": 0.0587,
"reward": 1.2343749701976776,
"reward_std": 0.3718552738428116,
"rewards/accuracy_reward": 0.569444440305233,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6649305671453476,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 762.5139007568359,
"epoch": 0.829736211031175,
"grad_norm": 2.082423982787284,
"kl": 0.37939453125,
"learning_rate": 1.7557637444628934e-07,
"loss": 0.0734,
"reward": 1.0295139104127884,
"reward_std": 0.42010512948036194,
"rewards/accuracy_reward": 0.4166666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.612847238779068,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 758.2777862548828,
"epoch": 0.8345323741007195,
"grad_norm": 1.4700469055914496,
"kl": 0.31884765625,
"learning_rate": 1.7143591022596842e-07,
"loss": 0.0462,
"reward": 1.0850694626569748,
"reward_std": 0.3826001510024071,
"rewards/accuracy_reward": 0.4513888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6336805373430252,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 694.0694427490234,
"epoch": 0.8393285371702638,
"grad_norm": 1.3297784898512763,
"kl": 0.38623046875,
"learning_rate": 1.674022889216737e-07,
"loss": 0.0566,
"reward": 1.2083333134651184,
"reward_std": 0.36128322780132294,
"rewards/accuracy_reward": 0.5486111044883728,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.659722238779068,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 678.7152862548828,
"epoch": 0.8441247002398081,
"grad_norm": 1.4970986391973622,
"kl": 0.312744140625,
"learning_rate": 1.634766489505815e-07,
"loss": 0.0584,
"reward": 1.2951389253139496,
"reward_std": 0.39571166411042213,
"rewards/accuracy_reward": 0.6180555522441864,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6770833283662796,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 650.5069580078125,
"epoch": 0.8489208633093526,
"grad_norm": 1.3198452732539876,
"kl": 0.282958984375,
"learning_rate": 1.5966009825407664e-07,
"loss": 0.0487,
"reward": 1.2291666865348816,
"reward_std": 0.41252629458904266,
"rewards/accuracy_reward": 0.5763888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6527777910232544,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 709.4861145019531,
"epoch": 0.8537170263788969,
"grad_norm": 1.571938422464948,
"kl": 0.275146484375,
"learning_rate": 1.5595371398505497e-07,
"loss": 0.0601,
"reward": 1.1354167014360428,
"reward_std": 0.3936513438820839,
"rewards/accuracy_reward": 0.4444444552063942,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6909722238779068,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 644.2847290039062,
"epoch": 0.8585131894484412,
"grad_norm": 1.2655481838867129,
"kl": 0.313232421875,
"learning_rate": 1.523585422039165e-07,
"loss": 0.0395,
"reward": 1.2447916567325592,
"reward_std": 0.3132231794297695,
"rewards/accuracy_reward": 0.5555555671453476,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.689236119389534,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 727.8055725097656,
"epoch": 0.8633093525179856,
"grad_norm": 2.287249210625076,
"kl": 0.4541015625,
"learning_rate": 1.4887559758333408e-07,
"loss": 0.0809,
"reward": 1.1718749850988388,
"reward_std": 0.4368325099349022,
"rewards/accuracy_reward": 0.5069444477558136,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6649305671453476,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 677.8541641235352,
"epoch": 0.86810551558753,
"grad_norm": 2.029066779031394,
"kl": 0.44677734375,
"learning_rate": 1.4550586312187919e-07,
"loss": 0.0318,
"reward": 1.2395833432674408,
"reward_std": 0.34451349824666977,
"rewards/accuracy_reward": 0.597222238779068,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.642361119389534,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 668.8194427490234,
"epoch": 0.8729016786570744,
"grad_norm": 1.6348530863651882,
"kl": 0.3857421875,
"learning_rate": 1.4225028986658965e-07,
"loss": 0.0692,
"reward": 1.2500000298023224,
"reward_std": 0.4199504852294922,
"rewards/accuracy_reward": 0.5763888880610466,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.673611119389534,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 739.4930725097656,
"epoch": 0.8776978417266187,
"grad_norm": 2.171960155817612,
"kl": 0.31787109375,
"learning_rate": 1.391097966445526e-07,
"loss": 0.0609,
"reward": 1.1805555820465088,
"reward_std": 0.3399686738848686,
"rewards/accuracy_reward": 0.5208333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.659722238779068,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 710.2638854980469,
"epoch": 0.882494004796163,
"grad_norm": 2.354582366359387,
"kl": 0.439208984375,
"learning_rate": 1.3608526980358242e-07,
"loss": 0.1236,
"reward": 1.1701389104127884,
"reward_std": 0.3848187327384949,
"rewards/accuracy_reward": 0.5,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.670138880610466,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 740.0833435058594,
"epoch": 0.8872901678657075,
"grad_norm": 1.0825401183719674,
"kl": 0.34619140625,
"learning_rate": 1.331775629620653e-07,
"loss": 0.0486,
"reward": 1.1493055522441864,
"reward_std": 0.36640702188014984,
"rewards/accuracy_reward": 0.4861111268401146,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6631944477558136,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 628.0208282470703,
"epoch": 0.8920863309352518,
"grad_norm": 1.7907760466473297,
"kl": 0.5361328125,
"learning_rate": 1.303874967680399e-07,
"loss": 0.0542,
"reward": 1.2604166567325592,
"reward_std": 0.4223191514611244,
"rewards/accuracy_reward": 0.5694444552063942,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.690972238779068,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 629.2708358764648,
"epoch": 0.8968824940047961,
"grad_norm": 1.7941484108862593,
"kl": 0.375,
"learning_rate": 1.277158586675852e-07,
"loss": 0.0782,
"reward": 1.1996527910232544,
"reward_std": 0.33358532190322876,
"rewards/accuracy_reward": 0.493055559694767,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.706597238779068,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 774.6180725097656,
"epoch": 0.9016786570743405,
"grad_norm": 1.3540196149379808,
"kl": 0.42333984375,
"learning_rate": 1.2516340268257737e-07,
"loss": 0.0613,
"reward": 1.065972238779068,
"reward_std": 0.3640429899096489,
"rewards/accuracy_reward": 0.3819444477558136,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6840277910232544,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 673.5208511352539,
"epoch": 0.9064748201438849,
"grad_norm": 2.3552439559585534,
"kl": 0.56640625,
"learning_rate": 1.2273084919788063e-07,
"loss": 0.0419,
"reward": 1.2378471940755844,
"reward_std": 0.40937893092632294,
"rewards/accuracy_reward": 0.5833333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.654513880610466,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 680.7222137451172,
"epoch": 0.9112709832134293,
"grad_norm": 2.0506106800441746,
"kl": 0.62939453125,
"learning_rate": 1.2041888475803217e-07,
"loss": 0.0987,
"reward": 1.1649305671453476,
"reward_std": 0.4104561358690262,
"rewards/accuracy_reward": 0.5208333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6440972238779068,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 652.5694580078125,
"epoch": 0.9160671462829736,
"grad_norm": 2.011461189500051,
"kl": 0.650390625,
"learning_rate": 1.1822816187347622e-07,
"loss": 0.1134,
"reward": 1.1857638955116272,
"reward_std": 0.4204775467514992,
"rewards/accuracy_reward": 0.5347222313284874,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6510416716337204,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 672.8889007568359,
"epoch": 0.920863309352518,
"grad_norm": 1.766556315204042,
"kl": 0.52197265625,
"learning_rate": 1.1615929883640567e-07,
"loss": 0.0868,
"reward": 1.2239583283662796,
"reward_std": 0.37772539258003235,
"rewards/accuracy_reward": 0.5486111268401146,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.675347238779068,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 705.6805572509766,
"epoch": 0.9256594724220624,
"grad_norm": 1.2464963403957747,
"kl": 0.42431640625,
"learning_rate": 1.1421287954625985e-07,
"loss": 0.0538,
"reward": 1.2118055522441864,
"reward_std": 0.3169648088514805,
"rewards/accuracy_reward": 0.5416666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6701388955116272,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 735.0416870117188,
"epoch": 0.9304556354916067,
"grad_norm": 1.4114342882843525,
"kl": 0.4072265625,
"learning_rate": 1.1238945334492928e-07,
"loss": 0.038,
"reward": 1.1388888955116272,
"reward_std": 0.3398313596844673,
"rewards/accuracy_reward": 0.4791666716337204,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.659722238779068,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 734.25,
"epoch": 0.935251798561151,
"grad_norm": 1.3534299207814684,
"kl": 0.740234375,
"learning_rate": 1.1068953486171385e-07,
"loss": 0.0948,
"reward": 1.14930559694767,
"reward_std": 0.4659058451652527,
"rewards/accuracy_reward": 0.5138888955116272,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6354166716337204,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 767.3125,
"epoch": 0.9400479616306955,
"grad_norm": 1.8926261098268726,
"kl": 0.6171875,
"learning_rate": 1.0911360386807814e-07,
"loss": 0.0999,
"reward": 1.034722238779068,
"reward_std": 0.3850885070860386,
"rewards/accuracy_reward": 0.423611119389534,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6111111342906952,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 671.8333282470703,
"epoch": 0.9448441247002398,
"grad_norm": 2.2335073755128354,
"kl": 0.63671875,
"learning_rate": 1.0766210514224419e-07,
"loss": 0.0608,
"reward": 1.206597238779068,
"reward_std": 0.39280908554792404,
"rewards/accuracy_reward": 0.5486111342906952,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.657986119389534,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 690.3472137451172,
"epoch": 0.9496402877697842,
"grad_norm": 2.0084883063292747,
"kl": 0.505859375,
"learning_rate": 1.0633544834366123e-07,
"loss": 0.1037,
"reward": 1.2291666865348816,
"reward_std": 0.44404156506061554,
"rewards/accuracy_reward": 0.5277777835726738,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7013888955116272,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 706.125,
"epoch": 0.9544364508393285,
"grad_norm": 2.8291221690849255,
"kl": 0.67822265625,
"learning_rate": 1.051340078973863e-07,
"loss": 0.084,
"reward": 1.0954861044883728,
"reward_std": 0.43709662556648254,
"rewards/accuracy_reward": 0.4375,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6579861044883728,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 670.6319427490234,
"epoch": 0.9592326139088729,
"grad_norm": 1.3994189437001958,
"kl": 0.4013671875,
"learning_rate": 1.0405812288840964e-07,
"loss": 0.0765,
"reward": 1.2777777761220932,
"reward_std": 0.34174390137195587,
"rewards/accuracy_reward": 0.5763889029622078,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7013888955116272,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 629.4861145019531,
"epoch": 0.9640287769784173,
"grad_norm": 1.434366201343452,
"kl": 0.3046875,
"learning_rate": 1.031080969659543e-07,
"loss": 0.0855,
"reward": 1.3107638955116272,
"reward_std": 0.34832194447517395,
"rewards/accuracy_reward": 0.604166679084301,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7065972238779068,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 665.0833587646484,
"epoch": 0.9688249400479616,
"grad_norm": 1.5034799191037997,
"kl": 0.46337890625,
"learning_rate": 1.0228419825777602e-07,
"loss": 0.0582,
"reward": 1.2343750298023224,
"reward_std": 0.4124446362257004,
"rewards/accuracy_reward": 0.5763889029622078,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6579861342906952,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 781.5555572509766,
"epoch": 0.973621103117506,
"grad_norm": 2.678396312950966,
"kl": 0.5263671875,
"learning_rate": 1.0158665929448951e-07,
"loss": 0.0947,
"reward": 1.0694444477558136,
"reward_std": 0.42056479305028915,
"rewards/accuracy_reward": 0.3958333283662796,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6736111044883728,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 618.8888854980469,
"epoch": 0.9784172661870504,
"grad_norm": 2.12368752310343,
"kl": 0.5556640625,
"learning_rate": 1.0101567694394071e-07,
"loss": 0.1194,
"reward": 1.3229166567325592,
"reward_std": 0.41604190319776535,
"rewards/accuracy_reward": 0.6111111268401146,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.7118055522441864,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 715.125,
"epoch": 0.9832134292565947,
"grad_norm": 1.6931563983674274,
"kl": 0.5595703125,
"learning_rate": 1.0057141235564423e-07,
"loss": 0.0796,
"reward": 1.1458333432674408,
"reward_std": 0.39061587303876877,
"rewards/accuracy_reward": 0.5000000074505806,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6458333432674408,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 721.2014007568359,
"epoch": 0.988009592326139,
"grad_norm": 1.6094244664912092,
"kl": 0.51513671875,
"learning_rate": 1.0025399091530193e-07,
"loss": 0.0913,
"reward": 1.2239583432674408,
"reward_std": 0.34610963612794876,
"rewards/accuracy_reward": 0.5347222089767456,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6892361044883728,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 612.5069351196289,
"epoch": 0.9928057553956835,
"grad_norm": 1.7144937573704717,
"kl": 0.45556640625,
"learning_rate": 1.0006350220941502e-07,
"loss": 0.0338,
"reward": 1.3229166865348816,
"reward_std": 0.37486525624990463,
"rewards/accuracy_reward": 0.5833333432674408,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.739583358168602,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 573.1250076293945,
"epoch": 0.9976019184652278,
"grad_norm": 1.1702049182470995,
"kl": 0.43017578125,
"learning_rate": 1e-07,
"loss": 0.02,
"reward": 1.237847238779068,
"reward_std": 0.40600838512182236,
"rewards/accuracy_reward": 0.5625,
"rewards/format_reward": 0.0,
"rewards/tag_count_reward": 0.6753472089767456,
"step": 208
},
{
"epoch": 0.9976019184652278,
"step": 208,
"total_flos": 0.0,
"train_loss": 0.05009339519780882,
"train_runtime": 7148.6291,
"train_samples_per_second": 1.049,
"train_steps_per_second": 0.029
}
],
"logging_steps": 1,
"max_steps": 208,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}