BabyChou's picture
Upload folder using huggingface_hub
788ca6d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.48154093097913325,
"eval_steps": 500,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 560.3958587646484,
"epoch": 0.0032102728731942215,
"grad_norm": 0.1884765625,
"kl": 0.0,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0,
"reward": 0.6299200654029846,
"reward_std": 0.34568188339471817,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3343471363186836,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2955729365348816,
"step": 1
},
{
"completion_length": 574.9948120117188,
"epoch": 0.006420545746388443,
"grad_norm": 0.20703125,
"kl": 0.0,
"learning_rate": 1.3333333333333334e-07,
"loss": 0.0,
"reward": 0.6667226850986481,
"reward_std": 0.3381393700838089,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3724518120288849,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2942708432674408,
"step": 2
},
{
"completion_length": 597.8541870117188,
"epoch": 0.009630818619582664,
"grad_norm": 0.185546875,
"kl": 0.00022509081827593036,
"learning_rate": 2e-07,
"loss": 0.0,
"reward": 0.636066347360611,
"reward_std": 0.35888948291540146,
"rewards/expression_based_accuracy_reward_length_penalized": 0.33658717572689056,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.299479179084301,
"step": 3
},
{
"completion_length": 568.2656555175781,
"epoch": 0.012841091492776886,
"grad_norm": 0.2412109375,
"kl": 0.00023551580670755357,
"learning_rate": 2.6666666666666667e-07,
"loss": 0.0,
"reward": 0.6489640921354294,
"reward_std": 0.344046413898468,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3677141070365906,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2812500149011612,
"step": 4
},
{
"completion_length": 637.7135620117188,
"epoch": 0.016051364365971106,
"grad_norm": 0.169921875,
"kl": 0.00023702834732830524,
"learning_rate": 3.333333333333333e-07,
"loss": 0.0,
"reward": 0.6146693080663681,
"reward_std": 0.35021649301052094,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3256068006157875,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2890625037252903,
"step": 5
},
{
"completion_length": 607.0260620117188,
"epoch": 0.019261637239165328,
"grad_norm": 0.2080078125,
"kl": 0.00022915955560165457,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.5901845693588257,
"reward_std": 0.3410582020878792,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3063303604722023,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.283854179084301,
"step": 6
},
{
"completion_length": 536.9713745117188,
"epoch": 0.02247191011235955,
"grad_norm": 0.20703125,
"kl": 0.00023870709992479533,
"learning_rate": 4.6666666666666666e-07,
"loss": 0.0,
"reward": 0.5996608734130859,
"reward_std": 0.3273390009999275,
"rewards/expression_based_accuracy_reward_length_penalized": 0.29497333616018295,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3046875074505806,
"step": 7
},
{
"completion_length": 579.4062652587891,
"epoch": 0.025682182985553772,
"grad_norm": 0.193359375,
"kl": 0.0002171014821215067,
"learning_rate": 5.333333333333333e-07,
"loss": 0.0,
"reward": 0.6376358270645142,
"reward_std": 0.34004897624254227,
"rewards/expression_based_accuracy_reward_length_penalized": 0.355083703994751,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2825520932674408,
"step": 8
},
{
"completion_length": 546.3750228881836,
"epoch": 0.028892455858747994,
"grad_norm": 0.212890625,
"kl": 0.00022866667859489098,
"learning_rate": 6e-07,
"loss": 0.0,
"reward": 0.6513122767210007,
"reward_std": 0.32241296768188477,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3576924651861191,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2936197966337204,
"step": 9
},
{
"completion_length": 573.7396087646484,
"epoch": 0.03210272873194221,
"grad_norm": 0.1904296875,
"kl": 0.0002470466679369565,
"learning_rate": 6.666666666666666e-07,
"loss": 0.0,
"reward": 0.679995134472847,
"reward_std": 0.3322247415781021,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3909326568245888,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2890625074505806,
"step": 10
},
{
"completion_length": 563.1927337646484,
"epoch": 0.03531300160513644,
"grad_norm": 0.220703125,
"kl": 0.00023816750763216987,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0,
"reward": 0.6119517982006073,
"reward_std": 0.3330337107181549,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3124726414680481,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2994791865348816,
"step": 11
},
{
"completion_length": 520.0416793823242,
"epoch": 0.038523274478330656,
"grad_norm": 0.2421875,
"kl": 0.0002197102876380086,
"learning_rate": 8e-07,
"loss": 0.0,
"reward": 0.608386904001236,
"reward_std": 0.3292866423726082,
"rewards/expression_based_accuracy_reward_length_penalized": 0.31997546553611755,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2884114608168602,
"step": 12
},
{
"completion_length": 564.2135543823242,
"epoch": 0.04173354735152488,
"grad_norm": 0.2158203125,
"kl": 0.00023579742264701054,
"learning_rate": 8.666666666666667e-07,
"loss": 0.0,
"reward": 0.6887014210224152,
"reward_std": 0.3306478410959244,
"rewards/expression_based_accuracy_reward_length_penalized": 0.40614935383200645,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2825520895421505,
"step": 13
},
{
"completion_length": 597.3411712646484,
"epoch": 0.0449438202247191,
"grad_norm": 0.18359375,
"kl": 0.00021818295135744847,
"learning_rate": 9.333333333333333e-07,
"loss": 0.0,
"reward": 0.5946466475725174,
"reward_std": 0.322207048535347,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3166518397629261,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2779947966337204,
"step": 14
},
{
"completion_length": 584.4505310058594,
"epoch": 0.048154093097913325,
"grad_norm": 0.17578125,
"kl": 0.00022104514937382191,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 0.5846492573618889,
"reward_std": 0.3315364196896553,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2994930259883404,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2851562574505806,
"step": 15
},
{
"completion_length": 554.7083587646484,
"epoch": 0.051364365971107544,
"grad_norm": 0.2119140625,
"kl": 0.00023191924265120178,
"learning_rate": 9.998781585307575e-07,
"loss": 0.0,
"reward": 0.6661794185638428,
"reward_std": 0.3503050282597542,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3647470995783806,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.301432304084301,
"step": 16
},
{
"completion_length": 580.8854522705078,
"epoch": 0.05457463884430177,
"grad_norm": 0.1826171875,
"kl": 0.0002030548857874237,
"learning_rate": 9.99512700102336e-07,
"loss": 0.0,
"reward": 0.6631067544221878,
"reward_std": 0.3135067969560623,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3707890138030052,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2923177182674408,
"step": 17
},
{
"completion_length": 580.8359527587891,
"epoch": 0.05778491171749599,
"grad_norm": 0.2001953125,
"kl": 0.0002304925255884882,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"reward": 0.651978924870491,
"reward_std": 0.36701615154743195,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3407810106873512,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.311197929084301,
"step": 18
},
{
"completion_length": 572.1041717529297,
"epoch": 0.060995184590690206,
"grad_norm": 0.2236328125,
"kl": 0.00021570282842731103,
"learning_rate": 9.98051855792412e-07,
"loss": 0.0,
"reward": 0.631376326084137,
"reward_std": 0.34789177030324936,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3279908671975136,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.303385429084301,
"step": 19
},
{
"completion_length": 569.4166717529297,
"epoch": 0.06420545746388442,
"grad_norm": 0.177734375,
"kl": 0.00021378670862759463,
"learning_rate": 9.969572609838744e-07,
"loss": 0.0,
"reward": 0.5896809697151184,
"reward_std": 0.3236342519521713,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2973632514476776,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2923177182674408,
"step": 20
},
{
"completion_length": 580.0417022705078,
"epoch": 0.06741573033707865,
"grad_norm": 0.1923828125,
"kl": 0.00023035979393171147,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0,
"reward": 0.6523794531822205,
"reward_std": 0.3744150176644325,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35159818083047867,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3007812574505806,
"step": 21
},
{
"completion_length": 587.8021087646484,
"epoch": 0.07062600321027288,
"grad_norm": 0.1962890625,
"kl": 0.0002475921137374826,
"learning_rate": 9.940426894506606e-07,
"loss": 0.0,
"reward": 0.6196304857730865,
"reward_std": 0.3361932933330536,
"rewards/expression_based_accuracy_reward_length_penalized": 0.32015133649110794,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2994791753590107,
"step": 22
},
{
"completion_length": 518.0104217529297,
"epoch": 0.0738362760834671,
"grad_norm": 0.1904296875,
"kl": 0.00022199605882633477,
"learning_rate": 9.922242910178859e-07,
"loss": 0.0,
"reward": 0.737170621752739,
"reward_std": 0.3162895292043686,
"rewards/expression_based_accuracy_reward_length_penalized": 0.44485291838645935,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2923177182674408,
"step": 23
},
{
"completion_length": 539.7265625,
"epoch": 0.07704654895666131,
"grad_norm": 0.1943359375,
"kl": 0.0002175298322981689,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0,
"reward": 0.7175260633230209,
"reward_std": 0.3508952334523201,
"rewards/expression_based_accuracy_reward_length_penalized": 0.40307293832302094,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3144531399011612,
"step": 24
},
{
"completion_length": 512.5755310058594,
"epoch": 0.08025682182985554,
"grad_norm": 0.224609375,
"kl": 0.0002242086047772318,
"learning_rate": 9.878701917609207e-07,
"loss": 0.0,
"reward": 0.6891498863697052,
"reward_std": 0.3474579304456711,
"rewards/expression_based_accuracy_reward_length_penalized": 0.38055606931447983,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3085937574505806,
"step": 25
},
{
"completion_length": 568.2630310058594,
"epoch": 0.08346709470304976,
"grad_norm": 0.20703125,
"kl": 0.0002301457461726386,
"learning_rate": 9.853368487582886e-07,
"loss": 0.0,
"reward": 0.6178770214319229,
"reward_std": 0.35181906819343567,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3281634747982025,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2897135466337204,
"step": 26
},
{
"completion_length": 538.7916946411133,
"epoch": 0.08667736757624397,
"grad_norm": 0.2041015625,
"kl": 0.00026182403962593526,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"reward": 0.7029251009225845,
"reward_std": 0.360026091337204,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39693548530340195,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3059896007180214,
"step": 27
},
{
"completion_length": 552.0781478881836,
"epoch": 0.0898876404494382,
"grad_norm": 0.19140625,
"kl": 0.00023814345331629738,
"learning_rate": 9.795644345114794e-07,
"loss": 0.0,
"reward": 0.7071576714515686,
"reward_std": 0.33075109869241714,
"rewards/expression_based_accuracy_reward_length_penalized": 0.4206993207335472,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2864583507180214,
"step": 28
},
{
"completion_length": 503.5703353881836,
"epoch": 0.09309791332263243,
"grad_norm": 0.2197265625,
"kl": 0.00023130706176743843,
"learning_rate": 9.76328489131448e-07,
"loss": 0.0,
"reward": 0.6565393060445786,
"reward_std": 0.2805866673588753,
"rewards/expression_based_accuracy_reward_length_penalized": 0.36096640676259995,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.295572929084301,
"step": 29
},
{
"completion_length": 532.4583435058594,
"epoch": 0.09630818619582665,
"grad_norm": 0.21484375,
"kl": 0.00023216806584969163,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"reward": 0.6117298901081085,
"reward_std": 0.32376599311828613,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3122507072985172,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.299479179084301,
"step": 30
},
{
"completion_length": 576.1484527587891,
"epoch": 0.09951845906902086,
"grad_norm": 0.1953125,
"kl": 0.00021910631767241284,
"learning_rate": 9.69165882516764e-07,
"loss": 0.0,
"reward": 0.6560553312301636,
"reward_std": 0.3462247848510742,
"rewards/expression_based_accuracy_reward_length_penalized": 0.37480536848306656,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2812500074505806,
"step": 31
},
{
"completion_length": 592.3385696411133,
"epoch": 0.10272873194221509,
"grad_norm": 0.1767578125,
"kl": 0.0002467254307703115,
"learning_rate": 9.65243099959949e-07,
"loss": 0.0,
"reward": 0.5856707692146301,
"reward_std": 0.31634171307086945,
"rewards/expression_based_accuracy_reward_length_penalized": 0.28033220022916794,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.305338554084301,
"step": 32
},
{
"completion_length": 583.9010620117188,
"epoch": 0.10593900481540931,
"grad_norm": 0.2265625,
"kl": 0.00024941361698438413,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0,
"reward": 0.6140669733285904,
"reward_std": 0.32649289071559906,
"rewards/expression_based_accuracy_reward_length_penalized": 0.327608622610569,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2864583358168602,
"step": 33
},
{
"completion_length": 538.0364685058594,
"epoch": 0.10914927768860354,
"grad_norm": 0.208984375,
"kl": 0.0002286795133841224,
"learning_rate": 9.567251964768342e-07,
"loss": 0.0,
"reward": 0.6336007714271545,
"reward_std": 0.32907337695360184,
"rewards/expression_based_accuracy_reward_length_penalized": 0.34258514642715454,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2910156399011612,
"step": 34
},
{
"completion_length": 507.6510543823242,
"epoch": 0.11235955056179775,
"grad_norm": 0.255859375,
"kl": 0.00024302997917402536,
"learning_rate": 9.521346881455354e-07,
"loss": 0.0,
"reward": 0.7129171043634415,
"reward_std": 0.35209202766418457,
"rewards/expression_based_accuracy_reward_length_penalized": 0.40757858008146286,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3053385615348816,
"step": 35
},
{
"completion_length": 584.4531555175781,
"epoch": 0.11556982343499198,
"grad_norm": 0.2138671875,
"kl": 0.00023655666518607177,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0,
"reward": 0.6754663735628128,
"reward_std": 0.33357472717761993,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3831486627459526,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2923177182674408,
"step": 36
},
{
"completion_length": 619.2396087646484,
"epoch": 0.1187800963081862,
"grad_norm": 0.1953125,
"kl": 0.0002523561015550513,
"learning_rate": 9.42302986163543e-07,
"loss": 0.0,
"reward": 0.6422896459698677,
"reward_std": 0.3401486799120903,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3831750750541687,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2591145858168602,
"step": 37
},
{
"completion_length": 632.1067962646484,
"epoch": 0.12199036918138041,
"grad_norm": 0.19140625,
"kl": 0.00025913729768944904,
"learning_rate": 9.370671165529144e-07,
"loss": 0.0,
"reward": 0.5953093469142914,
"reward_std": 0.33438971638679504,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3147103600203991,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2805989757180214,
"step": 38
},
{
"completion_length": 569.0026397705078,
"epoch": 0.12520064205457465,
"grad_norm": 0.208984375,
"kl": 0.0002631417410157155,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0,
"reward": 0.6718064844608307,
"reward_std": 0.3528323844075203,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3859991952776909,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2858073115348816,
"step": 39
},
{
"completion_length": 542.0260620117188,
"epoch": 0.12841091492776885,
"grad_norm": 0.2353515625,
"kl": 0.0002535913408792112,
"learning_rate": 9.259695151358214e-07,
"loss": 0.0,
"reward": 0.6311447024345398,
"reward_std": 0.3200613558292389,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3459884449839592,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2851562574505806,
"step": 40
},
{
"completion_length": 573.9167022705078,
"epoch": 0.13162118780096307,
"grad_norm": 0.1923828125,
"kl": 0.0002568592317402363,
"learning_rate": 9.20113792876298e-07,
"loss": 0.0,
"reward": 0.6579329371452332,
"reward_std": 0.33611204475164413,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3617089316248894,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2962239682674408,
"step": 41
},
{
"completion_length": 563.0416793823242,
"epoch": 0.1348314606741573,
"grad_norm": 0.205078125,
"kl": 0.00026875592448050156,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0,
"reward": 0.6627669483423233,
"reward_std": 0.3593166694045067,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3750064894556999,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2877604253590107,
"step": 42
},
{
"completion_length": 496.56251525878906,
"epoch": 0.13804173354735153,
"grad_norm": 0.2275390625,
"kl": 0.0002509369187464472,
"learning_rate": 9.078043584226815e-07,
"loss": 0.0,
"reward": 0.693062499165535,
"reward_std": 0.3470332473516464,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3753541484475136,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3177083432674408,
"step": 43
},
{
"completion_length": 589.6536712646484,
"epoch": 0.14125200642054575,
"grad_norm": 0.1884765625,
"kl": 0.0002775079774437472,
"learning_rate": 9.013573120044966e-07,
"loss": 0.0,
"reward": 0.5451524406671524,
"reward_std": 0.3420337289571762,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2665066123008728,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2786458432674408,
"step": 44
},
{
"completion_length": 515.5677261352539,
"epoch": 0.14446227929373998,
"grad_norm": 0.2431640625,
"kl": 0.00026737275038613006,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0,
"reward": 0.6689368337392807,
"reward_std": 0.3494330644607544,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3753170371055603,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2936197966337204,
"step": 45
},
{
"completion_length": 531.6041870117188,
"epoch": 0.1476725521669342,
"grad_norm": 0.2158203125,
"kl": 0.00027584553754422814,
"learning_rate": 8.878960148416747e-07,
"loss": 0.0,
"reward": 0.6247715353965759,
"reward_std": 0.3459451347589493,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3357090353965759,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2890625149011612,
"step": 46
},
{
"completion_length": 523.8359527587891,
"epoch": 0.1508828250401284,
"grad_norm": 0.2109375,
"kl": 0.0002594252800918184,
"learning_rate": 8.808890536269229e-07,
"loss": 0.0,
"reward": 0.6625895947217941,
"reward_std": 0.35964568704366684,
"rewards/expression_based_accuracy_reward_length_penalized": 0.36180833727121353,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3007812574505806,
"step": 47
},
{
"completion_length": 572.2395935058594,
"epoch": 0.15409309791332262,
"grad_norm": 0.2001953125,
"kl": 0.0002760118877631612,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0,
"reward": 0.6687695384025574,
"reward_std": 0.3379608243703842,
"rewards/expression_based_accuracy_reward_length_penalized": 0.36733726412057877,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.301432304084301,
"step": 48
},
{
"completion_length": 565.372428894043,
"epoch": 0.15730337078651685,
"grad_norm": 0.2109375,
"kl": 0.00026545282889856026,
"learning_rate": 8.663414758415478e-07,
"loss": 0.0,
"reward": 0.6460029184818268,
"reward_std": 0.33386022597551346,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3458726927638054,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3001302108168602,
"step": 49
},
{
"completion_length": 540.8411560058594,
"epoch": 0.16051364365971107,
"grad_norm": 0.2177734375,
"kl": 0.0002867219809559174,
"learning_rate": 8.588087370409302e-07,
"loss": 0.0,
"reward": 0.6432211250066757,
"reward_std": 0.35255035012960434,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3235596604645252,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3196614682674408,
"step": 50
},
{
"completion_length": 583.2864685058594,
"epoch": 0.1637239165329053,
"grad_norm": 0.220703125,
"kl": 0.0003001616059918888,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0,
"reward": 0.6296520233154297,
"reward_std": 0.3602987676858902,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3295218013226986,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3001302182674408,
"step": 51
},
{
"completion_length": 596.6302185058594,
"epoch": 0.16693418940609953,
"grad_norm": 0.2412109375,
"kl": 0.0002572698904259596,
"learning_rate": 8.432457529696548e-07,
"loss": 0.0,
"reward": 0.6288764774799347,
"reward_std": 0.3630865290760994,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3443712741136551,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2845052257180214,
"step": 52
},
{
"completion_length": 476.7343978881836,
"epoch": 0.17014446227929375,
"grad_norm": 0.23828125,
"kl": 0.0003045099292648956,
"learning_rate": 8.352239353294194e-07,
"loss": 0.0,
"reward": 0.6977786123752594,
"reward_std": 0.36942026019096375,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3748619332909584,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.322916679084301,
"step": 53
},
{
"completion_length": 594.0599060058594,
"epoch": 0.17335473515248795,
"grad_norm": 0.1982421875,
"kl": 0.0002901406696764752,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0,
"reward": 0.614113561809063,
"reward_std": 0.30325619876384735,
"rewards/expression_based_accuracy_reward_length_penalized": 0.30291564762592316,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3111979216337204,
"step": 54
},
{
"completion_length": 540.8698120117188,
"epoch": 0.17656500802568217,
"grad_norm": 0.2177734375,
"kl": 0.0002815077095874585,
"learning_rate": 8.187213662662538e-07,
"loss": 0.0,
"reward": 0.7013998925685883,
"reward_std": 0.345312163233757,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39671240001916885,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3046875074505806,
"step": 55
},
{
"completion_length": 551.1432342529297,
"epoch": 0.1797752808988764,
"grad_norm": 0.208984375,
"kl": 0.0003022913369932212,
"learning_rate": 8.102495512755938e-07,
"loss": 0.0,
"reward": 0.6621358841657639,
"reward_std": 0.3478364497423172,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35614627599716187,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3059896007180214,
"step": 56
},
{
"completion_length": 541.5338668823242,
"epoch": 0.18298555377207062,
"grad_norm": 0.251953125,
"kl": 0.00029883202660130337,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0,
"reward": 0.6321840733289719,
"reward_std": 0.3268617168068886,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3522360995411873,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2799479216337204,
"step": 57
},
{
"completion_length": 534.2135620117188,
"epoch": 0.18619582664526485,
"grad_norm": 0.2177734375,
"kl": 0.00031317536922870204,
"learning_rate": 7.928877960781808e-07,
"loss": 0.0,
"reward": 0.6300312578678131,
"reward_std": 0.31237364560365677,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3422708138823509,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2877604216337204,
"step": 58
},
{
"completion_length": 569.7265930175781,
"epoch": 0.18940609951845908,
"grad_norm": 0.1943359375,
"kl": 0.0002944675215985626,
"learning_rate": 7.840072575681468e-07,
"loss": 0.0,
"reward": 0.6045078411698341,
"reward_std": 0.33760548382997513,
"rewards/expression_based_accuracy_reward_length_penalized": 0.30958598107099533,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2949218824505806,
"step": 59
},
{
"completion_length": 546.7135467529297,
"epoch": 0.1926163723916533,
"grad_norm": 0.244140625,
"kl": 0.0003155921949655749,
"learning_rate": 7.75e-07,
"loss": 0.0,
"reward": 0.6555080115795135,
"reward_std": 0.32254888117313385,
"rewards/expression_based_accuracy_reward_length_penalized": 0.33779964968562126,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3177083432674408,
"step": 60
},
{
"completion_length": 544.5573120117188,
"epoch": 0.1958266452648475,
"grad_norm": 0.193359375,
"kl": 0.00029893887403886765,
"learning_rate": 7.658709009626109e-07,
"loss": 0.0,
"reward": 0.6744174212217331,
"reward_std": 0.33529237657785416,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3684278205037117,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3059895932674408,
"step": 61
},
{
"completion_length": 499.6224136352539,
"epoch": 0.19903691813804172,
"grad_norm": 0.2353515625,
"kl": 0.00032137856032932177,
"learning_rate": 7.566249040241553e-07,
"loss": 0.0,
"reward": 0.6523666083812714,
"reward_std": 0.32566210627555847,
"rewards/expression_based_accuracy_reward_length_penalized": 0.34898117184638977,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3033854216337204,
"step": 62
},
{
"completion_length": 578.7239837646484,
"epoch": 0.20224719101123595,
"grad_norm": 0.20703125,
"kl": 0.0003287481522420421,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0,
"reward": 0.634161502122879,
"reward_std": 0.34120889008045197,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3372865132987499,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2968750074505806,
"step": 63
},
{
"completion_length": 506.3177261352539,
"epoch": 0.20545746388443017,
"grad_norm": 0.2412109375,
"kl": 0.0003212923475075513,
"learning_rate": 7.37802304516818e-07,
"loss": 0.0,
"reward": 0.6933595240116119,
"reward_std": 0.3754495605826378,
"rewards/expression_based_accuracy_reward_length_penalized": 0.38085950165987015,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3125000074505806,
"step": 64
},
{
"completion_length": 581.5833587646484,
"epoch": 0.2086677367576244,
"grad_norm": 0.181640625,
"kl": 0.00029418900521704927,
"learning_rate": 7.282358947176205e-07,
"loss": 0.0,
"reward": 0.6189248859882355,
"reward_std": 0.33084874600172043,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3142373785376549,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3046875149011612,
"step": 65
},
{
"completion_length": 534.5729446411133,
"epoch": 0.21187800963081863,
"grad_norm": 0.224609375,
"kl": 0.00033117266866611317,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0,
"reward": 0.6608574390411377,
"reward_std": 0.31472062319517136,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3600761741399765,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3007812574505806,
"step": 66
},
{
"completion_length": 517.5755386352539,
"epoch": 0.21508828250401285,
"grad_norm": 0.23828125,
"kl": 0.00034513785067247227,
"learning_rate": 7.08818754121241e-07,
"loss": 0.0,
"reward": 0.6840898096561432,
"reward_std": 0.3518378511071205,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3644283339381218,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3196614682674408,
"step": 67
},
{
"completion_length": 534.2578353881836,
"epoch": 0.21829855537720708,
"grad_norm": 0.20703125,
"kl": 0.00032389759144280106,
"learning_rate": 6.989785380482312e-07,
"loss": 0.0,
"reward": 0.7169905304908752,
"reward_std": 0.3356803208589554,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39472493529319763,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3222656324505806,
"step": 68
},
{
"completion_length": 593.1797027587891,
"epoch": 0.22150882825040127,
"grad_norm": 0.205078125,
"kl": 0.00034336688258918,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0,
"reward": 0.6631477773189545,
"reward_std": 0.37854011356830597,
"rewards/expression_based_accuracy_reward_length_penalized": 0.34023110568523407,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.322916679084301,
"step": 69
},
{
"completion_length": 629.6015625,
"epoch": 0.2247191011235955,
"grad_norm": 0.185546875,
"kl": 0.00032993722561514005,
"learning_rate": 6.790614547199906e-07,
"loss": 0.0,
"reward": 0.5925078019499779,
"reward_std": 0.3088828846812248,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3125598691403866,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.279947929084301,
"step": 70
},
{
"completion_length": 559.5963897705078,
"epoch": 0.22792937399678972,
"grad_norm": 0.2255859375,
"kl": 0.0003137872990919277,
"learning_rate": 6.68995372916741e-07,
"loss": 0.0,
"reward": 0.7026459574699402,
"reward_std": 0.33306000381708145,
"rewards/expression_based_accuracy_reward_length_penalized": 0.392750084400177,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3098958432674408,
"step": 71
},
{
"completion_length": 494.39845275878906,
"epoch": 0.23113964686998395,
"grad_norm": 0.2275390625,
"kl": 0.0003587143437471241,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0,
"reward": 0.6391649395227432,
"reward_std": 0.3157573267817497,
"rewards/expression_based_accuracy_reward_length_penalized": 0.32080554217100143,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3183593824505806,
"step": 72
},
{
"completion_length": 583.8672027587891,
"epoch": 0.23434991974317818,
"grad_norm": 0.1875,
"kl": 0.0002944624357041903,
"learning_rate": 6.486753808845564e-07,
"loss": 0.0,
"reward": 0.6462114006280899,
"reward_std": 0.33720824867486954,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3584509789943695,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.287760429084301,
"step": 73
},
{
"completion_length": 509.7161560058594,
"epoch": 0.2375601926163724,
"grad_norm": 0.2333984375,
"kl": 0.00037064859498059377,
"learning_rate": 6.384324742897735e-07,
"loss": 0.0,
"reward": 0.6612931340932846,
"reward_std": 0.3572119027376175,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3448868505656719,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3164062649011612,
"step": 74
},
{
"completion_length": 549.5651245117188,
"epoch": 0.24077046548956663,
"grad_norm": 0.197265625,
"kl": 0.00032304248452419415,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0,
"reward": 0.6797159165143967,
"reward_std": 0.34857943654060364,
"rewards/expression_based_accuracy_reward_length_penalized": 0.36135654896497726,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3183593824505806,
"step": 75
},
{
"completion_length": 572.7578353881836,
"epoch": 0.24398073836276082,
"grad_norm": 0.2119140625,
"kl": 0.0003249031215091236,
"learning_rate": 6.178085705122674e-07,
"loss": 0.0,
"reward": 0.6995292603969574,
"reward_std": 0.3806586042046547,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3727063462138176,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3268229216337204,
"step": 76
},
{
"completion_length": 507.5078353881836,
"epoch": 0.24719101123595505,
"grad_norm": 0.255859375,
"kl": 0.0003559839096851647,
"learning_rate": 6.074387415372676e-07,
"loss": 0.0,
"reward": 0.7540216147899628,
"reward_std": 0.38066261261701584,
"rewards/expression_based_accuracy_reward_length_penalized": 0.43045392632484436,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3235677182674408,
"step": 77
},
{
"completion_length": 618.5390930175781,
"epoch": 0.2504012841091493,
"grad_norm": 0.220703125,
"kl": 0.0003840129793388769,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0,
"reward": 0.5318701416254044,
"reward_std": 0.35173140466213226,
"rewards/expression_based_accuracy_reward_length_penalized": 0.26559409499168396,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2662760466337204,
"step": 78
},
{
"completion_length": 514.5781555175781,
"epoch": 0.2536115569823435,
"grad_norm": 0.234375,
"kl": 0.00037649404839612544,
"learning_rate": 5.866114036005362e-07,
"loss": 0.0,
"reward": 0.677052691578865,
"reward_std": 0.36026471108198166,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35348496586084366,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3235677257180214,
"step": 79
},
{
"completion_length": 546.5338745117188,
"epoch": 0.2568218298555377,
"grad_norm": 0.19921875,
"kl": 0.0003384711453691125,
"learning_rate": 5.761651730097142e-07,
"loss": 0.0,
"reward": 0.6351290941238403,
"reward_std": 0.34162163734436035,
"rewards/expression_based_accuracy_reward_length_penalized": 0.34281135350465775,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2923177257180214,
"step": 80
},
{
"completion_length": 545.1432495117188,
"epoch": 0.26003210272873195,
"grad_norm": 0.2001953125,
"kl": 0.0003302170734968968,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0,
"reward": 0.7321957647800446,
"reward_std": 0.3832404538989067,
"rewards/expression_based_accuracy_reward_length_penalized": 0.42946138232946396,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3027343824505806,
"step": 81
},
{
"completion_length": 602.4036712646484,
"epoch": 0.26324237560192615,
"grad_norm": 0.169921875,
"kl": 0.0003239936995669268,
"learning_rate": 5.552358696106288e-07,
"loss": 0.0,
"reward": 0.6142081022262573,
"reward_std": 0.33728527277708054,
"rewards/expression_based_accuracy_reward_length_penalized": 0.31277579814195633,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.301432304084301,
"step": 82
},
{
"completion_length": 562.7057495117188,
"epoch": 0.2664526484751204,
"grad_norm": 0.240234375,
"kl": 0.00037012308894190937,
"learning_rate": 5.447641303893714e-07,
"loss": 0.0,
"reward": 0.6191717982292175,
"reward_std": 0.3545895963907242,
"rewards/expression_based_accuracy_reward_length_penalized": 0.31578636169433594,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.303385429084301,
"step": 83
},
{
"completion_length": 543.9427261352539,
"epoch": 0.2696629213483146,
"grad_norm": 0.2314453125,
"kl": 0.00037831455847481266,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0,
"reward": 0.7242841571569443,
"reward_std": 0.3670550063252449,
"rewards/expression_based_accuracy_reward_length_penalized": 0.4020185172557831,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3222656324505806,
"step": 84
},
{
"completion_length": 558.5781555175781,
"epoch": 0.27287319422150885,
"grad_norm": 0.22265625,
"kl": 0.00036308395647211,
"learning_rate": 5.238348269902859e-07,
"loss": 0.0,
"reward": 0.6587125062942505,
"reward_std": 0.36182229965925217,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3572801947593689,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.301432304084301,
"step": 85
},
{
"completion_length": 581.3099060058594,
"epoch": 0.27608346709470305,
"grad_norm": 0.23046875,
"kl": 0.00038044428947614506,
"learning_rate": 5.133885963994639e-07,
"loss": 0.0,
"reward": 0.6719960719347,
"reward_std": 0.3624914661049843,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3588450253009796,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.313151054084301,
"step": 86
},
{
"completion_length": 580.1745147705078,
"epoch": 0.27929373996789725,
"grad_norm": 0.1767578125,
"kl": 0.00034601552761159837,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0,
"reward": 0.6323724538087845,
"reward_std": 0.32785172015428543,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3433099538087845,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2890625074505806,
"step": 87
},
{
"completion_length": 530.0156326293945,
"epoch": 0.2825040128410915,
"grad_norm": 0.220703125,
"kl": 0.00040156069735530764,
"learning_rate": 4.925612584627324e-07,
"loss": 0.0,
"reward": 0.7260984629392624,
"reward_std": 0.38204891979694366,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3940671756863594,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.33203125,
"step": 88
},
{
"completion_length": 548.8020935058594,
"epoch": 0.2857142857142857,
"grad_norm": 0.248046875,
"kl": 0.0004189757601125166,
"learning_rate": 4.821914294877326e-07,
"loss": 0.0,
"reward": 0.6541236937046051,
"reward_std": 0.344281330704689,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3533423990011215,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3007812574505806,
"step": 89
},
{
"completion_length": 609.1432342529297,
"epoch": 0.28892455858747995,
"grad_norm": 0.1982421875,
"kl": 0.0003810434936895035,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0,
"reward": 0.5851198732852936,
"reward_std": 0.32441411167383194,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2947552725672722,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2903645858168602,
"step": 90
},
{
"completion_length": 574.8021087646484,
"epoch": 0.29213483146067415,
"grad_norm": 0.2255859375,
"kl": 0.0003523045379552059,
"learning_rate": 4.6156752571022637e-07,
"loss": 0.0,
"reward": 0.6396794319152832,
"reward_std": 0.33973030745983124,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3525700494647026,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.287109375,
"step": 91
},
{
"completion_length": 551.4505462646484,
"epoch": 0.2953451043338684,
"grad_norm": 0.1845703125,
"kl": 0.00035572806518757716,
"learning_rate": 4.513246191154434e-07,
"loss": 0.0,
"reward": 0.6876581907272339,
"reward_std": 0.3704243451356888,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3888300210237503,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2988281399011612,
"step": 92
},
{
"completion_length": 560.6718902587891,
"epoch": 0.2985553772070626,
"grad_norm": 0.2001953125,
"kl": 0.0003872549714287743,
"learning_rate": 4.4113514698014953e-07,
"loss": 0.0,
"reward": 0.6538409739732742,
"reward_std": 0.35449104756116867,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35175760090351105,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3020833432674408,
"step": 93
},
{
"completion_length": 607.7396087646484,
"epoch": 0.3017656500802568,
"grad_norm": 0.263671875,
"kl": 0.0003801950879278593,
"learning_rate": 4.3100462708325914e-07,
"loss": 0.0,
"reward": 0.5898270905017853,
"reward_std": 0.3407137244939804,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3066239655017853,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2832031287252903,
"step": 94
},
{
"completion_length": 496.19793701171875,
"epoch": 0.30497592295345105,
"grad_norm": 0.240234375,
"kl": 0.0003671470913104713,
"learning_rate": 4.209385452800095e-07,
"loss": 0.0,
"reward": 0.7184917479753494,
"reward_std": 0.3648832216858864,
"rewards/expression_based_accuracy_reward_length_penalized": 0.38646050542593,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3320312574505806,
"step": 95
},
{
"completion_length": 508.57554626464844,
"epoch": 0.30818619582664525,
"grad_norm": 0.2265625,
"kl": 0.00038119566306704655,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0,
"reward": 0.6568552851676941,
"reward_std": 0.3511122092604637,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3293813392519951,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3274739682674408,
"step": 96
},
{
"completion_length": 549.0312652587891,
"epoch": 0.3113964686998395,
"grad_norm": 0.21484375,
"kl": 0.0003632343214121647,
"learning_rate": 4.0102146195176887e-07,
"loss": 0.0,
"reward": 0.7204606682062149,
"reward_std": 0.3499609977006912,
"rewards/expression_based_accuracy_reward_length_penalized": 0.4001481980085373,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3203125074505806,
"step": 97
},
{
"completion_length": 490.3802261352539,
"epoch": 0.3146067415730337,
"grad_norm": 0.2255859375,
"kl": 0.00044602488924283534,
"learning_rate": 3.911812458787591e-07,
"loss": 0.0,
"reward": 0.6808420717716217,
"reward_std": 0.35114526003599167,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3728993684053421,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3079427182674408,
"step": 98
},
{
"completion_length": 524.2682342529297,
"epoch": 0.31781701444622795,
"grad_norm": 0.20703125,
"kl": 0.0003882949022226967,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.0,
"reward": 0.6448683142662048,
"reward_std": 0.3429142013192177,
"rewards/expression_based_accuracy_reward_length_penalized": 0.33367037773132324,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.311197929084301,
"step": 99
},
{
"completion_length": 557.1823043823242,
"epoch": 0.32102728731942215,
"grad_norm": 0.1884765625,
"kl": 0.00035858208866557106,
"learning_rate": 3.7176410528237945e-07,
"loss": 0.0,
"reward": 0.6761815696954727,
"reward_std": 0.3675583600997925,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35326486080884933,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.322916679084301,
"step": 100
},
{
"completion_length": 564.6927337646484,
"epoch": 0.32423756019261635,
"grad_norm": 0.22265625,
"kl": 0.00038343547930708155,
"learning_rate": 3.62197695483182e-07,
"loss": 0.0,
"reward": 0.6524051502346992,
"reward_std": 0.36947014927864075,
"rewards/expression_based_accuracy_reward_length_penalized": 0.34055614471435547,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3118489682674408,
"step": 101
},
{
"completion_length": 551.9453353881836,
"epoch": 0.3274478330658106,
"grad_norm": 0.2294921875,
"kl": 0.0003793273790506646,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0,
"reward": 0.6944572031497955,
"reward_std": 0.37888605892658234,
"rewards/expression_based_accuracy_reward_length_penalized": 0.37870199978351593,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3157552182674408,
"step": 102
},
{
"completion_length": 530.1771011352539,
"epoch": 0.3306581059390048,
"grad_norm": 0.22265625,
"kl": 0.00038907503767404705,
"learning_rate": 3.433750959758446e-07,
"loss": 0.0,
"reward": 0.6862371563911438,
"reward_std": 0.3600939214229584,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3555079624056816,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3307291716337204,
"step": 103
},
{
"completion_length": 504.72398376464844,
"epoch": 0.33386837881219905,
"grad_norm": 0.2265625,
"kl": 0.0004411861809785478,
"learning_rate": 3.3412909903738936e-07,
"loss": 0.0,
"reward": 0.7003691345453262,
"reward_std": 0.34579480439424515,
"rewards/expression_based_accuracy_reward_length_penalized": 0.38917120546102524,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.311197929084301,
"step": 104
},
{
"completion_length": 579.5859527587891,
"epoch": 0.33707865168539325,
"grad_norm": 0.2060546875,
"kl": 0.0003610364656196907,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0,
"reward": 0.7041359394788742,
"reward_std": 0.3546976149082184,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3883807212114334,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3157552182674408,
"step": 105
},
{
"completion_length": 504.9349136352539,
"epoch": 0.3402889245585875,
"grad_norm": 0.2294921875,
"kl": 0.0004345797060523182,
"learning_rate": 3.159927424318531e-07,
"loss": 0.0,
"reward": 0.7195965945720673,
"reward_std": 0.34991642087697983,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39863305538892746,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3209635466337204,
"step": 106
},
{
"completion_length": 521.9349060058594,
"epoch": 0.3434991974317817,
"grad_norm": 0.2333984375,
"kl": 0.0004348123256932013,
"learning_rate": 3.0711220392181934e-07,
"loss": 0.0,
"reward": 0.5767635926604271,
"reward_std": 0.3495699018239975,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2896541878581047,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2871093824505806,
"step": 107
},
{
"completion_length": 578.0781402587891,
"epoch": 0.3467094703049759,
"grad_norm": 0.26171875,
"kl": 0.0003971747573814355,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0,
"reward": 0.5868955999612808,
"reward_std": 0.3408031612634659,
"rewards/expression_based_accuracy_reward_length_penalized": 0.29587996006011963,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2910156324505806,
"step": 108
},
{
"completion_length": 558.0286712646484,
"epoch": 0.34991974317817015,
"grad_norm": 0.1845703125,
"kl": 0.00036870845360681415,
"learning_rate": 2.897504487244061e-07,
"loss": 0.0,
"reward": 0.6787082105875015,
"reward_std": 0.3448420986533165,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3570936322212219,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3216145932674408,
"step": 109
},
{
"completion_length": 547.6562652587891,
"epoch": 0.35313001605136435,
"grad_norm": 0.2158203125,
"kl": 0.00039495840610470623,
"learning_rate": 2.812786337337463e-07,
"loss": 0.0,
"reward": 0.5997674912214279,
"reward_std": 0.32131277769804,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3054966703057289,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2942708432674408,
"step": 110
},
{
"completion_length": 552.3463745117188,
"epoch": 0.3563402889245586,
"grad_norm": 0.2578125,
"kl": 0.00039361264498438686,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0,
"reward": 0.5880802720785141,
"reward_std": 0.34414373338222504,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2827417254447937,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.305338554084301,
"step": 111
},
{
"completion_length": 542.1927185058594,
"epoch": 0.3595505617977528,
"grad_norm": 0.1904296875,
"kl": 0.0003947726945625618,
"learning_rate": 2.6477606467058035e-07,
"loss": 0.0,
"reward": 0.6639807671308517,
"reward_std": 0.3379776254296303,
"rewards/expression_based_accuracy_reward_length_penalized": 0.32934536039829254,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.334635429084301,
"step": 112
},
{
"completion_length": 550.3698043823242,
"epoch": 0.36276083467094705,
"grad_norm": 0.2119140625,
"kl": 0.00041885858081514016,
"learning_rate": 2.567542470303452e-07,
"loss": 0.0,
"reward": 0.6352178752422333,
"reward_std": 0.3331167697906494,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3402960002422333,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2949218824505806,
"step": 113
},
{
"completion_length": 520.3073120117188,
"epoch": 0.36597110754414125,
"grad_norm": 0.2158203125,
"kl": 0.00037509016692638397,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0,
"reward": 0.6496723890304565,
"reward_std": 0.36061549186706543,
"rewards/expression_based_accuracy_reward_length_penalized": 0.33131300657987595,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3183593824505806,
"step": 114
},
{
"completion_length": 545.2890701293945,
"epoch": 0.36918138041733545,
"grad_norm": 0.2412109375,
"kl": 0.00040495285793440416,
"learning_rate": 2.411912629590699e-07,
"loss": 0.0,
"reward": 0.6173844560980797,
"reward_std": 0.3021947294473648,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3328792154788971,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2845052108168602,
"step": 115
},
{
"completion_length": 597.0130462646484,
"epoch": 0.3723916532905297,
"grad_norm": 0.205078125,
"kl": 0.0003835263050859794,
"learning_rate": 2.336585241584522e-07,
"loss": 0.0,
"reward": 0.6083859652280807,
"reward_std": 0.34971795231103897,
"rewards/expression_based_accuracy_reward_length_penalized": 0.31671928614377975,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2916666716337204,
"step": 116
},
{
"completion_length": 505.15106201171875,
"epoch": 0.3756019261637239,
"grad_norm": 0.259765625,
"kl": 0.0004204789365758188,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0,
"reward": 0.6160649359226227,
"reward_std": 0.3238491714000702,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3263513892889023,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2897135466337204,
"step": 117
},
{
"completion_length": 567.8411712646484,
"epoch": 0.37881219903691815,
"grad_norm": 0.197265625,
"kl": 0.0003820292549789883,
"learning_rate": 2.1911094637307714e-07,
"loss": 0.0,
"reward": 0.5847776532173157,
"reward_std": 0.33124052733182907,
"rewards/expression_based_accuracy_reward_length_penalized": 0.2970172315835953,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2877604216337204,
"step": 118
},
{
"completion_length": 539.3724136352539,
"epoch": 0.38202247191011235,
"grad_norm": 0.212890625,
"kl": 0.0003783565916819498,
"learning_rate": 2.1210398515832536e-07,
"loss": 0.0,
"reward": 0.7074552923440933,
"reward_std": 0.33786971867084503,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3910490423440933,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3164062574505806,
"step": 119
},
{
"completion_length": 578.6484527587891,
"epoch": 0.3852327447833066,
"grad_norm": 0.208984375,
"kl": 0.00036553355312207714,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0,
"reward": 0.6493179947137833,
"reward_std": 0.35857032984495163,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35374507308006287,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.295572929084301,
"step": 120
},
{
"completion_length": 516.8437576293945,
"epoch": 0.3884430176565008,
"grad_norm": 0.2373046875,
"kl": 0.00045376412163022906,
"learning_rate": 1.986426879955034e-07,
"loss": 0.0,
"reward": 0.684567391872406,
"reward_std": 0.3590117618441582,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3818329870700836,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3027343824505806,
"step": 121
},
{
"completion_length": 551.9687652587891,
"epoch": 0.391653290529695,
"grad_norm": 0.193359375,
"kl": 0.0003975575600634329,
"learning_rate": 1.9219564157731844e-07,
"loss": 0.0,
"reward": 0.6631377786397934,
"reward_std": 0.377517007291317,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3408721387386322,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3222656399011612,
"step": 122
},
{
"completion_length": 525.5026245117188,
"epoch": 0.39486356340288925,
"grad_norm": 0.21875,
"kl": 0.00042099927668459713,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0,
"reward": 0.7239128798246384,
"reward_std": 0.35999199748039246,
"rewards/expression_based_accuracy_reward_length_penalized": 0.40815767645835876,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3157552182674408,
"step": 123
},
{
"completion_length": 523.8567886352539,
"epoch": 0.39807383627608345,
"grad_norm": 0.318359375,
"kl": 0.00044889742275699973,
"learning_rate": 1.7988620712370195e-07,
"loss": 0.0,
"reward": 0.716105192899704,
"reward_std": 0.345996156334877,
"rewards/expression_based_accuracy_reward_length_penalized": 0.4315999895334244,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2845052182674408,
"step": 124
},
{
"completion_length": 522.7838668823242,
"epoch": 0.4012841091492777,
"grad_norm": 0.255859375,
"kl": 0.00039373226172756404,
"learning_rate": 1.7403048486417868e-07,
"loss": 0.0,
"reward": 0.6855793744325638,
"reward_std": 0.3608446344733238,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3704752177000046,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.315104179084301,
"step": 125
},
{
"completion_length": 557.4557495117188,
"epoch": 0.4044943820224719,
"grad_norm": 0.2060546875,
"kl": 0.00039951602957444265,
"learning_rate": 1.6837835672960831e-07,
"loss": 0.0,
"reward": 0.5974871069192886,
"reward_std": 0.3423160910606384,
"rewards/expression_based_accuracy_reward_length_penalized": 0.27131520584225655,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3261718824505806,
"step": 126
},
{
"completion_length": 578.3697967529297,
"epoch": 0.40770465489566615,
"grad_norm": 0.2041015625,
"kl": 0.00037851801607757807,
"learning_rate": 1.6293288344708566e-07,
"loss": 0.0,
"reward": 0.633305624127388,
"reward_std": 0.372529074549675,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3253629058599472,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3079427108168602,
"step": 127
},
{
"completion_length": 535.1666870117188,
"epoch": 0.41091492776886035,
"grad_norm": 0.21484375,
"kl": 0.0003694754414027557,
"learning_rate": 1.5769701383645698e-07,
"loss": 0.0,
"reward": 0.6848493814468384,
"reward_std": 0.344666950404644,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3814639300107956,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3033854216337204,
"step": 128
},
{
"completion_length": 513.1224060058594,
"epoch": 0.41412520064205455,
"grad_norm": 0.19921875,
"kl": 0.0003918400325346738,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0,
"reward": 0.6448424756526947,
"reward_std": 0.3401818424463272,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3212747722864151,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3235677257180214,
"step": 129
},
{
"completion_length": 541.6172027587891,
"epoch": 0.4173354735152488,
"grad_norm": 0.2060546875,
"kl": 0.0003936137800337747,
"learning_rate": 1.4786531185446452e-07,
"loss": 0.0,
"reward": 0.583847850561142,
"reward_std": 0.33960337191820145,
"rewards/expression_based_accuracy_reward_length_penalized": 0.27004576474428177,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3138020858168602,
"step": 130
},
{
"completion_length": 533.5599136352539,
"epoch": 0.420545746388443,
"grad_norm": 0.2197265625,
"kl": 0.00039682938950136304,
"learning_rate": 1.432748035231658e-07,
"loss": 0.0,
"reward": 0.6769755631685257,
"reward_std": 0.3399392068386078,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3683818504214287,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3085937649011612,
"step": 131
},
{
"completion_length": 523.9271087646484,
"epoch": 0.42375601926163725,
"grad_norm": 0.2265625,
"kl": 0.00040404664468951523,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0,
"reward": 0.6483045816421509,
"reward_std": 0.32681532204151154,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3390597552061081,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.309244804084301,
"step": 132
},
{
"completion_length": 471.9661636352539,
"epoch": 0.42696629213483145,
"grad_norm": 0.24609375,
"kl": 0.00040609255665913224,
"learning_rate": 1.3475690004005097e-07,
"loss": 0.0,
"reward": 0.7119551748037338,
"reward_std": 0.34096624702215195,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39619994908571243,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3157552182674408,
"step": 133
},
{
"completion_length": 565.4192962646484,
"epoch": 0.4301765650080257,
"grad_norm": 0.205078125,
"kl": 0.00037678072112612426,
"learning_rate": 1.308341174832359e-07,
"loss": 0.0,
"reward": 0.6749380528926849,
"reward_std": 0.3803337290883064,
"rewards/expression_based_accuracy_reward_length_penalized": 0.37480782717466354,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3001302182674408,
"step": 134
},
{
"completion_length": 496.27345275878906,
"epoch": 0.4333868378812199,
"grad_norm": 0.27734375,
"kl": 0.0004564332193695009,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0,
"reward": 0.7301954329013824,
"reward_std": 0.3306322991847992,
"rewards/expression_based_accuracy_reward_length_penalized": 0.41378918290138245,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3164062649011612,
"step": 135
},
{
"completion_length": 530.8125228881836,
"epoch": 0.43659711075441415,
"grad_norm": 0.205078125,
"kl": 0.0003717996005434543,
"learning_rate": 1.2367151086855187e-07,
"loss": 0.0,
"reward": 0.6495877057313919,
"reward_std": 0.3487004414200783,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3253689482808113,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3242187649011612,
"step": 136
},
{
"completion_length": 554.2031402587891,
"epoch": 0.43980738362760835,
"grad_norm": 0.2109375,
"kl": 0.0003636257752077654,
"learning_rate": 1.2043556548852063e-07,
"loss": 0.0,
"reward": 0.5865623354911804,
"reward_std": 0.30131980776786804,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3040102533996105,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2825520932674408,
"step": 137
},
{
"completion_length": 545.6093826293945,
"epoch": 0.44301765650080255,
"grad_norm": 0.22265625,
"kl": 0.00041512529423926026,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0,
"reward": 0.6579451262950897,
"reward_std": 0.3593253716826439,
"rewards/expression_based_accuracy_reward_length_penalized": 0.36367426812648773,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2942708432674408,
"step": 138
},
{
"completion_length": 573.9088745117188,
"epoch": 0.4462279293739968,
"grad_norm": 0.1982421875,
"kl": 0.0003498100923025049,
"learning_rate": 1.1466315124171128e-07,
"loss": 0.0,
"reward": 0.6012589037418365,
"reward_std": 0.34214527904987335,
"rewards/expression_based_accuracy_reward_length_penalized": 0.31414950639009476,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2871093899011612,
"step": 139
},
{
"completion_length": 546.6432495117188,
"epoch": 0.449438202247191,
"grad_norm": 0.21875,
"kl": 0.0004052919539390132,
"learning_rate": 1.1212980823907929e-07,
"loss": 0.0,
"reward": 0.63412706553936,
"reward_std": 0.36361514031887054,
"rewards/expression_based_accuracy_reward_length_penalized": 0.33920522779226303,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2949218899011612,
"step": 140
},
{
"completion_length": 547.6432495117188,
"epoch": 0.45264847512038525,
"grad_norm": 0.2001953125,
"kl": 0.0003856433249893598,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0,
"reward": 0.7091409862041473,
"reward_std": 0.3494722992181778,
"rewards/expression_based_accuracy_reward_length_penalized": 0.37906285375356674,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3300781324505806,
"step": 141
},
{
"completion_length": 562.7890777587891,
"epoch": 0.45585874799357945,
"grad_norm": 0.28515625,
"kl": 0.0004331854870542884,
"learning_rate": 1.0777570898211405e-07,
"loss": 0.0,
"reward": 0.677094116806984,
"reward_std": 0.36977435648441315,
"rewards/expression_based_accuracy_reward_length_penalized": 0.35808368027210236,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.319010429084301,
"step": 142
},
{
"completion_length": 514.6145858764648,
"epoch": 0.4590690208667737,
"grad_norm": 0.26171875,
"kl": 0.0004561090827337466,
"learning_rate": 1.0595731054933934e-07,
"loss": 0.0,
"reward": 0.7047944366931915,
"reward_std": 0.3853035420179367,
"rewards/expression_based_accuracy_reward_length_penalized": 0.39815381169319153,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3066406399011612,
"step": 143
},
{
"completion_length": 515.0520858764648,
"epoch": 0.4622792937399679,
"grad_norm": 0.228515625,
"kl": 0.00042895031219813973,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0,
"reward": 0.687195435166359,
"reward_std": 0.39286451041698456,
"rewards/expression_based_accuracy_reward_length_penalized": 0.37925272434949875,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3079427257180214,
"step": 144
},
{
"completion_length": 532.7969055175781,
"epoch": 0.4654895666131621,
"grad_norm": 0.212890625,
"kl": 0.00040866951167117804,
"learning_rate": 1.0304273901612565e-07,
"loss": 0.0,
"reward": 0.7079404592514038,
"reward_std": 0.3612729534506798,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3934873268008232,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3144531324505806,
"step": 145
},
{
"completion_length": 552.0989837646484,
"epoch": 0.46869983948635635,
"grad_norm": 0.2158203125,
"kl": 0.0003939080925192684,
"learning_rate": 1.0194814420758804e-07,
"loss": 0.0,
"reward": 0.6515837609767914,
"reward_std": 0.3383214473724365,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3384326733648777,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3131510466337204,
"step": 146
},
{
"completion_length": 595.5234527587891,
"epoch": 0.47191011235955055,
"grad_norm": 0.2099609375,
"kl": 0.0003522088081808761,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0,
"reward": 0.5905841588973999,
"reward_std": 0.36369770765304565,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3151935264468193,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.2753906287252903,
"step": 147
},
{
"completion_length": 493.57032012939453,
"epoch": 0.4751203852327448,
"grad_norm": 0.228515625,
"kl": 0.0003881813900079578,
"learning_rate": 1.0048729989766394e-07,
"loss": 0.0,
"reward": 0.7446072101593018,
"reward_std": 0.37431684136390686,
"rewards/expression_based_accuracy_reward_length_penalized": 0.42234158515930176,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3222656399011612,
"step": 148
},
{
"completion_length": 543.2708587646484,
"epoch": 0.478330658105939,
"grad_norm": 0.1953125,
"kl": 0.00041060569492401555,
"learning_rate": 1.0012184146924223e-07,
"loss": 0.0,
"reward": 0.6233467310667038,
"reward_std": 0.3531793877482414,
"rewards/expression_based_accuracy_reward_length_penalized": 0.3141019344329834,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3092447966337204,
"step": 149
},
{
"completion_length": 487.1823043823242,
"epoch": 0.48154093097913325,
"grad_norm": 0.28515625,
"kl": 0.0004451891945791431,
"learning_rate": 1e-07,
"loss": 0.0,
"reward": 0.7395021021366119,
"reward_std": 0.35496869683265686,
"rewards/expression_based_accuracy_reward_length_penalized": 0.41202811151742935,
"rewards/format_reward": 0.0,
"rewards/soft_format_reward": 0.0,
"rewards/tag_count_reward": 0.3274739682674408,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 150,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}