| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.48154093097913325, |
| "eval_steps": 500, |
| "global_step": 150, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 560.3958587646484, |
| "epoch": 0.0032102728731942215, |
| "grad_norm": 0.1884765625, |
| "kl": 0.0, |
| "learning_rate": 6.666666666666667e-08, |
| "loss": 0.0, |
| "reward": 0.6299200654029846, |
| "reward_std": 0.34568188339471817, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3343471363186836, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2955729365348816, |
| "step": 1 |
| }, |
| { |
| "completion_length": 574.9948120117188, |
| "epoch": 0.006420545746388443, |
| "grad_norm": 0.20703125, |
| "kl": 0.0, |
| "learning_rate": 1.3333333333333334e-07, |
| "loss": 0.0, |
| "reward": 0.6667226850986481, |
| "reward_std": 0.3381393700838089, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3724518120288849, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2942708432674408, |
| "step": 2 |
| }, |
| { |
| "completion_length": 597.8541870117188, |
| "epoch": 0.009630818619582664, |
| "grad_norm": 0.185546875, |
| "kl": 0.00022509081827593036, |
| "learning_rate": 2e-07, |
| "loss": 0.0, |
| "reward": 0.636066347360611, |
| "reward_std": 0.35888948291540146, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.33658717572689056, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.299479179084301, |
| "step": 3 |
| }, |
| { |
| "completion_length": 568.2656555175781, |
| "epoch": 0.012841091492776886, |
| "grad_norm": 0.2412109375, |
| "kl": 0.00023551580670755357, |
| "learning_rate": 2.6666666666666667e-07, |
| "loss": 0.0, |
| "reward": 0.6489640921354294, |
| "reward_std": 0.344046413898468, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3677141070365906, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2812500149011612, |
| "step": 4 |
| }, |
| { |
| "completion_length": 637.7135620117188, |
| "epoch": 0.016051364365971106, |
| "grad_norm": 0.169921875, |
| "kl": 0.00023702834732830524, |
| "learning_rate": 3.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.6146693080663681, |
| "reward_std": 0.35021649301052094, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3256068006157875, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2890625037252903, |
| "step": 5 |
| }, |
| { |
| "completion_length": 607.0260620117188, |
| "epoch": 0.019261637239165328, |
| "grad_norm": 0.2080078125, |
| "kl": 0.00022915955560165457, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "reward": 0.5901845693588257, |
| "reward_std": 0.3410582020878792, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3063303604722023, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.283854179084301, |
| "step": 6 |
| }, |
| { |
| "completion_length": 536.9713745117188, |
| "epoch": 0.02247191011235955, |
| "grad_norm": 0.20703125, |
| "kl": 0.00023870709992479533, |
| "learning_rate": 4.6666666666666666e-07, |
| "loss": 0.0, |
| "reward": 0.5996608734130859, |
| "reward_std": 0.3273390009999275, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.29497333616018295, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3046875074505806, |
| "step": 7 |
| }, |
| { |
| "completion_length": 579.4062652587891, |
| "epoch": 0.025682182985553772, |
| "grad_norm": 0.193359375, |
| "kl": 0.0002171014821215067, |
| "learning_rate": 5.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.6376358270645142, |
| "reward_std": 0.34004897624254227, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.355083703994751, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2825520932674408, |
| "step": 8 |
| }, |
| { |
| "completion_length": 546.3750228881836, |
| "epoch": 0.028892455858747994, |
| "grad_norm": 0.212890625, |
| "kl": 0.00022866667859489098, |
| "learning_rate": 6e-07, |
| "loss": 0.0, |
| "reward": 0.6513122767210007, |
| "reward_std": 0.32241296768188477, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3576924651861191, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2936197966337204, |
| "step": 9 |
| }, |
| { |
| "completion_length": 573.7396087646484, |
| "epoch": 0.03210272873194221, |
| "grad_norm": 0.1904296875, |
| "kl": 0.0002470466679369565, |
| "learning_rate": 6.666666666666666e-07, |
| "loss": 0.0, |
| "reward": 0.679995134472847, |
| "reward_std": 0.3322247415781021, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3909326568245888, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2890625074505806, |
| "step": 10 |
| }, |
| { |
| "completion_length": 563.1927337646484, |
| "epoch": 0.03531300160513644, |
| "grad_norm": 0.220703125, |
| "kl": 0.00023816750763216987, |
| "learning_rate": 7.333333333333332e-07, |
| "loss": 0.0, |
| "reward": 0.6119517982006073, |
| "reward_std": 0.3330337107181549, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3124726414680481, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2994791865348816, |
| "step": 11 |
| }, |
| { |
| "completion_length": 520.0416793823242, |
| "epoch": 0.038523274478330656, |
| "grad_norm": 0.2421875, |
| "kl": 0.0002197102876380086, |
| "learning_rate": 8e-07, |
| "loss": 0.0, |
| "reward": 0.608386904001236, |
| "reward_std": 0.3292866423726082, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.31997546553611755, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2884114608168602, |
| "step": 12 |
| }, |
| { |
| "completion_length": 564.2135543823242, |
| "epoch": 0.04173354735152488, |
| "grad_norm": 0.2158203125, |
| "kl": 0.00023579742264701054, |
| "learning_rate": 8.666666666666667e-07, |
| "loss": 0.0, |
| "reward": 0.6887014210224152, |
| "reward_std": 0.3306478410959244, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.40614935383200645, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2825520895421505, |
| "step": 13 |
| }, |
| { |
| "completion_length": 597.3411712646484, |
| "epoch": 0.0449438202247191, |
| "grad_norm": 0.18359375, |
| "kl": 0.00021818295135744847, |
| "learning_rate": 9.333333333333333e-07, |
| "loss": 0.0, |
| "reward": 0.5946466475725174, |
| "reward_std": 0.322207048535347, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3166518397629261, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2779947966337204, |
| "step": 14 |
| }, |
| { |
| "completion_length": 584.4505310058594, |
| "epoch": 0.048154093097913325, |
| "grad_norm": 0.17578125, |
| "kl": 0.00022104514937382191, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 0.5846492573618889, |
| "reward_std": 0.3315364196896553, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2994930259883404, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2851562574505806, |
| "step": 15 |
| }, |
| { |
| "completion_length": 554.7083587646484, |
| "epoch": 0.051364365971107544, |
| "grad_norm": 0.2119140625, |
| "kl": 0.00023191924265120178, |
| "learning_rate": 9.998781585307575e-07, |
| "loss": 0.0, |
| "reward": 0.6661794185638428, |
| "reward_std": 0.3503050282597542, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3647470995783806, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.301432304084301, |
| "step": 16 |
| }, |
| { |
| "completion_length": 580.8854522705078, |
| "epoch": 0.05457463884430177, |
| "grad_norm": 0.1826171875, |
| "kl": 0.0002030548857874237, |
| "learning_rate": 9.99512700102336e-07, |
| "loss": 0.0, |
| "reward": 0.6631067544221878, |
| "reward_std": 0.3135067969560623, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3707890138030052, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2923177182674408, |
| "step": 17 |
| }, |
| { |
| "completion_length": 580.8359527587891, |
| "epoch": 0.05778491171749599, |
| "grad_norm": 0.2001953125, |
| "kl": 0.0002304925255884882, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0, |
| "reward": 0.651978924870491, |
| "reward_std": 0.36701615154743195, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3407810106873512, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.311197929084301, |
| "step": 18 |
| }, |
| { |
| "completion_length": 572.1041717529297, |
| "epoch": 0.060995184590690206, |
| "grad_norm": 0.2236328125, |
| "kl": 0.00021570282842731103, |
| "learning_rate": 9.98051855792412e-07, |
| "loss": 0.0, |
| "reward": 0.631376326084137, |
| "reward_std": 0.34789177030324936, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3279908671975136, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.303385429084301, |
| "step": 19 |
| }, |
| { |
| "completion_length": 569.4166717529297, |
| "epoch": 0.06420545746388442, |
| "grad_norm": 0.177734375, |
| "kl": 0.00021378670862759463, |
| "learning_rate": 9.969572609838744e-07, |
| "loss": 0.0, |
| "reward": 0.5896809697151184, |
| "reward_std": 0.3236342519521713, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2973632514476776, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2923177182674408, |
| "step": 20 |
| }, |
| { |
| "completion_length": 580.0417022705078, |
| "epoch": 0.06741573033707865, |
| "grad_norm": 0.1923828125, |
| "kl": 0.00023035979393171147, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0, |
| "reward": 0.6523794531822205, |
| "reward_std": 0.3744150176644325, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35159818083047867, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3007812574505806, |
| "step": 21 |
| }, |
| { |
| "completion_length": 587.8021087646484, |
| "epoch": 0.07062600321027288, |
| "grad_norm": 0.1962890625, |
| "kl": 0.0002475921137374826, |
| "learning_rate": 9.940426894506606e-07, |
| "loss": 0.0, |
| "reward": 0.6196304857730865, |
| "reward_std": 0.3361932933330536, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.32015133649110794, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2994791753590107, |
| "step": 22 |
| }, |
| { |
| "completion_length": 518.0104217529297, |
| "epoch": 0.0738362760834671, |
| "grad_norm": 0.1904296875, |
| "kl": 0.00022199605882633477, |
| "learning_rate": 9.922242910178859e-07, |
| "loss": 0.0, |
| "reward": 0.737170621752739, |
| "reward_std": 0.3162895292043686, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.44485291838645935, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2923177182674408, |
| "step": 23 |
| }, |
| { |
| "completion_length": 539.7265625, |
| "epoch": 0.07704654895666131, |
| "grad_norm": 0.1943359375, |
| "kl": 0.0002175298322981689, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0, |
| "reward": 0.7175260633230209, |
| "reward_std": 0.3508952334523201, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.40307293832302094, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3144531399011612, |
| "step": 24 |
| }, |
| { |
| "completion_length": 512.5755310058594, |
| "epoch": 0.08025682182985554, |
| "grad_norm": 0.224609375, |
| "kl": 0.0002242086047772318, |
| "learning_rate": 9.878701917609207e-07, |
| "loss": 0.0, |
| "reward": 0.6891498863697052, |
| "reward_std": 0.3474579304456711, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.38055606931447983, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3085937574505806, |
| "step": 25 |
| }, |
| { |
| "completion_length": 568.2630310058594, |
| "epoch": 0.08346709470304976, |
| "grad_norm": 0.20703125, |
| "kl": 0.0002301457461726386, |
| "learning_rate": 9.853368487582886e-07, |
| "loss": 0.0, |
| "reward": 0.6178770214319229, |
| "reward_std": 0.35181906819343567, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3281634747982025, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2897135466337204, |
| "step": 26 |
| }, |
| { |
| "completion_length": 538.7916946411133, |
| "epoch": 0.08667736757624397, |
| "grad_norm": 0.2041015625, |
| "kl": 0.00026182403962593526, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0, |
| "reward": 0.7029251009225845, |
| "reward_std": 0.360026091337204, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39693548530340195, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3059896007180214, |
| "step": 27 |
| }, |
| { |
| "completion_length": 552.0781478881836, |
| "epoch": 0.0898876404494382, |
| "grad_norm": 0.19140625, |
| "kl": 0.00023814345331629738, |
| "learning_rate": 9.795644345114794e-07, |
| "loss": 0.0, |
| "reward": 0.7071576714515686, |
| "reward_std": 0.33075109869241714, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.4206993207335472, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2864583507180214, |
| "step": 28 |
| }, |
| { |
| "completion_length": 503.5703353881836, |
| "epoch": 0.09309791332263243, |
| "grad_norm": 0.2197265625, |
| "kl": 0.00023130706176743843, |
| "learning_rate": 9.76328489131448e-07, |
| "loss": 0.0, |
| "reward": 0.6565393060445786, |
| "reward_std": 0.2805866673588753, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.36096640676259995, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.295572929084301, |
| "step": 29 |
| }, |
| { |
| "completion_length": 532.4583435058594, |
| "epoch": 0.09630818619582665, |
| "grad_norm": 0.21484375, |
| "kl": 0.00023216806584969163, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0, |
| "reward": 0.6117298901081085, |
| "reward_std": 0.32376599311828613, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3122507072985172, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.299479179084301, |
| "step": 30 |
| }, |
| { |
| "completion_length": 576.1484527587891, |
| "epoch": 0.09951845906902086, |
| "grad_norm": 0.1953125, |
| "kl": 0.00021910631767241284, |
| "learning_rate": 9.69165882516764e-07, |
| "loss": 0.0, |
| "reward": 0.6560553312301636, |
| "reward_std": 0.3462247848510742, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.37480536848306656, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2812500074505806, |
| "step": 31 |
| }, |
| { |
| "completion_length": 592.3385696411133, |
| "epoch": 0.10272873194221509, |
| "grad_norm": 0.1767578125, |
| "kl": 0.0002467254307703115, |
| "learning_rate": 9.65243099959949e-07, |
| "loss": 0.0, |
| "reward": 0.5856707692146301, |
| "reward_std": 0.31634171307086945, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.28033220022916794, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.305338554084301, |
| "step": 32 |
| }, |
| { |
| "completion_length": 583.9010620117188, |
| "epoch": 0.10593900481540931, |
| "grad_norm": 0.2265625, |
| "kl": 0.00024941361698438413, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0, |
| "reward": 0.6140669733285904, |
| "reward_std": 0.32649289071559906, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.327608622610569, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2864583358168602, |
| "step": 33 |
| }, |
| { |
| "completion_length": 538.0364685058594, |
| "epoch": 0.10914927768860354, |
| "grad_norm": 0.208984375, |
| "kl": 0.0002286795133841224, |
| "learning_rate": 9.567251964768342e-07, |
| "loss": 0.0, |
| "reward": 0.6336007714271545, |
| "reward_std": 0.32907337695360184, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.34258514642715454, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2910156399011612, |
| "step": 34 |
| }, |
| { |
| "completion_length": 507.6510543823242, |
| "epoch": 0.11235955056179775, |
| "grad_norm": 0.255859375, |
| "kl": 0.00024302997917402536, |
| "learning_rate": 9.521346881455354e-07, |
| "loss": 0.0, |
| "reward": 0.7129171043634415, |
| "reward_std": 0.35209202766418457, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.40757858008146286, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3053385615348816, |
| "step": 35 |
| }, |
| { |
| "completion_length": 584.4531555175781, |
| "epoch": 0.11556982343499198, |
| "grad_norm": 0.2138671875, |
| "kl": 0.00023655666518607177, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0, |
| "reward": 0.6754663735628128, |
| "reward_std": 0.33357472717761993, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3831486627459526, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2923177182674408, |
| "step": 36 |
| }, |
| { |
| "completion_length": 619.2396087646484, |
| "epoch": 0.1187800963081862, |
| "grad_norm": 0.1953125, |
| "kl": 0.0002523561015550513, |
| "learning_rate": 9.42302986163543e-07, |
| "loss": 0.0, |
| "reward": 0.6422896459698677, |
| "reward_std": 0.3401486799120903, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3831750750541687, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2591145858168602, |
| "step": 37 |
| }, |
| { |
| "completion_length": 632.1067962646484, |
| "epoch": 0.12199036918138041, |
| "grad_norm": 0.19140625, |
| "kl": 0.00025913729768944904, |
| "learning_rate": 9.370671165529144e-07, |
| "loss": 0.0, |
| "reward": 0.5953093469142914, |
| "reward_std": 0.33438971638679504, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3147103600203991, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2805989757180214, |
| "step": 38 |
| }, |
| { |
| "completion_length": 569.0026397705078, |
| "epoch": 0.12520064205457465, |
| "grad_norm": 0.208984375, |
| "kl": 0.0002631417410157155, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0, |
| "reward": 0.6718064844608307, |
| "reward_std": 0.3528323844075203, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3859991952776909, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2858073115348816, |
| "step": 39 |
| }, |
| { |
| "completion_length": 542.0260620117188, |
| "epoch": 0.12841091492776885, |
| "grad_norm": 0.2353515625, |
| "kl": 0.0002535913408792112, |
| "learning_rate": 9.259695151358214e-07, |
| "loss": 0.0, |
| "reward": 0.6311447024345398, |
| "reward_std": 0.3200613558292389, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3459884449839592, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2851562574505806, |
| "step": 40 |
| }, |
| { |
| "completion_length": 573.9167022705078, |
| "epoch": 0.13162118780096307, |
| "grad_norm": 0.1923828125, |
| "kl": 0.0002568592317402363, |
| "learning_rate": 9.20113792876298e-07, |
| "loss": 0.0, |
| "reward": 0.6579329371452332, |
| "reward_std": 0.33611204475164413, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3617089316248894, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2962239682674408, |
| "step": 41 |
| }, |
| { |
| "completion_length": 563.0416793823242, |
| "epoch": 0.1348314606741573, |
| "grad_norm": 0.205078125, |
| "kl": 0.00026875592448050156, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0, |
| "reward": 0.6627669483423233, |
| "reward_std": 0.3593166694045067, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3750064894556999, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2877604253590107, |
| "step": 42 |
| }, |
| { |
| "completion_length": 496.56251525878906, |
| "epoch": 0.13804173354735153, |
| "grad_norm": 0.2275390625, |
| "kl": 0.0002509369187464472, |
| "learning_rate": 9.078043584226815e-07, |
| "loss": 0.0, |
| "reward": 0.693062499165535, |
| "reward_std": 0.3470332473516464, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3753541484475136, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3177083432674408, |
| "step": 43 |
| }, |
| { |
| "completion_length": 589.6536712646484, |
| "epoch": 0.14125200642054575, |
| "grad_norm": 0.1884765625, |
| "kl": 0.0002775079774437472, |
| "learning_rate": 9.013573120044966e-07, |
| "loss": 0.0, |
| "reward": 0.5451524406671524, |
| "reward_std": 0.3420337289571762, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2665066123008728, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2786458432674408, |
| "step": 44 |
| }, |
| { |
| "completion_length": 515.5677261352539, |
| "epoch": 0.14446227929373998, |
| "grad_norm": 0.2431640625, |
| "kl": 0.00026737275038613006, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0, |
| "reward": 0.6689368337392807, |
| "reward_std": 0.3494330644607544, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3753170371055603, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2936197966337204, |
| "step": 45 |
| }, |
| { |
| "completion_length": 531.6041870117188, |
| "epoch": 0.1476725521669342, |
| "grad_norm": 0.2158203125, |
| "kl": 0.00027584553754422814, |
| "learning_rate": 8.878960148416747e-07, |
| "loss": 0.0, |
| "reward": 0.6247715353965759, |
| "reward_std": 0.3459451347589493, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3357090353965759, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2890625149011612, |
| "step": 46 |
| }, |
| { |
| "completion_length": 523.8359527587891, |
| "epoch": 0.1508828250401284, |
| "grad_norm": 0.2109375, |
| "kl": 0.0002594252800918184, |
| "learning_rate": 8.808890536269229e-07, |
| "loss": 0.0, |
| "reward": 0.6625895947217941, |
| "reward_std": 0.35964568704366684, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.36180833727121353, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3007812574505806, |
| "step": 47 |
| }, |
| { |
| "completion_length": 572.2395935058594, |
| "epoch": 0.15409309791332262, |
| "grad_norm": 0.2001953125, |
| "kl": 0.0002760118877631612, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0, |
| "reward": 0.6687695384025574, |
| "reward_std": 0.3379608243703842, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.36733726412057877, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.301432304084301, |
| "step": 48 |
| }, |
| { |
| "completion_length": 565.372428894043, |
| "epoch": 0.15730337078651685, |
| "grad_norm": 0.2109375, |
| "kl": 0.00026545282889856026, |
| "learning_rate": 8.663414758415478e-07, |
| "loss": 0.0, |
| "reward": 0.6460029184818268, |
| "reward_std": 0.33386022597551346, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3458726927638054, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3001302108168602, |
| "step": 49 |
| }, |
| { |
| "completion_length": 540.8411560058594, |
| "epoch": 0.16051364365971107, |
| "grad_norm": 0.2177734375, |
| "kl": 0.0002867219809559174, |
| "learning_rate": 8.588087370409302e-07, |
| "loss": 0.0, |
| "reward": 0.6432211250066757, |
| "reward_std": 0.35255035012960434, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3235596604645252, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3196614682674408, |
| "step": 50 |
| }, |
| { |
| "completion_length": 583.2864685058594, |
| "epoch": 0.1637239165329053, |
| "grad_norm": 0.220703125, |
| "kl": 0.0003001616059918888, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0, |
| "reward": 0.6296520233154297, |
| "reward_std": 0.3602987676858902, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3295218013226986, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3001302182674408, |
| "step": 51 |
| }, |
| { |
| "completion_length": 596.6302185058594, |
| "epoch": 0.16693418940609953, |
| "grad_norm": 0.2412109375, |
| "kl": 0.0002572698904259596, |
| "learning_rate": 8.432457529696548e-07, |
| "loss": 0.0, |
| "reward": 0.6288764774799347, |
| "reward_std": 0.3630865290760994, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3443712741136551, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2845052257180214, |
| "step": 52 |
| }, |
| { |
| "completion_length": 476.7343978881836, |
| "epoch": 0.17014446227929375, |
| "grad_norm": 0.23828125, |
| "kl": 0.0003045099292648956, |
| "learning_rate": 8.352239353294194e-07, |
| "loss": 0.0, |
| "reward": 0.6977786123752594, |
| "reward_std": 0.36942026019096375, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3748619332909584, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.322916679084301, |
| "step": 53 |
| }, |
| { |
| "completion_length": 594.0599060058594, |
| "epoch": 0.17335473515248795, |
| "grad_norm": 0.1982421875, |
| "kl": 0.0002901406696764752, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0, |
| "reward": 0.614113561809063, |
| "reward_std": 0.30325619876384735, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.30291564762592316, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3111979216337204, |
| "step": 54 |
| }, |
| { |
| "completion_length": 540.8698120117188, |
| "epoch": 0.17656500802568217, |
| "grad_norm": 0.2177734375, |
| "kl": 0.0002815077095874585, |
| "learning_rate": 8.187213662662538e-07, |
| "loss": 0.0, |
| "reward": 0.7013998925685883, |
| "reward_std": 0.345312163233757, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39671240001916885, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3046875074505806, |
| "step": 55 |
| }, |
| { |
| "completion_length": 551.1432342529297, |
| "epoch": 0.1797752808988764, |
| "grad_norm": 0.208984375, |
| "kl": 0.0003022913369932212, |
| "learning_rate": 8.102495512755938e-07, |
| "loss": 0.0, |
| "reward": 0.6621358841657639, |
| "reward_std": 0.3478364497423172, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35614627599716187, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3059896007180214, |
| "step": 56 |
| }, |
| { |
| "completion_length": 541.5338668823242, |
| "epoch": 0.18298555377207062, |
| "grad_norm": 0.251953125, |
| "kl": 0.00029883202660130337, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0, |
| "reward": 0.6321840733289719, |
| "reward_std": 0.3268617168068886, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3522360995411873, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2799479216337204, |
| "step": 57 |
| }, |
| { |
| "completion_length": 534.2135620117188, |
| "epoch": 0.18619582664526485, |
| "grad_norm": 0.2177734375, |
| "kl": 0.00031317536922870204, |
| "learning_rate": 7.928877960781808e-07, |
| "loss": 0.0, |
| "reward": 0.6300312578678131, |
| "reward_std": 0.31237364560365677, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3422708138823509, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2877604216337204, |
| "step": 58 |
| }, |
| { |
| "completion_length": 569.7265930175781, |
| "epoch": 0.18940609951845908, |
| "grad_norm": 0.1943359375, |
| "kl": 0.0002944675215985626, |
| "learning_rate": 7.840072575681468e-07, |
| "loss": 0.0, |
| "reward": 0.6045078411698341, |
| "reward_std": 0.33760548382997513, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.30958598107099533, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2949218824505806, |
| "step": 59 |
| }, |
| { |
| "completion_length": 546.7135467529297, |
| "epoch": 0.1926163723916533, |
| "grad_norm": 0.244140625, |
| "kl": 0.0003155921949655749, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0, |
| "reward": 0.6555080115795135, |
| "reward_std": 0.32254888117313385, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.33779964968562126, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3177083432674408, |
| "step": 60 |
| }, |
| { |
| "completion_length": 544.5573120117188, |
| "epoch": 0.1958266452648475, |
| "grad_norm": 0.193359375, |
| "kl": 0.00029893887403886765, |
| "learning_rate": 7.658709009626109e-07, |
| "loss": 0.0, |
| "reward": 0.6744174212217331, |
| "reward_std": 0.33529237657785416, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3684278205037117, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3059895932674408, |
| "step": 61 |
| }, |
| { |
| "completion_length": 499.6224136352539, |
| "epoch": 0.19903691813804172, |
| "grad_norm": 0.2353515625, |
| "kl": 0.00032137856032932177, |
| "learning_rate": 7.566249040241553e-07, |
| "loss": 0.0, |
| "reward": 0.6523666083812714, |
| "reward_std": 0.32566210627555847, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.34898117184638977, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3033854216337204, |
| "step": 62 |
| }, |
| { |
| "completion_length": 578.7239837646484, |
| "epoch": 0.20224719101123595, |
| "grad_norm": 0.20703125, |
| "kl": 0.0003287481522420421, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0, |
| "reward": 0.634161502122879, |
| "reward_std": 0.34120889008045197, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3372865132987499, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2968750074505806, |
| "step": 63 |
| }, |
| { |
| "completion_length": 506.3177261352539, |
| "epoch": 0.20545746388443017, |
| "grad_norm": 0.2412109375, |
| "kl": 0.0003212923475075513, |
| "learning_rate": 7.37802304516818e-07, |
| "loss": 0.0, |
| "reward": 0.6933595240116119, |
| "reward_std": 0.3754495605826378, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.38085950165987015, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3125000074505806, |
| "step": 64 |
| }, |
| { |
| "completion_length": 581.5833587646484, |
| "epoch": 0.2086677367576244, |
| "grad_norm": 0.181640625, |
| "kl": 0.00029418900521704927, |
| "learning_rate": 7.282358947176205e-07, |
| "loss": 0.0, |
| "reward": 0.6189248859882355, |
| "reward_std": 0.33084874600172043, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3142373785376549, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3046875149011612, |
| "step": 65 |
| }, |
| { |
| "completion_length": 534.5729446411133, |
| "epoch": 0.21187800963081863, |
| "grad_norm": 0.224609375, |
| "kl": 0.00033117266866611317, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0, |
| "reward": 0.6608574390411377, |
| "reward_std": 0.31472062319517136, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3600761741399765, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3007812574505806, |
| "step": 66 |
| }, |
| { |
| "completion_length": 517.5755386352539, |
| "epoch": 0.21508828250401285, |
| "grad_norm": 0.23828125, |
| "kl": 0.00034513785067247227, |
| "learning_rate": 7.08818754121241e-07, |
| "loss": 0.0, |
| "reward": 0.6840898096561432, |
| "reward_std": 0.3518378511071205, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3644283339381218, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3196614682674408, |
| "step": 67 |
| }, |
| { |
| "completion_length": 534.2578353881836, |
| "epoch": 0.21829855537720708, |
| "grad_norm": 0.20703125, |
| "kl": 0.00032389759144280106, |
| "learning_rate": 6.989785380482312e-07, |
| "loss": 0.0, |
| "reward": 0.7169905304908752, |
| "reward_std": 0.3356803208589554, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39472493529319763, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3222656324505806, |
| "step": 68 |
| }, |
| { |
| "completion_length": 593.1797027587891, |
| "epoch": 0.22150882825040127, |
| "grad_norm": 0.205078125, |
| "kl": 0.00034336688258918, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0, |
| "reward": 0.6631477773189545, |
| "reward_std": 0.37854011356830597, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.34023110568523407, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.322916679084301, |
| "step": 69 |
| }, |
| { |
| "completion_length": 629.6015625, |
| "epoch": 0.2247191011235955, |
| "grad_norm": 0.185546875, |
| "kl": 0.00032993722561514005, |
| "learning_rate": 6.790614547199906e-07, |
| "loss": 0.0, |
| "reward": 0.5925078019499779, |
| "reward_std": 0.3088828846812248, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3125598691403866, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.279947929084301, |
| "step": 70 |
| }, |
| { |
| "completion_length": 559.5963897705078, |
| "epoch": 0.22792937399678972, |
| "grad_norm": 0.2255859375, |
| "kl": 0.0003137872990919277, |
| "learning_rate": 6.68995372916741e-07, |
| "loss": 0.0, |
| "reward": 0.7026459574699402, |
| "reward_std": 0.33306000381708145, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.392750084400177, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3098958432674408, |
| "step": 71 |
| }, |
| { |
| "completion_length": 494.39845275878906, |
| "epoch": 0.23113964686998395, |
| "grad_norm": 0.2275390625, |
| "kl": 0.0003587143437471241, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0, |
| "reward": 0.6391649395227432, |
| "reward_std": 0.3157573267817497, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.32080554217100143, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3183593824505806, |
| "step": 72 |
| }, |
| { |
| "completion_length": 583.8672027587891, |
| "epoch": 0.23434991974317818, |
| "grad_norm": 0.1875, |
| "kl": 0.0002944624357041903, |
| "learning_rate": 6.486753808845564e-07, |
| "loss": 0.0, |
| "reward": 0.6462114006280899, |
| "reward_std": 0.33720824867486954, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3584509789943695, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.287760429084301, |
| "step": 73 |
| }, |
| { |
| "completion_length": 509.7161560058594, |
| "epoch": 0.2375601926163724, |
| "grad_norm": 0.2333984375, |
| "kl": 0.00037064859498059377, |
| "learning_rate": 6.384324742897735e-07, |
| "loss": 0.0, |
| "reward": 0.6612931340932846, |
| "reward_std": 0.3572119027376175, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3448868505656719, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3164062649011612, |
| "step": 74 |
| }, |
| { |
| "completion_length": 549.5651245117188, |
| "epoch": 0.24077046548956663, |
| "grad_norm": 0.197265625, |
| "kl": 0.00032304248452419415, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0, |
| "reward": 0.6797159165143967, |
| "reward_std": 0.34857943654060364, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.36135654896497726, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3183593824505806, |
| "step": 75 |
| }, |
| { |
| "completion_length": 572.7578353881836, |
| "epoch": 0.24398073836276082, |
| "grad_norm": 0.2119140625, |
| "kl": 0.0003249031215091236, |
| "learning_rate": 6.178085705122674e-07, |
| "loss": 0.0, |
| "reward": 0.6995292603969574, |
| "reward_std": 0.3806586042046547, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3727063462138176, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3268229216337204, |
| "step": 76 |
| }, |
| { |
| "completion_length": 507.5078353881836, |
| "epoch": 0.24719101123595505, |
| "grad_norm": 0.255859375, |
| "kl": 0.0003559839096851647, |
| "learning_rate": 6.074387415372676e-07, |
| "loss": 0.0, |
| "reward": 0.7540216147899628, |
| "reward_std": 0.38066261261701584, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.43045392632484436, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3235677182674408, |
| "step": 77 |
| }, |
| { |
| "completion_length": 618.5390930175781, |
| "epoch": 0.2504012841091493, |
| "grad_norm": 0.220703125, |
| "kl": 0.0003840129793388769, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0, |
| "reward": 0.5318701416254044, |
| "reward_std": 0.35173140466213226, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.26559409499168396, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2662760466337204, |
| "step": 78 |
| }, |
| { |
| "completion_length": 514.5781555175781, |
| "epoch": 0.2536115569823435, |
| "grad_norm": 0.234375, |
| "kl": 0.00037649404839612544, |
| "learning_rate": 5.866114036005362e-07, |
| "loss": 0.0, |
| "reward": 0.677052691578865, |
| "reward_std": 0.36026471108198166, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35348496586084366, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3235677257180214, |
| "step": 79 |
| }, |
| { |
| "completion_length": 546.5338745117188, |
| "epoch": 0.2568218298555377, |
| "grad_norm": 0.19921875, |
| "kl": 0.0003384711453691125, |
| "learning_rate": 5.761651730097142e-07, |
| "loss": 0.0, |
| "reward": 0.6351290941238403, |
| "reward_std": 0.34162163734436035, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.34281135350465775, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2923177257180214, |
| "step": 80 |
| }, |
| { |
| "completion_length": 545.1432495117188, |
| "epoch": 0.26003210272873195, |
| "grad_norm": 0.2001953125, |
| "kl": 0.0003302170734968968, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0, |
| "reward": 0.7321957647800446, |
| "reward_std": 0.3832404538989067, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.42946138232946396, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3027343824505806, |
| "step": 81 |
| }, |
| { |
| "completion_length": 602.4036712646484, |
| "epoch": 0.26324237560192615, |
| "grad_norm": 0.169921875, |
| "kl": 0.0003239936995669268, |
| "learning_rate": 5.552358696106288e-07, |
| "loss": 0.0, |
| "reward": 0.6142081022262573, |
| "reward_std": 0.33728527277708054, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.31277579814195633, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.301432304084301, |
| "step": 82 |
| }, |
| { |
| "completion_length": 562.7057495117188, |
| "epoch": 0.2664526484751204, |
| "grad_norm": 0.240234375, |
| "kl": 0.00037012308894190937, |
| "learning_rate": 5.447641303893714e-07, |
| "loss": 0.0, |
| "reward": 0.6191717982292175, |
| "reward_std": 0.3545895963907242, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.31578636169433594, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.303385429084301, |
| "step": 83 |
| }, |
| { |
| "completion_length": 543.9427261352539, |
| "epoch": 0.2696629213483146, |
| "grad_norm": 0.2314453125, |
| "kl": 0.00037831455847481266, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0, |
| "reward": 0.7242841571569443, |
| "reward_std": 0.3670550063252449, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.4020185172557831, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3222656324505806, |
| "step": 84 |
| }, |
| { |
| "completion_length": 558.5781555175781, |
| "epoch": 0.27287319422150885, |
| "grad_norm": 0.22265625, |
| "kl": 0.00036308395647211, |
| "learning_rate": 5.238348269902859e-07, |
| "loss": 0.0, |
| "reward": 0.6587125062942505, |
| "reward_std": 0.36182229965925217, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3572801947593689, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.301432304084301, |
| "step": 85 |
| }, |
| { |
| "completion_length": 581.3099060058594, |
| "epoch": 0.27608346709470305, |
| "grad_norm": 0.23046875, |
| "kl": 0.00038044428947614506, |
| "learning_rate": 5.133885963994639e-07, |
| "loss": 0.0, |
| "reward": 0.6719960719347, |
| "reward_std": 0.3624914661049843, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3588450253009796, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.313151054084301, |
| "step": 86 |
| }, |
| { |
| "completion_length": 580.1745147705078, |
| "epoch": 0.27929373996789725, |
| "grad_norm": 0.1767578125, |
| "kl": 0.00034601552761159837, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0, |
| "reward": 0.6323724538087845, |
| "reward_std": 0.32785172015428543, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3433099538087845, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2890625074505806, |
| "step": 87 |
| }, |
| { |
| "completion_length": 530.0156326293945, |
| "epoch": 0.2825040128410915, |
| "grad_norm": 0.220703125, |
| "kl": 0.00040156069735530764, |
| "learning_rate": 4.925612584627324e-07, |
| "loss": 0.0, |
| "reward": 0.7260984629392624, |
| "reward_std": 0.38204891979694366, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3940671756863594, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.33203125, |
| "step": 88 |
| }, |
| { |
| "completion_length": 548.8020935058594, |
| "epoch": 0.2857142857142857, |
| "grad_norm": 0.248046875, |
| "kl": 0.0004189757601125166, |
| "learning_rate": 4.821914294877326e-07, |
| "loss": 0.0, |
| "reward": 0.6541236937046051, |
| "reward_std": 0.344281330704689, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3533423990011215, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3007812574505806, |
| "step": 89 |
| }, |
| { |
| "completion_length": 609.1432342529297, |
| "epoch": 0.28892455858747995, |
| "grad_norm": 0.1982421875, |
| "kl": 0.0003810434936895035, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0, |
| "reward": 0.5851198732852936, |
| "reward_std": 0.32441411167383194, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2947552725672722, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2903645858168602, |
| "step": 90 |
| }, |
| { |
| "completion_length": 574.8021087646484, |
| "epoch": 0.29213483146067415, |
| "grad_norm": 0.2255859375, |
| "kl": 0.0003523045379552059, |
| "learning_rate": 4.6156752571022637e-07, |
| "loss": 0.0, |
| "reward": 0.6396794319152832, |
| "reward_std": 0.33973030745983124, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3525700494647026, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.287109375, |
| "step": 91 |
| }, |
| { |
| "completion_length": 551.4505462646484, |
| "epoch": 0.2953451043338684, |
| "grad_norm": 0.1845703125, |
| "kl": 0.00035572806518757716, |
| "learning_rate": 4.513246191154434e-07, |
| "loss": 0.0, |
| "reward": 0.6876581907272339, |
| "reward_std": 0.3704243451356888, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3888300210237503, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2988281399011612, |
| "step": 92 |
| }, |
| { |
| "completion_length": 560.6718902587891, |
| "epoch": 0.2985553772070626, |
| "grad_norm": 0.2001953125, |
| "kl": 0.0003872549714287743, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0, |
| "reward": 0.6538409739732742, |
| "reward_std": 0.35449104756116867, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35175760090351105, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3020833432674408, |
| "step": 93 |
| }, |
| { |
| "completion_length": 607.7396087646484, |
| "epoch": 0.3017656500802568, |
| "grad_norm": 0.263671875, |
| "kl": 0.0003801950879278593, |
| "learning_rate": 4.3100462708325914e-07, |
| "loss": 0.0, |
| "reward": 0.5898270905017853, |
| "reward_std": 0.3407137244939804, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3066239655017853, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2832031287252903, |
| "step": 94 |
| }, |
| { |
| "completion_length": 496.19793701171875, |
| "epoch": 0.30497592295345105, |
| "grad_norm": 0.240234375, |
| "kl": 0.0003671470913104713, |
| "learning_rate": 4.209385452800095e-07, |
| "loss": 0.0, |
| "reward": 0.7184917479753494, |
| "reward_std": 0.3648832216858864, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.38646050542593, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3320312574505806, |
| "step": 95 |
| }, |
| { |
| "completion_length": 508.57554626464844, |
| "epoch": 0.30818619582664525, |
| "grad_norm": 0.2265625, |
| "kl": 0.00038119566306704655, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0, |
| "reward": 0.6568552851676941, |
| "reward_std": 0.3511122092604637, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3293813392519951, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3274739682674408, |
| "step": 96 |
| }, |
| { |
| "completion_length": 549.0312652587891, |
| "epoch": 0.3113964686998395, |
| "grad_norm": 0.21484375, |
| "kl": 0.0003632343214121647, |
| "learning_rate": 4.0102146195176887e-07, |
| "loss": 0.0, |
| "reward": 0.7204606682062149, |
| "reward_std": 0.3499609977006912, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.4001481980085373, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3203125074505806, |
| "step": 97 |
| }, |
| { |
| "completion_length": 490.3802261352539, |
| "epoch": 0.3146067415730337, |
| "grad_norm": 0.2255859375, |
| "kl": 0.00044602488924283534, |
| "learning_rate": 3.911812458787591e-07, |
| "loss": 0.0, |
| "reward": 0.6808420717716217, |
| "reward_std": 0.35114526003599167, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3728993684053421, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3079427182674408, |
| "step": 98 |
| }, |
| { |
| "completion_length": 524.2682342529297, |
| "epoch": 0.31781701444622795, |
| "grad_norm": 0.20703125, |
| "kl": 0.0003882949022226967, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0, |
| "reward": 0.6448683142662048, |
| "reward_std": 0.3429142013192177, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.33367037773132324, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.311197929084301, |
| "step": 99 |
| }, |
| { |
| "completion_length": 557.1823043823242, |
| "epoch": 0.32102728731942215, |
| "grad_norm": 0.1884765625, |
| "kl": 0.00035858208866557106, |
| "learning_rate": 3.7176410528237945e-07, |
| "loss": 0.0, |
| "reward": 0.6761815696954727, |
| "reward_std": 0.3675583600997925, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35326486080884933, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.322916679084301, |
| "step": 100 |
| }, |
| { |
| "completion_length": 564.6927337646484, |
| "epoch": 0.32423756019261635, |
| "grad_norm": 0.22265625, |
| "kl": 0.00038343547930708155, |
| "learning_rate": 3.62197695483182e-07, |
| "loss": 0.0, |
| "reward": 0.6524051502346992, |
| "reward_std": 0.36947014927864075, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.34055614471435547, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3118489682674408, |
| "step": 101 |
| }, |
| { |
| "completion_length": 551.9453353881836, |
| "epoch": 0.3274478330658106, |
| "grad_norm": 0.2294921875, |
| "kl": 0.0003793273790506646, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0, |
| "reward": 0.6944572031497955, |
| "reward_std": 0.37888605892658234, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.37870199978351593, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3157552182674408, |
| "step": 102 |
| }, |
| { |
| "completion_length": 530.1771011352539, |
| "epoch": 0.3306581059390048, |
| "grad_norm": 0.22265625, |
| "kl": 0.00038907503767404705, |
| "learning_rate": 3.433750959758446e-07, |
| "loss": 0.0, |
| "reward": 0.6862371563911438, |
| "reward_std": 0.3600939214229584, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3555079624056816, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3307291716337204, |
| "step": 103 |
| }, |
| { |
| "completion_length": 504.72398376464844, |
| "epoch": 0.33386837881219905, |
| "grad_norm": 0.2265625, |
| "kl": 0.0004411861809785478, |
| "learning_rate": 3.3412909903738936e-07, |
| "loss": 0.0, |
| "reward": 0.7003691345453262, |
| "reward_std": 0.34579480439424515, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.38917120546102524, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.311197929084301, |
| "step": 104 |
| }, |
| { |
| "completion_length": 579.5859527587891, |
| "epoch": 0.33707865168539325, |
| "grad_norm": 0.2060546875, |
| "kl": 0.0003610364656196907, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0, |
| "reward": 0.7041359394788742, |
| "reward_std": 0.3546976149082184, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3883807212114334, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3157552182674408, |
| "step": 105 |
| }, |
| { |
| "completion_length": 504.9349136352539, |
| "epoch": 0.3402889245585875, |
| "grad_norm": 0.2294921875, |
| "kl": 0.0004345797060523182, |
| "learning_rate": 3.159927424318531e-07, |
| "loss": 0.0, |
| "reward": 0.7195965945720673, |
| "reward_std": 0.34991642087697983, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39863305538892746, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3209635466337204, |
| "step": 106 |
| }, |
| { |
| "completion_length": 521.9349060058594, |
| "epoch": 0.3434991974317817, |
| "grad_norm": 0.2333984375, |
| "kl": 0.0004348123256932013, |
| "learning_rate": 3.0711220392181934e-07, |
| "loss": 0.0, |
| "reward": 0.5767635926604271, |
| "reward_std": 0.3495699018239975, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2896541878581047, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2871093824505806, |
| "step": 107 |
| }, |
| { |
| "completion_length": 578.0781402587891, |
| "epoch": 0.3467094703049759, |
| "grad_norm": 0.26171875, |
| "kl": 0.0003971747573814355, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0, |
| "reward": 0.5868955999612808, |
| "reward_std": 0.3408031612634659, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.29587996006011963, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2910156324505806, |
| "step": 108 |
| }, |
| { |
| "completion_length": 558.0286712646484, |
| "epoch": 0.34991974317817015, |
| "grad_norm": 0.1845703125, |
| "kl": 0.00036870845360681415, |
| "learning_rate": 2.897504487244061e-07, |
| "loss": 0.0, |
| "reward": 0.6787082105875015, |
| "reward_std": 0.3448420986533165, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3570936322212219, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3216145932674408, |
| "step": 109 |
| }, |
| { |
| "completion_length": 547.6562652587891, |
| "epoch": 0.35313001605136435, |
| "grad_norm": 0.2158203125, |
| "kl": 0.00039495840610470623, |
| "learning_rate": 2.812786337337463e-07, |
| "loss": 0.0, |
| "reward": 0.5997674912214279, |
| "reward_std": 0.32131277769804, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3054966703057289, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2942708432674408, |
| "step": 110 |
| }, |
| { |
| "completion_length": 552.3463745117188, |
| "epoch": 0.3563402889245586, |
| "grad_norm": 0.2578125, |
| "kl": 0.00039361264498438686, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0, |
| "reward": 0.5880802720785141, |
| "reward_std": 0.34414373338222504, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2827417254447937, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.305338554084301, |
| "step": 111 |
| }, |
| { |
| "completion_length": 542.1927185058594, |
| "epoch": 0.3595505617977528, |
| "grad_norm": 0.1904296875, |
| "kl": 0.0003947726945625618, |
| "learning_rate": 2.6477606467058035e-07, |
| "loss": 0.0, |
| "reward": 0.6639807671308517, |
| "reward_std": 0.3379776254296303, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.32934536039829254, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.334635429084301, |
| "step": 112 |
| }, |
| { |
| "completion_length": 550.3698043823242, |
| "epoch": 0.36276083467094705, |
| "grad_norm": 0.2119140625, |
| "kl": 0.00041885858081514016, |
| "learning_rate": 2.567542470303452e-07, |
| "loss": 0.0, |
| "reward": 0.6352178752422333, |
| "reward_std": 0.3331167697906494, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3402960002422333, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2949218824505806, |
| "step": 113 |
| }, |
| { |
| "completion_length": 520.3073120117188, |
| "epoch": 0.36597110754414125, |
| "grad_norm": 0.2158203125, |
| "kl": 0.00037509016692638397, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0, |
| "reward": 0.6496723890304565, |
| "reward_std": 0.36061549186706543, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.33131300657987595, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3183593824505806, |
| "step": 114 |
| }, |
| { |
| "completion_length": 545.2890701293945, |
| "epoch": 0.36918138041733545, |
| "grad_norm": 0.2412109375, |
| "kl": 0.00040495285793440416, |
| "learning_rate": 2.411912629590699e-07, |
| "loss": 0.0, |
| "reward": 0.6173844560980797, |
| "reward_std": 0.3021947294473648, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3328792154788971, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2845052108168602, |
| "step": 115 |
| }, |
| { |
| "completion_length": 597.0130462646484, |
| "epoch": 0.3723916532905297, |
| "grad_norm": 0.205078125, |
| "kl": 0.0003835263050859794, |
| "learning_rate": 2.336585241584522e-07, |
| "loss": 0.0, |
| "reward": 0.6083859652280807, |
| "reward_std": 0.34971795231103897, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.31671928614377975, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2916666716337204, |
| "step": 116 |
| }, |
| { |
| "completion_length": 505.15106201171875, |
| "epoch": 0.3756019261637239, |
| "grad_norm": 0.259765625, |
| "kl": 0.0004204789365758188, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0, |
| "reward": 0.6160649359226227, |
| "reward_std": 0.3238491714000702, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3263513892889023, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2897135466337204, |
| "step": 117 |
| }, |
| { |
| "completion_length": 567.8411712646484, |
| "epoch": 0.37881219903691815, |
| "grad_norm": 0.197265625, |
| "kl": 0.0003820292549789883, |
| "learning_rate": 2.1911094637307714e-07, |
| "loss": 0.0, |
| "reward": 0.5847776532173157, |
| "reward_std": 0.33124052733182907, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.2970172315835953, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2877604216337204, |
| "step": 118 |
| }, |
| { |
| "completion_length": 539.3724136352539, |
| "epoch": 0.38202247191011235, |
| "grad_norm": 0.212890625, |
| "kl": 0.0003783565916819498, |
| "learning_rate": 2.1210398515832536e-07, |
| "loss": 0.0, |
| "reward": 0.7074552923440933, |
| "reward_std": 0.33786971867084503, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3910490423440933, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3164062574505806, |
| "step": 119 |
| }, |
| { |
| "completion_length": 578.6484527587891, |
| "epoch": 0.3852327447833066, |
| "grad_norm": 0.208984375, |
| "kl": 0.00036553355312207714, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0, |
| "reward": 0.6493179947137833, |
| "reward_std": 0.35857032984495163, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35374507308006287, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.295572929084301, |
| "step": 120 |
| }, |
| { |
| "completion_length": 516.8437576293945, |
| "epoch": 0.3884430176565008, |
| "grad_norm": 0.2373046875, |
| "kl": 0.00045376412163022906, |
| "learning_rate": 1.986426879955034e-07, |
| "loss": 0.0, |
| "reward": 0.684567391872406, |
| "reward_std": 0.3590117618441582, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3818329870700836, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3027343824505806, |
| "step": 121 |
| }, |
| { |
| "completion_length": 551.9687652587891, |
| "epoch": 0.391653290529695, |
| "grad_norm": 0.193359375, |
| "kl": 0.0003975575600634329, |
| "learning_rate": 1.9219564157731844e-07, |
| "loss": 0.0, |
| "reward": 0.6631377786397934, |
| "reward_std": 0.377517007291317, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3408721387386322, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3222656399011612, |
| "step": 122 |
| }, |
| { |
| "completion_length": 525.5026245117188, |
| "epoch": 0.39486356340288925, |
| "grad_norm": 0.21875, |
| "kl": 0.00042099927668459713, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0, |
| "reward": 0.7239128798246384, |
| "reward_std": 0.35999199748039246, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.40815767645835876, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3157552182674408, |
| "step": 123 |
| }, |
| { |
| "completion_length": 523.8567886352539, |
| "epoch": 0.39807383627608345, |
| "grad_norm": 0.318359375, |
| "kl": 0.00044889742275699973, |
| "learning_rate": 1.7988620712370195e-07, |
| "loss": 0.0, |
| "reward": 0.716105192899704, |
| "reward_std": 0.345996156334877, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.4315999895334244, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2845052182674408, |
| "step": 124 |
| }, |
| { |
| "completion_length": 522.7838668823242, |
| "epoch": 0.4012841091492777, |
| "grad_norm": 0.255859375, |
| "kl": 0.00039373226172756404, |
| "learning_rate": 1.7403048486417868e-07, |
| "loss": 0.0, |
| "reward": 0.6855793744325638, |
| "reward_std": 0.3608446344733238, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3704752177000046, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.315104179084301, |
| "step": 125 |
| }, |
| { |
| "completion_length": 557.4557495117188, |
| "epoch": 0.4044943820224719, |
| "grad_norm": 0.2060546875, |
| "kl": 0.00039951602957444265, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": 0.0, |
| "reward": 0.5974871069192886, |
| "reward_std": 0.3423160910606384, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.27131520584225655, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3261718824505806, |
| "step": 126 |
| }, |
| { |
| "completion_length": 578.3697967529297, |
| "epoch": 0.40770465489566615, |
| "grad_norm": 0.2041015625, |
| "kl": 0.00037851801607757807, |
| "learning_rate": 1.6293288344708566e-07, |
| "loss": 0.0, |
| "reward": 0.633305624127388, |
| "reward_std": 0.372529074549675, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3253629058599472, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3079427108168602, |
| "step": 127 |
| }, |
| { |
| "completion_length": 535.1666870117188, |
| "epoch": 0.41091492776886035, |
| "grad_norm": 0.21484375, |
| "kl": 0.0003694754414027557, |
| "learning_rate": 1.5769701383645698e-07, |
| "loss": 0.0, |
| "reward": 0.6848493814468384, |
| "reward_std": 0.344666950404644, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3814639300107956, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3033854216337204, |
| "step": 128 |
| }, |
| { |
| "completion_length": 513.1224060058594, |
| "epoch": 0.41412520064205455, |
| "grad_norm": 0.19921875, |
| "kl": 0.0003918400325346738, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0, |
| "reward": 0.6448424756526947, |
| "reward_std": 0.3401818424463272, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3212747722864151, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3235677257180214, |
| "step": 129 |
| }, |
| { |
| "completion_length": 541.6172027587891, |
| "epoch": 0.4173354735152488, |
| "grad_norm": 0.2060546875, |
| "kl": 0.0003936137800337747, |
| "learning_rate": 1.4786531185446452e-07, |
| "loss": 0.0, |
| "reward": 0.583847850561142, |
| "reward_std": 0.33960337191820145, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.27004576474428177, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3138020858168602, |
| "step": 130 |
| }, |
| { |
| "completion_length": 533.5599136352539, |
| "epoch": 0.420545746388443, |
| "grad_norm": 0.2197265625, |
| "kl": 0.00039682938950136304, |
| "learning_rate": 1.432748035231658e-07, |
| "loss": 0.0, |
| "reward": 0.6769755631685257, |
| "reward_std": 0.3399392068386078, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3683818504214287, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3085937649011612, |
| "step": 131 |
| }, |
| { |
| "completion_length": 523.9271087646484, |
| "epoch": 0.42375601926163725, |
| "grad_norm": 0.2265625, |
| "kl": 0.00040404664468951523, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0, |
| "reward": 0.6483045816421509, |
| "reward_std": 0.32681532204151154, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3390597552061081, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.309244804084301, |
| "step": 132 |
| }, |
| { |
| "completion_length": 471.9661636352539, |
| "epoch": 0.42696629213483145, |
| "grad_norm": 0.24609375, |
| "kl": 0.00040609255665913224, |
| "learning_rate": 1.3475690004005097e-07, |
| "loss": 0.0, |
| "reward": 0.7119551748037338, |
| "reward_std": 0.34096624702215195, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39619994908571243, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3157552182674408, |
| "step": 133 |
| }, |
| { |
| "completion_length": 565.4192962646484, |
| "epoch": 0.4301765650080257, |
| "grad_norm": 0.205078125, |
| "kl": 0.00037678072112612426, |
| "learning_rate": 1.308341174832359e-07, |
| "loss": 0.0, |
| "reward": 0.6749380528926849, |
| "reward_std": 0.3803337290883064, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.37480782717466354, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3001302182674408, |
| "step": 134 |
| }, |
| { |
| "completion_length": 496.27345275878906, |
| "epoch": 0.4333868378812199, |
| "grad_norm": 0.27734375, |
| "kl": 0.0004564332193695009, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0, |
| "reward": 0.7301954329013824, |
| "reward_std": 0.3306322991847992, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.41378918290138245, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3164062649011612, |
| "step": 135 |
| }, |
| { |
| "completion_length": 530.8125228881836, |
| "epoch": 0.43659711075441415, |
| "grad_norm": 0.205078125, |
| "kl": 0.0003717996005434543, |
| "learning_rate": 1.2367151086855187e-07, |
| "loss": 0.0, |
| "reward": 0.6495877057313919, |
| "reward_std": 0.3487004414200783, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3253689482808113, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3242187649011612, |
| "step": 136 |
| }, |
| { |
| "completion_length": 554.2031402587891, |
| "epoch": 0.43980738362760835, |
| "grad_norm": 0.2109375, |
| "kl": 0.0003636257752077654, |
| "learning_rate": 1.2043556548852063e-07, |
| "loss": 0.0, |
| "reward": 0.5865623354911804, |
| "reward_std": 0.30131980776786804, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3040102533996105, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2825520932674408, |
| "step": 137 |
| }, |
| { |
| "completion_length": 545.6093826293945, |
| "epoch": 0.44301765650080255, |
| "grad_norm": 0.22265625, |
| "kl": 0.00041512529423926026, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0, |
| "reward": 0.6579451262950897, |
| "reward_std": 0.3593253716826439, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.36367426812648773, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2942708432674408, |
| "step": 138 |
| }, |
| { |
| "completion_length": 573.9088745117188, |
| "epoch": 0.4462279293739968, |
| "grad_norm": 0.1982421875, |
| "kl": 0.0003498100923025049, |
| "learning_rate": 1.1466315124171128e-07, |
| "loss": 0.0, |
| "reward": 0.6012589037418365, |
| "reward_std": 0.34214527904987335, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.31414950639009476, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2871093899011612, |
| "step": 139 |
| }, |
| { |
| "completion_length": 546.6432495117188, |
| "epoch": 0.449438202247191, |
| "grad_norm": 0.21875, |
| "kl": 0.0004052919539390132, |
| "learning_rate": 1.1212980823907929e-07, |
| "loss": 0.0, |
| "reward": 0.63412706553936, |
| "reward_std": 0.36361514031887054, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.33920522779226303, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2949218899011612, |
| "step": 140 |
| }, |
| { |
| "completion_length": 547.6432495117188, |
| "epoch": 0.45264847512038525, |
| "grad_norm": 0.2001953125, |
| "kl": 0.0003856433249893598, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0, |
| "reward": 0.7091409862041473, |
| "reward_std": 0.3494722992181778, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.37906285375356674, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3300781324505806, |
| "step": 141 |
| }, |
| { |
| "completion_length": 562.7890777587891, |
| "epoch": 0.45585874799357945, |
| "grad_norm": 0.28515625, |
| "kl": 0.0004331854870542884, |
| "learning_rate": 1.0777570898211405e-07, |
| "loss": 0.0, |
| "reward": 0.677094116806984, |
| "reward_std": 0.36977435648441315, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.35808368027210236, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.319010429084301, |
| "step": 142 |
| }, |
| { |
| "completion_length": 514.6145858764648, |
| "epoch": 0.4590690208667737, |
| "grad_norm": 0.26171875, |
| "kl": 0.0004561090827337466, |
| "learning_rate": 1.0595731054933934e-07, |
| "loss": 0.0, |
| "reward": 0.7047944366931915, |
| "reward_std": 0.3853035420179367, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.39815381169319153, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3066406399011612, |
| "step": 143 |
| }, |
| { |
| "completion_length": 515.0520858764648, |
| "epoch": 0.4622792937399679, |
| "grad_norm": 0.228515625, |
| "kl": 0.00042895031219813973, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0, |
| "reward": 0.687195435166359, |
| "reward_std": 0.39286451041698456, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.37925272434949875, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3079427257180214, |
| "step": 144 |
| }, |
| { |
| "completion_length": 532.7969055175781, |
| "epoch": 0.4654895666131621, |
| "grad_norm": 0.212890625, |
| "kl": 0.00040866951167117804, |
| "learning_rate": 1.0304273901612565e-07, |
| "loss": 0.0, |
| "reward": 0.7079404592514038, |
| "reward_std": 0.3612729534506798, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3934873268008232, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3144531324505806, |
| "step": 145 |
| }, |
| { |
| "completion_length": 552.0989837646484, |
| "epoch": 0.46869983948635635, |
| "grad_norm": 0.2158203125, |
| "kl": 0.0003939080925192684, |
| "learning_rate": 1.0194814420758804e-07, |
| "loss": 0.0, |
| "reward": 0.6515837609767914, |
| "reward_std": 0.3383214473724365, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3384326733648777, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3131510466337204, |
| "step": 146 |
| }, |
| { |
| "completion_length": 595.5234527587891, |
| "epoch": 0.47191011235955055, |
| "grad_norm": 0.2099609375, |
| "kl": 0.0003522088081808761, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0, |
| "reward": 0.5905841588973999, |
| "reward_std": 0.36369770765304565, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3151935264468193, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.2753906287252903, |
| "step": 147 |
| }, |
| { |
| "completion_length": 493.57032012939453, |
| "epoch": 0.4751203852327448, |
| "grad_norm": 0.228515625, |
| "kl": 0.0003881813900079578, |
| "learning_rate": 1.0048729989766394e-07, |
| "loss": 0.0, |
| "reward": 0.7446072101593018, |
| "reward_std": 0.37431684136390686, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.42234158515930176, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3222656399011612, |
| "step": 148 |
| }, |
| { |
| "completion_length": 543.2708587646484, |
| "epoch": 0.478330658105939, |
| "grad_norm": 0.1953125, |
| "kl": 0.00041060569492401555, |
| "learning_rate": 1.0012184146924223e-07, |
| "loss": 0.0, |
| "reward": 0.6233467310667038, |
| "reward_std": 0.3531793877482414, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.3141019344329834, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3092447966337204, |
| "step": 149 |
| }, |
| { |
| "completion_length": 487.1823043823242, |
| "epoch": 0.48154093097913325, |
| "grad_norm": 0.28515625, |
| "kl": 0.0004451891945791431, |
| "learning_rate": 1e-07, |
| "loss": 0.0, |
| "reward": 0.7395021021366119, |
| "reward_std": 0.35496869683265686, |
| "rewards/expression_based_accuracy_reward_length_penalized": 0.41202811151742935, |
| "rewards/format_reward": 0.0, |
| "rewards/soft_format_reward": 0.0, |
| "rewards/tag_count_reward": 0.3274739682674408, |
| "step": 150 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 150, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|