leonMW's picture
Upload folder using huggingface_hub
213ee0a verified
{
"best_global_step": 92,
"best_metric": 0.0008370681316591799,
"best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-4/checkpoint-92",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 184,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 473.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 399.97607421875,
"completions/mean_terminated_length": 399.97607421875,
"completions/min_length": 304.0,
"completions/min_terminated_length": 304.0,
"entropy": 0.35566435009241104,
"epoch": 0.010869565217391304,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027134701502786768,
"learning_rate": 1e-05,
"loss": 0.0026,
"num_tokens": 2869071.0,
"reward": 3.4189200401306152,
"reward_std": 0.13538040220737457,
"rewards/ngram_repetition2/mean": 0.9907151460647583,
"rewards/ngram_repetition2/std": 0.007372148334980011,
"rewards/ngram_repetition3/mean": 0.9988653659820557,
"rewards/ngram_repetition3/std": 0.0037813771050423384,
"rewards/symbolic_reward_accuracy/mean": 0.7431640625,
"rewards/symbolic_reward_accuracy/std": 0.43699485063552856,
"rewards/symbolic_reward_partial_score/mean": 0.9029541015625,
"rewards/symbolic_reward_partial_score/std": 0.19717122614383698,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9742211699485779,
"rewards/thinking_answer_ratio_reward/std": 0.004831792786717415,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1269948482513428,
"sampling/importance_sampling_ratio/min": 3.996394298155792e-05,
"sampling/sampling_logp_difference/max": 10.127532958984375,
"sampling/sampling_logp_difference/mean": 0.19825759530067444,
"step": 1
},
{
"clip_ratio/high_max": 0.3385416666666667,
"clip_ratio/high_mean": 0.19986979166666666,
"clip_ratio/low_mean": 0.2779947916666667,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.4778645833333333,
"entropy": 0.3625817572077115,
"epoch": 0.043478260869565216,
"grad_norm": 0.025662250505496927,
"learning_rate": 1e-05,
"loss": -0.0008,
"step": 4
},
{
"clip_ratio/high_max": 0.26953125,
"clip_ratio/high_mean": 0.15087890625,
"clip_ratio/low_mean": 0.201171875,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.35205078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 483.0,
"completions/max_terminated_length": 483.0,
"completions/mean_length": 389.82080078125,
"completions/mean_terminated_length": 389.82080078125,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"entropy": 0.36707244999706745,
"epoch": 0.08695652173913043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02048054919935291,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 5758528.0,
"reward": 3.2567176818847656,
"reward_std": 0.11631277203559875,
"rewards/ngram_repetition2/mean": 0.9903280138969421,
"rewards/ngram_repetition2/std": 0.008003082126379013,
"rewards/ngram_repetition3/mean": 0.9988331198692322,
"rewards/ngram_repetition3/std": 0.004472358617931604,
"rewards/symbolic_reward_accuracy/mean": 0.662109375,
"rewards/symbolic_reward_accuracy/std": 0.47310659289360046,
"rewards/symbolic_reward_partial_score/mean": 0.9028727412223816,
"rewards/symbolic_reward_partial_score/std": 0.16355262696743011,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9734619855880737,
"rewards/thinking_answer_ratio_reward/std": 0.005536045413464308,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1309096813201904,
"sampling/importance_sampling_ratio/min": 1.0269287486153189e-05,
"sampling/sampling_logp_difference/max": 11.486352920532227,
"sampling/sampling_logp_difference/mean": 0.20535901188850403,
"step": 8
},
{
"clip_ratio/high_max": 0.27734375,
"clip_ratio/high_mean": 0.16259765625,
"clip_ratio/low_mean": 0.197265625,
"clip_ratio/low_min": 0.09375,
"clip_ratio/region_mean": 0.35986328125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.0,
"completions/max_terminated_length": 503.0,
"completions/mean_length": 391.505859375,
"completions/mean_terminated_length": 391.505859375,
"completions/min_length": 311.0,
"completions/min_terminated_length": 311.0,
"entropy": 0.388662975281477,
"epoch": 0.13043478260869565,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02483318093046019,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 8607084.0,
"reward": 3.3946127891540527,
"reward_std": 0.11570382118225098,
"rewards/ngram_repetition2/mean": 0.9894477725028992,
"rewards/ngram_repetition2/std": 0.007921576499938965,
"rewards/ngram_repetition3/mean": 0.9988635778427124,
"rewards/ngram_repetition3/std": 0.00399815896525979,
"rewards/symbolic_reward_accuracy/mean": 0.72119140625,
"rewards/symbolic_reward_accuracy/std": 0.448522686958313,
"rewards/symbolic_reward_partial_score/mean": 0.922607421875,
"rewards/symbolic_reward_partial_score/std": 0.14913025498390198,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9739440679550171,
"rewards/thinking_answer_ratio_reward/std": 0.004836579784750938,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1356343030929565,
"sampling/importance_sampling_ratio/min": 0.0001149894596892409,
"sampling/sampling_logp_difference/max": 9.070670127868652,
"sampling/sampling_logp_difference/mean": 0.21488171815872192,
"step": 12
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.177734375,
"clip_ratio/low_mean": 0.1787109375,
"clip_ratio/low_min": 0.07421875,
"clip_ratio/region_mean": 0.3564453125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 492.0,
"completions/max_terminated_length": 492.0,
"completions/mean_length": 393.09716796875,
"completions/mean_terminated_length": 393.09716796875,
"completions/min_length": 313.0,
"completions/min_terminated_length": 313.0,
"entropy": 0.39366098679602146,
"epoch": 0.17391304347826086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03379401199661016,
"learning_rate": 1e-05,
"loss": 0.0008,
"num_tokens": 11465235.0,
"reward": 3.1309444904327393,
"reward_std": 0.27350103855133057,
"rewards/ngram_repetition2/mean": 0.9783711433410645,
"rewards/ngram_repetition2/std": 0.021668143570423126,
"rewards/ngram_repetition3/mean": 0.9917559623718262,
"rewards/ngram_repetition3/std": 0.014161880128085613,
"rewards/symbolic_reward_accuracy/mean": 0.625,
"rewards/symbolic_reward_accuracy/std": 0.48424115777015686,
"rewards/symbolic_reward_partial_score/mean": 0.8516031503677368,
"rewards/symbolic_reward_partial_score/std": 0.22019875049591064,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9640083312988281,
"rewards/thinking_answer_ratio_reward/std": 0.01703455112874508,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1348872184753418,
"sampling/importance_sampling_ratio/min": 4.663659638026729e-06,
"sampling/sampling_logp_difference/max": 12.275710105895996,
"sampling/sampling_logp_difference/mean": 0.21620666980743408,
"step": 16
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.1826171875,
"clip_ratio/low_mean": 0.17724609375,
"clip_ratio/low_min": 0.09375,
"clip_ratio/region_mean": 0.35986328125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 508.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 386.9951171875,
"completions/mean_terminated_length": 386.9951171875,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.44838985428214073,
"epoch": 0.21739130434782608,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02561700687134101,
"learning_rate": 1e-05,
"loss": 0.0009,
"num_tokens": 14298217.0,
"reward": 3.3726930618286133,
"reward_std": 0.11649461090564728,
"rewards/ngram_repetition2/mean": 0.9818264245986938,
"rewards/ngram_repetition2/std": 0.01684102788567543,
"rewards/ngram_repetition3/mean": 0.9955232739448547,
"rewards/ngram_repetition3/std": 0.010431738570332527,
"rewards/symbolic_reward_accuracy/mean": 0.72021484375,
"rewards/symbolic_reward_accuracy/std": 0.4490031898021698,
"rewards/symbolic_reward_partial_score/mean": 0.9027913212776184,
"rewards/symbolic_reward_partial_score/std": 0.1795635223388672,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9698631763458252,
"rewards/thinking_answer_ratio_reward/std": 0.012476499192416668,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1498932838439941,
"sampling/importance_sampling_ratio/min": 4.004936636192724e-05,
"sampling/sampling_logp_difference/max": 10.125397682189941,
"sampling/sampling_logp_difference/mean": 0.23924380540847778,
"step": 20
},
{
"clip_ratio/high_max": 0.328125,
"clip_ratio/high_mean": 0.20703125,
"clip_ratio/low_mean": 0.17822265625,
"clip_ratio/low_min": 0.05859375,
"clip_ratio/region_mean": 0.38525390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 501.0,
"completions/max_terminated_length": 501.0,
"completions/mean_length": 381.43408203125,
"completions/mean_terminated_length": 381.43408203125,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"entropy": 0.45580120012164116,
"epoch": 0.2608695652173913,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02764242724586242,
"learning_rate": 1e-05,
"loss": 0.0008,
"num_tokens": 17107138.0,
"reward": 3.2511990070343018,
"reward_std": 0.14526695013046265,
"rewards/ngram_repetition2/mean": 0.9758055210113525,
"rewards/ngram_repetition2/std": 0.026150498539209366,
"rewards/ngram_repetition3/mean": 0.991034984588623,
"rewards/ngram_repetition3/std": 0.018479736521840096,
"rewards/symbolic_reward_accuracy/mean": 0.666015625,
"rewards/symbolic_reward_accuracy/std": 0.47174936532974243,
"rewards/symbolic_reward_partial_score/mean": 0.8898518681526184,
"rewards/symbolic_reward_partial_score/std": 0.18666595220565796,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9647442102432251,
"rewards/thinking_answer_ratio_reward/std": 0.018395813181996346,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.151930332183838,
"sampling/importance_sampling_ratio/min": 2.3768032406223938e-05,
"sampling/sampling_logp_difference/max": 10.64716911315918,
"sampling/sampling_logp_difference/mean": 0.242381751537323,
"step": 24
},
{
"clip_ratio/high_max": 0.34375,
"clip_ratio/high_mean": 0.21533203125,
"clip_ratio/low_mean": 0.16552734375,
"clip_ratio/low_min": 0.07421875,
"clip_ratio/region_mean": 0.380859375,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 370.24658203125,
"completions/mean_terminated_length": 368.9267272949219,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.47484372183680534,
"epoch": 0.30434782608695654,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02571369534600996,
"learning_rate": 1e-05,
"loss": 0.0019,
"num_tokens": 19915323.0,
"reward": 3.4752464294433594,
"reward_std": 0.1370949149131775,
"rewards/ngram_repetition2/mean": 0.9834737181663513,
"rewards/ngram_repetition2/std": 0.016693396493792534,
"rewards/ngram_repetition3/mean": 0.996229887008667,
"rewards/ngram_repetition3/std": 0.01113222073763609,
"rewards/symbolic_reward_accuracy/mean": 0.7587890625,
"rewards/symbolic_reward_accuracy/std": 0.42792245745658875,
"rewards/symbolic_reward_partial_score/mean": 0.9284260869026184,
"rewards/symbolic_reward_partial_score/std": 0.15036074817180634,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9689278602600098,
"rewards/thinking_answer_ratio_reward/std": 0.024484839290380478,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1540093421936035,
"sampling/importance_sampling_ratio/min": 4.7302735765697435e-05,
"sampling/sampling_logp_difference/max": 9.958942413330078,
"sampling/sampling_logp_difference/mean": 0.24644868075847626,
"step": 28
},
{
"clip_ratio/high_max": 0.3046875,
"clip_ratio/high_mean": 0.17724609375,
"clip_ratio/low_mean": 0.19384765625,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.37109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 452.0,
"completions/max_terminated_length": 452.0,
"completions/mean_length": 361.70166015625,
"completions/mean_terminated_length": 361.70166015625,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"entropy": 0.4775677230209112,
"epoch": 0.34782608695652173,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.022611441969899414,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 22712344.0,
"reward": 3.4439804553985596,
"reward_std": 0.07519370317459106,
"rewards/ngram_repetition2/mean": 0.9849820137023926,
"rewards/ngram_repetition2/std": 0.015775341540575027,
"rewards/ngram_repetition3/mean": 0.9966925382614136,
"rewards/ngram_repetition3/std": 0.010486208833754063,
"rewards/symbolic_reward_accuracy/mean": 0.7451171875,
"rewards/symbolic_reward_accuracy/std": 0.4359017610549927,
"rewards/symbolic_reward_partial_score/mean": 0.9242349863052368,
"rewards/symbolic_reward_partial_score/std": 0.1511358916759491,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9694297313690186,
"rewards/thinking_answer_ratio_reward/std": 0.01129063218832016,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1553937196731567,
"sampling/importance_sampling_ratio/min": 1.3447781777031764e-10,
"sampling/sampling_logp_difference/max": 22.72962188720703,
"sampling/sampling_logp_difference/mean": 0.24660193920135498,
"step": 32
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.18896484375,
"clip_ratio/low_mean": 0.18798828125,
"clip_ratio/low_min": 0.08203125,
"clip_ratio/region_mean": 0.376953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 457.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 360.79736328125,
"completions/mean_terminated_length": 360.79736328125,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"entropy": 0.4877959694713354,
"epoch": 0.391304347826087,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02221078365492021,
"learning_rate": 1e-05,
"loss": 0.0004,
"num_tokens": 25517017.0,
"reward": 3.241891622543335,
"reward_std": 0.03938647359609604,
"rewards/ngram_repetition2/mean": 0.9857202768325806,
"rewards/ngram_repetition2/std": 0.013620593585073948,
"rewards/ngram_repetition3/mean": 0.9973255395889282,
"rewards/ngram_repetition3/std": 0.008434941992163658,
"rewards/symbolic_reward_accuracy/mean": 0.65234375,
"rewards/symbolic_reward_accuracy/std": 0.47634249925613403,
"rewards/symbolic_reward_partial_score/mean": 0.9076741933822632,
"rewards/symbolic_reward_partial_score/std": 0.141921728849411,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9699524641036987,
"rewards/thinking_answer_ratio_reward/std": 0.009303269907832146,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.156609296798706,
"sampling/importance_sampling_ratio/min": 1.4608130260995722e-09,
"sampling/sampling_logp_difference/max": 20.34427261352539,
"sampling/sampling_logp_difference/mean": 0.24704143404960632,
"step": 36
},
{
"clip_ratio/high_max": 0.31640625,
"clip_ratio/high_mean": 0.20263671875,
"clip_ratio/low_mean": 0.17431640625,
"clip_ratio/low_min": 0.0546875,
"clip_ratio/region_mean": 0.376953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 470.0,
"completions/max_terminated_length": 470.0,
"completions/mean_length": 365.380859375,
"completions/mean_terminated_length": 365.380859375,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"entropy": 0.500596784055233,
"epoch": 0.43478260869565216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028585581797482114,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 28302565.0,
"reward": 3.6249711513519287,
"reward_std": 0.031787335872650146,
"rewards/ngram_repetition2/mean": 0.9847173690795898,
"rewards/ngram_repetition2/std": 0.01566314324736595,
"rewards/ngram_repetition3/mean": 0.9967639446258545,
"rewards/ngram_repetition3/std": 0.010298742912709713,
"rewards/symbolic_reward_accuracy/mean": 0.8203125,
"rewards/symbolic_reward_accuracy/std": 0.38402071595191956,
"rewards/symbolic_reward_partial_score/mean": 0.954833984375,
"rewards/symbolic_reward_partial_score/std": 0.11018021404743195,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9697376489639282,
"rewards/thinking_answer_ratio_reward/std": 0.011102610267698765,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1601823568344116,
"sampling/importance_sampling_ratio/min": 2.0568222680594772e-05,
"sampling/sampling_logp_difference/max": 10.791763305664062,
"sampling/sampling_logp_difference/mean": 0.24917525053024292,
"step": 40
},
{
"clip_ratio/high_max": 0.26171875,
"clip_ratio/high_mean": 0.14892578125,
"clip_ratio/low_mean": 0.1982421875,
"clip_ratio/low_min": 0.08203125,
"clip_ratio/region_mean": 0.34716796875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 469.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 373.4169921875,
"completions/mean_terminated_length": 373.4169921875,
"completions/min_length": 306.0,
"completions/min_terminated_length": 306.0,
"entropy": 0.5211230479180813,
"epoch": 0.4782608695652174,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031003841790188075,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 31126747.0,
"reward": 3.277402639389038,
"reward_std": 0.036474019289016724,
"rewards/ngram_repetition2/mean": 0.9847090840339661,
"rewards/ngram_repetition2/std": 0.01521327905356884,
"rewards/ngram_repetition3/mean": 0.9967399835586548,
"rewards/ngram_repetition3/std": 0.00949870329350233,
"rewards/symbolic_reward_accuracy/mean": 0.6708984375,
"rewards/symbolic_reward_accuracy/std": 0.4700016975402832,
"rewards/symbolic_reward_partial_score/mean": 0.9060872793197632,
"rewards/symbolic_reward_partial_score/std": 0.16207432746887207,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9704017043113708,
"rewards/thinking_answer_ratio_reward/std": 0.010555099695920944,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1634962558746338,
"sampling/importance_sampling_ratio/min": 3.222545501557761e-06,
"sampling/sampling_logp_difference/max": 12.645339012145996,
"sampling/sampling_logp_difference/mean": 0.2521994411945343,
"step": 44
},
{
"clip_ratio/high_max": 0.31640625,
"clip_ratio/high_mean": 0.20166015625,
"clip_ratio/low_mean": 0.16064453125,
"clip_ratio/low_min": 0.05078125,
"clip_ratio/region_mean": 0.3623046875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 376.16943359375,
"completions/mean_terminated_length": 376.16943359375,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.5292842984199524,
"epoch": 0.5217391304347826,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.029177978249695414,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 33956566.0,
"reward": 3.5402560234069824,
"reward_std": 0.05406097322702408,
"rewards/ngram_repetition2/mean": 0.9857374429702759,
"rewards/ngram_repetition2/std": 0.011830438859760761,
"rewards/ngram_repetition3/mean": 0.9977642297744751,
"rewards/ngram_repetition3/std": 0.00712351780384779,
"rewards/symbolic_reward_accuracy/mean": 0.7822265625,
"rewards/symbolic_reward_accuracy/std": 0.4128333628177643,
"rewards/symbolic_reward_partial_score/mean": 0.9462483525276184,
"rewards/symbolic_reward_partial_score/std": 0.11659117788076401,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9719350934028625,
"rewards/thinking_answer_ratio_reward/std": 0.0077917324379086494,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1653231382369995,
"sampling/importance_sampling_ratio/min": 1.30699368128262e-06,
"sampling/sampling_logp_difference/max": 13.547780990600586,
"sampling/sampling_logp_difference/mean": 0.2539900541305542,
"step": 48
},
{
"clip_ratio/high_max": 0.30078125,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.19287109375,
"clip_ratio/low_min": 0.05859375,
"clip_ratio/region_mean": 0.38037109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 476.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 375.197265625,
"completions/mean_terminated_length": 375.197265625,
"completions/min_length": 285.0,
"completions/min_terminated_length": 285.0,
"entropy": 0.5228982605040073,
"epoch": 0.5652173913043478,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019909033791539,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 36800234.0,
"reward": 3.3348586559295654,
"reward_std": 0.031567975878715515,
"rewards/ngram_repetition2/mean": 0.9860277771949768,
"rewards/ngram_repetition2/std": 0.011108696460723877,
"rewards/ngram_repetition3/mean": 0.9980593919754028,
"rewards/ngram_repetition3/std": 0.006017275620251894,
"rewards/symbolic_reward_accuracy/mean": 0.69580078125,
"rewards/symbolic_reward_accuracy/std": 0.46017980575561523,
"rewards/symbolic_reward_partial_score/mean": 0.9136962890625,
"rewards/symbolic_reward_partial_score/std": 0.15295757353305817,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9719891548156738,
"rewards/thinking_answer_ratio_reward/std": 0.007201826199889183,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.165921688079834,
"sampling/importance_sampling_ratio/min": 7.888810068834573e-05,
"sampling/sampling_logp_difference/max": 9.447480201721191,
"sampling/sampling_logp_difference/mean": 0.2536054253578186,
"step": 52
},
{
"clip_ratio/high_max": 0.2890625,
"clip_ratio/high_mean": 0.19287109375,
"clip_ratio/low_mean": 0.18603515625,
"clip_ratio/low_min": 0.08203125,
"clip_ratio/region_mean": 0.37890625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 469.0,
"completions/max_terminated_length": 469.0,
"completions/mean_length": 373.00830078125,
"completions/mean_terminated_length": 373.00830078125,
"completions/min_length": 310.0,
"completions/min_terminated_length": 310.0,
"entropy": 0.5138954482972622,
"epoch": 0.6086956521739131,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018577199260664073,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 39623579.0,
"reward": 3.4861793518066406,
"reward_std": 0.01453761849552393,
"rewards/ngram_repetition2/mean": 0.9885779619216919,
"rewards/ngram_repetition2/std": 0.008892485871911049,
"rewards/ngram_repetition3/mean": 0.9986856579780579,
"rewards/ngram_repetition3/std": 0.004819902591407299,
"rewards/symbolic_reward_accuracy/mean": 0.76318359375,
"rewards/symbolic_reward_accuracy/std": 0.42523249983787537,
"rewards/symbolic_reward_partial_score/mean": 0.9302164316177368,
"rewards/symbolic_reward_partial_score/std": 0.14360538125038147,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9723219871520996,
"rewards/thinking_answer_ratio_reward/std": 0.006340987980365753,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1633408069610596,
"sampling/importance_sampling_ratio/min": 9.12318591872463e-06,
"sampling/sampling_logp_difference/max": 11.604691505432129,
"sampling/sampling_logp_difference/mean": 0.2456064522266388,
"step": 56
},
{
"clip_ratio/high_max": 0.26953125,
"clip_ratio/high_mean": 0.17626953125,
"clip_ratio/low_mean": 0.18798828125,
"clip_ratio/low_min": 0.08984375,
"clip_ratio/region_mean": 0.3642578125,
"completions/clipped_ratio": 0.0009765625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 473.0,
"completions/mean_length": 383.15380859375,
"completions/mean_terminated_length": 380.5254211425781,
"completions/min_length": 303.0,
"completions/min_terminated_length": 303.0,
"entropy": 0.5303183943033218,
"epoch": 0.6521739130434783,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.023767814234547865,
"learning_rate": 1e-05,
"loss": 0.0032,
"num_tokens": 42464534.0,
"reward": 3.4200026988983154,
"reward_std": 0.044837385416030884,
"rewards/ngram_repetition2/mean": 0.9886395931243896,
"rewards/ngram_repetition2/std": 0.026615649461746216,
"rewards/ngram_repetition3/mean": 0.9978044629096985,
"rewards/ngram_repetition3/std": 0.02556409314274788,
"rewards/symbolic_reward_accuracy/mean": 0.72705078125,
"rewards/symbolic_reward_accuracy/std": 0.4455837607383728,
"rewards/symbolic_reward_partial_score/mean": 0.9368082284927368,
"rewards/symbolic_reward_partial_score/std": 0.12158261984586716,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.971657395362854,
"rewards/thinking_answer_ratio_reward/std": 0.031062643975019455,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1663137674331665,
"sampling/importance_sampling_ratio/min": 0.00022333291417453438,
"sampling/sampling_logp_difference/max": 8.40684700012207,
"sampling/sampling_logp_difference/mean": 0.2500694990158081,
"step": 60
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.1826171875,
"clip_ratio/low_mean": 0.1728515625,
"clip_ratio/low_min": 0.0625,
"clip_ratio/region_mean": 0.35546875,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 539.0,
"completions/mean_length": 386.85986328125,
"completions/mean_terminated_length": 385.5481262207031,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"entropy": 0.5535794571042061,
"epoch": 0.6956521739130435,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.033414142847404786,
"learning_rate": 1e-05,
"loss": 0.0016,
"num_tokens": 45319415.0,
"reward": 3.5498650074005127,
"reward_std": 0.044402044266462326,
"rewards/ngram_repetition2/mean": 0.9896550178527832,
"rewards/ngram_repetition2/std": 0.01944366842508316,
"rewards/ngram_repetition3/mean": 0.998386025428772,
"rewards/ngram_repetition3/std": 0.017801163718104362,
"rewards/symbolic_reward_accuracy/mean": 0.78857421875,
"rewards/symbolic_reward_accuracy/std": 0.40841934084892273,
"rewards/symbolic_reward_partial_score/mean": 0.943359375,
"rewards/symbolic_reward_partial_score/std": 0.1245667040348053,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9720906019210815,
"rewards/thinking_answer_ratio_reward/std": 0.022436225786805153,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1719954013824463,
"sampling/importance_sampling_ratio/min": 3.318129529361613e-05,
"sampling/sampling_logp_difference/max": 10.31352424621582,
"sampling/sampling_logp_difference/mean": 0.25759539008140564,
"step": 64
},
{
"clip_ratio/high_max": 0.3203125,
"clip_ratio/high_mean": 0.21630859375,
"clip_ratio/low_mean": 0.16748046875,
"clip_ratio/low_min": 0.0625,
"clip_ratio/region_mean": 0.3837890625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 513.0,
"completions/max_terminated_length": 513.0,
"completions/mean_length": 394.04443359375,
"completions/mean_terminated_length": 394.04443359375,
"completions/min_length": 325.0,
"completions/min_terminated_length": 325.0,
"entropy": 0.5683293081820011,
"epoch": 0.7391304347826086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02077882386260629,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 48182674.0,
"reward": 3.4722495079040527,
"reward_std": 0.027738399803638458,
"rewards/ngram_repetition2/mean": 0.990430474281311,
"rewards/ngram_repetition2/std": 0.008477425202727318,
"rewards/ngram_repetition3/mean": 0.998767614364624,
"rewards/ngram_repetition3/std": 0.004312796052545309,
"rewards/symbolic_reward_accuracy/mean": 0.75390625,
"rewards/symbolic_reward_accuracy/std": 0.43083900213241577,
"rewards/symbolic_reward_partial_score/mean": 0.934814453125,
"rewards/symbolic_reward_partial_score/std": 0.1286529004573822,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9730780124664307,
"rewards/thinking_answer_ratio_reward/std": 0.006791951600462198,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.174986720085144,
"sampling/importance_sampling_ratio/min": 5.481481275637634e-05,
"sampling/sampling_logp_difference/max": 9.81155014038086,
"sampling/sampling_logp_difference/mean": 0.2610657513141632,
"step": 68
},
{
"clip_ratio/high_max": 0.23046875,
"clip_ratio/high_mean": 0.13134765625,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.09765625,
"clip_ratio/region_mean": 0.33447265625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 525.0,
"completions/max_terminated_length": 525.0,
"completions/mean_length": 402.0390625,
"completions/mean_terminated_length": 402.0390625,
"completions/min_length": 305.0,
"completions/min_terminated_length": 305.0,
"entropy": 0.5930491574108601,
"epoch": 0.782608695652174,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0719362481523671,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 51030626.0,
"reward": 3.507338047027588,
"reward_std": 0.07821064442396164,
"rewards/ngram_repetition2/mean": 0.9908634424209595,
"rewards/ngram_repetition2/std": 0.006825427990406752,
"rewards/ngram_repetition3/mean": 0.9992104768753052,
"rewards/ngram_repetition3/std": 0.0024387796875089407,
"rewards/symbolic_reward_accuracy/mean": 0.7705078125,
"rewards/symbolic_reward_accuracy/std": 0.4206089675426483,
"rewards/symbolic_reward_partial_score/mean": 0.9366861581802368,
"rewards/symbolic_reward_partial_score/std": 0.12974172830581665,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9735254049301147,
"rewards/thinking_answer_ratio_reward/std": 0.006129096262156963,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1793080568313599,
"sampling/importance_sampling_ratio/min": 8.93013350378169e-07,
"sampling/sampling_logp_difference/max": 13.928664207458496,
"sampling/sampling_logp_difference/mean": 0.2646637558937073,
"step": 72
},
{
"clip_ratio/high_max": 0.16796875,
"clip_ratio/high_mean": 0.10595703125,
"clip_ratio/low_mean": 0.2255859375,
"clip_ratio/low_min": 0.11328125,
"clip_ratio/region_mean": 0.33154296875,
"completions/clipped_ratio": 0.00634765625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 664.0,
"completions/mean_length": 425.64794921875,
"completions/mean_terminated_length": 408.74249267578125,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"entropy": 0.6297206245362759,
"epoch": 0.8260869565217391,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17694802563627895,
"learning_rate": 1e-05,
"loss": 0.0207,
"num_tokens": 53958609.0,
"reward": 3.434380054473877,
"reward_std": 0.15169459581375122,
"rewards/ngram_repetition2/mean": 0.986931562423706,
"rewards/ngram_repetition2/std": 0.04639098048210144,
"rewards/ngram_repetition3/mean": 0.9962227940559387,
"rewards/ngram_repetition3/std": 0.046196915209293365,
"rewards/symbolic_reward_accuracy/mean": 0.74072265625,
"rewards/symbolic_reward_accuracy/std": 0.4383451044559479,
"rewards/symbolic_reward_partial_score/mean": 0.9265950918197632,
"rewards/symbolic_reward_partial_score/std": 0.153251513838768,
"rewards/tag_count_reward/mean": 0.996826171875,
"rewards/tag_count_reward/std": 0.03971915319561958,
"rewards/thinking_answer_ratio_reward/mean": 0.9682024717330933,
"rewards/thinking_answer_ratio_reward/std": 0.07753153145313263,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1941275596618652,
"sampling/importance_sampling_ratio/min": 4.905196692561731e-05,
"sampling/sampling_logp_difference/max": 9.922630310058594,
"sampling/sampling_logp_difference/mean": 0.27940261363983154,
"step": 76
},
{
"clip_ratio/high_max": 0.2109375,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1884765625,
"clip_ratio/low_min": 0.07421875,
"clip_ratio/region_mean": 0.3134765625,
"completions/clipped_ratio": 0.025390625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 846.0,
"completions/mean_length": 474.76806640625,
"completions/mean_terminated_length": 407.1047058105469,
"completions/min_length": 308.0,
"completions/min_terminated_length": 308.0,
"entropy": 0.6249676272273064,
"epoch": 0.8695652173913043,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.036438229713036584,
"learning_rate": 1e-05,
"loss": 0.0733,
"num_tokens": 56952342.0,
"reward": 3.3732757568359375,
"reward_std": 0.3173109292984009,
"rewards/ngram_repetition2/mean": 0.9734116792678833,
"rewards/ngram_repetition2/std": 0.09751972556114197,
"rewards/ngram_repetition3/mean": 0.9850020408630371,
"rewards/ngram_repetition3/std": 0.09716872870922089,
"rewards/symbolic_reward_accuracy/mean": 0.73046875,
"rewards/symbolic_reward_accuracy/std": 0.4438246786594391,
"rewards/symbolic_reward_partial_score/mean": 0.8959553837776184,
"rewards/symbolic_reward_partial_score/std": 0.21540075540542603,
"rewards/tag_count_reward/mean": 0.9873046875,
"rewards/tag_count_reward/std": 0.07867342233657837,
"rewards/thinking_answer_ratio_reward/mean": 0.9493899941444397,
"rewards/thinking_answer_ratio_reward/std": 0.15333302319049835,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1926825046539307,
"sampling/importance_sampling_ratio/min": 1.8732294847723097e-05,
"sampling/sampling_logp_difference/max": 10.885261535644531,
"sampling/sampling_logp_difference/mean": 0.27355605363845825,
"step": 80
},
{
"clip_ratio/high_max": 0.16015625,
"clip_ratio/high_mean": 0.08154296875,
"clip_ratio/low_mean": 0.1630859375,
"clip_ratio/low_min": 0.0546875,
"clip_ratio/region_mean": 0.24462890625,
"completions/clipped_ratio": 0.041015625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1910.0,
"completions/mean_length": 513.650390625,
"completions/mean_terminated_length": 404.23016357421875,
"completions/min_length": 283.0,
"completions/min_terminated_length": 283.0,
"entropy": 0.6995769254863262,
"epoch": 0.9130434782608695,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3126366897938619,
"learning_rate": 1e-05,
"loss": 0.1153,
"num_tokens": 60057386.0,
"reward": 3.4059977531433105,
"reward_std": 0.4714565873146057,
"rewards/ngram_repetition2/mean": 0.9679386615753174,
"rewards/ngram_repetition2/std": 0.09275452047586441,
"rewards/ngram_repetition3/mean": 0.984743595123291,
"rewards/ngram_repetition3/std": 0.09202881902456284,
"rewards/symbolic_reward_accuracy/mean": 0.7451171875,
"rewards/symbolic_reward_accuracy/std": 0.4359017610549927,
"rewards/symbolic_reward_partial_score/mean": 0.9079182744026184,
"rewards/symbolic_reward_partial_score/std": 0.21840326488018036,
"rewards/tag_count_reward/mean": 0.97900390625,
"rewards/tag_count_reward/std": 0.10031013935804367,
"rewards/thinking_answer_ratio_reward/mean": 0.9314362406730652,
"rewards/thinking_answer_ratio_reward/std": 0.19521328806877136,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2058464288711548,
"sampling/importance_sampling_ratio/min": 0.00020593531371559948,
"sampling/sampling_logp_difference/max": 8.487948417663574,
"sampling/sampling_logp_difference/mean": 0.2925470471382141,
"step": 84
},
{
"clip_ratio/high_max": 0.04296875,
"clip_ratio/high_mean": 0.013671875,
"clip_ratio/low_mean": 0.30078125,
"clip_ratio/low_min": 0.16796875,
"clip_ratio/region_mean": 0.314453125,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 578.0,
"completions/mean_length": 382.1064453125,
"completions/mean_terminated_length": 378.160400390625,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"entropy": 0.7504777312278748,
"epoch": 0.9565217391304348,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08070835974712572,
"learning_rate": 1e-05,
"loss": 0.0066,
"num_tokens": 62905700.0,
"reward": 3.419804811477661,
"reward_std": 0.09055649489164352,
"rewards/ngram_repetition2/mean": 0.980319619178772,
"rewards/ngram_repetition2/std": 0.011525592766702175,
"rewards/ngram_repetition3/mean": 0.9975647330284119,
"rewards/ngram_repetition3/std": 0.005651315674185753,
"rewards/symbolic_reward_accuracy/mean": 0.732421875,
"rewards/symbolic_reward_accuracy/std": 0.4428044855594635,
"rewards/symbolic_reward_partial_score/mean": 0.9267171025276184,
"rewards/symbolic_reward_partial_score/std": 0.14161133766174316,
"rewards/tag_count_reward/mean": 0.998779296875,
"rewards/tag_count_reward/std": 0.02468114346265793,
"rewards/thinking_answer_ratio_reward/mean": 0.9685767292976379,
"rewards/thinking_answer_ratio_reward/std": 0.04834046587347984,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.249918818473816,
"sampling/importance_sampling_ratio/min": 0.0003460382577031851,
"sampling/sampling_logp_difference/max": 7.968961238861084,
"sampling/sampling_logp_difference/mean": 0.3476927876472473,
"step": 88
},
{
"clip_ratio/high_max": 0.140625,
"clip_ratio/high_mean": 0.07470703125,
"clip_ratio/low_mean": 0.2451171875,
"clip_ratio/low_min": 0.12890625,
"clip_ratio/region_mean": 0.31982421875,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 579.0,
"completions/mean_length": 347.271484375,
"completions/mean_terminated_length": 343.2743225097656,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"entropy": 0.8337125517427921,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09077319995516756,
"learning_rate": 1e-05,
"loss": 0.0058,
"num_tokens": 65689008.0,
"reward": 3.366065502166748,
"reward_std": 0.12272138148546219,
"rewards/ngram_repetition2/mean": 0.9778505563735962,
"rewards/ngram_repetition2/std": 0.02180611714720726,
"rewards/ngram_repetition3/mean": 0.9966850280761719,
"rewards/ngram_repetition3/std": 0.01893492229282856,
"rewards/symbolic_reward_accuracy/mean": 0.7119140625,
"rewards/symbolic_reward_accuracy/std": 0.4529819190502167,
"rewards/symbolic_reward_partial_score/mean": 0.91357421875,
"rewards/symbolic_reward_partial_score/std": 0.1592041552066803,
"rewards/tag_count_reward/mean": 0.999267578125,
"rewards/tag_count_reward/std": 0.019127286970615387,
"rewards/thinking_answer_ratio_reward/mean": 0.9650210738182068,
"rewards/thinking_answer_ratio_reward/std": 0.03827888146042824,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.254873275756836,
"sampling/importance_sampling_ratio/min": 0.00030355059425346553,
"sampling/sampling_logp_difference/max": 8.09996223449707,
"sampling/sampling_logp_difference/mean": 0.347744882106781,
"step": 92
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0008223684210526315,
"eval_completions/max_length": 730.6842105263158,
"eval_completions/max_terminated_length": 454.36842105263156,
"eval_completions/mean_length": 323.31825657894734,
"eval_completions/mean_terminated_length": 321.05658762078536,
"eval_completions/min_length": 223.05263157894737,
"eval_completions/min_terminated_length": 223.05263157894737,
"eval_entropy": 0.8378904399118925,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.0008370681316591799,
"eval_num_tokens": 65689008.0,
"eval_reward": 3.2917319975401225,
"eval_reward_std": 0.11801875697749079,
"eval_rewards/ngram_repetition2/mean": 0.9756150371149966,
"eval_rewards/ngram_repetition2/std": 0.01365309997804855,
"eval_rewards/ngram_repetition3/mean": 0.9962926067804035,
"eval_rewards/ngram_repetition3/std": 0.006524833691257395,
"eval_rewards/symbolic_reward_accuracy/mean": 0.678453947368421,
"eval_rewards/symbolic_reward_accuracy/std": 0.4427479756505866,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9067297138665852,
"eval_rewards/symbolic_reward_partial_score/std": 0.1448917659489732,
"eval_rewards/tag_count_reward/mean": 0.998766447368421,
"eval_rewards/tag_count_reward/std": 0.012580533757021553,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9608869458499708,
"eval_rewards/thinking_answer_ratio_reward/std": 0.03243385211221481,
"eval_runtime": 180.375,
"eval_samples_per_second": 0.832,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.2766095650823492,
"eval_sampling/importance_sampling_ratio/min": 0.0030842775350289516,
"eval_sampling/sampling_logp_difference/max": 5.888615081184788,
"eval_sampling/sampling_logp_difference/mean": 0.37607322868547943,
"eval_steps_per_second": 0.011,
"step": 92
},
{
"clip_ratio/high_max": 0.1796875,
"clip_ratio/high_mean": 0.08984375,
"clip_ratio/low_mean": 0.23876953125,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.32861328125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 484.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 318.796875,
"completions/mean_terminated_length": 318.796875,
"completions/min_length": 188.0,
"completions/min_terminated_length": 188.0,
"entropy": 0.8296824619174004,
"epoch": 1.0434782608695652,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.035556589091200325,
"learning_rate": 1e-05,
"loss": 0.0004,
"num_tokens": 68391824.0,
"reward": 3.526369333267212,
"reward_std": 0.10698950290679932,
"rewards/ngram_repetition2/mean": 0.976723313331604,
"rewards/ngram_repetition2/std": 0.013903986662626266,
"rewards/ngram_repetition3/mean": 0.9966170787811279,
"rewards/ngram_repetition3/std": 0.007368884980678558,
"rewards/symbolic_reward_accuracy/mean": 0.779296875,
"rewards/symbolic_reward_accuracy/std": 0.414821982383728,
"rewards/symbolic_reward_partial_score/mean": 0.9383952021598816,
"rewards/symbolic_reward_partial_score/std": 0.13407477736473083,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9647086262702942,
"rewards/thinking_answer_ratio_reward/std": 0.010662911459803581,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.277923345565796,
"sampling/importance_sampling_ratio/min": 0.0010678451508283615,
"sampling/sampling_logp_difference/max": 6.8421125411987305,
"sampling/sampling_logp_difference/mean": 0.37708646059036255,
"step": 96
},
{
"clip_ratio/high_max": 0.3515625,
"clip_ratio/high_mean": 0.21728515625,
"clip_ratio/low_mean": 0.1474609375,
"clip_ratio/low_min": 0.05078125,
"clip_ratio/region_mean": 0.36474609375,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 474.0,
"completions/mean_length": 312.2705078125,
"completions/mean_terminated_length": 310.9223327636719,
"completions/min_length": 199.0,
"completions/min_terminated_length": 199.0,
"entropy": 0.8829836808145046,
"epoch": 1.0869565217391304,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.026459033753815097,
"learning_rate": 1e-05,
"loss": 0.0015,
"num_tokens": 71097114.0,
"reward": 3.375732421875,
"reward_std": 0.16172271966934204,
"rewards/ngram_repetition2/mean": 0.9744597673416138,
"rewards/ngram_repetition2/std": 0.013245878741145134,
"rewards/ngram_repetition3/mean": 0.996464729309082,
"rewards/ngram_repetition3/std": 0.006197268608957529,
"rewards/symbolic_reward_accuracy/mean": 0.71337890625,
"rewards/symbolic_reward_accuracy/std": 0.45229339599609375,
"rewards/symbolic_reward_partial_score/mean": 0.9201253056526184,
"rewards/symbolic_reward_partial_score/std": 0.14969654381275177,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.9628262519836426,
"rewards/thinking_answer_ratio_reward/std": 0.03255803510546684,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.280361533164978,
"sampling/importance_sampling_ratio/min": 0.000973947171587497,
"sampling/sampling_logp_difference/max": 6.9341535568237305,
"sampling/sampling_logp_difference/mean": 0.3825719654560089,
"step": 100
},
{
"clip_ratio/high_max": 0.2421875,
"clip_ratio/high_mean": 0.1552734375,
"clip_ratio/low_mean": 0.21044921875,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.36572265625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 507.0,
"completions/max_terminated_length": 507.0,
"completions/mean_length": 305.24658203125,
"completions/mean_terminated_length": 305.24658203125,
"completions/min_length": 184.0,
"completions/min_terminated_length": 184.0,
"entropy": 0.93367725238204,
"epoch": 1.1304347826086956,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.023622811381027897,
"learning_rate": 1e-05,
"loss": -0.0013,
"num_tokens": 73753171.0,
"reward": 3.5650792121887207,
"reward_std": 0.07873280346393585,
"rewards/ngram_repetition2/mean": 0.9779536724090576,
"rewards/ngram_repetition2/std": 0.011236137710511684,
"rewards/ngram_repetition3/mean": 0.997450590133667,
"rewards/ngram_repetition3/std": 0.004056216217577457,
"rewards/symbolic_reward_accuracy/mean": 0.79541015625,
"rewards/symbolic_reward_accuracy/std": 0.40350010991096497,
"rewards/symbolic_reward_partial_score/mean": 0.9448649287223816,
"rewards/symbolic_reward_partial_score/std": 0.12454802542924881,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9639977812767029,
"rewards/thinking_answer_ratio_reward/std": 0.010575964115560055,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2900059223175049,
"sampling/importance_sampling_ratio/min": 0.00017328630201518536,
"sampling/sampling_logp_difference/max": 8.660565376281738,
"sampling/sampling_logp_difference/mean": 0.3951420187950134,
"step": 104
},
{
"clip_ratio/high_max": 0.3046875,
"clip_ratio/high_mean": 0.193359375,
"clip_ratio/low_mean": 0.1904296875,
"clip_ratio/low_min": 0.07421875,
"clip_ratio/region_mean": 0.3837890625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 538.0,
"completions/max_terminated_length": 538.0,
"completions/mean_length": 312.146484375,
"completions/mean_terminated_length": 312.146484375,
"completions/min_length": 181.0,
"completions/min_terminated_length": 181.0,
"entropy": 1.0210995934903622,
"epoch": 1.1739130434782608,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028388966774426528,
"learning_rate": 1e-05,
"loss": -0.0009,
"num_tokens": 76420191.0,
"reward": 3.539149761199951,
"reward_std": 0.10342732071876526,
"rewards/ngram_repetition2/mean": 0.9775739908218384,
"rewards/ngram_repetition2/std": 0.011158421635627747,
"rewards/ngram_repetition3/mean": 0.99737548828125,
"rewards/ngram_repetition3/std": 0.0038673817180097103,
"rewards/symbolic_reward_accuracy/mean": 0.78369140625,
"rewards/symbolic_reward_accuracy/std": 0.4118276536464691,
"rewards/symbolic_reward_partial_score/mean": 0.942626953125,
"rewards/symbolic_reward_partial_score/std": 0.12709258496761322,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9634621739387512,
"rewards/thinking_answer_ratio_reward/std": 0.02338651940226555,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.30389404296875,
"sampling/importance_sampling_ratio/min": 0.0012677250197157264,
"sampling/sampling_logp_difference/max": 6.670531272888184,
"sampling/sampling_logp_difference/mean": 0.4129188656806946,
"step": 108
},
{
"clip_ratio/high_max": 0.26171875,
"clip_ratio/high_mean": 0.15966796875,
"clip_ratio/low_mean": 0.212890625,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.37255859375,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1036.0,
"completions/mean_length": 323.71337890625,
"completions/mean_terminated_length": 322.37078857421875,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"entropy": 1.1148979514837265,
"epoch": 1.2173913043478262,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.025973854193448703,
"learning_rate": 1e-05,
"loss": 0.0012,
"num_tokens": 79101396.0,
"reward": 3.5500121116638184,
"reward_std": 0.07026512920856476,
"rewards/ngram_repetition2/mean": 0.9797195792198181,
"rewards/ngram_repetition2/std": 0.011294333264231682,
"rewards/ngram_repetition3/mean": 0.9979097843170166,
"rewards/ngram_repetition3/std": 0.003578017931431532,
"rewards/symbolic_reward_accuracy/mean": 0.7890625,
"rewards/symbolic_reward_accuracy/std": 0.408073753118515,
"rewards/symbolic_reward_partial_score/mean": 0.9429525136947632,
"rewards/symbolic_reward_partial_score/std": 0.12270573526620865,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.9646666049957275,
"rewards/thinking_answer_ratio_reward/std": 0.03140328451991081,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3225877285003662,
"sampling/importance_sampling_ratio/min": 0.0020175804384052753,
"sampling/sampling_logp_difference/max": 6.2058563232421875,
"sampling/sampling_logp_difference/mean": 0.43698880076408386,
"step": 112
},
{
"clip_ratio/high_max": 0.30859375,
"clip_ratio/high_mean": 0.16650390625,
"clip_ratio/low_mean": 0.19873046875,
"clip_ratio/low_min": 0.0703125,
"clip_ratio/region_mean": 0.365234375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 609.0,
"completions/max_terminated_length": 609.0,
"completions/mean_length": 321.759765625,
"completions/mean_terminated_length": 321.759765625,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 1.1531813517212868,
"epoch": 1.2608695652173914,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03531320773224689,
"learning_rate": 1e-05,
"loss": -0.0021,
"num_tokens": 81835624.0,
"reward": 3.5205469131469727,
"reward_std": 0.06826324760913849,
"rewards/ngram_repetition2/mean": 0.9809833765029907,
"rewards/ngram_repetition2/std": 0.009805475361645222,
"rewards/ngram_repetition3/mean": 0.998246431350708,
"rewards/ngram_repetition3/std": 0.003229390596970916,
"rewards/symbolic_reward_accuracy/mean": 0.77392578125,
"rewards/symbolic_reward_accuracy/std": 0.4183899462223053,
"rewards/symbolic_reward_partial_score/mean": 0.9432373046875,
"rewards/symbolic_reward_partial_score/std": 0.1203688383102417,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9665986895561218,
"rewards/thinking_answer_ratio_reward/std": 0.007385050877928734,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3322265148162842,
"sampling/importance_sampling_ratio/min": 0.0010348489740863442,
"sampling/sampling_logp_difference/max": 6.873499870300293,
"sampling/sampling_logp_difference/mean": 0.44727569818496704,
"step": 116
},
{
"clip_ratio/high_max": 0.2890625,
"clip_ratio/high_mean": 0.1962890625,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.3837890625,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 723.0,
"completions/mean_length": 349.056640625,
"completions/mean_terminated_length": 347.7264404296875,
"completions/min_length": 210.0,
"completions/min_terminated_length": 210.0,
"entropy": 1.2782283127307892,
"epoch": 1.3043478260869565,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.034579763552097215,
"learning_rate": 1e-05,
"loss": 0.0015,
"num_tokens": 84613084.0,
"reward": 3.4549241065979004,
"reward_std": 0.06410142034292221,
"rewards/ngram_repetition2/mean": 0.9802985191345215,
"rewards/ngram_repetition2/std": 0.010372255928814411,
"rewards/ngram_repetition3/mean": 0.9981052875518799,
"rewards/ngram_repetition3/std": 0.0034116564784199,
"rewards/symbolic_reward_accuracy/mean": 0.7470703125,
"rewards/symbolic_reward_accuracy/std": 0.43479716777801514,
"rewards/symbolic_reward_partial_score/mean": 0.9315592050552368,
"rewards/symbolic_reward_partial_score/std": 0.14198674261569977,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9684212803840637,
"rewards/thinking_answer_ratio_reward/std": 0.022755270823836327,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3524930477142334,
"sampling/importance_sampling_ratio/min": 0.0009223732049576938,
"sampling/sampling_logp_difference/max": 6.988560676574707,
"sampling/sampling_logp_difference/mean": 0.4782055616378784,
"step": 120
},
{
"clip_ratio/high_max": 0.25390625,
"clip_ratio/high_mean": 0.13916015625,
"clip_ratio/low_mean": 0.23193359375,
"clip_ratio/low_min": 0.11328125,
"clip_ratio/region_mean": 0.37109375,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 644.0,
"completions/mean_length": 365.08154296875,
"completions/mean_terminated_length": 361.1105041503906,
"completions/min_length": 207.0,
"completions/min_terminated_length": 207.0,
"entropy": 1.3577501401305199,
"epoch": 1.3478260869565217,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03263029986961485,
"learning_rate": 1e-05,
"loss": 0.0038,
"num_tokens": 87413859.0,
"reward": 3.3977296352386475,
"reward_std": 0.07217299938201904,
"rewards/ngram_repetition2/mean": 0.9815422296524048,
"rewards/ngram_repetition2/std": 0.017534635961055756,
"rewards/ngram_repetition3/mean": 0.9980996251106262,
"rewards/ngram_repetition3/std": 0.014925251714885235,
"rewards/symbolic_reward_accuracy/mean": 0.72265625,
"rewards/symbolic_reward_accuracy/std": 0.44779694080352783,
"rewards/symbolic_reward_partial_score/mean": 0.9239094853401184,
"rewards/symbolic_reward_partial_score/std": 0.14968585968017578,
"rewards/tag_count_reward/mean": 0.9990234375,
"rewards/tag_count_reward/std": 0.022080888971686363,
"rewards/thinking_answer_ratio_reward/mean": 0.9687855243682861,
"rewards/thinking_answer_ratio_reward/std": 0.043307721614837646,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3644728660583496,
"sampling/importance_sampling_ratio/min": 4.628046553989407e-06,
"sampling/sampling_logp_difference/max": 12.28337574005127,
"sampling/sampling_logp_difference/mean": 0.4966655373573303,
"step": 124
},
{
"clip_ratio/high_max": 0.26953125,
"clip_ratio/high_mean": 0.1572265625,
"clip_ratio/low_mean": 0.189453125,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.3466796875,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 662.0,
"completions/mean_length": 374.17041015625,
"completions/mean_terminated_length": 368.8908996582031,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 1.4103393778204918,
"epoch": 1.391304347826087,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03271499886368749,
"learning_rate": 1e-05,
"loss": 0.0068,
"num_tokens": 90233248.0,
"reward": 3.470027446746826,
"reward_std": 0.08837710320949554,
"rewards/ngram_repetition2/mean": 0.9828887581825256,
"rewards/ngram_repetition2/std": 0.008584595285356045,
"rewards/ngram_repetition3/mean": 0.9987025260925293,
"rewards/ngram_repetition3/std": 0.0025142852682620287,
"rewards/symbolic_reward_accuracy/mean": 0.75390625,
"rewards/symbolic_reward_accuracy/std": 0.43083900213241577,
"rewards/symbolic_reward_partial_score/mean": 0.9336751103401184,
"rewards/symbolic_reward_partial_score/std": 0.1408122330904007,
"rewards/tag_count_reward/mean": 0.9990234375,
"rewards/tag_count_reward/std": 0.022080888971686363,
"rewards/thinking_answer_ratio_reward/mean": 0.9700585007667542,
"rewards/thinking_answer_ratio_reward/std": 0.043263670057058334,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3686330318450928,
"sampling/importance_sampling_ratio/min": 1.2951417147633038e-06,
"sampling/sampling_logp_difference/max": 13.556890487670898,
"sampling/sampling_logp_difference/mean": 0.5069293975830078,
"step": 128
},
{
"clip_ratio/high_max": 0.28515625,
"clip_ratio/high_mean": 0.1611328125,
"clip_ratio/low_mean": 0.2099609375,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.37109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 640.0,
"completions/max_terminated_length": 640.0,
"completions/mean_length": 372.28515625,
"completions/mean_terminated_length": 372.28515625,
"completions/min_length": 222.0,
"completions/min_terminated_length": 222.0,
"entropy": 1.4540528357028961,
"epoch": 1.434782608695652,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.037706615340187905,
"learning_rate": 1e-05,
"loss": -0.0007,
"num_tokens": 93048776.0,
"reward": 3.6577959060668945,
"reward_std": 0.08277644217014313,
"rewards/ngram_repetition2/mean": 0.9837426543235779,
"rewards/ngram_repetition2/std": 0.008059200830757618,
"rewards/ngram_repetition3/mean": 0.9987409710884094,
"rewards/ngram_repetition3/std": 0.002457347232848406,
"rewards/symbolic_reward_accuracy/mean": 0.8359375,
"rewards/symbolic_reward_accuracy/std": 0.37042272090911865,
"rewards/symbolic_reward_partial_score/mean": 0.9568685293197632,
"rewards/symbolic_reward_partial_score/std": 0.11145035177469254,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.9716061353683472,
"rewards/thinking_answer_ratio_reward/std": 0.03078635036945343,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3747305870056152,
"sampling/importance_sampling_ratio/min": 6.537237368320348e-06,
"sampling/sampling_logp_difference/max": 11.937995910644531,
"sampling/sampling_logp_difference/mean": 0.5204066634178162,
"step": 132
},
{
"clip_ratio/high_max": 0.24609375,
"clip_ratio/high_mean": 0.1455078125,
"clip_ratio/low_mean": 0.2216796875,
"clip_ratio/low_min": 0.109375,
"clip_ratio/region_mean": 0.3671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 690.0,
"completions/max_terminated_length": 690.0,
"completions/mean_length": 384.67431640625,
"completions/mean_terminated_length": 384.67431640625,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 1.5088882371783257,
"epoch": 1.4782608695652173,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.037293164778679895,
"learning_rate": 1e-05,
"loss": -0.0015,
"num_tokens": 95896013.0,
"reward": 3.551879405975342,
"reward_std": 0.05557756870985031,
"rewards/ngram_repetition2/mean": 0.9850949048995972,
"rewards/ngram_repetition2/std": 0.007369986269623041,
"rewards/ngram_repetition3/mean": 0.9990456104278564,
"rewards/ngram_repetition3/std": 0.002027435228228569,
"rewards/symbolic_reward_accuracy/mean": 0.7919921875,
"rewards/symbolic_reward_accuracy/std": 0.40598157048225403,
"rewards/symbolic_reward_partial_score/mean": 0.9383137822151184,
"rewards/symbolic_reward_partial_score/std": 0.13532200455665588,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9739952683448792,
"rewards/thinking_answer_ratio_reward/std": 0.00430486723780632,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3827202320098877,
"sampling/importance_sampling_ratio/min": 3.3355458981532138e-06,
"sampling/sampling_logp_difference/max": 12.61087417602539,
"sampling/sampling_logp_difference/mean": 0.5356423854827881,
"step": 136
},
{
"clip_ratio/high_max": 0.265625,
"clip_ratio/high_mean": 0.14111328125,
"clip_ratio/low_mean": 0.2041015625,
"clip_ratio/low_min": 0.08203125,
"clip_ratio/region_mean": 0.34521484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 697.0,
"completions/max_terminated_length": 697.0,
"completions/mean_length": 411.45556640625,
"completions/mean_terminated_length": 411.45556640625,
"completions/min_length": 243.0,
"completions/min_terminated_length": 243.0,
"entropy": 1.632333055138588,
"epoch": 1.5217391304347827,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04682422298321068,
"learning_rate": 1e-05,
"loss": -0.0012,
"num_tokens": 98772754.0,
"reward": 3.4415788650512695,
"reward_std": 0.06567732989788055,
"rewards/ngram_repetition2/mean": 0.9847305417060852,
"rewards/ngram_repetition2/std": 0.007420970126986504,
"rewards/ngram_repetition3/mean": 0.9989989399909973,
"rewards/ngram_repetition3/std": 0.0022030072286725044,
"rewards/symbolic_reward_accuracy/mean": 0.74072265625,
"rewards/symbolic_reward_accuracy/std": 0.4383451044559479,
"rewards/symbolic_reward_partial_score/mean": 0.9307861328125,
"rewards/symbolic_reward_partial_score/std": 0.1378849595785141,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9754141569137573,
"rewards/thinking_answer_ratio_reward/std": 0.022000106051564217,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.3969695568084717,
"sampling/importance_sampling_ratio/min": 2.535395157710063e-11,
"sampling/sampling_logp_difference/max": 24.398086547851562,
"sampling/sampling_logp_difference/mean": 0.5631133317947388,
"step": 140
},
{
"clip_ratio/high_max": 0.24609375,
"clip_ratio/high_mean": 0.15283203125,
"clip_ratio/low_mean": 0.203125,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.35595703125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 890.0,
"completions/max_terminated_length": 890.0,
"completions/mean_length": 425.41552734375,
"completions/mean_terminated_length": 425.41552734375,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 1.726756490767002,
"epoch": 1.5652173913043477,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.038093541651771236,
"learning_rate": 1e-05,
"loss": -0.001,
"num_tokens": 101674917.0,
"reward": 3.5725719928741455,
"reward_std": 0.017834221944212914,
"rewards/ngram_repetition2/mean": 0.9846716523170471,
"rewards/ngram_repetition2/std": 0.0074483552016317844,
"rewards/ngram_repetition3/mean": 0.9990028738975525,
"rewards/ngram_repetition3/std": 0.00226139766164124,
"rewards/symbolic_reward_accuracy/mean": 0.80322265625,
"rewards/symbolic_reward_accuracy/std": 0.39765968918800354,
"rewards/symbolic_reward_partial_score/mean": 0.9365234375,
"rewards/symbolic_reward_partial_score/std": 0.14711622893810272,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9766725897789001,
"rewards/thinking_answer_ratio_reward/std": 0.004060372244566679,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4077180624008179,
"sampling/importance_sampling_ratio/min": 5.527023176910006e-07,
"sampling/sampling_logp_difference/max": 14.408446311950684,
"sampling/sampling_logp_difference/mean": 0.5831520557403564,
"step": 144
},
{
"clip_ratio/high_max": 0.2734375,
"clip_ratio/high_mean": 0.1513671875,
"clip_ratio/low_mean": 0.22314453125,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.37451171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 764.0,
"completions/max_terminated_length": 764.0,
"completions/mean_length": 434.2431640625,
"completions/mean_terminated_length": 434.2431640625,
"completions/min_length": 236.0,
"completions/min_terminated_length": 236.0,
"entropy": 1.815441645681858,
"epoch": 1.608695652173913,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06173162254334352,
"learning_rate": 1e-05,
"loss": -0.0011,
"num_tokens": 104649015.0,
"reward": 3.4869613647460938,
"reward_std": 0.0348396934568882,
"rewards/ngram_repetition2/mean": 0.9839403629302979,
"rewards/ngram_repetition2/std": 0.00729979295283556,
"rewards/ngram_repetition3/mean": 0.9989305138587952,
"rewards/ngram_repetition3/std": 0.002313731238245964,
"rewards/symbolic_reward_accuracy/mean": 0.75830078125,
"rewards/symbolic_reward_accuracy/std": 0.4282175302505493,
"rewards/symbolic_reward_partial_score/mean": 0.9407551884651184,
"rewards/symbolic_reward_partial_score/std": 0.12213268131017685,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9775879383087158,
"rewards/thinking_answer_ratio_reward/std": 0.003819839097559452,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4123494625091553,
"sampling/importance_sampling_ratio/min": 8.588379569118842e-05,
"sampling/sampling_logp_difference/max": 9.362515449523926,
"sampling/sampling_logp_difference/mean": 0.5984525084495544,
"step": 148
},
{
"clip_ratio/high_max": 0.2265625,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.2197265625,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.3525390625,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 932.0,
"completions/mean_length": 463.5791015625,
"completions/mean_terminated_length": 462.3048400878906,
"completions/min_length": 241.0,
"completions/min_terminated_length": 241.0,
"entropy": 1.9924634099006653,
"epoch": 1.6521739130434783,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10959925778321504,
"learning_rate": 1e-05,
"loss": 0.0013,
"num_tokens": 107645177.0,
"reward": 3.647918462753296,
"reward_std": 0.0664098709821701,
"rewards/ngram_repetition2/mean": 0.9830807447433472,
"rewards/ngram_repetition2/std": 0.007356169633567333,
"rewards/ngram_repetition3/mean": 0.9989252090454102,
"rewards/ngram_repetition3/std": 0.0019529308192431927,
"rewards/symbolic_reward_accuracy/mean": 0.82958984375,
"rewards/symbolic_reward_accuracy/std": 0.376084566116333,
"rewards/symbolic_reward_partial_score/mean": 0.9603678584098816,
"rewards/symbolic_reward_partial_score/std": 0.09740671515464783,
"rewards/tag_count_reward/mean": 0.998779296875,
"rewards/tag_count_reward/std": 0.02468114346265793,
"rewards/thinking_answer_ratio_reward/mean": 0.9771405458450317,
"rewards/thinking_answer_ratio_reward/std": 0.043395884335041046,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4253789186477661,
"sampling/importance_sampling_ratio/min": 0.0006438745185732841,
"sampling/sampling_logp_difference/max": 7.348006725311279,
"sampling/sampling_logp_difference/mean": 0.6298946738243103,
"step": 152
},
{
"clip_ratio/high_max": 0.140625,
"clip_ratio/high_mean": 0.08349609375,
"clip_ratio/low_mean": 0.263671875,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.34716796875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 824.0,
"completions/max_terminated_length": 824.0,
"completions/mean_length": 462.9541015625,
"completions/mean_terminated_length": 462.9541015625,
"completions/min_length": 234.0,
"completions/min_terminated_length": 234.0,
"entropy": 2.0628679618239403,
"epoch": 1.6956521739130435,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24161130829987623,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 110662235.0,
"reward": 3.3783979415893555,
"reward_std": 0.036025457084178925,
"rewards/ngram_repetition2/mean": 0.9834574460983276,
"rewards/ngram_repetition2/std": 0.0074860225431621075,
"rewards/ngram_repetition3/mean": 0.9989546537399292,
"rewards/ngram_repetition3/std": 0.002006194554269314,
"rewards/symbolic_reward_accuracy/mean": 0.70654296875,
"rewards/symbolic_reward_accuracy/std": 0.45545724034309387,
"rewards/symbolic_reward_partial_score/mean": 0.9361978769302368,
"rewards/symbolic_reward_partial_score/std": 0.10914558917284012,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.9778301119804382,
"rewards/thinking_answer_ratio_reward/std": 0.030854862183332443,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4279154539108276,
"sampling/importance_sampling_ratio/min": 2.4768311050138436e-05,
"sampling/sampling_logp_difference/max": 10.605945587158203,
"sampling/sampling_logp_difference/mean": 0.6437482237815857,
"step": 156
},
{
"clip_ratio/high_max": 0.15234375,
"clip_ratio/high_mean": 0.0888671875,
"clip_ratio/low_mean": 0.23388671875,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.32275390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 993.0,
"completions/max_terminated_length": 993.0,
"completions/mean_length": 498.11767578125,
"completions/mean_terminated_length": 498.11767578125,
"completions/min_length": 291.0,
"completions/min_terminated_length": 291.0,
"entropy": 2.2991671413183212,
"epoch": 1.7391304347826086,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3548338605641257,
"learning_rate": 1e-05,
"loss": 0.0016,
"num_tokens": 113729132.0,
"reward": 3.5990939140319824,
"reward_std": 0.0962018221616745,
"rewards/ngram_repetition2/mean": 0.9809653162956238,
"rewards/ngram_repetition2/std": 0.007712055929005146,
"rewards/ngram_repetition3/mean": 0.9986952543258667,
"rewards/ngram_repetition3/std": 0.001996663399040699,
"rewards/symbolic_reward_accuracy/mean": 0.8076171875,
"rewards/symbolic_reward_accuracy/std": 0.3942683935165405,
"rewards/symbolic_reward_partial_score/mean": 0.9549967050552368,
"rewards/symbolic_reward_partial_score/std": 0.10741008818149567,
"rewards/tag_count_reward/mean": 0.999267578125,
"rewards/tag_count_reward/std": 0.019127286970615387,
"rewards/thinking_answer_ratio_reward/mean": 0.979852557182312,
"rewards/thinking_answer_ratio_reward/std": 0.02200859785079956,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4530794620513916,
"sampling/importance_sampling_ratio/min": 3.576672469307596e-08,
"sampling/sampling_logp_difference/max": 17.14624786376953,
"sampling/sampling_logp_difference/mean": 0.6910567283630371,
"step": 160
},
{
"clip_ratio/high_max": 0.16796875,
"clip_ratio/high_mean": 0.10107421875,
"clip_ratio/low_mean": 0.24560546875,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.3466796875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1344.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 543.86083984375,
"completions/mean_terminated_length": 543.86083984375,
"completions/min_length": 310.0,
"completions/min_terminated_length": 310.0,
"entropy": 2.6379848271608353,
"epoch": 1.7826086956521738,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08513499323190751,
"learning_rate": 1e-05,
"loss": 0.0133,
"num_tokens": 116905551.0,
"reward": 3.548675775527954,
"reward_std": 0.09981994330883026,
"rewards/ngram_repetition2/mean": 0.9711927771568298,
"rewards/ngram_repetition2/std": 0.011247237212955952,
"rewards/ngram_repetition3/mean": 0.9973983764648438,
"rewards/ngram_repetition3/std": 0.002803381998091936,
"rewards/symbolic_reward_accuracy/mean": 0.7880859375,
"rewards/symbolic_reward_accuracy/std": 0.4087640941143036,
"rewards/symbolic_reward_partial_score/mean": 0.9429931640625,
"rewards/symbolic_reward_partial_score/std": 0.12854379415512085,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9824697971343994,
"rewards/thinking_answer_ratio_reward/std": 0.004015693906694651,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4711500406265259,
"sampling/importance_sampling_ratio/min": 1.322712250839686e-07,
"sampling/sampling_logp_difference/max": 15.838411331176758,
"sampling/sampling_logp_difference/mean": 0.7328225374221802,
"step": 164
},
{
"clip_ratio/high_max": 0.24609375,
"clip_ratio/high_mean": 0.1396484375,
"clip_ratio/low_mean": 0.23193359375,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.37158203125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1438.0,
"completions/max_terminated_length": 1438.0,
"completions/mean_length": 503.81298828125,
"completions/mean_terminated_length": 503.81298828125,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"entropy": 2.788048267364502,
"epoch": 1.8260869565217392,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09455791725368402,
"learning_rate": 1e-05,
"loss": 0.0074,
"num_tokens": 120015792.0,
"reward": 3.349081039428711,
"reward_std": 0.10028517991304398,
"rewards/ngram_repetition2/mean": 0.9654586315155029,
"rewards/ngram_repetition2/std": 0.012577379122376442,
"rewards/ngram_repetition3/mean": 0.9965716004371643,
"rewards/ngram_repetition3/std": 0.0033189503010362387,
"rewards/symbolic_reward_accuracy/mean": 0.69970703125,
"rewards/symbolic_reward_accuracy/std": 0.4584972560405731,
"rewards/symbolic_reward_partial_score/mean": 0.9207356572151184,
"rewards/symbolic_reward_partial_score/std": 0.14194521307945251,
"rewards/tag_count_reward/mean": 0.99951171875,
"rewards/tag_count_reward/std": 0.015621182508766651,
"rewards/thinking_answer_ratio_reward/mean": 0.979932427406311,
"rewards/thinking_answer_ratio_reward/std": 0.031026024371385574,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.473034381866455,
"sampling/importance_sampling_ratio/min": 2.001723487410345e-06,
"sampling/sampling_logp_difference/max": 13.121501922607422,
"sampling/sampling_logp_difference/mean": 0.7396732568740845,
"step": 168
},
{
"clip_ratio/high_max": 0.19140625,
"clip_ratio/high_mean": 0.10546875,
"clip_ratio/low_mean": 0.25341796875,
"clip_ratio/low_min": 0.13671875,
"clip_ratio/region_mean": 0.35888671875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 898.0,
"completions/max_terminated_length": 898.0,
"completions/mean_length": 467.646484375,
"completions/mean_terminated_length": 467.646484375,
"completions/min_length": 187.0,
"completions/min_terminated_length": 187.0,
"entropy": 2.768448531627655,
"epoch": 1.8695652173913042,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06805611273845928,
"learning_rate": 1e-05,
"loss": 0.0022,
"num_tokens": 123004444.0,
"reward": 3.5058183670043945,
"reward_std": 0.07877355813980103,
"rewards/ngram_repetition2/mean": 0.9659230709075928,
"rewards/ngram_repetition2/std": 0.011437240056693554,
"rewards/ngram_repetition3/mean": 0.996779203414917,
"rewards/ngram_repetition3/std": 0.003355607157573104,
"rewards/symbolic_reward_accuracy/mean": 0.77001953125,
"rewards/symbolic_reward_accuracy/std": 0.42092275619506836,
"rewards/symbolic_reward_partial_score/mean": 0.9363606572151184,
"rewards/symbolic_reward_partial_score/std": 0.13817265629768372,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9791755676269531,
"rewards/thinking_answer_ratio_reward/std": 0.005196898244321346,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4763879776000977,
"sampling/importance_sampling_ratio/min": 4.474750312510878e-05,
"sampling/sampling_logp_difference/max": 10.014474868774414,
"sampling/sampling_logp_difference/mean": 0.7400292158126831,
"step": 172
},
{
"clip_ratio/high_max": 0.30859375,
"clip_ratio/high_mean": 0.18505859375,
"clip_ratio/low_mean": 0.17431640625,
"clip_ratio/low_min": 0.078125,
"clip_ratio/region_mean": 0.359375,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1376.0,
"completions/mean_length": 430.4609375,
"completions/mean_terminated_length": 429.1705017089844,
"completions/min_length": 189.0,
"completions/min_terminated_length": 189.0,
"entropy": 2.8502298444509506,
"epoch": 1.9130434782608696,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06663024928165782,
"learning_rate": 1e-05,
"loss": 0.003,
"num_tokens": 125954956.0,
"reward": 3.483172655105591,
"reward_std": 0.1899569034576416,
"rewards/ngram_repetition2/mean": 0.9636595845222473,
"rewards/ngram_repetition2/std": 0.016547439619898796,
"rewards/ngram_repetition3/mean": 0.9965729117393494,
"rewards/ngram_repetition3/std": 0.011003647930920124,
"rewards/symbolic_reward_accuracy/mean": 0.76025390625,
"rewards/symbolic_reward_accuracy/std": 0.4270327091217041,
"rewards/symbolic_reward_partial_score/mean": 0.9335530400276184,
"rewards/symbolic_reward_partial_score/std": 0.14367683231830597,
"rewards/tag_count_reward/mean": 0.999755859375,
"rewards/tag_count_reward/std": 0.011048543266952038,
"rewards/thinking_answer_ratio_reward/mean": 0.9753477573394775,
"rewards/thinking_answer_ratio_reward/std": 0.03124345652759075,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4821131229400635,
"sampling/importance_sampling_ratio/min": 2.7937696359003894e-05,
"sampling/sampling_logp_difference/max": 10.485533714294434,
"sampling/sampling_logp_difference/mean": 0.7600972652435303,
"step": 176
},
{
"clip_ratio/high_max": 0.2421875,
"clip_ratio/high_mean": 0.14013671875,
"clip_ratio/low_mean": 0.23681640625,
"clip_ratio/low_min": 0.12109375,
"clip_ratio/region_mean": 0.376953125,
"completions/clipped_ratio": 0.00146484375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 696.0,
"completions/mean_length": 415.51611328125,
"completions/mean_terminated_length": 411.61907958984375,
"completions/min_length": 171.0,
"completions/min_terminated_length": 171.0,
"entropy": 2.858258455991745,
"epoch": 1.9565217391304348,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07966906477050349,
"learning_rate": 1e-05,
"loss": 0.0034,
"num_tokens": 128874861.0,
"reward": 3.4829840660095215,
"reward_std": 0.08554819971323013,
"rewards/ngram_repetition2/mean": 0.9658774733543396,
"rewards/ngram_repetition2/std": 0.011492163874208927,
"rewards/ngram_repetition3/mean": 0.9971028566360474,
"rewards/ngram_repetition3/std": 0.0032859230414032936,
"rewards/symbolic_reward_accuracy/mean": 0.7587890625,
"rewards/symbolic_reward_accuracy/std": 0.42792245745658875,
"rewards/symbolic_reward_partial_score/mean": 0.93701171875,
"rewards/symbolic_reward_partial_score/std": 0.13184432685375214,
"rewards/tag_count_reward/mean": 0.9990234375,
"rewards/tag_count_reward/std": 0.022080888971686363,
"rewards/thinking_answer_ratio_reward/mean": 0.9741167426109314,
"rewards/thinking_answer_ratio_reward/std": 0.04080890119075775,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4818463325500488,
"sampling/importance_sampling_ratio/min": 1.6995619489534874e-06,
"sampling/sampling_logp_difference/max": 13.285140037536621,
"sampling/sampling_logp_difference/mean": 0.747634768486023,
"step": 180
},
{
"clip_ratio/high_max": 0.1640625,
"clip_ratio/high_mean": 0.09521484375,
"clip_ratio/low_mean": 0.2451171875,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.34033203125,
"completions/clipped_ratio": 0.00048828125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2981.0,
"completions/mean_length": 415.7646484375,
"completions/mean_terminated_length": 414.4670104980469,
"completions/min_length": 197.0,
"completions/min_terminated_length": 197.0,
"entropy": 3.0020454972982407,
"epoch": 2.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20941599091305263,
"learning_rate": 1e-05,
"loss": 0.0019,
"num_tokens": 131785771.0,
"reward": 3.4347691535949707,
"reward_std": 0.0830899327993393,
"rewards/ngram_repetition2/mean": 0.9693027138710022,
"rewards/ngram_repetition2/std": 0.02014472335577011,
"rewards/ngram_repetition3/mean": 0.9970437288284302,
"rewards/ngram_repetition3/std": 0.01778826303780079,
"rewards/symbolic_reward_accuracy/mean": 0.73974609375,
"rewards/symbolic_reward_accuracy/std": 0.43888023495674133,
"rewards/symbolic_reward_partial_score/mean": 0.9258626103401184,
"rewards/symbolic_reward_partial_score/std": 0.15298709273338318,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9750823378562927,
"rewards/thinking_answer_ratio_reward/std": 0.022159311920404434,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.4881045818328857,
"sampling/importance_sampling_ratio/min": 6.7104201662004925e-06,
"sampling/sampling_logp_difference/max": 11.911849021911621,
"sampling/sampling_logp_difference/mean": 0.7676164507865906,
"step": 184
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.001644736842105263,
"eval_completions/max_length": 895.8947368421053,
"eval_completions/max_terminated_length": 641.0526315789474,
"eval_completions/mean_length": 447.49136513157896,
"eval_completions/mean_terminated_length": 443.1748866031044,
"eval_completions/min_length": 229.31578947368422,
"eval_completions/min_terminated_length": 229.31578947368422,
"eval_entropy": 2.969714365507427,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.0008429406443610787,
"eval_num_tokens": 131785771.0,
"eval_reward": 3.461897749649851,
"eval_reward_std": 0.11290462953145738,
"eval_rewards/ngram_repetition2/mean": 0.9665061580507379,
"eval_rewards/ngram_repetition2/std": 0.010592278240150526,
"eval_rewards/ngram_repetition3/mean": 0.9974528551101685,
"eval_rewards/ngram_repetition3/std": 0.002903796377052602,
"eval_rewards/symbolic_reward_accuracy/mean": 0.7504111842105263,
"eval_rewards/symbolic_reward_accuracy/std": 0.3684137702772492,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9324972535434523,
"eval_rewards/symbolic_reward_partial_score/std": 0.11325683170243313,
"eval_rewards/tag_count_reward/mean": 0.9991776315789473,
"eval_rewards/tag_count_reward/std": 0.006552994643387042,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9760947635299281,
"eval_rewards/thinking_answer_ratio_reward/std": 0.018366032500604267,
"eval_runtime": 197.726,
"eval_samples_per_second": 0.759,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.4970186572325856,
"eval_sampling/importance_sampling_ratio/min": 0.002377865083537089,
"eval_sampling/sampling_logp_difference/max": 6.292696877529747,
"eval_sampling/sampling_logp_difference/mean": 0.7791855492089924,
"eval_steps_per_second": 0.01,
"step": 184
},
{
"epoch": 2.0,
"step": 184,
"total_flos": 0.0,
"train_loss": 0.005923212121358475,
"train_runtime": 6121.2967,
"train_samples_per_second": 0.98,
"train_steps_per_second": 0.03
}
],
"logging_steps": 4,
"max_steps": 184,
"num_input_tokens_seen": 131785771,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}