leonMW's picture
Upload folder using huggingface_hub
3aa56a3 verified
{
"best_global_step": 64,
"best_metric": 0.00070223119109869,
"best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-1/checkpoint-64",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 64,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.11669921875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2985.0,
"completions/mean_length": 1047.6865234375,
"completions/mean_terminated_length": 780.2398681640625,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.3189637362957001,
"epoch": 0.03125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10914598126181888,
"learning_rate": 1e-05,
"loss": 0.1667,
"num_tokens": 3492862.0,
"reward": 1.400852918624878,
"reward_std": 0.34828251600265503,
"rewards/ngram_repetition2/mean": 0.5693519115447998,
"rewards/ngram_repetition2/std": 0.17405439913272858,
"rewards/ngram_repetition3/mean": 0.7214103937149048,
"rewards/ngram_repetition3/std": 0.1819220334291458,
"rewards/symbolic_reward_accuracy/mean": 0.01123046875,
"rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
"rewards/symbolic_reward_partial_score/mean": 0.41455078125,
"rewards/symbolic_reward_partial_score/std": 0.20262853801250458,
"rewards/tag_count_reward/mean": 0.94287109375,
"rewards/tag_count_reward/std": 0.15910091996192932,
"rewards/thinking_answer_ratio_reward/mean": 0.8062490820884705,
"rewards/thinking_answer_ratio_reward/std": 0.3022027909755707,
"step": 1
},
{
"clip_ratio/high_max": 0.3697916666666667,
"clip_ratio/high_mean": 0.23893229166666666,
"clip_ratio/low_mean": 0.224609375,
"clip_ratio/low_min": 0.0625,
"clip_ratio/region_mean": 0.4635416666666667,
"entropy": 0.3202968165278435,
"epoch": 0.125,
"grad_norm": 0.04593035359379921,
"learning_rate": 1e-05,
"loss": 0.1373,
"step": 4
},
{
"clip_ratio/high_max": 0.33984375,
"clip_ratio/high_mean": 0.21923828125,
"clip_ratio/low_mean": 0.16162109375,
"clip_ratio/low_min": 0.05859375,
"clip_ratio/region_mean": 0.380859375,
"completions/clipped_ratio": 0.05712890625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3030.0,
"completions/mean_length": 714.38330078125,
"completions/mean_terminated_length": 571.534423828125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"entropy": 0.3361353427171707,
"epoch": 0.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.046721866146913066,
"learning_rate": 1e-05,
"loss": 0.0956,
"num_tokens": 6308111.0,
"reward": 1.5210591554641724,
"reward_std": 0.38517600297927856,
"rewards/ngram_repetition2/mean": 0.6417093276977539,
"rewards/ngram_repetition2/std": 0.16069720685482025,
"rewards/ngram_repetition3/mean": 0.7863442301750183,
"rewards/ngram_repetition3/std": 0.1631278693675995,
"rewards/symbolic_reward_accuracy/mean": 0.029296875,
"rewards/symbolic_reward_accuracy/std": 0.16867858171463013,
"rewards/symbolic_reward_partial_score/mean": 0.46630859375,
"rewards/symbolic_reward_partial_score/std": 0.17428098618984222,
"rewards/tag_count_reward/mean": 0.973388671875,
"rewards/tag_count_reward/std": 0.11226600408554077,
"rewards/thinking_answer_ratio_reward/mean": 0.8487584590911865,
"rewards/thinking_answer_ratio_reward/std": 0.22293001413345337,
"step": 8
},
{
"clip_ratio/high_max": 0.1953125,
"clip_ratio/high_mean": 0.09716796875,
"clip_ratio/low_mean": 0.4169921875,
"clip_ratio/low_min": 0.29296875,
"clip_ratio/region_mean": 0.51416015625,
"completions/clipped_ratio": 0.0419921875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2810.0,
"completions/mean_length": 612.14404296875,
"completions/mean_terminated_length": 504.32159423828125,
"completions/min_length": 161.0,
"completions/min_terminated_length": 161.0,
"entropy": 0.34443395026028156,
"epoch": 0.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.030918519373438542,
"learning_rate": 1e-05,
"loss": 0.0396,
"num_tokens": 8915638.0,
"reward": 1.937060832977295,
"reward_std": 0.9647762775421143,
"rewards/ngram_repetition2/mean": 0.6723504662513733,
"rewards/ngram_repetition2/std": 0.14847470819950104,
"rewards/ngram_repetition3/mean": 0.8140700459480286,
"rewards/ngram_repetition3/std": 0.15126831829547882,
"rewards/symbolic_reward_accuracy/mean": 0.18310546875,
"rewards/symbolic_reward_accuracy/std": 0.38684743642807007,
"rewards/symbolic_reward_partial_score/mean": 0.5675048828125,
"rewards/symbolic_reward_partial_score/std": 0.23073548078536987,
"rewards/tag_count_reward/mean": 0.979736328125,
"rewards/tag_count_reward/std": 0.09862032532691956,
"rewards/thinking_answer_ratio_reward/mean": 0.8744399547576904,
"rewards/thinking_answer_ratio_reward/std": 0.19187742471694946,
"step": 12
},
{
"clip_ratio/high_max": 0.390625,
"clip_ratio/high_mean": 0.23583984375,
"clip_ratio/low_mean": 0.12841796875,
"clip_ratio/low_min": 0.046875,
"clip_ratio/region_mean": 0.3642578125,
"completions/clipped_ratio": 0.0224609375,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2416.0,
"completions/mean_length": 584.31982421875,
"completions/mean_terminated_length": 527.1603393554688,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"entropy": 0.36808328330516815,
"epoch": 0.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.032998073100716244,
"learning_rate": 1e-05,
"loss": 0.0433,
"num_tokens": 11472837.0,
"reward": 3.4143524169921875,
"reward_std": 1.0181639194488525,
"rewards/ngram_repetition2/mean": 0.682662844657898,
"rewards/ngram_repetition2/std": 0.12800198793411255,
"rewards/ngram_repetition3/mean": 0.8277437686920166,
"rewards/ngram_repetition3/std": 0.1255839467048645,
"rewards/symbolic_reward_accuracy/mean": 0.76953125,
"rewards/symbolic_reward_accuracy/std": 0.42123574018478394,
"rewards/symbolic_reward_partial_score/mean": 0.8619384765625,
"rewards/symbolic_reward_partial_score/std": 0.26891157031059265,
"rewards/tag_count_reward/mean": 0.9892578125,
"rewards/tag_count_reward/std": 0.07251390814781189,
"rewards/thinking_answer_ratio_reward/mean": 0.8989672660827637,
"rewards/thinking_answer_ratio_reward/std": 0.14425675570964813,
"step": 16
},
{
"clip_ratio/high_max": 0.3359375,
"clip_ratio/high_mean": 0.2333984375,
"clip_ratio/low_mean": 0.10986328125,
"clip_ratio/low_min": 0.03515625,
"clip_ratio/region_mean": 0.34326171875,
"completions/clipped_ratio": 0.02197265625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 2756.0,
"completions/mean_length": 520.4208984375,
"completions/mean_terminated_length": 463.0963439941406,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"entropy": 0.3659926615655422,
"epoch": 0.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0355213854083778,
"learning_rate": 1e-05,
"loss": 0.0495,
"num_tokens": 13864227.0,
"reward": 3.7088823318481445,
"reward_std": 0.667622447013855,
"rewards/ngram_repetition2/mean": 0.7104234099388123,
"rewards/ngram_repetition2/std": 0.12404344230890274,
"rewards/ngram_repetition3/mean": 0.84934002161026,
"rewards/ngram_repetition3/std": 0.12357836216688156,
"rewards/symbolic_reward_accuracy/mean": 0.88818359375,
"rewards/symbolic_reward_accuracy/std": 0.31521740555763245,
"rewards/symbolic_reward_partial_score/mean": 0.9185791015625,
"rewards/symbolic_reward_partial_score/std": 0.2433638721704483,
"rewards/tag_count_reward/mean": 0.989501953125,
"rewards/tag_count_reward/std": 0.07170303165912628,
"rewards/thinking_answer_ratio_reward/mean": 0.8836514949798584,
"rewards/thinking_answer_ratio_reward/std": 0.14230027794837952,
"step": 20
},
{
"clip_ratio/high_max": 0.328125,
"clip_ratio/high_mean": 0.1982421875,
"clip_ratio/low_mean": 0.16943359375,
"clip_ratio/low_min": 0.06640625,
"clip_ratio/region_mean": 0.36767578125,
"completions/clipped_ratio": 0.013671875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 3019.0,
"completions/mean_length": 460.240234375,
"completions/mean_terminated_length": 424.0376281738281,
"completions/min_length": 128.0,
"completions/min_terminated_length": 128.0,
"entropy": 0.36848987452685833,
"epoch": 0.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027143384638001866,
"learning_rate": 1e-05,
"loss": 0.049,
"num_tokens": 16149007.0,
"reward": 3.875516414642334,
"reward_std": 0.3786875903606415,
"rewards/ngram_repetition2/mean": 0.7316169738769531,
"rewards/ngram_repetition2/std": 0.10848518460988998,
"rewards/ngram_repetition3/mean": 0.8663702011108398,
"rewards/ngram_repetition3/std": 0.10447894036769867,
"rewards/symbolic_reward_accuracy/mean": 0.94775390625,
"rewards/symbolic_reward_accuracy/std": 0.22257724404335022,
"rewards/symbolic_reward_partial_score/mean": 0.961181640625,
"rewards/symbolic_reward_partial_score/std": 0.17496450245380402,
"rewards/tag_count_reward/mean": 0.993896484375,
"rewards/tag_count_reward/std": 0.054917916655540466,
"rewards/thinking_answer_ratio_reward/mean": 0.8950520157814026,
"rewards/thinking_answer_ratio_reward/std": 0.11731007695198059,
"step": 24
},
{
"clip_ratio/high_max": 0.30859375,
"clip_ratio/high_mean": 0.17626953125,
"clip_ratio/low_mean": 0.22119140625,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.3974609375,
"completions/clipped_ratio": 0.00634765625,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1444.0,
"completions/mean_length": 378.6123046875,
"completions/mean_terminated_length": 361.4063720703125,
"completions/min_length": 151.0,
"completions/min_terminated_length": 151.0,
"entropy": 0.3799332305788994,
"epoch": 0.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.024146589341786776,
"learning_rate": 1e-05,
"loss": 0.031,
"num_tokens": 18269941.0,
"reward": 3.9713735580444336,
"reward_std": 0.15463097393512726,
"rewards/ngram_repetition2/mean": 0.7776297330856323,
"rewards/ngram_repetition2/std": 0.08481114357709885,
"rewards/ngram_repetition3/mean": 0.9015278816223145,
"rewards/ngram_repetition3/std": 0.07951432466506958,
"rewards/symbolic_reward_accuracy/mean": 0.9814453125,
"rewards/symbolic_reward_accuracy/std": 0.13497892022132874,
"rewards/symbolic_reward_partial_score/mean": 0.985107421875,
"rewards/symbolic_reward_partial_score/std": 0.11334022879600525,
"rewards/tag_count_reward/mean": 0.99755859375,
"rewards/tag_count_reward/std": 0.034861668944358826,
"rewards/thinking_answer_ratio_reward/mean": 0.9025194048881531,
"rewards/thinking_answer_ratio_reward/std": 0.08864665776491165,
"step": 28
},
{
"clip_ratio/high_max": 0.3359375,
"clip_ratio/high_mean": 0.22119140625,
"clip_ratio/low_mean": 0.189453125,
"clip_ratio/low_min": 0.09765625,
"clip_ratio/region_mean": 0.41064453125,
"completions/clipped_ratio": 0.00341796875,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 1606.0,
"completions/mean_length": 326.34814453125,
"completions/mean_terminated_length": 316.931396484375,
"completions/min_length": 99.0,
"completions/min_terminated_length": 99.0,
"entropy": 0.3929165042936802,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02371361608194468,
"learning_rate": 1e-05,
"loss": 0.0211,
"num_tokens": 20282174.0,
"reward": 3.976555109024048,
"reward_std": 0.16526807844638824,
"rewards/ngram_repetition2/mean": 0.8133348226547241,
"rewards/ngram_repetition2/std": 0.07101932913064957,
"rewards/ngram_repetition3/mean": 0.9278632402420044,
"rewards/ngram_repetition3/std": 0.06339241564273834,
"rewards/symbolic_reward_accuracy/mean": 0.982421875,
"rewards/symbolic_reward_accuracy/std": 0.13144417107105255,
"rewards/symbolic_reward_partial_score/mean": 0.9866943359375,
"rewards/symbolic_reward_partial_score/std": 0.10472454875707626,
"rewards/tag_count_reward/mean": 0.998291015625,
"rewards/tag_count_reward/std": 0.02918882668018341,
"rewards/thinking_answer_ratio_reward/mean": 0.9313849210739136,
"rewards/thinking_answer_ratio_reward/std": 0.06515223532915115,
"step": 32
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.006578947368421052,
"eval_completions/max_length": 1925.842105263158,
"eval_completions/max_terminated_length": 641.1578947368421,
"eval_completions/mean_length": 305.5797697368421,
"eval_completions/mean_terminated_length": 287.26832982113484,
"eval_completions/min_length": 145.52631578947367,
"eval_completions/min_terminated_length": 145.52631578947367,
"eval_entropy": 0.3921029740258267,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.017673568800091743,
"eval_num_tokens": 20282174.0,
"eval_reward": 3.8416001420272026,
"eval_reward_std": 0.4433499896212628,
"eval_rewards/ngram_repetition2/mean": 0.8298829511592263,
"eval_rewards/ngram_repetition2/std": 0.0766919782679332,
"eval_rewards/ngram_repetition3/mean": 0.9369708268265975,
"eval_rewards/ngram_repetition3/std": 0.0686160976949491,
"eval_rewards/symbolic_reward_accuracy/mean": 0.930921052631579,
"eval_rewards/symbolic_reward_accuracy/std": 0.24749822326396642,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9561060855263158,
"eval_rewards/symbolic_reward_partial_score/std": 0.1654511419566054,
"eval_rewards/tag_count_reward/mean": 0.9967105263157895,
"eval_rewards/tag_count_reward/std": 0.028502884664033588,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9272897714062741,
"eval_rewards/thinking_answer_ratio_reward/std": 0.06353752030745934,
"eval_runtime": 291.3147,
"eval_samples_per_second": 0.515,
"eval_steps_per_second": 0.007,
"step": 32
},
{
"clip_ratio/high_max": 0.3125,
"clip_ratio/high_mean": 0.17919921875,
"clip_ratio/low_mean": 0.21875,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.39794921875,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 3072.0,
"completions/max_terminated_length": 853.0,
"completions/mean_length": 287.046875,
"completions/mean_terminated_length": 281.59686279296875,
"completions/min_length": 124.0,
"completions/min_terminated_length": 124.0,
"entropy": 0.39634467102587223,
"epoch": 1.125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021677013253874763,
"learning_rate": 1e-05,
"loss": 0.0116,
"num_tokens": 22213918.0,
"reward": 3.9796054363250732,
"reward_std": 0.13036790490150452,
"rewards/ngram_repetition2/mean": 0.8469281792640686,
"rewards/ngram_repetition2/std": 0.05668526515364647,
"rewards/ngram_repetition3/mean": 0.9489122033119202,
"rewards/ngram_repetition3/std": 0.048189926892519,
"rewards/symbolic_reward_accuracy/mean": 0.98291015625,
"rewards/symbolic_reward_accuracy/std": 0.1296379119157791,
"rewards/symbolic_reward_partial_score/mean": 0.9874267578125,
"rewards/symbolic_reward_partial_score/std": 0.10065428167581558,
"rewards/tag_count_reward/mean": 0.9990234375,
"rewards/tag_count_reward/std": 0.022080888971686363,
"rewards/thinking_answer_ratio_reward/mean": 0.9376400709152222,
"rewards/thinking_answer_ratio_reward/std": 0.04486775025725365,
"step": 36
},
{
"clip_ratio/high_max": 0.3125,
"clip_ratio/high_mean": 0.19140625,
"clip_ratio/low_mean": 0.2021484375,
"clip_ratio/low_min": 0.08984375,
"clip_ratio/region_mean": 0.3935546875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1465.0,
"completions/max_terminated_length": 1465.0,
"completions/mean_length": 254.92529296875,
"completions/mean_terminated_length": 254.92529296875,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"entropy": 0.4012875221669674,
"epoch": 1.25,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.018544320597192198,
"learning_rate": 1e-05,
"loss": 0.0049,
"num_tokens": 24081541.0,
"reward": 4.009942054748535,
"reward_std": 0.060945987701416016,
"rewards/ngram_repetition2/mean": 0.8762841820716858,
"rewards/ngram_repetition2/std": 0.037716496735811234,
"rewards/ngram_repetition3/mean": 0.9659276008605957,
"rewards/ngram_repetition3/std": 0.025332553312182426,
"rewards/symbolic_reward_accuracy/mean": 0.9931640625,
"rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
"rewards/symbolic_reward_partial_score/mean": 0.995849609375,
"rewards/symbolic_reward_partial_score/std": 0.052837058901786804,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.934229850769043,
"rewards/thinking_answer_ratio_reward/std": 0.017355602234601974,
"step": 40
},
{
"clip_ratio/high_max": 0.26953125,
"clip_ratio/high_mean": 0.166015625,
"clip_ratio/low_mean": 0.234375,
"clip_ratio/low_min": 0.1171875,
"clip_ratio/region_mean": 0.400390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 502.0,
"completions/max_terminated_length": 502.0,
"completions/mean_length": 234.970703125,
"completions/mean_terminated_length": 234.970703125,
"completions/min_length": 112.0,
"completions/min_terminated_length": 112.0,
"entropy": 0.39688388258218765,
"epoch": 1.375,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019234315035359873,
"learning_rate": 1e-05,
"loss": 0.0024,
"num_tokens": 25914953.0,
"reward": 4.015607833862305,
"reward_std": 0.033271413296461105,
"rewards/ngram_repetition2/mean": 0.8983770608901978,
"rewards/ngram_repetition2/std": 0.030990201979875565,
"rewards/ngram_repetition3/mean": 0.9768623113632202,
"rewards/ngram_repetition3/std": 0.016665907576680183,
"rewards/symbolic_reward_accuracy/mean": 0.9951171875,
"rewards/symbolic_reward_accuracy/std": 0.06972333788871765,
"rewards/symbolic_reward_partial_score/mean": 0.997314453125,
"rewards/symbolic_reward_partial_score/std": 0.03975516930222511,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9306961894035339,
"rewards/thinking_answer_ratio_reward/std": 0.016988540068268776,
"step": 44
},
{
"clip_ratio/high_max": 0.30078125,
"clip_ratio/high_mean": 0.173828125,
"clip_ratio/low_mean": 0.2197265625,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.3935546875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 557.0,
"completions/max_terminated_length": 557.0,
"completions/mean_length": 224.271484375,
"completions/mean_terminated_length": 224.271484375,
"completions/min_length": 107.0,
"completions/min_terminated_length": 107.0,
"entropy": 0.3952790927141905,
"epoch": 1.5,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017839049202086836,
"learning_rate": 1e-05,
"loss": 0.001,
"num_tokens": 27724789.0,
"reward": 4.000144004821777,
"reward_std": 0.05562726408243179,
"rewards/ngram_repetition2/mean": 0.9121678471565247,
"rewards/ngram_repetition2/std": 0.0285996925085783,
"rewards/ngram_repetition3/mean": 0.9819085597991943,
"rewards/ngram_repetition3/std": 0.015010321512818336,
"rewards/symbolic_reward_accuracy/mean": 0.98876953125,
"rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
"rewards/symbolic_reward_partial_score/mean": 0.994384765625,
"rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9279464483261108,
"rewards/thinking_answer_ratio_reward/std": 0.017118271440267563,
"step": 48
},
{
"clip_ratio/high_max": 0.296875,
"clip_ratio/high_mean": 0.17236328125,
"clip_ratio/low_mean": 0.24609375,
"clip_ratio/low_min": 0.12890625,
"clip_ratio/region_mean": 0.41845703125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 397.0,
"completions/max_terminated_length": 397.0,
"completions/mean_length": 219.4580078125,
"completions/mean_terminated_length": 219.4580078125,
"completions/min_length": 94.0,
"completions/min_terminated_length": 94.0,
"entropy": 0.3957546763122082,
"epoch": 1.625,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017264628619577616,
"learning_rate": 1e-05,
"loss": 0.0011,
"num_tokens": 29514783.0,
"reward": 4.000338554382324,
"reward_std": 0.06502684205770493,
"rewards/ngram_repetition2/mean": 0.9239511489868164,
"rewards/ngram_repetition2/std": 0.026442626491189003,
"rewards/ngram_repetition3/mean": 0.9862804412841797,
"rewards/ngram_repetition3/std": 0.012914017774164677,
"rewards/symbolic_reward_accuracy/mean": 0.98876953125,
"rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
"rewards/symbolic_reward_partial_score/mean": 0.994384765625,
"rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9312715530395508,
"rewards/thinking_answer_ratio_reward/std": 0.016598645597696304,
"step": 52
},
{
"clip_ratio/high_max": 0.28125,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.2099609375,
"clip_ratio/low_min": 0.08984375,
"clip_ratio/region_mean": 0.3818359375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 355.0,
"completions/max_terminated_length": 355.0,
"completions/mean_length": 211.7568359375,
"completions/mean_terminated_length": 211.7568359375,
"completions/min_length": 97.0,
"completions/min_terminated_length": 97.0,
"entropy": 0.38981067948043346,
"epoch": 1.75,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01563751543259849,
"learning_rate": 1e-05,
"loss": 0.0012,
"num_tokens": 31293997.0,
"reward": 4.0152482986450195,
"reward_std": 0.02903410792350769,
"rewards/ngram_repetition2/mean": 0.9386552572250366,
"rewards/ngram_repetition2/std": 0.02311737835407257,
"rewards/ngram_repetition3/mean": 0.9913500547409058,
"rewards/ngram_repetition3/std": 0.010372841730713844,
"rewards/symbolic_reward_accuracy/mean": 0.99462890625,
"rewards/symbolic_reward_accuracy/std": 0.07310851663351059,
"rewards/symbolic_reward_partial_score/mean": 0.997314453125,
"rewards/symbolic_reward_partial_score/std": 0.036554258316755295,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9375712871551514,
"rewards/thinking_answer_ratio_reward/std": 0.01364652719348669,
"step": 56
},
{
"clip_ratio/high_max": 0.2890625,
"clip_ratio/high_mean": 0.17529296875,
"clip_ratio/low_mean": 0.20703125,
"clip_ratio/low_min": 0.07421875,
"clip_ratio/region_mean": 0.38232421875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 481.0,
"completions/max_terminated_length": 481.0,
"completions/mean_length": 211.9931640625,
"completions/mean_terminated_length": 211.9931640625,
"completions/min_length": 101.0,
"completions/min_terminated_length": 101.0,
"entropy": 0.38565365597605705,
"epoch": 1.875,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015057855728013598,
"learning_rate": 1e-05,
"loss": 0.0009,
"num_tokens": 33077023.0,
"reward": 4.012916564941406,
"reward_std": 0.04395143315196037,
"rewards/ngram_repetition2/mean": 0.9458911418914795,
"rewards/ngram_repetition2/std": 0.02221463806927204,
"rewards/ngram_repetition3/mean": 0.9930184483528137,
"rewards/ngram_repetition3/std": 0.010048897005617619,
"rewards/symbolic_reward_accuracy/mean": 0.99365234375,
"rewards/symbolic_reward_accuracy/std": 0.07943830639123917,
"rewards/symbolic_reward_partial_score/mean": 0.996826171875,
"rewards/symbolic_reward_partial_score/std": 0.03971915319561958,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.939659833908081,
"rewards/thinking_answer_ratio_reward/std": 0.01160719245672226,
"step": 60
},
{
"clip_ratio/high_max": 0.265625,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.22412109375,
"clip_ratio/low_min": 0.09765625,
"clip_ratio/region_mean": 0.38818359375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 349.0,
"completions/max_terminated_length": 349.0,
"completions/mean_length": 210.61474609375,
"completions/mean_terminated_length": 210.61474609375,
"completions/min_length": 105.0,
"completions/min_terminated_length": 105.0,
"entropy": 0.38424801267683506,
"epoch": 2.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01441432834996714,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 34848906.0,
"reward": 4.011754989624023,
"reward_std": 0.05248606204986572,
"rewards/ngram_repetition2/mean": 0.9511741399765015,
"rewards/ngram_repetition2/std": 0.02001781016588211,
"rewards/ngram_repetition3/mean": 0.9942980408668518,
"rewards/ngram_repetition3/std": 0.008259872905910015,
"rewards/symbolic_reward_accuracy/mean": 0.9931640625,
"rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
"rewards/symbolic_reward_partial_score/mean": 0.99658203125,
"rewards/symbolic_reward_partial_score/std": 0.04120838642120361,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.938984215259552,
"rewards/thinking_answer_ratio_reward/std": 0.011489045806229115,
"step": 64
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 343.2105263157895,
"eval_completions/max_terminated_length": 343.2105263157895,
"eval_completions/mean_length": 213.50863486842104,
"eval_completions/mean_terminated_length": 213.50863486842104,
"eval_completions/min_length": 133.94736842105263,
"eval_completions/min_terminated_length": 133.94736842105263,
"eval_entropy": 0.38965475088671636,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.00070223119109869,
"eval_num_tokens": 34848906.0,
"eval_reward": 3.9170174975144234,
"eval_reward_std": 0.26173981729708,
"eval_rewards/ngram_repetition2/mean": 0.9425309394535265,
"eval_rewards/ngram_repetition2/std": 0.02643796281987115,
"eval_rewards/ngram_repetition3/mean": 0.9898810072949058,
"eval_rewards/ngram_repetition3/std": 0.01246610764218004,
"eval_rewards/symbolic_reward_accuracy/mean": 0.9555921052631579,
"eval_rewards/symbolic_reward_accuracy/std": 0.18631722189878164,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9771792763157895,
"eval_rewards/symbolic_reward_partial_score/std": 0.09655581100990898,
"eval_rewards/tag_count_reward/mean": 1.0,
"eval_rewards/tag_count_reward/std": 0.0,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9329894781112671,
"eval_rewards/thinking_answer_ratio_reward/std": 0.01513584458122128,
"eval_runtime": 140.2203,
"eval_samples_per_second": 1.07,
"eval_steps_per_second": 0.014,
"step": 64
},
{
"epoch": 2.0,
"step": 64,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 3.4791,
"train_samples_per_second": 605.321,
"train_steps_per_second": 18.395
}
],
"logging_steps": 4,
"max_steps": 64,
"num_input_tokens_seen": 34848906,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}