{
  "best_global_step": 64,
  "best_metric": 0.00070223119109869,
  "best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-1/checkpoint-64",
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 64,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.11669921875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2985.0,
      "completions/mean_length": 1047.6865234375,
      "completions/mean_terminated_length": 780.2398681640625,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "entropy": 0.3189637362957001,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10914598126181888,
      "learning_rate": 1e-05,
      "loss": 0.1667,
      "num_tokens": 3492862.0,
      "reward": 1.400852918624878,
      "reward_std": 0.34828251600265503,
      "rewards/ngram_repetition2/mean": 0.5693519115447998,
      "rewards/ngram_repetition2/std": 0.17405439913272858,
      "rewards/ngram_repetition3/mean": 0.7214103937149048,
      "rewards/ngram_repetition3/std": 0.1819220334291458,
      "rewards/symbolic_reward_accuracy/mean": 0.01123046875,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.41455078125,
      "rewards/symbolic_reward_partial_score/std": 0.20262853801250458,
      "rewards/tag_count_reward/mean": 0.94287109375,
      "rewards/tag_count_reward/std": 0.15910091996192932,
      "rewards/thinking_answer_ratio_reward/mean": 0.8062490820884705,
      "rewards/thinking_answer_ratio_reward/std": 0.3022027909755707,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.3697916666666667,
      "clip_ratio/high_mean": 0.23893229166666666,
      "clip_ratio/low_mean": 0.224609375,
      "clip_ratio/low_min": 0.0625,
      "clip_ratio/region_mean": 0.4635416666666667,
      "entropy": 0.3202968165278435,
      "epoch": 0.125,
      "grad_norm": 0.04593035359379921,
      "learning_rate": 1e-05,
      "loss": 0.1373,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.33984375,
      "clip_ratio/high_mean": 0.21923828125,
      "clip_ratio/low_mean": 0.16162109375,
      "clip_ratio/low_min": 0.05859375,
      "clip_ratio/region_mean": 0.380859375,
      "completions/clipped_ratio": 0.05712890625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 3030.0,
      "completions/mean_length": 714.38330078125,
      "completions/mean_terminated_length": 571.534423828125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3361353427171707,
      "epoch": 0.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.046721866146913066,
      "learning_rate": 1e-05,
      "loss": 0.0956,
      "num_tokens": 6308111.0,
      "reward": 1.5210591554641724,
      "reward_std": 0.38517600297927856,
      "rewards/ngram_repetition2/mean": 0.6417093276977539,
      "rewards/ngram_repetition2/std": 0.16069720685482025,
      "rewards/ngram_repetition3/mean": 0.7863442301750183,
      "rewards/ngram_repetition3/std": 0.1631278693675995,
      "rewards/symbolic_reward_accuracy/mean": 0.029296875,
      "rewards/symbolic_reward_accuracy/std": 0.16867858171463013,
      "rewards/symbolic_reward_partial_score/mean": 0.46630859375,
      "rewards/symbolic_reward_partial_score/std": 0.17428098618984222,
      "rewards/tag_count_reward/mean": 0.973388671875,
      "rewards/tag_count_reward/std": 0.11226600408554077,
      "rewards/thinking_answer_ratio_reward/mean": 0.8487584590911865,
      "rewards/thinking_answer_ratio_reward/std": 0.22293001413345337,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.1953125,
      "clip_ratio/high_mean": 0.09716796875,
      "clip_ratio/low_mean": 0.4169921875,
      "clip_ratio/low_min": 0.29296875,
      "clip_ratio/region_mean": 0.51416015625,
      "completions/clipped_ratio": 0.0419921875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2810.0,
      "completions/mean_length": 612.14404296875,
      "completions/mean_terminated_length": 504.32159423828125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.34443395026028156,
      "epoch": 0.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.030918519373438542,
      "learning_rate": 1e-05,
      "loss": 0.0396,
      "num_tokens": 8915638.0,
      "reward": 1.937060832977295,
      "reward_std": 0.9647762775421143,
      "rewards/ngram_repetition2/mean": 0.6723504662513733,
      "rewards/ngram_repetition2/std": 0.14847470819950104,
      "rewards/ngram_repetition3/mean": 0.8140700459480286,
      "rewards/ngram_repetition3/std": 0.15126831829547882,
      "rewards/symbolic_reward_accuracy/mean": 0.18310546875,
      "rewards/symbolic_reward_accuracy/std": 0.38684743642807007,
      "rewards/symbolic_reward_partial_score/mean": 0.5675048828125,
      "rewards/symbolic_reward_partial_score/std": 0.23073548078536987,
      "rewards/tag_count_reward/mean": 0.979736328125,
      "rewards/tag_count_reward/std": 0.09862032532691956,
      "rewards/thinking_answer_ratio_reward/mean": 0.8744399547576904,
      "rewards/thinking_answer_ratio_reward/std": 0.19187742471694946,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.390625,
      "clip_ratio/high_mean": 0.23583984375,
      "clip_ratio/low_mean": 0.12841796875,
      "clip_ratio/low_min": 0.046875,
      "clip_ratio/region_mean": 0.3642578125,
      "completions/clipped_ratio": 0.0224609375,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2416.0,
      "completions/mean_length": 584.31982421875,
      "completions/mean_terminated_length": 527.1603393554688,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.36808328330516815,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.032998073100716244,
      "learning_rate": 1e-05,
      "loss": 0.0433,
      "num_tokens": 11472837.0,
      "reward": 3.4143524169921875,
      "reward_std": 1.0181639194488525,
      "rewards/ngram_repetition2/mean": 0.682662844657898,
      "rewards/ngram_repetition2/std": 0.12800198793411255,
      "rewards/ngram_repetition3/mean": 0.8277437686920166,
      "rewards/ngram_repetition3/std": 0.1255839467048645,
      "rewards/symbolic_reward_accuracy/mean": 0.76953125,
      "rewards/symbolic_reward_accuracy/std": 0.42123574018478394,
      "rewards/symbolic_reward_partial_score/mean": 0.8619384765625,
      "rewards/symbolic_reward_partial_score/std": 0.26891157031059265,
      "rewards/tag_count_reward/mean": 0.9892578125,
      "rewards/tag_count_reward/std": 0.07251390814781189,
      "rewards/thinking_answer_ratio_reward/mean": 0.8989672660827637,
      "rewards/thinking_answer_ratio_reward/std": 0.14425675570964813,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.3359375,
      "clip_ratio/high_mean": 0.2333984375,
      "clip_ratio/low_mean": 0.10986328125,
      "clip_ratio/low_min": 0.03515625,
      "clip_ratio/region_mean": 0.34326171875,
      "completions/clipped_ratio": 0.02197265625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2756.0,
      "completions/mean_length": 520.4208984375,
      "completions/mean_terminated_length": 463.0963439941406,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.3659926615655422,
      "epoch": 0.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0355213854083778,
      "learning_rate": 1e-05,
      "loss": 0.0495,
      "num_tokens": 13864227.0,
      "reward": 3.7088823318481445,
      "reward_std": 0.667622447013855,
      "rewards/ngram_repetition2/mean": 0.7104234099388123,
      "rewards/ngram_repetition2/std": 0.12404344230890274,
      "rewards/ngram_repetition3/mean": 0.84934002161026,
      "rewards/ngram_repetition3/std": 0.12357836216688156,
      "rewards/symbolic_reward_accuracy/mean": 0.88818359375,
      "rewards/symbolic_reward_accuracy/std": 0.31521740555763245,
      "rewards/symbolic_reward_partial_score/mean": 0.9185791015625,
      "rewards/symbolic_reward_partial_score/std": 0.2433638721704483,
      "rewards/tag_count_reward/mean": 0.989501953125,
      "rewards/tag_count_reward/std": 0.07170303165912628,
      "rewards/thinking_answer_ratio_reward/mean": 0.8836514949798584,
      "rewards/thinking_answer_ratio_reward/std": 0.14230027794837952,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.328125,
      "clip_ratio/high_mean": 0.1982421875,
      "clip_ratio/low_mean": 0.16943359375,
      "clip_ratio/low_min": 0.06640625,
      "clip_ratio/region_mean": 0.36767578125,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 3019.0,
      "completions/mean_length": 460.240234375,
      "completions/mean_terminated_length": 424.0376281738281,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.36848987452685833,
      "epoch": 0.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.027143384638001866,
      "learning_rate": 1e-05,
      "loss": 0.049,
      "num_tokens": 16149007.0,
      "reward": 3.875516414642334,
      "reward_std": 0.3786875903606415,
      "rewards/ngram_repetition2/mean": 0.7316169738769531,
      "rewards/ngram_repetition2/std": 0.10848518460988998,
      "rewards/ngram_repetition3/mean": 0.8663702011108398,
      "rewards/ngram_repetition3/std": 0.10447894036769867,
      "rewards/symbolic_reward_accuracy/mean": 0.94775390625,
      "rewards/symbolic_reward_accuracy/std": 0.22257724404335022,
      "rewards/symbolic_reward_partial_score/mean": 0.961181640625,
      "rewards/symbolic_reward_partial_score/std": 0.17496450245380402,
      "rewards/tag_count_reward/mean": 0.993896484375,
      "rewards/tag_count_reward/std": 0.054917916655540466,
      "rewards/thinking_answer_ratio_reward/mean": 0.8950520157814026,
      "rewards/thinking_answer_ratio_reward/std": 0.11731007695198059,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.30859375,
      "clip_ratio/high_mean": 0.17626953125,
      "clip_ratio/low_mean": 0.22119140625,
      "clip_ratio/low_min": 0.10546875,
      "clip_ratio/region_mean": 0.3974609375,
      "completions/clipped_ratio": 0.00634765625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 1444.0,
      "completions/mean_length": 378.6123046875,
      "completions/mean_terminated_length": 361.4063720703125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3799332305788994,
      "epoch": 0.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.024146589341786776,
      "learning_rate": 1e-05,
      "loss": 0.031,
      "num_tokens": 18269941.0,
      "reward": 3.9713735580444336,
      "reward_std": 0.15463097393512726,
      "rewards/ngram_repetition2/mean": 0.7776297330856323,
      "rewards/ngram_repetition2/std": 0.08481114357709885,
      "rewards/ngram_repetition3/mean": 0.9015278816223145,
      "rewards/ngram_repetition3/std": 0.07951432466506958,
      "rewards/symbolic_reward_accuracy/mean": 0.9814453125,
      "rewards/symbolic_reward_accuracy/std": 0.13497892022132874,
      "rewards/symbolic_reward_partial_score/mean": 0.985107421875,
      "rewards/symbolic_reward_partial_score/std": 0.11334022879600525,
      "rewards/tag_count_reward/mean": 0.99755859375,
      "rewards/tag_count_reward/std": 0.034861668944358826,
      "rewards/thinking_answer_ratio_reward/mean": 0.9025194048881531,
      "rewards/thinking_answer_ratio_reward/std": 0.08864665776491165,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.3359375,
      "clip_ratio/high_mean": 0.22119140625,
      "clip_ratio/low_mean": 0.189453125,
      "clip_ratio/low_min": 0.09765625,
      "clip_ratio/region_mean": 0.41064453125,
      "completions/clipped_ratio": 0.00341796875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 1606.0,
      "completions/mean_length": 326.34814453125,
      "completions/mean_terminated_length": 316.931396484375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.3929165042936802,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.02371361608194468,
      "learning_rate": 1e-05,
      "loss": 0.0211,
      "num_tokens": 20282174.0,
      "reward": 3.976555109024048,
      "reward_std": 0.16526807844638824,
      "rewards/ngram_repetition2/mean": 0.8133348226547241,
      "rewards/ngram_repetition2/std": 0.07101932913064957,
      "rewards/ngram_repetition3/mean": 0.9278632402420044,
      "rewards/ngram_repetition3/std": 0.06339241564273834,
      "rewards/symbolic_reward_accuracy/mean": 0.982421875,
      "rewards/symbolic_reward_accuracy/std": 0.13144417107105255,
      "rewards/symbolic_reward_partial_score/mean": 0.9866943359375,
      "rewards/symbolic_reward_partial_score/std": 0.10472454875707626,
      "rewards/tag_count_reward/mean": 0.998291015625,
      "rewards/tag_count_reward/std": 0.02918882668018341,
      "rewards/thinking_answer_ratio_reward/mean": 0.9313849210739136,
      "rewards/thinking_answer_ratio_reward/std": 0.06515223532915115,
      "step": 32
    },
    {
      "epoch": 1.0,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.006578947368421052,
      "eval_completions/max_length": 1925.842105263158,
      "eval_completions/max_terminated_length": 641.1578947368421,
      "eval_completions/mean_length": 305.5797697368421,
      "eval_completions/mean_terminated_length": 287.26832982113484,
      "eval_completions/min_length": 145.52631578947367,
      "eval_completions/min_terminated_length": 145.52631578947367,
      "eval_entropy": 0.3921029740258267,
      "eval_frac_reward_zero_std": 0.0,
      "eval_loss": 0.017673568800091743,
      "eval_num_tokens": 20282174.0,
      "eval_reward": 3.8416001420272026,
      "eval_reward_std": 0.4433499896212628,
      "eval_rewards/ngram_repetition2/mean": 0.8298829511592263,
      "eval_rewards/ngram_repetition2/std": 0.0766919782679332,
      "eval_rewards/ngram_repetition3/mean": 0.9369708268265975,
      "eval_rewards/ngram_repetition3/std": 0.0686160976949491,
      "eval_rewards/symbolic_reward_accuracy/mean": 0.930921052631579,
      "eval_rewards/symbolic_reward_accuracy/std": 0.24749822326396642,
      "eval_rewards/symbolic_reward_partial_score/mean": 0.9561060855263158,
      "eval_rewards/symbolic_reward_partial_score/std": 0.1654511419566054,
      "eval_rewards/tag_count_reward/mean": 0.9967105263157895,
      "eval_rewards/tag_count_reward/std": 0.028502884664033588,
      "eval_rewards/thinking_answer_ratio_reward/mean": 0.9272897714062741,
      "eval_rewards/thinking_answer_ratio_reward/std": 0.06353752030745934,
      "eval_runtime": 291.3147,
      "eval_samples_per_second": 0.515,
      "eval_steps_per_second": 0.007,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.3125,
      "clip_ratio/high_mean": 0.17919921875,
      "clip_ratio/low_mean": 0.21875,
      "clip_ratio/low_min": 0.10546875,
      "clip_ratio/region_mean": 0.39794921875,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 853.0,
      "completions/mean_length": 287.046875,
      "completions/mean_terminated_length": 281.59686279296875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.39634467102587223,
      "epoch": 1.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.021677013253874763,
      "learning_rate": 1e-05,
      "loss": 0.0116,
      "num_tokens": 22213918.0,
      "reward": 3.9796054363250732,
      "reward_std": 0.13036790490150452,
      "rewards/ngram_repetition2/mean": 0.8469281792640686,
      "rewards/ngram_repetition2/std": 0.05668526515364647,
      "rewards/ngram_repetition3/mean": 0.9489122033119202,
      "rewards/ngram_repetition3/std": 0.048189926892519,
      "rewards/symbolic_reward_accuracy/mean": 0.98291015625,
      "rewards/symbolic_reward_accuracy/std": 0.1296379119157791,
      "rewards/symbolic_reward_partial_score/mean": 0.9874267578125,
      "rewards/symbolic_reward_partial_score/std": 0.10065428167581558,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.022080888971686363,
      "rewards/thinking_answer_ratio_reward/mean": 0.9376400709152222,
      "rewards/thinking_answer_ratio_reward/std": 0.04486775025725365,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.3125,
      "clip_ratio/high_mean": 0.19140625,
      "clip_ratio/low_mean": 0.2021484375,
      "clip_ratio/low_min": 0.08984375,
      "clip_ratio/region_mean": 0.3935546875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1465.0,
      "completions/max_terminated_length": 1465.0,
      "completions/mean_length": 254.92529296875,
      "completions/mean_terminated_length": 254.92529296875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.4012875221669674,
      "epoch": 1.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.018544320597192198,
      "learning_rate": 1e-05,
      "loss": 0.0049,
      "num_tokens": 24081541.0,
      "reward": 4.009942054748535,
      "reward_std": 0.060945987701416016,
      "rewards/ngram_repetition2/mean": 0.8762841820716858,
      "rewards/ngram_repetition2/std": 0.037716496735811234,
      "rewards/ngram_repetition3/mean": 0.9659276008605957,
      "rewards/ngram_repetition3/std": 0.025332553312182426,
      "rewards/symbolic_reward_accuracy/mean": 0.9931640625,
      "rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
      "rewards/symbolic_reward_partial_score/mean": 0.995849609375,
      "rewards/symbolic_reward_partial_score/std": 0.052837058901786804,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.934229850769043,
      "rewards/thinking_answer_ratio_reward/std": 0.017355602234601974,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.26953125,
      "clip_ratio/high_mean": 0.166015625,
      "clip_ratio/low_mean": 0.234375,
      "clip_ratio/low_min": 0.1171875,
      "clip_ratio/region_mean": 0.400390625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 502.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 234.970703125,
      "completions/mean_terminated_length": 234.970703125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.39688388258218765,
      "epoch": 1.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.019234315035359873,
      "learning_rate": 1e-05,
      "loss": 0.0024,
      "num_tokens": 25914953.0,
      "reward": 4.015607833862305,
      "reward_std": 0.033271413296461105,
      "rewards/ngram_repetition2/mean": 0.8983770608901978,
      "rewards/ngram_repetition2/std": 0.030990201979875565,
      "rewards/ngram_repetition3/mean": 0.9768623113632202,
      "rewards/ngram_repetition3/std": 0.016665907576680183,
      "rewards/symbolic_reward_accuracy/mean": 0.9951171875,
      "rewards/symbolic_reward_accuracy/std": 0.06972333788871765,
      "rewards/symbolic_reward_partial_score/mean": 0.997314453125,
      "rewards/symbolic_reward_partial_score/std": 0.03975516930222511,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9306961894035339,
      "rewards/thinking_answer_ratio_reward/std": 0.016988540068268776,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.30078125,
      "clip_ratio/high_mean": 0.173828125,
      "clip_ratio/low_mean": 0.2197265625,
      "clip_ratio/low_min": 0.1015625,
      "clip_ratio/region_mean": 0.3935546875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 224.271484375,
      "completions/mean_terminated_length": 224.271484375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3952790927141905,
      "epoch": 1.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.017839049202086836,
      "learning_rate": 1e-05,
      "loss": 0.001,
      "num_tokens": 27724789.0,
      "reward": 4.000144004821777,
      "reward_std": 0.05562726408243179,
      "rewards/ngram_repetition2/mean": 0.9121678471565247,
      "rewards/ngram_repetition2/std": 0.0285996925085783,
      "rewards/ngram_repetition3/mean": 0.9819085597991943,
      "rewards/ngram_repetition3/std": 0.015010321512818336,
      "rewards/symbolic_reward_accuracy/mean": 0.98876953125,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.994384765625,
      "rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9279464483261108,
      "rewards/thinking_answer_ratio_reward/std": 0.017118271440267563,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.296875,
      "clip_ratio/high_mean": 0.17236328125,
      "clip_ratio/low_mean": 0.24609375,
      "clip_ratio/low_min": 0.12890625,
      "clip_ratio/region_mean": 0.41845703125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 219.4580078125,
      "completions/mean_terminated_length": 219.4580078125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3957546763122082,
      "epoch": 1.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.017264628619577616,
      "learning_rate": 1e-05,
      "loss": 0.0011,
      "num_tokens": 29514783.0,
      "reward": 4.000338554382324,
      "reward_std": 0.06502684205770493,
      "rewards/ngram_repetition2/mean": 0.9239511489868164,
      "rewards/ngram_repetition2/std": 0.026442626491189003,
      "rewards/ngram_repetition3/mean": 0.9862804412841797,
      "rewards/ngram_repetition3/std": 0.012914017774164677,
      "rewards/symbolic_reward_accuracy/mean": 0.98876953125,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.994384765625,
      "rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9312715530395508,
      "rewards/thinking_answer_ratio_reward/std": 0.016598645597696304,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.28125,
      "clip_ratio/high_mean": 0.171875,
      "clip_ratio/low_mean": 0.2099609375,
      "clip_ratio/low_min": 0.08984375,
      "clip_ratio/region_mean": 0.3818359375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 211.7568359375,
      "completions/mean_terminated_length": 211.7568359375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.38981067948043346,
      "epoch": 1.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01563751543259849,
      "learning_rate": 1e-05,
      "loss": 0.0012,
      "num_tokens": 31293997.0,
      "reward": 4.0152482986450195,
      "reward_std": 0.02903410792350769,
      "rewards/ngram_repetition2/mean": 0.9386552572250366,
      "rewards/ngram_repetition2/std": 0.02311737835407257,
      "rewards/ngram_repetition3/mean": 0.9913500547409058,
      "rewards/ngram_repetition3/std": 0.010372841730713844,
      "rewards/symbolic_reward_accuracy/mean": 0.99462890625,
      "rewards/symbolic_reward_accuracy/std": 0.07310851663351059,
      "rewards/symbolic_reward_partial_score/mean": 0.997314453125,
      "rewards/symbolic_reward_partial_score/std": 0.036554258316755295,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9375712871551514,
      "rewards/thinking_answer_ratio_reward/std": 0.01364652719348669,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.2890625,
      "clip_ratio/high_mean": 0.17529296875,
      "clip_ratio/low_mean": 0.20703125,
      "clip_ratio/low_min": 0.07421875,
      "clip_ratio/region_mean": 0.38232421875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 481.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 211.9931640625,
      "completions/mean_terminated_length": 211.9931640625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.38565365597605705,
      "epoch": 1.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.015057855728013598,
      "learning_rate": 1e-05,
      "loss": 0.0009,
      "num_tokens": 33077023.0,
      "reward": 4.012916564941406,
      "reward_std": 0.04395143315196037,
      "rewards/ngram_repetition2/mean": 0.9458911418914795,
      "rewards/ngram_repetition2/std": 0.02221463806927204,
      "rewards/ngram_repetition3/mean": 0.9930184483528137,
      "rewards/ngram_repetition3/std": 0.010048897005617619,
      "rewards/symbolic_reward_accuracy/mean": 0.99365234375,
      "rewards/symbolic_reward_accuracy/std": 0.07943830639123917,
      "rewards/symbolic_reward_partial_score/mean": 0.996826171875,
      "rewards/symbolic_reward_partial_score/std": 0.03971915319561958,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.939659833908081,
      "rewards/thinking_answer_ratio_reward/std": 0.01160719245672226,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.265625,
      "clip_ratio/high_mean": 0.1640625,
      "clip_ratio/low_mean": 0.22412109375,
      "clip_ratio/low_min": 0.09765625,
      "clip_ratio/region_mean": 0.38818359375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 210.61474609375,
      "completions/mean_terminated_length": 210.61474609375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.38424801267683506,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01441432834996714,
      "learning_rate": 1e-05,
      "loss": 0.0003,
      "num_tokens": 34848906.0,
      "reward": 4.011754989624023,
      "reward_std": 0.05248606204986572,
      "rewards/ngram_repetition2/mean": 0.9511741399765015,
      "rewards/ngram_repetition2/std": 0.02001781016588211,
      "rewards/ngram_repetition3/mean": 0.9942980408668518,
      "rewards/ngram_repetition3/std": 0.008259872905910015,
      "rewards/symbolic_reward_accuracy/mean": 0.9931640625,
      "rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
      "rewards/symbolic_reward_partial_score/mean": 0.99658203125,
      "rewards/symbolic_reward_partial_score/std": 0.04120838642120361,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.938984215259552,
      "rewards/thinking_answer_ratio_reward/std": 0.011489045806229115,
      "step": 64
    },
    {
      "epoch": 2.0,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 343.2105263157895,
      "eval_completions/max_terminated_length": 343.2105263157895,
      "eval_completions/mean_length": 213.50863486842104,
      "eval_completions/mean_terminated_length": 213.50863486842104,
      "eval_completions/min_length": 133.94736842105263,
      "eval_completions/min_terminated_length": 133.94736842105263,
      "eval_entropy": 0.38965475088671636,
      "eval_frac_reward_zero_std": 0.0,
      "eval_loss": 0.00070223119109869,
      "eval_num_tokens": 34848906.0,
      "eval_reward": 3.9170174975144234,
      "eval_reward_std": 0.26173981729708,
      "eval_rewards/ngram_repetition2/mean": 0.9425309394535265,
      "eval_rewards/ngram_repetition2/std": 0.02643796281987115,
      "eval_rewards/ngram_repetition3/mean": 0.9898810072949058,
      "eval_rewards/ngram_repetition3/std": 0.01246610764218004,
      "eval_rewards/symbolic_reward_accuracy/mean": 0.9555921052631579,
      "eval_rewards/symbolic_reward_accuracy/std": 0.18631722189878164,
      "eval_rewards/symbolic_reward_partial_score/mean": 0.9771792763157895,
      "eval_rewards/symbolic_reward_partial_score/std": 0.09655581100990898,
      "eval_rewards/tag_count_reward/mean": 1.0,
      "eval_rewards/tag_count_reward/std": 0.0,
      "eval_rewards/thinking_answer_ratio_reward/mean": 0.9329894781112671,
      "eval_rewards/thinking_answer_ratio_reward/std": 0.01513584458122128,
      "eval_runtime": 140.2203,
      "eval_samples_per_second": 1.07,
      "eval_steps_per_second": 0.014,
      "step": 64
    },
    {
      "epoch": 2.0,
      "step": 64,
      "total_flos": 0.0,
      "train_loss": 0.0,
      "train_runtime": 3.4791,
      "train_samples_per_second": 605.321,
      "train_steps_per_second": 18.395
    }
  ],
  "logging_steps": 4,
  "max_steps": 64,
  "num_input_tokens_seen": 34848906,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}