{
  "best_global_step": 64,
  "best_metric": 0.00070223119109869,
  "best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-1/checkpoint-64",
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 64,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.11669921875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2985.0,
      "completions/mean_length": 1047.6865234375,
      "completions/mean_terminated_length": 780.2398681640625,
      "completions/min_length": 230.0,
      "completions/min_terminated_length": 230.0,
      "entropy": 0.3189637362957001,
      "epoch": 0.03125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.10914598126181888,
      "learning_rate": 1e-05,
      "loss": 0.1667,
      "num_tokens": 3492862.0,
      "reward": 1.400852918624878,
      "reward_std": 0.34828251600265503,
      "rewards/ngram_repetition2/mean": 0.5693519115447998,
      "rewards/ngram_repetition2/std": 0.17405439913272858,
      "rewards/ngram_repetition3/mean": 0.7214103937149048,
      "rewards/ngram_repetition3/std": 0.1819220334291458,
      "rewards/symbolic_reward_accuracy/mean": 0.01123046875,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.41455078125,
      "rewards/symbolic_reward_partial_score/std": 0.20262853801250458,
      "rewards/tag_count_reward/mean": 0.94287109375,
      "rewards/tag_count_reward/std": 0.15910091996192932,
      "rewards/thinking_answer_ratio_reward/mean": 0.8062490820884705,
      "rewards/thinking_answer_ratio_reward/std": 0.3022027909755707,
      "step": 1
    },
    {
      "clip_ratio/high_max": 0.3697916666666667,
      "clip_ratio/high_mean": 0.23893229166666666,
      "clip_ratio/low_mean": 0.224609375,
      "clip_ratio/low_min": 0.0625,
      "clip_ratio/region_mean": 0.4635416666666667,
      "entropy": 0.3202968165278435,
      "epoch": 0.125,
      "grad_norm": 0.04593035359379921,
      "learning_rate": 1e-05,
      "loss": 0.1373,
      "step": 4
    },
    {
      "clip_ratio/high_max": 0.33984375,
      "clip_ratio/high_mean": 0.21923828125,
      "clip_ratio/low_mean": 0.16162109375,
      "clip_ratio/low_min": 0.05859375,
      "clip_ratio/region_mean": 0.380859375,
      "completions/clipped_ratio": 0.05712890625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 3030.0,
      "completions/mean_length": 714.38330078125,
      "completions/mean_terminated_length": 571.534423828125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.3361353427171707,
      "epoch": 0.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.046721866146913066,
      "learning_rate": 1e-05,
      "loss": 0.0956,
      "num_tokens": 6308111.0,
      "reward": 1.5210591554641724,
      "reward_std": 0.38517600297927856,
      "rewards/ngram_repetition2/mean": 0.6417093276977539,
      "rewards/ngram_repetition2/std": 0.16069720685482025,
      "rewards/ngram_repetition3/mean": 0.7863442301750183,
      "rewards/ngram_repetition3/std": 0.1631278693675995,
      "rewards/symbolic_reward_accuracy/mean": 0.029296875,
      "rewards/symbolic_reward_accuracy/std": 0.16867858171463013,
      "rewards/symbolic_reward_partial_score/mean": 0.46630859375,
      "rewards/symbolic_reward_partial_score/std": 0.17428098618984222,
      "rewards/tag_count_reward/mean": 0.973388671875,
      "rewards/tag_count_reward/std": 0.11226600408554077,
      "rewards/thinking_answer_ratio_reward/mean": 0.8487584590911865,
      "rewards/thinking_answer_ratio_reward/std": 0.22293001413345337,
      "step": 8
    },
    {
      "clip_ratio/high_max": 0.1953125,
      "clip_ratio/high_mean": 0.09716796875,
      "clip_ratio/low_mean": 0.4169921875,
      "clip_ratio/low_min": 0.29296875,
      "clip_ratio/region_mean": 0.51416015625,
      "completions/clipped_ratio": 0.0419921875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2810.0,
      "completions/mean_length": 612.14404296875,
      "completions/mean_terminated_length": 504.32159423828125,
      "completions/min_length": 161.0,
      "completions/min_terminated_length": 161.0,
      "entropy": 0.34443395026028156,
      "epoch": 0.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.030918519373438542,
      "learning_rate": 1e-05,
      "loss": 0.0396,
      "num_tokens": 8915638.0,
      "reward": 1.937060832977295,
      "reward_std": 0.9647762775421143,
      "rewards/ngram_repetition2/mean": 0.6723504662513733,
      "rewards/ngram_repetition2/std": 0.14847470819950104,
      "rewards/ngram_repetition3/mean": 0.8140700459480286,
      "rewards/ngram_repetition3/std": 0.15126831829547882,
      "rewards/symbolic_reward_accuracy/mean": 0.18310546875,
      "rewards/symbolic_reward_accuracy/std": 0.38684743642807007,
      "rewards/symbolic_reward_partial_score/mean": 0.5675048828125,
      "rewards/symbolic_reward_partial_score/std": 0.23073548078536987,
      "rewards/tag_count_reward/mean": 0.979736328125,
      "rewards/tag_count_reward/std": 0.09862032532691956,
      "rewards/thinking_answer_ratio_reward/mean": 0.8744399547576904,
      "rewards/thinking_answer_ratio_reward/std": 0.19187742471694946,
      "step": 12
    },
    {
      "clip_ratio/high_max": 0.390625,
      "clip_ratio/high_mean": 0.23583984375,
      "clip_ratio/low_mean": 0.12841796875,
      "clip_ratio/low_min": 0.046875,
      "clip_ratio/region_mean": 0.3642578125,
      "completions/clipped_ratio": 0.0224609375,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2416.0,
      "completions/mean_length": 584.31982421875,
      "completions/mean_terminated_length": 527.1603393554688,
      "completions/min_length": 174.0,
      "completions/min_terminated_length": 174.0,
      "entropy": 0.36808328330516815,
      "epoch": 0.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.032998073100716244,
      "learning_rate": 1e-05,
      "loss": 0.0433,
      "num_tokens": 11472837.0,
      "reward": 3.4143524169921875,
      "reward_std": 1.0181639194488525,
      "rewards/ngram_repetition2/mean": 0.682662844657898,
      "rewards/ngram_repetition2/std": 0.12800198793411255,
      "rewards/ngram_repetition3/mean": 0.8277437686920166,
      "rewards/ngram_repetition3/std": 0.1255839467048645,
      "rewards/symbolic_reward_accuracy/mean": 0.76953125,
      "rewards/symbolic_reward_accuracy/std": 0.42123574018478394,
      "rewards/symbolic_reward_partial_score/mean": 0.8619384765625,
      "rewards/symbolic_reward_partial_score/std": 0.26891157031059265,
      "rewards/tag_count_reward/mean": 0.9892578125,
      "rewards/tag_count_reward/std": 0.07251390814781189,
      "rewards/thinking_answer_ratio_reward/mean": 0.8989672660827637,
      "rewards/thinking_answer_ratio_reward/std": 0.14425675570964813,
      "step": 16
    },
    {
      "clip_ratio/high_max": 0.3359375,
      "clip_ratio/high_mean": 0.2333984375,
      "clip_ratio/low_mean": 0.10986328125,
      "clip_ratio/low_min": 0.03515625,
      "clip_ratio/region_mean": 0.34326171875,
      "completions/clipped_ratio": 0.02197265625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 2756.0,
      "completions/mean_length": 520.4208984375,
      "completions/mean_terminated_length": 463.0963439941406,
      "completions/min_length": 162.0,
      "completions/min_terminated_length": 162.0,
      "entropy": 0.3659926615655422,
      "epoch": 0.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.0355213854083778,
      "learning_rate": 1e-05,
      "loss": 0.0495,
      "num_tokens": 13864227.0,
      "reward": 3.7088823318481445,
      "reward_std": 0.667622447013855,
      "rewards/ngram_repetition2/mean": 0.7104234099388123,
      "rewards/ngram_repetition2/std": 0.12404344230890274,
      "rewards/ngram_repetition3/mean": 0.84934002161026,
      "rewards/ngram_repetition3/std": 0.12357836216688156,
      "rewards/symbolic_reward_accuracy/mean": 0.88818359375,
      "rewards/symbolic_reward_accuracy/std": 0.31521740555763245,
      "rewards/symbolic_reward_partial_score/mean": 0.9185791015625,
      "rewards/symbolic_reward_partial_score/std": 0.2433638721704483,
      "rewards/tag_count_reward/mean": 0.989501953125,
      "rewards/tag_count_reward/std": 0.07170303165912628,
      "rewards/thinking_answer_ratio_reward/mean": 0.8836514949798584,
      "rewards/thinking_answer_ratio_reward/std": 0.14230027794837952,
      "step": 20
    },
    {
      "clip_ratio/high_max": 0.328125,
      "clip_ratio/high_mean": 0.1982421875,
      "clip_ratio/low_mean": 0.16943359375,
      "clip_ratio/low_min": 0.06640625,
      "clip_ratio/region_mean": 0.36767578125,
      "completions/clipped_ratio": 0.013671875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 3019.0,
      "completions/mean_length": 460.240234375,
      "completions/mean_terminated_length": 424.0376281738281,
      "completions/min_length": 128.0,
      "completions/min_terminated_length": 128.0,
      "entropy": 0.36848987452685833,
      "epoch": 0.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.027143384638001866,
      "learning_rate": 1e-05,
      "loss": 0.049,
      "num_tokens": 16149007.0,
      "reward": 3.875516414642334,
      "reward_std": 0.3786875903606415,
      "rewards/ngram_repetition2/mean": 0.7316169738769531,
      "rewards/ngram_repetition2/std": 0.10848518460988998,
      "rewards/ngram_repetition3/mean": 0.8663702011108398,
      "rewards/ngram_repetition3/std": 0.10447894036769867,
      "rewards/symbolic_reward_accuracy/mean": 0.94775390625,
      "rewards/symbolic_reward_accuracy/std": 0.22257724404335022,
      "rewards/symbolic_reward_partial_score/mean": 0.961181640625,
      "rewards/symbolic_reward_partial_score/std": 0.17496450245380402,
      "rewards/tag_count_reward/mean": 0.993896484375,
      "rewards/tag_count_reward/std": 0.054917916655540466,
      "rewards/thinking_answer_ratio_reward/mean": 0.8950520157814026,
      "rewards/thinking_answer_ratio_reward/std": 0.11731007695198059,
      "step": 24
    },
    {
      "clip_ratio/high_max": 0.30859375,
      "clip_ratio/high_mean": 0.17626953125,
      "clip_ratio/low_mean": 0.22119140625,
      "clip_ratio/low_min": 0.10546875,
      "clip_ratio/region_mean": 0.3974609375,
      "completions/clipped_ratio": 0.00634765625,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 1444.0,
      "completions/mean_length": 378.6123046875,
      "completions/mean_terminated_length": 361.4063720703125,
      "completions/min_length": 151.0,
      "completions/min_terminated_length": 151.0,
      "entropy": 0.3799332305788994,
      "epoch": 0.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.024146589341786776,
      "learning_rate": 1e-05,
      "loss": 0.031,
      "num_tokens": 18269941.0,
      "reward": 3.9713735580444336,
      "reward_std": 0.15463097393512726,
      "rewards/ngram_repetition2/mean": 0.7776297330856323,
      "rewards/ngram_repetition2/std": 0.08481114357709885,
      "rewards/ngram_repetition3/mean": 0.9015278816223145,
      "rewards/ngram_repetition3/std": 0.07951432466506958,
      "rewards/symbolic_reward_accuracy/mean": 0.9814453125,
      "rewards/symbolic_reward_accuracy/std": 0.13497892022132874,
      "rewards/symbolic_reward_partial_score/mean": 0.985107421875,
      "rewards/symbolic_reward_partial_score/std": 0.11334022879600525,
      "rewards/tag_count_reward/mean": 0.99755859375,
      "rewards/tag_count_reward/std": 0.034861668944358826,
      "rewards/thinking_answer_ratio_reward/mean": 0.9025194048881531,
      "rewards/thinking_answer_ratio_reward/std": 0.08864665776491165,
      "step": 28
    },
    {
      "clip_ratio/high_max": 0.3359375,
      "clip_ratio/high_mean": 0.22119140625,
      "clip_ratio/low_mean": 0.189453125,
      "clip_ratio/low_min": 0.09765625,
      "clip_ratio/region_mean": 0.41064453125,
      "completions/clipped_ratio": 0.00341796875,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 1606.0,
      "completions/mean_length": 326.34814453125,
      "completions/mean_terminated_length": 316.931396484375,
      "completions/min_length": 99.0,
      "completions/min_terminated_length": 99.0,
      "entropy": 0.3929165042936802,
      "epoch": 1.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.02371361608194468,
      "learning_rate": 1e-05,
      "loss": 0.0211,
      "num_tokens": 20282174.0,
      "reward": 3.976555109024048,
      "reward_std": 0.16526807844638824,
      "rewards/ngram_repetition2/mean": 0.8133348226547241,
      "rewards/ngram_repetition2/std": 0.07101932913064957,
      "rewards/ngram_repetition3/mean": 0.9278632402420044,
      "rewards/ngram_repetition3/std": 0.06339241564273834,
      "rewards/symbolic_reward_accuracy/mean": 0.982421875,
      "rewards/symbolic_reward_accuracy/std": 0.13144417107105255,
      "rewards/symbolic_reward_partial_score/mean": 0.9866943359375,
      "rewards/symbolic_reward_partial_score/std": 0.10472454875707626,
      "rewards/tag_count_reward/mean": 0.998291015625,
      "rewards/tag_count_reward/std": 0.02918882668018341,
      "rewards/thinking_answer_ratio_reward/mean": 0.9313849210739136,
      "rewards/thinking_answer_ratio_reward/std": 0.06515223532915115,
      "step": 32
    },
    {
      "epoch": 1.0,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.006578947368421052,
      "eval_completions/max_length": 1925.842105263158,
      "eval_completions/max_terminated_length": 641.1578947368421,
      "eval_completions/mean_length": 305.5797697368421,
      "eval_completions/mean_terminated_length": 287.26832982113484,
      "eval_completions/min_length": 145.52631578947367,
      "eval_completions/min_terminated_length": 145.52631578947367,
      "eval_entropy": 0.3921029740258267,
      "eval_frac_reward_zero_std": 0.0,
      "eval_loss": 0.017673568800091743,
      "eval_num_tokens": 20282174.0,
      "eval_reward": 3.8416001420272026,
      "eval_reward_std": 0.4433499896212628,
      "eval_rewards/ngram_repetition2/mean": 0.8298829511592263,
      "eval_rewards/ngram_repetition2/std": 0.0766919782679332,
      "eval_rewards/ngram_repetition3/mean": 0.9369708268265975,
      "eval_rewards/ngram_repetition3/std": 0.0686160976949491,
      "eval_rewards/symbolic_reward_accuracy/mean": 0.930921052631579,
      "eval_rewards/symbolic_reward_accuracy/std": 0.24749822326396642,
      "eval_rewards/symbolic_reward_partial_score/mean": 0.9561060855263158,
      "eval_rewards/symbolic_reward_partial_score/std": 0.1654511419566054,
      "eval_rewards/tag_count_reward/mean": 0.9967105263157895,
      "eval_rewards/tag_count_reward/std": 0.028502884664033588,
      "eval_rewards/thinking_answer_ratio_reward/mean": 0.9272897714062741,
      "eval_rewards/thinking_answer_ratio_reward/std": 0.06353752030745934,
      "eval_runtime": 291.3147,
      "eval_samples_per_second": 0.515,
      "eval_steps_per_second": 0.007,
      "step": 32
    },
    {
      "clip_ratio/high_max": 0.3125,
      "clip_ratio/high_mean": 0.17919921875,
      "clip_ratio/low_mean": 0.21875,
      "clip_ratio/low_min": 0.10546875,
      "clip_ratio/region_mean": 0.39794921875,
      "completions/clipped_ratio": 0.001953125,
      "completions/max_length": 3072.0,
      "completions/max_terminated_length": 853.0,
      "completions/mean_length": 287.046875,
      "completions/mean_terminated_length": 281.59686279296875,
      "completions/min_length": 124.0,
      "completions/min_terminated_length": 124.0,
      "entropy": 0.39634467102587223,
      "epoch": 1.125,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.021677013253874763,
      "learning_rate": 1e-05,
      "loss": 0.0116,
      "num_tokens": 22213918.0,
      "reward": 3.9796054363250732,
      "reward_std": 0.13036790490150452,
      "rewards/ngram_repetition2/mean": 0.8469281792640686,
      "rewards/ngram_repetition2/std": 0.05668526515364647,
      "rewards/ngram_repetition3/mean": 0.9489122033119202,
      "rewards/ngram_repetition3/std": 0.048189926892519,
      "rewards/symbolic_reward_accuracy/mean": 0.98291015625,
      "rewards/symbolic_reward_accuracy/std": 0.1296379119157791,
      "rewards/symbolic_reward_partial_score/mean": 0.9874267578125,
      "rewards/symbolic_reward_partial_score/std": 0.10065428167581558,
      "rewards/tag_count_reward/mean": 0.9990234375,
      "rewards/tag_count_reward/std": 0.022080888971686363,
      "rewards/thinking_answer_ratio_reward/mean": 0.9376400709152222,
      "rewards/thinking_answer_ratio_reward/std": 0.04486775025725365,
      "step": 36
    },
    {
      "clip_ratio/high_max": 0.3125,
      "clip_ratio/high_mean": 0.19140625,
      "clip_ratio/low_mean": 0.2021484375,
      "clip_ratio/low_min": 0.08984375,
      "clip_ratio/region_mean": 0.3935546875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 1465.0,
      "completions/max_terminated_length": 1465.0,
      "completions/mean_length": 254.92529296875,
      "completions/mean_terminated_length": 254.92529296875,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.4012875221669674,
      "epoch": 1.25,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.018544320597192198,
      "learning_rate": 1e-05,
      "loss": 0.0049,
      "num_tokens": 24081541.0,
      "reward": 4.009942054748535,
      "reward_std": 0.060945987701416016,
      "rewards/ngram_repetition2/mean": 0.8762841820716858,
      "rewards/ngram_repetition2/std": 0.037716496735811234,
      "rewards/ngram_repetition3/mean": 0.9659276008605957,
      "rewards/ngram_repetition3/std": 0.025332553312182426,
      "rewards/symbolic_reward_accuracy/mean": 0.9931640625,
      "rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
      "rewards/symbolic_reward_partial_score/mean": 0.995849609375,
      "rewards/symbolic_reward_partial_score/std": 0.052837058901786804,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.934229850769043,
      "rewards/thinking_answer_ratio_reward/std": 0.017355602234601974,
      "step": 40
    },
    {
      "clip_ratio/high_max": 0.26953125,
      "clip_ratio/high_mean": 0.166015625,
      "clip_ratio/low_mean": 0.234375,
      "clip_ratio/low_min": 0.1171875,
      "clip_ratio/region_mean": 0.400390625,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 502.0,
      "completions/max_terminated_length": 502.0,
      "completions/mean_length": 234.970703125,
      "completions/mean_terminated_length": 234.970703125,
      "completions/min_length": 112.0,
      "completions/min_terminated_length": 112.0,
      "entropy": 0.39688388258218765,
      "epoch": 1.375,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.019234315035359873,
      "learning_rate": 1e-05,
      "loss": 0.0024,
      "num_tokens": 25914953.0,
      "reward": 4.015607833862305,
      "reward_std": 0.033271413296461105,
      "rewards/ngram_repetition2/mean": 0.8983770608901978,
      "rewards/ngram_repetition2/std": 0.030990201979875565,
      "rewards/ngram_repetition3/mean": 0.9768623113632202,
      "rewards/ngram_repetition3/std": 0.016665907576680183,
      "rewards/symbolic_reward_accuracy/mean": 0.9951171875,
      "rewards/symbolic_reward_accuracy/std": 0.06972333788871765,
      "rewards/symbolic_reward_partial_score/mean": 0.997314453125,
      "rewards/symbolic_reward_partial_score/std": 0.03975516930222511,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9306961894035339,
      "rewards/thinking_answer_ratio_reward/std": 0.016988540068268776,
      "step": 44
    },
    {
      "clip_ratio/high_max": 0.30078125,
      "clip_ratio/high_mean": 0.173828125,
      "clip_ratio/low_mean": 0.2197265625,
      "clip_ratio/low_min": 0.1015625,
      "clip_ratio/region_mean": 0.3935546875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 557.0,
      "completions/max_terminated_length": 557.0,
      "completions/mean_length": 224.271484375,
      "completions/mean_terminated_length": 224.271484375,
      "completions/min_length": 107.0,
      "completions/min_terminated_length": 107.0,
      "entropy": 0.3952790927141905,
      "epoch": 1.5,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.017839049202086836,
      "learning_rate": 1e-05,
      "loss": 0.001,
      "num_tokens": 27724789.0,
      "reward": 4.000144004821777,
      "reward_std": 0.05562726408243179,
      "rewards/ngram_repetition2/mean": 0.9121678471565247,
      "rewards/ngram_repetition2/std": 0.0285996925085783,
      "rewards/ngram_repetition3/mean": 0.9819085597991943,
      "rewards/ngram_repetition3/std": 0.015010321512818336,
      "rewards/symbolic_reward_accuracy/mean": 0.98876953125,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.994384765625,
      "rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9279464483261108,
      "rewards/thinking_answer_ratio_reward/std": 0.017118271440267563,
      "step": 48
    },
    {
      "clip_ratio/high_max": 0.296875,
      "clip_ratio/high_mean": 0.17236328125,
      "clip_ratio/low_mean": 0.24609375,
      "clip_ratio/low_min": 0.12890625,
      "clip_ratio/region_mean": 0.41845703125,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 397.0,
      "completions/max_terminated_length": 397.0,
      "completions/mean_length": 219.4580078125,
      "completions/mean_terminated_length": 219.4580078125,
      "completions/min_length": 94.0,
      "completions/min_terminated_length": 94.0,
      "entropy": 0.3957546763122082,
      "epoch": 1.625,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.017264628619577616,
      "learning_rate": 1e-05,
      "loss": 0.0011,
      "num_tokens": 29514783.0,
      "reward": 4.000338554382324,
      "reward_std": 0.06502684205770493,
      "rewards/ngram_repetition2/mean": 0.9239511489868164,
      "rewards/ngram_repetition2/std": 0.026442626491189003,
      "rewards/ngram_repetition3/mean": 0.9862804412841797,
      "rewards/ngram_repetition3/std": 0.012914017774164677,
      "rewards/symbolic_reward_accuracy/mean": 0.98876953125,
      "rewards/symbolic_reward_accuracy/std": 0.10540289431810379,
      "rewards/symbolic_reward_partial_score/mean": 0.994384765625,
      "rewards/symbolic_reward_partial_score/std": 0.052701447159051895,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9312715530395508,
      "rewards/thinking_answer_ratio_reward/std": 0.016598645597696304,
      "step": 52
    },
    {
      "clip_ratio/high_max": 0.28125,
      "clip_ratio/high_mean": 0.171875,
      "clip_ratio/low_mean": 0.2099609375,
      "clip_ratio/low_min": 0.08984375,
      "clip_ratio/region_mean": 0.3818359375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 355.0,
      "completions/max_terminated_length": 355.0,
      "completions/mean_length": 211.7568359375,
      "completions/mean_terminated_length": 211.7568359375,
      "completions/min_length": 97.0,
      "completions/min_terminated_length": 97.0,
      "entropy": 0.38981067948043346,
      "epoch": 1.75,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01563751543259849,
      "learning_rate": 1e-05,
      "loss": 0.0012,
      "num_tokens": 31293997.0,
      "reward": 4.0152482986450195,
      "reward_std": 0.02903410792350769,
      "rewards/ngram_repetition2/mean": 0.9386552572250366,
      "rewards/ngram_repetition2/std": 0.02311737835407257,
      "rewards/ngram_repetition3/mean": 0.9913500547409058,
      "rewards/ngram_repetition3/std": 0.010372841730713844,
      "rewards/symbolic_reward_accuracy/mean": 0.99462890625,
      "rewards/symbolic_reward_accuracy/std": 0.07310851663351059,
      "rewards/symbolic_reward_partial_score/mean": 0.997314453125,
      "rewards/symbolic_reward_partial_score/std": 0.036554258316755295,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.9375712871551514,
      "rewards/thinking_answer_ratio_reward/std": 0.01364652719348669,
      "step": 56
    },
    {
      "clip_ratio/high_max": 0.2890625,
      "clip_ratio/high_mean": 0.17529296875,
      "clip_ratio/low_mean": 0.20703125,
      "clip_ratio/low_min": 0.07421875,
      "clip_ratio/region_mean": 0.38232421875,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 481.0,
      "completions/max_terminated_length": 481.0,
      "completions/mean_length": 211.9931640625,
      "completions/mean_terminated_length": 211.9931640625,
      "completions/min_length": 101.0,
      "completions/min_terminated_length": 101.0,
      "entropy": 0.38565365597605705,
      "epoch": 1.875,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.015057855728013598,
      "learning_rate": 1e-05,
      "loss": 0.0009,
      "num_tokens": 33077023.0,
      "reward": 4.012916564941406,
      "reward_std": 0.04395143315196037,
      "rewards/ngram_repetition2/mean": 0.9458911418914795,
      "rewards/ngram_repetition2/std": 0.02221463806927204,
      "rewards/ngram_repetition3/mean": 0.9930184483528137,
      "rewards/ngram_repetition3/std": 0.010048897005617619,
      "rewards/symbolic_reward_accuracy/mean": 0.99365234375,
      "rewards/symbolic_reward_accuracy/std": 0.07943830639123917,
      "rewards/symbolic_reward_partial_score/mean": 0.996826171875,
      "rewards/symbolic_reward_partial_score/std": 0.03971915319561958,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.939659833908081,
      "rewards/thinking_answer_ratio_reward/std": 0.01160719245672226,
      "step": 60
    },
    {
      "clip_ratio/high_max": 0.265625,
      "clip_ratio/high_mean": 0.1640625,
      "clip_ratio/low_mean": 0.22412109375,
      "clip_ratio/low_min": 0.09765625,
      "clip_ratio/region_mean": 0.38818359375,
      "completions/clipped_ratio": 0.0,
      "completions/max_length": 349.0,
      "completions/max_terminated_length": 349.0,
      "completions/mean_length": 210.61474609375,
      "completions/mean_terminated_length": 210.61474609375,
      "completions/min_length": 105.0,
      "completions/min_terminated_length": 105.0,
      "entropy": 0.38424801267683506,
      "epoch": 2.0,
      "frac_reward_zero_std": 0.0,
      "grad_norm": 0.01441432834996714,
      "learning_rate": 1e-05,
      "loss": 0.0003,
      "num_tokens": 34848906.0,
      "reward": 4.011754989624023,
      "reward_std": 0.05248606204986572,
      "rewards/ngram_repetition2/mean": 0.9511741399765015,
      "rewards/ngram_repetition2/std": 0.02001781016588211,
      "rewards/ngram_repetition3/mean": 0.9942980408668518,
      "rewards/ngram_repetition3/std": 0.008259872905910015,
      "rewards/symbolic_reward_accuracy/mean": 0.9931640625,
      "rewards/symbolic_reward_accuracy/std": 0.08241677284240723,
      "rewards/symbolic_reward_partial_score/mean": 0.99658203125,
      "rewards/symbolic_reward_partial_score/std": 0.04120838642120361,
      "rewards/tag_count_reward/mean": 1.0,
      "rewards/tag_count_reward/std": 0.0,
      "rewards/thinking_answer_ratio_reward/mean": 0.938984215259552,
      "rewards/thinking_answer_ratio_reward/std": 0.011489045806229115,
      "step": 64
    },
    {
      "epoch": 2.0,
      "eval_clip_ratio/high_max": 0.0,
      "eval_clip_ratio/high_mean": 0.0,
      "eval_clip_ratio/low_mean": 0.0,
      "eval_clip_ratio/low_min": 0.0,
      "eval_clip_ratio/region_mean": 0.0,
      "eval_completions/clipped_ratio": 0.0,
      "eval_completions/max_length": 343.2105263157895,
      "eval_completions/max_terminated_length": 343.2105263157895,
      "eval_completions/mean_length": 213.50863486842104,
      "eval_completions/mean_terminated_length": 213.50863486842104,
      "eval_completions/min_length": 133.94736842105263,
      "eval_completions/min_terminated_length": 133.94736842105263,
      "eval_entropy": 0.38965475088671636,
      "eval_frac_reward_zero_std": 0.0,
      "eval_loss": 0.00070223119109869,
      "eval_num_tokens": 34848906.0,
      "eval_reward": 3.9170174975144234,
      "eval_reward_std": 0.26173981729708,
      "eval_rewards/ngram_repetition2/mean": 0.9425309394535265,
      "eval_rewards/ngram_repetition2/std": 0.02643796281987115,
      "eval_rewards/ngram_repetition3/mean": 0.9898810072949058,
      "eval_rewards/ngram_repetition3/std": 0.01246610764218004,
      "eval_rewards/symbolic_reward_accuracy/mean": 0.9555921052631579,
      "eval_rewards/symbolic_reward_accuracy/std": 0.18631722189878164,
      "eval_rewards/symbolic_reward_partial_score/mean": 0.9771792763157895,
      "eval_rewards/symbolic_reward_partial_score/std": 0.09655581100990898,
      "eval_rewards/tag_count_reward/mean": 1.0,
      "eval_rewards/tag_count_reward/std": 0.0,
      "eval_rewards/thinking_answer_ratio_reward/mean": 0.9329894781112671,
      "eval_rewards/thinking_answer_ratio_reward/std": 0.01513584458122128,
      "eval_runtime": 140.2203,
      "eval_samples_per_second": 1.07,
      "eval_steps_per_second": 0.014,
      "step": 64
    },
    {
      "epoch": 2.0,
      "step": 64,
      "total_flos": 0.0,
      "train_loss": 0.0,
      "train_runtime": 3.4791,
      "train_samples_per_second": 605.321,
      "train_steps_per_second": 18.395
    }
  ],
  "logging_steps": 4,
  "max_steps": 64,
  "num_input_tokens_seen": 34848906,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}