| { |
| "best_global_step": 92, |
| "best_metric": 0.0008370681316591799, |
| "best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-1.5B-Staged-4/checkpoint-92", |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 184, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 473.0, |
| "completions/max_terminated_length": 473.0, |
| "completions/mean_length": 399.97607421875, |
| "completions/mean_terminated_length": 399.97607421875, |
| "completions/min_length": 304.0, |
| "completions/min_terminated_length": 304.0, |
| "entropy": 0.35566435009241104, |
| "epoch": 0.010869565217391304, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.027134701502786768, |
| "learning_rate": 1e-05, |
| "loss": 0.0026, |
| "num_tokens": 2869071.0, |
| "reward": 3.4189200401306152, |
| "reward_std": 0.13538040220737457, |
| "rewards/ngram_repetition2/mean": 0.9907151460647583, |
| "rewards/ngram_repetition2/std": 0.007372148334980011, |
| "rewards/ngram_repetition3/mean": 0.9988653659820557, |
| "rewards/ngram_repetition3/std": 0.0037813771050423384, |
| "rewards/symbolic_reward_accuracy/mean": 0.7431640625, |
| "rewards/symbolic_reward_accuracy/std": 0.43699485063552856, |
| "rewards/symbolic_reward_partial_score/mean": 0.9029541015625, |
| "rewards/symbolic_reward_partial_score/std": 0.19717122614383698, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9742211699485779, |
| "rewards/thinking_answer_ratio_reward/std": 0.004831792786717415, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1269948482513428, |
| "sampling/importance_sampling_ratio/min": 3.996394298155792e-05, |
| "sampling/sampling_logp_difference/max": 10.127532958984375, |
| "sampling/sampling_logp_difference/mean": 0.19825759530067444, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.3385416666666667, |
| "clip_ratio/high_mean": 0.19986979166666666, |
| "clip_ratio/low_mean": 0.2779947916666667, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.4778645833333333, |
| "entropy": 0.3625817572077115, |
| "epoch": 0.043478260869565216, |
| "grad_norm": 0.025662250505496927, |
| "learning_rate": 1e-05, |
| "loss": -0.0008, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.26953125, |
| "clip_ratio/high_mean": 0.15087890625, |
| "clip_ratio/low_mean": 0.201171875, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.35205078125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 483.0, |
| "completions/max_terminated_length": 483.0, |
| "completions/mean_length": 389.82080078125, |
| "completions/mean_terminated_length": 389.82080078125, |
| "completions/min_length": 299.0, |
| "completions/min_terminated_length": 299.0, |
| "entropy": 0.36707244999706745, |
| "epoch": 0.08695652173913043, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02048054919935291, |
| "learning_rate": 1e-05, |
| "loss": 0.0, |
| "num_tokens": 5758528.0, |
| "reward": 3.2567176818847656, |
| "reward_std": 0.11631277203559875, |
| "rewards/ngram_repetition2/mean": 0.9903280138969421, |
| "rewards/ngram_repetition2/std": 0.008003082126379013, |
| "rewards/ngram_repetition3/mean": 0.9988331198692322, |
| "rewards/ngram_repetition3/std": 0.004472358617931604, |
| "rewards/symbolic_reward_accuracy/mean": 0.662109375, |
| "rewards/symbolic_reward_accuracy/std": 0.47310659289360046, |
| "rewards/symbolic_reward_partial_score/mean": 0.9028727412223816, |
| "rewards/symbolic_reward_partial_score/std": 0.16355262696743011, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9734619855880737, |
| "rewards/thinking_answer_ratio_reward/std": 0.005536045413464308, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1309096813201904, |
| "sampling/importance_sampling_ratio/min": 1.0269287486153189e-05, |
| "sampling/sampling_logp_difference/max": 11.486352920532227, |
| "sampling/sampling_logp_difference/mean": 0.20535901188850403, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.27734375, |
| "clip_ratio/high_mean": 0.16259765625, |
| "clip_ratio/low_mean": 0.197265625, |
| "clip_ratio/low_min": 0.09375, |
| "clip_ratio/region_mean": 0.35986328125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 503.0, |
| "completions/max_terminated_length": 503.0, |
| "completions/mean_length": 391.505859375, |
| "completions/mean_terminated_length": 391.505859375, |
| "completions/min_length": 311.0, |
| "completions/min_terminated_length": 311.0, |
| "entropy": 0.388662975281477, |
| "epoch": 0.13043478260869565, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02483318093046019, |
| "learning_rate": 1e-05, |
| "loss": 0.0001, |
| "num_tokens": 8607084.0, |
| "reward": 3.3946127891540527, |
| "reward_std": 0.11570382118225098, |
| "rewards/ngram_repetition2/mean": 0.9894477725028992, |
| "rewards/ngram_repetition2/std": 0.007921576499938965, |
| "rewards/ngram_repetition3/mean": 0.9988635778427124, |
| "rewards/ngram_repetition3/std": 0.00399815896525979, |
| "rewards/symbolic_reward_accuracy/mean": 0.72119140625, |
| "rewards/symbolic_reward_accuracy/std": 0.448522686958313, |
| "rewards/symbolic_reward_partial_score/mean": 0.922607421875, |
| "rewards/symbolic_reward_partial_score/std": 0.14913025498390198, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9739440679550171, |
| "rewards/thinking_answer_ratio_reward/std": 0.004836579784750938, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1356343030929565, |
| "sampling/importance_sampling_ratio/min": 0.0001149894596892409, |
| "sampling/sampling_logp_difference/max": 9.070670127868652, |
| "sampling/sampling_logp_difference/mean": 0.21488171815872192, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.177734375, |
| "clip_ratio/low_mean": 0.1787109375, |
| "clip_ratio/low_min": 0.07421875, |
| "clip_ratio/region_mean": 0.3564453125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 492.0, |
| "completions/max_terminated_length": 492.0, |
| "completions/mean_length": 393.09716796875, |
| "completions/mean_terminated_length": 393.09716796875, |
| "completions/min_length": 313.0, |
| "completions/min_terminated_length": 313.0, |
| "entropy": 0.39366098679602146, |
| "epoch": 0.17391304347826086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.03379401199661016, |
| "learning_rate": 1e-05, |
| "loss": 0.0008, |
| "num_tokens": 11465235.0, |
| "reward": 3.1309444904327393, |
| "reward_std": 0.27350103855133057, |
| "rewards/ngram_repetition2/mean": 0.9783711433410645, |
| "rewards/ngram_repetition2/std": 0.021668143570423126, |
| "rewards/ngram_repetition3/mean": 0.9917559623718262, |
| "rewards/ngram_repetition3/std": 0.014161880128085613, |
| "rewards/symbolic_reward_accuracy/mean": 0.625, |
| "rewards/symbolic_reward_accuracy/std": 0.48424115777015686, |
| "rewards/symbolic_reward_partial_score/mean": 0.8516031503677368, |
| "rewards/symbolic_reward_partial_score/std": 0.22019875049591064, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9640083312988281, |
| "rewards/thinking_answer_ratio_reward/std": 0.01703455112874508, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1348872184753418, |
| "sampling/importance_sampling_ratio/min": 4.663659638026729e-06, |
| "sampling/sampling_logp_difference/max": 12.275710105895996, |
| "sampling/sampling_logp_difference/mean": 0.21620666980743408, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.1826171875, |
| "clip_ratio/low_mean": 0.17724609375, |
| "clip_ratio/low_min": 0.09375, |
| "clip_ratio/region_mean": 0.35986328125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 508.0, |
| "completions/max_terminated_length": 508.0, |
| "completions/mean_length": 386.9951171875, |
| "completions/mean_terminated_length": 386.9951171875, |
| "completions/min_length": 305.0, |
| "completions/min_terminated_length": 305.0, |
| "entropy": 0.44838985428214073, |
| "epoch": 0.21739130434782608, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02561700687134101, |
| "learning_rate": 1e-05, |
| "loss": 0.0009, |
| "num_tokens": 14298217.0, |
| "reward": 3.3726930618286133, |
| "reward_std": 0.11649461090564728, |
| "rewards/ngram_repetition2/mean": 0.9818264245986938, |
| "rewards/ngram_repetition2/std": 0.01684102788567543, |
| "rewards/ngram_repetition3/mean": 0.9955232739448547, |
| "rewards/ngram_repetition3/std": 0.010431738570332527, |
| "rewards/symbolic_reward_accuracy/mean": 0.72021484375, |
| "rewards/symbolic_reward_accuracy/std": 0.4490031898021698, |
| "rewards/symbolic_reward_partial_score/mean": 0.9027913212776184, |
| "rewards/symbolic_reward_partial_score/std": 0.1795635223388672, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9698631763458252, |
| "rewards/thinking_answer_ratio_reward/std": 0.012476499192416668, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1498932838439941, |
| "sampling/importance_sampling_ratio/min": 4.004936636192724e-05, |
| "sampling/sampling_logp_difference/max": 10.125397682189941, |
| "sampling/sampling_logp_difference/mean": 0.23924380540847778, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.328125, |
| "clip_ratio/high_mean": 0.20703125, |
| "clip_ratio/low_mean": 0.17822265625, |
| "clip_ratio/low_min": 0.05859375, |
| "clip_ratio/region_mean": 0.38525390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 501.0, |
| "completions/max_terminated_length": 501.0, |
| "completions/mean_length": 381.43408203125, |
| "completions/mean_terminated_length": 381.43408203125, |
| "completions/min_length": 293.0, |
| "completions/min_terminated_length": 293.0, |
| "entropy": 0.45580120012164116, |
| "epoch": 0.2608695652173913, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02764242724586242, |
| "learning_rate": 1e-05, |
| "loss": 0.0008, |
| "num_tokens": 17107138.0, |
| "reward": 3.2511990070343018, |
| "reward_std": 0.14526695013046265, |
| "rewards/ngram_repetition2/mean": 0.9758055210113525, |
| "rewards/ngram_repetition2/std": 0.026150498539209366, |
| "rewards/ngram_repetition3/mean": 0.991034984588623, |
| "rewards/ngram_repetition3/std": 0.018479736521840096, |
| "rewards/symbolic_reward_accuracy/mean": 0.666015625, |
| "rewards/symbolic_reward_accuracy/std": 0.47174936532974243, |
| "rewards/symbolic_reward_partial_score/mean": 0.8898518681526184, |
| "rewards/symbolic_reward_partial_score/std": 0.18666595220565796, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9647442102432251, |
| "rewards/thinking_answer_ratio_reward/std": 0.018395813181996346, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.151930332183838, |
| "sampling/importance_sampling_ratio/min": 2.3768032406223938e-05, |
| "sampling/sampling_logp_difference/max": 10.64716911315918, |
| "sampling/sampling_logp_difference/mean": 0.242381751537323, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.34375, |
| "clip_ratio/high_mean": 0.21533203125, |
| "clip_ratio/low_mean": 0.16552734375, |
| "clip_ratio/low_min": 0.07421875, |
| "clip_ratio/region_mean": 0.380859375, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 480.0, |
| "completions/mean_length": 370.24658203125, |
| "completions/mean_terminated_length": 368.9267272949219, |
| "completions/min_length": 290.0, |
| "completions/min_terminated_length": 290.0, |
| "entropy": 0.47484372183680534, |
| "epoch": 0.30434782608695654, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02571369534600996, |
| "learning_rate": 1e-05, |
| "loss": 0.0019, |
| "num_tokens": 19915323.0, |
| "reward": 3.4752464294433594, |
| "reward_std": 0.1370949149131775, |
| "rewards/ngram_repetition2/mean": 0.9834737181663513, |
| "rewards/ngram_repetition2/std": 0.016693396493792534, |
| "rewards/ngram_repetition3/mean": 0.996229887008667, |
| "rewards/ngram_repetition3/std": 0.01113222073763609, |
| "rewards/symbolic_reward_accuracy/mean": 0.7587890625, |
| "rewards/symbolic_reward_accuracy/std": 0.42792245745658875, |
| "rewards/symbolic_reward_partial_score/mean": 0.9284260869026184, |
| "rewards/symbolic_reward_partial_score/std": 0.15036074817180634, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9689278602600098, |
| "rewards/thinking_answer_ratio_reward/std": 0.024484839290380478, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1540093421936035, |
| "sampling/importance_sampling_ratio/min": 4.7302735765697435e-05, |
| "sampling/sampling_logp_difference/max": 9.958942413330078, |
| "sampling/sampling_logp_difference/mean": 0.24644868075847626, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.3046875, |
| "clip_ratio/high_mean": 0.17724609375, |
| "clip_ratio/low_mean": 0.19384765625, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.37109375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 452.0, |
| "completions/max_terminated_length": 452.0, |
| "completions/mean_length": 361.70166015625, |
| "completions/mean_terminated_length": 361.70166015625, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "entropy": 0.4775677230209112, |
| "epoch": 0.34782608695652173, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.022611441969899414, |
| "learning_rate": 1e-05, |
| "loss": 0.0003, |
| "num_tokens": 22712344.0, |
| "reward": 3.4439804553985596, |
| "reward_std": 0.07519370317459106, |
| "rewards/ngram_repetition2/mean": 0.9849820137023926, |
| "rewards/ngram_repetition2/std": 0.015775341540575027, |
| "rewards/ngram_repetition3/mean": 0.9966925382614136, |
| "rewards/ngram_repetition3/std": 0.010486208833754063, |
| "rewards/symbolic_reward_accuracy/mean": 0.7451171875, |
| "rewards/symbolic_reward_accuracy/std": 0.4359017610549927, |
| "rewards/symbolic_reward_partial_score/mean": 0.9242349863052368, |
| "rewards/symbolic_reward_partial_score/std": 0.1511358916759491, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9694297313690186, |
| "rewards/thinking_answer_ratio_reward/std": 0.01129063218832016, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1553937196731567, |
| "sampling/importance_sampling_ratio/min": 1.3447781777031764e-10, |
| "sampling/sampling_logp_difference/max": 22.72962188720703, |
| "sampling/sampling_logp_difference/mean": 0.24660193920135498, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.296875, |
| "clip_ratio/high_mean": 0.18896484375, |
| "clip_ratio/low_mean": 0.18798828125, |
| "clip_ratio/low_min": 0.08203125, |
| "clip_ratio/region_mean": 0.376953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 457.0, |
| "completions/max_terminated_length": 457.0, |
| "completions/mean_length": 360.79736328125, |
| "completions/mean_terminated_length": 360.79736328125, |
| "completions/min_length": 286.0, |
| "completions/min_terminated_length": 286.0, |
| "entropy": 0.4877959694713354, |
| "epoch": 0.391304347826087, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02221078365492021, |
| "learning_rate": 1e-05, |
| "loss": 0.0004, |
| "num_tokens": 25517017.0, |
| "reward": 3.241891622543335, |
| "reward_std": 0.03938647359609604, |
| "rewards/ngram_repetition2/mean": 0.9857202768325806, |
| "rewards/ngram_repetition2/std": 0.013620593585073948, |
| "rewards/ngram_repetition3/mean": 0.9973255395889282, |
| "rewards/ngram_repetition3/std": 0.008434941992163658, |
| "rewards/symbolic_reward_accuracy/mean": 0.65234375, |
| "rewards/symbolic_reward_accuracy/std": 0.47634249925613403, |
| "rewards/symbolic_reward_partial_score/mean": 0.9076741933822632, |
| "rewards/symbolic_reward_partial_score/std": 0.141921728849411, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9699524641036987, |
| "rewards/thinking_answer_ratio_reward/std": 0.009303269907832146, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.156609296798706, |
| "sampling/importance_sampling_ratio/min": 1.4608130260995722e-09, |
| "sampling/sampling_logp_difference/max": 20.34427261352539, |
| "sampling/sampling_logp_difference/mean": 0.24704143404960632, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.31640625, |
| "clip_ratio/high_mean": 0.20263671875, |
| "clip_ratio/low_mean": 0.17431640625, |
| "clip_ratio/low_min": 0.0546875, |
| "clip_ratio/region_mean": 0.376953125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 470.0, |
| "completions/max_terminated_length": 470.0, |
| "completions/mean_length": 365.380859375, |
| "completions/mean_terminated_length": 365.380859375, |
| "completions/min_length": 289.0, |
| "completions/min_terminated_length": 289.0, |
| "entropy": 0.500596784055233, |
| "epoch": 0.43478260869565216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.028585581797482114, |
| "learning_rate": 1e-05, |
| "loss": 0.0003, |
| "num_tokens": 28302565.0, |
| "reward": 3.6249711513519287, |
| "reward_std": 0.031787335872650146, |
| "rewards/ngram_repetition2/mean": 0.9847173690795898, |
| "rewards/ngram_repetition2/std": 0.01566314324736595, |
| "rewards/ngram_repetition3/mean": 0.9967639446258545, |
| "rewards/ngram_repetition3/std": 0.010298742912709713, |
| "rewards/symbolic_reward_accuracy/mean": 0.8203125, |
| "rewards/symbolic_reward_accuracy/std": 0.38402071595191956, |
| "rewards/symbolic_reward_partial_score/mean": 0.954833984375, |
| "rewards/symbolic_reward_partial_score/std": 0.11018021404743195, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9697376489639282, |
| "rewards/thinking_answer_ratio_reward/std": 0.011102610267698765, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1601823568344116, |
| "sampling/importance_sampling_ratio/min": 2.0568222680594772e-05, |
| "sampling/sampling_logp_difference/max": 10.791763305664062, |
| "sampling/sampling_logp_difference/mean": 0.24917525053024292, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.26171875, |
| "clip_ratio/high_mean": 0.14892578125, |
| "clip_ratio/low_mean": 0.1982421875, |
| "clip_ratio/low_min": 0.08203125, |
| "clip_ratio/region_mean": 0.34716796875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 469.0, |
| "completions/max_terminated_length": 469.0, |
| "completions/mean_length": 373.4169921875, |
| "completions/mean_terminated_length": 373.4169921875, |
| "completions/min_length": 306.0, |
| "completions/min_terminated_length": 306.0, |
| "entropy": 0.5211230479180813, |
| "epoch": 0.4782608695652174, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.031003841790188075, |
| "learning_rate": 1e-05, |
| "loss": 0.0002, |
| "num_tokens": 31126747.0, |
| "reward": 3.277402639389038, |
| "reward_std": 0.036474019289016724, |
| "rewards/ngram_repetition2/mean": 0.9847090840339661, |
| "rewards/ngram_repetition2/std": 0.01521327905356884, |
| "rewards/ngram_repetition3/mean": 0.9967399835586548, |
| "rewards/ngram_repetition3/std": 0.00949870329350233, |
| "rewards/symbolic_reward_accuracy/mean": 0.6708984375, |
| "rewards/symbolic_reward_accuracy/std": 0.4700016975402832, |
| "rewards/symbolic_reward_partial_score/mean": 0.9060872793197632, |
| "rewards/symbolic_reward_partial_score/std": 0.16207432746887207, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9704017043113708, |
| "rewards/thinking_answer_ratio_reward/std": 0.010555099695920944, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1634962558746338, |
| "sampling/importance_sampling_ratio/min": 3.222545501557761e-06, |
| "sampling/sampling_logp_difference/max": 12.645339012145996, |
| "sampling/sampling_logp_difference/mean": 0.2521994411945343, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.31640625, |
| "clip_ratio/high_mean": 0.20166015625, |
| "clip_ratio/low_mean": 0.16064453125, |
| "clip_ratio/low_min": 0.05078125, |
| "clip_ratio/region_mean": 0.3623046875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 447.0, |
| "completions/max_terminated_length": 447.0, |
| "completions/mean_length": 376.16943359375, |
| "completions/mean_terminated_length": 376.16943359375, |
| "completions/min_length": 287.0, |
| "completions/min_terminated_length": 287.0, |
| "entropy": 0.5292842984199524, |
| "epoch": 0.5217391304347826, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.029177978249695414, |
| "learning_rate": 1e-05, |
| "loss": 0.0003, |
| "num_tokens": 33956566.0, |
| "reward": 3.5402560234069824, |
| "reward_std": 0.05406097322702408, |
| "rewards/ngram_repetition2/mean": 0.9857374429702759, |
| "rewards/ngram_repetition2/std": 0.011830438859760761, |
| "rewards/ngram_repetition3/mean": 0.9977642297744751, |
| "rewards/ngram_repetition3/std": 0.00712351780384779, |
| "rewards/symbolic_reward_accuracy/mean": 0.7822265625, |
| "rewards/symbolic_reward_accuracy/std": 0.4128333628177643, |
| "rewards/symbolic_reward_partial_score/mean": 0.9462483525276184, |
| "rewards/symbolic_reward_partial_score/std": 0.11659117788076401, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9719350934028625, |
| "rewards/thinking_answer_ratio_reward/std": 0.0077917324379086494, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1653231382369995, |
| "sampling/importance_sampling_ratio/min": 1.30699368128262e-06, |
| "sampling/sampling_logp_difference/max": 13.547780990600586, |
| "sampling/sampling_logp_difference/mean": 0.2539900541305542, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.30078125, |
| "clip_ratio/high_mean": 0.1875, |
| "clip_ratio/low_mean": 0.19287109375, |
| "clip_ratio/low_min": 0.05859375, |
| "clip_ratio/region_mean": 0.38037109375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 476.0, |
| "completions/max_terminated_length": 476.0, |
| "completions/mean_length": 375.197265625, |
| "completions/mean_terminated_length": 375.197265625, |
| "completions/min_length": 285.0, |
| "completions/min_terminated_length": 285.0, |
| "entropy": 0.5228982605040073, |
| "epoch": 0.5652173913043478, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.019909033791539, |
| "learning_rate": 1e-05, |
| "loss": 0.0002, |
| "num_tokens": 36800234.0, |
| "reward": 3.3348586559295654, |
| "reward_std": 0.031567975878715515, |
| "rewards/ngram_repetition2/mean": 0.9860277771949768, |
| "rewards/ngram_repetition2/std": 0.011108696460723877, |
| "rewards/ngram_repetition3/mean": 0.9980593919754028, |
| "rewards/ngram_repetition3/std": 0.006017275620251894, |
| "rewards/symbolic_reward_accuracy/mean": 0.69580078125, |
| "rewards/symbolic_reward_accuracy/std": 0.46017980575561523, |
| "rewards/symbolic_reward_partial_score/mean": 0.9136962890625, |
| "rewards/symbolic_reward_partial_score/std": 0.15295757353305817, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9719891548156738, |
| "rewards/thinking_answer_ratio_reward/std": 0.007201826199889183, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.165921688079834, |
| "sampling/importance_sampling_ratio/min": 7.888810068834573e-05, |
| "sampling/sampling_logp_difference/max": 9.447480201721191, |
| "sampling/sampling_logp_difference/mean": 0.2536054253578186, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.2890625, |
| "clip_ratio/high_mean": 0.19287109375, |
| "clip_ratio/low_mean": 0.18603515625, |
| "clip_ratio/low_min": 0.08203125, |
| "clip_ratio/region_mean": 0.37890625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 469.0, |
| "completions/max_terminated_length": 469.0, |
| "completions/mean_length": 373.00830078125, |
| "completions/mean_terminated_length": 373.00830078125, |
| "completions/min_length": 310.0, |
| "completions/min_terminated_length": 310.0, |
| "entropy": 0.5138954482972622, |
| "epoch": 0.6086956521739131, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.018577199260664073, |
| "learning_rate": 1e-05, |
| "loss": 0.0001, |
| "num_tokens": 39623579.0, |
| "reward": 3.4861793518066406, |
| "reward_std": 0.01453761849552393, |
| "rewards/ngram_repetition2/mean": 0.9885779619216919, |
| "rewards/ngram_repetition2/std": 0.008892485871911049, |
| "rewards/ngram_repetition3/mean": 0.9986856579780579, |
| "rewards/ngram_repetition3/std": 0.004819902591407299, |
| "rewards/symbolic_reward_accuracy/mean": 0.76318359375, |
| "rewards/symbolic_reward_accuracy/std": 0.42523249983787537, |
| "rewards/symbolic_reward_partial_score/mean": 0.9302164316177368, |
| "rewards/symbolic_reward_partial_score/std": 0.14360538125038147, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9723219871520996, |
| "rewards/thinking_answer_ratio_reward/std": 0.006340987980365753, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1633408069610596, |
| "sampling/importance_sampling_ratio/min": 9.12318591872463e-06, |
| "sampling/sampling_logp_difference/max": 11.604691505432129, |
| "sampling/sampling_logp_difference/mean": 0.2456064522266388, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.26953125, |
| "clip_ratio/high_mean": 0.17626953125, |
| "clip_ratio/low_mean": 0.18798828125, |
| "clip_ratio/low_min": 0.08984375, |
| "clip_ratio/region_mean": 0.3642578125, |
| "completions/clipped_ratio": 0.0009765625, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 473.0, |
| "completions/mean_length": 383.15380859375, |
| "completions/mean_terminated_length": 380.5254211425781, |
| "completions/min_length": 303.0, |
| "completions/min_terminated_length": 303.0, |
| "entropy": 0.5303183943033218, |
| "epoch": 0.6521739130434783, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.023767814234547865, |
| "learning_rate": 1e-05, |
| "loss": 0.0032, |
| "num_tokens": 42464534.0, |
| "reward": 3.4200026988983154, |
| "reward_std": 0.044837385416030884, |
| "rewards/ngram_repetition2/mean": 0.9886395931243896, |
| "rewards/ngram_repetition2/std": 0.026615649461746216, |
| "rewards/ngram_repetition3/mean": 0.9978044629096985, |
| "rewards/ngram_repetition3/std": 0.02556409314274788, |
| "rewards/symbolic_reward_accuracy/mean": 0.72705078125, |
| "rewards/symbolic_reward_accuracy/std": 0.4455837607383728, |
| "rewards/symbolic_reward_partial_score/mean": 0.9368082284927368, |
| "rewards/symbolic_reward_partial_score/std": 0.12158261984586716, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.971657395362854, |
| "rewards/thinking_answer_ratio_reward/std": 0.031062643975019455, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1663137674331665, |
| "sampling/importance_sampling_ratio/min": 0.00022333291417453438, |
| "sampling/sampling_logp_difference/max": 8.40684700012207, |
| "sampling/sampling_logp_difference/mean": 0.2500694990158081, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.1826171875, |
| "clip_ratio/low_mean": 0.1728515625, |
| "clip_ratio/low_min": 0.0625, |
| "clip_ratio/region_mean": 0.35546875, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 539.0, |
| "completions/mean_length": 386.85986328125, |
| "completions/mean_terminated_length": 385.5481262207031, |
| "completions/min_length": 326.0, |
| "completions/min_terminated_length": 326.0, |
| "entropy": 0.5535794571042061, |
| "epoch": 0.6956521739130435, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.033414142847404786, |
| "learning_rate": 1e-05, |
| "loss": 0.0016, |
| "num_tokens": 45319415.0, |
| "reward": 3.5498650074005127, |
| "reward_std": 0.044402044266462326, |
| "rewards/ngram_repetition2/mean": 0.9896550178527832, |
| "rewards/ngram_repetition2/std": 0.01944366842508316, |
| "rewards/ngram_repetition3/mean": 0.998386025428772, |
| "rewards/ngram_repetition3/std": 0.017801163718104362, |
| "rewards/symbolic_reward_accuracy/mean": 0.78857421875, |
| "rewards/symbolic_reward_accuracy/std": 0.40841934084892273, |
| "rewards/symbolic_reward_partial_score/mean": 0.943359375, |
| "rewards/symbolic_reward_partial_score/std": 0.1245667040348053, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9720906019210815, |
| "rewards/thinking_answer_ratio_reward/std": 0.022436225786805153, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1719954013824463, |
| "sampling/importance_sampling_ratio/min": 3.318129529361613e-05, |
| "sampling/sampling_logp_difference/max": 10.31352424621582, |
| "sampling/sampling_logp_difference/mean": 0.25759539008140564, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.3203125, |
| "clip_ratio/high_mean": 0.21630859375, |
| "clip_ratio/low_mean": 0.16748046875, |
| "clip_ratio/low_min": 0.0625, |
| "clip_ratio/region_mean": 0.3837890625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 513.0, |
| "completions/max_terminated_length": 513.0, |
| "completions/mean_length": 394.04443359375, |
| "completions/mean_terminated_length": 394.04443359375, |
| "completions/min_length": 325.0, |
| "completions/min_terminated_length": 325.0, |
| "entropy": 0.5683293081820011, |
| "epoch": 0.7391304347826086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.02077882386260629, |
| "learning_rate": 1e-05, |
| "loss": -0.0001, |
| "num_tokens": 48182674.0, |
| "reward": 3.4722495079040527, |
| "reward_std": 0.027738399803638458, |
| "rewards/ngram_repetition2/mean": 0.990430474281311, |
| "rewards/ngram_repetition2/std": 0.008477425202727318, |
| "rewards/ngram_repetition3/mean": 0.998767614364624, |
| "rewards/ngram_repetition3/std": 0.004312796052545309, |
| "rewards/symbolic_reward_accuracy/mean": 0.75390625, |
| "rewards/symbolic_reward_accuracy/std": 0.43083900213241577, |
| "rewards/symbolic_reward_partial_score/mean": 0.934814453125, |
| "rewards/symbolic_reward_partial_score/std": 0.1286529004573822, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9730780124664307, |
| "rewards/thinking_answer_ratio_reward/std": 0.006791951600462198, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.174986720085144, |
| "sampling/importance_sampling_ratio/min": 5.481481275637634e-05, |
| "sampling/sampling_logp_difference/max": 9.81155014038086, |
| "sampling/sampling_logp_difference/mean": 0.2610657513141632, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.23046875, |
| "clip_ratio/high_mean": 0.13134765625, |
| "clip_ratio/low_mean": 0.203125, |
| "clip_ratio/low_min": 0.09765625, |
| "clip_ratio/region_mean": 0.33447265625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 525.0, |
| "completions/max_terminated_length": 525.0, |
| "completions/mean_length": 402.0390625, |
| "completions/mean_terminated_length": 402.0390625, |
| "completions/min_length": 305.0, |
| "completions/min_terminated_length": 305.0, |
| "entropy": 0.5930491574108601, |
| "epoch": 0.782608695652174, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.0719362481523671, |
| "learning_rate": 1e-05, |
| "loss": 0.0002, |
| "num_tokens": 51030626.0, |
| "reward": 3.507338047027588, |
| "reward_std": 0.07821064442396164, |
| "rewards/ngram_repetition2/mean": 0.9908634424209595, |
| "rewards/ngram_repetition2/std": 0.006825427990406752, |
| "rewards/ngram_repetition3/mean": 0.9992104768753052, |
| "rewards/ngram_repetition3/std": 0.0024387796875089407, |
| "rewards/symbolic_reward_accuracy/mean": 0.7705078125, |
| "rewards/symbolic_reward_accuracy/std": 0.4206089675426483, |
| "rewards/symbolic_reward_partial_score/mean": 0.9366861581802368, |
| "rewards/symbolic_reward_partial_score/std": 0.12974172830581665, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9735254049301147, |
| "rewards/thinking_answer_ratio_reward/std": 0.006129096262156963, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1793080568313599, |
| "sampling/importance_sampling_ratio/min": 8.93013350378169e-07, |
| "sampling/sampling_logp_difference/max": 13.928664207458496, |
| "sampling/sampling_logp_difference/mean": 0.2646637558937073, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.16796875, |
| "clip_ratio/high_mean": 0.10595703125, |
| "clip_ratio/low_mean": 0.2255859375, |
| "clip_ratio/low_min": 0.11328125, |
| "clip_ratio/region_mean": 0.33154296875, |
| "completions/clipped_ratio": 0.00634765625, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 664.0, |
| "completions/mean_length": 425.64794921875, |
| "completions/mean_terminated_length": 408.74249267578125, |
| "completions/min_length": 326.0, |
| "completions/min_terminated_length": 326.0, |
| "entropy": 0.6297206245362759, |
| "epoch": 0.8260869565217391, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17694802563627895, |
| "learning_rate": 1e-05, |
| "loss": 0.0207, |
| "num_tokens": 53958609.0, |
| "reward": 3.434380054473877, |
| "reward_std": 0.15169459581375122, |
| "rewards/ngram_repetition2/mean": 0.986931562423706, |
| "rewards/ngram_repetition2/std": 0.04639098048210144, |
| "rewards/ngram_repetition3/mean": 0.9962227940559387, |
| "rewards/ngram_repetition3/std": 0.046196915209293365, |
| "rewards/symbolic_reward_accuracy/mean": 0.74072265625, |
| "rewards/symbolic_reward_accuracy/std": 0.4383451044559479, |
| "rewards/symbolic_reward_partial_score/mean": 0.9265950918197632, |
| "rewards/symbolic_reward_partial_score/std": 0.153251513838768, |
| "rewards/tag_count_reward/mean": 0.996826171875, |
| "rewards/tag_count_reward/std": 0.03971915319561958, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9682024717330933, |
| "rewards/thinking_answer_ratio_reward/std": 0.07753153145313263, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1941275596618652, |
| "sampling/importance_sampling_ratio/min": 4.905196692561731e-05, |
| "sampling/sampling_logp_difference/max": 9.922630310058594, |
| "sampling/sampling_logp_difference/mean": 0.27940261363983154, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.2109375, |
| "clip_ratio/high_mean": 0.125, |
| "clip_ratio/low_mean": 0.1884765625, |
| "clip_ratio/low_min": 0.07421875, |
| "clip_ratio/region_mean": 0.3134765625, |
| "completions/clipped_ratio": 0.025390625, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 846.0, |
| "completions/mean_length": 474.76806640625, |
| "completions/mean_terminated_length": 407.1047058105469, |
| "completions/min_length": 308.0, |
| "completions/min_terminated_length": 308.0, |
| "entropy": 0.6249676272273064, |
| "epoch": 0.8695652173913043, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.036438229713036584, |
| "learning_rate": 1e-05, |
| "loss": 0.0733, |
| "num_tokens": 56952342.0, |
| "reward": 3.3732757568359375, |
| "reward_std": 0.3173109292984009, |
| "rewards/ngram_repetition2/mean": 0.9734116792678833, |
| "rewards/ngram_repetition2/std": 0.09751972556114197, |
| "rewards/ngram_repetition3/mean": 0.9850020408630371, |
| "rewards/ngram_repetition3/std": 0.09716872870922089, |
| "rewards/symbolic_reward_accuracy/mean": 0.73046875, |
| "rewards/symbolic_reward_accuracy/std": 0.4438246786594391, |
| "rewards/symbolic_reward_partial_score/mean": 0.8959553837776184, |
| "rewards/symbolic_reward_partial_score/std": 0.21540075540542603, |
| "rewards/tag_count_reward/mean": 0.9873046875, |
| "rewards/tag_count_reward/std": 0.07867342233657837, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9493899941444397, |
| "rewards/thinking_answer_ratio_reward/std": 0.15333302319049835, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.1926825046539307, |
| "sampling/importance_sampling_ratio/min": 1.8732294847723097e-05, |
| "sampling/sampling_logp_difference/max": 10.885261535644531, |
| "sampling/sampling_logp_difference/mean": 0.27355605363845825, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.16015625, |
| "clip_ratio/high_mean": 0.08154296875, |
| "clip_ratio/low_mean": 0.1630859375, |
| "clip_ratio/low_min": 0.0546875, |
| "clip_ratio/region_mean": 0.24462890625, |
| "completions/clipped_ratio": 0.041015625, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 1910.0, |
| "completions/mean_length": 513.650390625, |
| "completions/mean_terminated_length": 404.23016357421875, |
| "completions/min_length": 283.0, |
| "completions/min_terminated_length": 283.0, |
| "entropy": 0.6995769254863262, |
| "epoch": 0.9130434782608695, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3126366897938619, |
| "learning_rate": 1e-05, |
| "loss": 0.1153, |
| "num_tokens": 60057386.0, |
| "reward": 3.4059977531433105, |
| "reward_std": 0.4714565873146057, |
| "rewards/ngram_repetition2/mean": 0.9679386615753174, |
| "rewards/ngram_repetition2/std": 0.09275452047586441, |
| "rewards/ngram_repetition3/mean": 0.984743595123291, |
| "rewards/ngram_repetition3/std": 0.09202881902456284, |
| "rewards/symbolic_reward_accuracy/mean": 0.7451171875, |
| "rewards/symbolic_reward_accuracy/std": 0.4359017610549927, |
| "rewards/symbolic_reward_partial_score/mean": 0.9079182744026184, |
| "rewards/symbolic_reward_partial_score/std": 0.21840326488018036, |
| "rewards/tag_count_reward/mean": 0.97900390625, |
| "rewards/tag_count_reward/std": 0.10031013935804367, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9314362406730652, |
| "rewards/thinking_answer_ratio_reward/std": 0.19521328806877136, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.2058464288711548, |
| "sampling/importance_sampling_ratio/min": 0.00020593531371559948, |
| "sampling/sampling_logp_difference/max": 8.487948417663574, |
| "sampling/sampling_logp_difference/mean": 0.2925470471382141, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.04296875, |
| "clip_ratio/high_mean": 0.013671875, |
| "clip_ratio/low_mean": 0.30078125, |
| "clip_ratio/low_min": 0.16796875, |
| "clip_ratio/region_mean": 0.314453125, |
| "completions/clipped_ratio": 0.00146484375, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 578.0, |
| "completions/mean_length": 382.1064453125, |
| "completions/mean_terminated_length": 378.160400390625, |
| "completions/min_length": 266.0, |
| "completions/min_terminated_length": 266.0, |
| "entropy": 0.7504777312278748, |
| "epoch": 0.9565217391304348, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.08070835974712572, |
| "learning_rate": 1e-05, |
| "loss": 0.0066, |
| "num_tokens": 62905700.0, |
| "reward": 3.419804811477661, |
| "reward_std": 0.09055649489164352, |
| "rewards/ngram_repetition2/mean": 0.980319619178772, |
| "rewards/ngram_repetition2/std": 0.011525592766702175, |
| "rewards/ngram_repetition3/mean": 0.9975647330284119, |
| "rewards/ngram_repetition3/std": 0.005651315674185753, |
| "rewards/symbolic_reward_accuracy/mean": 0.732421875, |
| "rewards/symbolic_reward_accuracy/std": 0.4428044855594635, |
| "rewards/symbolic_reward_partial_score/mean": 0.9267171025276184, |
| "rewards/symbolic_reward_partial_score/std": 0.14161133766174316, |
| "rewards/tag_count_reward/mean": 0.998779296875, |
| "rewards/tag_count_reward/std": 0.02468114346265793, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9685767292976379, |
| "rewards/thinking_answer_ratio_reward/std": 0.04834046587347984, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.249918818473816, |
| "sampling/importance_sampling_ratio/min": 0.0003460382577031851, |
| "sampling/sampling_logp_difference/max": 7.968961238861084, |
| "sampling/sampling_logp_difference/mean": 0.3476927876472473, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.140625, |
| "clip_ratio/high_mean": 0.07470703125, |
| "clip_ratio/low_mean": 0.2451171875, |
| "clip_ratio/low_min": 0.12890625, |
| "clip_ratio/region_mean": 0.31982421875, |
| "completions/clipped_ratio": 0.00146484375, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 579.0, |
| "completions/mean_length": 347.271484375, |
| "completions/mean_terminated_length": 343.2743225097656, |
| "completions/min_length": 187.0, |
| "completions/min_terminated_length": 187.0, |
| "entropy": 0.8337125517427921, |
| "epoch": 1.0, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.09077319995516756, |
| "learning_rate": 1e-05, |
| "loss": 0.0058, |
| "num_tokens": 65689008.0, |
| "reward": 3.366065502166748, |
| "reward_std": 0.12272138148546219, |
| "rewards/ngram_repetition2/mean": 0.9778505563735962, |
| "rewards/ngram_repetition2/std": 0.02180611714720726, |
| "rewards/ngram_repetition3/mean": 0.9966850280761719, |
| "rewards/ngram_repetition3/std": 0.01893492229282856, |
| "rewards/symbolic_reward_accuracy/mean": 0.7119140625, |
| "rewards/symbolic_reward_accuracy/std": 0.4529819190502167, |
| "rewards/symbolic_reward_partial_score/mean": 0.91357421875, |
| "rewards/symbolic_reward_partial_score/std": 0.1592041552066803, |
| "rewards/tag_count_reward/mean": 0.999267578125, |
| "rewards/tag_count_reward/std": 0.019127286970615387, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9650210738182068, |
| "rewards/thinking_answer_ratio_reward/std": 0.03827888146042824, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.254873275756836, |
| "sampling/importance_sampling_ratio/min": 0.00030355059425346553, |
| "sampling/sampling_logp_difference/max": 8.09996223449707, |
| "sampling/sampling_logp_difference/mean": 0.347744882106781, |
| "step": 92 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.0008223684210526315, |
| "eval_completions/max_length": 730.6842105263158, |
| "eval_completions/max_terminated_length": 454.36842105263156, |
| "eval_completions/mean_length": 323.31825657894734, |
| "eval_completions/mean_terminated_length": 321.05658762078536, |
| "eval_completions/min_length": 223.05263157894737, |
| "eval_completions/min_terminated_length": 223.05263157894737, |
| "eval_entropy": 0.8378904399118925, |
| "eval_frac_reward_zero_std": 0.0, |
| "eval_loss": 0.0008370681316591799, |
| "eval_num_tokens": 65689008.0, |
| "eval_reward": 3.2917319975401225, |
| "eval_reward_std": 0.11801875697749079, |
| "eval_rewards/ngram_repetition2/mean": 0.9756150371149966, |
| "eval_rewards/ngram_repetition2/std": 0.01365309997804855, |
| "eval_rewards/ngram_repetition3/mean": 0.9962926067804035, |
| "eval_rewards/ngram_repetition3/std": 0.006524833691257395, |
| "eval_rewards/symbolic_reward_accuracy/mean": 0.678453947368421, |
| "eval_rewards/symbolic_reward_accuracy/std": 0.4427479756505866, |
| "eval_rewards/symbolic_reward_partial_score/mean": 0.9067297138665852, |
| "eval_rewards/symbolic_reward_partial_score/std": 0.1448917659489732, |
| "eval_rewards/tag_count_reward/mean": 0.998766447368421, |
| "eval_rewards/tag_count_reward/std": 0.012580533757021553, |
| "eval_rewards/thinking_answer_ratio_reward/mean": 0.9608869458499708, |
| "eval_rewards/thinking_answer_ratio_reward/std": 0.03243385211221481, |
| "eval_runtime": 180.375, |
| "eval_samples_per_second": 0.832, |
| "eval_sampling/importance_sampling_ratio/max": 2.0, |
| "eval_sampling/importance_sampling_ratio/mean": 1.2766095650823492, |
| "eval_sampling/importance_sampling_ratio/min": 0.0030842775350289516, |
| "eval_sampling/sampling_logp_difference/max": 5.888615081184788, |
| "eval_sampling/sampling_logp_difference/mean": 0.37607322868547943, |
| "eval_steps_per_second": 0.011, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.1796875, |
| "clip_ratio/high_mean": 0.08984375, |
| "clip_ratio/low_mean": 0.23876953125, |
| "clip_ratio/low_min": 0.10546875, |
| "clip_ratio/region_mean": 0.32861328125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 484.0, |
| "completions/max_terminated_length": 484.0, |
| "completions/mean_length": 318.796875, |
| "completions/mean_terminated_length": 318.796875, |
| "completions/min_length": 188.0, |
| "completions/min_terminated_length": 188.0, |
| "entropy": 0.8296824619174004, |
| "epoch": 1.0434782608695652, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.035556589091200325, |
| "learning_rate": 1e-05, |
| "loss": 0.0004, |
| "num_tokens": 68391824.0, |
| "reward": 3.526369333267212, |
| "reward_std": 0.10698950290679932, |
| "rewards/ngram_repetition2/mean": 0.976723313331604, |
| "rewards/ngram_repetition2/std": 0.013903986662626266, |
| "rewards/ngram_repetition3/mean": 0.9966170787811279, |
| "rewards/ngram_repetition3/std": 0.007368884980678558, |
| "rewards/symbolic_reward_accuracy/mean": 0.779296875, |
| "rewards/symbolic_reward_accuracy/std": 0.414821982383728, |
| "rewards/symbolic_reward_partial_score/mean": 0.9383952021598816, |
| "rewards/symbolic_reward_partial_score/std": 0.13407477736473083, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9647086262702942, |
| "rewards/thinking_answer_ratio_reward/std": 0.010662911459803581, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.277923345565796, |
| "sampling/importance_sampling_ratio/min": 0.0010678451508283615, |
| "sampling/sampling_logp_difference/max": 6.8421125411987305, |
| "sampling/sampling_logp_difference/mean": 0.37708646059036255, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.3515625, |
| "clip_ratio/high_mean": 0.21728515625, |
| "clip_ratio/low_mean": 0.1474609375, |
| "clip_ratio/low_min": 0.05078125, |
| "clip_ratio/region_mean": 0.36474609375, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 474.0, |
| "completions/mean_length": 312.2705078125, |
| "completions/mean_terminated_length": 310.9223327636719, |
| "completions/min_length": 199.0, |
| "completions/min_terminated_length": 199.0, |
| "entropy": 0.8829836808145046, |
| "epoch": 1.0869565217391304, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.026459033753815097, |
| "learning_rate": 1e-05, |
| "loss": 0.0015, |
| "num_tokens": 71097114.0, |
| "reward": 3.375732421875, |
| "reward_std": 0.16172271966934204, |
| "rewards/ngram_repetition2/mean": 0.9744597673416138, |
| "rewards/ngram_repetition2/std": 0.013245878741145134, |
| "rewards/ngram_repetition3/mean": 0.996464729309082, |
| "rewards/ngram_repetition3/std": 0.006197268608957529, |
| "rewards/symbolic_reward_accuracy/mean": 0.71337890625, |
| "rewards/symbolic_reward_accuracy/std": 0.45229339599609375, |
| "rewards/symbolic_reward_partial_score/mean": 0.9201253056526184, |
| "rewards/symbolic_reward_partial_score/std": 0.14969654381275177, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9628262519836426, |
| "rewards/thinking_answer_ratio_reward/std": 0.03255803510546684, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.280361533164978, |
| "sampling/importance_sampling_ratio/min": 0.000973947171587497, |
| "sampling/sampling_logp_difference/max": 6.9341535568237305, |
| "sampling/sampling_logp_difference/mean": 0.3825719654560089, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.2421875, |
| "clip_ratio/high_mean": 0.1552734375, |
| "clip_ratio/low_mean": 0.21044921875, |
| "clip_ratio/low_min": 0.1015625, |
| "clip_ratio/region_mean": 0.36572265625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 507.0, |
| "completions/max_terminated_length": 507.0, |
| "completions/mean_length": 305.24658203125, |
| "completions/mean_terminated_length": 305.24658203125, |
| "completions/min_length": 184.0, |
| "completions/min_terminated_length": 184.0, |
| "entropy": 0.93367725238204, |
| "epoch": 1.1304347826086956, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.023622811381027897, |
| "learning_rate": 1e-05, |
| "loss": -0.0013, |
| "num_tokens": 73753171.0, |
| "reward": 3.5650792121887207, |
| "reward_std": 0.07873280346393585, |
| "rewards/ngram_repetition2/mean": 0.9779536724090576, |
| "rewards/ngram_repetition2/std": 0.011236137710511684, |
| "rewards/ngram_repetition3/mean": 0.997450590133667, |
| "rewards/ngram_repetition3/std": 0.004056216217577457, |
| "rewards/symbolic_reward_accuracy/mean": 0.79541015625, |
| "rewards/symbolic_reward_accuracy/std": 0.40350010991096497, |
| "rewards/symbolic_reward_partial_score/mean": 0.9448649287223816, |
| "rewards/symbolic_reward_partial_score/std": 0.12454802542924881, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9639977812767029, |
| "rewards/thinking_answer_ratio_reward/std": 0.010575964115560055, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.2900059223175049, |
| "sampling/importance_sampling_ratio/min": 0.00017328630201518536, |
| "sampling/sampling_logp_difference/max": 8.660565376281738, |
| "sampling/sampling_logp_difference/mean": 0.3951420187950134, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.3046875, |
| "clip_ratio/high_mean": 0.193359375, |
| "clip_ratio/low_mean": 0.1904296875, |
| "clip_ratio/low_min": 0.07421875, |
| "clip_ratio/region_mean": 0.3837890625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 538.0, |
| "completions/max_terminated_length": 538.0, |
| "completions/mean_length": 312.146484375, |
| "completions/mean_terminated_length": 312.146484375, |
| "completions/min_length": 181.0, |
| "completions/min_terminated_length": 181.0, |
| "entropy": 1.0210995934903622, |
| "epoch": 1.1739130434782608, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.028388966774426528, |
| "learning_rate": 1e-05, |
| "loss": -0.0009, |
| "num_tokens": 76420191.0, |
| "reward": 3.539149761199951, |
| "reward_std": 0.10342732071876526, |
| "rewards/ngram_repetition2/mean": 0.9775739908218384, |
| "rewards/ngram_repetition2/std": 0.011158421635627747, |
| "rewards/ngram_repetition3/mean": 0.99737548828125, |
| "rewards/ngram_repetition3/std": 0.0038673817180097103, |
| "rewards/symbolic_reward_accuracy/mean": 0.78369140625, |
| "rewards/symbolic_reward_accuracy/std": 0.4118276536464691, |
| "rewards/symbolic_reward_partial_score/mean": 0.942626953125, |
| "rewards/symbolic_reward_partial_score/std": 0.12709258496761322, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9634621739387512, |
| "rewards/thinking_answer_ratio_reward/std": 0.02338651940226555, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.30389404296875, |
| "sampling/importance_sampling_ratio/min": 0.0012677250197157264, |
| "sampling/sampling_logp_difference/max": 6.670531272888184, |
| "sampling/sampling_logp_difference/mean": 0.4129188656806946, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.26171875, |
| "clip_ratio/high_mean": 0.15966796875, |
| "clip_ratio/low_mean": 0.212890625, |
| "clip_ratio/low_min": 0.10546875, |
| "clip_ratio/region_mean": 0.37255859375, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 1036.0, |
| "completions/mean_length": 323.71337890625, |
| "completions/mean_terminated_length": 322.37078857421875, |
| "completions/min_length": 186.0, |
| "completions/min_terminated_length": 186.0, |
| "entropy": 1.1148979514837265, |
| "epoch": 1.2173913043478262, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.025973854193448703, |
| "learning_rate": 1e-05, |
| "loss": 0.0012, |
| "num_tokens": 79101396.0, |
| "reward": 3.5500121116638184, |
| "reward_std": 0.07026512920856476, |
| "rewards/ngram_repetition2/mean": 0.9797195792198181, |
| "rewards/ngram_repetition2/std": 0.011294333264231682, |
| "rewards/ngram_repetition3/mean": 0.9979097843170166, |
| "rewards/ngram_repetition3/std": 0.003578017931431532, |
| "rewards/symbolic_reward_accuracy/mean": 0.7890625, |
| "rewards/symbolic_reward_accuracy/std": 0.408073753118515, |
| "rewards/symbolic_reward_partial_score/mean": 0.9429525136947632, |
| "rewards/symbolic_reward_partial_score/std": 0.12270573526620865, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9646666049957275, |
| "rewards/thinking_answer_ratio_reward/std": 0.03140328451991081, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3225877285003662, |
| "sampling/importance_sampling_ratio/min": 0.0020175804384052753, |
| "sampling/sampling_logp_difference/max": 6.2058563232421875, |
| "sampling/sampling_logp_difference/mean": 0.43698880076408386, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.30859375, |
| "clip_ratio/high_mean": 0.16650390625, |
| "clip_ratio/low_mean": 0.19873046875, |
| "clip_ratio/low_min": 0.0703125, |
| "clip_ratio/region_mean": 0.365234375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 609.0, |
| "completions/max_terminated_length": 609.0, |
| "completions/mean_length": 321.759765625, |
| "completions/mean_terminated_length": 321.759765625, |
| "completions/min_length": 191.0, |
| "completions/min_terminated_length": 191.0, |
| "entropy": 1.1531813517212868, |
| "epoch": 1.2608695652173914, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.03531320773224689, |
| "learning_rate": 1e-05, |
| "loss": -0.0021, |
| "num_tokens": 81835624.0, |
| "reward": 3.5205469131469727, |
| "reward_std": 0.06826324760913849, |
| "rewards/ngram_repetition2/mean": 0.9809833765029907, |
| "rewards/ngram_repetition2/std": 0.009805475361645222, |
| "rewards/ngram_repetition3/mean": 0.998246431350708, |
| "rewards/ngram_repetition3/std": 0.003229390596970916, |
| "rewards/symbolic_reward_accuracy/mean": 0.77392578125, |
| "rewards/symbolic_reward_accuracy/std": 0.4183899462223053, |
| "rewards/symbolic_reward_partial_score/mean": 0.9432373046875, |
| "rewards/symbolic_reward_partial_score/std": 0.1203688383102417, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9665986895561218, |
| "rewards/thinking_answer_ratio_reward/std": 0.007385050877928734, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3322265148162842, |
| "sampling/importance_sampling_ratio/min": 0.0010348489740863442, |
| "sampling/sampling_logp_difference/max": 6.873499870300293, |
| "sampling/sampling_logp_difference/mean": 0.44727569818496704, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.2890625, |
| "clip_ratio/high_mean": 0.1962890625, |
| "clip_ratio/low_mean": 0.1875, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.3837890625, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 723.0, |
| "completions/mean_length": 349.056640625, |
| "completions/mean_terminated_length": 347.7264404296875, |
| "completions/min_length": 210.0, |
| "completions/min_terminated_length": 210.0, |
| "entropy": 1.2782283127307892, |
| "epoch": 1.3043478260869565, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.034579763552097215, |
| "learning_rate": 1e-05, |
| "loss": 0.0015, |
| "num_tokens": 84613084.0, |
| "reward": 3.4549241065979004, |
| "reward_std": 0.06410142034292221, |
| "rewards/ngram_repetition2/mean": 0.9802985191345215, |
| "rewards/ngram_repetition2/std": 0.010372255928814411, |
| "rewards/ngram_repetition3/mean": 0.9981052875518799, |
| "rewards/ngram_repetition3/std": 0.0034116564784199, |
| "rewards/symbolic_reward_accuracy/mean": 0.7470703125, |
| "rewards/symbolic_reward_accuracy/std": 0.43479716777801514, |
| "rewards/symbolic_reward_partial_score/mean": 0.9315592050552368, |
| "rewards/symbolic_reward_partial_score/std": 0.14198674261569977, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9684212803840637, |
| "rewards/thinking_answer_ratio_reward/std": 0.022755270823836327, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3524930477142334, |
| "sampling/importance_sampling_ratio/min": 0.0009223732049576938, |
| "sampling/sampling_logp_difference/max": 6.988560676574707, |
| "sampling/sampling_logp_difference/mean": 0.4782055616378784, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.25390625, |
| "clip_ratio/high_mean": 0.13916015625, |
| "clip_ratio/low_mean": 0.23193359375, |
| "clip_ratio/low_min": 0.11328125, |
| "clip_ratio/region_mean": 0.37109375, |
| "completions/clipped_ratio": 0.00146484375, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 644.0, |
| "completions/mean_length": 365.08154296875, |
| "completions/mean_terminated_length": 361.1105041503906, |
| "completions/min_length": 207.0, |
| "completions/min_terminated_length": 207.0, |
| "entropy": 1.3577501401305199, |
| "epoch": 1.3478260869565217, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.03263029986961485, |
| "learning_rate": 1e-05, |
| "loss": 0.0038, |
| "num_tokens": 87413859.0, |
| "reward": 3.3977296352386475, |
| "reward_std": 0.07217299938201904, |
| "rewards/ngram_repetition2/mean": 0.9815422296524048, |
| "rewards/ngram_repetition2/std": 0.017534635961055756, |
| "rewards/ngram_repetition3/mean": 0.9980996251106262, |
| "rewards/ngram_repetition3/std": 0.014925251714885235, |
| "rewards/symbolic_reward_accuracy/mean": 0.72265625, |
| "rewards/symbolic_reward_accuracy/std": 0.44779694080352783, |
| "rewards/symbolic_reward_partial_score/mean": 0.9239094853401184, |
| "rewards/symbolic_reward_partial_score/std": 0.14968585968017578, |
| "rewards/tag_count_reward/mean": 0.9990234375, |
| "rewards/tag_count_reward/std": 0.022080888971686363, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9687855243682861, |
| "rewards/thinking_answer_ratio_reward/std": 0.043307721614837646, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3644728660583496, |
| "sampling/importance_sampling_ratio/min": 4.628046553989407e-06, |
| "sampling/sampling_logp_difference/max": 12.28337574005127, |
| "sampling/sampling_logp_difference/mean": 0.4966655373573303, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.26953125, |
| "clip_ratio/high_mean": 0.1572265625, |
| "clip_ratio/low_mean": 0.189453125, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.3466796875, |
| "completions/clipped_ratio": 0.001953125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 662.0, |
| "completions/mean_length": 374.17041015625, |
| "completions/mean_terminated_length": 368.8908996582031, |
| "completions/min_length": 236.0, |
| "completions/min_terminated_length": 236.0, |
| "entropy": 1.4103393778204918, |
| "epoch": 1.391304347826087, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.03271499886368749, |
| "learning_rate": 1e-05, |
| "loss": 0.0068, |
| "num_tokens": 90233248.0, |
| "reward": 3.470027446746826, |
| "reward_std": 0.08837710320949554, |
| "rewards/ngram_repetition2/mean": 0.9828887581825256, |
| "rewards/ngram_repetition2/std": 0.008584595285356045, |
| "rewards/ngram_repetition3/mean": 0.9987025260925293, |
| "rewards/ngram_repetition3/std": 0.0025142852682620287, |
| "rewards/symbolic_reward_accuracy/mean": 0.75390625, |
| "rewards/symbolic_reward_accuracy/std": 0.43083900213241577, |
| "rewards/symbolic_reward_partial_score/mean": 0.9336751103401184, |
| "rewards/symbolic_reward_partial_score/std": 0.1408122330904007, |
| "rewards/tag_count_reward/mean": 0.9990234375, |
| "rewards/tag_count_reward/std": 0.022080888971686363, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9700585007667542, |
| "rewards/thinking_answer_ratio_reward/std": 0.043263670057058334, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3686330318450928, |
| "sampling/importance_sampling_ratio/min": 1.2951417147633038e-06, |
| "sampling/sampling_logp_difference/max": 13.556890487670898, |
| "sampling/sampling_logp_difference/mean": 0.5069293975830078, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.28515625, |
| "clip_ratio/high_mean": 0.1611328125, |
| "clip_ratio/low_mean": 0.2099609375, |
| "clip_ratio/low_min": 0.1015625, |
| "clip_ratio/region_mean": 0.37109375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 640.0, |
| "completions/max_terminated_length": 640.0, |
| "completions/mean_length": 372.28515625, |
| "completions/mean_terminated_length": 372.28515625, |
| "completions/min_length": 222.0, |
| "completions/min_terminated_length": 222.0, |
| "entropy": 1.4540528357028961, |
| "epoch": 1.434782608695652, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.037706615340187905, |
| "learning_rate": 1e-05, |
| "loss": -0.0007, |
| "num_tokens": 93048776.0, |
| "reward": 3.6577959060668945, |
| "reward_std": 0.08277644217014313, |
| "rewards/ngram_repetition2/mean": 0.9837426543235779, |
| "rewards/ngram_repetition2/std": 0.008059200830757618, |
| "rewards/ngram_repetition3/mean": 0.9987409710884094, |
| "rewards/ngram_repetition3/std": 0.002457347232848406, |
| "rewards/symbolic_reward_accuracy/mean": 0.8359375, |
| "rewards/symbolic_reward_accuracy/std": 0.37042272090911865, |
| "rewards/symbolic_reward_partial_score/mean": 0.9568685293197632, |
| "rewards/symbolic_reward_partial_score/std": 0.11145035177469254, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9716061353683472, |
| "rewards/thinking_answer_ratio_reward/std": 0.03078635036945343, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3747305870056152, |
| "sampling/importance_sampling_ratio/min": 6.537237368320348e-06, |
| "sampling/sampling_logp_difference/max": 11.937995910644531, |
| "sampling/sampling_logp_difference/mean": 0.5204066634178162, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.24609375, |
| "clip_ratio/high_mean": 0.1455078125, |
| "clip_ratio/low_mean": 0.2216796875, |
| "clip_ratio/low_min": 0.109375, |
| "clip_ratio/region_mean": 0.3671875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 690.0, |
| "completions/max_terminated_length": 690.0, |
| "completions/mean_length": 384.67431640625, |
| "completions/mean_terminated_length": 384.67431640625, |
| "completions/min_length": 243.0, |
| "completions/min_terminated_length": 243.0, |
| "entropy": 1.5088882371783257, |
| "epoch": 1.4782608695652173, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.037293164778679895, |
| "learning_rate": 1e-05, |
| "loss": -0.0015, |
| "num_tokens": 95896013.0, |
| "reward": 3.551879405975342, |
| "reward_std": 0.05557756870985031, |
| "rewards/ngram_repetition2/mean": 0.9850949048995972, |
| "rewards/ngram_repetition2/std": 0.007369986269623041, |
| "rewards/ngram_repetition3/mean": 0.9990456104278564, |
| "rewards/ngram_repetition3/std": 0.002027435228228569, |
| "rewards/symbolic_reward_accuracy/mean": 0.7919921875, |
| "rewards/symbolic_reward_accuracy/std": 0.40598157048225403, |
| "rewards/symbolic_reward_partial_score/mean": 0.9383137822151184, |
| "rewards/symbolic_reward_partial_score/std": 0.13532200455665588, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9739952683448792, |
| "rewards/thinking_answer_ratio_reward/std": 0.00430486723780632, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3827202320098877, |
| "sampling/importance_sampling_ratio/min": 3.3355458981532138e-06, |
| "sampling/sampling_logp_difference/max": 12.61087417602539, |
| "sampling/sampling_logp_difference/mean": 0.5356423854827881, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.265625, |
| "clip_ratio/high_mean": 0.14111328125, |
| "clip_ratio/low_mean": 0.2041015625, |
| "clip_ratio/low_min": 0.08203125, |
| "clip_ratio/region_mean": 0.34521484375, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 697.0, |
| "completions/max_terminated_length": 697.0, |
| "completions/mean_length": 411.45556640625, |
| "completions/mean_terminated_length": 411.45556640625, |
| "completions/min_length": 243.0, |
| "completions/min_terminated_length": 243.0, |
| "entropy": 1.632333055138588, |
| "epoch": 1.5217391304347827, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.04682422298321068, |
| "learning_rate": 1e-05, |
| "loss": -0.0012, |
| "num_tokens": 98772754.0, |
| "reward": 3.4415788650512695, |
| "reward_std": 0.06567732989788055, |
| "rewards/ngram_repetition2/mean": 0.9847305417060852, |
| "rewards/ngram_repetition2/std": 0.007420970126986504, |
| "rewards/ngram_repetition3/mean": 0.9989989399909973, |
| "rewards/ngram_repetition3/std": 0.0022030072286725044, |
| "rewards/symbolic_reward_accuracy/mean": 0.74072265625, |
| "rewards/symbolic_reward_accuracy/std": 0.4383451044559479, |
| "rewards/symbolic_reward_partial_score/mean": 0.9307861328125, |
| "rewards/symbolic_reward_partial_score/std": 0.1378849595785141, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9754141569137573, |
| "rewards/thinking_answer_ratio_reward/std": 0.022000106051564217, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.3969695568084717, |
| "sampling/importance_sampling_ratio/min": 2.535395157710063e-11, |
| "sampling/sampling_logp_difference/max": 24.398086547851562, |
| "sampling/sampling_logp_difference/mean": 0.5631133317947388, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.24609375, |
| "clip_ratio/high_mean": 0.15283203125, |
| "clip_ratio/low_mean": 0.203125, |
| "clip_ratio/low_min": 0.1015625, |
| "clip_ratio/region_mean": 0.35595703125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 890.0, |
| "completions/max_terminated_length": 890.0, |
| "completions/mean_length": 425.41552734375, |
| "completions/mean_terminated_length": 425.41552734375, |
| "completions/min_length": 258.0, |
| "completions/min_terminated_length": 258.0, |
| "entropy": 1.726756490767002, |
| "epoch": 1.5652173913043477, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.038093541651771236, |
| "learning_rate": 1e-05, |
| "loss": -0.001, |
| "num_tokens": 101674917.0, |
| "reward": 3.5725719928741455, |
| "reward_std": 0.017834221944212914, |
| "rewards/ngram_repetition2/mean": 0.9846716523170471, |
| "rewards/ngram_repetition2/std": 0.0074483552016317844, |
| "rewards/ngram_repetition3/mean": 0.9990028738975525, |
| "rewards/ngram_repetition3/std": 0.00226139766164124, |
| "rewards/symbolic_reward_accuracy/mean": 0.80322265625, |
| "rewards/symbolic_reward_accuracy/std": 0.39765968918800354, |
| "rewards/symbolic_reward_partial_score/mean": 0.9365234375, |
| "rewards/symbolic_reward_partial_score/std": 0.14711622893810272, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9766725897789001, |
| "rewards/thinking_answer_ratio_reward/std": 0.004060372244566679, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4077180624008179, |
| "sampling/importance_sampling_ratio/min": 5.527023176910006e-07, |
| "sampling/sampling_logp_difference/max": 14.408446311950684, |
| "sampling/sampling_logp_difference/mean": 0.5831520557403564, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.2734375, |
| "clip_ratio/high_mean": 0.1513671875, |
| "clip_ratio/low_mean": 0.22314453125, |
| "clip_ratio/low_min": 0.10546875, |
| "clip_ratio/region_mean": 0.37451171875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 764.0, |
| "completions/max_terminated_length": 764.0, |
| "completions/mean_length": 434.2431640625, |
| "completions/mean_terminated_length": 434.2431640625, |
| "completions/min_length": 236.0, |
| "completions/min_terminated_length": 236.0, |
| "entropy": 1.815441645681858, |
| "epoch": 1.608695652173913, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.06173162254334352, |
| "learning_rate": 1e-05, |
| "loss": -0.0011, |
| "num_tokens": 104649015.0, |
| "reward": 3.4869613647460938, |
| "reward_std": 0.0348396934568882, |
| "rewards/ngram_repetition2/mean": 0.9839403629302979, |
| "rewards/ngram_repetition2/std": 0.00729979295283556, |
| "rewards/ngram_repetition3/mean": 0.9989305138587952, |
| "rewards/ngram_repetition3/std": 0.002313731238245964, |
| "rewards/symbolic_reward_accuracy/mean": 0.75830078125, |
| "rewards/symbolic_reward_accuracy/std": 0.4282175302505493, |
| "rewards/symbolic_reward_partial_score/mean": 0.9407551884651184, |
| "rewards/symbolic_reward_partial_score/std": 0.12213268131017685, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9775879383087158, |
| "rewards/thinking_answer_ratio_reward/std": 0.003819839097559452, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4123494625091553, |
| "sampling/importance_sampling_ratio/min": 8.588379569118842e-05, |
| "sampling/sampling_logp_difference/max": 9.362515449523926, |
| "sampling/sampling_logp_difference/mean": 0.5984525084495544, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.2265625, |
| "clip_ratio/high_mean": 0.1328125, |
| "clip_ratio/low_mean": 0.2197265625, |
| "clip_ratio/low_min": 0.1015625, |
| "clip_ratio/region_mean": 0.3525390625, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 932.0, |
| "completions/mean_length": 463.5791015625, |
| "completions/mean_terminated_length": 462.3048400878906, |
| "completions/min_length": 241.0, |
| "completions/min_terminated_length": 241.0, |
| "entropy": 1.9924634099006653, |
| "epoch": 1.6521739130434783, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.10959925778321504, |
| "learning_rate": 1e-05, |
| "loss": 0.0013, |
| "num_tokens": 107645177.0, |
| "reward": 3.647918462753296, |
| "reward_std": 0.0664098709821701, |
| "rewards/ngram_repetition2/mean": 0.9830807447433472, |
| "rewards/ngram_repetition2/std": 0.007356169633567333, |
| "rewards/ngram_repetition3/mean": 0.9989252090454102, |
| "rewards/ngram_repetition3/std": 0.0019529308192431927, |
| "rewards/symbolic_reward_accuracy/mean": 0.82958984375, |
| "rewards/symbolic_reward_accuracy/std": 0.376084566116333, |
| "rewards/symbolic_reward_partial_score/mean": 0.9603678584098816, |
| "rewards/symbolic_reward_partial_score/std": 0.09740671515464783, |
| "rewards/tag_count_reward/mean": 0.998779296875, |
| "rewards/tag_count_reward/std": 0.02468114346265793, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9771405458450317, |
| "rewards/thinking_answer_ratio_reward/std": 0.043395884335041046, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4253789186477661, |
| "sampling/importance_sampling_ratio/min": 0.0006438745185732841, |
| "sampling/sampling_logp_difference/max": 7.348006725311279, |
| "sampling/sampling_logp_difference/mean": 0.6298946738243103, |
| "step": 152 |
| }, |
| { |
| "clip_ratio/high_max": 0.140625, |
| "clip_ratio/high_mean": 0.08349609375, |
| "clip_ratio/low_mean": 0.263671875, |
| "clip_ratio/low_min": 0.1328125, |
| "clip_ratio/region_mean": 0.34716796875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 824.0, |
| "completions/max_terminated_length": 824.0, |
| "completions/mean_length": 462.9541015625, |
| "completions/mean_terminated_length": 462.9541015625, |
| "completions/min_length": 234.0, |
| "completions/min_terminated_length": 234.0, |
| "entropy": 2.0628679618239403, |
| "epoch": 1.6956521739130435, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24161130829987623, |
| "learning_rate": 1e-05, |
| "loss": -0.0003, |
| "num_tokens": 110662235.0, |
| "reward": 3.3783979415893555, |
| "reward_std": 0.036025457084178925, |
| "rewards/ngram_repetition2/mean": 0.9834574460983276, |
| "rewards/ngram_repetition2/std": 0.0074860225431621075, |
| "rewards/ngram_repetition3/mean": 0.9989546537399292, |
| "rewards/ngram_repetition3/std": 0.002006194554269314, |
| "rewards/symbolic_reward_accuracy/mean": 0.70654296875, |
| "rewards/symbolic_reward_accuracy/std": 0.45545724034309387, |
| "rewards/symbolic_reward_partial_score/mean": 0.9361978769302368, |
| "rewards/symbolic_reward_partial_score/std": 0.10914558917284012, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9778301119804382, |
| "rewards/thinking_answer_ratio_reward/std": 0.030854862183332443, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4279154539108276, |
| "sampling/importance_sampling_ratio/min": 2.4768311050138436e-05, |
| "sampling/sampling_logp_difference/max": 10.605945587158203, |
| "sampling/sampling_logp_difference/mean": 0.6437482237815857, |
| "step": 156 |
| }, |
| { |
| "clip_ratio/high_max": 0.15234375, |
| "clip_ratio/high_mean": 0.0888671875, |
| "clip_ratio/low_mean": 0.23388671875, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.32275390625, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 993.0, |
| "completions/max_terminated_length": 993.0, |
| "completions/mean_length": 498.11767578125, |
| "completions/mean_terminated_length": 498.11767578125, |
| "completions/min_length": 291.0, |
| "completions/min_terminated_length": 291.0, |
| "entropy": 2.2991671413183212, |
| "epoch": 1.7391304347826086, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3548338605641257, |
| "learning_rate": 1e-05, |
| "loss": 0.0016, |
| "num_tokens": 113729132.0, |
| "reward": 3.5990939140319824, |
| "reward_std": 0.0962018221616745, |
| "rewards/ngram_repetition2/mean": 0.9809653162956238, |
| "rewards/ngram_repetition2/std": 0.007712055929005146, |
| "rewards/ngram_repetition3/mean": 0.9986952543258667, |
| "rewards/ngram_repetition3/std": 0.001996663399040699, |
| "rewards/symbolic_reward_accuracy/mean": 0.8076171875, |
| "rewards/symbolic_reward_accuracy/std": 0.3942683935165405, |
| "rewards/symbolic_reward_partial_score/mean": 0.9549967050552368, |
| "rewards/symbolic_reward_partial_score/std": 0.10741008818149567, |
| "rewards/tag_count_reward/mean": 0.999267578125, |
| "rewards/tag_count_reward/std": 0.019127286970615387, |
| "rewards/thinking_answer_ratio_reward/mean": 0.979852557182312, |
| "rewards/thinking_answer_ratio_reward/std": 0.02200859785079956, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4530794620513916, |
| "sampling/importance_sampling_ratio/min": 3.576672469307596e-08, |
| "sampling/sampling_logp_difference/max": 17.14624786376953, |
| "sampling/sampling_logp_difference/mean": 0.6910567283630371, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.16796875, |
| "clip_ratio/high_mean": 0.10107421875, |
| "clip_ratio/low_mean": 0.24560546875, |
| "clip_ratio/low_min": 0.1328125, |
| "clip_ratio/region_mean": 0.3466796875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1344.0, |
| "completions/max_terminated_length": 1344.0, |
| "completions/mean_length": 543.86083984375, |
| "completions/mean_terminated_length": 543.86083984375, |
| "completions/min_length": 310.0, |
| "completions/min_terminated_length": 310.0, |
| "entropy": 2.6379848271608353, |
| "epoch": 1.7826086956521738, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.08513499323190751, |
| "learning_rate": 1e-05, |
| "loss": 0.0133, |
| "num_tokens": 116905551.0, |
| "reward": 3.548675775527954, |
| "reward_std": 0.09981994330883026, |
| "rewards/ngram_repetition2/mean": 0.9711927771568298, |
| "rewards/ngram_repetition2/std": 0.011247237212955952, |
| "rewards/ngram_repetition3/mean": 0.9973983764648438, |
| "rewards/ngram_repetition3/std": 0.002803381998091936, |
| "rewards/symbolic_reward_accuracy/mean": 0.7880859375, |
| "rewards/symbolic_reward_accuracy/std": 0.4087640941143036, |
| "rewards/symbolic_reward_partial_score/mean": 0.9429931640625, |
| "rewards/symbolic_reward_partial_score/std": 0.12854379415512085, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9824697971343994, |
| "rewards/thinking_answer_ratio_reward/std": 0.004015693906694651, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4711500406265259, |
| "sampling/importance_sampling_ratio/min": 1.322712250839686e-07, |
| "sampling/sampling_logp_difference/max": 15.838411331176758, |
| "sampling/sampling_logp_difference/mean": 0.7328225374221802, |
| "step": 164 |
| }, |
| { |
| "clip_ratio/high_max": 0.24609375, |
| "clip_ratio/high_mean": 0.1396484375, |
| "clip_ratio/low_mean": 0.23193359375, |
| "clip_ratio/low_min": 0.10546875, |
| "clip_ratio/region_mean": 0.37158203125, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1438.0, |
| "completions/max_terminated_length": 1438.0, |
| "completions/mean_length": 503.81298828125, |
| "completions/mean_terminated_length": 503.81298828125, |
| "completions/min_length": 263.0, |
| "completions/min_terminated_length": 263.0, |
| "entropy": 2.788048267364502, |
| "epoch": 1.8260869565217392, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.09455791725368402, |
| "learning_rate": 1e-05, |
| "loss": 0.0074, |
| "num_tokens": 120015792.0, |
| "reward": 3.349081039428711, |
| "reward_std": 0.10028517991304398, |
| "rewards/ngram_repetition2/mean": 0.9654586315155029, |
| "rewards/ngram_repetition2/std": 0.012577379122376442, |
| "rewards/ngram_repetition3/mean": 0.9965716004371643, |
| "rewards/ngram_repetition3/std": 0.0033189503010362387, |
| "rewards/symbolic_reward_accuracy/mean": 0.69970703125, |
| "rewards/symbolic_reward_accuracy/std": 0.4584972560405731, |
| "rewards/symbolic_reward_partial_score/mean": 0.9207356572151184, |
| "rewards/symbolic_reward_partial_score/std": 0.14194521307945251, |
| "rewards/tag_count_reward/mean": 0.99951171875, |
| "rewards/tag_count_reward/std": 0.015621182508766651, |
| "rewards/thinking_answer_ratio_reward/mean": 0.979932427406311, |
| "rewards/thinking_answer_ratio_reward/std": 0.031026024371385574, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.473034381866455, |
| "sampling/importance_sampling_ratio/min": 2.001723487410345e-06, |
| "sampling/sampling_logp_difference/max": 13.121501922607422, |
| "sampling/sampling_logp_difference/mean": 0.7396732568740845, |
| "step": 168 |
| }, |
| { |
| "clip_ratio/high_max": 0.19140625, |
| "clip_ratio/high_mean": 0.10546875, |
| "clip_ratio/low_mean": 0.25341796875, |
| "clip_ratio/low_min": 0.13671875, |
| "clip_ratio/region_mean": 0.35888671875, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 898.0, |
| "completions/max_terminated_length": 898.0, |
| "completions/mean_length": 467.646484375, |
| "completions/mean_terminated_length": 467.646484375, |
| "completions/min_length": 187.0, |
| "completions/min_terminated_length": 187.0, |
| "entropy": 2.768448531627655, |
| "epoch": 1.8695652173913042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.06805611273845928, |
| "learning_rate": 1e-05, |
| "loss": 0.0022, |
| "num_tokens": 123004444.0, |
| "reward": 3.5058183670043945, |
| "reward_std": 0.07877355813980103, |
| "rewards/ngram_repetition2/mean": 0.9659230709075928, |
| "rewards/ngram_repetition2/std": 0.011437240056693554, |
| "rewards/ngram_repetition3/mean": 0.996779203414917, |
| "rewards/ngram_repetition3/std": 0.003355607157573104, |
| "rewards/symbolic_reward_accuracy/mean": 0.77001953125, |
| "rewards/symbolic_reward_accuracy/std": 0.42092275619506836, |
| "rewards/symbolic_reward_partial_score/mean": 0.9363606572151184, |
| "rewards/symbolic_reward_partial_score/std": 0.13817265629768372, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9791755676269531, |
| "rewards/thinking_answer_ratio_reward/std": 0.005196898244321346, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4763879776000977, |
| "sampling/importance_sampling_ratio/min": 4.474750312510878e-05, |
| "sampling/sampling_logp_difference/max": 10.014474868774414, |
| "sampling/sampling_logp_difference/mean": 0.7400292158126831, |
| "step": 172 |
| }, |
| { |
| "clip_ratio/high_max": 0.30859375, |
| "clip_ratio/high_mean": 0.18505859375, |
| "clip_ratio/low_mean": 0.17431640625, |
| "clip_ratio/low_min": 0.078125, |
| "clip_ratio/region_mean": 0.359375, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 1376.0, |
| "completions/mean_length": 430.4609375, |
| "completions/mean_terminated_length": 429.1705017089844, |
| "completions/min_length": 189.0, |
| "completions/min_terminated_length": 189.0, |
| "entropy": 2.8502298444509506, |
| "epoch": 1.9130434782608696, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.06663024928165782, |
| "learning_rate": 1e-05, |
| "loss": 0.003, |
| "num_tokens": 125954956.0, |
| "reward": 3.483172655105591, |
| "reward_std": 0.1899569034576416, |
| "rewards/ngram_repetition2/mean": 0.9636595845222473, |
| "rewards/ngram_repetition2/std": 0.016547439619898796, |
| "rewards/ngram_repetition3/mean": 0.9965729117393494, |
| "rewards/ngram_repetition3/std": 0.011003647930920124, |
| "rewards/symbolic_reward_accuracy/mean": 0.76025390625, |
| "rewards/symbolic_reward_accuracy/std": 0.4270327091217041, |
| "rewards/symbolic_reward_partial_score/mean": 0.9335530400276184, |
| "rewards/symbolic_reward_partial_score/std": 0.14367683231830597, |
| "rewards/tag_count_reward/mean": 0.999755859375, |
| "rewards/tag_count_reward/std": 0.011048543266952038, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9753477573394775, |
| "rewards/thinking_answer_ratio_reward/std": 0.03124345652759075, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4821131229400635, |
| "sampling/importance_sampling_ratio/min": 2.7937696359003894e-05, |
| "sampling/sampling_logp_difference/max": 10.485533714294434, |
| "sampling/sampling_logp_difference/mean": 0.7600972652435303, |
| "step": 176 |
| }, |
| { |
| "clip_ratio/high_max": 0.2421875, |
| "clip_ratio/high_mean": 0.14013671875, |
| "clip_ratio/low_mean": 0.23681640625, |
| "clip_ratio/low_min": 0.12109375, |
| "clip_ratio/region_mean": 0.376953125, |
| "completions/clipped_ratio": 0.00146484375, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 696.0, |
| "completions/mean_length": 415.51611328125, |
| "completions/mean_terminated_length": 411.61907958984375, |
| "completions/min_length": 171.0, |
| "completions/min_terminated_length": 171.0, |
| "entropy": 2.858258455991745, |
| "epoch": 1.9565217391304348, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.07966906477050349, |
| "learning_rate": 1e-05, |
| "loss": 0.0034, |
| "num_tokens": 128874861.0, |
| "reward": 3.4829840660095215, |
| "reward_std": 0.08554819971323013, |
| "rewards/ngram_repetition2/mean": 0.9658774733543396, |
| "rewards/ngram_repetition2/std": 0.011492163874208927, |
| "rewards/ngram_repetition3/mean": 0.9971028566360474, |
| "rewards/ngram_repetition3/std": 0.0032859230414032936, |
| "rewards/symbolic_reward_accuracy/mean": 0.7587890625, |
| "rewards/symbolic_reward_accuracy/std": 0.42792245745658875, |
| "rewards/symbolic_reward_partial_score/mean": 0.93701171875, |
| "rewards/symbolic_reward_partial_score/std": 0.13184432685375214, |
| "rewards/tag_count_reward/mean": 0.9990234375, |
| "rewards/tag_count_reward/std": 0.022080888971686363, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9741167426109314, |
| "rewards/thinking_answer_ratio_reward/std": 0.04080890119075775, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4818463325500488, |
| "sampling/importance_sampling_ratio/min": 1.6995619489534874e-06, |
| "sampling/sampling_logp_difference/max": 13.285140037536621, |
| "sampling/sampling_logp_difference/mean": 0.747634768486023, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.1640625, |
| "clip_ratio/high_mean": 0.09521484375, |
| "clip_ratio/low_mean": 0.2451171875, |
| "clip_ratio/low_min": 0.125, |
| "clip_ratio/region_mean": 0.34033203125, |
| "completions/clipped_ratio": 0.00048828125, |
| "completions/max_length": 3072.0, |
| "completions/max_terminated_length": 2981.0, |
| "completions/mean_length": 415.7646484375, |
| "completions/mean_terminated_length": 414.4670104980469, |
| "completions/min_length": 197.0, |
| "completions/min_terminated_length": 197.0, |
| "entropy": 3.0020454972982407, |
| "epoch": 2.0, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20941599091305263, |
| "learning_rate": 1e-05, |
| "loss": 0.0019, |
| "num_tokens": 131785771.0, |
| "reward": 3.4347691535949707, |
| "reward_std": 0.0830899327993393, |
| "rewards/ngram_repetition2/mean": 0.9693027138710022, |
| "rewards/ngram_repetition2/std": 0.02014472335577011, |
| "rewards/ngram_repetition3/mean": 0.9970437288284302, |
| "rewards/ngram_repetition3/std": 0.01778826303780079, |
| "rewards/symbolic_reward_accuracy/mean": 0.73974609375, |
| "rewards/symbolic_reward_accuracy/std": 0.43888023495674133, |
| "rewards/symbolic_reward_partial_score/mean": 0.9258626103401184, |
| "rewards/symbolic_reward_partial_score/std": 0.15298709273338318, |
| "rewards/tag_count_reward/mean": 1.0, |
| "rewards/tag_count_reward/std": 0.0, |
| "rewards/thinking_answer_ratio_reward/mean": 0.9750823378562927, |
| "rewards/thinking_answer_ratio_reward/std": 0.022159311920404434, |
| "sampling/importance_sampling_ratio/max": 2.0, |
| "sampling/importance_sampling_ratio/mean": 1.4881045818328857, |
| "sampling/importance_sampling_ratio/min": 6.7104201662004925e-06, |
| "sampling/sampling_logp_difference/max": 11.911849021911621, |
| "sampling/sampling_logp_difference/mean": 0.7676164507865906, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.001644736842105263, |
| "eval_completions/max_length": 895.8947368421053, |
| "eval_completions/max_terminated_length": 641.0526315789474, |
| "eval_completions/mean_length": 447.49136513157896, |
| "eval_completions/mean_terminated_length": 443.1748866031044, |
| "eval_completions/min_length": 229.31578947368422, |
| "eval_completions/min_terminated_length": 229.31578947368422, |
| "eval_entropy": 2.969714365507427, |
| "eval_frac_reward_zero_std": 0.0, |
| "eval_loss": 0.0008429406443610787, |
| "eval_num_tokens": 131785771.0, |
| "eval_reward": 3.461897749649851, |
| "eval_reward_std": 0.11290462953145738, |
| "eval_rewards/ngram_repetition2/mean": 0.9665061580507379, |
| "eval_rewards/ngram_repetition2/std": 0.010592278240150526, |
| "eval_rewards/ngram_repetition3/mean": 0.9974528551101685, |
| "eval_rewards/ngram_repetition3/std": 0.002903796377052602, |
| "eval_rewards/symbolic_reward_accuracy/mean": 0.7504111842105263, |
| "eval_rewards/symbolic_reward_accuracy/std": 0.3684137702772492, |
| "eval_rewards/symbolic_reward_partial_score/mean": 0.9324972535434523, |
| "eval_rewards/symbolic_reward_partial_score/std": 0.11325683170243313, |
| "eval_rewards/tag_count_reward/mean": 0.9991776315789473, |
| "eval_rewards/tag_count_reward/std": 0.006552994643387042, |
| "eval_rewards/thinking_answer_ratio_reward/mean": 0.9760947635299281, |
| "eval_rewards/thinking_answer_ratio_reward/std": 0.018366032500604267, |
| "eval_runtime": 197.726, |
| "eval_samples_per_second": 0.759, |
| "eval_sampling/importance_sampling_ratio/max": 2.0, |
| "eval_sampling/importance_sampling_ratio/mean": 1.4970186572325856, |
| "eval_sampling/importance_sampling_ratio/min": 0.002377865083537089, |
| "eval_sampling/sampling_logp_difference/max": 6.292696877529747, |
| "eval_sampling/sampling_logp_difference/mean": 0.7791855492089924, |
| "eval_steps_per_second": 0.01, |
| "step": 184 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 184, |
| "total_flos": 0.0, |
| "train_loss": 0.005923212121358475, |
| "train_runtime": 6121.2967, |
| "train_samples_per_second": 0.98, |
| "train_steps_per_second": 0.03 |
| } |
| ], |
| "logging_steps": 4, |
| "max_steps": 184, |
| "num_input_tokens_seen": 131785771, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|