{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16306.0, "completions/mean_length": 10551.48828125, "completions/mean_terminated_length": 9390.44921875, "completions/min_length": 2058.0, "completions/min_terminated_length": 2058.0, "entropy": 0.32506339251995087, "epoch": 0.0016025641025641025, "frac_reward_zero_std": 0.34375, "grad_norm": 0.044611234217882156, "learning_rate": 1e-06, "loss": 0.1228, "num_tokens": 6250714.0, "reward": 0.6636751890182495, "reward_std": 0.18405699729919434, "rewards/progression_diversity/mean": -0.0030846872832626104, "rewards/progression_diversity/std": 0.02705330215394497, "rewards/symbolic_reward_accuracy/mean": 0.7265625, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.8119628429412842, "rewards/symbolic_reward_partial_score/std": 0.36560988426208496, "rewards/tag_count_reward/mean": -0.158203125, "rewards/tag_count_reward/std": 0.36528825759887695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0823476314544678, "sampling/importance_sampling_ratio/min": 5.835279080201872e-05, "sampling/sampling_logp_difference/max": 9.749003410339355, "sampling/sampling_logp_difference/mean": 0.13594314455986023, "step": 1 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.32573169469833374, "epoch": 0.003205128205128205, "grad_norm": 0.05619456619024277, "learning_rate": 1e-06, "loss": 0.0168, "step": 2 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.32708846032619476, "epoch": 0.004807692307692308, "grad_norm": 0.03278400003910065, "learning_rate": 1e-06, "loss": -0.0697, "step": 3 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3290020525455475, "epoch": 0.00641025641025641, "grad_norm": 0.04777136817574501, "learning_rate": 1e-06, "loss": 0.0348, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16363.0, "completions/mean_length": 10448.6640625, "completions/mean_terminated_length": 9250.451171875, "completions/min_length": 2934.0, "completions/min_terminated_length": 2934.0, "entropy": 0.32880647480487823, "epoch": 0.008012820512820512, "frac_reward_zero_std": 0.3125, "grad_norm": 0.05143669620156288, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 12425550.0, "reward": 0.6473922729492188, "reward_std": 0.23677554726600647, "rewards/progression_diversity/mean": -0.0019865420181304216, "rewards/progression_diversity/std": 0.024099716916680336, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.7986654043197632, "rewards/symbolic_reward_partial_score/std": 0.378904789686203, "rewards/tag_count_reward/mean": -0.1640625, "rewards/tag_count_reward/std": 0.37069445848464966, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0827651023864746, "sampling/importance_sampling_ratio/min": 0.0012605652445927262, "sampling/sampling_logp_difference/max": 6.67619514465332, "sampling/sampling_logp_difference/mean": 0.13637593388557434, "step": 5 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3292965739965439, "epoch": 0.009615384615384616, "grad_norm": 0.04503220319747925, "learning_rate": 1e-06, "loss": 0.1359, "step": 6 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.32996658980846405, "epoch": 0.011217948717948718, "grad_norm": 0.03700640797615051, "learning_rate": 1e-06, "loss": -0.0285, "step": 7 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.33296214044094086, "epoch": 0.01282051282051282, "grad_norm": 0.043708402663469315, "learning_rate": 1e-06, "loss": 0.0612, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16353.0, "completions/mean_length": 10651.83203125, "completions/mean_terminated_length": 9510.767578125, "completions/min_length": 3284.0, "completions/min_terminated_length": 3284.0, "entropy": 0.32981322705745697, "epoch": 0.014423076923076924, "frac_reward_zero_std": 0.375, "grad_norm": 0.053172070533037186, "learning_rate": 1e-06, "loss": 0.0483, "num_tokens": 18781656.0, "reward": 0.659508466720581, "reward_std": 0.21591773629188538, "rewards/progression_diversity/mean": -0.001792945433408022, "rewards/progression_diversity/std": 0.02059319242835045, "rewards/symbolic_reward_accuracy/mean": 0.724609375, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.8019368648529053, "rewards/symbolic_reward_partial_score/std": 0.3785783052444458, "rewards/tag_count_reward/mean": -0.158203125, "rewards/tag_count_reward/std": 0.36528825759887695, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0816385746002197, "sampling/importance_sampling_ratio/min": 0.0015555794816464186, "sampling/sampling_logp_difference/max": 6.465907096862793, "sampling/sampling_logp_difference/mean": 0.1348809450864792, "step": 9 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3281726986169815, "epoch": 0.016025641025641024, "grad_norm": 0.042232953011989594, "learning_rate": 1e-06, "loss": 0.0559, "step": 10 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3172971159219742, "epoch": 0.017628205128205128, "grad_norm": 0.050342120230197906, "learning_rate": 1e-06, "loss": 0.1033, "step": 11 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.32771991193294525, "epoch": 0.019230769230769232, "grad_norm": 0.05385352298617363, "learning_rate": 1e-06, "loss": -0.0274, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.134765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16323.0, "completions/mean_length": 10421.53125, "completions/mean_terminated_length": 9492.83984375, "completions/min_length": 2597.0, "completions/min_terminated_length": 2597.0, "entropy": 0.3343167155981064, "epoch": 0.020833333333333332, "frac_reward_zero_std": 0.4375, "grad_norm": 0.04376198351383209, "learning_rate": 1e-06, "loss": 0.0843, "num_tokens": 24990936.0, "reward": 0.7011435031890869, "reward_std": 0.20758269727230072, "rewards/progression_diversity/mean": -0.0003993186983279884, "rewards/progression_diversity/std": 0.00878761988133192, "rewards/symbolic_reward_accuracy/mean": 0.7734375, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.833251953125, "rewards/symbolic_reward_partial_score/std": 0.358816921710968, "rewards/tag_count_reward/mean": -0.12890625, "rewards/tag_count_reward/std": 0.33542385697364807, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0832061767578125, "sampling/importance_sampling_ratio/min": 1.757594509399496e-05, "sampling/sampling_logp_difference/max": 10.948979377746582, "sampling/sampling_logp_difference/mean": 0.13748203217983246, "step": 13 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.33190542459487915, "epoch": 0.022435897435897436, "grad_norm": 0.043753933161497116, "learning_rate": 1e-06, "loss": -0.0088, "step": 14 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3347119390964508, "epoch": 0.02403846153846154, "grad_norm": 0.04401244595646858, "learning_rate": 1e-06, "loss": 0.074, "step": 15 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3319084793329239, "epoch": 0.02564102564102564, "grad_norm": 0.033044297248125076, "learning_rate": 1e-06, "loss": -0.0032, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16076.0, "completions/mean_length": 9447.185546875, "completions/mean_terminated_length": 8560.984375, "completions/min_length": 3179.0, "completions/min_terminated_length": 3179.0, "entropy": 0.3487512767314911, "epoch": 0.027243589743589744, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03728368133306503, "learning_rate": 1e-06, "loss": -0.0261, "num_tokens": 30661527.0, "reward": 0.7235032320022583, "reward_std": 0.14087146520614624, "rewards/progression_diversity/mean": -0.0002674094866961241, "rewards/progression_diversity/std": 0.005158489104360342, "rewards/symbolic_reward_accuracy/mean": 0.796875, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.8543945550918579, "rewards/symbolic_reward_partial_score/std": 0.336029052734375, "rewards/tag_count_reward/mean": -0.109375, "rewards/tag_count_reward/std": 0.31241437792778015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0864083766937256, "sampling/importance_sampling_ratio/min": 0.002907535759732127, "sampling/sampling_logp_difference/max": 5.840449333190918, "sampling/sampling_logp_difference/mean": 0.14251816272735596, "step": 17 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3508133143186569, "epoch": 0.028846153846153848, "grad_norm": 0.025252720341086388, "learning_rate": 1e-06, "loss": 0.0543, "step": 18 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3455636203289032, "epoch": 0.030448717948717948, "grad_norm": 0.04936947673559189, "learning_rate": 1e-06, "loss": 0.0408, "step": 19 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3427297919988632, "epoch": 0.03205128205128205, "grad_norm": 0.04556189477443695, "learning_rate": 1e-06, "loss": 0.0202, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.142578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16317.0, "completions/mean_length": 10418.515625, "completions/mean_terminated_length": 9426.533203125, "completions/min_length": 3359.0, "completions/min_terminated_length": 3359.0, "entropy": 0.3352004587650299, "epoch": 0.03365384615384615, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0631578117609024, "learning_rate": 1e-06, "loss": 0.0442, "num_tokens": 36837695.0, "reward": 0.6921972632408142, "reward_std": 0.20993337035179138, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.765625, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.8242512941360474, "rewards/symbolic_reward_partial_score/std": 0.3660709261894226, "rewards/tag_count_reward/mean": -0.14453125, "rewards/tag_count_reward/std": 0.35197147727012634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0834827423095703, "sampling/importance_sampling_ratio/min": 0.0013367350911721587, "sampling/sampling_logp_difference/max": 6.617525100708008, "sampling/sampling_logp_difference/mean": 0.13813522458076477, "step": 21 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3373718708753586, "epoch": 0.035256410256410256, "grad_norm": 0.05303698405623436, "learning_rate": 1e-06, "loss": 0.124, "step": 22 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3379471004009247, "epoch": 0.03685897435897436, "grad_norm": 0.056940287351608276, "learning_rate": 1e-06, "loss": -0.0224, "step": 23 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3319106251001358, "epoch": 0.038461538461538464, "grad_norm": 0.036604754626750946, "learning_rate": 1e-06, "loss": 0.0221, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16238.0, "completions/mean_length": 11014.87109375, "completions/mean_terminated_length": 9930.962890625, "completions/min_length": 3731.0, "completions/min_terminated_length": 3731.0, "entropy": 0.34052951633930206, "epoch": 0.04006410256410257, "frac_reward_zero_std": 0.375, "grad_norm": 0.04965716972947121, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 43350445.0, "reward": 0.6583292484283447, "reward_std": 0.22545002400875092, "rewards/progression_diversity/mean": -8.649941446492448e-05, "rewards/progression_diversity/std": 0.0019572582095861435, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.8142252564430237, "rewards/symbolic_reward_partial_score/std": 0.36810046434402466, "rewards/tag_count_reward/mean": -0.16015625, "rewards/tag_count_reward/std": 0.3671095669269562, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0841944217681885, "sampling/importance_sampling_ratio/min": 0.0011797985062003136, "sampling/sampling_logp_difference/max": 6.7424116134643555, "sampling/sampling_logp_difference/mean": 0.13874885439872742, "step": 25 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3379000872373581, "epoch": 0.041666666666666664, "grad_norm": 0.043555937707424164, "learning_rate": 1e-06, "loss": 0.03, "step": 26 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3365878015756607, "epoch": 0.04326923076923077, "grad_norm": 0.048798009753227234, "learning_rate": 1e-06, "loss": -0.0271, "step": 27 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3339509665966034, "epoch": 0.04487179487179487, "grad_norm": 0.03760625422000885, "learning_rate": 1e-06, "loss": 0.1111, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16300.0, "completions/mean_length": 10312.2734375, "completions/mean_terminated_length": 9521.474609375, "completions/min_length": 3734.0, "completions/min_terminated_length": 3734.0, "entropy": 0.3400348424911499, "epoch": 0.046474358974358976, "frac_reward_zero_std": 0.4375, "grad_norm": 0.046900391578674316, "learning_rate": 1e-06, "loss": -0.0275, "num_tokens": 49548425.0, "reward": 0.7269140481948853, "reward_std": 0.19069784879684448, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.8553385138511658, "rewards/symbolic_reward_partial_score/std": 0.3376051187515259, "rewards/tag_count_reward/mean": -0.11328125, "rewards/tag_count_reward/std": 0.3172462284564972, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0843985080718994, "sampling/importance_sampling_ratio/min": 1.196963512484217e-05, "sampling/sampling_logp_difference/max": 11.333137512207031, "sampling/sampling_logp_difference/mean": 0.13986769318580627, "step": 29 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.33834680914878845, "epoch": 0.04807692307692308, "grad_norm": 0.032770950347185135, "learning_rate": 1e-06, "loss": 0.0921, "step": 30 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.34235046803951263, "epoch": 0.049679487179487176, "grad_norm": 0.04878636822104454, "learning_rate": 1e-06, "loss": 0.1467, "step": 31 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3426048308610916, "epoch": 0.05128205128205128, "grad_norm": 0.035221368074417114, "learning_rate": 1e-06, "loss": -0.0613, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16369.0, "completions/mean_length": 9957.662109375, "completions/mean_terminated_length": 9136.67578125, "completions/min_length": 3096.0, "completions/min_terminated_length": 3096.0, "entropy": 0.35028260946273804, "epoch": 0.052884615384615384, "frac_reward_zero_std": 0.46875, "grad_norm": 0.04036945849657059, "learning_rate": 1e-06, "loss": 0.1135, "num_tokens": 55490012.0, "reward": 0.7271240949630737, "reward_std": 0.18368291854858398, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.8592935800552368, "rewards/symbolic_reward_partial_score/std": 0.3345940411090851, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0867599248886108, "sampling/importance_sampling_ratio/min": 0.0015392610803246498, "sampling/sampling_logp_difference/max": 6.476452827453613, "sampling/sampling_logp_difference/mean": 0.14337460696697235, "step": 33 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3484779894351959, "epoch": 0.05448717948717949, "grad_norm": 0.03777889907360077, "learning_rate": 1e-06, "loss": -0.0092, "step": 34 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3541874587535858, "epoch": 0.05608974358974359, "grad_norm": 0.042037446051836014, "learning_rate": 1e-06, "loss": -0.0103, "step": 35 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3485944867134094, "epoch": 0.057692307692307696, "grad_norm": 0.030424781143665314, "learning_rate": 1e-06, "loss": 0.0218, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16276.0, "completions/mean_length": 9817.58203125, "completions/mean_terminated_length": 9335.76953125, "completions/min_length": 3208.0, "completions/min_terminated_length": 3208.0, "entropy": 0.34963034093379974, "epoch": 0.05929487179487179, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04224839061498642, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 61372294.0, "reward": 0.8006471395492554, "reward_std": 0.1407773494720459, "rewards/progression_diversity/mean": -0.00023221348237711936, "rewards/progression_diversity/std": 0.005254391580820084, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9246907830238342, "rewards/symbolic_reward_partial_score/std": 0.25209876894950867, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.086740493774414, "sampling/importance_sampling_ratio/min": 0.0005778282065875828, "sampling/sampling_logp_difference/max": 7.456233978271484, "sampling/sampling_logp_difference/mean": 0.14352458715438843, "step": 37 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35366642475128174, "epoch": 0.060897435897435896, "grad_norm": 0.03482625260949135, "learning_rate": 1e-06, "loss": -0.0092, "step": 38 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3519248515367508, "epoch": 0.0625, "grad_norm": 0.02846507355570793, "learning_rate": 1e-06, "loss": 0.0231, "step": 39 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3489530235528946, "epoch": 0.0641025641025641, "grad_norm": 0.029413459822535515, "learning_rate": 1e-06, "loss": -0.0039, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16274.0, "completions/mean_length": 10307.072265625, "completions/mean_terminated_length": 9545.787109375, "completions/min_length": 3050.0, "completions/min_terminated_length": 3050.0, "entropy": 0.34452080726623535, "epoch": 0.06570512820512821, "frac_reward_zero_std": 0.40625, "grad_norm": 0.051934659481048584, "learning_rate": 1e-06, "loss": 0.1097, "num_tokens": 67578507.0, "reward": 0.718945324420929, "reward_std": 0.1863139569759369, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.78125, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.8678385019302368, "rewards/symbolic_reward_partial_score/std": 0.3194737434387207, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.084733486175537, "sampling/importance_sampling_ratio/min": 1.5089102817000821e-05, "sampling/sampling_logp_difference/max": 11.101537704467773, "sampling/sampling_logp_difference/mean": 0.14066340029239655, "step": 41 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3470620810985565, "epoch": 0.0673076923076923, "grad_norm": 0.05445835366845131, "learning_rate": 1e-06, "loss": -0.0056, "step": 42 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.34327615797519684, "epoch": 0.06891025641025642, "grad_norm": 0.056635189801454544, "learning_rate": 1e-06, "loss": -0.0054, "step": 43 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.34379227459430695, "epoch": 0.07051282051282051, "grad_norm": 0.03655867278575897, "learning_rate": 1e-06, "loss": 0.0207, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16342.0, "completions/mean_length": 10268.3828125, "completions/mean_terminated_length": 9267.6455078125, "completions/min_length": 2829.0, "completions/min_terminated_length": 2829.0, "entropy": 0.3472183048725128, "epoch": 0.07211538461538461, "frac_reward_zero_std": 0.4375, "grad_norm": 0.031955551356077194, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 73686271.0, "reward": 0.7038329839706421, "reward_std": 0.1754232943058014, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.7734375, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.841552734375, "rewards/symbolic_reward_partial_score/std": 0.3493834137916565, "rewards/tag_count_reward/mean": -0.126953125, "rewards/tag_count_reward/std": 0.33324605226516724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.086193323135376, "sampling/importance_sampling_ratio/min": 2.6150517307144128e-09, "sampling/sampling_logp_difference/max": 19.761981964111328, "sampling/sampling_logp_difference/mean": 0.14203211665153503, "step": 45 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.34860071539878845, "epoch": 0.07371794871794872, "grad_norm": 0.04454493150115013, "learning_rate": 1e-06, "loss": -0.0093, "step": 46 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3458279073238373, "epoch": 0.07532051282051282, "grad_norm": 0.028682490810751915, "learning_rate": 1e-06, "loss": 0.0824, "step": 47 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.35116448998451233, "epoch": 0.07692307692307693, "grad_norm": 0.03319811820983887, "learning_rate": 1e-06, "loss": 0.0371, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16302.0, "completions/mean_length": 10789.669921875, "completions/mean_terminated_length": 10157.2666015625, "completions/min_length": 4249.0, "completions/min_terminated_length": 4249.0, "entropy": 0.3366381973028183, "epoch": 0.07852564102564102, "frac_reward_zero_std": 0.28125, "grad_norm": 0.04699065536260605, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 80139782.0, "reward": 0.7050830721855164, "reward_std": 0.22467857599258423, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.759765625, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.8619954586029053, "rewards/symbolic_reward_partial_score/std": 0.32479149103164673, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0840137004852295, "sampling/importance_sampling_ratio/min": 0.0013393433764576912, "sampling/sampling_logp_difference/max": 6.615575790405273, "sampling/sampling_logp_difference/mean": 0.13926517963409424, "step": 49 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3428249955177307, "epoch": 0.08012820512820513, "grad_norm": 0.05130426585674286, "learning_rate": 1e-06, "loss": 0.0691, "step": 50 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3388234078884125, "epoch": 0.08173076923076923, "grad_norm": 0.053293149918317795, "learning_rate": 1e-06, "loss": 0.0268, "step": 51 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.34156106412410736, "epoch": 0.08333333333333333, "grad_norm": 0.04985737428069115, "learning_rate": 1e-06, "loss": -0.028, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16322.0, "completions/mean_length": 10276.51171875, "completions/mean_terminated_length": 9571.2890625, "completions/min_length": 3452.0, "completions/min_terminated_length": 3452.0, "entropy": 0.3465278446674347, "epoch": 0.08493589743589744, "frac_reward_zero_std": 0.4375, "grad_norm": 0.061375465244054794, "learning_rate": 1e-06, "loss": -0.0205, "num_tokens": 86243244.0, "reward": 0.7516357898712158, "reward_std": 0.16822142899036407, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.8882649540901184, "rewards/symbolic_reward_partial_score/std": 0.2975149154663086, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0861289501190186, "sampling/importance_sampling_ratio/min": 0.0001545468985568732, "sampling/sampling_logp_difference/max": 8.775012969970703, "sampling/sampling_logp_difference/mean": 0.14235404133796692, "step": 53 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.35107581317424774, "epoch": 0.08653846153846154, "grad_norm": 0.04079843685030937, "learning_rate": 1e-06, "loss": 0.0417, "step": 54 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34525130689144135, "epoch": 0.08814102564102565, "grad_norm": 0.05603815242648125, "learning_rate": 1e-06, "loss": 0.0387, "step": 55 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.34873533248901367, "epoch": 0.08974358974358974, "grad_norm": 0.04589448869228363, "learning_rate": 1e-06, "loss": 0.0513, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16358.0, "completions/mean_length": 10740.359375, "completions/mean_terminated_length": 10143.083984375, "completions/min_length": 3255.0, "completions/min_terminated_length": 3255.0, "entropy": 0.3383316993713379, "epoch": 0.09134615384615384, "frac_reward_zero_std": 0.375, "grad_norm": 0.05304954573512077, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 92654068.0, "reward": 0.7263745069503784, "reward_std": 0.20219381153583527, "rewards/progression_diversity/mean": -0.00025326735340058804, "rewards/progression_diversity/std": 0.00356759550049901, "rewards/symbolic_reward_accuracy/mean": 0.794921875, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.8620116710662842, "rewards/symbolic_reward_partial_score/std": 0.328750342130661, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0836436748504639, "sampling/importance_sampling_ratio/min": 0.0003667703422252089, "sampling/sampling_logp_difference/max": 7.9107747077941895, "sampling/sampling_logp_difference/mean": 0.13842356204986572, "step": 57 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.33808162808418274, "epoch": 0.09294871794871795, "grad_norm": 0.037355221807956696, "learning_rate": 1e-06, "loss": -0.0301, "step": 58 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3411588817834854, "epoch": 0.09455128205128205, "grad_norm": 0.055566657334566116, "learning_rate": 1e-06, "loss": 0.02, "step": 59 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.34048154950141907, "epoch": 0.09615384615384616, "grad_norm": 0.056268103420734406, "learning_rate": 1e-06, "loss": 0.0807, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16293.0, "completions/mean_length": 9842.513671875, "completions/mean_terminated_length": 9273.0849609375, "completions/min_length": 3582.0, "completions/min_terminated_length": 3582.0, "entropy": 0.3575614392757416, "epoch": 0.09775641025641026, "frac_reward_zero_std": 0.53125, "grad_norm": 0.056403666734695435, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 98531323.0, "reward": 0.761899471282959, "reward_std": 0.13937385380268097, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.8918782472610474, "rewards/symbolic_reward_partial_score/std": 0.29803723096847534, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0875978469848633, "sampling/importance_sampling_ratio/min": 0.0013123038224875927, "sampling/sampling_logp_difference/max": 6.6359710693359375, "sampling/sampling_logp_difference/mean": 0.1450347602367401, "step": 61 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3541255295276642, "epoch": 0.09935897435897435, "grad_norm": 0.049526289105415344, "learning_rate": 1e-06, "loss": 0.012, "step": 62 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35454636812210083, "epoch": 0.10096153846153846, "grad_norm": 0.028227200731635094, "learning_rate": 1e-06, "loss": 0.0124, "step": 63 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3537105768918991, "epoch": 0.10256410256410256, "grad_norm": 0.032517120242118835, "learning_rate": 1e-06, "loss": 0.0673, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16374.0, "completions/mean_length": 10551.0, "completions/mean_terminated_length": 9791.2939453125, "completions/min_length": 3181.0, "completions/min_terminated_length": 3181.0, "entropy": 0.3576884865760803, "epoch": 0.10416666666666667, "frac_reward_zero_std": 0.5, "grad_norm": 0.0434148907661438, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 104799931.0, "reward": 0.7335938215255737, "reward_std": 0.17546309530735016, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.876953125, "rewards/symbolic_reward_partial_score/std": 0.31392836570739746, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.087789535522461, "sampling/importance_sampling_ratio/min": 0.0014198714634403586, "sampling/sampling_logp_difference/max": 6.557188987731934, "sampling/sampling_logp_difference/mean": 0.14461146295070648, "step": 65 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.35563966631889343, "epoch": 0.10576923076923077, "grad_norm": 0.04970910772681236, "learning_rate": 1e-06, "loss": 0.0064, "step": 66 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3538610637187958, "epoch": 0.10737179487179487, "grad_norm": 0.039721861481666565, "learning_rate": 1e-06, "loss": 0.0354, "step": 67 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.35437028110027313, "epoch": 0.10897435897435898, "grad_norm": 0.03535262495279312, "learning_rate": 1e-06, "loss": 0.0732, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16312.0, "completions/mean_length": 10258.015625, "completions/mean_terminated_length": 9794.7060546875, "completions/min_length": 2997.0, "completions/min_terminated_length": 2997.0, "entropy": 0.35005713999271393, "epoch": 0.11057692307692307, "frac_reward_zero_std": 0.5, "grad_norm": 0.04420126602053642, "learning_rate": 1e-06, "loss": 0.1169, "num_tokens": 110901763.0, "reward": 0.7876123189926147, "reward_std": 0.14684240520000458, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9176920652389526, "rewards/symbolic_reward_partial_score/std": 0.26269444823265076, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0872759819030762, "sampling/importance_sampling_ratio/min": 0.0015730534214526415, "sampling/sampling_logp_difference/max": 6.454736709594727, "sampling/sampling_logp_difference/mean": 0.1436619907617569, "step": 69 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.35464054346084595, "epoch": 0.11217948717948718, "grad_norm": 0.028692971915006638, "learning_rate": 1e-06, "loss": -0.015, "step": 70 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.35661159455776215, "epoch": 0.11378205128205128, "grad_norm": 0.028049832209944725, "learning_rate": 1e-06, "loss": 0.0162, "step": 71 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35251204669475555, "epoch": 0.11538461538461539, "grad_norm": 0.05862206593155861, "learning_rate": 1e-06, "loss": 0.0029, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16130.0, "completions/mean_length": 9987.98828125, "completions/mean_terminated_length": 9416.4296875, "completions/min_length": 3678.0, "completions/min_terminated_length": 3678.0, "entropy": 0.35229024291038513, "epoch": 0.11698717948717949, "frac_reward_zero_std": 0.46875, "grad_norm": 0.03338691219687462, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 116876653.0, "reward": 0.7723047137260437, "reward_std": 0.15315422415733337, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.9102864265441895, "rewards/symbolic_reward_partial_score/std": 0.2674944996833801, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0867061614990234, "sampling/importance_sampling_ratio/min": 0.0024432684294879436, "sampling/sampling_logp_difference/max": 6.014418601989746, "sampling/sampling_logp_difference/mean": 0.14364758133888245, "step": 73 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3558579385280609, "epoch": 0.11858974358974358, "grad_norm": 0.03649130091071129, "learning_rate": 1e-06, "loss": 0.0875, "step": 74 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35268354415893555, "epoch": 0.1201923076923077, "grad_norm": 0.045283980667591095, "learning_rate": 1e-06, "loss": -0.0395, "step": 75 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.35194286704063416, "epoch": 0.12179487179487179, "grad_norm": 0.04006224498152733, "learning_rate": 1e-06, "loss": 0.0715, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16163.0, "completions/mean_length": 9884.43359375, "completions/mean_terminated_length": 9494.1904296875, "completions/min_length": 3154.0, "completions/min_terminated_length": 3154.0, "entropy": 0.3586219549179077, "epoch": 0.1233974358974359, "frac_reward_zero_std": 0.5, "grad_norm": 0.03124525584280491, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 122829195.0, "reward": 0.7831738591194153, "reward_std": 0.15691515803337097, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9243814945220947, "rewards/symbolic_reward_partial_score/std": 0.2465325891971588, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0875834226608276, "sampling/importance_sampling_ratio/min": 0.0018888328922912478, "sampling/sampling_logp_difference/max": 6.271796226501465, "sampling/sampling_logp_difference/mean": 0.14485181868076324, "step": 77 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3570743352174759, "epoch": 0.125, "grad_norm": 0.039502520114183426, "learning_rate": 1e-06, "loss": 0.1045, "step": 78 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3577471524477005, "epoch": 0.1266025641025641, "grad_norm": 0.05074159801006317, "learning_rate": 1e-06, "loss": -0.0204, "step": 79 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3527442365884781, "epoch": 0.1282051282051282, "grad_norm": 0.04185720905661583, "learning_rate": 1e-06, "loss": 0.0032, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16365.0, "completions/mean_length": 9510.943359375, "completions/mean_terminated_length": 9187.6708984375, "completions/min_length": 2979.0, "completions/min_terminated_length": 2979.0, "entropy": 0.35790617763996124, "epoch": 0.12980769230769232, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04458483308553696, "learning_rate": 1e-06, "loss": 0.0806, "num_tokens": 128493198.0, "reward": 0.8221728801727295, "reward_std": 0.11117751896381378, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8984375, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.9586751461029053, "rewards/symbolic_reward_partial_score/std": 0.17586030066013336, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0880753993988037, "sampling/importance_sampling_ratio/min": 8.141039870679379e-06, "sampling/sampling_logp_difference/max": 11.718592643737793, "sampling/sampling_logp_difference/mean": 0.14561305940151215, "step": 81 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.361351415514946, "epoch": 0.13141025641025642, "grad_norm": 0.059289369732141495, "learning_rate": 1e-06, "loss": 0.0795, "step": 82 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35831257700920105, "epoch": 0.1330128205128205, "grad_norm": 0.020446285605430603, "learning_rate": 1e-06, "loss": -0.0607, "step": 83 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.36143477261066437, "epoch": 0.1346153846153846, "grad_norm": 0.02317287027835846, "learning_rate": 1e-06, "loss": -0.0405, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16342.0, "completions/mean_length": 10131.134765625, "completions/mean_terminated_length": 9672.3291015625, "completions/min_length": 3775.0, "completions/min_terminated_length": 3775.0, "entropy": 0.34663867950439453, "epoch": 0.1362179487179487, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0594073049724102, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 134566339.0, "reward": 0.7725633382797241, "reward_std": 0.178839772939682, "rewards/progression_diversity/mean": -0.0005024719866923988, "rewards/progression_diversity/std": 0.011369643732905388, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.9144206047058105, "rewards/symbolic_reward_partial_score/std": 0.25830593705177307, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.084014654159546, "sampling/importance_sampling_ratio/min": 0.0015900562284514308, "sampling/sampling_logp_difference/max": 6.443985939025879, "sampling/sampling_logp_difference/mean": 0.13950303196907043, "step": 85 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.341892808675766, "epoch": 0.13782051282051283, "grad_norm": 0.04441990703344345, "learning_rate": 1e-06, "loss": 0.0012, "step": 86 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3454471379518509, "epoch": 0.13942307692307693, "grad_norm": 0.03872222825884819, "learning_rate": 1e-06, "loss": 0.058, "step": 87 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3473415672779083, "epoch": 0.14102564102564102, "grad_norm": 0.05630237236618996, "learning_rate": 1e-06, "loss": 0.0618, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16171.0, "completions/mean_length": 9006.404296875, "completions/mean_terminated_length": 8889.2998046875, "completions/min_length": 2781.0, "completions/min_terminated_length": 2781.0, "entropy": 0.36482852697372437, "epoch": 0.14262820512820512, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0254896879196167, "learning_rate": 1e-06, "loss": -0.0307, "num_tokens": 139932018.0, "reward": 0.8607568740844727, "reward_std": 0.09075237810611725, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.955078125, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.9635905027389526, "rewards/symbolic_reward_partial_score/std": 0.18457859754562378, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.089465618133545, "sampling/importance_sampling_ratio/min": 1.3188043794798432e-06, "sampling/sampling_logp_difference/max": 13.538784980773926, "sampling/sampling_logp_difference/mean": 0.14778712391853333, "step": 89 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.36814509332180023, "epoch": 0.14423076923076922, "grad_norm": 0.039982639253139496, "learning_rate": 1e-06, "loss": 0.0261, "step": 90 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3661590963602066, "epoch": 0.14583333333333334, "grad_norm": 0.05572717636823654, "learning_rate": 1e-06, "loss": 0.0927, "step": 91 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.36387769877910614, "epoch": 0.14743589743589744, "grad_norm": 0.02685936540365219, "learning_rate": 1e-06, "loss": -0.0201, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16209.0, "completions/mean_length": 9095.279296875, "completions/mean_terminated_length": 8798.9892578125, "completions/min_length": 1840.0, "completions/min_terminated_length": 1840.0, "entropy": 0.3607036769390106, "epoch": 0.14903846153846154, "frac_reward_zero_std": 0.53125, "grad_norm": 0.04796959087252617, "learning_rate": 1e-06, "loss": 0.0477, "num_tokens": 145388049.0, "reward": 0.8146386742591858, "reward_std": 0.13105550408363342, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.888671875, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.9498372077941895, "rewards/symbolic_reward_partial_score/std": 0.20072229206562042, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0879380702972412, "sampling/importance_sampling_ratio/min": 0.0015213644364848733, "sampling/sampling_logp_difference/max": 6.488147735595703, "sampling/sampling_logp_difference/mean": 0.1456601470708847, "step": 93 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.36027221381664276, "epoch": 0.15064102564102563, "grad_norm": 0.04194071143865585, "learning_rate": 1e-06, "loss": -0.0069, "step": 94 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3594335913658142, "epoch": 0.15224358974358973, "grad_norm": 0.045048587024211884, "learning_rate": 1e-06, "loss": -0.0111, "step": 95 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.36144396662712097, "epoch": 0.15384615384615385, "grad_norm": 0.048988793045282364, "learning_rate": 1e-06, "loss": 0.0862, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16381.0, "completions/mean_length": 11043.560546875, "completions/mean_terminated_length": 10590.98046875, "completions/min_length": 2828.0, "completions/min_terminated_length": 2828.0, "entropy": 0.3313770890235901, "epoch": 0.15544871794871795, "frac_reward_zero_std": 0.40625, "grad_norm": 0.05903196707367897, "learning_rate": 1e-06, "loss": -0.046, "num_tokens": 152099648.0, "reward": 0.7212207317352295, "reward_std": 0.20257437229156494, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.765625, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.8982096910476685, "rewards/symbolic_reward_partial_score/std": 0.27255114912986755, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0802266597747803, "sampling/importance_sampling_ratio/min": 0.0013272056821733713, "sampling/sampling_logp_difference/max": 6.6246795654296875, "sampling/sampling_logp_difference/mean": 0.1339382529258728, "step": 97 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3316061198711395, "epoch": 0.15705128205128205, "grad_norm": 0.04312283918261528, "learning_rate": 1e-06, "loss": 0.027, "step": 98 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.331068217754364, "epoch": 0.15865384615384615, "grad_norm": 0.053597960621118546, "learning_rate": 1e-06, "loss": 0.1085, "step": 99 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3328063637018204, "epoch": 0.16025641025641027, "grad_norm": 0.045624740421772, "learning_rate": 1e-06, "loss": 0.0106, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16378.0, "completions/mean_length": 10175.697265625, "completions/mean_terminated_length": 9761.810546875, "completions/min_length": 3233.0, "completions/min_terminated_length": 3233.0, "entropy": 0.34128957986831665, "epoch": 0.16185897435897437, "frac_reward_zero_std": 0.53125, "grad_norm": 0.062391623854637146, "learning_rate": 1e-06, "loss": 0.085, "num_tokens": 158197061.0, "reward": 0.74693363904953, "reward_std": 0.1256483793258667, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.794921875, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.9194661378860474, "rewards/symbolic_reward_partial_score/std": 0.23865830898284912, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0831738710403442, "sampling/importance_sampling_ratio/min": 0.0017570381751284003, "sampling/sampling_logp_difference/max": 6.344125747680664, "sampling/sampling_logp_difference/mean": 0.1381148397922516, "step": 101 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34410953521728516, "epoch": 0.16346153846153846, "grad_norm": 0.05245565250515938, "learning_rate": 1e-06, "loss": 0.0154, "step": 102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3436356484889984, "epoch": 0.16506410256410256, "grad_norm": 0.027100468054413795, "learning_rate": 1e-06, "loss": -0.0417, "step": 103 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.33905932307243347, "epoch": 0.16666666666666666, "grad_norm": 0.04369059205055237, "learning_rate": 1e-06, "loss": 0.0036, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16241.0, "completions/mean_length": 8800.21484375, "completions/mean_terminated_length": 8770.474609375, "completions/min_length": 2990.0, "completions/min_terminated_length": 2990.0, "entropy": 0.3521415889263153, "epoch": 0.16826923076923078, "frac_reward_zero_std": 0.53125, "grad_norm": 0.05701710283756256, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 163514387.0, "reward": 0.8450829982757568, "reward_std": 0.11813522130250931, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9738444089889526, "rewards/symbolic_reward_partial_score/std": 0.13636134564876556, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0855627059936523, "sampling/importance_sampling_ratio/min": 0.002420509932562709, "sampling/sampling_logp_difference/max": 6.023777008056641, "sampling/sampling_logp_difference/mean": 0.14236831665039062, "step": 105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.35106146335601807, "epoch": 0.16987179487179488, "grad_norm": 0.03181541711091995, "learning_rate": 1e-06, "loss": -0.0231, "step": 106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3530445843935013, "epoch": 0.17147435897435898, "grad_norm": 0.033489957451820374, "learning_rate": 1e-06, "loss": -0.0048, "step": 107 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3535982072353363, "epoch": 0.17307692307692307, "grad_norm": 0.03859458491206169, "learning_rate": 1e-06, "loss": -0.0146, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16310.0, "completions/mean_length": 9577.984375, "completions/mean_terminated_length": 9199.0927734375, "completions/min_length": 3513.0, "completions/min_terminated_length": 3513.0, "entropy": 0.3445345163345337, "epoch": 0.17467948717948717, "frac_reward_zero_std": 0.375, "grad_norm": 0.030105717480182648, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 169308571.0, "reward": 0.7697656154632568, "reward_std": 0.15501999855041504, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.826171875, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.9278645515441895, "rewards/symbolic_reward_partial_score/std": 0.231072336435318, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0829436779022217, "sampling/importance_sampling_ratio/min": 0.0021386975422501564, "sampling/sampling_logp_difference/max": 6.147558212280273, "sampling/sampling_logp_difference/mean": 0.13814884424209595, "step": 109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.33951735496520996, "epoch": 0.1762820512820513, "grad_norm": 0.047239113599061966, "learning_rate": 1e-06, "loss": 0.0045, "step": 110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3426303267478943, "epoch": 0.1778846153846154, "grad_norm": 0.04610922932624817, "learning_rate": 1e-06, "loss": 0.0809, "step": 111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3422156274318695, "epoch": 0.1794871794871795, "grad_norm": 0.05085078626871109, "learning_rate": 1e-06, "loss": -0.0159, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16075.0, "completions/mean_length": 8959.640625, "completions/mean_terminated_length": 8856.728515625, "completions/min_length": 2911.0, "completions/min_terminated_length": 2911.0, "entropy": 0.3481294810771942, "epoch": 0.18108974358974358, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0655587688088417, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 174744371.0, "reward": 0.8303369283676147, "reward_std": 0.12149346619844437, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9591959714889526, "rewards/symbolic_reward_partial_score/std": 0.18175262212753296, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.083998441696167, "sampling/importance_sampling_ratio/min": 0.0036077979020774364, "sampling/sampling_logp_difference/max": 5.62465763092041, "sampling/sampling_logp_difference/mean": 0.13975155353546143, "step": 113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.34841732680797577, "epoch": 0.18269230769230768, "grad_norm": 0.03695600852370262, "learning_rate": 1e-06, "loss": 0.0007, "step": 114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3461028039455414, "epoch": 0.1842948717948718, "grad_norm": 0.026078850030899048, "learning_rate": 1e-06, "loss": 0.008, "step": 115 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34539441764354706, "epoch": 0.1858974358974359, "grad_norm": 0.03016047365963459, "learning_rate": 1e-06, "loss": 0.0127, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16381.0, "completions/mean_length": 9113.162109375, "completions/mean_terminated_length": 8878.619140625, "completions/min_length": 2780.0, "completions/min_terminated_length": 2780.0, "entropy": 0.35554787516593933, "epoch": 0.1875, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0360124446451664, "learning_rate": 1e-06, "loss": -0.0517, "num_tokens": 180237526.0, "reward": 0.8028459548950195, "reward_std": 0.11904940009117126, "rewards/progression_diversity/mean": -7.497695332858711e-05, "rewards/progression_diversity/std": 0.0016965348040685058, "rewards/symbolic_reward_accuracy/mean": 0.869140625, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.948291003704071, "rewards/symbolic_reward_partial_score/std": 0.19676990807056427, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0862194299697876, "sampling/importance_sampling_ratio/min": 0.002927119378000498, "sampling/sampling_logp_difference/max": 5.833736419677734, "sampling/sampling_logp_difference/mean": 0.14279107749462128, "step": 117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3518333435058594, "epoch": 0.1891025641025641, "grad_norm": 0.03996913880109787, "learning_rate": 1e-06, "loss": 0.1356, "step": 118 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.35259974002838135, "epoch": 0.1907051282051282, "grad_norm": 0.03163549676537514, "learning_rate": 1e-06, "loss": 0.0225, "step": 119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3569566011428833, "epoch": 0.19230769230769232, "grad_norm": 0.04826189577579498, "learning_rate": 1e-06, "loss": -0.0253, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16301.0, "completions/mean_length": 8850.17578125, "completions/mean_terminated_length": 8638.380859375, "completions/min_length": 3422.0, "completions/min_terminated_length": 3422.0, "entropy": 0.35906967520713806, "epoch": 0.19391025641025642, "frac_reward_zero_std": 0.65625, "grad_norm": 0.03243906795978546, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 185531216.0, "reward": 0.8294873237609863, "reward_std": 0.08104758709669113, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9687337279319763, "rewards/symbolic_reward_partial_score/std": 0.14531417191028595, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0881123542785645, "sampling/importance_sampling_ratio/min": 0.001374780316837132, "sampling/sampling_logp_difference/max": 6.589461326599121, "sampling/sampling_logp_difference/mean": 0.1460496038198471, "step": 121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.36369843780994415, "epoch": 0.1955128205128205, "grad_norm": 0.029248500242829323, "learning_rate": 1e-06, "loss": -0.0297, "step": 122 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3637174665927887, "epoch": 0.1971153846153846, "grad_norm": 0.051945000886917114, "learning_rate": 1e-06, "loss": 0.0276, "step": 123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.36110472679138184, "epoch": 0.1987179487179487, "grad_norm": 0.03304561600089073, "learning_rate": 1e-06, "loss": 0.0175, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15513.0, "completions/mean_length": 9026.537109375, "completions/mean_terminated_length": 8864.99609375, "completions/min_length": 2347.0, "completions/min_terminated_length": 2347.0, "entropy": 0.34777674078941345, "epoch": 0.20032051282051283, "frac_reward_zero_std": 0.5, "grad_norm": 0.04810767248272896, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 191083203.0, "reward": 0.8114843964576721, "reward_std": 0.1339455246925354, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9575520753860474, "rewards/symbolic_reward_partial_score/std": 0.17691098153591156, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.083823561668396, "sampling/importance_sampling_ratio/min": 0.0015166376251727343, "sampling/sampling_logp_difference/max": 6.491259574890137, "sampling/sampling_logp_difference/mean": 0.13936495780944824, "step": 125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3457389622926712, "epoch": 0.20192307692307693, "grad_norm": 0.0352606326341629, "learning_rate": 1e-06, "loss": 0.0099, "step": 126 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3449082672595978, "epoch": 0.20352564102564102, "grad_norm": 0.05728811398148537, "learning_rate": 1e-06, "loss": 0.0025, "step": 127 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3434547334909439, "epoch": 0.20512820512820512, "grad_norm": 0.0379202663898468, "learning_rate": 1e-06, "loss": 0.0225, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16315.0, "completions/mean_length": 9241.806640625, "completions/mean_terminated_length": 8996.51953125, "completions/min_length": 3618.0, "completions/min_terminated_length": 3618.0, "entropy": 0.34919285774230957, "epoch": 0.20673076923076922, "frac_reward_zero_std": 0.5, "grad_norm": 0.05845572426915169, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 196726176.0, "reward": 0.7978222966194153, "reward_std": 0.12185439467430115, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.9504231810569763, "rewards/symbolic_reward_partial_score/std": 0.1883016973733902, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0857439041137695, "sampling/importance_sampling_ratio/min": 0.0011124002048745751, "sampling/sampling_logp_difference/max": 6.801235198974609, "sampling/sampling_logp_difference/mean": 0.14204151928424835, "step": 129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3502717614173889, "epoch": 0.20833333333333334, "grad_norm": 0.03797543793916702, "learning_rate": 1e-06, "loss": 0.0419, "step": 130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.35478633642196655, "epoch": 0.20993589743589744, "grad_norm": 0.031961336731910706, "learning_rate": 1e-06, "loss": -0.0706, "step": 131 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.35218366980552673, "epoch": 0.21153846153846154, "grad_norm": 0.044474322348833084, "learning_rate": 1e-06, "loss": 0.0365, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16282.0, "completions/mean_length": 9844.837890625, "completions/mean_terminated_length": 9606.5693359375, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "entropy": 0.34503108263015747, "epoch": 0.21314102564102563, "frac_reward_zero_std": 0.375, "grad_norm": 0.03489559888839722, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 202712269.0, "reward": 0.772519588470459, "reward_std": 0.16101190447807312, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.9402994513511658, "rewards/symbolic_reward_partial_score/std": 0.20471788942813873, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0832815170288086, "sampling/importance_sampling_ratio/min": 0.0005271582049317658, "sampling/sampling_logp_difference/max": 7.548009872436523, "sampling/sampling_logp_difference/mean": 0.13840848207473755, "step": 133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.34120801091194153, "epoch": 0.21474358974358973, "grad_norm": 0.035141848027706146, "learning_rate": 1e-06, "loss": 0.0249, "step": 134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.34197595715522766, "epoch": 0.21634615384615385, "grad_norm": 0.05070476606488228, "learning_rate": 1e-06, "loss": 0.0675, "step": 135 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3441559672355652, "epoch": 0.21794871794871795, "grad_norm": 0.0495283305644989, "learning_rate": 1e-06, "loss": -0.0592, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16340.0, "completions/mean_length": 9352.974609375, "completions/mean_terminated_length": 9269.603515625, "completions/min_length": 3670.0, "completions/min_terminated_length": 3670.0, "entropy": 0.34940995275974274, "epoch": 0.21955128205128205, "frac_reward_zero_std": 0.5625, "grad_norm": 0.05279716104269028, "learning_rate": 1e-06, "loss": 0.0594, "num_tokens": 208332960.0, "reward": 0.8177881240844727, "reward_std": 0.11742972582578659, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9642415046691895, "rewards/symbolic_reward_partial_score/std": 0.15622878074645996, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.08543062210083, "sampling/importance_sampling_ratio/min": 0.002090509980916977, "sampling/sampling_logp_difference/max": 6.170347213745117, "sampling/sampling_logp_difference/mean": 0.14151644706726074, "step": 137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3540169894695282, "epoch": 0.22115384615384615, "grad_norm": 0.051732927560806274, "learning_rate": 1e-06, "loss": 0.0098, "step": 138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35277238488197327, "epoch": 0.22275641025641027, "grad_norm": 0.028537962585687637, "learning_rate": 1e-06, "loss": -0.0018, "step": 139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.35403163731098175, "epoch": 0.22435897435897437, "grad_norm": 0.04318862035870552, "learning_rate": 1e-06, "loss": -0.0283, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16134.0, "completions/mean_length": 8975.630859375, "completions/mean_terminated_length": 8917.296875, "completions/min_length": 3650.0, "completions/min_terminated_length": 3650.0, "entropy": 0.3518424928188324, "epoch": 0.22596153846153846, "frac_reward_zero_std": 0.625, "grad_norm": 0.02684016153216362, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 213772099.0, "reward": 0.8544141054153442, "reward_std": 0.09175492078065872, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9795572757720947, "rewards/symbolic_reward_partial_score/std": 0.11998182535171509, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0846960544586182, "sampling/importance_sampling_ratio/min": 0.0016334766987711191, "sampling/sampling_logp_difference/max": 6.417044639587402, "sampling/sampling_logp_difference/mean": 0.14141295850276947, "step": 141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3497009426355362, "epoch": 0.22756410256410256, "grad_norm": 0.01540279109030962, "learning_rate": 1e-06, "loss": -0.0051, "step": 142 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3526155948638916, "epoch": 0.22916666666666666, "grad_norm": 0.03089858405292034, "learning_rate": 1e-06, "loss": 0.0383, "step": 143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3568020164966583, "epoch": 0.23076923076923078, "grad_norm": 0.01360280066728592, "learning_rate": 1e-06, "loss": -0.0418, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16383.0, "completions/mean_length": 9131.513671875, "completions/mean_terminated_length": 8912.625, "completions/min_length": 3696.0, "completions/min_terminated_length": 3696.0, "entropy": 0.3563638925552368, "epoch": 0.23237179487179488, "frac_reward_zero_std": 0.53125, "grad_norm": 0.03985019773244858, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 219298058.0, "reward": 0.8047363758087158, "reward_std": 0.10955963283777237, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9611002206802368, "rewards/symbolic_reward_partial_score/std": 0.15853257477283478, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.086620807647705, "sampling/importance_sampling_ratio/min": 0.0015216735191643238, "sampling/sampling_logp_difference/max": 6.487944602966309, "sampling/sampling_logp_difference/mean": 0.1434181034564972, "step": 145 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.35702893137931824, "epoch": 0.23397435897435898, "grad_norm": 0.03723561018705368, "learning_rate": 1e-06, "loss": 0.0668, "step": 146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35949432849884033, "epoch": 0.23557692307692307, "grad_norm": 0.045161064714193344, "learning_rate": 1e-06, "loss": -0.0593, "step": 147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3579980880022049, "epoch": 0.23717948717948717, "grad_norm": 0.02439435012638569, "learning_rate": 1e-06, "loss": -0.004, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16285.0, "completions/mean_length": 8998.2890625, "completions/mean_terminated_length": 8851.1640625, "completions/min_length": 3320.0, "completions/min_terminated_length": 3320.0, "entropy": 0.35286062955856323, "epoch": 0.2387820512820513, "frac_reward_zero_std": 0.59375, "grad_norm": 0.029027221724390984, "learning_rate": 1e-06, "loss": -0.0633, "num_tokens": 224763566.0, "reward": 0.8301221132278442, "reward_std": 0.11094683408737183, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9682454466819763, "rewards/symbolic_reward_partial_score/std": 0.14932651817798615, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0849233865737915, "sampling/importance_sampling_ratio/min": 0.0024807597510516644, "sampling/sampling_logp_difference/max": 5.999190330505371, "sampling/sampling_logp_difference/mean": 0.1413499414920807, "step": 149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35287538170814514, "epoch": 0.2403846153846154, "grad_norm": 0.046236295253038406, "learning_rate": 1e-06, "loss": 0.0023, "step": 150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.35205814242362976, "epoch": 0.2419871794871795, "grad_norm": 0.031354814767837524, "learning_rate": 1e-06, "loss": -0.0024, "step": 151 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.35166609287261963, "epoch": 0.24358974358974358, "grad_norm": 0.032667215913534164, "learning_rate": 1e-06, "loss": 0.1197, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16213.0, "completions/mean_length": 9591.09765625, "completions/mean_terminated_length": 9455.78125, "completions/min_length": 3713.0, "completions/min_terminated_length": 3713.0, "entropy": 0.35303913056850433, "epoch": 0.24519230769230768, "frac_reward_zero_std": 0.4375, "grad_norm": 0.05372530594468117, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 230598176.0, "reward": 0.8339111804962158, "reward_std": 0.13246983289718628, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9698079824447632, "rewards/symbolic_reward_partial_score/std": 0.1492735594511032, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0853214263916016, "sampling/importance_sampling_ratio/min": 0.0011984565062448382, "sampling/sampling_logp_difference/max": 6.726720809936523, "sampling/sampling_logp_difference/mean": 0.14169231057167053, "step": 153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.35560114681720734, "epoch": 0.2467948717948718, "grad_norm": 0.044723186641931534, "learning_rate": 1e-06, "loss": 0.0532, "step": 154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35691170394420624, "epoch": 0.2483974358974359, "grad_norm": 0.04241631180047989, "learning_rate": 1e-06, "loss": -0.0167, "step": 155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3529065102338791, "epoch": 0.25, "grad_norm": 0.03733038529753685, "learning_rate": 1e-06, "loss": 0.0303, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16320.0, "completions/mean_length": 9631.318359375, "completions/mean_terminated_length": 9356.818359375, "completions/min_length": 3819.0, "completions/min_terminated_length": 3819.0, "entropy": 0.3581730127334595, "epoch": 0.2516025641025641, "frac_reward_zero_std": 0.59375, "grad_norm": 0.03136492148041725, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 236435123.0, "reward": 0.7973340153694153, "reward_std": 0.10428421199321747, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.857421875, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.9553059935569763, "rewards/symbolic_reward_partial_score/std": 0.1736644208431244, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0863444805145264, "sampling/importance_sampling_ratio/min": 0.0019276229431852698, "sampling/sampling_logp_difference/max": 6.251467704772949, "sampling/sampling_logp_difference/mean": 0.1433483511209488, "step": 157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3594221770763397, "epoch": 0.2532051282051282, "grad_norm": 0.03519349545240402, "learning_rate": 1e-06, "loss": 0.0227, "step": 158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35834231972694397, "epoch": 0.2548076923076923, "grad_norm": 0.04243331775069237, "learning_rate": 1e-06, "loss": -0.034, "step": 159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.36326245963573456, "epoch": 0.2564102564102564, "grad_norm": 0.02983006276190281, "learning_rate": 1e-06, "loss": 0.0232, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16326.0, "completions/mean_length": 8806.05859375, "completions/mean_terminated_length": 8670.46875, "completions/min_length": 2932.0, "completions/min_terminated_length": 2932.0, "entropy": 0.36841265857219696, "epoch": 0.25801282051282054, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0387740395963192, "learning_rate": 1e-06, "loss": -0.0436, "num_tokens": 241738817.0, "reward": 0.8450244665145874, "reward_std": 0.0741325095295906, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9703938961029053, "rewards/symbolic_reward_partial_score/std": 0.14939291775226593, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0885682106018066, "sampling/importance_sampling_ratio/min": 0.001623596646822989, "sampling/sampling_logp_difference/max": 6.423111438751221, "sampling/sampling_logp_difference/mean": 0.1468866765499115, "step": 161 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36578311026096344, "epoch": 0.25961538461538464, "grad_norm": 0.03768492862582207, "learning_rate": 1e-06, "loss": 0.0457, "step": 162 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36687426269054413, "epoch": 0.26121794871794873, "grad_norm": 0.02925664372742176, "learning_rate": 1e-06, "loss": -0.0465, "step": 163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3688085824251175, "epoch": 0.26282051282051283, "grad_norm": 0.0425320640206337, "learning_rate": 1e-06, "loss": 0.0604, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16051.0, "completions/mean_length": 8450.478515625, "completions/mean_terminated_length": 8419.3671875, "completions/min_length": 2066.0, "completions/min_terminated_length": 2066.0, "entropy": 0.37353581190109253, "epoch": 0.2644230769230769, "frac_reward_zero_std": 0.78125, "grad_norm": 0.013331255875527859, "learning_rate": 1e-06, "loss": -0.0258, "num_tokens": 246766070.0, "reward": 0.8802295327186584, "reward_std": 0.044710736721754074, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.970703125, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.9939941167831421, "rewards/symbolic_reward_partial_score/std": 0.051585350185632706, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0899767875671387, "sampling/importance_sampling_ratio/min": 0.00048812784370966256, "sampling/sampling_logp_difference/max": 7.624933242797852, "sampling/sampling_logp_difference/mean": 0.14946144819259644, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.37515178322792053, "epoch": 0.266025641025641, "grad_norm": 0.04269331321120262, "learning_rate": 1e-06, "loss": 0.0658, "step": 166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3724121153354645, "epoch": 0.2676282051282051, "grad_norm": 0.011194121092557907, "learning_rate": 1e-06, "loss": -0.044, "step": 167 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.37359511852264404, "epoch": 0.2692307692307692, "grad_norm": 0.03109007515013218, "learning_rate": 1e-06, "loss": 0.0335, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15973.0, "completions/mean_length": 8635.6953125, "completions/mean_terminated_length": 8543.818359375, "completions/min_length": 2676.0, "completions/min_terminated_length": 2676.0, "entropy": 0.35454364120960236, "epoch": 0.2708333333333333, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04293088614940643, "learning_rate": 1e-06, "loss": -0.0246, "num_tokens": 252068858.0, "reward": 0.8351123332977295, "reward_std": 0.11048781871795654, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9712077379226685, "rewards/symbolic_reward_partial_score/std": 0.13899967074394226, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0848219394683838, "sampling/importance_sampling_ratio/min": 0.0010735776741057634, "sampling/sampling_logp_difference/max": 6.836758613586426, "sampling/sampling_logp_difference/mean": 0.14154580235481262, "step": 169 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.35268211364746094, "epoch": 0.2724358974358974, "grad_norm": 0.02954338677227497, "learning_rate": 1e-06, "loss": 0.0129, "step": 170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3522224724292755, "epoch": 0.27403846153846156, "grad_norm": 0.0466536320745945, "learning_rate": 1e-06, "loss": 0.0188, "step": 171 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35762760043144226, "epoch": 0.27564102564102566, "grad_norm": 0.027192890644073486, "learning_rate": 1e-06, "loss": 0.0065, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15698.0, "completions/max_terminated_length": 15698.0, "completions/mean_length": 8384.9296875, "completions/mean_terminated_length": 8384.9296875, "completions/min_length": 2842.0, "completions/min_terminated_length": 2842.0, "entropy": 0.3587222099304199, "epoch": 0.27724358974358976, "frac_reward_zero_std": 0.625, "grad_norm": 0.05196116492152214, "learning_rate": 1e-06, "loss": 0.0556, "num_tokens": 257190646.0, "reward": 0.8613916635513306, "reward_std": 0.08774565160274506, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.947265625, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.9767740964889526, "rewards/symbolic_reward_partial_score/std": 0.13959519565105438, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0853793621063232, "sampling/importance_sampling_ratio/min": 0.0006424426101148129, "sampling/sampling_logp_difference/max": 7.35023307800293, "sampling/sampling_logp_difference/mean": 0.14276909828186035, "step": 173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35669371485710144, "epoch": 0.27884615384615385, "grad_norm": 0.017249129712581635, "learning_rate": 1e-06, "loss": 0.0012, "step": 174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3552614599466324, "epoch": 0.28044871794871795, "grad_norm": 0.02084186114370823, "learning_rate": 1e-06, "loss": 0.0242, "step": 175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.356476366519928, "epoch": 0.28205128205128205, "grad_norm": 0.014071096666157246, "learning_rate": 1e-06, "loss": -0.0461, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16124.0, "completions/mean_length": 7951.38671875, "completions/mean_terminated_length": 7901.68603515625, "completions/min_length": 2005.0, "completions/min_terminated_length": 2005.0, "entropy": 0.35373421013355255, "epoch": 0.28365384615384615, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03797481581568718, "learning_rate": 1e-06, "loss": 0.0788, "num_tokens": 262111308.0, "reward": 0.8627225160598755, "reward_std": 0.07896904647350311, "rewards/progression_diversity/mean": -0.00021306316193658859, "rewards/progression_diversity/std": 0.004821069072932005, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9792643785476685, "rewards/symbolic_reward_partial_score/std": 0.12813684344291687, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0842299461364746, "sampling/importance_sampling_ratio/min": 0.0023363421205431223, "sampling/sampling_logp_difference/max": 6.059168815612793, "sampling/sampling_logp_difference/mean": 0.14104613661766052, "step": 177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3539777994155884, "epoch": 0.28525641025641024, "grad_norm": 0.028011592105031013, "learning_rate": 1e-06, "loss": 0.0291, "step": 178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3513519614934921, "epoch": 0.28685897435897434, "grad_norm": 0.016913555562496185, "learning_rate": 1e-06, "loss": -0.06, "step": 179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35203738510608673, "epoch": 0.28846153846153844, "grad_norm": 0.040348220616579056, "learning_rate": 1e-06, "loss": 0.0125, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16168.0, "completions/mean_length": 9183.673828125, "completions/mean_terminated_length": 9069.3837890625, "completions/min_length": 3320.0, "completions/min_terminated_length": 3320.0, "entropy": 0.3340272307395935, "epoch": 0.2900641025641026, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03144938498735428, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 267754325.0, "reward": 0.8455274105072021, "reward_std": 0.100336454808712, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.9759765863418579, "rewards/symbolic_reward_partial_score/std": 0.12853872776031494, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0812079906463623, "sampling/importance_sampling_ratio/min": 0.0013913201401010156, "sampling/sampling_logp_difference/max": 6.577502250671387, "sampling/sampling_logp_difference/mean": 0.13597433269023895, "step": 181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.33848777413368225, "epoch": 0.2916666666666667, "grad_norm": 0.03411485627293587, "learning_rate": 1e-06, "loss": 0.0718, "step": 182 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3445683866739273, "epoch": 0.2932692307692308, "grad_norm": 0.020242290571331978, "learning_rate": 1e-06, "loss": 0.0077, "step": 183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.342714786529541, "epoch": 0.2948717948717949, "grad_norm": 0.03483344614505768, "learning_rate": 1e-06, "loss": -0.0934, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15912.0, "completions/mean_length": 8466.994140625, "completions/mean_terminated_length": 8341.328125, "completions/min_length": 2411.0, "completions/min_terminated_length": 2411.0, "entropy": 0.34684619307518005, "epoch": 0.296474358974359, "frac_reward_zero_std": 0.625, "grad_norm": 0.050103284418582916, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 273006082.0, "reward": 0.8299316763877869, "reward_std": 0.0870482325553894, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9708658456802368, "rewards/symbolic_reward_partial_score/std": 0.13320381939411163, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0825543403625488, "sampling/importance_sampling_ratio/min": 0.00306839682161808, "sampling/sampling_logp_difference/max": 5.786600112915039, "sampling/sampling_logp_difference/mean": 0.13881386816501617, "step": 185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34666164219379425, "epoch": 0.2980769230769231, "grad_norm": 0.021181946620345116, "learning_rate": 1e-06, "loss": -0.0278, "step": 186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.34807461500167847, "epoch": 0.29967948717948717, "grad_norm": 0.030283570289611816, "learning_rate": 1e-06, "loss": -0.017, "step": 187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3434204161167145, "epoch": 0.30128205128205127, "grad_norm": 0.026113567873835564, "learning_rate": 1e-06, "loss": 0.0155, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16234.0, "completions/mean_length": 8863.322265625, "completions/mean_terminated_length": 8743.947265625, "completions/min_length": 3721.0, "completions/min_terminated_length": 3721.0, "entropy": 0.35359667241573334, "epoch": 0.30288461538461536, "frac_reward_zero_std": 0.65625, "grad_norm": 0.026801202446222305, "learning_rate": 1e-06, "loss": -0.0552, "num_tokens": 278416583.0, "reward": 0.8386768102645874, "reward_std": 0.08783165365457535, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9798339605331421, "rewards/symbolic_reward_partial_score/std": 0.10151783376932144, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.084189772605896, "sampling/importance_sampling_ratio/min": 0.0016244198195636272, "sampling/sampling_logp_difference/max": 6.422604560852051, "sampling/sampling_logp_difference/mean": 0.14125624299049377, "step": 189 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35646533966064453, "epoch": 0.30448717948717946, "grad_norm": 0.029226401820778847, "learning_rate": 1e-06, "loss": 0.0248, "step": 190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3521941602230072, "epoch": 0.3060897435897436, "grad_norm": 0.02918200194835663, "learning_rate": 1e-06, "loss": 0.0288, "step": 191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.35472485423088074, "epoch": 0.3076923076923077, "grad_norm": 0.028059527277946472, "learning_rate": 1e-06, "loss": 0.0176, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16299.0, "completions/mean_length": 9452.720703125, "completions/mean_terminated_length": 9425.5400390625, "completions/min_length": 2998.0, "completions/min_terminated_length": 2998.0, "entropy": 0.33849048614501953, "epoch": 0.3092948717948718, "frac_reward_zero_std": 0.59375, "grad_norm": 0.04876245558261871, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 284235368.0, "reward": 0.8463916778564453, "reward_std": 0.09903424978256226, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9782063961029053, "rewards/symbolic_reward_partial_score/std": 0.12213291972875595, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805222988128662, "sampling/importance_sampling_ratio/min": 0.002695305971428752, "sampling/sampling_logp_difference/max": 5.916243553161621, "sampling/sampling_logp_difference/mean": 0.13528338074684143, "step": 193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3376816213130951, "epoch": 0.3108974358974359, "grad_norm": 0.02816428802907467, "learning_rate": 1e-06, "loss": -0.047, "step": 194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.33794452250003815, "epoch": 0.3125, "grad_norm": 0.05909707769751549, "learning_rate": 1e-06, "loss": 0.0759, "step": 195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3367193639278412, "epoch": 0.3141025641025641, "grad_norm": 0.038326241075992584, "learning_rate": 1e-06, "loss": -0.0225, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16179.0, "completions/mean_length": 8295.279296875, "completions/mean_terminated_length": 8231.5888671875, "completions/min_length": 2244.0, "completions/min_terminated_length": 2244.0, "entropy": 0.34846246242523193, "epoch": 0.3157051282051282, "frac_reward_zero_std": 0.59375, "grad_norm": 0.03890756517648697, "learning_rate": 1e-06, "loss": -0.0259, "num_tokens": 289408183.0, "reward": 0.8529297113418579, "reward_std": 0.10290726274251938, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.970703125, "rewards/symbolic_reward_partial_score/std": 0.15238547325134277, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0831495523452759, "sampling/importance_sampling_ratio/min": 0.0035951808094978333, "sampling/sampling_logp_difference/max": 5.6281609535217285, "sampling/sampling_logp_difference/mean": 0.13981285691261292, "step": 197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35014402866363525, "epoch": 0.3173076923076923, "grad_norm": 0.022344104945659637, "learning_rate": 1e-06, "loss": -0.0064, "step": 198 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3466189503669739, "epoch": 0.3189102564102564, "grad_norm": 0.039988093078136444, "learning_rate": 1e-06, "loss": 0.0364, "step": 199 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3481054753065109, "epoch": 0.32051282051282054, "grad_norm": 0.032751064747571945, "learning_rate": 1e-06, "loss": 0.0125, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16352.0, "completions/mean_length": 8901.630859375, "completions/mean_terminated_length": 8872.2890625, "completions/min_length": 2562.0, "completions/min_terminated_length": 2562.0, "entropy": 0.3477822542190552, "epoch": 0.32211538461538464, "frac_reward_zero_std": 0.625, "grad_norm": 0.030911585316061974, "learning_rate": 1e-06, "loss": -0.0368, "num_tokens": 294884506.0, "reward": 0.8583984375, "reward_std": 0.08643007278442383, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.9876302480697632, "rewards/symbolic_reward_partial_score/std": 0.07567723095417023, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.082869291305542, "sampling/importance_sampling_ratio/min": 0.001809673965908587, "sampling/sampling_logp_difference/max": 6.314608573913574, "sampling/sampling_logp_difference/mean": 0.13899391889572144, "step": 201 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3481653928756714, "epoch": 0.32371794871794873, "grad_norm": 0.01745874620974064, "learning_rate": 1e-06, "loss": -0.0037, "step": 202 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.34481124579906464, "epoch": 0.32532051282051283, "grad_norm": 0.05091443657875061, "learning_rate": 1e-06, "loss": 0.0307, "step": 203 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34859275817871094, "epoch": 0.3269230769230769, "grad_norm": 0.032610710710287094, "learning_rate": 1e-06, "loss": 0.0371, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16128.0, "completions/mean_length": 8127.3203125, "completions/mean_terminated_length": 8029.41552734375, "completions/min_length": 1918.0, "completions/min_terminated_length": 1918.0, "entropy": 0.35380254685878754, "epoch": 0.328525641025641, "frac_reward_zero_std": 0.6875, "grad_norm": 0.01934434100985527, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 299968126.0, "reward": 0.8516113758087158, "reward_std": 0.07474110275506973, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9786783456802368, "rewards/symbolic_reward_partial_score/std": 0.1169460341334343, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.084766149520874, "sampling/importance_sampling_ratio/min": 0.0036111027002334595, "sampling/sampling_logp_difference/max": 5.62374210357666, "sampling/sampling_logp_difference/mean": 0.14199641346931458, "step": 205 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.35594668984413147, "epoch": 0.3301282051282051, "grad_norm": 0.014267152175307274, "learning_rate": 1e-06, "loss": 0.0036, "step": 206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3535865247249603, "epoch": 0.3317307692307692, "grad_norm": 0.04129806533455849, "learning_rate": 1e-06, "loss": 0.0594, "step": 207 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35314691066741943, "epoch": 0.3333333333333333, "grad_norm": 0.014989838004112244, "learning_rate": 1e-06, "loss": -0.0366, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16291.0, "completions/mean_length": 8222.17578125, "completions/mean_terminated_length": 8206.203125, "completions/min_length": 2499.0, "completions/min_terminated_length": 2499.0, "entropy": 0.34904006123542786, "epoch": 0.3349358974358974, "frac_reward_zero_std": 0.65625, "grad_norm": 0.03350500762462616, "learning_rate": 1e-06, "loss": -0.0327, "num_tokens": 305037448.0, "reward": 0.8797119855880737, "reward_std": 0.07099881768226624, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.987060546875, "rewards/symbolic_reward_partial_score/std": 0.10216094553470612, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0828568935394287, "sampling/importance_sampling_ratio/min": 0.0028825579211115837, "sampling/sampling_logp_difference/max": 5.849077224731445, "sampling/sampling_logp_difference/mean": 0.13946111500263214, "step": 209 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.34604765474796295, "epoch": 0.33653846153846156, "grad_norm": 0.029328972101211548, "learning_rate": 1e-06, "loss": 0.006, "step": 210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3496543616056442, "epoch": 0.33814102564102566, "grad_norm": 0.03172793239355087, "learning_rate": 1e-06, "loss": -0.0184, "step": 211 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3500349521636963, "epoch": 0.33974358974358976, "grad_norm": 0.06902319937944412, "learning_rate": 1e-06, "loss": 0.0593, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16100.0, "completions/mean_length": 8706.89453125, "completions/mean_terminated_length": 8553.96484375, "completions/min_length": 3071.0, "completions/min_terminated_length": 3071.0, "entropy": 0.34992852807044983, "epoch": 0.34134615384615385, "frac_reward_zero_std": 0.65625, "grad_norm": 0.05105890706181526, "learning_rate": 1e-06, "loss": 0.0546, "num_tokens": 310384962.0, "reward": 0.8485401272773743, "reward_std": 0.0741346925497055, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9834147095680237, "rewards/symbolic_reward_partial_score/std": 0.09006182104349136, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0833783149719238, "sampling/importance_sampling_ratio/min": 0.0013107128906995058, "sampling/sampling_logp_difference/max": 6.637184143066406, "sampling/sampling_logp_difference/mean": 0.14013588428497314, "step": 213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.34734752774238586, "epoch": 0.34294871794871795, "grad_norm": 0.02068176679313183, "learning_rate": 1e-06, "loss": -0.014, "step": 214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3516520708799362, "epoch": 0.34455128205128205, "grad_norm": 0.02231115661561489, "learning_rate": 1e-06, "loss": -0.0081, "step": 215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34688548743724823, "epoch": 0.34615384615384615, "grad_norm": 0.031053557991981506, "learning_rate": 1e-06, "loss": -0.0059, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16089.0, "completions/mean_length": 8752.5390625, "completions/mean_terminated_length": 8615.9912109375, "completions/min_length": 2857.0, "completions/min_terminated_length": 2857.0, "entropy": 0.34852510690689087, "epoch": 0.34775641025641024, "frac_reward_zero_std": 0.625, "grad_norm": 0.03795945271849632, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 315758550.0, "reward": 0.8486670255661011, "reward_std": 0.07713688910007477, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.9870930910110474, "rewards/symbolic_reward_partial_score/std": 0.07342680543661118, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0828152894973755, "sampling/importance_sampling_ratio/min": 0.0008049794123508036, "sampling/sampling_logp_difference/max": 7.124693870544434, "sampling/sampling_logp_difference/mean": 0.13945353031158447, "step": 217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.348160982131958, "epoch": 0.34935897435897434, "grad_norm": 0.03552406281232834, "learning_rate": 1e-06, "loss": -0.0051, "step": 218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.345045730471611, "epoch": 0.35096153846153844, "grad_norm": 0.030356399714946747, "learning_rate": 1e-06, "loss": -0.0355, "step": 219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3462676405906677, "epoch": 0.3525641025641026, "grad_norm": 0.04316132143139839, "learning_rate": 1e-06, "loss": 0.0971, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15953.0, "completions/mean_length": 9123.447265625, "completions/mean_terminated_length": 9080.654296875, "completions/min_length": 2684.0, "completions/min_terminated_length": 2684.0, "entropy": 0.3451053500175476, "epoch": 0.3541666666666667, "frac_reward_zero_std": 0.625, "grad_norm": 0.035643648356199265, "learning_rate": 1e-06, "loss": -0.0295, "num_tokens": 321322331.0, "reward": 0.8746826648712158, "reward_std": 0.08143013715744019, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.966796875, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.9839681386947632, "rewards/symbolic_reward_partial_score/std": 0.11741097271442413, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0823302268981934, "sampling/importance_sampling_ratio/min": 0.0023029057774692774, "sampling/sampling_logp_difference/max": 6.073583602905273, "sampling/sampling_logp_difference/mean": 0.13817697763442993, "step": 221 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3439979702234268, "epoch": 0.3557692307692308, "grad_norm": 0.010124808177351952, "learning_rate": 1e-06, "loss": -0.0391, "step": 222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3439791202545166, "epoch": 0.3573717948717949, "grad_norm": 0.04620878025889397, "learning_rate": 1e-06, "loss": 0.0275, "step": 223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.34597405791282654, "epoch": 0.358974358974359, "grad_norm": 0.033160168677568436, "learning_rate": 1e-06, "loss": 0.0352, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16352.0, "completions/mean_length": 9824.076171875, "completions/mean_terminated_length": 9759.3828125, "completions/min_length": 3691.0, "completions/min_terminated_length": 3691.0, "entropy": 0.34894634783267975, "epoch": 0.3605769230769231, "frac_reward_zero_std": 0.5625, "grad_norm": 0.03732907027006149, "learning_rate": 1e-06, "loss": -0.0695, "num_tokens": 327177986.0, "reward": 0.8497949242591858, "reward_std": 0.10570286214351654, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9765299558639526, "rewards/symbolic_reward_partial_score/std": 0.1241905465722084, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0834449529647827, "sampling/importance_sampling_ratio/min": 1.91979356856109e-08, "sampling/sampling_logp_difference/max": 17.768463134765625, "sampling/sampling_logp_difference/mean": 0.14005614817142487, "step": 225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3486892879009247, "epoch": 0.36217948717948717, "grad_norm": 0.031979870051145554, "learning_rate": 1e-06, "loss": -0.0316, "step": 226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3491098880767822, "epoch": 0.36378205128205127, "grad_norm": 0.033694539219141006, "learning_rate": 1e-06, "loss": 0.0537, "step": 227 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3520296514034271, "epoch": 0.36538461538461536, "grad_norm": 0.048576388508081436, "learning_rate": 1e-06, "loss": 0.0723, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16232.0, "completions/mean_length": 8509.306640625, "completions/mean_terminated_length": 8368.4072265625, "completions/min_length": 2210.0, "completions/min_terminated_length": 2210.0, "entropy": 0.36283712089061737, "epoch": 0.36698717948717946, "frac_reward_zero_std": 0.75, "grad_norm": 0.01633063144981861, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 332317407.0, "reward": 0.8684619665145874, "reward_std": 0.04608375206589699, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9821126461029053, "rewards/symbolic_reward_partial_score/std": 0.11883527040481567, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0863630771636963, "sampling/importance_sampling_ratio/min": 0.002568704541772604, "sampling/sampling_logp_difference/max": 5.964353561401367, "sampling/sampling_logp_difference/mean": 0.1453000009059906, "step": 229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3623500317335129, "epoch": 0.3685897435897436, "grad_norm": 0.0336209200322628, "learning_rate": 1e-06, "loss": -0.0133, "step": 230 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.363225594162941, "epoch": 0.3701923076923077, "grad_norm": 0.04329166188836098, "learning_rate": 1e-06, "loss": 0.0424, "step": 231 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3680781126022339, "epoch": 0.3717948717948718, "grad_norm": 0.037873681634664536, "learning_rate": 1e-06, "loss": -0.0225, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16355.0, "completions/mean_length": 9007.216796875, "completions/mean_terminated_length": 8904.96484375, "completions/min_length": 2843.0, "completions/min_terminated_length": 2843.0, "entropy": 0.34392890334129333, "epoch": 0.3733974358974359, "frac_reward_zero_std": 0.59375, "grad_norm": 0.03418034315109253, "learning_rate": 1e-06, "loss": -0.0263, "num_tokens": 337802526.0, "reward": 0.8667529821395874, "reward_std": 0.08569878339767456, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.955078125, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.9822753667831421, "rewards/symbolic_reward_partial_score/std": 0.11924097687005997, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0817949771881104, "sampling/importance_sampling_ratio/min": 1.5144179243975486e-08, "sampling/sampling_logp_difference/max": 18.00564956665039, "sampling/sampling_logp_difference/mean": 0.1374126374721527, "step": 233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.341513454914093, "epoch": 0.375, "grad_norm": 0.013998467475175858, "learning_rate": 1e-06, "loss": -0.0165, "step": 234 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3413054198026657, "epoch": 0.3766025641025641, "grad_norm": 0.02814164012670517, "learning_rate": 1e-06, "loss": 0.0273, "step": 235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.34441515803337097, "epoch": 0.3782051282051282, "grad_norm": 0.02532966062426567, "learning_rate": 1e-06, "loss": 0.0409, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16097.0, "completions/mean_length": 8402.685546875, "completions/mean_terminated_length": 8387.06640625, "completions/min_length": 2394.0, "completions/min_terminated_length": 2394.0, "entropy": 0.35238178074359894, "epoch": 0.3798076923076923, "frac_reward_zero_std": 0.6875, "grad_norm": 0.04017337039113045, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 342976333.0, "reward": 0.8732129335403442, "reward_std": 0.06924398243427277, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9855793714523315, "rewards/symbolic_reward_partial_score/std": 0.09895546734333038, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0842185020446777, "sampling/importance_sampling_ratio/min": 0.0017609870992600918, "sampling/sampling_logp_difference/max": 6.341880798339844, "sampling/sampling_logp_difference/mean": 0.14145317673683167, "step": 237 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35770200192928314, "epoch": 0.3814102564102564, "grad_norm": 0.04241131991147995, "learning_rate": 1e-06, "loss": 0.0155, "step": 238 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3542347550392151, "epoch": 0.38301282051282054, "grad_norm": 0.036281999200582504, "learning_rate": 1e-06, "loss": -0.0087, "step": 239 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3547066003084183, "epoch": 0.38461538461538464, "grad_norm": 0.013339616358280182, "learning_rate": 1e-06, "loss": -0.0054, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16380.0, "completions/mean_length": 8927.443359375, "completions/mean_terminated_length": 8794.025390625, "completions/min_length": 2976.0, "completions/min_terminated_length": 2976.0, "entropy": 0.3498099446296692, "epoch": 0.38621794871794873, "frac_reward_zero_std": 0.71875, "grad_norm": 0.05310158431529999, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 348376192.0, "reward": 0.8511426448822021, "reward_std": 0.06999102979898453, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9797200560569763, "rewards/symbolic_reward_partial_score/std": 0.11565099656581879, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0826332569122314, "sampling/importance_sampling_ratio/min": 0.001733652432449162, "sampling/sampling_logp_difference/max": 6.357524871826172, "sampling/sampling_logp_difference/mean": 0.13907378911972046, "step": 241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35080140829086304, "epoch": 0.38782051282051283, "grad_norm": 0.022202912718057632, "learning_rate": 1e-06, "loss": -0.0317, "step": 242 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3477839380502701, "epoch": 0.3894230769230769, "grad_norm": 0.036365654319524765, "learning_rate": 1e-06, "loss": 0.0201, "step": 243 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3487197905778885, "epoch": 0.391025641025641, "grad_norm": 0.021586695685982704, "learning_rate": 1e-06, "loss": -0.0049, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16030.0, "completions/mean_length": 9297.01953125, "completions/mean_terminated_length": 9269.2275390625, "completions/min_length": 3400.0, "completions/min_terminated_length": 3400.0, "entropy": 0.33930741250514984, "epoch": 0.3926282051282051, "frac_reward_zero_std": 0.46875, "grad_norm": 0.04877723380923271, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 354082730.0, "reward": 0.8371142745018005, "reward_std": 0.1299927830696106, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9713704586029053, "rewards/symbolic_reward_partial_score/std": 0.13531623780727386, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0807551145553589, "sampling/importance_sampling_ratio/min": 0.0020373817533254623, "sampling/sampling_logp_difference/max": 6.196089744567871, "sampling/sampling_logp_difference/mean": 0.13597947359085083, "step": 245 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.34164106845855713, "epoch": 0.3942307692307692, "grad_norm": 0.05906060338020325, "learning_rate": 1e-06, "loss": 0.0228, "step": 246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3417434096336365, "epoch": 0.3958333333333333, "grad_norm": 0.05268029868602753, "learning_rate": 1e-06, "loss": -0.0033, "step": 247 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.34191256761550903, "epoch": 0.3974358974358974, "grad_norm": 0.036686696112155914, "learning_rate": 1e-06, "loss": -0.0155, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16381.0, "completions/mean_length": 8782.435546875, "completions/mean_terminated_length": 8752.6259765625, "completions/min_length": 2203.0, "completions/min_terminated_length": 2203.0, "entropy": 0.34532971680164337, "epoch": 0.39903846153846156, "frac_reward_zero_std": 0.65625, "grad_norm": 0.044723521918058395, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 359442553.0, "reward": 0.8644434213638306, "reward_std": 0.07359655201435089, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9453125, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.991503894329071, "rewards/symbolic_reward_partial_score/std": 0.055232565850019455, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0815048217773438, "sampling/importance_sampling_ratio/min": 0.003359792288392782, "sampling/sampling_logp_difference/max": 5.695876121520996, "sampling/sampling_logp_difference/mean": 0.1374778151512146, "step": 249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3445995897054672, "epoch": 0.40064102564102566, "grad_norm": 0.03639920800924301, "learning_rate": 1e-06, "loss": -0.0067, "step": 250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3449539989233017, "epoch": 0.40224358974358976, "grad_norm": 0.017649181187152863, "learning_rate": 1e-06, "loss": 0.0334, "step": 251 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3447718918323517, "epoch": 0.40384615384615385, "grad_norm": 0.031058967113494873, "learning_rate": 1e-06, "loss": 0.0107, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16128.0, "completions/mean_length": 9108.4453125, "completions/mean_terminated_length": 9051.1572265625, "completions/min_length": 2442.0, "completions/min_terminated_length": 2442.0, "entropy": 0.33958064019680023, "epoch": 0.40544871794871795, "frac_reward_zero_std": 0.6875, "grad_norm": 0.024986546486616135, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 364931101.0, "reward": 0.8485742807388306, "reward_std": 0.06668805330991745, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9796223640441895, "rewards/symbolic_reward_partial_score/std": 0.10862989723682404, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.080409288406372, "sampling/importance_sampling_ratio/min": 0.0008817166090011597, "sampling/sampling_logp_difference/max": 7.033639907836914, "sampling/sampling_logp_difference/mean": 0.13533073663711548, "step": 253 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3421936333179474, "epoch": 0.40705128205128205, "grad_norm": 0.03423618525266647, "learning_rate": 1e-06, "loss": -0.0005, "step": 254 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.33770668506622314, "epoch": 0.40865384615384615, "grad_norm": 0.04051296040415764, "learning_rate": 1e-06, "loss": 0.0231, "step": 255 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3379022479057312, "epoch": 0.41025641025641024, "grad_norm": 0.022254278883337975, "learning_rate": 1e-06, "loss": 0.0037, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16239.0, "completions/mean_length": 9432.91015625, "completions/mean_terminated_length": 9251.8193359375, "completions/min_length": 2405.0, "completions/min_terminated_length": 2405.0, "entropy": 0.33586740493774414, "epoch": 0.41185897435897434, "frac_reward_zero_std": 0.5, "grad_norm": 0.03636210039258003, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 370672031.0, "reward": 0.8245459794998169, "reward_std": 0.13228921592235565, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9717936515808105, "rewards/symbolic_reward_partial_score/std": 0.13187147676944733, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079423189163208, "sampling/importance_sampling_ratio/min": 0.001578564289957285, "sampling/sampling_logp_difference/max": 6.451239585876465, "sampling/sampling_logp_difference/mean": 0.13348346948623657, "step": 257 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.33470618724823, "epoch": 0.41346153846153844, "grad_norm": 0.05403440073132515, "learning_rate": 1e-06, "loss": 0.0094, "step": 258 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3330310583114624, "epoch": 0.4150641025641026, "grad_norm": 0.029213331639766693, "learning_rate": 1e-06, "loss": 0.028, "step": 259 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.33597373962402344, "epoch": 0.4166666666666667, "grad_norm": 0.030459623783826828, "learning_rate": 1e-06, "loss": 0.021, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16261.0, "completions/mean_length": 8883.1484375, "completions/mean_terminated_length": 8868.4697265625, "completions/min_length": 2473.0, "completions/min_terminated_length": 2473.0, "entropy": 0.34810274839401245, "epoch": 0.4182692307692308, "frac_reward_zero_std": 0.6875, "grad_norm": 0.014394080266356468, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 376080491.0, "reward": 0.8571972846984863, "reward_std": 0.06691135466098785, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9868814945220947, "rewards/symbolic_reward_partial_score/std": 0.08386261761188507, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0820012092590332, "sampling/importance_sampling_ratio/min": 0.002452337648719549, "sampling/sampling_logp_difference/max": 6.010713577270508, "sampling/sampling_logp_difference/mean": 0.13821958005428314, "step": 261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3462749570608139, "epoch": 0.4198717948717949, "grad_norm": 0.02984866313636303, "learning_rate": 1e-06, "loss": 0.0182, "step": 262 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34777069091796875, "epoch": 0.421474358974359, "grad_norm": 0.04877055808901787, "learning_rate": 1e-06, "loss": -0.0066, "step": 263 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.35094353556632996, "epoch": 0.4230769230769231, "grad_norm": 0.012895370833575726, "learning_rate": 1e-06, "loss": -0.0002, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16319.0, "completions/mean_length": 9126.1796875, "completions/mean_terminated_length": 9040.119140625, "completions/min_length": 2192.0, "completions/min_terminated_length": 2192.0, "entropy": 0.3465740531682968, "epoch": 0.42467948717948717, "frac_reward_zero_std": 0.65625, "grad_norm": 0.02313750796020031, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 381673559.0, "reward": 0.8556641340255737, "reward_std": 0.06552300602197647, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9856771230697632, "rewards/symbolic_reward_partial_score/std": 0.09215202182531357, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.081518530845642, "sampling/importance_sampling_ratio/min": 0.0010775821283459663, "sampling/sampling_logp_difference/max": 6.833035469055176, "sampling/sampling_logp_difference/mean": 0.13711869716644287, "step": 265 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3460189998149872, "epoch": 0.42628205128205127, "grad_norm": 0.01572212018072605, "learning_rate": 1e-06, "loss": 0.0796, "step": 266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3438139259815216, "epoch": 0.42788461538461536, "grad_norm": 0.03588380664587021, "learning_rate": 1e-06, "loss": 0.0108, "step": 267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3455318212509155, "epoch": 0.42948717948717946, "grad_norm": 0.02190067432820797, "learning_rate": 1e-06, "loss": -0.0591, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16164.0, "completions/mean_length": 8714.4921875, "completions/mean_terminated_length": 8699.4833984375, "completions/min_length": 2861.0, "completions/min_terminated_length": 2861.0, "entropy": 0.3366045355796814, "epoch": 0.4310897435897436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.04956274479627609, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 387124451.0, "reward": 0.8711084127426147, "reward_std": 0.08412916958332062, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9824707508087158, "rewards/symbolic_reward_partial_score/std": 0.11954645812511444, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0790743827819824, "sampling/importance_sampling_ratio/min": 0.0011303027858957648, "sampling/sampling_logp_difference/max": 6.785269737243652, "sampling/sampling_logp_difference/mean": 0.13317176699638367, "step": 269 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.33688215911388397, "epoch": 0.4326923076923077, "grad_norm": 0.028510021045804024, "learning_rate": 1e-06, "loss": -0.0246, "step": 270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.33309443295001984, "epoch": 0.4342948717948718, "grad_norm": 0.025804786011576653, "learning_rate": 1e-06, "loss": 0.0367, "step": 271 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3343093693256378, "epoch": 0.4358974358974359, "grad_norm": 0.020202551037073135, "learning_rate": 1e-06, "loss": -0.004, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15658.0, "completions/max_terminated_length": 15658.0, "completions/mean_length": 8455.501953125, "completions/mean_terminated_length": 8455.501953125, "completions/min_length": 2604.0, "completions/min_terminated_length": 2604.0, "entropy": 0.3380519896745682, "epoch": 0.4375, "frac_reward_zero_std": 0.6875, "grad_norm": 0.01542014442384243, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 392383220.0, "reward": 0.8757373094558716, "reward_std": 0.058174289762973785, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9939941167831421, "rewards/symbolic_reward_partial_score/std": 0.04931728541851044, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0802242755889893, "sampling/importance_sampling_ratio/min": 0.0010891318088397384, "sampling/sampling_logp_difference/max": 6.82237434387207, "sampling/sampling_logp_difference/mean": 0.13519537448883057, "step": 273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3411364108324051, "epoch": 0.4391025641025641, "grad_norm": 0.034905482083559036, "learning_rate": 1e-06, "loss": 0.0287, "step": 274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.33939428627491, "epoch": 0.4407051282051282, "grad_norm": 0.026735356077551842, "learning_rate": 1e-06, "loss": -0.0558, "step": 275 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34197986125946045, "epoch": 0.4423076923076923, "grad_norm": 0.012434117496013641, "learning_rate": 1e-06, "loss": 0.0317, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15186.0, "completions/mean_length": 8064.33984375, "completions/mean_terminated_length": 8048.05859375, "completions/min_length": 2408.0, "completions/min_terminated_length": 2408.0, "entropy": 0.3548656851053238, "epoch": 0.4439102564102564, "frac_reward_zero_std": 0.78125, "grad_norm": 0.012898312881588936, "learning_rate": 1e-06, "loss": -0.0317, "num_tokens": 397432226.0, "reward": 0.8813184499740601, "reward_std": 0.051420196890830994, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9852539300918579, "rewards/symbolic_reward_partial_score/std": 0.11670207232236862, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0825469493865967, "sampling/importance_sampling_ratio/min": 0.0011710706166923046, "sampling/sampling_logp_difference/max": 6.7498369216918945, "sampling/sampling_logp_difference/mean": 0.13951528072357178, "step": 277 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.35443782806396484, "epoch": 0.44551282051282054, "grad_norm": 0.04750717431306839, "learning_rate": 1e-06, "loss": 0.0677, "step": 278 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3521930128335953, "epoch": 0.44711538461538464, "grad_norm": 0.013996710069477558, "learning_rate": 1e-06, "loss": -0.0339, "step": 279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35318708419799805, "epoch": 0.44871794871794873, "grad_norm": 0.038625575602054596, "learning_rate": 1e-06, "loss": 0.0086, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15941.0, "completions/mean_length": 8743.060546875, "completions/mean_terminated_length": 8698.025390625, "completions/min_length": 3508.0, "completions/min_terminated_length": 3508.0, "entropy": 0.34629644453525543, "epoch": 0.45032051282051283, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03841150552034378, "learning_rate": 1e-06, "loss": 0.0368, "num_tokens": 402807201.0, "reward": 0.8585205078125, "reward_std": 0.05953942611813545, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.939453125, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.9847819209098816, "rewards/symbolic_reward_partial_score/std": 0.096100814640522, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0810859203338623, "sampling/importance_sampling_ratio/min": 0.0028462130576372147, "sampling/sampling_logp_difference/max": 5.8617658615112305, "sampling/sampling_logp_difference/mean": 0.13681869208812714, "step": 281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3481019139289856, "epoch": 0.4519230769230769, "grad_norm": 0.018154006451368332, "learning_rate": 1e-06, "loss": -0.0263, "step": 282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.34661518037319183, "epoch": 0.453525641025641, "grad_norm": 0.028883378952741623, "learning_rate": 1e-06, "loss": 0.0019, "step": 283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3459591269493103, "epoch": 0.4551282051282051, "grad_norm": 0.0511566661298275, "learning_rate": 1e-06, "loss": 0.0185, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15589.0, "completions/mean_length": 8230.841796875, "completions/mean_terminated_length": 8214.88671875, "completions/min_length": 3874.0, "completions/min_terminated_length": 3874.0, "entropy": 0.33991706371307373, "epoch": 0.4567307692307692, "frac_reward_zero_std": 0.625, "grad_norm": 0.02823617309331894, "learning_rate": 1e-06, "loss": -0.0129, "num_tokens": 407844784.0, "reward": 0.8700635433197021, "reward_std": 0.07864132523536682, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9828939437866211, "rewards/symbolic_reward_partial_score/std": 0.10988438129425049, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079613208770752, "sampling/importance_sampling_ratio/min": 0.001327555044554174, "sampling/sampling_logp_difference/max": 6.624416351318359, "sampling/sampling_logp_difference/mean": 0.13476338982582092, "step": 285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34054870903491974, "epoch": 0.4583333333333333, "grad_norm": 0.010968566872179508, "learning_rate": 1e-06, "loss": -0.0121, "step": 286 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.33991578221321106, "epoch": 0.4599358974358974, "grad_norm": 0.025836389511823654, "learning_rate": 1e-06, "loss": 0.0229, "step": 287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.340618833899498, "epoch": 0.46153846153846156, "grad_norm": 0.03293606638908386, "learning_rate": 1e-06, "loss": 0.0162, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15688.0, "completions/mean_length": 8631.4296875, "completions/mean_terminated_length": 8570.3857421875, "completions/min_length": 2921.0, "completions/min_terminated_length": 2921.0, "entropy": 0.3468311131000519, "epoch": 0.46314102564102566, "frac_reward_zero_std": 0.75, "grad_norm": 0.05308857932686806, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 413105804.0, "reward": 0.8817285299301147, "reward_std": 0.04743686318397522, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9879231452941895, "rewards/symbolic_reward_partial_score/std": 0.10144013911485672, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0810067653656006, "sampling/importance_sampling_ratio/min": 0.0019312994554638863, "sampling/sampling_logp_difference/max": 6.2495622634887695, "sampling/sampling_logp_difference/mean": 0.13701131939888, "step": 289 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3475600481033325, "epoch": 0.46474358974358976, "grad_norm": 0.037866950035095215, "learning_rate": 1e-06, "loss": 0.0226, "step": 290 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3477329760789871, "epoch": 0.46634615384615385, "grad_norm": 0.03711353987455368, "learning_rate": 1e-06, "loss": -0.0107, "step": 291 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3468507379293442, "epoch": 0.46794871794871795, "grad_norm": 0.04164763540029526, "learning_rate": 1e-06, "loss": 0.0098, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14946.0, "completions/mean_length": 8548.748046875, "completions/mean_terminated_length": 8533.4150390625, "completions/min_length": 2878.0, "completions/min_terminated_length": 2878.0, "entropy": 0.35460542142391205, "epoch": 0.46955128205128205, "frac_reward_zero_std": 0.71875, "grad_norm": 0.02361561357975006, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 418371051.0, "reward": 0.8782764673233032, "reward_std": 0.055502112954854965, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9907389283180237, "rewards/symbolic_reward_partial_score/std": 0.0802433118224144, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0822389125823975, "sampling/importance_sampling_ratio/min": 0.0006191764841787517, "sampling/sampling_logp_difference/max": 7.387120246887207, "sampling/sampling_logp_difference/mean": 0.13936927914619446, "step": 293 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35354921221733093, "epoch": 0.47115384615384615, "grad_norm": 0.010919775813817978, "learning_rate": 1e-06, "loss": -0.0298, "step": 294 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35306140780448914, "epoch": 0.47275641025641024, "grad_norm": 0.031216872856020927, "learning_rate": 1e-06, "loss": 0.035, "step": 295 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3557155877351761, "epoch": 0.47435897435897434, "grad_norm": 0.013406717218458652, "learning_rate": 1e-06, "loss": -0.0099, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15964.0, "completions/mean_length": 8315.3046875, "completions/mean_terminated_length": 8299.5146484375, "completions/min_length": 2951.0, "completions/min_terminated_length": 2951.0, "entropy": 0.3517186939716339, "epoch": 0.47596153846153844, "frac_reward_zero_std": 0.75, "grad_norm": 0.021900570020079613, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 423478775.0, "reward": 0.8844971060752869, "reward_std": 0.046689338982105255, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.991943359375, "rewards/symbolic_reward_partial_score/std": 0.08004472404718399, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0816459655761719, "sampling/importance_sampling_ratio/min": 0.0024761438835412264, "sampling/sampling_logp_difference/max": 6.0010528564453125, "sampling/sampling_logp_difference/mean": 0.13847716152668, "step": 297 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.35239139199256897, "epoch": 0.4775641025641026, "grad_norm": 0.03177941218018532, "learning_rate": 1e-06, "loss": 0.0091, "step": 298 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35044218599796295, "epoch": 0.4791666666666667, "grad_norm": 0.028342947363853455, "learning_rate": 1e-06, "loss": 0.008, "step": 299 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3535783439874649, "epoch": 0.4807692307692308, "grad_norm": 0.03565602749586105, "learning_rate": 1e-06, "loss": -0.0028, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15619.0, "completions/mean_length": 8299.53515625, "completions/mean_terminated_length": 8283.7138671875, "completions/min_length": 3955.0, "completions/min_terminated_length": 3955.0, "entropy": 0.3493725508451462, "epoch": 0.4823717948717949, "frac_reward_zero_std": 0.71875, "grad_norm": 0.029596375301480293, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 428603673.0, "reward": 0.8677734732627869, "reward_std": 0.0615101121366024, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.955078125, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.9830728769302368, "rewards/symbolic_reward_partial_score/std": 0.11326193064451218, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0808006525039673, "sampling/importance_sampling_ratio/min": 0.0013948846608400345, "sampling/sampling_logp_difference/max": 6.574943542480469, "sampling/sampling_logp_difference/mean": 0.1373080462217331, "step": 301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3497709035873413, "epoch": 0.483974358974359, "grad_norm": 0.0381162129342556, "learning_rate": 1e-06, "loss": 0.0187, "step": 302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3505343049764633, "epoch": 0.4855769230769231, "grad_norm": 0.018648389726877213, "learning_rate": 1e-06, "loss": -0.0085, "step": 303 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34972867369651794, "epoch": 0.48717948717948717, "grad_norm": 0.011281575076282024, "learning_rate": 1e-06, "loss": -0.0189, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15856.0, "completions/mean_length": 8226.66796875, "completions/mean_terminated_length": 8210.7041015625, "completions/min_length": 1562.0, "completions/min_terminated_length": 1562.0, "entropy": 0.35703901946544647, "epoch": 0.48878205128205127, "frac_reward_zero_std": 0.6875, "grad_norm": 0.035666901618242264, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 433663935.0, "reward": 0.8768360018730164, "reward_std": 0.06430701911449432, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9859374761581421, "rewards/symbolic_reward_partial_score/std": 0.10897184908390045, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0820643901824951, "sampling/importance_sampling_ratio/min": 0.0015389270847663283, "sampling/sampling_logp_difference/max": 6.476669788360596, "sampling/sampling_logp_difference/mean": 0.1392567753791809, "step": 305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35459552705287933, "epoch": 0.49038461538461536, "grad_norm": 0.040812939405441284, "learning_rate": 1e-06, "loss": 0.0109, "step": 306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35502541065216064, "epoch": 0.49198717948717946, "grad_norm": 0.046345051378011703, "learning_rate": 1e-06, "loss": 0.0124, "step": 307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35453028976917267, "epoch": 0.4935897435897436, "grad_norm": 0.0216151662170887, "learning_rate": 1e-06, "loss": 0.02, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15888.0, "completions/max_terminated_length": 15888.0, "completions/mean_length": 7780.3828125, "completions/mean_terminated_length": 7780.3828125, "completions/min_length": 2444.0, "completions/min_terminated_length": 2444.0, "entropy": 0.35226917266845703, "epoch": 0.4951923076923077, "frac_reward_zero_std": 0.625, "grad_norm": 0.02735666185617447, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 438428995.0, "reward": 0.8719433546066284, "reward_std": 0.08105036616325378, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9846028685569763, "rewards/symbolic_reward_partial_score/std": 0.11089053750038147, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0812032222747803, "sampling/importance_sampling_ratio/min": 0.001538945478387177, "sampling/sampling_logp_difference/max": 6.476657867431641, "sampling/sampling_logp_difference/mean": 0.13829082250595093, "step": 309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.353080153465271, "epoch": 0.4967948717948718, "grad_norm": 0.03591330349445343, "learning_rate": 1e-06, "loss": 0.0109, "step": 310 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.35180215537548065, "epoch": 0.4983974358974359, "grad_norm": 0.034577466547489166, "learning_rate": 1e-06, "loss": 0.0396, "step": 311 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.34946948289871216, "epoch": 0.5, "grad_norm": 0.01364669855684042, "learning_rate": 1e-06, "loss": -0.0211, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15104.0, "completions/max_terminated_length": 15104.0, "completions/mean_length": 7865.982421875, "completions/mean_terminated_length": 7865.982421875, "completions/min_length": 2855.0, "completions/min_terminated_length": 2855.0, "entropy": 0.35234443843364716, "epoch": 0.5016025641025641, "frac_reward_zero_std": 0.65625, "grad_norm": 0.02486881986260414, "learning_rate": 1e-06, "loss": -0.0127, "num_tokens": 443306634.0, "reward": 0.8547022342681885, "reward_std": 0.08743016421794891, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9375, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.974007248878479, "rewards/symbolic_reward_partial_score/std": 0.14314807951450348, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0809471607208252, "sampling/importance_sampling_ratio/min": 0.0009187202085740864, "sampling/sampling_logp_difference/max": 6.992528915405273, "sampling/sampling_logp_difference/mean": 0.1381678283214569, "step": 313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.353774830698967, "epoch": 0.5032051282051282, "grad_norm": 0.02260231412947178, "learning_rate": 1e-06, "loss": 0.0262, "step": 314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35382866859436035, "epoch": 0.5048076923076923, "grad_norm": 0.036160290241241455, "learning_rate": 1e-06, "loss": -0.0069, "step": 315 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3496834635734558, "epoch": 0.5064102564102564, "grad_norm": 0.020312169566750526, "learning_rate": 1e-06, "loss": 0.0113, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14235.0, "completions/mean_length": 7867.984375, "completions/mean_terminated_length": 7851.31884765625, "completions/min_length": 2672.0, "completions/min_terminated_length": 2672.0, "entropy": 0.3520466834306717, "epoch": 0.5080128205128205, "frac_reward_zero_std": 0.71875, "grad_norm": 0.03794070705771446, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 448186466.0, "reward": 0.8595849871635437, "reward_std": 0.06614896655082703, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.9792155027389526, "rewards/symbolic_reward_partial_score/std": 0.12694667279720306, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.080937147140503, "sampling/importance_sampling_ratio/min": 0.0017650576774030924, "sampling/sampling_logp_difference/max": 6.339571952819824, "sampling/sampling_logp_difference/mean": 0.13793881237506866, "step": 317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.34787003695964813, "epoch": 0.5096153846153846, "grad_norm": 0.03488509729504585, "learning_rate": 1e-06, "loss": 0.0068, "step": 318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3510197401046753, "epoch": 0.5112179487179487, "grad_norm": 0.029722966253757477, "learning_rate": 1e-06, "loss": 0.0042, "step": 319 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.35279516875743866, "epoch": 0.5128205128205128, "grad_norm": 0.017475329339504242, "learning_rate": 1e-06, "loss": 0.0023, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14852.0, "completions/max_terminated_length": 14852.0, "completions/mean_length": 7846.564453125, "completions/mean_terminated_length": 7846.564453125, "completions/min_length": 3124.0, "completions/min_terminated_length": 3124.0, "entropy": 0.3422964960336685, "epoch": 0.5144230769230769, "frac_reward_zero_std": 0.71875, "grad_norm": 0.029501963406801224, "learning_rate": 1e-06, "loss": -0.0253, "num_tokens": 453072499.0, "reward": 0.8744287490844727, "reward_std": 0.061712294816970825, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9850748777389526, "rewards/symbolic_reward_partial_score/std": 0.10596644133329391, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078956127166748, "sampling/importance_sampling_ratio/min": 6.88058798914426e-06, "sampling/sampling_logp_difference/max": 11.88680648803711, "sampling/sampling_logp_difference/mean": 0.13481628894805908, "step": 321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3414119780063629, "epoch": 0.5160256410256411, "grad_norm": 0.031455278396606445, "learning_rate": 1e-06, "loss": -0.0154, "step": 322 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3419031500816345, "epoch": 0.5176282051282052, "grad_norm": 0.019468756392598152, "learning_rate": 1e-06, "loss": 0.0171, "step": 323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3411635458469391, "epoch": 0.5192307692307693, "grad_norm": 0.011964436620473862, "learning_rate": 1e-06, "loss": 0.0274, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15979.0, "completions/max_terminated_length": 15979.0, "completions/mean_length": 7799.4296875, "completions/mean_terminated_length": 7799.4296875, "completions/min_length": 3735.0, "completions/min_terminated_length": 3735.0, "entropy": 0.34323766827583313, "epoch": 0.5208333333333334, "frac_reward_zero_std": 0.53125, "grad_norm": 0.047106627374887466, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 457995983.0, "reward": 0.8687793016433716, "reward_std": 0.09881381690502167, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9779622554779053, "rewards/symbolic_reward_partial_score/std": 0.1316920816898346, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079183578491211, "sampling/importance_sampling_ratio/min": 0.0027032888028770685, "sampling/sampling_logp_difference/max": 5.913286209106445, "sampling/sampling_logp_difference/mean": 0.13572083413600922, "step": 325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3419703096151352, "epoch": 0.5224358974358975, "grad_norm": 0.03265855461359024, "learning_rate": 1e-06, "loss": 0.0004, "step": 326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.34272991120815277, "epoch": 0.5240384615384616, "grad_norm": 0.012581798247992992, "learning_rate": 1e-06, "loss": -0.0431, "step": 327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3413187265396118, "epoch": 0.5256410256410257, "grad_norm": 0.03506917878985405, "learning_rate": 1e-06, "loss": 0.0227, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12545.0, "completions/max_terminated_length": 12545.0, "completions/mean_length": 7267.603515625, "completions/mean_terminated_length": 7267.603515625, "completions/min_length": 2376.0, "completions/min_terminated_length": 2376.0, "entropy": 0.3425762802362442, "epoch": 0.5272435897435898, "frac_reward_zero_std": 0.875, "grad_norm": 0.005494450684636831, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 462559172.0, "reward": 0.8919922113418579, "reward_std": 0.027988089248538017, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9928385019302368, "rewards/symbolic_reward_partial_score/std": 0.07976873219013214, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796170234680176, "sampling/importance_sampling_ratio/min": 0.0032112544868141413, "sampling/sampling_logp_difference/max": 5.741093635559082, "sampling/sampling_logp_difference/mean": 0.1358506977558136, "step": 329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.338531956076622, "epoch": 0.5288461538461539, "grad_norm": 0.003034857101738453, "learning_rate": 1e-06, "loss": 0.0093, "step": 330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3404413312673569, "epoch": 0.530448717948718, "grad_norm": 0.005156984087079763, "learning_rate": 1e-06, "loss": -0.0017, "step": 331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3447296917438507, "epoch": 0.532051282051282, "grad_norm": 0.005420037545263767, "learning_rate": 1e-06, "loss": -0.0038, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15153.0, "completions/max_terminated_length": 15153.0, "completions/mean_length": 8001.07421875, "completions/mean_terminated_length": 8001.07421875, "completions/min_length": 2949.0, "completions/min_terminated_length": 2949.0, "entropy": 0.3399716317653656, "epoch": 0.5336538461538461, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03587412089109421, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 467489130.0, "reward": 0.8555078506469727, "reward_std": 0.08913202583789825, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.9649739265441895, "rewards/symbolic_reward_partial_score/std": 0.17589768767356873, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0791929960250854, "sampling/importance_sampling_ratio/min": 0.002150619635358453, "sampling/sampling_logp_difference/max": 6.141999244689941, "sampling/sampling_logp_difference/mean": 0.1354849636554718, "step": 333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3407832086086273, "epoch": 0.5352564102564102, "grad_norm": 0.02745853364467621, "learning_rate": 1e-06, "loss": -0.0044, "step": 334 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3397674411535263, "epoch": 0.5368589743589743, "grad_norm": 0.031612735241651535, "learning_rate": 1e-06, "loss": 0.0186, "step": 335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34108591079711914, "epoch": 0.5384615384615384, "grad_norm": 0.022471971809864044, "learning_rate": 1e-06, "loss": -0.0298, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16055.0, "completions/max_terminated_length": 16055.0, "completions/mean_length": 7275.859375, "completions/mean_terminated_length": 7275.859375, "completions/min_length": 2403.0, "completions/min_terminated_length": 2403.0, "entropy": 0.34444108605384827, "epoch": 0.5400641025641025, "frac_reward_zero_std": 0.75, "grad_norm": 0.011129658669233322, "learning_rate": 1e-06, "loss": -0.0411, "num_tokens": 472043010.0, "reward": 0.8753613233566284, "reward_std": 0.06425656378269196, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9803711175918579, "rewards/symbolic_reward_partial_score/std": 0.13306567072868347, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0804669857025146, "sampling/importance_sampling_ratio/min": 0.0026080512907356024, "sampling/sampling_logp_difference/max": 5.949151992797852, "sampling/sampling_logp_difference/mean": 0.13730235397815704, "step": 337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3436679393053055, "epoch": 0.5416666666666666, "grad_norm": 0.030403705313801765, "learning_rate": 1e-06, "loss": 0.0169, "step": 338 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.3479660898447037, "epoch": 0.5432692307692307, "grad_norm": 0.028728436678647995, "learning_rate": 1e-06, "loss": 0.0166, "step": 339 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.34490153193473816, "epoch": 0.5448717948717948, "grad_norm": 0.019514864310622215, "learning_rate": 1e-06, "loss": 0.0333, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14709.0, "completions/max_terminated_length": 14709.0, "completions/mean_length": 7591.978515625, "completions/mean_terminated_length": 7591.978515625, "completions/min_length": 2559.0, "completions/min_terminated_length": 2559.0, "entropy": 0.32192863523960114, "epoch": 0.5464743589743589, "frac_reward_zero_std": 0.65625, "grad_norm": 0.034874796867370605, "learning_rate": 1e-06, "loss": -0.0277, "num_tokens": 476821191.0, "reward": 0.8587450981140137, "reward_std": 0.09320931881666183, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9640462398529053, "rewards/symbolic_reward_partial_score/std": 0.18081288039684296, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076791524887085, "sampling/importance_sampling_ratio/min": 7.61121918912977e-05, "sampling/sampling_logp_difference/max": 9.483302116394043, "sampling/sampling_logp_difference/mean": 0.13102132081985474, "step": 341 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.32630935311317444, "epoch": 0.5480769230769231, "grad_norm": 0.026049602776765823, "learning_rate": 1e-06, "loss": 0.0041, "step": 342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.329432412981987, "epoch": 0.5496794871794872, "grad_norm": 0.04313475638628006, "learning_rate": 1e-06, "loss": 0.0112, "step": 343 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.32661859691143036, "epoch": 0.5512820512820513, "grad_norm": 0.017984000965952873, "learning_rate": 1e-06, "loss": 0.0177, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13244.0, "completions/max_terminated_length": 13244.0, "completions/mean_length": 7087.556640625, "completions/mean_terminated_length": 7087.556640625, "completions/min_length": 1901.0, "completions/min_terminated_length": 1901.0, "entropy": 0.3469938188791275, "epoch": 0.5528846153846154, "frac_reward_zero_std": 0.875, "grad_norm": 0.018712131306529045, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 481245764.0, "reward": 0.8896288871765137, "reward_std": 0.027910862118005753, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9966796636581421, "rewards/symbolic_reward_partial_score/std": 0.04575762152671814, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0802421569824219, "sampling/importance_sampling_ratio/min": 0.0005523766740225255, "sampling/sampling_logp_difference/max": 7.501280307769775, "sampling/sampling_logp_difference/mean": 0.13726381957530975, "step": 345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3463977128267288, "epoch": 0.5544871794871795, "grad_norm": 0.015085569582879543, "learning_rate": 1e-06, "loss": -0.0153, "step": 346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3441087454557419, "epoch": 0.5560897435897436, "grad_norm": 0.01735646091401577, "learning_rate": 1e-06, "loss": 0.003, "step": 347 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3465464264154434, "epoch": 0.5576923076923077, "grad_norm": 0.008039072155952454, "learning_rate": 1e-06, "loss": 0.0079, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12953.0, "completions/max_terminated_length": 12953.0, "completions/mean_length": 7303.236328125, "completions/mean_terminated_length": 7303.236328125, "completions/min_length": 1640.0, "completions/min_terminated_length": 1640.0, "entropy": 0.33060184121131897, "epoch": 0.5592948717948718, "frac_reward_zero_std": 0.65625, "grad_norm": 0.04009666666388512, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 485834813.0, "reward": 0.8736133575439453, "reward_std": 0.07724933326244354, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9823567867279053, "rewards/symbolic_reward_partial_score/std": 0.12479215115308762, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774190425872803, "sampling/importance_sampling_ratio/min": 0.0005185269983485341, "sampling/sampling_logp_difference/max": 7.564518451690674, "sampling/sampling_logp_difference/mean": 0.13230668008327484, "step": 349 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3295121490955353, "epoch": 0.5608974358974359, "grad_norm": 0.03869420662522316, "learning_rate": 1e-06, "loss": 0.0468, "step": 350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3295880854129791, "epoch": 0.5625, "grad_norm": 0.026594726368784904, "learning_rate": 1e-06, "loss": -0.0518, "step": 351 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3306301534175873, "epoch": 0.5641025641025641, "grad_norm": 0.012518891133368015, "learning_rate": 1e-06, "loss": -0.0214, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13286.0, "completions/max_terminated_length": 13286.0, "completions/mean_length": 7497.123046875, "completions/mean_terminated_length": 7497.123046875, "completions/min_length": 1852.0, "completions/min_terminated_length": 1852.0, "entropy": 0.3239324390888214, "epoch": 0.5657051282051282, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03741198033094406, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 490596460.0, "reward": 0.8816300630569458, "reward_std": 0.05658324807882309, "rewards/progression_diversity/mean": -8.933329081628472e-05, "rewards/progression_diversity/std": 0.0020213813986629248, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9934570789337158, "rewards/symbolic_reward_partial_score/std": 0.06523283571004868, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075650930404663, "sampling/importance_sampling_ratio/min": 0.0018057399429380894, "sampling/sampling_logp_difference/max": 6.316784858703613, "sampling/sampling_logp_difference/mean": 0.12974762916564941, "step": 353 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3255824148654938, "epoch": 0.5673076923076923, "grad_norm": 0.036682289093732834, "learning_rate": 1e-06, "loss": -0.0039, "step": 354 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3206946551799774, "epoch": 0.5689102564102564, "grad_norm": 0.022839486598968506, "learning_rate": 1e-06, "loss": 0.0127, "step": 355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.32720427215099335, "epoch": 0.5705128205128205, "grad_norm": 0.008449007757008076, "learning_rate": 1e-06, "loss": -0.0116, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13980.0, "completions/max_terminated_length": 13980.0, "completions/mean_length": 7222.951171875, "completions/mean_terminated_length": 7222.951171875, "completions/min_length": 2306.0, "completions/min_terminated_length": 2306.0, "entropy": 0.333197221159935, "epoch": 0.5721153846153846, "frac_reward_zero_std": 0.625, "grad_norm": 0.044194769114255905, "learning_rate": 1e-06, "loss": -0.0328, "num_tokens": 495195011.0, "reward": 0.8622949719429016, "reward_std": 0.07508472353219986, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.943359375, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.987597644329071, "rewards/symbolic_reward_partial_score/std": 0.0845394879579544, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780634880065918, "sampling/importance_sampling_ratio/min": 0.0019118906930088997, "sampling/sampling_logp_difference/max": 6.259662628173828, "sampling/sampling_logp_difference/mean": 0.13341926038265228, "step": 357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.33615103363990784, "epoch": 0.5737179487179487, "grad_norm": 0.0372152104973793, "learning_rate": 1e-06, "loss": 0.0259, "step": 358 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.33543069660663605, "epoch": 0.5753205128205128, "grad_norm": 0.015658585354685783, "learning_rate": 1e-06, "loss": 0.0127, "step": 359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.332485556602478, "epoch": 0.5769230769230769, "grad_norm": 0.016284069046378136, "learning_rate": 1e-06, "loss": -0.0161, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14887.0, "completions/max_terminated_length": 14887.0, "completions/mean_length": 7543.685546875, "completions/mean_terminated_length": 7543.685546875, "completions/min_length": 2924.0, "completions/min_terminated_length": 2924.0, "entropy": 0.3329332023859024, "epoch": 0.5785256410256411, "frac_reward_zero_std": 0.71875, "grad_norm": 0.022686509415507317, "learning_rate": 1e-06, "loss": -0.0063, "num_tokens": 499919474.0, "reward": 0.870678722858429, "reward_std": 0.0752020850777626, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9764811396598816, "rewards/symbolic_reward_partial_score/std": 0.14590464532375336, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771108865737915, "sampling/importance_sampling_ratio/min": 0.00014151324285194278, "sampling/sampling_logp_difference/max": 8.863117218017578, "sampling/sampling_logp_difference/mean": 0.13242536783218384, "step": 361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33253198862075806, "epoch": 0.5801282051282052, "grad_norm": 0.010025088675320148, "learning_rate": 1e-06, "loss": -0.036, "step": 362 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33328503370285034, "epoch": 0.5817307692307693, "grad_norm": 0.03455505147576332, "learning_rate": 1e-06, "loss": 0.0002, "step": 363 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.33706234395504, "epoch": 0.5833333333333334, "grad_norm": 0.015231803059577942, "learning_rate": 1e-06, "loss": 0.0334, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13322.0, "completions/max_terminated_length": 13322.0, "completions/mean_length": 7328.0625, "completions/mean_terminated_length": 7328.0625, "completions/min_length": 2416.0, "completions/min_terminated_length": 2416.0, "entropy": 0.3319256901741028, "epoch": 0.5849358974358975, "frac_reward_zero_std": 0.875, "grad_norm": 0.020963091403245926, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 504454178.0, "reward": 0.8920166492462158, "reward_std": 0.027890432626008987, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.992919921875, "rewards/symbolic_reward_partial_score/std": 0.07962682098150253, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766181945800781, "sampling/importance_sampling_ratio/min": 4.0527328337702784e-07, "sampling/sampling_logp_difference/max": 14.718704223632812, "sampling/sampling_logp_difference/mean": 0.13169197738170624, "step": 365 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3278692811727524, "epoch": 0.5865384615384616, "grad_norm": 0.014562098309397697, "learning_rate": 1e-06, "loss": 0.0053, "step": 366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.33057862520217896, "epoch": 0.5881410256410257, "grad_norm": 0.00395246734842658, "learning_rate": 1e-06, "loss": -0.0172, "step": 367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3307371437549591, "epoch": 0.5897435897435898, "grad_norm": 0.00415209261700511, "learning_rate": 1e-06, "loss": -0.0035, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15281.0, "completions/mean_length": 8133.962890625, "completions/mean_terminated_length": 8117.81787109375, "completions/min_length": 3105.0, "completions/min_terminated_length": 3105.0, "entropy": 0.3238765001296997, "epoch": 0.5913461538461539, "frac_reward_zero_std": 0.65625, "grad_norm": 0.029979927465319633, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 509532927.0, "reward": 0.8768750429153442, "reward_std": 0.0702906921505928, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9860677123069763, "rewards/symbolic_reward_partial_score/std": 0.10367308557033539, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0753402709960938, "sampling/importance_sampling_ratio/min": 4.647930836654268e-05, "sampling/sampling_logp_difference/max": 9.976503372192383, "sampling/sampling_logp_difference/mean": 0.12939895689487457, "step": 369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3259133845567703, "epoch": 0.592948717948718, "grad_norm": 0.03497137129306793, "learning_rate": 1e-06, "loss": -0.0033, "step": 370 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.32264500856399536, "epoch": 0.594551282051282, "grad_norm": 0.010510552674531937, "learning_rate": 1e-06, "loss": -0.0245, "step": 371 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3223158270120621, "epoch": 0.5961538461538461, "grad_norm": 0.04193766042590141, "learning_rate": 1e-06, "loss": 0.0436, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13321.0, "completions/mean_length": 7861.279296875, "completions/mean_terminated_length": 7844.6005859375, "completions/min_length": 2852.0, "completions/min_terminated_length": 2852.0, "entropy": 0.33821770548820496, "epoch": 0.5977564102564102, "frac_reward_zero_std": 0.8125, "grad_norm": 0.006328184623271227, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 514360078.0, "reward": 0.8896973133087158, "reward_std": 0.04121094197034836, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.98974609375, "rewards/symbolic_reward_partial_score/std": 0.09900352358818054, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773589611053467, "sampling/importance_sampling_ratio/min": 0.0021734030451625586, "sampling/sampling_logp_difference/max": 6.131461143493652, "sampling/sampling_logp_difference/mean": 0.1328354924917221, "step": 373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.33533041179180145, "epoch": 0.5993589743589743, "grad_norm": 0.00386610790155828, "learning_rate": 1e-06, "loss": 0.0224, "step": 374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3337092846632004, "epoch": 0.6009615384615384, "grad_norm": 0.005762523040175438, "learning_rate": 1e-06, "loss": -0.0269, "step": 375 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33220188319683075, "epoch": 0.6025641025641025, "grad_norm": 0.02871812880039215, "learning_rate": 1e-06, "loss": 0.0281, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14615.0, "completions/max_terminated_length": 14615.0, "completions/mean_length": 7813.74609375, "completions/mean_terminated_length": 7813.74609375, "completions/min_length": 2636.0, "completions/min_terminated_length": 2636.0, "entropy": 0.3266758322715759, "epoch": 0.6041666666666666, "frac_reward_zero_std": 0.78125, "grad_norm": 0.007526485249400139, "learning_rate": 1e-06, "loss": -0.0347, "num_tokens": 519285372.0, "reward": 0.887402355670929, "reward_std": 0.04390469565987587, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9931640625, "rewards/symbolic_reward_partial_score/std": 0.0685238167643547, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0762808322906494, "sampling/importance_sampling_ratio/min": 0.0014965370064601302, "sampling/sampling_logp_difference/max": 6.50460147857666, "sampling/sampling_logp_difference/mean": 0.13083325326442719, "step": 377 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3292153924703598, "epoch": 0.6057692307692307, "grad_norm": 0.006894251331686974, "learning_rate": 1e-06, "loss": -0.0203, "step": 378 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3328756242990494, "epoch": 0.6073717948717948, "grad_norm": 0.05631379410624504, "learning_rate": 1e-06, "loss": 0.0699, "step": 379 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.33030255138874054, "epoch": 0.6089743589743589, "grad_norm": 0.029384924098849297, "learning_rate": 1e-06, "loss": -0.0124, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14604.0, "completions/max_terminated_length": 14604.0, "completions/mean_length": 8022.71484375, "completions/mean_terminated_length": 8022.71484375, "completions/min_length": 3237.0, "completions/min_terminated_length": 3237.0, "entropy": 0.332174688577652, "epoch": 0.6105769230769231, "frac_reward_zero_std": 0.84375, "grad_norm": 0.031777773052453995, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 524217866.0, "reward": 0.885449230670929, "reward_std": 0.033893831074237823, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9944661855697632, "rewards/symbolic_reward_partial_score/std": 0.06361044943332672, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768940448760986, "sampling/importance_sampling_ratio/min": 0.0011646309867501259, "sampling/sampling_logp_difference/max": 6.7553510665893555, "sampling/sampling_logp_difference/mean": 0.1318342089653015, "step": 381 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33310163021087646, "epoch": 0.6121794871794872, "grad_norm": 0.009926833212375641, "learning_rate": 1e-06, "loss": -0.0134, "step": 382 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3353753387928009, "epoch": 0.6137820512820513, "grad_norm": 0.012040833942592144, "learning_rate": 1e-06, "loss": -0.0263, "step": 383 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.33477625250816345, "epoch": 0.6153846153846154, "grad_norm": 0.012466357089579105, "learning_rate": 1e-06, "loss": 0.0128, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15163.0, "completions/max_terminated_length": 15163.0, "completions/mean_length": 7643.994140625, "completions/mean_terminated_length": 7643.994140625, "completions/min_length": 2423.0, "completions/min_terminated_length": 2423.0, "entropy": 0.34519071877002716, "epoch": 0.6169871794871795, "frac_reward_zero_std": 0.90625, "grad_norm": 0.007148720324039459, "learning_rate": 1e-06, "loss": -0.0192, "num_tokens": 528899975.0, "reward": 0.8918750286102295, "reward_std": 0.0212983638048172, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9963542222976685, "rewards/symbolic_reward_partial_score/std": 0.049014534801244736, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788915157318115, "sampling/importance_sampling_ratio/min": 0.0002961345307994634, "sampling/sampling_logp_difference/max": 8.124696731567383, "sampling/sampling_logp_difference/mean": 0.13547851145267487, "step": 385 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3407631367444992, "epoch": 0.6185897435897436, "grad_norm": 0.033643610775470734, "learning_rate": 1e-06, "loss": 0.0207, "step": 386 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3453748822212219, "epoch": 0.6201923076923077, "grad_norm": 0.028541069477796555, "learning_rate": 1e-06, "loss": 0.0086, "step": 387 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3432910591363907, "epoch": 0.6217948717948718, "grad_norm": 0.012735442258417606, "learning_rate": 1e-06, "loss": -0.0093, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14118.0, "completions/max_terminated_length": 14118.0, "completions/mean_length": 8072.060546875, "completions/mean_terminated_length": 8072.060546875, "completions/min_length": 2273.0, "completions/min_terminated_length": 2273.0, "entropy": 0.33578476309776306, "epoch": 0.6233974358974359, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004418730735778809, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 533880966.0, "reward": 0.8963379263877869, "reward_std": 0.0146484375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782651901245117, "sampling/importance_sampling_ratio/min": 0.0019372059032320976, "sampling/sampling_logp_difference/max": 6.246508598327637, "sampling/sampling_logp_difference/mean": 0.13377907872200012, "step": 389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3380652666091919, "epoch": 0.625, "grad_norm": 0.004354769829660654, "learning_rate": 1e-06, "loss": -0.0145, "step": 390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33700278401374817, "epoch": 0.6266025641025641, "grad_norm": 0.037305738776922226, "learning_rate": 1e-06, "loss": 0.0218, "step": 391 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3386920690536499, "epoch": 0.6282051282051282, "grad_norm": 0.004598728846758604, "learning_rate": 1e-06, "loss": 0.0025, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15080.0, "completions/max_terminated_length": 15080.0, "completions/mean_length": 8235.884765625, "completions/mean_terminated_length": 8235.884765625, "completions/min_length": 2229.0, "completions/min_terminated_length": 2229.0, "entropy": 0.33072568476200104, "epoch": 0.6298076923076923, "frac_reward_zero_std": 0.71875, "grad_norm": 0.02775965817272663, "learning_rate": 1e-06, "loss": -0.0314, "num_tokens": 539029355.0, "reward": 0.8783007264137268, "reward_std": 0.05748133361339569, "rewards/progression_diversity/mean": -1.044711098074913e-05, "rewards/progression_diversity/std": 0.0002363911335123703, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9901692867279053, "rewards/symbolic_reward_partial_score/std": 0.07673969119787216, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077056646347046, "sampling/importance_sampling_ratio/min": 0.001056751934811473, "sampling/sampling_logp_difference/max": 6.852555274963379, "sampling/sampling_logp_difference/mean": 0.13160306215286255, "step": 393 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.33172135055065155, "epoch": 0.6314102564102564, "grad_norm": 0.03995116427540779, "learning_rate": 1e-06, "loss": 0.0466, "step": 394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3319375663995743, "epoch": 0.6330128205128205, "grad_norm": 0.03385557979345322, "learning_rate": 1e-06, "loss": 0.019, "step": 395 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.329319566488266, "epoch": 0.6346153846153846, "grad_norm": 0.022209029644727707, "learning_rate": 1e-06, "loss": -0.0185, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15288.0, "completions/max_terminated_length": 15288.0, "completions/mean_length": 8061.54296875, "completions/mean_terminated_length": 8061.54296875, "completions/min_length": 2868.0, "completions/min_terminated_length": 2868.0, "entropy": 0.3458179533481598, "epoch": 0.6362179487179487, "frac_reward_zero_std": 0.875, "grad_norm": 0.02973409928381443, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 543944337.0, "reward": 0.8913769721984863, "reward_std": 0.02456841617822647, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9986002445220947, "rewards/symbolic_reward_partial_score/std": 0.012323745526373386, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0800797939300537, "sampling/importance_sampling_ratio/min": 0.002433090703561902, "sampling/sampling_logp_difference/max": 6.018592834472656, "sampling/sampling_logp_difference/mean": 0.1366550773382187, "step": 397 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3451702296733856, "epoch": 0.6378205128205128, "grad_norm": 0.015311814844608307, "learning_rate": 1e-06, "loss": -0.0002, "step": 398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35143060982227325, "epoch": 0.6394230769230769, "grad_norm": 0.007167026866227388, "learning_rate": 1e-06, "loss": 0.004, "step": 399 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3457927256822586, "epoch": 0.6410256410256411, "grad_norm": 0.006636085454374552, "learning_rate": 1e-06, "loss": -0.0084, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14824.0, "completions/max_terminated_length": 14824.0, "completions/mean_length": 8275.130859375, "completions/mean_terminated_length": 8275.130859375, "completions/min_length": 3423.0, "completions/min_terminated_length": 3423.0, "entropy": 0.3534078747034073, "epoch": 0.6426282051282052, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0068145752884447575, "learning_rate": 1e-06, "loss": -0.0208, "num_tokens": 549038356.0, "reward": 0.8936426639556885, "reward_std": 0.01865454763174057, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.998339831829071, "rewards/symbolic_reward_partial_score/std": 0.023384064435958862, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0808277130126953, "sampling/importance_sampling_ratio/min": 0.0013517005136236548, "sampling/sampling_logp_difference/max": 6.606391906738281, "sampling/sampling_logp_difference/mean": 0.137827068567276, "step": 401 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3464537560939789, "epoch": 0.6442307692307693, "grad_norm": 0.006630260962992907, "learning_rate": 1e-06, "loss": -0.0198, "step": 402 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3510500192642212, "epoch": 0.6458333333333334, "grad_norm": 0.006664854008704424, "learning_rate": 1e-06, "loss": 0.0088, "step": 403 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35208629071712494, "epoch": 0.6474358974358975, "grad_norm": 0.02022351324558258, "learning_rate": 1e-06, "loss": 0.0198, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15815.0, "completions/max_terminated_length": 15815.0, "completions/mean_length": 8582.462890625, "completions/mean_terminated_length": 8582.462890625, "completions/min_length": 3537.0, "completions/min_terminated_length": 3537.0, "entropy": 0.3529227524995804, "epoch": 0.6490384615384616, "frac_reward_zero_std": 0.875, "grad_norm": 0.029474791139364243, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 554291073.0, "reward": 0.8908447623252869, "reward_std": 0.026672566309571266, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.996826171875, "rewards/symbolic_reward_partial_score/std": 0.045555230230093, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0808701515197754, "sampling/importance_sampling_ratio/min": 0.002707641338929534, "sampling/sampling_logp_difference/max": 5.911677360534668, "sampling/sampling_logp_difference/mean": 0.13789406418800354, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3472251296043396, "epoch": 0.6506410256410257, "grad_norm": 0.0134069062769413, "learning_rate": 1e-06, "loss": -0.0308, "step": 406 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3502205014228821, "epoch": 0.6522435897435898, "grad_norm": 0.007101547438651323, "learning_rate": 1e-06, "loss": 0.027, "step": 407 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3522060215473175, "epoch": 0.6538461538461539, "grad_norm": 0.0266169011592865, "learning_rate": 1e-06, "loss": 0.005, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15488.0, "completions/mean_length": 9075.255859375, "completions/mean_terminated_length": 9060.953125, "completions/min_length": 5026.0, "completions/min_terminated_length": 5026.0, "entropy": 0.3400547206401825, "epoch": 0.655448717948718, "frac_reward_zero_std": 0.8125, "grad_norm": 0.008739027194678783, "learning_rate": 1e-06, "loss": -0.0382, "num_tokens": 559903588.0, "reward": 0.8877148628234863, "reward_std": 0.03960757330060005, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9942057728767395, "rewards/symbolic_reward_partial_score/std": 0.06572275608778, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0781176090240479, "sampling/importance_sampling_ratio/min": 0.0015160402981564403, "sampling/sampling_logp_difference/max": 6.4916534423828125, "sampling/sampling_logp_difference/mean": 0.13297772407531738, "step": 409 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3377264440059662, "epoch": 0.657051282051282, "grad_norm": 0.024263739585876465, "learning_rate": 1e-06, "loss": 0.0125, "step": 410 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33608396351337433, "epoch": 0.6586538461538461, "grad_norm": 0.03586423769593239, "learning_rate": 1e-06, "loss": 0.0638, "step": 411 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.33477842807769775, "epoch": 0.6602564102564102, "grad_norm": 0.024101443588733673, "learning_rate": 1e-06, "loss": -0.0188, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15585.0, "completions/mean_length": 8471.755859375, "completions/mean_terminated_length": 8440.7275390625, "completions/min_length": 3034.0, "completions/min_terminated_length": 3034.0, "entropy": 0.35421672463417053, "epoch": 0.6618589743589743, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03559446707367897, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 565094967.0, "reward": 0.8892334699630737, "reward_std": 0.0323222316801548, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9966634511947632, "rewards/symbolic_reward_partial_score/std": 0.045737121254205704, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.082103967666626, "sampling/importance_sampling_ratio/min": 6.664723969151964e-06, "sampling/sampling_logp_difference/max": 11.918682098388672, "sampling/sampling_logp_difference/mean": 0.13944613933563232, "step": 413 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35572129487991333, "epoch": 0.6634615384615384, "grad_norm": 0.016064023599028587, "learning_rate": 1e-06, "loss": 0.0156, "step": 414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3548201322555542, "epoch": 0.6650641025641025, "grad_norm": 0.022146787494421005, "learning_rate": 1e-06, "loss": -0.0125, "step": 415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3592114597558975, "epoch": 0.6666666666666666, "grad_norm": 0.03847922012209892, "learning_rate": 1e-06, "loss": 0.0232, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14720.0, "completions/mean_length": 8116.115234375, "completions/mean_terminated_length": 8099.935546875, "completions/min_length": 2122.0, "completions/min_terminated_length": 2122.0, "entropy": 0.3613823801279068, "epoch": 0.6682692307692307, "frac_reward_zero_std": 0.75, "grad_norm": 0.015714695677161217, "learning_rate": 1e-06, "loss": -0.0276, "num_tokens": 570118818.0, "reward": 0.8804053068161011, "reward_std": 0.05617455393075943, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9893717765808105, "rewards/symbolic_reward_partial_score/std": 0.09099794924259186, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0830599069595337, "sampling/importance_sampling_ratio/min": 1.3514282727555837e-05, "sampling/sampling_logp_difference/max": 11.211763381958008, "sampling/sampling_logp_difference/mean": 0.14126574993133545, "step": 417 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3637978732585907, "epoch": 0.6698717948717948, "grad_norm": 0.030573366209864616, "learning_rate": 1e-06, "loss": -0.0046, "step": 418 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.36206208169460297, "epoch": 0.6714743589743589, "grad_norm": 0.02386789582669735, "learning_rate": 1e-06, "loss": 0.0513, "step": 419 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.36033014953136444, "epoch": 0.6730769230769231, "grad_norm": 0.009966620244085789, "learning_rate": 1e-06, "loss": -0.024, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16093.0, "completions/mean_length": 8408.451171875, "completions/mean_terminated_length": 8377.1748046875, "completions/min_length": 3048.0, "completions/min_terminated_length": 3048.0, "entropy": 0.34985409677028656, "epoch": 0.6746794871794872, "frac_reward_zero_std": 0.65625, "grad_norm": 0.041775792837142944, "learning_rate": 1e-06, "loss": 0.0666, "num_tokens": 575346489.0, "reward": 0.8713477253913879, "reward_std": 0.06763733923435211, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.95703125, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.9917317628860474, "rewards/symbolic_reward_partial_score/std": 0.0664297491312027, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0804332494735718, "sampling/importance_sampling_ratio/min": 1.763604373650196e-08, "sampling/sampling_logp_difference/max": 17.853321075439453, "sampling/sampling_logp_difference/mean": 0.13745513558387756, "step": 421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3511565178632736, "epoch": 0.6762820512820513, "grad_norm": 0.029272131621837616, "learning_rate": 1e-06, "loss": -0.0419, "step": 422 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3484925478696823, "epoch": 0.6778846153846154, "grad_norm": 0.04266318306326866, "learning_rate": 1e-06, "loss": 0.0154, "step": 423 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.35260292887687683, "epoch": 0.6794871794871795, "grad_norm": 0.026875099167227745, "learning_rate": 1e-06, "loss": -0.0066, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16085.0, "completions/max_terminated_length": 16085.0, "completions/mean_length": 8416.912109375, "completions/mean_terminated_length": 8416.912109375, "completions/min_length": 3243.0, "completions/min_terminated_length": 3243.0, "entropy": 0.35831810534000397, "epoch": 0.6810897435897436, "frac_reward_zero_std": 0.875, "grad_norm": 0.028424283489584923, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 580492796.0, "reward": 0.8927881121635437, "reward_std": 0.025222305208444595, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9954915642738342, "rewards/symbolic_reward_partial_score/std": 0.06290679425001144, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0821194648742676, "sampling/importance_sampling_ratio/min": 0.00029070241726003587, "sampling/sampling_logp_difference/max": 8.143210411071777, "sampling/sampling_logp_difference/mean": 0.13962535560131073, "step": 425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35801512002944946, "epoch": 0.6826923076923077, "grad_norm": 0.005722351837903261, "learning_rate": 1e-06, "loss": 0.0113, "step": 426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35911840200424194, "epoch": 0.6842948717948718, "grad_norm": 0.006318832281976938, "learning_rate": 1e-06, "loss": -0.0081, "step": 427 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.354499876499176, "epoch": 0.6858974358974359, "grad_norm": 0.006353972014039755, "learning_rate": 1e-06, "loss": 0.0009, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13891.0, "completions/mean_length": 8136.873046875, "completions/mean_terminated_length": 8120.73388671875, "completions/min_length": 3079.0, "completions/min_terminated_length": 3079.0, "entropy": 0.3637176752090454, "epoch": 0.6875, "frac_reward_zero_std": 0.84375, "grad_norm": 0.03145010769367218, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 585527947.0, "reward": 0.8924024105072021, "reward_std": 0.027296192944049835, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9987630844116211, "rewards/symbolic_reward_partial_score/std": 0.012274928390979767, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0836389064788818, "sampling/importance_sampling_ratio/min": 0.0015040416037663817, "sampling/sampling_logp_difference/max": 6.499599456787109, "sampling/sampling_logp_difference/mean": 0.14229172468185425, "step": 429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3629887253046036, "epoch": 0.6891025641025641, "grad_norm": 0.030462458729743958, "learning_rate": 1e-06, "loss": 0.0206, "step": 430 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.36127839982509613, "epoch": 0.6907051282051282, "grad_norm": 0.006768929772078991, "learning_rate": 1e-06, "loss": 0.0063, "step": 431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36501798033714294, "epoch": 0.6923076923076923, "grad_norm": 0.00611158600077033, "learning_rate": 1e-06, "loss": -0.0282, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15266.0, "completions/max_terminated_length": 15266.0, "completions/mean_length": 8200.052734375, "completions/mean_terminated_length": 8200.052734375, "completions/min_length": 3573.0, "completions/min_terminated_length": 3573.0, "entropy": 0.3631163239479065, "epoch": 0.6939102564102564, "frac_reward_zero_std": 0.875, "grad_norm": 0.025189094245433807, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 590550646.0, "reward": 0.893994152545929, "reward_std": 0.02402343973517418, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99560546875, "rewards/symbolic_reward_partial_score/std": 0.06294834613800049, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.08342444896698, "sampling/importance_sampling_ratio/min": 0.0021930765360593796, "sampling/sampling_logp_difference/max": 6.12244987487793, "sampling/sampling_logp_difference/mean": 0.14241476356983185, "step": 433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36200442910194397, "epoch": 0.6955128205128205, "grad_norm": 0.030085749924182892, "learning_rate": 1e-06, "loss": 0.0222, "step": 434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36389629542827606, "epoch": 0.6971153846153846, "grad_norm": 0.004491681698709726, "learning_rate": 1e-06, "loss": 0.0033, "step": 435 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3608408570289612, "epoch": 0.6987179487179487, "grad_norm": 0.00477554788812995, "learning_rate": 1e-06, "loss": -0.0171, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15140.0, "completions/mean_length": 8503.419921875, "completions/mean_terminated_length": 8472.5166015625, "completions/min_length": 3957.0, "completions/min_terminated_length": 3957.0, "entropy": 0.36018745601177216, "epoch": 0.7003205128205128, "frac_reward_zero_std": 0.78125, "grad_norm": 0.021929524838924408, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 595724013.0, "reward": 0.8848145008087158, "reward_std": 0.04023914784193039, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.016178663820028305, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0824849605560303, "sampling/importance_sampling_ratio/min": 2.4478840714436956e-05, "sampling/sampling_logp_difference/max": 10.617701530456543, "sampling/sampling_logp_difference/mean": 0.14112256467342377, "step": 437 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.36046767234802246, "epoch": 0.7019230769230769, "grad_norm": 0.008901628665626049, "learning_rate": 1e-06, "loss": -0.0151, "step": 438 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3631622791290283, "epoch": 0.7035256410256411, "grad_norm": 0.009371104650199413, "learning_rate": 1e-06, "loss": -0.0012, "step": 439 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3572462946176529, "epoch": 0.7051282051282052, "grad_norm": 0.03333629295229912, "learning_rate": 1e-06, "loss": 0.028, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15100.0, "completions/mean_length": 8613.44140625, "completions/mean_terminated_length": 8598.234375, "completions/min_length": 3167.0, "completions/min_terminated_length": 3167.0, "entropy": 0.35990896821022034, "epoch": 0.7067307692307693, "frac_reward_zero_std": 0.84375, "grad_norm": 0.020661354064941406, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 601002703.0, "reward": 0.890869140625, "reward_std": 0.02897302433848381, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.99365234375, "rewards/symbolic_reward_partial_score/std": 0.07662402093410492, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.081774353981018, "sampling/importance_sampling_ratio/min": 0.0014660513261333108, "sampling/sampling_logp_difference/max": 6.525182723999023, "sampling/sampling_logp_difference/mean": 0.13994953036308289, "step": 441 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3518172800540924, "epoch": 0.7083333333333334, "grad_norm": 0.006469558924436569, "learning_rate": 1e-06, "loss": -0.0004, "step": 442 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3583241403102875, "epoch": 0.7099358974358975, "grad_norm": 0.004939902573823929, "learning_rate": 1e-06, "loss": -0.0275, "step": 443 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35749298334121704, "epoch": 0.7115384615384616, "grad_norm": 0.03977655991911888, "learning_rate": 1e-06, "loss": 0.0204, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16122.0, "completions/mean_length": 8782.55859375, "completions/mean_terminated_length": 8767.6826171875, "completions/min_length": 3384.0, "completions/min_terminated_length": 3384.0, "entropy": 0.3510270267724991, "epoch": 0.7131410256410257, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03301560506224632, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 606343597.0, "reward": 0.8837646842002869, "reward_std": 0.030399255454540253, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9940592646598816, "rewards/symbolic_reward_partial_score/std": 0.05622868239879608, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.080472469329834, "sampling/importance_sampling_ratio/min": 0.0006269825389608741, "sampling/sampling_logp_difference/max": 7.374591827392578, "sampling/sampling_logp_difference/mean": 0.13803288340568542, "step": 445 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35030949115753174, "epoch": 0.7147435897435898, "grad_norm": 0.01800094172358513, "learning_rate": 1e-06, "loss": -0.0036, "step": 446 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35025037825107574, "epoch": 0.7163461538461539, "grad_norm": 0.024082746356725693, "learning_rate": 1e-06, "loss": -0.0023, "step": 447 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35331277549266815, "epoch": 0.717948717948718, "grad_norm": 0.012219024822115898, "learning_rate": 1e-06, "loss": -0.0035, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15603.0, "completions/max_terminated_length": 15603.0, "completions/mean_length": 8595.328125, "completions/mean_terminated_length": 8595.328125, "completions/min_length": 3447.0, "completions/min_terminated_length": 3447.0, "entropy": 0.3564409017562866, "epoch": 0.719551282051282, "frac_reward_zero_std": 0.875, "grad_norm": 0.01346027571707964, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 611701701.0, "reward": 0.8896973133087158, "reward_std": 0.026320863515138626, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, "rewards/symbolic_reward_partial_score/std": 0.04519396647810936, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0815019607543945, "sampling/importance_sampling_ratio/min": 0.0023846339900046587, "sampling/sampling_logp_difference/max": 6.03870964050293, "sampling/sampling_logp_difference/mean": 0.13934297859668732, "step": 449 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35363440215587616, "epoch": 0.7211538461538461, "grad_norm": 0.030180808156728745, "learning_rate": 1e-06, "loss": -0.0056, "step": 450 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3554205894470215, "epoch": 0.7227564102564102, "grad_norm": 0.026657214388251305, "learning_rate": 1e-06, "loss": -0.0036, "step": 451 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3553068935871124, "epoch": 0.7243589743589743, "grad_norm": 0.023180054500699043, "learning_rate": 1e-06, "loss": 0.0014, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15589.0, "completions/mean_length": 8833.49609375, "completions/mean_terminated_length": 8803.88671875, "completions/min_length": 4390.0, "completions/min_terminated_length": 4390.0, "entropy": 0.3539529889822006, "epoch": 0.7259615384615384, "frac_reward_zero_std": 0.875, "grad_norm": 0.006918177008628845, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 617112531.0, "reward": 0.8917285799980164, "reward_std": 0.026078036054968834, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9971679449081421, "rewards/symbolic_reward_partial_score/std": 0.04504242166876793, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0814685821533203, "sampling/importance_sampling_ratio/min": 0.003002573037520051, "sampling/sampling_logp_difference/max": 5.808285713195801, "sampling/sampling_logp_difference/mean": 0.13951382040977478, "step": 453 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35784053802490234, "epoch": 0.7275641025641025, "grad_norm": 0.03929273784160614, "learning_rate": 1e-06, "loss": 0.0343, "step": 454 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35644376277923584, "epoch": 0.7291666666666666, "grad_norm": 0.006792739033699036, "learning_rate": 1e-06, "loss": -0.0045, "step": 455 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3582670986652374, "epoch": 0.7307692307692307, "grad_norm": 0.006977362558245659, "learning_rate": 1e-06, "loss": 0.0062, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16102.0, "completions/max_terminated_length": 16102.0, "completions/mean_length": 9150.384765625, "completions/mean_terminated_length": 9150.384765625, "completions/min_length": 4510.0, "completions/min_terminated_length": 4510.0, "entropy": 0.3545047491788864, "epoch": 0.7323717948717948, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02863190323114395, "learning_rate": 1e-06, "loss": -0.006, "num_tokens": 622718600.0, "reward": 0.8840234279632568, "reward_std": 0.04033438116312027, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9942708015441895, "rewards/symbolic_reward_partial_score/std": 0.06366147845983505, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0819820165634155, "sampling/importance_sampling_ratio/min": 0.0015321827959269285, "sampling/sampling_logp_difference/max": 6.481061935424805, "sampling/sampling_logp_difference/mean": 0.1397707313299179, "step": 457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3561212420463562, "epoch": 0.7339743589743589, "grad_norm": 0.011848966591060162, "learning_rate": 1e-06, "loss": 0.0048, "step": 458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35785505175590515, "epoch": 0.7355769230769231, "grad_norm": 0.008489933796226978, "learning_rate": 1e-06, "loss": 0.0164, "step": 459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35608378052711487, "epoch": 0.7371794871794872, "grad_norm": 0.013068986125290394, "learning_rate": 1e-06, "loss": -0.0196, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15891.0, "completions/mean_length": 9031.88671875, "completions/mean_terminated_length": 9017.4990234375, "completions/min_length": 3230.0, "completions/min_terminated_length": 3230.0, "entropy": 0.3540836423635483, "epoch": 0.7387820512820513, "frac_reward_zero_std": 0.71875, "grad_norm": 0.04618172347545624, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 628202206.0, "reward": 0.8800146579742432, "reward_std": 0.06314942240715027, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9848144054412842, "rewards/symbolic_reward_partial_score/std": 0.11697658896446228, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0817463397979736, "sampling/importance_sampling_ratio/min": 0.00019343453459441662, "sampling/sampling_logp_difference/max": 8.55057144165039, "sampling/sampling_logp_difference/mean": 0.13958865404129028, "step": 461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.35761868953704834, "epoch": 0.7403846153846154, "grad_norm": 0.045976750552654266, "learning_rate": 1e-06, "loss": 0.0128, "step": 462 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.356304407119751, "epoch": 0.7419871794871795, "grad_norm": 0.017900973558425903, "learning_rate": 1e-06, "loss": -0.0142, "step": 463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.35570865869522095, "epoch": 0.7435897435897436, "grad_norm": 0.013649231754243374, "learning_rate": 1e-06, "loss": 0.0053, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16213.0, "completions/mean_length": 9302.775390625, "completions/mean_terminated_length": 9275.0068359375, "completions/min_length": 3300.0, "completions/min_terminated_length": 3300.0, "entropy": 0.34583213925361633, "epoch": 0.7451923076923077, "frac_reward_zero_std": 0.5625, "grad_norm": 0.04486886411905289, "learning_rate": 1e-06, "loss": 0.0691, "num_tokens": 633961371.0, "reward": 0.8627539277076721, "reward_std": 0.10190732777118683, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.970898449420929, "rewards/symbolic_reward_partial_score/std": 0.1634274274110794, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0803803205490112, "sampling/importance_sampling_ratio/min": 0.0016337695997208357, "sampling/sampling_logp_difference/max": 6.416865348815918, "sampling/sampling_logp_difference/mean": 0.13692045211791992, "step": 465 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.34644070267677307, "epoch": 0.7467948717948718, "grad_norm": 0.044408995658159256, "learning_rate": 1e-06, "loss": -0.036, "step": 466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.349708616733551, "epoch": 0.7483974358974359, "grad_norm": 0.0231320858001709, "learning_rate": 1e-06, "loss": -0.056, "step": 467 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3454091101884842, "epoch": 0.75, "grad_norm": 0.04146129637956619, "learning_rate": 1e-06, "loss": 0.0432, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15407.0, "completions/max_terminated_length": 15407.0, "completions/mean_length": 8852.943359375, "completions/mean_terminated_length": 8852.943359375, "completions/min_length": 3225.0, "completions/min_terminated_length": 3225.0, "entropy": 0.35281412303447723, "epoch": 0.7516025641025641, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03396932780742645, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 639348686.0, "reward": 0.8635010123252869, "reward_std": 0.08311234414577484, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.9720866084098816, "rewards/symbolic_reward_partial_score/std": 0.15833736956119537, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.081454873085022, "sampling/importance_sampling_ratio/min": 6.140080954153339e-16, "sampling/sampling_logp_difference/max": 35.02652359008789, "sampling/sampling_logp_difference/mean": 0.1392558515071869, "step": 469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.35504350066185, "epoch": 0.7532051282051282, "grad_norm": 0.03383636847138405, "learning_rate": 1e-06, "loss": -0.0306, "step": 470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3511788100004196, "epoch": 0.7548076923076923, "grad_norm": 0.016086360439658165, "learning_rate": 1e-06, "loss": -0.0376, "step": 471 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.353060781955719, "epoch": 0.7564102564102564, "grad_norm": 0.018982337787747383, "learning_rate": 1e-06, "loss": 0.0579, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15526.0, "completions/mean_length": 8947.90234375, "completions/mean_terminated_length": 8918.7421875, "completions/min_length": 3442.0, "completions/min_terminated_length": 3442.0, "entropy": 0.3535376191139221, "epoch": 0.7580128205128205, "frac_reward_zero_std": 0.8125, "grad_norm": 0.03322957828640938, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 644786348.0, "reward": 0.8871484398841858, "reward_std": 0.03933054581284523, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9897135496139526, "rewards/symbolic_reward_partial_score/std": 0.09861777722835541, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0809029340744019, "sampling/importance_sampling_ratio/min": 0.000914527743589133, "sampling/sampling_logp_difference/max": 6.997102737426758, "sampling/sampling_logp_difference/mean": 0.13842171430587769, "step": 473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3526497185230255, "epoch": 0.7596153846153846, "grad_norm": 0.008333048783242702, "learning_rate": 1e-06, "loss": -0.0032, "step": 474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3505059778690338, "epoch": 0.7612179487179487, "grad_norm": 0.007741092704236507, "learning_rate": 1e-06, "loss": -0.0055, "step": 475 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3537901043891907, "epoch": 0.7628205128205128, "grad_norm": 0.008508720435202122, "learning_rate": 1e-06, "loss": 0.0219, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16281.0, "completions/mean_length": 9367.115234375, "completions/mean_terminated_length": 9339.5986328125, "completions/min_length": 3479.0, "completions/min_terminated_length": 3479.0, "entropy": 0.3506188541650772, "epoch": 0.7644230769230769, "frac_reward_zero_std": 0.875, "grad_norm": 0.027823351323604584, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 650486679.0, "reward": 0.8952149152755737, "reward_std": 0.01864551566541195, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0803250074386597, "sampling/importance_sampling_ratio/min": 0.0017684729536995292, "sampling/sampling_logp_difference/max": 6.337638854980469, "sampling/sampling_logp_difference/mean": 0.1373838186264038, "step": 477 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3502143770456314, "epoch": 0.7660256410256411, "grad_norm": 0.006330309435725212, "learning_rate": 1e-06, "loss": -0.0048, "step": 478 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35096271336078644, "epoch": 0.7676282051282052, "grad_norm": 0.04565264657139778, "learning_rate": 1e-06, "loss": 0.0217, "step": 479 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3490835726261139, "epoch": 0.7692307692307693, "grad_norm": 0.005274102557450533, "learning_rate": 1e-06, "loss": -0.0251, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15412.0, "completions/mean_length": 9111.294921875, "completions/mean_terminated_length": 9097.0625, "completions/min_length": 2826.0, "completions/min_terminated_length": 2826.0, "entropy": 0.3503502458333969, "epoch": 0.7708333333333334, "frac_reward_zero_std": 0.78125, "grad_norm": 0.03831252083182335, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 656072990.0, "reward": 0.8912500143051147, "reward_std": 0.03190556913614273, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9988281726837158, "rewards/symbolic_reward_partial_score/std": 0.009986638091504574, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079893708229065, "sampling/importance_sampling_ratio/min": 0.001857686205767095, "sampling/sampling_logp_difference/max": 6.288423538208008, "sampling/sampling_logp_difference/mean": 0.13626208901405334, "step": 481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3467428386211395, "epoch": 0.7724358974358975, "grad_norm": 0.02244342491030693, "learning_rate": 1e-06, "loss": 0.0084, "step": 482 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34799858927726746, "epoch": 0.7740384615384616, "grad_norm": 0.03198394924402237, "learning_rate": 1e-06, "loss": 0.0081, "step": 483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.34685586392879486, "epoch": 0.7756410256410257, "grad_norm": 0.04258110746741295, "learning_rate": 1e-06, "loss": 0.0082, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16010.0, "completions/mean_length": 8945.87109375, "completions/mean_terminated_length": 8931.3154296875, "completions/min_length": 4307.0, "completions/min_terminated_length": 4307.0, "entropy": 0.35287177562713623, "epoch": 0.7772435897435898, "frac_reward_zero_std": 0.90625, "grad_norm": 0.04437045007944107, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 661405004.0, "reward": 0.8942578434944153, "reward_std": 0.01977742463350296, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9971354007720947, "rewards/symbolic_reward_partial_score/std": 0.04583593085408211, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0803794860839844, "sampling/importance_sampling_ratio/min": 0.0032736759167164564, "sampling/sampling_logp_difference/max": 5.721841812133789, "sampling/sampling_logp_difference/mean": 0.13785810768604279, "step": 485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3534203916788101, "epoch": 0.7788461538461539, "grad_norm": 0.004420117940753698, "learning_rate": 1e-06, "loss": -0.0185, "step": 486 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35098250210285187, "epoch": 0.780448717948718, "grad_norm": 0.0055827731266617775, "learning_rate": 1e-06, "loss": 0.0013, "step": 487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35377365350723267, "epoch": 0.782051282051282, "grad_norm": 0.005644972436130047, "learning_rate": 1e-06, "loss": 0.0013, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14908.0, "completions/mean_length": 8959.513671875, "completions/mean_terminated_length": 8944.984375, "completions/min_length": 2746.0, "completions/min_terminated_length": 2746.0, "entropy": 0.3521414250135422, "epoch": 0.7836538461538461, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03292594850063324, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 666857683.0, "reward": 0.893310546875, "reward_std": 0.019982675090432167, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9972330331802368, "rewards/symbolic_reward_partial_score/std": 0.04521510377526283, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.080183506011963, "sampling/importance_sampling_ratio/min": 0.001276485389098525, "sampling/sampling_logp_difference/max": 6.663644790649414, "sampling/sampling_logp_difference/mean": 0.13744832575321198, "step": 489 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35108447074890137, "epoch": 0.7852564102564102, "grad_norm": 0.005429164972156286, "learning_rate": 1e-06, "loss": -0.0187, "step": 490 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35315074026584625, "epoch": 0.7868589743589743, "grad_norm": 0.0047265770845115185, "learning_rate": 1e-06, "loss": 0.0406, "step": 491 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3532349318265915, "epoch": 0.7884615384615384, "grad_norm": 0.0051872618496418, "learning_rate": 1e-06, "loss": -0.0195, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15847.0, "completions/max_terminated_length": 15847.0, "completions/mean_length": 8263.16796875, "completions/mean_terminated_length": 8263.16796875, "completions/min_length": 3232.0, "completions/min_terminated_length": 3232.0, "entropy": 0.3506288528442383, "epoch": 0.7900641025641025, "frac_reward_zero_std": 0.875, "grad_norm": 0.03242136538028717, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 671971593.0, "reward": 0.8926270008087158, "reward_std": 0.023242928087711334, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9988607168197632, "rewards/symbolic_reward_partial_score/std": 0.011000390164554119, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0794179439544678, "sampling/importance_sampling_ratio/min": 3.908433427568525e-05, "sampling/sampling_logp_difference/max": 10.149788856506348, "sampling/sampling_logp_difference/mean": 0.136383056640625, "step": 493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.34977929294109344, "epoch": 0.7916666666666666, "grad_norm": 0.02093787118792534, "learning_rate": 1e-06, "loss": -0.0087, "step": 494 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3473648726940155, "epoch": 0.7932692307692307, "grad_norm": 0.020648593083024025, "learning_rate": 1e-06, "loss": -0.0107, "step": 495 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.34989985823631287, "epoch": 0.7948717948717948, "grad_norm": 0.005807805806398392, "learning_rate": 1e-06, "loss": -0.0092, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14949.0, "completions/max_terminated_length": 14949.0, "completions/mean_length": 7869.15234375, "completions/mean_terminated_length": 7869.15234375, "completions/min_length": 2486.0, "completions/min_terminated_length": 2486.0, "entropy": 0.358594685792923, "epoch": 0.7964743589743589, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0033265817910432816, "learning_rate": 1e-06, "loss": -0.0084, "num_tokens": 676874327.0, "reward": 0.8970117568969727, "reward_std": 0.011953125707805157, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.997851550579071, "rewards/symbolic_reward_partial_score/std": 0.0444059856235981, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0811829566955566, "sampling/importance_sampling_ratio/min": 0.0013898679753765464, "sampling/sampling_logp_difference/max": 6.578546524047852, "sampling/sampling_logp_difference/mean": 0.13945680856704712, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3543122410774231, "epoch": 0.7980769230769231, "grad_norm": 0.003433031029999256, "learning_rate": 1e-06, "loss": 0.0077, "step": 498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35702161490917206, "epoch": 0.7996794871794872, "grad_norm": 0.0025476589798927307, "learning_rate": 1e-06, "loss": -0.0088, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3605404496192932, "epoch": 0.8012820512820513, "grad_norm": 0.003566565690562129, "learning_rate": 1e-06, "loss": 0.0075, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15634.0, "completions/max_terminated_length": 15634.0, "completions/mean_length": 7596.6015625, "completions/mean_terminated_length": 7596.6015625, "completions/min_length": 1919.0, "completions/min_terminated_length": 1919.0, "entropy": 0.34830760955810547, "epoch": 0.8028846153846154, "frac_reward_zero_std": 0.875, "grad_norm": 0.005601275246590376, "learning_rate": 1e-06, "loss": -0.0206, "num_tokens": 681632843.0, "reward": 0.8949024081230164, "reward_std": 0.020390626043081284, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9986327886581421, "rewards/symbolic_reward_partial_score/std": 0.017917124554514885, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079047679901123, "sampling/importance_sampling_ratio/min": 0.0030545254703611135, "sampling/sampling_logp_difference/max": 5.791131019592285, "sampling/sampling_logp_difference/mean": 0.13599880039691925, "step": 501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3446967899799347, "epoch": 0.8044871794871795, "grad_norm": 0.004956559743732214, "learning_rate": 1e-06, "loss": 0.0201, "step": 502 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.34777167439460754, "epoch": 0.8060897435897436, "grad_norm": 0.0046878382563591, "learning_rate": 1e-06, "loss": 0.026, "step": 503 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3478347361087799, "epoch": 0.8076923076923077, "grad_norm": 0.005885657388716936, "learning_rate": 1e-06, "loss": -0.0198, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14632.0, "completions/max_terminated_length": 14632.0, "completions/mean_length": 7865.123046875, "completions/mean_terminated_length": 7865.123046875, "completions/min_length": 3031.0, "completions/min_terminated_length": 3031.0, "entropy": 0.3459438234567642, "epoch": 0.8092948717948718, "frac_reward_zero_std": 0.9375, "grad_norm": 0.006020909175276756, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 686566010.0, "reward": 0.8949218988418579, "reward_std": 0.013898635283112526, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9986979365348816, "rewards/symbolic_reward_partial_score/std": 0.01946326717734337, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078963041305542, "sampling/importance_sampling_ratio/min": 3.411178113310598e-05, "sampling/sampling_logp_difference/max": 10.285867691040039, "sampling/sampling_logp_difference/mean": 0.13606896996498108, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34449777007102966, "epoch": 0.8108974358974359, "grad_norm": 0.004631552845239639, "learning_rate": 1e-06, "loss": 0.0067, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34616413712501526, "epoch": 0.8125, "grad_norm": 0.006058537866920233, "learning_rate": 1e-06, "loss": 0.0147, "step": 507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34855130314826965, "epoch": 0.8141025641025641, "grad_norm": 0.005270438734441996, "learning_rate": 1e-06, "loss": -0.0156, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14870.0, "completions/max_terminated_length": 14870.0, "completions/mean_length": 7227.810546875, "completions/mean_terminated_length": 7227.810546875, "completions/min_length": 2077.0, "completions/min_terminated_length": 2077.0, "entropy": 0.3595229387283325, "epoch": 0.8157051282051282, "frac_reward_zero_std": 0.96875, "grad_norm": 0.012571842409670353, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 691035001.0, "reward": 0.8963379263877869, "reward_std": 0.007873298600316048, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0812909603118896, "sampling/importance_sampling_ratio/min": 0.001930754166096449, "sampling/sampling_logp_difference/max": 6.249844551086426, "sampling/sampling_logp_difference/mean": 0.1403871476650238, "step": 509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3557231277227402, "epoch": 0.8173076923076923, "grad_norm": 0.0038961891550570726, "learning_rate": 1e-06, "loss": -0.0098, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36071114242076874, "epoch": 0.8189102564102564, "grad_norm": 0.017868027091026306, "learning_rate": 1e-06, "loss": 0.0028, "step": 511 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3606826663017273, "epoch": 0.8205128205128205, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0108, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15480.0, "completions/mean_length": 7729.685546875, "completions/mean_terminated_length": 7712.74951171875, "completions/min_length": 3186.0, "completions/min_terminated_length": 3186.0, "entropy": 0.3494661897420883, "epoch": 0.8221153846153846, "frac_reward_zero_std": 0.84375, "grad_norm": 0.024848448112607002, "learning_rate": 1e-06, "loss": -0.0157, "num_tokens": 695869864.0, "reward": 0.8913525342941284, "reward_std": 0.02776050567626953, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9952636957168579, "rewards/symbolic_reward_partial_score/std": 0.06309281289577484, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796198844909668, "sampling/importance_sampling_ratio/min": 0.002194375963881612, "sampling/sampling_logp_difference/max": 6.121857643127441, "sampling/sampling_logp_difference/mean": 0.13689905405044556, "step": 513 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.35251592099666595, "epoch": 0.8237179487179487, "grad_norm": 0.015938380733132362, "learning_rate": 1e-06, "loss": 0.0119, "step": 514 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3494427651166916, "epoch": 0.8253205128205128, "grad_norm": 0.007019078359007835, "learning_rate": 1e-06, "loss": -0.0265, "step": 515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.349221408367157, "epoch": 0.8269230769230769, "grad_norm": 0.008070691488683224, "learning_rate": 1e-06, "loss": 0.0363, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14153.0, "completions/max_terminated_length": 14153.0, "completions/mean_length": 7543.953125, "completions/mean_terminated_length": 7543.953125, "completions/min_length": 2888.0, "completions/min_terminated_length": 2888.0, "entropy": 0.35047176480293274, "epoch": 0.8285256410256411, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004935094155371189, "learning_rate": 1e-06, "loss": -0.0122, "num_tokens": 700589776.0, "reward": 0.8963379263877869, "reward_std": 0.011554005555808544, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796656608581543, "sampling/importance_sampling_ratio/min": 0.0010661783162504435, "sampling/sampling_logp_difference/max": 6.843674659729004, "sampling/sampling_logp_difference/mean": 0.13724185526371002, "step": 517 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34853705763816833, "epoch": 0.8301282051282052, "grad_norm": 0.004621150437742472, "learning_rate": 1e-06, "loss": -0.0124, "step": 518 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3537525534629822, "epoch": 0.8317307692307693, "grad_norm": 0.027055934071540833, "learning_rate": 1e-06, "loss": 0.0029, "step": 519 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34897057712078094, "epoch": 0.8333333333333334, "grad_norm": 0.017684390768408775, "learning_rate": 1e-06, "loss": 0.0129, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13704.0, "completions/max_terminated_length": 13704.0, "completions/mean_length": 7982.77734375, "completions/mean_terminated_length": 7982.77734375, "completions/min_length": 3595.0, "completions/min_terminated_length": 3595.0, "entropy": 0.3441619724035263, "epoch": 0.8349358974358975, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003706417977809906, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 705571630.0, "reward": 0.89697265625, "reward_std": 0.01210937649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9977213144302368, "rewards/symbolic_reward_partial_score/std": 0.04478955641388893, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0786633491516113, "sampling/importance_sampling_ratio/min": 0.002241447102278471, "sampling/sampling_logp_difference/max": 6.10063362121582, "sampling/sampling_logp_difference/mean": 0.13530901074409485, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34124282002449036, "epoch": 0.8365384615384616, "grad_norm": 0.003935777582228184, "learning_rate": 1e-06, "loss": -0.0103, "step": 522 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34511810541152954, "epoch": 0.8381410256410257, "grad_norm": 0.003198092570528388, "learning_rate": 1e-06, "loss": 0.0138, "step": 523 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3459903448820114, "epoch": 0.8397435897435898, "grad_norm": 0.003314318135380745, "learning_rate": 1e-06, "loss": -0.0091, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15321.0, "completions/max_terminated_length": 15321.0, "completions/mean_length": 7773.587890625, "completions/mean_terminated_length": 7773.587890625, "completions/min_length": 2502.0, "completions/min_terminated_length": 2502.0, "entropy": 0.3592463433742523, "epoch": 0.8413461538461539, "frac_reward_zero_std": 0.9375, "grad_norm": 0.022124452516436577, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 710346843.0, "reward": 0.8949805498123169, "reward_std": 0.013724283315241337, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9988932609558105, "rewards/symbolic_reward_partial_score/std": 0.01488781999796629, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0813720226287842, "sampling/importance_sampling_ratio/min": 0.0011896007927134633, "sampling/sampling_logp_difference/max": 6.734137535095215, "sampling/sampling_logp_difference/mean": 0.14021453261375427, "step": 525 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.35817258059978485, "epoch": 0.842948717948718, "grad_norm": 0.006050014402717352, "learning_rate": 1e-06, "loss": -0.0012, "step": 526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.35983653366565704, "epoch": 0.844551282051282, "grad_norm": 0.005000798497349024, "learning_rate": 1e-06, "loss": -0.0168, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.36064282059669495, "epoch": 0.8461538461538461, "grad_norm": 0.021946826949715614, "learning_rate": 1e-06, "loss": 0.0136, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14918.0, "completions/max_terminated_length": 14918.0, "completions/mean_length": 8457.853515625, "completions/mean_terminated_length": 8457.853515625, "completions/min_length": 3365.0, "completions/min_terminated_length": 3365.0, "entropy": 0.3512197732925415, "epoch": 0.8477564102564102, "frac_reward_zero_std": 0.84375, "grad_norm": 0.014062962494790554, "learning_rate": 1e-06, "loss": -0.0208, "num_tokens": 715601920.0, "reward": 0.888867199420929, "reward_std": 0.03324303776025772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9947916269302368, "rewards/symbolic_reward_partial_score/std": 0.06355251371860504, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0800974369049072, "sampling/importance_sampling_ratio/min": 0.0020220300648361444, "sampling/sampling_logp_difference/max": 6.203653335571289, "sampling/sampling_logp_difference/mean": 0.13709557056427002, "step": 529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3483118563890457, "epoch": 0.8493589743589743, "grad_norm": 0.030690278857946396, "learning_rate": 1e-06, "loss": 0.0064, "step": 530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3472297489643097, "epoch": 0.8509615384615384, "grad_norm": 0.029186764732003212, "learning_rate": 1e-06, "loss": 0.015, "step": 531 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3533993065357208, "epoch": 0.8525641025641025, "grad_norm": 0.036227382719516754, "learning_rate": 1e-06, "loss": 0.0024, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13666.0, "completions/max_terminated_length": 13666.0, "completions/mean_length": 8019.4296875, "completions/mean_terminated_length": 8019.4296875, "completions/min_length": 3266.0, "completions/min_terminated_length": 3266.0, "entropy": 0.3659429997205734, "epoch": 0.8541666666666666, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004023151472210884, "learning_rate": 1e-06, "loss": -0.01, "num_tokens": 720472972.0, "reward": 0.8975586295127869, "reward_std": 0.009765625931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.082960844039917, "sampling/importance_sampling_ratio/min": 0.001946546952240169, "sampling/sampling_logp_difference/max": 6.241698265075684, "sampling/sampling_logp_difference/mean": 0.14204412698745728, "step": 533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3657459020614624, "epoch": 0.8557692307692307, "grad_norm": 0.0030788229778409004, "learning_rate": 1e-06, "loss": 0.0042, "step": 534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3604998290538788, "epoch": 0.8573717948717948, "grad_norm": 0.003156043589115143, "learning_rate": 1e-06, "loss": 0.0102, "step": 535 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.36391162872314453, "epoch": 0.8589743589743589, "grad_norm": 0.00332885910756886, "learning_rate": 1e-06, "loss": -0.0102, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14971.0, "completions/max_terminated_length": 14971.0, "completions/mean_length": 8043.517578125, "completions/mean_terminated_length": 8043.517578125, "completions/min_length": 3636.0, "completions/min_terminated_length": 3636.0, "entropy": 0.3486137092113495, "epoch": 0.8605769230769231, "frac_reward_zero_std": 0.9375, "grad_norm": 0.03648432344198227, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 725517061.0, "reward": 0.8957812786102295, "reward_std": 0.01323145255446434, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9976562261581421, "rewards/symbolic_reward_partial_score/std": 0.044615939259529114, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0795795917510986, "sampling/importance_sampling_ratio/min": 0.0013775692787021399, "sampling/sampling_logp_difference/max": 6.587434768676758, "sampling/sampling_logp_difference/mean": 0.13693031668663025, "step": 537 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.34757718443870544, "epoch": 0.8621794871794872, "grad_norm": 0.004802484530955553, "learning_rate": 1e-06, "loss": -0.0125, "step": 538 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.349036306142807, "epoch": 0.8637820512820513, "grad_norm": 0.004811335355043411, "learning_rate": 1e-06, "loss": -0.0128, "step": 539 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.34473690390586853, "epoch": 0.8653846153846154, "grad_norm": 0.024146266281604767, "learning_rate": 1e-06, "loss": 0.0052, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15815.0, "completions/max_terminated_length": 15815.0, "completions/mean_length": 7976.205078125, "completions/mean_terminated_length": 7976.205078125, "completions/min_length": 3195.0, "completions/min_terminated_length": 3195.0, "entropy": 0.3391701281070709, "epoch": 0.8669871794871795, "frac_reward_zero_std": 0.78125, "grad_norm": 0.014171161688864231, "learning_rate": 1e-06, "loss": -0.0068, "num_tokens": 730469742.0, "reward": 0.8826172351837158, "reward_std": 0.042127519845962524, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9967448115348816, "rewards/symbolic_reward_partial_score/std": 0.024094481021165848, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0776143074035645, "sampling/importance_sampling_ratio/min": 0.002562822075560689, "sampling/sampling_logp_difference/max": 5.966646194458008, "sampling/sampling_logp_difference/mean": 0.1333872377872467, "step": 541 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.33675433695316315, "epoch": 0.8685897435897436, "grad_norm": 0.010020706802606583, "learning_rate": 1e-06, "loss": -0.0402, "step": 542 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.33644047379493713, "epoch": 0.8701923076923077, "grad_norm": 0.030058881267905235, "learning_rate": 1e-06, "loss": -0.0027, "step": 543 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3406301885843277, "epoch": 0.8717948717948718, "grad_norm": 0.0302598774433136, "learning_rate": 1e-06, "loss": 0.038, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14176.0, "completions/max_terminated_length": 14176.0, "completions/mean_length": 7765.052734375, "completions/mean_terminated_length": 7765.052734375, "completions/min_length": 2888.0, "completions/min_terminated_length": 2888.0, "entropy": 0.34120941162109375, "epoch": 0.8733974358974359, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0033070528879761696, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 735281545.0, "reward": 0.8974902629852295, "reward_std": 0.010039063170552254, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9994466304779053, "rewards/symbolic_reward_partial_score/std": 0.009568748995661736, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787575244903564, "sampling/importance_sampling_ratio/min": 0.0011823868844658136, "sampling/sampling_logp_difference/max": 6.740220069885254, "sampling/sampling_logp_difference/mean": 0.13539470732212067, "step": 545 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.343269482254982, "epoch": 0.875, "grad_norm": 0.0036119655705988407, "learning_rate": 1e-06, "loss": -0.0084, "step": 546 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3422205001115799, "epoch": 0.8766025641025641, "grad_norm": 0.024655714631080627, "learning_rate": 1e-06, "loss": 0.0045, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34527136385440826, "epoch": 0.8782051282051282, "grad_norm": 0.03149307519197464, "learning_rate": 1e-06, "loss": 0.014, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13980.0, "completions/mean_length": 8458.0703125, "completions/mean_terminated_length": 8442.5595703125, "completions/min_length": 3255.0, "completions/min_terminated_length": 3255.0, "entropy": 0.3415333330631256, "epoch": 0.8798076923076923, "frac_reward_zero_std": 0.75, "grad_norm": 0.035741694271564484, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 740569997.0, "reward": 0.8761670589447021, "reward_std": 0.05735776573419571, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9915201663970947, "rewards/symbolic_reward_partial_score/std": 0.06965228170156479, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078197956085205, "sampling/importance_sampling_ratio/min": 0.00012383870489429682, "sampling/sampling_logp_difference/max": 8.996530532836914, "sampling/sampling_logp_difference/mean": 0.13424280285835266, "step": 549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.34185269474983215, "epoch": 0.8814102564102564, "grad_norm": 0.027247874066233635, "learning_rate": 1e-06, "loss": -0.0193, "step": 550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.340072363615036, "epoch": 0.8830128205128205, "grad_norm": 0.026825400069355965, "learning_rate": 1e-06, "loss": -0.006, "step": 551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3389461189508438, "epoch": 0.8846153846153846, "grad_norm": 0.01177581213414669, "learning_rate": 1e-06, "loss": 0.0108, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15687.0, "completions/mean_length": 8557.7578125, "completions/mean_terminated_length": 8542.4423828125, "completions/min_length": 3451.0, "completions/min_terminated_length": 3451.0, "entropy": 0.3391670137643814, "epoch": 0.8862179487179487, "frac_reward_zero_std": 0.8125, "grad_norm": 0.05094817653298378, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 745929169.0, "reward": 0.8828418254852295, "reward_std": 0.041340723633766174, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9942382574081421, "rewards/symbolic_reward_partial_score/std": 0.06357735395431519, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078660249710083, "sampling/importance_sampling_ratio/min": 0.0018597175367176533, "sampling/sampling_logp_difference/max": 6.287330627441406, "sampling/sampling_logp_difference/mean": 0.13432765007019043, "step": 553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3397351950407028, "epoch": 0.8878205128205128, "grad_norm": 0.023710021749138832, "learning_rate": 1e-06, "loss": -0.0204, "step": 554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3418770432472229, "epoch": 0.8894230769230769, "grad_norm": 0.014287839643657207, "learning_rate": 1e-06, "loss": -0.0068, "step": 555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.34017275273799896, "epoch": 0.8910256410256411, "grad_norm": 0.03731085732579231, "learning_rate": 1e-06, "loss": 0.0191, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15681.0, "completions/mean_length": 8314.595703125, "completions/mean_terminated_length": 8298.8046875, "completions/min_length": 2319.0, "completions/min_terminated_length": 2319.0, "entropy": 0.34538203477859497, "epoch": 0.8926282051282052, "frac_reward_zero_std": 0.84375, "grad_norm": 0.034499768167734146, "learning_rate": 1e-06, "loss": -0.0083, "num_tokens": 751059218.0, "reward": 0.8814844489097595, "reward_std": 0.0325779914855957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.970703125, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.9975260496139526, "rewards/symbolic_reward_partial_score/std": 0.01427219994366169, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788187980651855, "sampling/importance_sampling_ratio/min": 0.0011919845128431916, "sampling/sampling_logp_difference/max": 6.732135772705078, "sampling/sampling_logp_difference/mean": 0.1352325677871704, "step": 557 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33867380023002625, "epoch": 0.8942307692307693, "grad_norm": 0.009168021380901337, "learning_rate": 1e-06, "loss": -0.0062, "step": 558 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3426493704319, "epoch": 0.8958333333333334, "grad_norm": 0.026689838618040085, "learning_rate": 1e-06, "loss": 0.0336, "step": 559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.34023988246917725, "epoch": 0.8974358974358975, "grad_norm": 0.0200203787535429, "learning_rate": 1e-06, "loss": -0.007, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14225.0, "completions/max_terminated_length": 14225.0, "completions/mean_length": 8181.326171875, "completions/mean_terminated_length": 8181.326171875, "completions/min_length": 2263.0, "completions/min_terminated_length": 2263.0, "entropy": 0.33621205389499664, "epoch": 0.8990384615384616, "frac_reward_zero_std": 0.8125, "grad_norm": 0.013075647875666618, "learning_rate": 1e-06, "loss": -0.0235, "num_tokens": 756131337.0, "reward": 0.8851074576377869, "reward_std": 0.03694463148713112, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9972330927848816, "rewards/symbolic_reward_partial_score/std": 0.025120805948972702, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780417919158936, "sampling/importance_sampling_ratio/min": 0.00187345826998353, "sampling/sampling_logp_difference/max": 6.279969215393066, "sampling/sampling_logp_difference/mean": 0.1335962861776352, "step": 561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.33530767261981964, "epoch": 0.9006410256410257, "grad_norm": 0.028118403628468513, "learning_rate": 1e-06, "loss": -0.0032, "step": 562 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3365401029586792, "epoch": 0.9022435897435898, "grad_norm": 0.013327058404684067, "learning_rate": 1e-06, "loss": 0.0064, "step": 563 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.337029293179512, "epoch": 0.9038461538461539, "grad_norm": 0.026841716840863228, "learning_rate": 1e-06, "loss": 0.0058, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14894.0, "completions/mean_length": 8410.248046875, "completions/mean_terminated_length": 8331.611328125, "completions/min_length": 3038.0, "completions/min_terminated_length": 3038.0, "entropy": 0.3237399756908417, "epoch": 0.905448717948718, "frac_reward_zero_std": 0.84375, "grad_norm": 0.03471784666180611, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 761358792.0, "reward": 0.8862695693969727, "reward_std": 0.03639495372772217, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9926432371139526, "rewards/symbolic_reward_partial_score/std": 0.0781535804271698, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0754002332687378, "sampling/importance_sampling_ratio/min": 0.0025414065457880497, "sampling/sampling_logp_difference/max": 5.975037574768066, "sampling/sampling_logp_difference/mean": 0.12898731231689453, "step": 565 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.32419992983341217, "epoch": 0.907051282051282, "grad_norm": 0.04175221920013428, "learning_rate": 1e-06, "loss": 0.0366, "step": 566 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3232503980398178, "epoch": 0.9086538461538461, "grad_norm": 0.017876626923680305, "learning_rate": 1e-06, "loss": -0.0176, "step": 567 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.32405321300029755, "epoch": 0.9102564102564102, "grad_norm": 0.009180638007819653, "learning_rate": 1e-06, "loss": -0.0137, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16196.0, "completions/mean_length": 8126.017578125, "completions/mean_terminated_length": 8109.85693359375, "completions/min_length": 2547.0, "completions/min_terminated_length": 2547.0, "entropy": 0.318324476480484, "epoch": 0.9118589743589743, "frac_reward_zero_std": 0.78125, "grad_norm": 0.059893812984228134, "learning_rate": 1e-06, "loss": 0.091, "num_tokens": 766446097.0, "reward": 0.8878125548362732, "reward_std": 0.04243948683142662, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9945312738418579, "rewards/symbolic_reward_partial_score/std": 0.06387193500995636, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0746209621429443, "sampling/importance_sampling_ratio/min": 0.0013399770250543952, "sampling/sampling_logp_difference/max": 6.615102767944336, "sampling/sampling_logp_difference/mean": 0.1282263845205307, "step": 569 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.32095424830913544, "epoch": 0.9134615384615384, "grad_norm": 0.007462606765329838, "learning_rate": 1e-06, "loss": -0.0061, "step": 570 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.32305654883384705, "epoch": 0.9150641025641025, "grad_norm": 0.007424467243254185, "learning_rate": 1e-06, "loss": -0.0398, "step": 571 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.32131558656692505, "epoch": 0.9166666666666666, "grad_norm": 0.008652808144688606, "learning_rate": 1e-06, "loss": -0.0255, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15004.0, "completions/max_terminated_length": 15004.0, "completions/mean_length": 7835.513671875, "completions/mean_terminated_length": 7835.513671875, "completions/min_length": 1667.0, "completions/min_terminated_length": 1667.0, "entropy": 0.32700352370738983, "epoch": 0.9182692307692307, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004091544076800346, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 771284648.0, "reward": 0.8975342512130737, "reward_std": 0.00986328162252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9995931386947632, "rewards/symbolic_reward_partial_score/std": 0.006633348762989044, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0761460065841675, "sampling/importance_sampling_ratio/min": 0.00308411568403244, "sampling/sampling_logp_difference/max": 5.781490325927734, "sampling/sampling_logp_difference/mean": 0.1310756802558899, "step": 573 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3267098069190979, "epoch": 0.9198717948717948, "grad_norm": 0.004021909087896347, "learning_rate": 1e-06, "loss": -0.0107, "step": 574 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.33004049956798553, "epoch": 0.9214743589743589, "grad_norm": 0.03467942029237747, "learning_rate": 1e-06, "loss": 0.0196, "step": 575 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3284391909837723, "epoch": 0.9230769230769231, "grad_norm": 0.004009925294667482, "learning_rate": 1e-06, "loss": -0.0112, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14375.0, "completions/max_terminated_length": 14375.0, "completions/mean_length": 7657.677734375, "completions/mean_terminated_length": 7657.677734375, "completions/min_length": 2564.0, "completions/min_terminated_length": 2564.0, "entropy": 0.3263503909111023, "epoch": 0.9246794871794872, "frac_reward_zero_std": 0.84375, "grad_norm": 0.02574007958173752, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 776033699.0, "reward": 0.8927539587020874, "reward_std": 0.028984377160668373, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9953776001930237, "rewards/symbolic_reward_partial_score/std": 0.05420267954468727, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0757060050964355, "sampling/importance_sampling_ratio/min": 0.0019505794625729322, "sampling/sampling_logp_difference/max": 6.239628791809082, "sampling/sampling_logp_difference/mean": 0.13035494089126587, "step": 577 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.32775697112083435, "epoch": 0.9262820512820513, "grad_norm": 0.027526434510946274, "learning_rate": 1e-06, "loss": 0.0132, "step": 578 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.32670439779758453, "epoch": 0.9278846153846154, "grad_norm": 0.004334470722824335, "learning_rate": 1e-06, "loss": 0.0139, "step": 579 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3250982463359833, "epoch": 0.9294871794871795, "grad_norm": 0.005130876321345568, "learning_rate": 1e-06, "loss": -0.0216, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16199.0, "completions/max_terminated_length": 16199.0, "completions/mean_length": 7839.169921875, "completions/mean_terminated_length": 7839.169921875, "completions/min_length": 2284.0, "completions/min_terminated_length": 2284.0, "entropy": 0.31938914954662323, "epoch": 0.9310897435897436, "frac_reward_zero_std": 0.78125, "grad_norm": 0.03870486468076706, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 780940426.0, "reward": 0.8854395151138306, "reward_std": 0.04298959672451019, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.994433581829071, "rewards/symbolic_reward_partial_score/std": 0.06360119581222534, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073976993560791, "sampling/importance_sampling_ratio/min": 0.0017784941010177135, "sampling/sampling_logp_difference/max": 6.331988334655762, "sampling/sampling_logp_difference/mean": 0.12727048993110657, "step": 581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.31475602090358734, "epoch": 0.9326923076923077, "grad_norm": 0.005861265119165182, "learning_rate": 1e-06, "loss": -0.0224, "step": 582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.31599095463752747, "epoch": 0.9342948717948718, "grad_norm": 0.04167579486966133, "learning_rate": 1e-06, "loss": 0.0347, "step": 583 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.316402330994606, "epoch": 0.9358974358974359, "grad_norm": 0.008439240045845509, "learning_rate": 1e-06, "loss": -0.002, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14758.0, "completions/max_terminated_length": 14758.0, "completions/mean_length": 8480.80078125, "completions/mean_terminated_length": 8480.80078125, "completions/min_length": 2513.0, "completions/min_terminated_length": 2513.0, "entropy": 0.31434524059295654, "epoch": 0.9375, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0425376296043396, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 786220196.0, "reward": 0.8827539682388306, "reward_std": 0.04234263300895691, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9932942986488342, "rewards/symbolic_reward_partial_score/std": 0.06595806032419205, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0727444887161255, "sampling/importance_sampling_ratio/min": 0.00259688263759017, "sampling/sampling_logp_difference/max": 5.95344352722168, "sampling/sampling_logp_difference/mean": 0.12544244527816772, "step": 585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3120368868112564, "epoch": 0.9391025641025641, "grad_norm": 0.027404973283410072, "learning_rate": 1e-06, "loss": 0.0034, "step": 586 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31391267478466034, "epoch": 0.9407051282051282, "grad_norm": 0.011466105468571186, "learning_rate": 1e-06, "loss": -0.0099, "step": 587 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30838459730148315, "epoch": 0.9423076923076923, "grad_norm": 0.01363231148570776, "learning_rate": 1e-06, "loss": -0.0252, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15039.0, "completions/max_terminated_length": 15039.0, "completions/mean_length": 8221.654296875, "completions/mean_terminated_length": 8221.654296875, "completions/min_length": 3078.0, "completions/min_terminated_length": 3078.0, "entropy": 0.31640224158763885, "epoch": 0.9439102564102564, "frac_reward_zero_std": 0.875, "grad_norm": 0.012828879989683628, "learning_rate": 1e-06, "loss": -0.0195, "num_tokens": 791379779.0, "reward": 0.888427734375, "reward_std": 0.02675781399011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.99658203125, "rewards/symbolic_reward_partial_score/std": 0.04576823115348816, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0732287168502808, "sampling/importance_sampling_ratio/min": 0.001962594222277403, "sampling/sampling_logp_difference/max": 6.233488082885742, "sampling/sampling_logp_difference/mean": 0.12605366110801697, "step": 589 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31497912108898163, "epoch": 0.9455128205128205, "grad_norm": 0.007003838662058115, "learning_rate": 1e-06, "loss": 0.0055, "step": 590 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31226901710033417, "epoch": 0.9471153846153846, "grad_norm": 0.009434962645173073, "learning_rate": 1e-06, "loss": -0.0171, "step": 591 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31289002299308777, "epoch": 0.9487179487179487, "grad_norm": 0.029598215594887733, "learning_rate": 1e-06, "loss": 0.0323, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15631.0, "completions/mean_length": 7556.111328125, "completions/mean_terminated_length": 7521.49267578125, "completions/min_length": 2706.0, "completions/min_terminated_length": 2706.0, "entropy": 0.3237259238958359, "epoch": 0.9503205128205128, "frac_reward_zero_std": 0.78125, "grad_norm": 0.027938202023506165, "learning_rate": 1e-06, "loss": -0.0256, "num_tokens": 796036188.0, "reward": 0.8843164443969727, "reward_std": 0.0491020604968071, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9919922351837158, "rewards/symbolic_reward_partial_score/std": 0.07998085021972656, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0745928287506104, "sampling/importance_sampling_ratio/min": 0.0028330194763839245, "sampling/sampling_logp_difference/max": 5.866412162780762, "sampling/sampling_logp_difference/mean": 0.1289880871772766, "step": 593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.32408207654953003, "epoch": 0.9519230769230769, "grad_norm": 0.008252340368926525, "learning_rate": 1e-06, "loss": -0.0019, "step": 594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.32150138914585114, "epoch": 0.9535256410256411, "grad_norm": 0.03429492935538292, "learning_rate": 1e-06, "loss": 0.0217, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.32402147352695465, "epoch": 0.9551282051282052, "grad_norm": 0.009120246395468712, "learning_rate": 1e-06, "loss": 0.0294, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12020.0, "completions/max_terminated_length": 12020.0, "completions/mean_length": 7426.94140625, "completions/mean_terminated_length": 7426.94140625, "completions/min_length": 2185.0, "completions/min_terminated_length": 2185.0, "entropy": 0.31911197304725647, "epoch": 0.9567307692307693, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0036414633505046368, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 800708638.0, "reward": 0.8975342512130737, "reward_std": 0.00986328162252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9995931386947632, "rewards/symbolic_reward_partial_score/std": 0.006633348762989044, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0737879276275635, "sampling/importance_sampling_ratio/min": 0.002881818450987339, "sampling/sampling_logp_difference/max": 5.849333763122559, "sampling/sampling_logp_difference/mean": 0.12712301313877106, "step": 597 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3197551965713501, "epoch": 0.9583333333333334, "grad_norm": 0.0037489698734134436, "learning_rate": 1e-06, "loss": -0.0105, "step": 598 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31525179743766785, "epoch": 0.9599358974358975, "grad_norm": 0.0037976326420903206, "learning_rate": 1e-06, "loss": -0.0107, "step": 599 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3183380663394928, "epoch": 0.9615384615384616, "grad_norm": 0.004050182178616524, "learning_rate": 1e-06, "loss": 0.0071, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13201.0, "completions/max_terminated_length": 13201.0, "completions/mean_length": 7741.779296875, "completions/mean_terminated_length": 7741.779296875, "completions/min_length": 2137.0, "completions/min_terminated_length": 2137.0, "entropy": 0.30652301013469696, "epoch": 0.9631410256410257, "frac_reward_zero_std": 0.90625, "grad_norm": 0.003980451263487339, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 805573213.0, "reward": 0.8963379263877869, "reward_std": 0.0146484375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071467638015747, "sampling/importance_sampling_ratio/min": 0.0035312180407345295, "sampling/sampling_logp_difference/max": 5.646112442016602, "sampling/sampling_logp_difference/mean": 0.12333828955888748, "step": 601 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.308976948261261, "epoch": 0.9647435897435898, "grad_norm": 0.004512297920882702, "learning_rate": 1e-06, "loss": -0.0144, "step": 602 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3075946718454361, "epoch": 0.9663461538461539, "grad_norm": 0.042057253420352936, "learning_rate": 1e-06, "loss": 0.0292, "step": 603 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3098761737346649, "epoch": 0.967948717948718, "grad_norm": 0.0032194158993661404, "learning_rate": 1e-06, "loss": -0.0138, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13150.0, "completions/max_terminated_length": 13150.0, "completions/mean_length": 7347.171875, "completions/mean_terminated_length": 7347.171875, "completions/min_length": 3343.0, "completions/min_terminated_length": 3343.0, "entropy": 0.31372472643852234, "epoch": 0.969551282051282, "frac_reward_zero_std": 0.90625, "grad_norm": 0.012704822234809399, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 810218597.0, "reward": 0.8932813405990601, "reward_std": 0.019242819398641586, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9971354007720947, "rewards/symbolic_reward_partial_score/std": 0.04540697857737541, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0729745626449585, "sampling/importance_sampling_ratio/min": 0.0011778591433539987, "sampling/sampling_logp_difference/max": 6.744056701660156, "sampling/sampling_logp_difference/mean": 0.12568244338035583, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31331661343574524, "epoch": 0.9711538461538461, "grad_norm": 0.023920981213450432, "learning_rate": 1e-06, "loss": -0.0027, "step": 606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.31440359354019165, "epoch": 0.9727564102564102, "grad_norm": 0.0026107721496373415, "learning_rate": 1e-06, "loss": 0.0064, "step": 607 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3150676339864731, "epoch": 0.9743589743589743, "grad_norm": 0.005230339244008064, "learning_rate": 1e-06, "loss": -0.0158, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16012.0, "completions/max_terminated_length": 16012.0, "completions/mean_length": 7207.955078125, "completions/mean_terminated_length": 7207.955078125, "completions/min_length": 1977.0, "completions/min_terminated_length": 1977.0, "entropy": 0.31749193370342255, "epoch": 0.9759615384615384, "frac_reward_zero_std": 0.96875, "grad_norm": 0.012931720353662968, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 814748078.0, "reward": 0.8936523795127869, "reward_std": 0.009723869152367115, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.016405608505010605, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0738425254821777, "sampling/importance_sampling_ratio/min": 0.0018883716547861695, "sampling/sampling_logp_difference/max": 6.272040367126465, "sampling/sampling_logp_difference/mean": 0.12718096375465393, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.32043755054473877, "epoch": 0.9775641025641025, "grad_norm": 0.004636459518224001, "learning_rate": 1e-06, "loss": 0.0136, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3188112676143646, "epoch": 0.9791666666666666, "grad_norm": 0.007376720663160086, "learning_rate": 1e-06, "loss": -0.0013, "step": 611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3232942074537277, "epoch": 0.9807692307692307, "grad_norm": 0.004888304974883795, "learning_rate": 1e-06, "loss": -0.0103, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13274.0, "completions/max_terminated_length": 13274.0, "completions/mean_length": 7558.6015625, "completions/mean_terminated_length": 7558.6015625, "completions/min_length": 3026.0, "completions/min_terminated_length": 3026.0, "entropy": 0.3174569755792618, "epoch": 0.9823717948717948, "frac_reward_zero_std": 0.9375, "grad_norm": 0.030527809634804726, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 819498626.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0731250047683716, "sampling/importance_sampling_ratio/min": 0.00021767415455542505, "sampling/sampling_logp_difference/max": 8.432511329650879, "sampling/sampling_logp_difference/mean": 0.12604385614395142, "step": 613 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31529515981674194, "epoch": 0.9839743589743589, "grad_norm": 0.003561709076166153, "learning_rate": 1e-06, "loss": -0.0082, "step": 614 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3131861388683319, "epoch": 0.9855769230769231, "grad_norm": 0.0026964659336954355, "learning_rate": 1e-06, "loss": 0.0063, "step": 615 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3176727741956711, "epoch": 0.9871794871794872, "grad_norm": 0.003797512035816908, "learning_rate": 1e-06, "loss": -0.0086, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12709.0, "completions/max_terminated_length": 12709.0, "completions/mean_length": 7531.46484375, "completions/mean_terminated_length": 7531.46484375, "completions/min_length": 2899.0, "completions/min_terminated_length": 2899.0, "entropy": 0.3113708794116974, "epoch": 0.9887820512820513, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004006918985396624, "learning_rate": 1e-06, "loss": -0.0119, "num_tokens": 824209328.0, "reward": 0.8957422375679016, "reward_std": 0.01703125238418579, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9975260496139526, "rewards/symbolic_reward_partial_score/std": 0.04499715566635132, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071731686592102, "sampling/importance_sampling_ratio/min": 0.0014121038839221, "sampling/sampling_logp_difference/max": 6.562674522399902, "sampling/sampling_logp_difference/mean": 0.12378332018852234, "step": 617 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3087853342294693, "epoch": 0.9903846153846154, "grad_norm": 0.03744850680232048, "learning_rate": 1e-06, "loss": 0.0223, "step": 618 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30861349403858185, "epoch": 0.9919871794871795, "grad_norm": 0.0037763495929539204, "learning_rate": 1e-06, "loss": 0.008, "step": 619 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3096979409456253, "epoch": 0.9935897435897436, "grad_norm": 0.0037383539602160454, "learning_rate": 1e-06, "loss": -0.0121, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14987.0, "completions/max_terminated_length": 14987.0, "completions/mean_length": 7305.70703125, "completions/mean_terminated_length": 7305.70703125, "completions/min_length": 2528.0, "completions/min_terminated_length": 2528.0, "entropy": 0.3111078590154648, "epoch": 0.9951923076923077, "frac_reward_zero_std": 0.90625, "grad_norm": 0.02684037759900093, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 828826298.0, "reward": 0.8938965201377869, "reward_std": 0.01822519674897194, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9991862177848816, "rewards/symbolic_reward_partial_score/std": 0.008202801458537579, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071767807006836, "sampling/importance_sampling_ratio/min": 0.0012590044643729925, "sampling/sampling_logp_difference/max": 6.677433967590332, "sampling/sampling_logp_difference/mean": 0.12439341098070145, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3138638287782669, "epoch": 0.9967948717948718, "grad_norm": 0.006550188641995192, "learning_rate": 1e-06, "loss": 0.0108, "step": 622 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3120322823524475, "epoch": 0.9983974358974359, "grad_norm": 0.005148868542164564, "learning_rate": 1e-06, "loss": -0.0076, "step": 623 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3123059719800949, "epoch": 1.0, "grad_norm": 0.02084791287779808, "learning_rate": 1e-06, "loss": -0.0061, "step": 624 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 13006.59375, "eval_completions/max_terminated_length": 13006.59375, "eval_completions/mean_length": 7671.5634765625, "eval_completions/mean_terminated_length": 7671.5634765625, "eval_completions/min_length": 3719.90625, "eval_completions/min_terminated_length": 3719.90625, "eval_entropy": 0.30491245817393064, "eval_frac_reward_zero_std": 0.8984375, "eval_loss": 6.410128116840497e-05, "eval_num_tokens": 828826298.0, "eval_reward": 0.8919745273888111, "eval_reward_std": 0.02051253244280815, "eval_rewards/progression_diversity/mean": 0.0, "eval_rewards/progression_diversity/std": 0.0, "eval_rewards/symbolic_reward_accuracy/mean": 0.988037109375, "eval_rewards/symbolic_reward_accuracy/std": 0.07526328600943089, "eval_rewards/symbolic_reward_partial_score/mean": 0.9971740767359734, "eval_rewards/symbolic_reward_partial_score/std": 0.022606244034250267, "eval_rewards/tag_count_reward/mean": 0.0, "eval_rewards/tag_count_reward/std": 0.0, "eval_runtime": 8025.6621, "eval_samples_per_second": 0.031, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0703570805490017, "eval_sampling/importance_sampling_ratio/min": 0.003686803707207531, "eval_sampling/sampling_logp_difference/max": 6.016047358512878, "eval_sampling/sampling_logp_difference/mean": 0.12205275008454919, "eval_steps_per_second": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16264.0, "completions/mean_length": 7572.095703125, "completions/mean_terminated_length": 7554.85107421875, "completions/min_length": 3083.0, "completions/min_terminated_length": 3083.0, "entropy": 0.3038008362054825, "epoch": 1.001602564102564, "frac_reward_zero_std": 0.84375, "grad_norm": 0.01717405952513218, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 833551563.0, "reward": 0.891308605670929, "reward_std": 0.03100288100540638, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.06366384774446487, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701267719268799, "sampling/importance_sampling_ratio/min": 4.460942818695912e-06, "sampling/sampling_logp_difference/max": 12.320150375366211, "sampling/sampling_logp_difference/mean": 0.12167361378669739, "step": 625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3047471046447754, "epoch": 1.0032051282051282, "grad_norm": 0.0346001572906971, "learning_rate": 1e-06, "loss": 0.0141, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3034920394420624, "epoch": 1.0048076923076923, "grad_norm": 0.006263586226850748, "learning_rate": 1e-06, "loss": -0.0261, "step": 627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3038324564695358, "epoch": 1.0064102564102564, "grad_norm": 0.005815045442432165, "learning_rate": 1e-06, "loss": 0.0183, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15214.0, "completions/max_terminated_length": 15214.0, "completions/mean_length": 7960.427734375, "completions/mean_terminated_length": 7960.427734375, "completions/min_length": 3501.0, "completions/min_terminated_length": 3501.0, "entropy": 0.30271387100219727, "epoch": 1.0080128205128205, "frac_reward_zero_std": 0.875, "grad_norm": 0.007918753661215305, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 838452422.0, "reward": 0.8900195956230164, "reward_std": 0.024737173691391945, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9979817867279053, "rewards/symbolic_reward_partial_score/std": 0.01871746964752674, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0692280530929565, "sampling/importance_sampling_ratio/min": 8.266865188488737e-05, "sampling/sampling_logp_difference/max": 9.400670051574707, "sampling/sampling_logp_difference/mean": 0.12023815512657166, "step": 629 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29866258800029755, "epoch": 1.0096153846153846, "grad_norm": 0.012961495667696, "learning_rate": 1e-06, "loss": -0.0035, "step": 630 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.30047234892845154, "epoch": 1.0112179487179487, "grad_norm": 0.025549575686454773, "learning_rate": 1e-06, "loss": 0.0027, "step": 631 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29905661940574646, "epoch": 1.0128205128205128, "grad_norm": 0.02982858195900917, "learning_rate": 1e-06, "loss": 0.0138, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14581.0, "completions/max_terminated_length": 14581.0, "completions/mean_length": 7970.181640625, "completions/mean_terminated_length": 7970.181640625, "completions/min_length": 4062.0, "completions/min_terminated_length": 4062.0, "entropy": 0.2968658357858658, "epoch": 1.0144230769230769, "frac_reward_zero_std": 0.875, "grad_norm": 0.005726849194616079, "learning_rate": 1e-06, "loss": -0.0211, "num_tokens": 843435523.0, "reward": 0.8917480707168579, "reward_std": 0.02855183184146881, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9920247793197632, "rewards/symbolic_reward_partial_score/std": 0.08819098025560379, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0682308673858643, "sampling/importance_sampling_ratio/min": 0.0020298007875680923, "sampling/sampling_logp_difference/max": 6.199817657470703, "sampling/sampling_logp_difference/mean": 0.11880014836788177, "step": 633 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2927432805299759, "epoch": 1.016025641025641, "grad_norm": 0.019428346306085587, "learning_rate": 1e-06, "loss": 0.012, "step": 634 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2961471676826477, "epoch": 1.017628205128205, "grad_norm": 0.004945802036672831, "learning_rate": 1e-06, "loss": -0.0203, "step": 635 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29704806208610535, "epoch": 1.0192307692307692, "grad_norm": 0.03395095467567444, "learning_rate": 1e-06, "loss": 0.0348, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13490.0, "completions/max_terminated_length": 13490.0, "completions/mean_length": 7647.15625, "completions/mean_terminated_length": 7647.15625, "completions/min_length": 2218.0, "completions/min_terminated_length": 2218.0, "entropy": 0.29956918954849243, "epoch": 1.0208333333333333, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002243980998173356, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 848224323.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0689256191253662, "sampling/importance_sampling_ratio/min": 0.001507283071987331, "sampling/sampling_logp_difference/max": 6.497446537017822, "sampling/sampling_logp_difference/mean": 0.11993230879306793, "step": 637 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29689641296863556, "epoch": 1.0224358974358974, "grad_norm": 0.0027434667572379112, "learning_rate": 1e-06, "loss": -0.0045, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2978483736515045, "epoch": 1.0240384615384615, "grad_norm": 0.0023496190551668406, "learning_rate": 1e-06, "loss": -0.0039, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30301567912101746, "epoch": 1.0256410256410255, "grad_norm": 0.002188858576118946, "learning_rate": 1e-06, "loss": -0.0033, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14745.0, "completions/max_terminated_length": 14745.0, "completions/mean_length": 7850.69921875, "completions/mean_terminated_length": 7850.69921875, "completions/min_length": 2627.0, "completions/min_terminated_length": 2627.0, "entropy": 0.3039587587118149, "epoch": 1.0272435897435896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 853077513.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0695734024047852, "sampling/importance_sampling_ratio/min": 0.0015758582158014178, "sampling/sampling_logp_difference/max": 6.45295524597168, "sampling/sampling_logp_difference/mean": 0.12085698544979095, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3003881871700287, "epoch": 1.0288461538461537, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30047284066677094, "epoch": 1.0304487179487178, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29807648062705994, "epoch": 1.032051282051282, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15055.0, "completions/max_terminated_length": 15055.0, "completions/mean_length": 8018.087890625, "completions/mean_terminated_length": 8018.087890625, "completions/min_length": 2745.0, "completions/min_terminated_length": 2745.0, "entropy": 0.29161442816257477, "epoch": 1.0336538461538463, "frac_reward_zero_std": 0.875, "grad_norm": 0.01743435487151146, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 858024662.0, "reward": 0.8869141340255737, "reward_std": 0.030482003465294838, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98046875, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.9954427480697632, "rewards/symbolic_reward_partial_score/std": 0.05087684467434883, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067512035369873, "sampling/importance_sampling_ratio/min": 0.000799354340415448, "sampling/sampling_logp_difference/max": 7.131706237792969, "sampling/sampling_logp_difference/mean": 0.11742247641086578, "step": 645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.29346978664398193, "epoch": 1.0352564102564104, "grad_norm": 0.006802158895879984, "learning_rate": 1e-06, "loss": -0.0234, "step": 646 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29226499795913696, "epoch": 1.0368589743589745, "grad_norm": 0.008012805134057999, "learning_rate": 1e-06, "loss": -0.0024, "step": 647 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.29208989441394806, "epoch": 1.0384615384615385, "grad_norm": 0.021944493055343628, "learning_rate": 1e-06, "loss": 0.0179, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13799.0, "completions/mean_length": 8456.9765625, "completions/mean_terminated_length": 8410.255859375, "completions/min_length": 3337.0, "completions/min_terminated_length": 3337.0, "entropy": 0.2881511002779007, "epoch": 1.0400641025641026, "frac_reward_zero_std": 0.84375, "grad_norm": 0.036767441779375076, "learning_rate": 1e-06, "loss": 0.0646, "num_tokens": 863227770.0, "reward": 0.8824414014816284, "reward_std": 0.044292449951171875, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9863932132720947, "rewards/symbolic_reward_partial_score/std": 0.11026550084352493, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0669612884521484, "sampling/importance_sampling_ratio/min": 0.0011750105768442154, "sampling/sampling_logp_difference/max": 6.746478080749512, "sampling/sampling_logp_difference/mean": 0.11578300595283508, "step": 649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2874348312616348, "epoch": 1.0416666666666667, "grad_norm": 0.008486775681376457, "learning_rate": 1e-06, "loss": -0.0081, "step": 650 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2900143712759018, "epoch": 1.0432692307692308, "grad_norm": 0.01539598498493433, "learning_rate": 1e-06, "loss": -0.024, "step": 651 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28809335827827454, "epoch": 1.044871794871795, "grad_norm": 0.014531200751662254, "learning_rate": 1e-06, "loss": -0.0172, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13522.0, "completions/max_terminated_length": 13522.0, "completions/mean_length": 8069.59375, "completions/mean_terminated_length": 8069.59375, "completions/min_length": 4101.0, "completions/min_terminated_length": 4101.0, "entropy": 0.28572797775268555, "epoch": 1.046474358974359, "frac_reward_zero_std": 0.84375, "grad_norm": 0.02514318935573101, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 868277498.0, "reward": 0.8922950029373169, "reward_std": 0.027725880965590477, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.997753918170929, "rewards/symbolic_reward_partial_score/std": 0.026105698198080063, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0669407844543457, "sampling/importance_sampling_ratio/min": 0.00189927127212286, "sampling/sampling_logp_difference/max": 6.266284942626953, "sampling/sampling_logp_difference/mean": 0.11612759530544281, "step": 653 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28615590929985046, "epoch": 1.0480769230769231, "grad_norm": 0.030614567920565605, "learning_rate": 1e-06, "loss": 0.0051, "step": 654 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2873466908931732, "epoch": 1.0496794871794872, "grad_norm": 0.024968182668089867, "learning_rate": 1e-06, "loss": -0.0074, "step": 655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.28713513910770416, "epoch": 1.0512820512820513, "grad_norm": 0.033524587750434875, "learning_rate": 1e-06, "loss": 0.0105, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12987.0, "completions/max_terminated_length": 12987.0, "completions/mean_length": 7668.623046875, "completions/mean_terminated_length": 7668.623046875, "completions/min_length": 2465.0, "completions/min_terminated_length": 2465.0, "entropy": 0.29844947159290314, "epoch": 1.0528846153846154, "frac_reward_zero_std": 0.90625, "grad_norm": 0.034264422953128815, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 873047097.0, "reward": 0.8963037729263306, "reward_std": 0.014785156585276127, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9993977546691895, "rewards/symbolic_reward_partial_score/std": 0.007960735820233822, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0682570934295654, "sampling/importance_sampling_ratio/min": 0.004175328183919191, "sampling/sampling_logp_difference/max": 5.478562355041504, "sampling/sampling_logp_difference/mean": 0.11900214105844498, "step": 657 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29740945994853973, "epoch": 1.0544871794871795, "grad_norm": 0.004157658200711012, "learning_rate": 1e-06, "loss": -0.0124, "step": 658 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29370294511318207, "epoch": 1.0560897435897436, "grad_norm": 0.004620330408215523, "learning_rate": 1e-06, "loss": -0.0129, "step": 659 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2955019474029541, "epoch": 1.0576923076923077, "grad_norm": 0.0033308968413621187, "learning_rate": 1e-06, "loss": 0.0019, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14623.0, "completions/max_terminated_length": 14623.0, "completions/mean_length": 7852.88671875, "completions/mean_terminated_length": 7852.88671875, "completions/min_length": 3324.0, "completions/min_terminated_length": 3324.0, "entropy": 0.2860754281282425, "epoch": 1.0592948717948718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 877923455.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066422700881958, "sampling/importance_sampling_ratio/min": 0.001346131437458098, "sampling/sampling_logp_difference/max": 6.610520362854004, "sampling/sampling_logp_difference/mean": 0.11576870828866959, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2884225696325302, "epoch": 1.060897435897436, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28944897651672363, "epoch": 1.0625, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28575797379016876, "epoch": 1.064102564102564, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12896.0, "completions/max_terminated_length": 12896.0, "completions/mean_length": 7736.224609375, "completions/mean_terminated_length": 7736.224609375, "completions/min_length": 2776.0, "completions/min_terminated_length": 2776.0, "entropy": 0.2841587960720062, "epoch": 1.0657051282051282, "frac_reward_zero_std": 0.75, "grad_norm": 0.014903567731380463, "learning_rate": 1e-06, "loss": -0.0349, "num_tokens": 882813394.0, "reward": 0.8854101896286011, "reward_std": 0.04905132204294205, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.994335949420929, "rewards/symbolic_reward_partial_score/std": 0.06388040632009506, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0652308464050293, "sampling/importance_sampling_ratio/min": 0.0015026178443804383, "sampling/sampling_logp_difference/max": 6.500546455383301, "sampling/sampling_logp_difference/mean": 0.11406149715185165, "step": 665 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.28192271292209625, "epoch": 1.0673076923076923, "grad_norm": 0.03575403615832329, "learning_rate": 1e-06, "loss": 0.0236, "step": 666 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.28293684124946594, "epoch": 1.0689102564102564, "grad_norm": 0.03577136993408203, "learning_rate": 1e-06, "loss": 0.0142, "step": 667 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.282418817281723, "epoch": 1.0705128205128205, "grad_norm": 0.01990150660276413, "learning_rate": 1e-06, "loss": -0.0136, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13928.0, "completions/max_terminated_length": 13928.0, "completions/mean_length": 7708.44140625, "completions/mean_terminated_length": 7708.44140625, "completions/min_length": 2868.0, "completions/min_terminated_length": 2868.0, "entropy": 0.28265492618083954, "epoch": 1.0721153846153846, "frac_reward_zero_std": 0.84375, "grad_norm": 0.030519738793373108, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 887610468.0, "reward": 0.8933585286140442, "reward_std": 0.02347153052687645, "rewards/progression_diversity/mean": -8.656460704514757e-05, "rewards/progression_diversity/std": 0.0019587334245443344, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, "rewards/symbolic_reward_partial_score/std": 0.04477177560329437, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0654292106628418, "sampling/importance_sampling_ratio/min": 0.003087102435529232, "sampling/sampling_logp_difference/max": 5.780522346496582, "sampling/sampling_logp_difference/mean": 0.11411414295434952, "step": 669 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2829418480396271, "epoch": 1.0737179487179487, "grad_norm": 0.03352310508489609, "learning_rate": 1e-06, "loss": 0.0097, "step": 670 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.281900018453598, "epoch": 1.0753205128205128, "grad_norm": 0.005207178648561239, "learning_rate": 1e-06, "loss": 0.0079, "step": 671 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2834167182445526, "epoch": 1.0769230769230769, "grad_norm": 0.006832549348473549, "learning_rate": 1e-06, "loss": -0.0133, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13941.0, "completions/max_terminated_length": 13941.0, "completions/mean_length": 8134.1484375, "completions/mean_terminated_length": 8134.1484375, "completions/min_length": 4374.0, "completions/min_terminated_length": 4374.0, "entropy": 0.2623938024044037, "epoch": 1.078525641025641, "frac_reward_zero_std": 0.875, "grad_norm": 0.017798224464058876, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 892704352.0, "reward": 0.8955559730529785, "reward_std": 0.01373315043747425, "rewards/progression_diversity/mean": -7.182326226029545e-05, "rewards/progression_diversity/std": 0.0012371126795187593, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, "rewards/symbolic_reward_partial_score/std": 0.04949941113591194, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0605111122131348, "sampling/importance_sampling_ratio/min": 0.0017345065716654062, "sampling/sampling_logp_difference/max": 6.357032299041748, "sampling/sampling_logp_difference/mean": 0.10625582188367844, "step": 673 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2608880549669266, "epoch": 1.080128205128205, "grad_norm": 0.003827364183962345, "learning_rate": 1e-06, "loss": -0.0129, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.25945594906806946, "epoch": 1.0817307692307692, "grad_norm": 0.004291590768843889, "learning_rate": 1e-06, "loss": 0.0058, "step": 675 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2584989666938782, "epoch": 1.0833333333333333, "grad_norm": 0.029342349618673325, "learning_rate": 1e-06, "loss": 0.0149, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15488.0, "completions/mean_length": 7833.21484375, "completions/mean_terminated_length": 7816.4814453125, "completions/min_length": 3418.0, "completions/min_terminated_length": 3418.0, "entropy": 0.27159421145915985, "epoch": 1.0849358974358974, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02988434210419655, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 897556846.0, "reward": 0.8891208171844482, "reward_std": 0.027503937482833862, "rewards/progression_diversity/mean": -2.9832091968273744e-05, "rewards/progression_diversity/std": 0.0006708584260195494, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9956380128860474, "rewards/symbolic_reward_partial_score/std": 0.051393549889326096, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062467336654663, "sampling/importance_sampling_ratio/min": 0.0017893225885927677, "sampling/sampling_logp_difference/max": 6.325918197631836, "sampling/sampling_logp_difference/mean": 0.10957305133342743, "step": 677 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.26841987669467926, "epoch": 1.0865384615384615, "grad_norm": 0.012452783063054085, "learning_rate": 1e-06, "loss": -0.0202, "step": 678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.27052880823612213, "epoch": 1.0881410256410255, "grad_norm": 0.017503680661320686, "learning_rate": 1e-06, "loss": -0.0126, "step": 679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2656054198741913, "epoch": 1.0897435897435896, "grad_norm": 0.025547392666339874, "learning_rate": 1e-06, "loss": 0.0068, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15721.0, "completions/mean_length": 7773.837890625, "completions/mean_terminated_length": 7740.0732421875, "completions/min_length": 2609.0, "completions/min_terminated_length": 2609.0, "entropy": 0.2652207016944885, "epoch": 1.0913461538461537, "frac_reward_zero_std": 0.8125, "grad_norm": 0.007663046941161156, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 902448811.0, "reward": 0.8927390575408936, "reward_std": 0.02515510842204094, "rewards/progression_diversity/mean": -2.1295189071679488e-05, "rewards/progression_diversity/std": 0.00048185509513132274, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9972819089889526, "rewards/symbolic_reward_partial_score/std": 0.04502078890800476, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0612239837646484, "sampling/importance_sampling_ratio/min": 0.002235043328255415, "sampling/sampling_logp_difference/max": 6.103494644165039, "sampling/sampling_logp_difference/mean": 0.10745704174041748, "step": 681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2610100209712982, "epoch": 1.092948717948718, "grad_norm": 0.038671281188726425, "learning_rate": 1e-06, "loss": 0.0154, "step": 682 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.26077619194984436, "epoch": 1.094551282051282, "grad_norm": 0.01831183023750782, "learning_rate": 1e-06, "loss": 0.0095, "step": 683 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.26326900720596313, "epoch": 1.0961538461538463, "grad_norm": 0.005235353950411081, "learning_rate": 1e-06, "loss": -0.0075, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13355.0, "completions/max_terminated_length": 13355.0, "completions/mean_length": 7589.1484375, "completions/mean_terminated_length": 7589.1484375, "completions/min_length": 2987.0, "completions/min_terminated_length": 2987.0, "entropy": 0.26450496912002563, "epoch": 1.0977564102564104, "frac_reward_zero_std": 0.75, "grad_norm": 0.03519367799162865, "learning_rate": 1e-06, "loss": 0.043, "num_tokens": 907172343.0, "reward": 0.8846186399459839, "reward_std": 0.046957530081272125, "rewards/progression_diversity/mean": -5.9372316172812134e-05, "rewards/progression_diversity/std": 0.0009608343243598938, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.99169921875, "rewards/symbolic_reward_partial_score/std": 0.07279406487941742, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0615901947021484, "sampling/importance_sampling_ratio/min": 0.0027771987952291965, "sampling/sampling_logp_difference/max": 5.886312484741211, "sampling/sampling_logp_difference/mean": 0.1085769385099411, "step": 685 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2648194283246994, "epoch": 1.0993589743589745, "grad_norm": 0.017659511417150497, "learning_rate": 1e-06, "loss": 0.0099, "step": 686 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.26925028860569, "epoch": 1.1009615384615385, "grad_norm": 0.006832172628492117, "learning_rate": 1e-06, "loss": -0.0204, "step": 687 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.25970420241355896, "epoch": 1.1025641025641026, "grad_norm": 0.008957959711551666, "learning_rate": 1e-06, "loss": -0.009, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13287.0, "completions/mean_length": 7613.986328125, "completions/mean_terminated_length": 7579.5947265625, "completions/min_length": 2514.0, "completions/min_terminated_length": 2514.0, "entropy": 0.25761424005031586, "epoch": 1.1041666666666667, "frac_reward_zero_std": 0.78125, "grad_norm": 0.02380330301821232, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 911937200.0, "reward": 0.8876208066940308, "reward_std": 0.04614756256341934, "rewards/progression_diversity/mean": -0.0001286342740058899, "rewards/progression_diversity/std": 0.0029106612782925367, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9912923574447632, "rewards/symbolic_reward_partial_score/std": 0.08297867327928543, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609831809997559, "sampling/importance_sampling_ratio/min": 0.0015549632953479886, "sampling/sampling_logp_difference/max": 6.46630334854126, "sampling/sampling_logp_difference/mean": 0.10692013055086136, "step": 689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2630904018878937, "epoch": 1.1057692307692308, "grad_norm": 0.015310103073716164, "learning_rate": 1e-06, "loss": 0.0122, "step": 690 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.26416918635368347, "epoch": 1.107371794871795, "grad_norm": 0.01843937300145626, "learning_rate": 1e-06, "loss": -0.0016, "step": 691 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2623608559370041, "epoch": 1.108974358974359, "grad_norm": 0.006605224683880806, "learning_rate": 1e-06, "loss": -0.0174, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14738.0, "completions/max_terminated_length": 14738.0, "completions/mean_length": 7770.841796875, "completions/mean_terminated_length": 7770.841796875, "completions/min_length": 2381.0, "completions/min_terminated_length": 2381.0, "entropy": 0.2696673274040222, "epoch": 1.1105769230769231, "frac_reward_zero_std": 0.90625, "grad_norm": 0.003520580241456628, "learning_rate": 1e-06, "loss": -0.0114, "num_tokens": 916765599.0, "reward": 0.89599609375, "reward_std": 0.01601562649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9983723759651184, "rewards/symbolic_reward_partial_score/std": 0.024398809298872948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0617854595184326, "sampling/importance_sampling_ratio/min": 0.0025482482742518187, "sampling/sampling_logp_difference/max": 5.972349166870117, "sampling/sampling_logp_difference/mean": 0.10866742581129074, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.26511339843273163, "epoch": 1.1121794871794872, "grad_norm": 0.020077740773558617, "learning_rate": 1e-06, "loss": 0.0033, "step": 694 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2642548382282257, "epoch": 1.1137820512820513, "grad_norm": 0.0034260887186974287, "learning_rate": 1e-06, "loss": 0.0199, "step": 695 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.26379284262657166, "epoch": 1.1153846153846154, "grad_norm": 0.0034673307090997696, "learning_rate": 1e-06, "loss": -0.0128, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14377.0, "completions/max_terminated_length": 14377.0, "completions/mean_length": 7378.373046875, "completions/mean_terminated_length": 7378.373046875, "completions/min_length": 2058.0, "completions/min_terminated_length": 2058.0, "entropy": 0.2753664702177048, "epoch": 1.1169871794871795, "frac_reward_zero_std": 0.84375, "grad_norm": 0.02176603674888611, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 921404366.0, "reward": 0.8903019428253174, "reward_std": 0.022549103945493698, "rewards/progression_diversity/mean": -8.592897211201489e-05, "rewards/progression_diversity/std": 0.001514117349870503, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9950195550918579, "rewards/symbolic_reward_partial_score/std": 0.06346382945775986, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0625696182250977, "sampling/importance_sampling_ratio/min": 0.0005904682911932468, "sampling/sampling_logp_difference/max": 7.434594631195068, "sampling/sampling_logp_difference/mean": 0.11063844710588455, "step": 697 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2678314447402954, "epoch": 1.1185897435897436, "grad_norm": 0.007807692512869835, "learning_rate": 1e-06, "loss": -0.0285, "step": 698 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2715350091457367, "epoch": 1.1201923076923077, "grad_norm": 0.012418070808053017, "learning_rate": 1e-06, "loss": 0.0153, "step": 699 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2664782702922821, "epoch": 1.1217948717948718, "grad_norm": 0.023162173107266426, "learning_rate": 1e-06, "loss": 0.0091, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14574.0, "completions/mean_length": 7569.21484375, "completions/mean_terminated_length": 7534.6474609375, "completions/min_length": 1983.0, "completions/min_terminated_length": 1983.0, "entropy": 0.2645212560892105, "epoch": 1.123397435897436, "frac_reward_zero_std": 0.8125, "grad_norm": 0.02870558202266693, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 926171516.0, "reward": 0.8899610042572021, "reward_std": 0.0324556939303875, "rewards/progression_diversity/mean": -5.122274160385132e-07, "rewards/progression_diversity/std": 1.1590383110160474e-05, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9945312738418579, "rewards/symbolic_reward_partial_score/std": 0.06467118859291077, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0612825155258179, "sampling/importance_sampling_ratio/min": 0.0015629983972758055, "sampling/sampling_logp_difference/max": 6.461149215698242, "sampling/sampling_logp_difference/mean": 0.10810823738574982, "step": 701 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2682636231184006, "epoch": 1.125, "grad_norm": 0.005539908539503813, "learning_rate": 1e-06, "loss": 0.0636, "step": 702 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2629767805337906, "epoch": 1.126602564102564, "grad_norm": 0.0060573117807507515, "learning_rate": 1e-06, "loss": -0.0252, "step": 703 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2614084780216217, "epoch": 1.1282051282051282, "grad_norm": 0.005166211631149054, "learning_rate": 1e-06, "loss": -0.0221, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14494.0, "completions/mean_length": 7480.783203125, "completions/mean_terminated_length": 7463.35986328125, "completions/min_length": 2870.0, "completions/min_terminated_length": 2870.0, "entropy": 0.2685774117708206, "epoch": 1.1298076923076923, "frac_reward_zero_std": 0.71875, "grad_norm": 0.008230074308812618, "learning_rate": 1e-06, "loss": -0.0097, "num_tokens": 930796077.0, "reward": 0.8889119625091553, "reward_std": 0.04125416278839111, "rewards/progression_diversity/mean": -0.00040413427632302046, "rewards/progression_diversity/std": 0.004968197550624609, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9949544668197632, "rewards/symbolic_reward_partial_score/std": 0.06327638775110245, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0628039836883545, "sampling/importance_sampling_ratio/min": 0.0042944494634866714, "sampling/sampling_logp_difference/max": 5.450431823730469, "sampling/sampling_logp_difference/mean": 0.11043090373277664, "step": 705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.27061736583709717, "epoch": 1.1314102564102564, "grad_norm": 0.006081659346818924, "learning_rate": 1e-06, "loss": 0.0214, "step": 706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.27226053178310394, "epoch": 1.1330128205128205, "grad_norm": 0.019175931811332703, "learning_rate": 1e-06, "loss": 0.021, "step": 707 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.27494950592517853, "epoch": 1.1346153846153846, "grad_norm": 0.019779911264777184, "learning_rate": 1e-06, "loss": -0.0026, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14330.0, "completions/mean_length": 7486.875, "completions/mean_terminated_length": 7451.98486328125, "completions/min_length": 1773.0, "completions/min_terminated_length": 1773.0, "entropy": 0.2697181850671768, "epoch": 1.1362179487179487, "frac_reward_zero_std": 0.84375, "grad_norm": 0.00622901227325201, "learning_rate": 1e-06, "loss": -0.0269, "num_tokens": 935515357.0, "reward": 0.8910156488418579, "reward_std": 0.03098640777170658, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9947916865348816, "rewards/symbolic_reward_partial_score/std": 0.06532405316829681, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0624651908874512, "sampling/importance_sampling_ratio/min": 0.0015064862091094255, "sampling/sampling_logp_difference/max": 6.4979753494262695, "sampling/sampling_logp_difference/mean": 0.10939180850982666, "step": 709 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2662170082330704, "epoch": 1.1378205128205128, "grad_norm": 0.03273667022585869, "learning_rate": 1e-06, "loss": 0.0403, "step": 710 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2710031718015671, "epoch": 1.1394230769230769, "grad_norm": 0.006321210414171219, "learning_rate": 1e-06, "loss": -0.0072, "step": 711 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2719338834285736, "epoch": 1.141025641025641, "grad_norm": 0.02884542942047119, "learning_rate": 1e-06, "loss": 0.0176, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12891.0, "completions/max_terminated_length": 12891.0, "completions/mean_length": 6856.3828125, "completions/mean_terminated_length": 6856.3828125, "completions/min_length": 2791.0, "completions/min_terminated_length": 2791.0, "entropy": 0.2899167388677597, "epoch": 1.142628205128205, "frac_reward_zero_std": 0.84375, "grad_norm": 0.005113643128424883, "learning_rate": 1e-06, "loss": -0.0193, "num_tokens": 939780225.0, "reward": 0.8940412998199463, "reward_std": 0.020740434527397156, "rewards/progression_diversity/mean": -0.0001685535826254636, "rewards/progression_diversity/std": 0.00280111120082438, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066714882850647, "sampling/importance_sampling_ratio/min": 0.0023814705200493336, "sampling/sampling_logp_difference/max": 6.040037155151367, "sampling/sampling_logp_difference/mean": 0.11683158576488495, "step": 713 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2890501618385315, "epoch": 1.1442307692307692, "grad_norm": 0.025946229696273804, "learning_rate": 1e-06, "loss": 0.0229, "step": 714 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.28599461913108826, "epoch": 1.1458333333333333, "grad_norm": 0.009792243130505085, "learning_rate": 1e-06, "loss": 0.0141, "step": 715 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28565630316734314, "epoch": 1.1474358974358974, "grad_norm": 0.004287914838641882, "learning_rate": 1e-06, "loss": -0.0057, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14698.0, "completions/mean_length": 6872.3984375, "completions/mean_terminated_length": 6853.78466796875, "completions/min_length": 1888.0, "completions/min_terminated_length": 1888.0, "entropy": 0.28366775810718536, "epoch": 1.1490384615384615, "frac_reward_zero_std": 0.8125, "grad_norm": 0.016366643831133842, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 944098141.0, "reward": 0.8756428956985474, "reward_std": 0.04919969290494919, "rewards/progression_diversity/mean": -0.0006512624095194042, "rewards/progression_diversity/std": 0.014736386016011238, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9904459714889526, "rewards/symbolic_reward_partial_score/std": 0.07924753427505493, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0652272701263428, "sampling/importance_sampling_ratio/min": 0.0023581604473292828, "sampling/sampling_logp_difference/max": 6.049873352050781, "sampling/sampling_logp_difference/mean": 0.11434115469455719, "step": 717 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2845660150051117, "epoch": 1.1506410256410255, "grad_norm": 0.021158399060368538, "learning_rate": 1e-06, "loss": -0.0064, "step": 718 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28018422424793243, "epoch": 1.1522435897435896, "grad_norm": 0.015796631574630737, "learning_rate": 1e-06, "loss": 0.0102, "step": 719 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2849064916372299, "epoch": 1.1538461538461537, "grad_norm": 0.012021268717944622, "learning_rate": 1e-06, "loss": -0.0081, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16245.0, "completions/mean_length": 8046.552734375, "completions/mean_terminated_length": 8013.857421875, "completions/min_length": 3661.0, "completions/min_terminated_length": 3661.0, "entropy": 0.25592416524887085, "epoch": 1.155448717948718, "frac_reward_zero_std": 0.6875, "grad_norm": 0.010126116685569286, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 949275272.0, "reward": 0.8764989376068115, "reward_std": 0.06346848607063293, "rewards/progression_diversity/mean": -1.028479528031312e-05, "rewards/progression_diversity/std": 0.00023271834652405232, "rewards/symbolic_reward_accuracy/mean": 0.966796875, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.9893717169761658, "rewards/symbolic_reward_partial_score/std": 0.08977557718753815, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0593876838684082, "sampling/importance_sampling_ratio/min": 0.00013644673163071275, "sampling/sampling_logp_difference/max": 8.899576187133789, "sampling/sampling_logp_difference/mean": 0.10469293594360352, "step": 721 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2578028440475464, "epoch": 1.157051282051282, "grad_norm": 0.012335097417235374, "learning_rate": 1e-06, "loss": -0.0021, "step": 722 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2576797902584076, "epoch": 1.1586538461538463, "grad_norm": 0.02756766602396965, "learning_rate": 1e-06, "loss": 0.0038, "step": 723 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.25286468863487244, "epoch": 1.1602564102564104, "grad_norm": 0.02716866135597229, "learning_rate": 1e-06, "loss": 0.0209, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13557.0, "completions/mean_length": 7729.5390625, "completions/mean_terminated_length": 7678.53076171875, "completions/min_length": 3962.0, "completions/min_terminated_length": 3962.0, "entropy": 0.2743615508079529, "epoch": 1.1618589743589745, "frac_reward_zero_std": 0.78125, "grad_norm": 0.012762167491018772, "learning_rate": 1e-06, "loss": -0.0301, "num_tokens": 954120252.0, "reward": 0.8848584890365601, "reward_std": 0.041579727083444595, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9944498538970947, "rewards/symbolic_reward_partial_score/std": 0.06356307864189148, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.063340663909912, "sampling/importance_sampling_ratio/min": 0.0027780013624578714, "sampling/sampling_logp_difference/max": 5.88602352142334, "sampling/sampling_logp_difference/mean": 0.11120878905057907, "step": 725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2750166952610016, "epoch": 1.1634615384615385, "grad_norm": 0.02495425008237362, "learning_rate": 1e-06, "loss": 0.0247, "step": 726 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2725471556186676, "epoch": 1.1650641025641026, "grad_norm": 0.0408598817884922, "learning_rate": 1e-06, "loss": 0.0145, "step": 727 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.27416326105594635, "epoch": 1.1666666666666667, "grad_norm": 0.009657352231442928, "learning_rate": 1e-06, "loss": 0.0287, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14030.0, "completions/mean_length": 7417.345703125, "completions/mean_terminated_length": 7328.91748046875, "completions/min_length": 3062.0, "completions/min_terminated_length": 3062.0, "entropy": 0.2749616950750351, "epoch": 1.1682692307692308, "frac_reward_zero_std": 0.875, "grad_norm": 0.023918617516756058, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 958729549.0, "reward": 0.890673816204071, "reward_std": 0.02003018744289875, "rewards/progression_diversity/mean": -1.871625840976776e-06, "rewards/progression_diversity/std": 4.235006053932011e-05, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.99169921875, "rewards/symbolic_reward_partial_score/std": 0.08877533674240112, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0633912086486816, "sampling/importance_sampling_ratio/min": 0.002192102139815688, "sampling/sampling_logp_difference/max": 6.122894287109375, "sampling/sampling_logp_difference/mean": 0.11154164373874664, "step": 729 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27727919816970825, "epoch": 1.169871794871795, "grad_norm": 0.01305021345615387, "learning_rate": 1e-06, "loss": -0.0027, "step": 730 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.27219584584236145, "epoch": 1.171474358974359, "grad_norm": 0.008659505285322666, "learning_rate": 1e-06, "loss": 0.0143, "step": 731 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2767972946166992, "epoch": 1.1730769230769231, "grad_norm": 0.0262775719165802, "learning_rate": 1e-06, "loss": 0.0295, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15356.0, "completions/mean_length": 7562.185546875, "completions/mean_terminated_length": 7544.921875, "completions/min_length": 1964.0, "completions/min_terminated_length": 1964.0, "entropy": 0.2714828997850418, "epoch": 1.1746794871794872, "frac_reward_zero_std": 0.71875, "grad_norm": 0.026525314897298813, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 963491644.0, "reward": 0.8770654201507568, "reward_std": 0.06552450358867645, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.96875, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.9867024421691895, "rewards/symbolic_reward_partial_score/std": 0.09885963052511215, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0629959106445312, "sampling/importance_sampling_ratio/min": 8.583244925830513e-05, "sampling/sampling_logp_difference/max": 9.363113403320312, "sampling/sampling_logp_difference/mean": 0.11046160757541656, "step": 733 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2706471383571625, "epoch": 1.1762820512820513, "grad_norm": 0.022599341347813606, "learning_rate": 1e-06, "loss": -0.0197, "step": 734 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2723312973976135, "epoch": 1.1778846153846154, "grad_norm": 0.026649735867977142, "learning_rate": 1e-06, "loss": 0.0485, "step": 735 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.27444125711917877, "epoch": 1.1794871794871795, "grad_norm": 0.01070537231862545, "learning_rate": 1e-06, "loss": -0.0264, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13870.0, "completions/max_terminated_length": 13870.0, "completions/mean_length": 7185.0625, "completions/mean_terminated_length": 7185.0625, "completions/min_length": 2770.0, "completions/min_terminated_length": 2770.0, "entropy": 0.2854379415512085, "epoch": 1.1810897435897436, "frac_reward_zero_std": 0.875, "grad_norm": 0.0061562443152070045, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 968018860.0, "reward": 0.8901855945587158, "reward_std": 0.02915896289050579, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.99462890625, "rewards/symbolic_reward_partial_score/std": 0.06525672227144241, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065285563468933, "sampling/importance_sampling_ratio/min": 0.0010467558167874813, "sampling/sampling_logp_difference/max": 6.862059593200684, "sampling/sampling_logp_difference/mean": 0.11443185061216354, "step": 737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28180165588855743, "epoch": 1.1826923076923077, "grad_norm": 0.007420375477522612, "learning_rate": 1e-06, "loss": -0.0116, "step": 738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28390932083129883, "epoch": 1.1842948717948718, "grad_norm": 0.005746510811150074, "learning_rate": 1e-06, "loss": -0.0073, "step": 739 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2805173099040985, "epoch": 1.185897435897436, "grad_norm": 0.010871785692870617, "learning_rate": 1e-06, "loss": -0.0052, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14873.0, "completions/max_terminated_length": 14873.0, "completions/mean_length": 7421.39453125, "completions/mean_terminated_length": 7421.39453125, "completions/min_length": 2053.0, "completions/min_terminated_length": 2053.0, "entropy": 0.2859332859516144, "epoch": 1.1875, "frac_reward_zero_std": 0.875, "grad_norm": 0.015115713700652122, "learning_rate": 1e-06, "loss": -0.0135, "num_tokens": 972645830.0, "reward": 0.8908447623252869, "reward_std": 0.029325148090720177, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9935709834098816, "rewards/symbolic_reward_partial_score/std": 0.07672799378633499, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065746784210205, "sampling/importance_sampling_ratio/min": 0.0030088303610682487, "sampling/sampling_logp_difference/max": 5.806203842163086, "sampling/sampling_logp_difference/mean": 0.11504846811294556, "step": 741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2846004366874695, "epoch": 1.189102564102564, "grad_norm": 0.02620539255440235, "learning_rate": 1e-06, "loss": 0.0142, "step": 742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28722913563251495, "epoch": 1.1907051282051282, "grad_norm": 0.021206529811024666, "learning_rate": 1e-06, "loss": -0.0107, "step": 743 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28755249083042145, "epoch": 1.1923076923076923, "grad_norm": 0.014355083927512169, "learning_rate": 1e-06, "loss": 0.006, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15088.0, "completions/max_terminated_length": 15088.0, "completions/mean_length": 7411.068359375, "completions/mean_terminated_length": 7411.068359375, "completions/min_length": 2300.0, "completions/min_terminated_length": 2300.0, "entropy": 0.287930428981781, "epoch": 1.1939102564102564, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0049547115340828896, "learning_rate": 1e-06, "loss": -0.0152, "num_tokens": 977202697.0, "reward": 0.8944531679153442, "reward_std": 0.018543953076004982, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9971354007720947, "rewards/symbolic_reward_partial_score/std": 0.04583593085408211, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0666286945343018, "sampling/importance_sampling_ratio/min": 0.0015383908757939935, "sampling/sampling_logp_difference/max": 6.477018356323242, "sampling/sampling_logp_difference/mean": 0.11660847067832947, "step": 745 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29048343002796173, "epoch": 1.1955128205128205, "grad_norm": 0.02166350558400154, "learning_rate": 1e-06, "loss": 0.0241, "step": 746 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2873390465974808, "epoch": 1.1971153846153846, "grad_norm": 0.0050978874787688255, "learning_rate": 1e-06, "loss": -0.0144, "step": 747 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29117564857006073, "epoch": 1.1987179487179487, "grad_norm": 0.017066776752471924, "learning_rate": 1e-06, "loss": 0.0139, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15377.0, "completions/mean_length": 7712.28125, "completions/mean_terminated_length": 7695.31103515625, "completions/min_length": 2399.0, "completions/min_terminated_length": 2399.0, "entropy": 0.27627554535865784, "epoch": 1.2003205128205128, "frac_reward_zero_std": 0.90625, "grad_norm": 0.024169372394680977, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 982081785.0, "reward": 0.8903414011001587, "reward_std": 0.018626626580953598, "rewards/progression_diversity/mean": -4.191866173641756e-05, "rewards/progression_diversity/std": 0.00094851094763726, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9964518547058105, "rewards/symbolic_reward_partial_score/std": 0.048222340643405914, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0641599893569946, "sampling/importance_sampling_ratio/min": 0.0002658157900441438, "sampling/sampling_logp_difference/max": 8.232707023620605, "sampling/sampling_logp_difference/mean": 0.11234469711780548, "step": 749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2774530202150345, "epoch": 1.2019230769230769, "grad_norm": 0.01591247320175171, "learning_rate": 1e-06, "loss": 0.0008, "step": 750 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27769312262535095, "epoch": 1.203525641025641, "grad_norm": 0.005623947829008102, "learning_rate": 1e-06, "loss": -0.0104, "step": 751 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27439242601394653, "epoch": 1.205128205128205, "grad_norm": 0.011445503681898117, "learning_rate": 1e-06, "loss": -0.0085, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15583.0, "completions/mean_length": 7719.01171875, "completions/mean_terminated_length": 7685.03173828125, "completions/min_length": 2215.0, "completions/min_terminated_length": 2215.0, "entropy": 0.2831403911113739, "epoch": 1.2067307692307692, "frac_reward_zero_std": 0.6875, "grad_norm": 0.012956078164279461, "learning_rate": 1e-06, "loss": -0.026, "num_tokens": 986945087.0, "reward": 0.8807443380355835, "reward_std": 0.057317137718200684, "rewards/progression_diversity/mean": -0.0002774419845081866, "rewards/progression_diversity/std": 0.006277794949710369, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9918131828308105, "rewards/symbolic_reward_partial_score/std": 0.06508524715900421, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0649077892303467, "sampling/importance_sampling_ratio/min": 0.002849755110219121, "sampling/sampling_logp_difference/max": 5.860522270202637, "sampling/sampling_logp_difference/mean": 0.11349144577980042, "step": 753 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28055115044116974, "epoch": 1.2083333333333333, "grad_norm": 0.02566658891737461, "learning_rate": 1e-06, "loss": 0.0113, "step": 754 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.27944353222846985, "epoch": 1.2099358974358974, "grad_norm": 0.028652476146817207, "learning_rate": 1e-06, "loss": 0.0305, "step": 755 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.28025032579898834, "epoch": 1.2115384615384615, "grad_norm": 0.017124585807323456, "learning_rate": 1e-06, "loss": -0.0014, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14722.0, "completions/mean_length": 8052.341796875, "completions/mean_terminated_length": 8036.037109375, "completions/min_length": 3876.0, "completions/min_terminated_length": 3876.0, "entropy": 0.277658149600029, "epoch": 1.2131410256410255, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03241975978016853, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 992013422.0, "reward": 0.8829687833786011, "reward_std": 0.05664096772670746, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9907552003860474, "rewards/symbolic_reward_partial_score/std": 0.08320756256580353, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0649545192718506, "sampling/importance_sampling_ratio/min": 0.0018924587639048696, "sampling/sampling_logp_difference/max": 6.269878387451172, "sampling/sampling_logp_difference/mean": 0.11347892135381699, "step": 757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.277600422501564, "epoch": 1.2147435897435896, "grad_norm": 0.033942461013793945, "learning_rate": 1e-06, "loss": 0.0586, "step": 758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2830449342727661, "epoch": 1.2163461538461537, "grad_norm": 0.009825375862419605, "learning_rate": 1e-06, "loss": -0.0332, "step": 759 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.282797247171402, "epoch": 1.217948717948718, "grad_norm": 0.009744114242494106, "learning_rate": 1e-06, "loss": -0.0309, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15098.0, "completions/max_terminated_length": 15098.0, "completions/mean_length": 7623.50390625, "completions/mean_terminated_length": 7623.50390625, "completions/min_length": 3855.0, "completions/min_terminated_length": 3855.0, "entropy": 0.2917735129594803, "epoch": 1.219551282051282, "frac_reward_zero_std": 0.875, "grad_norm": 0.0076350378803908825, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 996748624.0, "reward": 0.8948044776916504, "reward_std": 0.01744021661579609, "rewards/progression_diversity/mean": -2.3829052224755287e-05, "rewards/progression_diversity/std": 0.0005391899030655622, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9983072876930237, "rewards/symbolic_reward_partial_score/std": 0.019349031150341034, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0678482055664062, "sampling/importance_sampling_ratio/min": 0.001051880419254303, "sampling/sampling_logp_difference/max": 6.857175827026367, "sampling/sampling_logp_difference/mean": 0.11811263114213943, "step": 761 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2913740575313568, "epoch": 1.2211538461538463, "grad_norm": 0.004803275689482689, "learning_rate": 1e-06, "loss": -0.0062, "step": 762 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.28947630524635315, "epoch": 1.2227564102564104, "grad_norm": 0.004451874177902937, "learning_rate": 1e-06, "loss": -0.0046, "step": 763 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2937081605195999, "epoch": 1.2243589743589745, "grad_norm": 0.028366444632411003, "learning_rate": 1e-06, "loss": 0.0196, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13416.0, "completions/mean_length": 7715.34765625, "completions/mean_terminated_length": 7698.38330078125, "completions/min_length": 2870.0, "completions/min_terminated_length": 2870.0, "entropy": 0.293381005525589, "epoch": 1.2259615384615385, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004608824849128723, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1001542498.0, "reward": 0.8941406607627869, "reward_std": 0.02034306898713112, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9967448115348816, "rewards/symbolic_reward_partial_score/std": 0.04962607100605965, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0679861307144165, "sampling/importance_sampling_ratio/min": 0.0015377042582258582, "sampling/sampling_logp_difference/max": 6.47746467590332, "sampling/sampling_logp_difference/mean": 0.11805201321840286, "step": 765 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2906278818845749, "epoch": 1.2275641025641026, "grad_norm": 0.03442588075995445, "learning_rate": 1e-06, "loss": 0.0285, "step": 766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29679957032203674, "epoch": 1.2291666666666667, "grad_norm": 0.0050870683044195175, "learning_rate": 1e-06, "loss": -0.0035, "step": 767 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29432834684848785, "epoch": 1.2307692307692308, "grad_norm": 0.004784552846103907, "learning_rate": 1e-06, "loss": -0.0157, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13010.0, "completions/max_terminated_length": 13010.0, "completions/mean_length": 7732.798828125, "completions/mean_terminated_length": 7732.798828125, "completions/min_length": 2400.0, "completions/min_terminated_length": 2400.0, "entropy": 0.29547902941703796, "epoch": 1.232371794871795, "frac_reward_zero_std": 0.8125, "grad_norm": 0.012175563722848892, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 1006352315.0, "reward": 0.8856445550918579, "reward_std": 0.03763294965028763, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9951171875, "rewards/symbolic_reward_partial_score/std": 0.051642172038555145, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0691790580749512, "sampling/importance_sampling_ratio/min": 0.0011769047705456614, "sampling/sampling_logp_difference/max": 6.744867324829102, "sampling/sampling_logp_difference/mean": 0.11996014416217804, "step": 769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2981226444244385, "epoch": 1.233974358974359, "grad_norm": 0.010661444626748562, "learning_rate": 1e-06, "loss": -0.0197, "step": 770 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2980218082666397, "epoch": 1.2355769230769231, "grad_norm": 0.011630581691861153, "learning_rate": 1e-06, "loss": -0.0277, "step": 771 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.29873549938201904, "epoch": 1.2371794871794872, "grad_norm": 0.012972732074558735, "learning_rate": 1e-06, "loss": 0.0369, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14956.0, "completions/max_terminated_length": 14956.0, "completions/mean_length": 7541.69921875, "completions/mean_terminated_length": 7541.69921875, "completions/min_length": 3751.0, "completions/min_terminated_length": 3751.0, "entropy": 0.29750749468803406, "epoch": 1.2387820512820513, "frac_reward_zero_std": 0.8125, "grad_norm": 0.031947024166584015, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 1011072049.0, "reward": 0.8877832293510437, "reward_std": 0.038269512355327606, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.994433581829071, "rewards/symbolic_reward_partial_score/std": 0.06408015638589859, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0681447982788086, "sampling/importance_sampling_ratio/min": 0.0004270588397048414, "sampling/sampling_logp_difference/max": 7.758588790893555, "sampling/sampling_logp_difference/mean": 0.11879236251115799, "step": 773 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2897566109895706, "epoch": 1.2403846153846154, "grad_norm": 0.03253176808357239, "learning_rate": 1e-06, "loss": 0.0032, "step": 774 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29233625531196594, "epoch": 1.2419871794871795, "grad_norm": 0.028909306973218918, "learning_rate": 1e-06, "loss": 0.0133, "step": 775 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28808602690696716, "epoch": 1.2435897435897436, "grad_norm": 0.03054192289710045, "learning_rate": 1e-06, "loss": -0.0073, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15503.0, "completions/mean_length": 7913.501953125, "completions/mean_terminated_length": 7829.966796875, "completions/min_length": 3547.0, "completions/min_terminated_length": 3547.0, "entropy": 0.2850220203399658, "epoch": 1.2451923076923077, "frac_reward_zero_std": 0.71875, "grad_norm": 0.014626838266849518, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 1016047730.0, "reward": 0.8811328411102295, "reward_std": 0.0615340992808342, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9872395992279053, "rewards/symbolic_reward_partial_score/std": 0.10803718864917755, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0665440559387207, "sampling/importance_sampling_ratio/min": 2.4944363758550026e-05, "sampling/sampling_logp_difference/max": 10.598862648010254, "sampling/sampling_logp_difference/mean": 0.11551474034786224, "step": 777 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2869865745306015, "epoch": 1.2467948717948718, "grad_norm": 0.03926634415984154, "learning_rate": 1e-06, "loss": 0.0145, "step": 778 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2825475335121155, "epoch": 1.248397435897436, "grad_norm": 0.03181926906108856, "learning_rate": 1e-06, "loss": 0.0064, "step": 779 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.28822551667690277, "epoch": 1.25, "grad_norm": 0.013265200890600681, "learning_rate": 1e-06, "loss": -0.0185, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14623.0, "completions/max_terminated_length": 14623.0, "completions/mean_length": 7728.783203125, "completions/mean_terminated_length": 7728.783203125, "completions/min_length": 3294.0, "completions/min_terminated_length": 3294.0, "entropy": 0.29114533960819244, "epoch": 1.251602564102564, "frac_reward_zero_std": 0.875, "grad_norm": 0.030667047947645187, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 1020910579.0, "reward": 0.8891504406929016, "reward_std": 0.02797744609415531, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9950846433639526, "rewards/symbolic_reward_partial_score/std": 0.06306508928537369, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068063735961914, "sampling/importance_sampling_ratio/min": 0.0007180717075243592, "sampling/sampling_logp_difference/max": 7.238941192626953, "sampling/sampling_logp_difference/mean": 0.11831213533878326, "step": 781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2906217724084854, "epoch": 1.2532051282051282, "grad_norm": 0.006525769364088774, "learning_rate": 1e-06, "loss": 0.0125, "step": 782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29298603534698486, "epoch": 1.2548076923076923, "grad_norm": 0.007502012420445681, "learning_rate": 1e-06, "loss": -0.0171, "step": 783 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29166728258132935, "epoch": 1.2564102564102564, "grad_norm": 0.020680276677012444, "learning_rate": 1e-06, "loss": -0.0086, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14149.0, "completions/max_terminated_length": 14149.0, "completions/mean_length": 7387.083984375, "completions/mean_terminated_length": 7387.083984375, "completions/min_length": 2680.0, "completions/min_terminated_length": 2680.0, "entropy": 0.3060101866722107, "epoch": 1.2580128205128205, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034628999419510365, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 1025487758.0, "reward": 0.8975389003753662, "reward_std": 0.006725301966071129, "rewards/progression_diversity/mean": -1.852334025898017e-05, "rewards/progression_diversity/std": 0.0004191353218629956, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996093511581421, "rewards/symbolic_reward_partial_score/std": 0.006243883166462183, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.070400595664978, "sampling/importance_sampling_ratio/min": 0.0005534536903724074, "sampling/sampling_logp_difference/max": 7.499332427978516, "sampling/sampling_logp_difference/mean": 0.12200077623128891, "step": 785 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3041597902774811, "epoch": 1.2596153846153846, "grad_norm": 0.01983814872801304, "learning_rate": 1e-06, "loss": 0.0082, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.304287388920784, "epoch": 1.2612179487179487, "grad_norm": 0.0035897772759199142, "learning_rate": 1e-06, "loss": -0.0077, "step": 787 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2998305410146713, "epoch": 1.2628205128205128, "grad_norm": 0.005747531540691853, "learning_rate": 1e-06, "loss": -0.0041, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11941.0, "completions/max_terminated_length": 11941.0, "completions/mean_length": 7136.01171875, "completions/mean_terminated_length": 7136.01171875, "completions/min_length": 2860.0, "completions/min_terminated_length": 2860.0, "entropy": 0.3156207203865051, "epoch": 1.2644230769230769, "frac_reward_zero_std": 0.875, "grad_norm": 0.0063291979022324085, "learning_rate": 1e-06, "loss": -0.0203, "num_tokens": 1029842004.0, "reward": 0.8918652534484863, "reward_std": 0.025776326656341553, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9963216185569763, "rewards/symbolic_reward_partial_score/std": 0.050049230456352234, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.072809100151062, "sampling/importance_sampling_ratio/min": 0.001212042523548007, "sampling/sampling_logp_difference/max": 6.715448379516602, "sampling/sampling_logp_difference/mean": 0.12579187750816345, "step": 789 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3099760115146637, "epoch": 1.266025641025641, "grad_norm": 0.006072794087231159, "learning_rate": 1e-06, "loss": -0.0054, "step": 790 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31477801501750946, "epoch": 1.267628205128205, "grad_norm": 0.005829320289194584, "learning_rate": 1e-06, "loss": 0.0413, "step": 791 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31682026386260986, "epoch": 1.2692307692307692, "grad_norm": 0.01667245663702488, "learning_rate": 1e-06, "loss": -0.0101, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15053.0, "completions/max_terminated_length": 15053.0, "completions/mean_length": 7788.76171875, "completions/mean_terminated_length": 7788.76171875, "completions/min_length": 2657.0, "completions/min_terminated_length": 2657.0, "entropy": 0.29877328872680664, "epoch": 1.2708333333333333, "frac_reward_zero_std": 0.8125, "grad_norm": 0.027668794617056847, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 1034711162.0, "reward": 0.8842675685882568, "reward_std": 0.04109036177396774, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9765625, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.9944336414337158, "rewards/symbolic_reward_partial_score/std": 0.06344074010848999, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068732500076294, "sampling/importance_sampling_ratio/min": 0.003076533554121852, "sampling/sampling_logp_difference/max": 5.783951759338379, "sampling/sampling_logp_difference/mean": 0.11873342841863632, "step": 793 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2952185273170471, "epoch": 1.2724358974358974, "grad_norm": 0.011909693479537964, "learning_rate": 1e-06, "loss": -0.002, "step": 794 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.29329799115657806, "epoch": 1.2740384615384617, "grad_norm": 0.008952183648943901, "learning_rate": 1e-06, "loss": 0.0087, "step": 795 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29316699504852295, "epoch": 1.2756410256410255, "grad_norm": 0.03064666874706745, "learning_rate": 1e-06, "loss": 0.0023, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14643.0, "completions/max_terminated_length": 14643.0, "completions/mean_length": 7695.822265625, "completions/mean_terminated_length": 7695.822265625, "completions/min_length": 3069.0, "completions/min_terminated_length": 3069.0, "entropy": 0.30592451989650726, "epoch": 1.2772435897435899, "frac_reward_zero_std": 0.8125, "grad_norm": 0.008664174936711788, "learning_rate": 1e-06, "loss": -0.0229, "num_tokens": 1039480127.0, "reward": 0.8838672637939453, "reward_std": 0.04563511162996292, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9891927242279053, "rewards/symbolic_reward_partial_score/std": 0.09879883378744125, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071099042892456, "sampling/importance_sampling_ratio/min": 0.0011856004130095243, "sampling/sampling_logp_difference/max": 6.737505912780762, "sampling/sampling_logp_difference/mean": 0.12261907756328583, "step": 797 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30264943838119507, "epoch": 1.2788461538461537, "grad_norm": 0.020319325849413872, "learning_rate": 1e-06, "loss": -0.0038, "step": 798 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3089882731437683, "epoch": 1.280448717948718, "grad_norm": 0.013271676376461983, "learning_rate": 1e-06, "loss": 0.0276, "step": 799 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30357275903224945, "epoch": 1.282051282051282, "grad_norm": 0.022130422294139862, "learning_rate": 1e-06, "loss": -0.0088, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12987.0, "completions/mean_length": 7707.5078125, "completions/mean_terminated_length": 7690.5283203125, "completions/min_length": 2717.0, "completions/min_terminated_length": 2717.0, "entropy": 0.3025813400745392, "epoch": 1.2836538461538463, "frac_reward_zero_std": 0.90625, "grad_norm": 0.009830499067902565, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 1044275923.0, "reward": 0.8870117664337158, "reward_std": 0.022976521402597427, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98046875, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, "rewards/symbolic_reward_partial_score/std": 0.045904021710157394, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0703761577606201, "sampling/importance_sampling_ratio/min": 0.0024872496724128723, "sampling/sampling_logp_difference/max": 5.996577739715576, "sampling/sampling_logp_difference/mean": 0.12130090594291687, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3026769459247589, "epoch": 1.2852564102564101, "grad_norm": 0.00772315775975585, "learning_rate": 1e-06, "loss": -0.0016, "step": 802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3021104037761688, "epoch": 1.2868589743589745, "grad_norm": 0.01086731068789959, "learning_rate": 1e-06, "loss": 0.0311, "step": 803 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.299283891916275, "epoch": 1.2884615384615383, "grad_norm": 0.012979859486222267, "learning_rate": 1e-06, "loss": -0.0104, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16163.0, "completions/max_terminated_length": 16163.0, "completions/mean_length": 8625.98046875, "completions/mean_terminated_length": 8625.98046875, "completions/min_length": 3964.0, "completions/min_terminated_length": 3964.0, "entropy": 0.29120542109012604, "epoch": 1.2900641025641026, "frac_reward_zero_std": 0.875, "grad_norm": 0.005270938854664564, "learning_rate": 1e-06, "loss": -0.02, "num_tokens": 1049633401.0, "reward": 0.8940331935882568, "reward_std": 0.02386718988418579, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9957356452941895, "rewards/symbolic_reward_partial_score/std": 0.06268040835857391, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0678813457489014, "sampling/importance_sampling_ratio/min": 0.0018456288380548358, "sampling/sampling_logp_difference/max": 6.29493522644043, "sampling/sampling_logp_difference/mean": 0.11715521663427353, "step": 805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29076600074768066, "epoch": 1.2916666666666667, "grad_norm": 0.024370063096284866, "learning_rate": 1e-06, "loss": -0.0091, "step": 806 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2884713411331177, "epoch": 1.2932692307692308, "grad_norm": 0.005116340704262257, "learning_rate": 1e-06, "loss": 0.0212, "step": 807 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28749142587184906, "epoch": 1.294871794871795, "grad_norm": 0.005040435586124659, "learning_rate": 1e-06, "loss": 0.0027, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16067.0, "completions/mean_length": 8135.697265625, "completions/mean_terminated_length": 8103.3515625, "completions/min_length": 2674.0, "completions/min_terminated_length": 2674.0, "entropy": 0.2949254959821701, "epoch": 1.296474358974359, "frac_reward_zero_std": 0.84375, "grad_norm": 0.008695917204022408, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 1054715534.0, "reward": 0.8852832317352295, "reward_std": 0.037169456481933594, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9945638179779053, "rewards/symbolic_reward_partial_score/std": 0.06339205801486969, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068376064300537, "sampling/importance_sampling_ratio/min": 0.0008881228277459741, "sampling/sampling_logp_difference/max": 7.026400566101074, "sampling/sampling_logp_difference/mean": 0.11818452924489975, "step": 809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2952611595392227, "epoch": 1.2980769230769231, "grad_norm": 0.015448620542883873, "learning_rate": 1e-06, "loss": 0.0007, "step": 810 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2906734049320221, "epoch": 1.2996794871794872, "grad_norm": 0.01636776700615883, "learning_rate": 1e-06, "loss": -0.0312, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2910599410533905, "epoch": 1.3012820512820513, "grad_norm": 0.020219558849930763, "learning_rate": 1e-06, "loss": 0.0283, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15860.0, "completions/max_terminated_length": 15860.0, "completions/mean_length": 8440.0234375, "completions/mean_terminated_length": 8440.0234375, "completions/min_length": 4233.0, "completions/min_terminated_length": 4233.0, "entropy": 0.29061928391456604, "epoch": 1.3028846153846154, "frac_reward_zero_std": 0.875, "grad_norm": 0.006722176913172007, "learning_rate": 1e-06, "loss": -0.0116, "num_tokens": 1059909306.0, "reward": 0.8908935785293579, "reward_std": 0.02562917396426201, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9969889521598816, "rewards/symbolic_reward_partial_score/std": 0.045237038284540176, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0675506591796875, "sampling/importance_sampling_ratio/min": 0.0014303690986707807, "sampling/sampling_logp_difference/max": 6.549822807312012, "sampling/sampling_logp_difference/mean": 0.11663618683815002, "step": 813 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28442804515361786, "epoch": 1.3044871794871795, "grad_norm": 0.028145821765065193, "learning_rate": 1e-06, "loss": -0.0056, "step": 814 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28922364115715027, "epoch": 1.3060897435897436, "grad_norm": 0.018946906551718712, "learning_rate": 1e-06, "loss": 0.0325, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28923703730106354, "epoch": 1.3076923076923077, "grad_norm": 0.005810855887830257, "learning_rate": 1e-06, "loss": -0.0143, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15852.0, "completions/mean_length": 8810.791015625, "completions/mean_terminated_length": 8751.1591796875, "completions/min_length": 4011.0, "completions/min_terminated_length": 4011.0, "entropy": 0.26773253083229065, "epoch": 1.3092948717948718, "frac_reward_zero_std": 0.6875, "grad_norm": 0.03435671329498291, "learning_rate": 1e-06, "loss": 0.078, "num_tokens": 1065399423.0, "reward": 0.8867088556289673, "reward_std": 0.053164899349212646, "rewards/progression_diversity/mean": -2.084466177620925e-05, "rewards/progression_diversity/std": 0.0004716608382295817, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9928059577941895, "rewards/symbolic_reward_partial_score/std": 0.07733853906393051, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0633445978164673, "sampling/importance_sampling_ratio/min": 7.085214019753039e-05, "sampling/sampling_logp_difference/max": 9.554915428161621, "sampling/sampling_logp_difference/mean": 0.10956880450248718, "step": 817 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.26666541397571564, "epoch": 1.310897435897436, "grad_norm": 0.02670695073902607, "learning_rate": 1e-06, "loss": -0.0012, "step": 818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2719925343990326, "epoch": 1.3125, "grad_norm": 0.026015251874923706, "learning_rate": 1e-06, "loss": -0.0272, "step": 819 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.26630181074142456, "epoch": 1.314102564102564, "grad_norm": 0.0247061625123024, "learning_rate": 1e-06, "loss": -0.0138, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15426.0, "completions/mean_length": 7572.466796875, "completions/mean_terminated_length": 7503.08447265625, "completions/min_length": 2604.0, "completions/min_terminated_length": 2604.0, "entropy": 0.27331072092056274, "epoch": 1.3157051282051282, "frac_reward_zero_std": 0.71875, "grad_norm": 0.025832634419202805, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 1070202158.0, "reward": 0.8901264071464539, "reward_std": 0.033305682241916656, "rewards/progression_diversity/mean": -5.895454887649976e-05, "rewards/progression_diversity/std": 0.0013339892029762268, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9970377683639526, "rewards/symbolic_reward_partial_score/std": 0.04511844739317894, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0644633769989014, "sampling/importance_sampling_ratio/min": 0.002112078480422497, "sampling/sampling_logp_difference/max": 6.160082817077637, "sampling/sampling_logp_difference/mean": 0.1119283139705658, "step": 821 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.27660806477069855, "epoch": 1.3173076923076923, "grad_norm": 0.02830098383128643, "learning_rate": 1e-06, "loss": 0.0153, "step": 822 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2817423641681671, "epoch": 1.3189102564102564, "grad_norm": 0.020086728036403656, "learning_rate": 1e-06, "loss": -0.0156, "step": 823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.274202823638916, "epoch": 1.3205128205128205, "grad_norm": 0.023740731179714203, "learning_rate": 1e-06, "loss": 0.0539, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14339.0, "completions/mean_length": 7367.796875, "completions/mean_terminated_length": 7350.15234375, "completions/min_length": 2376.0, "completions/min_terminated_length": 2376.0, "entropy": 0.2823334336280823, "epoch": 1.3221153846153846, "frac_reward_zero_std": 0.84375, "grad_norm": 0.00718251708894968, "learning_rate": 1e-06, "loss": -0.0283, "num_tokens": 1074893158.0, "reward": 0.8895508050918579, "reward_std": 0.031776778399944305, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, "rewards/symbolic_reward_partial_score/std": 0.04641921818256378, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0652776956558228, "sampling/importance_sampling_ratio/min": 4.965396328771021e-06, "sampling/sampling_logp_difference/max": 12.213017463684082, "sampling/sampling_logp_difference/mean": 0.11385728418827057, "step": 825 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28175710141658783, "epoch": 1.3237179487179487, "grad_norm": 0.005745855160057545, "learning_rate": 1e-06, "loss": 0.0189, "step": 826 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28442713618278503, "epoch": 1.3253205128205128, "grad_norm": 0.007170431315898895, "learning_rate": 1e-06, "loss": -0.017, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2808009684085846, "epoch": 1.3269230769230769, "grad_norm": 0.031063031405210495, "learning_rate": 1e-06, "loss": 0.0343, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13142.0, "completions/max_terminated_length": 13142.0, "completions/mean_length": 6885.845703125, "completions/mean_terminated_length": 6885.845703125, "completions/min_length": 1827.0, "completions/min_terminated_length": 1827.0, "entropy": 0.2910860776901245, "epoch": 1.328525641025641, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002486742567270994, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 1079341143.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0668704509735107, "sampling/importance_sampling_ratio/min": 0.0020832736045122147, "sampling/sampling_logp_difference/max": 6.17381477355957, "sampling/sampling_logp_difference/mean": 0.1167755201458931, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2902107387781143, "epoch": 1.330128205128205, "grad_norm": 0.002582802902907133, "learning_rate": 1e-06, "loss": -0.0052, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29032963514328003, "epoch": 1.3317307692307692, "grad_norm": 0.0026250346563756466, "learning_rate": 1e-06, "loss": -0.0057, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2910378873348236, "epoch": 1.3333333333333333, "grad_norm": 0.02715476043522358, "learning_rate": 1e-06, "loss": 0.0168, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12247.0, "completions/max_terminated_length": 12247.0, "completions/mean_length": 6663.154296875, "completions/mean_terminated_length": 6663.154296875, "completions/min_length": 2193.0, "completions/min_terminated_length": 2193.0, "entropy": 0.2914612591266632, "epoch": 1.3349358974358974, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004975066985934973, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 1083612246.0, "reward": 0.8934961557388306, "reward_std": 0.021559644490480423, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066861629486084, "sampling/importance_sampling_ratio/min": 0.0018572451081126928, "sampling/sampling_logp_difference/max": 6.288661003112793, "sampling/sampling_logp_difference/mean": 0.11649785935878754, "step": 833 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2909384220838547, "epoch": 1.3365384615384617, "grad_norm": 0.00430506095290184, "learning_rate": 1e-06, "loss": -0.003, "step": 834 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2926383912563324, "epoch": 1.3381410256410255, "grad_norm": 0.0038858274929225445, "learning_rate": 1e-06, "loss": 0.0099, "step": 835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2900676280260086, "epoch": 1.3397435897435899, "grad_norm": 0.0030539829749614, "learning_rate": 1e-06, "loss": -0.0002, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14258.0, "completions/max_terminated_length": 14258.0, "completions/mean_length": 6737.3671875, "completions/mean_terminated_length": 6737.3671875, "completions/min_length": 2386.0, "completions/min_terminated_length": 2386.0, "entropy": 0.2902229428291321, "epoch": 1.3413461538461537, "frac_reward_zero_std": 0.9375, "grad_norm": 0.00329815992154181, "learning_rate": 1e-06, "loss": -0.0076, "num_tokens": 1087951362.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0671701431274414, "sampling/importance_sampling_ratio/min": 0.002303540473803878, "sampling/sampling_logp_difference/max": 6.073307991027832, "sampling/sampling_logp_difference/mean": 0.11671570688486099, "step": 837 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29137638211250305, "epoch": 1.342948717948718, "grad_norm": 0.003731071949005127, "learning_rate": 1e-06, "loss": 0.0101, "step": 838 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.292741596698761, "epoch": 1.344551282051282, "grad_norm": 0.002997028874233365, "learning_rate": 1e-06, "loss": -0.008, "step": 839 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2919839918613434, "epoch": 1.3461538461538463, "grad_norm": 0.023894716054201126, "learning_rate": 1e-06, "loss": 0.0054, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14003.0, "completions/max_terminated_length": 14003.0, "completions/mean_length": 6641.5, "completions/mean_terminated_length": 6641.5, "completions/min_length": 2245.0, "completions/min_terminated_length": 2245.0, "entropy": 0.29364730417728424, "epoch": 1.3477564102564101, "frac_reward_zero_std": 0.78125, "grad_norm": 0.006924602203071117, "learning_rate": 1e-06, "loss": -0.0172, "num_tokens": 1092244098.0, "reward": 0.8913965225219727, "reward_std": 0.03441406041383743, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9986653327941895, "rewards/symbolic_reward_partial_score/std": 0.011836127378046513, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0672986507415771, "sampling/importance_sampling_ratio/min": 0.0001825806830311194, "sampling/sampling_logp_difference/max": 8.608318328857422, "sampling/sampling_logp_difference/mean": 0.11722898483276367, "step": 841 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2935139685869217, "epoch": 1.3493589743589745, "grad_norm": 0.02281988225877285, "learning_rate": 1e-06, "loss": 0.0156, "step": 842 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2943962812423706, "epoch": 1.3509615384615383, "grad_norm": 0.00623247679322958, "learning_rate": 1e-06, "loss": -0.0121, "step": 843 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29411324858665466, "epoch": 1.3525641025641026, "grad_norm": 0.006375083699822426, "learning_rate": 1e-06, "loss": 0.0022, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11405.0, "completions/max_terminated_length": 11405.0, "completions/mean_length": 7117.671875, "completions/mean_terminated_length": 7117.671875, "completions/min_length": 2860.0, "completions/min_terminated_length": 2860.0, "entropy": 0.29216840863227844, "epoch": 1.3541666666666667, "frac_reward_zero_std": 0.90625, "grad_norm": 0.01883051171898842, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 1096780922.0, "reward": 0.8945703506469727, "reward_std": 0.0186243187636137, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9975260496139526, "rewards/symbolic_reward_partial_score/std": 0.044694118201732635, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0677756071090698, "sampling/importance_sampling_ratio/min": 0.0024964665062725544, "sampling/sampling_logp_difference/max": 5.9928789138793945, "sampling/sampling_logp_difference/mean": 0.11734534054994583, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29655005037784576, "epoch": 1.3557692307692308, "grad_norm": 0.004739716649055481, "learning_rate": 1e-06, "loss": -0.0133, "step": 846 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29636050760746, "epoch": 1.357371794871795, "grad_norm": 0.004647532943636179, "learning_rate": 1e-06, "loss": -0.0138, "step": 847 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2918502986431122, "epoch": 1.358974358974359, "grad_norm": 0.030562305822968483, "learning_rate": 1e-06, "loss": 0.0146, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13102.0, "completions/max_terminated_length": 13102.0, "completions/mean_length": 7432.736328125, "completions/mean_terminated_length": 7432.736328125, "completions/min_length": 2530.0, "completions/min_terminated_length": 2530.0, "entropy": 0.30163420736789703, "epoch": 1.3605769230769231, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024499939754605293, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 1101412211.0, "reward": 0.8987793326377869, "reward_std": 0.004882812965661287, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0680015087127686, "sampling/importance_sampling_ratio/min": 0.0015895604155957699, "sampling/sampling_logp_difference/max": 6.444297790527344, "sampling/sampling_logp_difference/mean": 0.11842922866344452, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29871611297130585, "epoch": 1.3621794871794872, "grad_norm": 0.002597298938781023, "learning_rate": 1e-06, "loss": -0.0048, "step": 850 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2991296947002411, "epoch": 1.3637820512820513, "grad_norm": 0.02783992327749729, "learning_rate": 1e-06, "loss": 0.0167, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2933713495731354, "epoch": 1.3653846153846154, "grad_norm": 0.0024502624291926622, "learning_rate": 1e-06, "loss": -0.0046, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14793.0, "completions/max_terminated_length": 14793.0, "completions/mean_length": 7011.0234375, "completions/mean_terminated_length": 7011.0234375, "completions/min_length": 2970.0, "completions/min_terminated_length": 2970.0, "entropy": 0.31015750765800476, "epoch": 1.3669871794871795, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0022398389410227537, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 1105784511.0, "reward": 0.8987793326377869, "reward_std": 0.004882812965661287, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706534385681152, "sampling/importance_sampling_ratio/min": 0.0033097388222813606, "sampling/sampling_logp_difference/max": 5.710886001586914, "sampling/sampling_logp_difference/mean": 0.12256957590579987, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30707795917987823, "epoch": 1.3685897435897436, "grad_norm": 0.002265053801238537, "learning_rate": 1e-06, "loss": -0.0042, "step": 854 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3124368190765381, "epoch": 1.3701923076923077, "grad_norm": 0.001633352367207408, "learning_rate": 1e-06, "loss": -0.0043, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3110050559043884, "epoch": 1.3717948717948718, "grad_norm": 0.0022995418403297663, "learning_rate": 1e-06, "loss": -0.0039, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14868.0, "completions/max_terminated_length": 14868.0, "completions/mean_length": 7329.43359375, "completions/mean_terminated_length": 7329.43359375, "completions/min_length": 2311.0, "completions/min_terminated_length": 2311.0, "entropy": 0.29253579676151276, "epoch": 1.373397435897436, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005778716877102852, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 1110410605.0, "reward": 0.8926172256469727, "reward_std": 0.0187346413731575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.998828113079071, "rewards/symbolic_reward_partial_score/std": 0.011265556327998638, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066795825958252, "sampling/importance_sampling_ratio/min": 0.003194189630448818, "sampling/sampling_logp_difference/max": 5.746421813964844, "sampling/sampling_logp_difference/mean": 0.11596965044736862, "step": 857 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.29146991670131683, "epoch": 1.375, "grad_norm": 0.005454590544104576, "learning_rate": 1e-06, "loss": 0.0106, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2908618450164795, "epoch": 1.376602564102564, "grad_norm": 0.014141952618956566, "learning_rate": 1e-06, "loss": 0.0008, "step": 859 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2932679355144501, "epoch": 1.3782051282051282, "grad_norm": 0.012503272853791714, "learning_rate": 1e-06, "loss": 0.0113, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11966.0, "completions/max_terminated_length": 11966.0, "completions/mean_length": 7037.12109375, "completions/mean_terminated_length": 7037.12109375, "completions/min_length": 3731.0, "completions/min_terminated_length": 3731.0, "entropy": 0.30436907708644867, "epoch": 1.3798076923076923, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005315630231052637, "learning_rate": 1e-06, "loss": -0.0167, "num_tokens": 1114885243.0, "reward": 0.8938672542572021, "reward_std": 0.018317628651857376, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9990885257720947, "rewards/symbolic_reward_partial_score/std": 0.009222573600709438, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696780681610107, "sampling/importance_sampling_ratio/min": 0.0011150204809382558, "sampling/sampling_logp_difference/max": 6.798882484436035, "sampling/sampling_logp_difference/mean": 0.1209033951163292, "step": 861 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30297212302684784, "epoch": 1.3814102564102564, "grad_norm": 0.01675940863788128, "learning_rate": 1e-06, "loss": 0.0164, "step": 862 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.304256334900856, "epoch": 1.3830128205128205, "grad_norm": 0.016471805050969124, "learning_rate": 1e-06, "loss": 0.0067, "step": 863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30140577256679535, "epoch": 1.3846153846153846, "grad_norm": 0.017290934920310974, "learning_rate": 1e-06, "loss": -0.0046, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14854.0, "completions/max_terminated_length": 14854.0, "completions/mean_length": 7528.865234375, "completions/mean_terminated_length": 7528.865234375, "completions/min_length": 2487.0, "completions/min_terminated_length": 2487.0, "entropy": 0.29779052734375, "epoch": 1.3862179487179487, "frac_reward_zero_std": 0.9375, "grad_norm": 0.016191750764846802, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 1119569030.0, "reward": 0.8950879573822021, "reward_std": 0.012819098308682442, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9992513060569763, "rewards/symbolic_reward_partial_score/std": 0.008469752036035061, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06855046749115, "sampling/importance_sampling_ratio/min": 0.0019925027154386044, "sampling/sampling_logp_difference/max": 6.2183637619018555, "sampling/sampling_logp_difference/mean": 0.11855703592300415, "step": 865 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2984299212694168, "epoch": 1.3878205128205128, "grad_norm": 0.004113992676138878, "learning_rate": 1e-06, "loss": -0.0144, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29633837938308716, "epoch": 1.3894230769230769, "grad_norm": 0.004194690380245447, "learning_rate": 1e-06, "loss": -0.0033, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2999422550201416, "epoch": 1.391025641025641, "grad_norm": 0.005653866101056337, "learning_rate": 1e-06, "loss": 0.0154, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13589.0, "completions/max_terminated_length": 13589.0, "completions/mean_length": 7836.474609375, "completions/mean_terminated_length": 7836.474609375, "completions/min_length": 3547.0, "completions/min_terminated_length": 3547.0, "entropy": 0.2959325313568115, "epoch": 1.392628205128205, "frac_reward_zero_std": 0.90625, "grad_norm": 0.031015831977128983, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 1124527769.0, "reward": 0.8963379263877869, "reward_std": 0.0146484375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067394733428955, "sampling/importance_sampling_ratio/min": 0.002571824938058853, "sampling/sampling_logp_difference/max": 5.963139533996582, "sampling/sampling_logp_difference/mean": 0.11667640507221222, "step": 869 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2887873947620392, "epoch": 1.3942307692307692, "grad_norm": 0.00414247065782547, "learning_rate": 1e-06, "loss": -0.0135, "step": 870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2930924892425537, "epoch": 1.3958333333333333, "grad_norm": 0.003337219590321183, "learning_rate": 1e-06, "loss": -0.0123, "step": 871 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29419972002506256, "epoch": 1.3974358974358974, "grad_norm": 0.004136944655328989, "learning_rate": 1e-06, "loss": 0.0065, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12698.0, "completions/max_terminated_length": 12698.0, "completions/mean_length": 7294.765625, "completions/mean_terminated_length": 7294.765625, "completions/min_length": 3576.0, "completions/min_terminated_length": 3576.0, "entropy": 0.29781144857406616, "epoch": 1.3990384615384617, "frac_reward_zero_std": 0.90625, "grad_norm": 0.025357889011502266, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 1129125905.0, "reward": 0.8950977325439453, "reward_std": 0.016514942049980164, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9992839097976685, "rewards/symbolic_reward_partial_score/std": 0.00811202172189951, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0683658123016357, "sampling/importance_sampling_ratio/min": 0.00021021001157350838, "sampling/sampling_logp_difference/max": 8.467403411865234, "sampling/sampling_logp_difference/mean": 0.11810927093029022, "step": 873 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2955755293369293, "epoch": 1.4006410256410255, "grad_norm": 0.004411085043102503, "learning_rate": 1e-06, "loss": -0.013, "step": 874 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29726772010326385, "epoch": 1.4022435897435899, "grad_norm": 0.026978353038430214, "learning_rate": 1e-06, "loss": 0.0058, "step": 875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.29815293848514557, "epoch": 1.4038461538461537, "grad_norm": 0.003415036480873823, "learning_rate": 1e-06, "loss": -0.0142, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14252.0, "completions/max_terminated_length": 14252.0, "completions/mean_length": 7402.025390625, "completions/mean_terminated_length": 7402.025390625, "completions/min_length": 3103.0, "completions/min_terminated_length": 3103.0, "entropy": 0.2916039377450943, "epoch": 1.405448717948718, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003685388946905732, "learning_rate": 1e-06, "loss": -0.0106, "num_tokens": 1133740766.0, "reward": 0.8975586295127869, "reward_std": 0.009765625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066849946975708, "sampling/importance_sampling_ratio/min": 0.0012651029974222183, "sampling/sampling_logp_difference/max": 6.672601699829102, "sampling/sampling_logp_difference/mean": 0.11604957282543182, "step": 877 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2909359931945801, "epoch": 1.407051282051282, "grad_norm": 0.0035066467244178057, "learning_rate": 1e-06, "loss": -0.0093, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2898210138082504, "epoch": 1.4086538461538463, "grad_norm": 0.0033086894545704126, "learning_rate": 1e-06, "loss": 0.0095, "step": 879 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29252733290195465, "epoch": 1.4102564102564101, "grad_norm": 0.02426670864224434, "learning_rate": 1e-06, "loss": 0.0116, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15130.0, "completions/max_terminated_length": 15130.0, "completions/mean_length": 7525.994140625, "completions/mean_terminated_length": 7525.994140625, "completions/min_length": 2851.0, "completions/min_terminated_length": 2851.0, "entropy": 0.28987959027290344, "epoch": 1.4118589743589745, "frac_reward_zero_std": 0.8125, "grad_norm": 0.007168716751039028, "learning_rate": 1e-06, "loss": -0.0173, "num_tokens": 1138505355.0, "reward": 0.8906446099281311, "reward_std": 0.034327443689107895, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9961588382720947, "rewards/symbolic_reward_partial_score/std": 0.0501725897192955, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065748691558838, "sampling/importance_sampling_ratio/min": 0.003183236112818122, "sampling/sampling_logp_difference/max": 5.749856948852539, "sampling/sampling_logp_difference/mean": 0.11428984254598618, "step": 881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2841818630695343, "epoch": 1.4134615384615383, "grad_norm": 0.026367392390966415, "learning_rate": 1e-06, "loss": 0.0049, "step": 882 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2883693128824234, "epoch": 1.4150641025641026, "grad_norm": 0.019833385944366455, "learning_rate": 1e-06, "loss": 0.0391, "step": 883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28537461161613464, "epoch": 1.4166666666666667, "grad_norm": 0.00673852302134037, "learning_rate": 1e-06, "loss": -0.0287, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14382.0, "completions/max_terminated_length": 14382.0, "completions/mean_length": 7235.87109375, "completions/mean_terminated_length": 7235.87109375, "completions/min_length": 3265.0, "completions/min_terminated_length": 3265.0, "entropy": 0.29649393260478973, "epoch": 1.4182692307692308, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0026662563905119896, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 1143070409.0, "reward": 0.898681640625, "reward_std": 0.0052734375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.011048543266952038, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068642497062683, "sampling/importance_sampling_ratio/min": 0.00319400685839355, "sampling/sampling_logp_difference/max": 5.746479034423828, "sampling/sampling_logp_difference/mean": 0.11909361928701401, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3017093986272812, "epoch": 1.419871794871795, "grad_norm": 0.0022972896695137024, "learning_rate": 1e-06, "loss": 0.0138, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30264417827129364, "epoch": 1.421474358974359, "grad_norm": 0.0025393194518983364, "learning_rate": 1e-06, "loss": -0.0061, "step": 887 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2955441474914551, "epoch": 1.4230769230769231, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.005, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12235.0, "completions/max_terminated_length": 12235.0, "completions/mean_length": 7418.005859375, "completions/mean_terminated_length": 7418.005859375, "completions/min_length": 2399.0, "completions/min_terminated_length": 2399.0, "entropy": 0.29583777487277985, "epoch": 1.4246794871794872, "frac_reward_zero_std": 0.875, "grad_norm": 0.005485072731971741, "learning_rate": 1e-06, "loss": -0.019, "num_tokens": 1147788892.0, "reward": 0.8933594226837158, "reward_std": 0.02346806786954403, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, "rewards/symbolic_reward_partial_score/std": 0.04477177560329437, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0678550004959106, "sampling/importance_sampling_ratio/min": 2.4652272259118035e-05, "sampling/sampling_logp_difference/max": 10.610641479492188, "sampling/sampling_logp_difference/mean": 0.11751426756381989, "step": 889 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2936114966869354, "epoch": 1.4262820512820513, "grad_norm": 0.019580816850066185, "learning_rate": 1e-06, "loss": -0.0057, "step": 890 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2942151129245758, "epoch": 1.4278846153846154, "grad_norm": 0.04152410104870796, "learning_rate": 1e-06, "loss": 0.0524, "step": 891 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.298058420419693, "epoch": 1.4294871794871795, "grad_norm": 0.005063659977167845, "learning_rate": 1e-06, "loss": -0.0209, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13606.0, "completions/mean_length": 7480.78515625, "completions/mean_terminated_length": 7445.87109375, "completions/min_length": 2894.0, "completions/min_terminated_length": 2894.0, "entropy": 0.2844327837228775, "epoch": 1.4310897435897436, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0049340990372002125, "learning_rate": 1e-06, "loss": -0.0191, "num_tokens": 1152608126.0, "reward": 0.8936182260513306, "reward_std": 0.0255273450165987, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.995654284954071, "rewards/symbolic_reward_partial_score/std": 0.06281018257141113, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065583348274231, "sampling/importance_sampling_ratio/min": 0.0014057016232982278, "sampling/sampling_logp_difference/max": 6.567218780517578, "sampling/sampling_logp_difference/mean": 0.11391088366508484, "step": 893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2806769758462906, "epoch": 1.4326923076923077, "grad_norm": 0.040655478835105896, "learning_rate": 1e-06, "loss": 0.0593, "step": 894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2880139797925949, "epoch": 1.4342948717948718, "grad_norm": 0.004034819081425667, "learning_rate": 1e-06, "loss": 0.0154, "step": 895 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28479818999767303, "epoch": 1.435897435897436, "grad_norm": 0.004762680269777775, "learning_rate": 1e-06, "loss": -0.0189, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12721.0, "completions/max_terminated_length": 12721.0, "completions/mean_length": 7157.248046875, "completions/mean_terminated_length": 7157.248046875, "completions/min_length": 1919.0, "completions/min_terminated_length": 1919.0, "entropy": 0.29515235126018524, "epoch": 1.4375, "frac_reward_zero_std": 0.9375, "grad_norm": 0.002855000551789999, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 1157202189.0, "reward": 0.8975586295127869, "reward_std": 0.009765625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067905306816101, "sampling/importance_sampling_ratio/min": 0.0016314449021592736, "sampling/sampling_logp_difference/max": 6.4182891845703125, "sampling/sampling_logp_difference/mean": 0.11758951842784882, "step": 897 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2956536114215851, "epoch": 1.439102564102564, "grad_norm": 0.001475250581279397, "learning_rate": 1e-06, "loss": 0.0062, "step": 898 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29977406561374664, "epoch": 1.4407051282051282, "grad_norm": 0.0027920929715037346, "learning_rate": 1e-06, "loss": 0.0064, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.294843852519989, "epoch": 1.4423076923076923, "grad_norm": 0.0032186834141612053, "learning_rate": 1e-06, "loss": -0.0077, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11521.0, "completions/max_terminated_length": 11521.0, "completions/mean_length": 6777.044921875, "completions/mean_terminated_length": 6777.044921875, "completions/min_length": 2950.0, "completions/min_terminated_length": 2950.0, "entropy": 0.3089780658483505, "epoch": 1.4439102564102564, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0031850591767579317, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 1161592100.0, "reward": 0.8970215320587158, "reward_std": 0.008288709446787834, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9978841543197632, "rewards/symbolic_reward_partial_score/std": 0.04434017464518547, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706193447113037, "sampling/importance_sampling_ratio/min": 0.001069498248398304, "sampling/sampling_logp_difference/max": 6.8405656814575195, "sampling/sampling_logp_difference/mean": 0.12223009765148163, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3052385300397873, "epoch": 1.4455128205128205, "grad_norm": 0.002852542558684945, "learning_rate": 1e-06, "loss": 0.0031, "step": 902 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3105472922325134, "epoch": 1.4471153846153846, "grad_norm": 0.002079383237287402, "learning_rate": 1e-06, "loss": -0.0058, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3094532638788223, "epoch": 1.4487179487179487, "grad_norm": 0.003387544071301818, "learning_rate": 1e-06, "loss": 0.0072, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13264.0, "completions/max_terminated_length": 13264.0, "completions/mean_length": 7214.5078125, "completions/mean_terminated_length": 7214.5078125, "completions/min_length": 3526.0, "completions/min_terminated_length": 3526.0, "entropy": 0.3008831590414047, "epoch": 1.4503205128205128, "frac_reward_zero_std": 0.875, "grad_norm": 0.006545086856931448, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 1166184456.0, "reward": 0.890869140625, "reward_std": 0.026565231382846832, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, "rewards/symbolic_reward_partial_score/std": 0.04549367353320122, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0693827867507935, "sampling/importance_sampling_ratio/min": 0.0020341959316283464, "sampling/sampling_logp_difference/max": 6.197654724121094, "sampling/sampling_logp_difference/mean": 0.11990463733673096, "step": 905 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3034283220767975, "epoch": 1.4519230769230769, "grad_norm": 0.025849303230643272, "learning_rate": 1e-06, "loss": 0.0067, "step": 906 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2999170422554016, "epoch": 1.453525641025641, "grad_norm": 0.005960631184279919, "learning_rate": 1e-06, "loss": -0.0245, "step": 907 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30265505611896515, "epoch": 1.455128205128205, "grad_norm": 0.018048452213406563, "learning_rate": 1e-06, "loss": 0.0078, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13220.0, "completions/max_terminated_length": 13220.0, "completions/mean_length": 6950.375, "completions/mean_terminated_length": 6950.375, "completions/min_length": 2652.0, "completions/min_terminated_length": 2652.0, "entropy": 0.301173597574234, "epoch": 1.4567307692307692, "frac_reward_zero_std": 0.8125, "grad_norm": 0.006944697350263596, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 1170566440.0, "reward": 0.8882812857627869, "reward_std": 0.040093328803777695, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.984375, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.9921875, "rewards/symbolic_reward_partial_score/std": 0.08017498254776001, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0687031745910645, "sampling/importance_sampling_ratio/min": 0.00040110392728820443, "sampling/sampling_logp_difference/max": 7.821290016174316, "sampling/sampling_logp_difference/mean": 0.11923378705978394, "step": 909 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2987861931324005, "epoch": 1.4583333333333333, "grad_norm": 0.020350439473986626, "learning_rate": 1e-06, "loss": 0.0218, "step": 910 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2985246926546097, "epoch": 1.4599358974358974, "grad_norm": 0.030750460922718048, "learning_rate": 1e-06, "loss": -0.0054, "step": 911 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2979954779148102, "epoch": 1.4615384615384617, "grad_norm": 0.01273882295936346, "learning_rate": 1e-06, "loss": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11232.0, "completions/max_terminated_length": 11232.0, "completions/mean_length": 6890.78125, "completions/mean_terminated_length": 6890.78125, "completions/min_length": 2828.0, "completions/min_terminated_length": 2828.0, "entropy": 0.29950951039791107, "epoch": 1.4631410256410255, "frac_reward_zero_std": 0.875, "grad_norm": 0.004619269166141748, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 1174936248.0, "reward": 0.8934961557388306, "reward_std": 0.02601562812924385, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0690256357192993, "sampling/importance_sampling_ratio/min": 0.0026924286503344774, "sampling/sampling_logp_difference/max": 5.917311668395996, "sampling/sampling_logp_difference/mean": 0.1194806694984436, "step": 913 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3017955720424652, "epoch": 1.4647435897435899, "grad_norm": 0.023214569315314293, "learning_rate": 1e-06, "loss": 0.017, "step": 914 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.30300796031951904, "epoch": 1.4663461538461537, "grad_norm": 0.004014391452074051, "learning_rate": 1e-06, "loss": -0.0151, "step": 915 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3013370633125305, "epoch": 1.467948717948718, "grad_norm": 0.004451179411262274, "learning_rate": 1e-06, "loss": -0.0157, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11576.0, "completions/max_terminated_length": 11576.0, "completions/mean_length": 6749.005859375, "completions/mean_terminated_length": 6749.005859375, "completions/min_length": 3037.0, "completions/min_terminated_length": 3037.0, "entropy": 0.305722251534462, "epoch": 1.469551282051282, "frac_reward_zero_std": 0.875, "grad_norm": 0.0044407895766198635, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 1179280027.0, "reward": 0.89404296875, "reward_std": 0.02382812649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0697790384292603, "sampling/importance_sampling_ratio/min": 0.0032580127008259296, "sampling/sampling_logp_difference/max": 5.726637840270996, "sampling/sampling_logp_difference/mean": 0.12138458341360092, "step": 917 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3042749762535095, "epoch": 1.4711538461538463, "grad_norm": 0.004154628608375788, "learning_rate": 1e-06, "loss": 0.015, "step": 918 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3077068626880646, "epoch": 1.4727564102564101, "grad_norm": 0.021867236122488976, "learning_rate": 1e-06, "loss": -0.0009, "step": 919 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30803389847278595, "epoch": 1.4743589743589745, "grad_norm": 0.003819097066298127, "learning_rate": 1e-06, "loss": -0.0163, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11396.0, "completions/max_terminated_length": 11396.0, "completions/mean_length": 6802.4296875, "completions/mean_terminated_length": 6802.4296875, "completions/min_length": 2662.0, "completions/min_terminated_length": 2662.0, "entropy": 0.31033851206302643, "epoch": 1.4759615384615383, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004083678126335144, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 1183613159.0, "reward": 0.8967773914337158, "reward_std": 0.008847462944686413, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.04937189444899559, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0699421167373657, "sampling/importance_sampling_ratio/min": 0.002445993945002556, "sampling/sampling_logp_difference/max": 6.013303756713867, "sampling/sampling_logp_difference/mean": 0.12115398049354553, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30309993028640747, "epoch": 1.4775641025641026, "grad_norm": 0.015554931946098804, "learning_rate": 1e-06, "loss": 0.0182, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.307483434677124, "epoch": 1.4791666666666667, "grad_norm": 0.0038423407822847366, "learning_rate": 1e-06, "loss": -0.0062, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30638065934181213, "epoch": 1.4807692307692308, "grad_norm": 0.0036329433787614107, "learning_rate": 1e-06, "loss": -0.0063, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12225.0, "completions/max_terminated_length": 12225.0, "completions/mean_length": 6971.83984375, "completions/mean_terminated_length": 6971.83984375, "completions/min_length": 3684.0, "completions/min_terminated_length": 3684.0, "entropy": 0.3044200539588928, "epoch": 1.482371794871795, "frac_reward_zero_std": 0.84375, "grad_norm": 0.00529886968433857, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 1188058277.0, "reward": 0.8922510147094727, "reward_std": 0.03099609725177288, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.993701159954071, "rewards/symbolic_reward_partial_score/std": 0.076689213514328, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0690126419067383, "sampling/importance_sampling_ratio/min": 0.002685052575543523, "sampling/sampling_logp_difference/max": 5.920054912567139, "sampling/sampling_logp_difference/mean": 0.11991654336452484, "step": 925 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30244874954223633, "epoch": 1.483974358974359, "grad_norm": 0.03052562102675438, "learning_rate": 1e-06, "loss": 0.0028, "step": 926 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3017626106739044, "epoch": 1.4855769230769231, "grad_norm": 0.005101877264678478, "learning_rate": 1e-06, "loss": 0.0135, "step": 927 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.30204281210899353, "epoch": 1.4871794871794872, "grad_norm": 0.004146016668528318, "learning_rate": 1e-06, "loss": -0.0054, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12020.0, "completions/mean_length": 6972.96484375, "completions/mean_terminated_length": 6954.5478515625, "completions/min_length": 2702.0, "completions/min_terminated_length": 2702.0, "entropy": 0.3108600080013275, "epoch": 1.4887820512820513, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004198870155960321, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 1192476643.0, "reward": 0.8955957293510437, "reward_std": 0.01684584841132164, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9976887702941895, "rewards/symbolic_reward_partial_score/std": 0.0445505827665329, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0708767175674438, "sampling/importance_sampling_ratio/min": 0.0016421438194811344, "sampling/sampling_logp_difference/max": 6.411752700805664, "sampling/sampling_logp_difference/mean": 0.12295570969581604, "step": 929 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3109143078327179, "epoch": 1.4903846153846154, "grad_norm": 0.004327795933932066, "learning_rate": 1e-06, "loss": -0.009, "step": 930 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3126075565814972, "epoch": 1.4919871794871795, "grad_norm": 0.003698945278301835, "learning_rate": 1e-06, "loss": -0.0132, "step": 931 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31349894404411316, "epoch": 1.4935897435897436, "grad_norm": 0.0040559847839176655, "learning_rate": 1e-06, "loss": 0.0121, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12509.0, "completions/max_terminated_length": 12509.0, "completions/mean_length": 6793.474609375, "completions/mean_terminated_length": 6793.474609375, "completions/min_length": 3194.0, "completions/min_terminated_length": 3194.0, "entropy": 0.30804505944252014, "epoch": 1.4951923076923077, "frac_reward_zero_std": 0.90625, "grad_norm": 0.028041956946253777, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 1196736406.0, "reward": 0.8957666754722595, "reward_std": 0.016933593899011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.997607409954071, "rewards/symbolic_reward_partial_score/std": 0.04473654553294182, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07077956199646, "sampling/importance_sampling_ratio/min": 0.0015226604882627726, "sampling/sampling_logp_difference/max": 6.487296104431152, "sampling/sampling_logp_difference/mean": 0.1224517673254013, "step": 933 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30982083082199097, "epoch": 1.4967948717948718, "grad_norm": 0.027506569400429726, "learning_rate": 1e-06, "loss": 0.0069, "step": 934 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31032316386699677, "epoch": 1.498397435897436, "grad_norm": 0.0033966144546866417, "learning_rate": 1e-06, "loss": -0.0113, "step": 935 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30926352739334106, "epoch": 1.5, "grad_norm": 0.024733208119869232, "learning_rate": 1e-06, "loss": 0.0028, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11656.0, "completions/max_terminated_length": 11656.0, "completions/mean_length": 6906.4140625, "completions/mean_terminated_length": 6906.4140625, "completions/min_length": 3048.0, "completions/min_terminated_length": 3048.0, "entropy": 0.3099443316459656, "epoch": 1.501602564102564, "frac_reward_zero_std": 0.96875, "grad_norm": 0.025104807689785957, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 1201122746.0, "reward": 0.8987793326377869, "reward_std": 0.0048828125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07002854347229, "sampling/importance_sampling_ratio/min": 0.0023560093250125647, "sampling/sampling_logp_difference/max": 6.050786018371582, "sampling/sampling_logp_difference/mean": 0.12201914191246033, "step": 937 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30576586723327637, "epoch": 1.5032051282051282, "grad_norm": 0.0017299925675615668, "learning_rate": 1e-06, "loss": -0.0043, "step": 938 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30867621302604675, "epoch": 1.5048076923076923, "grad_norm": 0.0017178656999021769, "learning_rate": 1e-06, "loss": -0.0045, "step": 939 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31071707606315613, "epoch": 1.5064102564102564, "grad_norm": 0.0020263735204935074, "learning_rate": 1e-06, "loss": -0.0047, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12147.0, "completions/max_terminated_length": 12147.0, "completions/mean_length": 7048.5234375, "completions/mean_terminated_length": 7048.5234375, "completions/min_length": 2555.0, "completions/min_terminated_length": 2555.0, "entropy": 0.30272747576236725, "epoch": 1.5080128205128205, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034703791607171297, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 1205583014.0, "reward": 0.8975098133087158, "reward_std": 0.00996093824505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.008228649385273457, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0694454908370972, "sampling/importance_sampling_ratio/min": 0.0011049419408664107, "sampling/sampling_logp_difference/max": 6.807962417602539, "sampling/sampling_logp_difference/mean": 0.12039151787757874, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3034730553627014, "epoch": 1.5096153846153846, "grad_norm": 0.003563554957509041, "learning_rate": 1e-06, "loss": -0.0098, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30564577877521515, "epoch": 1.5112179487179487, "grad_norm": 0.0264495387673378, "learning_rate": 1e-06, "loss": 0.0068, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3048667460680008, "epoch": 1.5128205128205128, "grad_norm": 0.028183914721012115, "learning_rate": 1e-06, "loss": 0.0111, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11975.0, "completions/max_terminated_length": 11975.0, "completions/mean_length": 7280.6484375, "completions/mean_terminated_length": 7280.6484375, "completions/min_length": 2948.0, "completions/min_terminated_length": 2948.0, "entropy": 0.2988281399011612, "epoch": 1.5144230769230769, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004710206296294928, "learning_rate": 1e-06, "loss": -0.0147, "num_tokens": 1210179298.0, "reward": 0.8951172232627869, "reward_std": 0.01643681712448597, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0679197311401367, "sampling/importance_sampling_ratio/min": 0.0007556354394182563, "sampling/sampling_logp_difference/max": 7.187951564788818, "sampling/sampling_logp_difference/mean": 0.11771701276302338, "step": 945 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2950664162635803, "epoch": 1.516025641025641, "grad_norm": 0.00515699153766036, "learning_rate": 1e-06, "loss": 0.032, "step": 946 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29714447259902954, "epoch": 1.5176282051282053, "grad_norm": 0.004459913820028305, "learning_rate": 1e-06, "loss": -0.0161, "step": 947 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.297834575176239, "epoch": 1.5192307692307692, "grad_norm": 0.00498380558565259, "learning_rate": 1e-06, "loss": -0.0013, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12741.0, "completions/max_terminated_length": 12741.0, "completions/mean_length": 7402.451171875, "completions/mean_terminated_length": 7402.451171875, "completions/min_length": 3591.0, "completions/min_terminated_length": 3591.0, "entropy": 0.296127051115036, "epoch": 1.5208333333333335, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004056466277688742, "learning_rate": 1e-06, "loss": -0.0066, "num_tokens": 1214899529.0, "reward": 0.8970117568969727, "reward_std": 0.008309577591717243, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.997851550579071, "rewards/symbolic_reward_partial_score/std": 0.0444059856235981, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068399429321289, "sampling/importance_sampling_ratio/min": 0.0013273690128698945, "sampling/sampling_logp_difference/max": 6.624556541442871, "sampling/sampling_logp_difference/mean": 0.11876702308654785, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30057577788829803, "epoch": 1.5224358974358974, "grad_norm": 0.003635054687038064, "learning_rate": 1e-06, "loss": 0.0065, "step": 950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3005140721797943, "epoch": 1.5240384615384617, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0058, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30061452090740204, "epoch": 1.5256410256410255, "grad_norm": 0.014728136360645294, "learning_rate": 1e-06, "loss": 0.0044, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12208.0, "completions/max_terminated_length": 12208.0, "completions/mean_length": 7126.783203125, "completions/mean_terminated_length": 7126.783203125, "completions/min_length": 2316.0, "completions/min_terminated_length": 2316.0, "entropy": 0.30513474345207214, "epoch": 1.5272435897435899, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002268366049975157, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 1219390618.0, "reward": 0.8987793326377869, "reward_std": 0.004882812965661287, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0697507858276367, "sampling/importance_sampling_ratio/min": 0.0018025912577286363, "sampling/sampling_logp_difference/max": 6.318530082702637, "sampling/sampling_logp_difference/mean": 0.12075302749872208, "step": 953 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30484290421009064, "epoch": 1.5288461538461537, "grad_norm": 0.0017726526129990816, "learning_rate": 1e-06, "loss": -0.004, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30700714886188507, "epoch": 1.530448717948718, "grad_norm": 0.002408325904980302, "learning_rate": 1e-06, "loss": -0.0043, "step": 955 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30840322375297546, "epoch": 1.532051282051282, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0142, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13056.0, "completions/mean_length": 7702.8828125, "completions/mean_terminated_length": 7685.89404296875, "completions/min_length": 3550.0, "completions/min_terminated_length": 3550.0, "entropy": 0.3104442358016968, "epoch": 1.5336538461538463, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004158318508416414, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 1224167902.0, "reward": 0.8956055045127869, "reward_std": 0.01448369212448597, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701613426208496, "sampling/importance_sampling_ratio/min": 1.4257681701934644e-09, "sampling/sampling_logp_difference/max": 20.368555068969727, "sampling/sampling_logp_difference/mean": 0.12146088480949402, "step": 957 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3077358156442642, "epoch": 1.5352564102564101, "grad_norm": 0.004114874638617039, "learning_rate": 1e-06, "loss": 0.022, "step": 958 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30623532831668854, "epoch": 1.5368589743589745, "grad_norm": 0.001660304144024849, "learning_rate": 1e-06, "loss": -0.0103, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30911290645599365, "epoch": 1.5384615384615383, "grad_norm": 0.003970026969909668, "learning_rate": 1e-06, "loss": 0.0053, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12576.0, "completions/mean_length": 7168.396484375, "completions/mean_terminated_length": 7150.36181640625, "completions/min_length": 2494.0, "completions/min_terminated_length": 2494.0, "entropy": 0.30951225757598877, "epoch": 1.5400641025641026, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03025236912071705, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 1228666761.0, "reward": 0.8961328268051147, "reward_std": 0.015468751080334187, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9994791746139526, "rewards/symbolic_reward_partial_score/std": 0.006817440502345562, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0703372955322266, "sampling/importance_sampling_ratio/min": 0.0028708188328891993, "sampling/sampling_logp_difference/max": 5.853157997131348, "sampling/sampling_logp_difference/mean": 0.12191639095544815, "step": 961 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3105301558971405, "epoch": 1.5416666666666665, "grad_norm": 0.00395960221067071, "learning_rate": 1e-06, "loss": 0.0362, "step": 962 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30917251110076904, "epoch": 1.5432692307692308, "grad_norm": 0.003832736052572727, "learning_rate": 1e-06, "loss": -0.0135, "step": 963 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3083617687225342, "epoch": 1.5448717948717947, "grad_norm": 0.003537870245054364, "learning_rate": 1e-06, "loss": -0.0129, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12432.0, "completions/max_terminated_length": 12432.0, "completions/mean_length": 7791.91796875, "completions/mean_terminated_length": 7791.91796875, "completions/min_length": 4424.0, "completions/min_terminated_length": 4424.0, "entropy": 0.2928401529788971, "epoch": 1.546474358974359, "frac_reward_zero_std": 0.875, "grad_norm": 0.005985740572214127, "learning_rate": 1e-06, "loss": -0.0197, "num_tokens": 1233547311.0, "reward": 0.89306640625, "reward_std": 0.02463994361460209, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9964193105697632, "rewards/symbolic_reward_partial_score/std": 0.05014854669570923, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0672111511230469, "sampling/importance_sampling_ratio/min": 0.0024554759729653597, "sampling/sampling_logp_difference/max": 6.009434700012207, "sampling/sampling_logp_difference/mean": 0.11611609160900116, "step": 965 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29419833421707153, "epoch": 1.5480769230769231, "grad_norm": 0.019973278045654297, "learning_rate": 1e-06, "loss": -0.0069, "step": 966 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2934655100107193, "epoch": 1.5496794871794872, "grad_norm": 0.00576686579734087, "learning_rate": 1e-06, "loss": -0.0064, "step": 967 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.29317569732666016, "epoch": 1.5512820512820513, "grad_norm": 0.03426339477300644, "learning_rate": 1e-06, "loss": 0.0292, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12050.0, "completions/max_terminated_length": 12050.0, "completions/mean_length": 7161.708984375, "completions/mean_terminated_length": 7161.708984375, "completions/min_length": 2797.0, "completions/min_terminated_length": 2797.0, "entropy": 0.3102886974811554, "epoch": 1.5528846153846154, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0047947121784091, "learning_rate": 1e-06, "loss": -0.0141, "num_tokens": 1238009850.0, "reward": 0.8939453363418579, "reward_std": 0.021000541746616364, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9954426884651184, "rewards/symbolic_reward_partial_score/std": 0.06325981765985489, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706758499145508, "sampling/importance_sampling_ratio/min": 0.0009838519617915154, "sampling/sampling_logp_difference/max": 6.92403507232666, "sampling/sampling_logp_difference/mean": 0.12199265509843826, "step": 969 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31009647250175476, "epoch": 1.5544871794871795, "grad_norm": 0.02185901068150997, "learning_rate": 1e-06, "loss": 0.011, "step": 970 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30998343229293823, "epoch": 1.5560897435897436, "grad_norm": 0.005330664571374655, "learning_rate": 1e-06, "loss": 0.0082, "step": 971 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30864378809928894, "epoch": 1.5576923076923077, "grad_norm": 0.004321873653680086, "learning_rate": 1e-06, "loss": -0.0145, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12500.0, "completions/max_terminated_length": 12500.0, "completions/mean_length": 7311.296875, "completions/mean_terminated_length": 7311.296875, "completions/min_length": 2664.0, "completions/min_terminated_length": 2664.0, "entropy": 0.2941237837076187, "epoch": 1.5592948717948718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1242603026.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0677077770233154, "sampling/importance_sampling_ratio/min": 0.002588698174804449, "sampling/sampling_logp_difference/max": 5.956600189208984, "sampling/sampling_logp_difference/mean": 0.11709814518690109, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29679691791534424, "epoch": 1.560897435897436, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2933846265077591, "epoch": 1.5625, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29794517159461975, "epoch": 1.564102564102564, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12324.0, "completions/max_terminated_length": 12324.0, "completions/mean_length": 7672.275390625, "completions/mean_terminated_length": 7672.275390625, "completions/min_length": 2624.0, "completions/min_terminated_length": 2624.0, "entropy": 0.2908579856157303, "epoch": 1.5657051282051282, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002392068738117814, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 1247454351.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066673994064331, "sampling/importance_sampling_ratio/min": 0.0019470222759991884, "sampling/sampling_logp_difference/max": 6.241454124450684, "sampling/sampling_logp_difference/mean": 0.1153443232178688, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28915299475193024, "epoch": 1.5673076923076923, "grad_norm": 0.0020979184191673994, "learning_rate": 1e-06, "loss": -0.0041, "step": 978 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29180823266506195, "epoch": 1.5689102564102564, "grad_norm": 0.0016745133325457573, "learning_rate": 1e-06, "loss": -0.0046, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2878534197807312, "epoch": 1.5705128205128205, "grad_norm": 0.023565437644720078, "learning_rate": 1e-06, "loss": 0.0117, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13424.0, "completions/max_terminated_length": 13424.0, "completions/mean_length": 7492.591796875, "completions/mean_terminated_length": 7492.591796875, "completions/min_length": 3296.0, "completions/min_terminated_length": 3296.0, "entropy": 0.302181601524353, "epoch": 1.5721153846153846, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004688016138970852, "learning_rate": 1e-06, "loss": -0.0118, "num_tokens": 1252190958.0, "reward": 0.8957812786102295, "reward_std": 0.01375581230968237, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9976562261581421, "rewards/symbolic_reward_partial_score/std": 0.044615939259529114, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0691838264465332, "sampling/importance_sampling_ratio/min": 0.0014014577027410269, "sampling/sampling_logp_difference/max": 6.570242404937744, "sampling/sampling_logp_difference/mean": 0.11948972940444946, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30064649879932404, "epoch": 1.5737179487179487, "grad_norm": 0.004267314448952675, "learning_rate": 1e-06, "loss": -0.0106, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30243119597435, "epoch": 1.5753205128205128, "grad_norm": 0.003907489590346813, "learning_rate": 1e-06, "loss": 0.0174, "step": 983 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30112794041633606, "epoch": 1.5769230769230769, "grad_norm": 0.0035790693946182728, "learning_rate": 1e-06, "loss": 0.006, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14669.0, "completions/mean_length": 7357.505859375, "completions/mean_terminated_length": 7339.84130859375, "completions/min_length": 3741.0, "completions/min_terminated_length": 3741.0, "entropy": 0.3018489480018616, "epoch": 1.578525641025641, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003833780298009515, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 1256820097.0, "reward": 0.8965234756469727, "reward_std": 0.013906250707805157, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.996874988079071, "rewards/symbolic_reward_partial_score/std": 0.051494304090738297, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0690256357192993, "sampling/importance_sampling_ratio/min": 0.00016220644465647638, "sampling/sampling_logp_difference/max": 8.726640701293945, "sampling/sampling_logp_difference/mean": 0.11971522867679596, "step": 985 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30244719982147217, "epoch": 1.5801282051282053, "grad_norm": 0.0026512236800044775, "learning_rate": 1e-06, "loss": 0.0229, "step": 986 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30103157460689545, "epoch": 1.5817307692307692, "grad_norm": 0.0024182642810046673, "learning_rate": 1e-06, "loss": 0.0206, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30128341913223267, "epoch": 1.5833333333333335, "grad_norm": 0.00353293027728796, "learning_rate": 1e-06, "loss": -0.009, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11739.0, "completions/max_terminated_length": 11739.0, "completions/mean_length": 7189.322265625, "completions/mean_terminated_length": 7189.322265625, "completions/min_length": 3961.0, "completions/min_terminated_length": 3961.0, "entropy": 0.308343768119812, "epoch": 1.5849358974358974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1261283766.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706350803375244, "sampling/importance_sampling_ratio/min": 0.0024685210082679987, "sampling/sampling_logp_difference/max": 6.004136085510254, "sampling/sampling_logp_difference/mean": 0.12226317822933197, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30838756263256073, "epoch": 1.5865384615384617, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3085535615682602, "epoch": 1.5881410256410255, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30801253020763397, "epoch": 1.5897435897435899, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14298.0, "completions/max_terminated_length": 14298.0, "completions/mean_length": 7401.361328125, "completions/mean_terminated_length": 7401.361328125, "completions/min_length": 3344.0, "completions/min_terminated_length": 3344.0, "entropy": 0.30265168845653534, "epoch": 1.5913461538461537, "frac_reward_zero_std": 0.9375, "grad_norm": 0.016912033781409264, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 1265987423.0, "reward": 0.8945801258087158, "reward_std": 0.014904549345374107, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.044628970324993134, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696299076080322, "sampling/importance_sampling_ratio/min": 0.0019413894042372704, "sampling/sampling_logp_difference/max": 6.244351387023926, "sampling/sampling_logp_difference/mean": 0.12015168368816376, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3008352816104889, "epoch": 1.592948717948718, "grad_norm": 0.005295754410326481, "learning_rate": 1e-06, "loss": -0.0123, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30222633481025696, "epoch": 1.594551282051282, "grad_norm": 0.014241804368793964, "learning_rate": 1e-06, "loss": 0.0207, "step": 995 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30317962169647217, "epoch": 1.5961538461538463, "grad_norm": 0.003911672160029411, "learning_rate": 1e-06, "loss": -0.0128, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14579.0, "completions/max_terminated_length": 14579.0, "completions/mean_length": 7341.966796875, "completions/mean_terminated_length": 7341.966796875, "completions/min_length": 3543.0, "completions/min_terminated_length": 3543.0, "entropy": 0.3111606240272522, "epoch": 1.5977564102564101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1270548686.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0711815357208252, "sampling/importance_sampling_ratio/min": 1.5573759729742287e-12, "sampling/sampling_logp_difference/max": 27.188018798828125, "sampling/sampling_logp_difference/mean": 0.12292777001857758, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3086211085319519, "epoch": 1.5993589743589745, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30883246660232544, "epoch": 1.6009615384615383, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31077563762664795, "epoch": 1.6025641025641026, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12309.0, "completions/max_terminated_length": 12309.0, "completions/mean_length": 7422.494140625, "completions/mean_terminated_length": 7422.494140625, "completions/min_length": 3676.0, "completions/min_terminated_length": 3676.0, "entropy": 0.2989673614501953, "epoch": 1.6041666666666665, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002359275473281741, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 1275273659.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0692410469055176, "sampling/importance_sampling_ratio/min": 0.002691791858524084, "sampling/sampling_logp_difference/max": 5.917548179626465, "sampling/sampling_logp_difference/mean": 0.11995130777359009, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30219927430152893, "epoch": 1.6057692307692308, "grad_norm": 0.0021961370948702097, "learning_rate": 1e-06, "loss": 0.0098, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.301533043384552, "epoch": 1.6073717948717947, "grad_norm": 0.0021459702402353287, "learning_rate": 1e-06, "loss": -0.0035, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3046448081731796, "epoch": 1.608974358974359, "grad_norm": 0.002298478502780199, "learning_rate": 1e-06, "loss": -0.0039, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12258.0, "completions/max_terminated_length": 12258.0, "completions/mean_length": 7409.1484375, "completions/mean_terminated_length": 7409.1484375, "completions/min_length": 3620.0, "completions/min_terminated_length": 3620.0, "entropy": 0.305515393614769, "epoch": 1.6105769230769231, "frac_reward_zero_std": 0.875, "grad_norm": 0.006075072567909956, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 1279892007.0, "reward": 0.893310546875, "reward_std": 0.02366338111460209, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9972330927848816, "rewards/symbolic_reward_partial_score/std": 0.04513990134000778, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702705383300781, "sampling/importance_sampling_ratio/min": 9.090582153703508e-08, "sampling/sampling_logp_difference/max": 16.213441848754883, "sampling/sampling_logp_difference/mean": 0.12114111334085464, "step": 1005 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30623990297317505, "epoch": 1.6121794871794872, "grad_norm": 0.01973203755915165, "learning_rate": 1e-06, "loss": -0.0047, "step": 1006 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3060086518526077, "epoch": 1.6137820512820513, "grad_norm": 0.004733169451355934, "learning_rate": 1e-06, "loss": -0.0043, "step": 1007 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3062121272087097, "epoch": 1.6153846153846154, "grad_norm": 0.0320599228143692, "learning_rate": 1e-06, "loss": 0.0035, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13785.0, "completions/max_terminated_length": 13785.0, "completions/mean_length": 7285.33203125, "completions/mean_terminated_length": 7285.33203125, "completions/min_length": 2963.0, "completions/min_terminated_length": 2963.0, "entropy": 0.31654123961925507, "epoch": 1.6169871794871795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1284390481.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0720587968826294, "sampling/importance_sampling_ratio/min": 0.0013273663353174925, "sampling/sampling_logp_difference/max": 6.624558448791504, "sampling/sampling_logp_difference/mean": 0.12469251453876495, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31280288100242615, "epoch": 1.6185897435897436, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3145320415496826, "epoch": 1.6201923076923077, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3184860795736313, "epoch": 1.6217948717948718, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12617.0, "completions/max_terminated_length": 12617.0, "completions/mean_length": 7735.546875, "completions/mean_terminated_length": 7735.546875, "completions/min_length": 3578.0, "completions/min_terminated_length": 3578.0, "entropy": 0.30475133657455444, "epoch": 1.623397435897436, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004162474535405636, "learning_rate": 1e-06, "loss": -0.0123, "num_tokens": 1289199177.0, "reward": 0.8960742950439453, "reward_std": 0.015703124925494194, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9986327886581421, "rewards/symbolic_reward_partial_score/std": 0.022945649921894073, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0703797340393066, "sampling/importance_sampling_ratio/min": 0.0022015005815774202, "sampling/sampling_logp_difference/max": 6.118616104125977, "sampling/sampling_logp_difference/mean": 0.1216018795967102, "step": 1013 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30561186373233795, "epoch": 1.625, "grad_norm": 0.004154075402766466, "learning_rate": 1e-06, "loss": 0.0073, "step": 1014 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30855441093444824, "epoch": 1.626602564102564, "grad_norm": 0.00390764232724905, "learning_rate": 1e-06, "loss": -0.0126, "step": 1015 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30875104665756226, "epoch": 1.6282051282051282, "grad_norm": 0.0035740600433200598, "learning_rate": 1e-06, "loss": 0.0296, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13899.0, "completions/max_terminated_length": 13899.0, "completions/mean_length": 7892.951171875, "completions/mean_terminated_length": 7892.951171875, "completions/min_length": 2880.0, "completions/min_terminated_length": 2880.0, "entropy": 0.3002883791923523, "epoch": 1.6298076923076923, "frac_reward_zero_std": 0.875, "grad_norm": 0.0051344577223062515, "learning_rate": 1e-06, "loss": -0.0177, "num_tokens": 1294171984.0, "reward": 0.8945556879043579, "reward_std": 0.02177734486758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9974772334098816, "rewards/symbolic_reward_partial_score/std": 0.04481436312198639, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068955421447754, "sampling/importance_sampling_ratio/min": 0.00017753004794940352, "sampling/sampling_logp_difference/max": 8.636370658874512, "sampling/sampling_logp_difference/mean": 0.11876966059207916, "step": 1017 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30114126205444336, "epoch": 1.6314102564102564, "grad_norm": 0.025489378720521927, "learning_rate": 1e-06, "loss": 0.0132, "step": 1018 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29750263690948486, "epoch": 1.6330128205128205, "grad_norm": 0.004476260859519243, "learning_rate": 1e-06, "loss": -0.0006, "step": 1019 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2999611049890518, "epoch": 1.6346153846153846, "grad_norm": 0.03016292303800583, "learning_rate": 1e-06, "loss": 0.003, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12380.0, "completions/max_terminated_length": 12380.0, "completions/mean_length": 7600.552734375, "completions/mean_terminated_length": 7600.552734375, "completions/min_length": 3388.0, "completions/min_terminated_length": 3388.0, "entropy": 0.31947872042655945, "epoch": 1.6362179487179487, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0039420961402356625, "learning_rate": 1e-06, "loss": -0.0093, "num_tokens": 1298850939.0, "reward": 0.8975342512130737, "reward_std": 0.00986328162252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9995931386947632, "rewards/symbolic_reward_partial_score/std": 0.006633348762989044, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.072902798652649, "sampling/importance_sampling_ratio/min": 0.0011838971404358745, "sampling/sampling_logp_difference/max": 6.738943576812744, "sampling/sampling_logp_difference/mean": 0.1259896457195282, "step": 1021 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3202350288629532, "epoch": 1.6378205128205128, "grad_norm": 0.003697711741551757, "learning_rate": 1e-06, "loss": -0.0088, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3217895179986954, "epoch": 1.6394230769230769, "grad_norm": 0.027309000492095947, "learning_rate": 1e-06, "loss": 0.0269, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3194885402917862, "epoch": 1.641025641025641, "grad_norm": 0.0037369539495557547, "learning_rate": 1e-06, "loss": -0.0094, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14350.0, "completions/max_terminated_length": 14350.0, "completions/mean_length": 7857.98046875, "completions/mean_terminated_length": 7857.98046875, "completions/min_length": 4121.0, "completions/min_terminated_length": 4121.0, "entropy": 0.316010519862175, "epoch": 1.6426282051282053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1303731377.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07135808467865, "sampling/importance_sampling_ratio/min": 0.0011098433751612902, "sampling/sampling_logp_difference/max": 6.803536415100098, "sampling/sampling_logp_difference/mean": 0.12318795174360275, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3135620355606079, "epoch": 1.6442307692307692, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30700941383838654, "epoch": 1.6458333333333335, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3093920797109604, "epoch": 1.6474358974358974, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13836.0, "completions/max_terminated_length": 13836.0, "completions/mean_length": 7814.375, "completions/mean_terminated_length": 7814.375, "completions/min_length": 3011.0, "completions/min_terminated_length": 3011.0, "entropy": 0.3088514804840088, "epoch": 1.6490384615384617, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0025735602248460054, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 1308590833.0, "reward": 0.8987793326377869, "reward_std": 0.0048828125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0705108642578125, "sampling/importance_sampling_ratio/min": 0.003677040571346879, "sampling/sampling_logp_difference/max": 5.605647087097168, "sampling/sampling_logp_difference/mean": 0.12232539057731628, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3050430715084076, "epoch": 1.6506410256410255, "grad_norm": 0.026151379570364952, "learning_rate": 1e-06, "loss": 0.0156, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3086225986480713, "epoch": 1.6522435897435899, "grad_norm": 0.0025779418647289276, "learning_rate": 1e-06, "loss": -0.0047, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3103097528219223, "epoch": 1.6538461538461537, "grad_norm": 0.002719617448747158, "learning_rate": 1e-06, "loss": -0.0048, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14775.0, "completions/max_terminated_length": 14775.0, "completions/mean_length": 8091.33984375, "completions/mean_terminated_length": 8091.33984375, "completions/min_length": 4480.0, "completions/min_terminated_length": 4480.0, "entropy": 0.2884677052497864, "epoch": 1.655448717948718, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0037412128876894712, "learning_rate": 1e-06, "loss": -0.0101, "num_tokens": 1313699583.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996419548988342, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0662262439727783, "sampling/importance_sampling_ratio/min": 0.005839198362082243, "sampling/sampling_logp_difference/max": 5.143161773681641, "sampling/sampling_logp_difference/mean": 0.1148136556148529, "step": 1033 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28683267533779144, "epoch": 1.657051282051282, "grad_norm": 0.003717794781550765, "learning_rate": 1e-06, "loss": -0.0095, "step": 1034 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2895479053258896, "epoch": 1.6586538461538463, "grad_norm": 0.00363312684930861, "learning_rate": 1e-06, "loss": -0.0098, "step": 1035 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28496386110782623, "epoch": 1.6602564102564101, "grad_norm": 0.027082180604338646, "learning_rate": 1e-06, "loss": 0.029, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 11484.0, "completions/max_terminated_length": 11484.0, "completions/mean_length": 7448.791015625, "completions/mean_terminated_length": 7448.791015625, "completions/min_length": 3219.0, "completions/min_terminated_length": 3219.0, "entropy": 0.3116293400526047, "epoch": 1.6618589743589745, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0021872571669518948, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 1318367204.0, "reward": 0.8987696170806885, "reward_std": 0.004921874962747097, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.999804675579071, "rewards/symbolic_reward_partial_score/std": 0.004419418517500162, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0708409547805786, "sampling/importance_sampling_ratio/min": 0.0022659434471279383, "sampling/sampling_logp_difference/max": 6.08976411819458, "sampling/sampling_logp_difference/mean": 0.12293624877929688, "step": 1037 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30697862803936005, "epoch": 1.6634615384615383, "grad_norm": 0.0015544932102784514, "learning_rate": 1e-06, "loss": -0.0036, "step": 1038 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31002984941005707, "epoch": 1.6650641025641026, "grad_norm": 0.023099038749933243, "learning_rate": 1e-06, "loss": 0.0098, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31141215562820435, "epoch": 1.6666666666666665, "grad_norm": 0.0024219758342951536, "learning_rate": 1e-06, "loss": -0.0038, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12306.0, "completions/max_terminated_length": 12306.0, "completions/mean_length": 7141.751953125, "completions/mean_terminated_length": 7141.751953125, "completions/min_length": 3117.0, "completions/min_terminated_length": 3117.0, "entropy": 0.3169794976711273, "epoch": 1.6682692307692308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1322892181.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0715605020523071, "sampling/importance_sampling_ratio/min": 0.0019754667300730944, "sampling/sampling_logp_difference/max": 6.226950645446777, "sampling/sampling_logp_difference/mean": 0.12364828586578369, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31208787858486176, "epoch": 1.6698717948717947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3129581958055496, "epoch": 1.671474358974359, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31092701852321625, "epoch": 1.6730769230769231, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13112.0, "completions/max_terminated_length": 13112.0, "completions/mean_length": 7309.1953125, "completions/mean_terminated_length": 7309.1953125, "completions/min_length": 3182.0, "completions/min_terminated_length": 3182.0, "entropy": 0.2980504482984543, "epoch": 1.6746794871794872, "frac_reward_zero_std": 0.8125, "grad_norm": 0.005616697948426008, "learning_rate": 1e-06, "loss": -0.0244, "num_tokens": 1327557033.0, "reward": 0.8921875357627869, "reward_std": 0.0312500037252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.9973958730697632, "rewards/symbolic_reward_partial_score/std": 0.0320318341255188, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0682005882263184, "sampling/importance_sampling_ratio/min": 0.001380621804855764, "sampling/sampling_logp_difference/max": 6.585221290588379, "sampling/sampling_logp_difference/mean": 0.1181633397936821, "step": 1045 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2949713468551636, "epoch": 1.6762820512820513, "grad_norm": 0.035627998411655426, "learning_rate": 1e-06, "loss": 0.0373, "step": 1046 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.3000248521566391, "epoch": 1.6778846153846154, "grad_norm": 0.004616168327629566, "learning_rate": 1e-06, "loss": -0.0122, "step": 1047 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2983495146036148, "epoch": 1.6794871794871795, "grad_norm": 0.005554524250328541, "learning_rate": 1e-06, "loss": -0.0077, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12885.0, "completions/max_terminated_length": 12885.0, "completions/mean_length": 7191.85546875, "completions/mean_terminated_length": 7191.85546875, "completions/min_length": 3008.0, "completions/min_terminated_length": 3008.0, "entropy": 0.3059154152870178, "epoch": 1.6810897435897436, "frac_reward_zero_std": 0.875, "grad_norm": 0.02501201070845127, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 1332076111.0, "reward": 0.89404296875, "reward_std": 0.02382812649011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9957682490348816, "rewards/symbolic_reward_partial_score/std": 0.0626349002122879, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069615125656128, "sampling/importance_sampling_ratio/min": 0.002162045333534479, "sampling/sampling_logp_difference/max": 6.136700630187988, "sampling/sampling_logp_difference/mean": 0.12109000235795975, "step": 1049 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30635979771614075, "epoch": 1.6826923076923077, "grad_norm": 0.004613403230905533, "learning_rate": 1e-06, "loss": -0.0175, "step": 1050 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30721524357795715, "epoch": 1.6842948717948718, "grad_norm": 0.025122642517089844, "learning_rate": 1e-06, "loss": 0.0025, "step": 1051 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.303785964846611, "epoch": 1.685897435897436, "grad_norm": 0.026602037250995636, "learning_rate": 1e-06, "loss": 0.0249, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12410.0, "completions/max_terminated_length": 12410.0, "completions/mean_length": 7009.740234375, "completions/mean_terminated_length": 7009.740234375, "completions/min_length": 3039.0, "completions/min_terminated_length": 3039.0, "entropy": 0.30636970698833466, "epoch": 1.6875, "frac_reward_zero_std": 0.875, "grad_norm": 0.0045735533349215984, "learning_rate": 1e-06, "loss": -0.0164, "num_tokens": 1336534170.0, "reward": 0.893505871295929, "reward_std": 0.02597656473517418, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9939779043197632, "rewards/symbolic_reward_partial_score/std": 0.07647283375263214, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696215629577637, "sampling/importance_sampling_ratio/min": 0.0018621934577822685, "sampling/sampling_logp_difference/max": 6.2860002517700195, "sampling/sampling_logp_difference/mean": 0.1207767054438591, "step": 1053 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3046419471502304, "epoch": 1.689102564102564, "grad_norm": 0.0311747919768095, "learning_rate": 1e-06, "loss": 0.0121, "step": 1054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3092672973871231, "epoch": 1.6907051282051282, "grad_norm": 0.026585765182971954, "learning_rate": 1e-06, "loss": 0.0187, "step": 1055 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3052953779697418, "epoch": 1.6923076923076923, "grad_norm": 0.004019228275865316, "learning_rate": 1e-06, "loss": -0.0155, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12460.0, "completions/max_terminated_length": 12460.0, "completions/mean_length": 7024.05078125, "completions/mean_terminated_length": 7024.05078125, "completions/min_length": 3136.0, "completions/min_terminated_length": 3136.0, "entropy": 0.31111109256744385, "epoch": 1.6939102564102564, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0032420039642602205, "learning_rate": 1e-06, "loss": -0.0079, "num_tokens": 1340954756.0, "reward": 0.8969531059265137, "reward_std": 0.012187501415610313, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9976562261581421, "rewards/symbolic_reward_partial_score/std": 0.0450524240732193, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706874132156372, "sampling/importance_sampling_ratio/min": 0.0010910998098552227, "sampling/sampling_logp_difference/max": 6.820569038391113, "sampling/sampling_logp_difference/mean": 0.12288743257522583, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30891941487789154, "epoch": 1.6955128205128205, "grad_norm": 0.003183747874572873, "learning_rate": 1e-06, "loss": 0.0228, "step": 1058 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3126651793718338, "epoch": 1.6971153846153846, "grad_norm": 0.0023205929901450872, "learning_rate": 1e-06, "loss": -0.0077, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3115437924861908, "epoch": 1.6987179487179487, "grad_norm": 0.0032985415309667587, "learning_rate": 1e-06, "loss": -0.0089, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12176.0, "completions/max_terminated_length": 12176.0, "completions/mean_length": 7136.126953125, "completions/mean_terminated_length": 7136.126953125, "completions/min_length": 3152.0, "completions/min_terminated_length": 3152.0, "entropy": 0.31162822246551514, "epoch": 1.7003205128205128, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024950928054749966, "learning_rate": 1e-06, "loss": -0.0047, "num_tokens": 1345428069.0, "reward": 0.8987793326377869, "reward_std": 0.0048828125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0699987411499023, "sampling/importance_sampling_ratio/min": 0.001176500809378922, "sampling/sampling_logp_difference/max": 6.745210647583008, "sampling/sampling_logp_difference/mean": 0.12207436561584473, "step": 1061 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30960360169410706, "epoch": 1.7019230769230769, "grad_norm": 0.0017172261141240597, "learning_rate": 1e-06, "loss": -0.0045, "step": 1062 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3100372850894928, "epoch": 1.703525641025641, "grad_norm": 0.0024772342294454575, "learning_rate": 1e-06, "loss": -0.0048, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31119243800640106, "epoch": 1.7051282051282053, "grad_norm": 0.02614988386631012, "learning_rate": 1e-06, "loss": 0.0144, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12824.0, "completions/max_terminated_length": 12824.0, "completions/mean_length": 7220.416015625, "completions/mean_terminated_length": 7220.416015625, "completions/min_length": 2008.0, "completions/min_terminated_length": 2008.0, "entropy": 0.31035639345645905, "epoch": 1.7067307692307692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1349993530.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069838047027588, "sampling/importance_sampling_ratio/min": 0.00014011569146532565, "sampling/sampling_logp_difference/max": 8.873042106628418, "sampling/sampling_logp_difference/mean": 0.12209658324718475, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30622316896915436, "epoch": 1.7083333333333335, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3089989274740219, "epoch": 1.7099358974358974, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3127421587705612, "epoch": 1.7115384615384617, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12989.0, "completions/max_terminated_length": 12989.0, "completions/mean_length": 7588.2265625, "completions/mean_terminated_length": 7588.2265625, "completions/min_length": 4223.0, "completions/min_terminated_length": 4223.0, "entropy": 0.3054191470146179, "epoch": 1.7131410256410255, "frac_reward_zero_std": 0.9375, "grad_norm": 0.018871258944272995, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 1354722926.0, "reward": 0.8963379263877869, "reward_std": 0.01155400462448597, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0694046020507812, "sampling/importance_sampling_ratio/min": 0.0024460311979055405, "sampling/sampling_logp_difference/max": 6.013288497924805, "sampling/sampling_logp_difference/mean": 0.12116353213787079, "step": 1069 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3093167692422867, "epoch": 1.7147435897435899, "grad_norm": 0.0049339476972818375, "learning_rate": 1e-06, "loss": -0.0122, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30434533953666687, "epoch": 1.7163461538461537, "grad_norm": 0.004389526788145304, "learning_rate": 1e-06, "loss": -0.0122, "step": 1071 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30614908039569855, "epoch": 1.717948717948718, "grad_norm": 0.0026515277568250895, "learning_rate": 1e-06, "loss": 0.0261, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12732.0, "completions/max_terminated_length": 12732.0, "completions/mean_length": 7372.23828125, "completions/mean_terminated_length": 7372.23828125, "completions/min_length": 2964.0, "completions/min_terminated_length": 2964.0, "entropy": 0.31145814061164856, "epoch": 1.719551282051282, "frac_reward_zero_std": 0.875, "grad_norm": 0.006173459812998772, "learning_rate": 1e-06, "loss": -0.0209, "num_tokens": 1359454808.0, "reward": 0.8932129144668579, "reward_std": 0.02405400574207306, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9969075918197632, "rewards/symbolic_reward_partial_score/std": 0.046963535249233246, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696375370025635, "sampling/importance_sampling_ratio/min": 0.002213636413216591, "sampling/sampling_logp_difference/max": 6.113118648529053, "sampling/sampling_logp_difference/mean": 0.12133283913135529, "step": 1073 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3093886077404022, "epoch": 1.7211538461538463, "grad_norm": 0.0051024542190134525, "learning_rate": 1e-06, "loss": -0.0022, "step": 1074 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3084931820631027, "epoch": 1.7227564102564101, "grad_norm": 0.02555413916707039, "learning_rate": 1e-06, "loss": 0.0442, "step": 1075 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30458875000476837, "epoch": 1.7243589743589745, "grad_norm": 0.005829751957207918, "learning_rate": 1e-06, "loss": -0.0204, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12915.0, "completions/max_terminated_length": 12915.0, "completions/mean_length": 7552.041015625, "completions/mean_terminated_length": 7552.041015625, "completions/min_length": 4131.0, "completions/min_terminated_length": 4131.0, "entropy": 0.3053240031003952, "epoch": 1.7259615384615383, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0045834193006157875, "learning_rate": 1e-06, "loss": -0.012, "num_tokens": 1364209533.0, "reward": 0.8963379263877869, "reward_std": 0.011554005555808544, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0692803859710693, "sampling/importance_sampling_ratio/min": 4.1937363448596443e-07, "sampling/sampling_logp_difference/max": 14.684503555297852, "sampling/sampling_logp_difference/mean": 0.12061621993780136, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30637161433696747, "epoch": 1.7275641025641026, "grad_norm": 0.004549259319901466, "learning_rate": 1e-06, "loss": -0.0116, "step": 1078 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3078131079673767, "epoch": 1.7291666666666665, "grad_norm": 0.003365602809935808, "learning_rate": 1e-06, "loss": 0.0011, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30383966863155365, "epoch": 1.7307692307692308, "grad_norm": 0.005219892133027315, "learning_rate": 1e-06, "loss": 0.0237, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13163.0, "completions/max_terminated_length": 13163.0, "completions/mean_length": 7803.697265625, "completions/mean_terminated_length": 7803.697265625, "completions/min_length": 3165.0, "completions/min_terminated_length": 3165.0, "entropy": 0.29921863973140717, "epoch": 1.7323717948717947, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0048124357126653194, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 1369126098.0, "reward": 0.8949414491653442, "reward_std": 0.017115186899900436, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9987630248069763, "rewards/symbolic_reward_partial_score/std": 0.01657147705554962, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0677688121795654, "sampling/importance_sampling_ratio/min": 0.0015285428380593657, "sampling/sampling_logp_difference/max": 6.483440399169922, "sampling/sampling_logp_difference/mean": 0.11848267912864685, "step": 1081 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29725104570388794, "epoch": 1.733974358974359, "grad_norm": 0.004948347806930542, "learning_rate": 1e-06, "loss": -0.0151, "step": 1082 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30296705663204193, "epoch": 1.7355769230769231, "grad_norm": 0.004423732403665781, "learning_rate": 1e-06, "loss": -0.0158, "step": 1083 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2992386966943741, "epoch": 1.7371794871794872, "grad_norm": 0.01645185425877571, "learning_rate": 1e-06, "loss": 0.007, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12400.0, "completions/mean_length": 7506.873046875, "completions/mean_terminated_length": 7489.5009765625, "completions/min_length": 2675.0, "completions/min_terminated_length": 2675.0, "entropy": 0.30802831053733826, "epoch": 1.7387820512820513, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005042756907641888, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 1373828897.0, "reward": 0.8933007717132568, "reward_std": 0.02315332740545273, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.993945300579071, "rewards/symbolic_reward_partial_score/std": 0.07650934904813766, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06916344165802, "sampling/importance_sampling_ratio/min": 0.002870687283575535, "sampling/sampling_logp_difference/max": 5.853203773498535, "sampling/sampling_logp_difference/mean": 0.12105688452720642, "step": 1085 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30676987767219543, "epoch": 1.7403846153846154, "grad_norm": 0.019612792879343033, "learning_rate": 1e-06, "loss": -0.0029, "step": 1086 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3068690150976181, "epoch": 1.7419871794871795, "grad_norm": 0.03385534510016441, "learning_rate": 1e-06, "loss": 0.0183, "step": 1087 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30766068398952484, "epoch": 1.7435897435897436, "grad_norm": 0.004236425273120403, "learning_rate": 1e-06, "loss": 0.0023, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13133.0, "completions/max_terminated_length": 13133.0, "completions/mean_length": 8049.69140625, "completions/mean_terminated_length": 8049.69140625, "completions/min_length": 3345.0, "completions/min_terminated_length": 3345.0, "entropy": 0.29996953904628754, "epoch": 1.7451923076923077, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003684076014906168, "learning_rate": 1e-06, "loss": -0.0095, "num_tokens": 1378946483.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067392349243164, "sampling/importance_sampling_ratio/min": 0.0015522823669016361, "sampling/sampling_logp_difference/max": 6.468029022216797, "sampling/sampling_logp_difference/mean": 0.11816658079624176, "step": 1089 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29776372015476227, "epoch": 1.7467948717948718, "grad_norm": 0.0024523658212274313, "learning_rate": 1e-06, "loss": -0.0091, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2962539494037628, "epoch": 1.748397435897436, "grad_norm": 0.029643870890140533, "learning_rate": 1e-06, "loss": 0.0333, "step": 1091 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3009512573480606, "epoch": 1.75, "grad_norm": 0.003705281764268875, "learning_rate": 1e-06, "loss": -0.0101, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13541.0, "completions/max_terminated_length": 13541.0, "completions/mean_length": 7528.818359375, "completions/mean_terminated_length": 7528.818359375, "completions/min_length": 2870.0, "completions/min_terminated_length": 2870.0, "entropy": 0.3087153285741806, "epoch": 1.751602564102564, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03856688365340233, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 1383655846.0, "reward": 0.8951172232627869, "reward_std": 0.01643681712448597, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069844365119934, "sampling/importance_sampling_ratio/min": 0.0015164552023634315, "sampling/sampling_logp_difference/max": 6.491379737854004, "sampling/sampling_logp_difference/mean": 0.12191909551620483, "step": 1093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3065723031759262, "epoch": 1.7532051282051282, "grad_norm": 0.004395030438899994, "learning_rate": 1e-06, "loss": -0.0046, "step": 1094 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31010180711746216, "epoch": 1.7548076923076923, "grad_norm": 0.005476230755448341, "learning_rate": 1e-06, "loss": -0.0161, "step": 1095 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3097592294216156, "epoch": 1.7564102564102564, "grad_norm": 0.00418106047436595, "learning_rate": 1e-06, "loss": -0.0023, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13614.0, "completions/max_terminated_length": 13614.0, "completions/mean_length": 7423.01953125, "completions/mean_terminated_length": 7423.01953125, "completions/min_length": 3172.0, "completions/min_terminated_length": 3172.0, "entropy": 0.3101639300584793, "epoch": 1.7580128205128205, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004919133614748716, "learning_rate": 1e-06, "loss": -0.0111, "num_tokens": 1388312768.0, "reward": 0.8963086605072021, "reward_std": 0.011646436527371407, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9994140863418579, "rewards/symbolic_reward_partial_score/std": 0.007639662828296423, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0697271823883057, "sampling/importance_sampling_ratio/min": 0.0010703522711992264, "sampling/sampling_logp_difference/max": 6.8397674560546875, "sampling/sampling_logp_difference/mean": 0.12227816879749298, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31154482066631317, "epoch": 1.7596153846153846, "grad_norm": 0.004658720921725035, "learning_rate": 1e-06, "loss": 0.0214, "step": 1098 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31060969829559326, "epoch": 1.7612179487179487, "grad_norm": 0.004687273874878883, "learning_rate": 1e-06, "loss": -0.0112, "step": 1099 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30701225996017456, "epoch": 1.7628205128205128, "grad_norm": 0.004379766061902046, "learning_rate": 1e-06, "loss": 0.001, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12550.0, "completions/max_terminated_length": 12550.0, "completions/mean_length": 7696.623046875, "completions/mean_terminated_length": 7696.623046875, "completions/min_length": 2813.0, "completions/min_terminated_length": 2813.0, "entropy": 0.30509281158447266, "epoch": 1.7644230769230769, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0035905460827052593, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 1393157807.0, "reward": 0.8975586295127869, "reward_std": 0.0066711921244859695, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0688618421554565, "sampling/importance_sampling_ratio/min": 0.002739542629569769, "sampling/sampling_logp_difference/max": 5.899964332580566, "sampling/sampling_logp_difference/mean": 0.1204448714852333, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30465711653232574, "epoch": 1.766025641025641, "grad_norm": 0.0038503403775393963, "learning_rate": 1e-06, "loss": -0.0069, "step": 1102 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30177219212055206, "epoch": 1.7676282051282053, "grad_norm": 0.00662583764642477, "learning_rate": 1e-06, "loss": -0.0069, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30572329461574554, "epoch": 1.7692307692307692, "grad_norm": 0.019112765789031982, "learning_rate": 1e-06, "loss": 0.0083, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12681.0, "completions/max_terminated_length": 12681.0, "completions/mean_length": 7622.841796875, "completions/mean_terminated_length": 7622.841796875, "completions/min_length": 3801.0, "completions/min_terminated_length": 3801.0, "entropy": 0.3011849373579025, "epoch": 1.7708333333333335, "frac_reward_zero_std": 0.90625, "grad_norm": 0.027583763003349304, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 1397982030.0, "reward": 0.8950977325439453, "reward_std": 0.016514942049980164, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9992838501930237, "rewards/symbolic_reward_partial_score/std": 0.00811202172189951, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0681089162826538, "sampling/importance_sampling_ratio/min": 0.0003649833088275045, "sampling/sampling_logp_difference/max": 7.915658950805664, "sampling/sampling_logp_difference/mean": 0.11929503828287125, "step": 1105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30475567281246185, "epoch": 1.7724358974358974, "grad_norm": 0.005002895370125771, "learning_rate": 1e-06, "loss": -0.016, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29859915375709534, "epoch": 1.7740384615384617, "grad_norm": 0.005493351258337498, "learning_rate": 1e-06, "loss": -0.0167, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3013754189014435, "epoch": 1.7756410256410255, "grad_norm": 0.02845832332968712, "learning_rate": 1e-06, "loss": 0.0172, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13094.0, "completions/max_terminated_length": 13094.0, "completions/mean_length": 7377.328125, "completions/mean_terminated_length": 7377.328125, "completions/min_length": 4239.0, "completions/min_terminated_length": 4239.0, "entropy": 0.30232779681682587, "epoch": 1.7772435897435899, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003389689838513732, "learning_rate": 1e-06, "loss": -0.0081, "num_tokens": 1402510950.0, "reward": 0.8975391387939453, "reward_std": 0.009843749925494194, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996093511581421, "rewards/symbolic_reward_partial_score/std": 0.006243883166462183, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069146752357483, "sampling/importance_sampling_ratio/min": 0.0019089938141405582, "sampling/sampling_logp_difference/max": 6.261178970336914, "sampling/sampling_logp_difference/mean": 0.12119454890489578, "step": 1109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30659207701683044, "epoch": 1.7788461538461537, "grad_norm": 0.002910124370828271, "learning_rate": 1e-06, "loss": 0.0114, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3085610866546631, "epoch": 1.780448717948718, "grad_norm": 0.003460104111582041, "learning_rate": 1e-06, "loss": 0.0081, "step": 1111 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30735933780670166, "epoch": 1.782051282051282, "grad_norm": 0.0027840950060635805, "learning_rate": 1e-06, "loss": -0.0078, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13833.0, "completions/max_terminated_length": 13833.0, "completions/mean_length": 7699.49609375, "completions/mean_terminated_length": 7699.49609375, "completions/min_length": 2247.0, "completions/min_terminated_length": 2247.0, "entropy": 0.3024802505970001, "epoch": 1.7836538461538463, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0052666435949504375, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 1407318500.0, "reward": 0.8945801258087158, "reward_std": 0.01858525536954403, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.044628970324993134, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0683372020721436, "sampling/importance_sampling_ratio/min": 0.00017096490773838013, "sampling/sampling_logp_difference/max": 8.674052238464355, "sampling/sampling_logp_difference/mean": 0.11985655128955841, "step": 1113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.30356265604496, "epoch": 1.7852564102564101, "grad_norm": 0.0042807371355593204, "learning_rate": 1e-06, "loss": 0.0029, "step": 1114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30395713448524475, "epoch": 1.7868589743589745, "grad_norm": 0.004926716443151236, "learning_rate": 1e-06, "loss": -0.001, "step": 1115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30027005076408386, "epoch": 1.7884615384615383, "grad_norm": 0.019763482734560966, "learning_rate": 1e-06, "loss": -0.0021, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13435.0, "completions/max_terminated_length": 13435.0, "completions/mean_length": 7422.830078125, "completions/mean_terminated_length": 7422.830078125, "completions/min_length": 2285.0, "completions/min_terminated_length": 2285.0, "entropy": 0.30064651370048523, "epoch": 1.7900641025641026, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004853361286222935, "learning_rate": 1e-06, "loss": -0.0149, "num_tokens": 1412002157.0, "reward": 0.8963282108306885, "reward_std": 0.014687499962747097, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9994791746139526, "rewards/symbolic_reward_partial_score/std": 0.006817440502345562, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0680367946624756, "sampling/importance_sampling_ratio/min": 1.495128998385553e-07, "sampling/sampling_logp_difference/max": 15.715883255004883, "sampling/sampling_logp_difference/mean": 0.11941249668598175, "step": 1117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30547034740448, "epoch": 1.7916666666666665, "grad_norm": 0.03301442041993141, "learning_rate": 1e-06, "loss": 0.0097, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3002014458179474, "epoch": 1.7932692307692308, "grad_norm": 0.030427923426032066, "learning_rate": 1e-06, "loss": 0.0106, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29985660314559937, "epoch": 1.7948717948717947, "grad_norm": 0.029749156907200813, "learning_rate": 1e-06, "loss": 0.0088, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12488.0, "completions/max_terminated_length": 12488.0, "completions/mean_length": 7279.498046875, "completions/mean_terminated_length": 7279.498046875, "completions/min_length": 3535.0, "completions/min_terminated_length": 3535.0, "entropy": 0.3116193860769272, "epoch": 1.796474358974359, "frac_reward_zero_std": 0.96875, "grad_norm": 0.002386681968346238, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 1416602988.0, "reward": 0.8982422351837158, "reward_std": 0.00703125074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.998046875, "rewards/symbolic_reward_partial_score/std": 0.04419417306780815, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702404975891113, "sampling/importance_sampling_ratio/min": 0.0015238322084769607, "sampling/sampling_logp_difference/max": 6.486526966094971, "sampling/sampling_logp_difference/mean": 0.12285487353801727, "step": 1121 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.311521053314209, "epoch": 1.7980769230769231, "grad_norm": 0.0017178966663777828, "learning_rate": 1e-06, "loss": -0.0039, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.31351254880428314, "epoch": 1.7996794871794872, "grad_norm": 0.002310063922777772, "learning_rate": 1e-06, "loss": 0.0122, "step": 1123 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3113110065460205, "epoch": 1.8012820512820513, "grad_norm": 0.0022971148137003183, "learning_rate": 1e-06, "loss": -0.0039, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12657.0, "completions/max_terminated_length": 12657.0, "completions/mean_length": 7098.353515625, "completions/mean_terminated_length": 7098.353515625, "completions/min_length": 2688.0, "completions/min_terminated_length": 2688.0, "entropy": 0.30783234536647797, "epoch": 1.8028846153846154, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005364755168557167, "learning_rate": 1e-06, "loss": -0.0181, "num_tokens": 1421106401.0, "reward": 0.8938770294189453, "reward_std": 0.018278565257787704, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9991210699081421, "rewards/symbolic_reward_partial_score/std": 0.008895767852663994, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069352388381958, "sampling/importance_sampling_ratio/min": 0.0006668087444268167, "sampling/sampling_logp_difference/max": 7.313007354736328, "sampling/sampling_logp_difference/mean": 0.12137231230735779, "step": 1125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30746787786483765, "epoch": 1.8044871794871795, "grad_norm": 0.0049836039543151855, "learning_rate": 1e-06, "loss": 0.0083, "step": 1126 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.307818204164505, "epoch": 1.8060897435897436, "grad_norm": 0.00515682203695178, "learning_rate": 1e-06, "loss": -0.0155, "step": 1127 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3116377890110016, "epoch": 1.8076923076923077, "grad_norm": 0.005574796348810196, "learning_rate": 1e-06, "loss": 0.0293, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12872.0, "completions/max_terminated_length": 12872.0, "completions/mean_length": 7302.146484375, "completions/mean_terminated_length": 7302.146484375, "completions/min_length": 3062.0, "completions/min_terminated_length": 3062.0, "entropy": 0.30744411051273346, "epoch": 1.8092948717948718, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004532939754426479, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 1425751324.0, "reward": 0.8945116996765137, "reward_std": 0.0183095782995224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9973307251930237, "rewards/symbolic_reward_partial_score/std": 0.04520295187830925, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06903076171875, "sampling/importance_sampling_ratio/min": 0.0014675172278657556, "sampling/sampling_logp_difference/max": 6.52418327331543, "sampling/sampling_logp_difference/mean": 0.12041410058736801, "step": 1129 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3038233667612076, "epoch": 1.810897435897436, "grad_norm": 0.004973679780960083, "learning_rate": 1e-06, "loss": -0.014, "step": 1130 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3052085041999817, "epoch": 1.8125, "grad_norm": 0.0037389693316072226, "learning_rate": 1e-06, "loss": 0.0152, "step": 1131 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3043749928474426, "epoch": 1.814102564102564, "grad_norm": 0.004087743815034628, "learning_rate": 1e-06, "loss": -0.0033, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15298.0, "completions/max_terminated_length": 15298.0, "completions/mean_length": 6917.4140625, "completions/mean_terminated_length": 6917.4140625, "completions/min_length": 3192.0, "completions/min_terminated_length": 3192.0, "entropy": 0.31978972256183624, "epoch": 1.8157051282051282, "frac_reward_zero_std": 0.90625, "grad_norm": 0.029717234894633293, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 1430061392.0, "reward": 0.8950879573822021, "reward_std": 0.016529249027371407, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9992513060569763, "rewards/symbolic_reward_partial_score/std": 0.008469752967357635, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071985125541687, "sampling/importance_sampling_ratio/min": 0.0019307893235236406, "sampling/sampling_logp_difference/max": 6.249826431274414, "sampling/sampling_logp_difference/mean": 0.12560206651687622, "step": 1133 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.32091666758060455, "epoch": 1.8173076923076923, "grad_norm": 0.01882363110780716, "learning_rate": 1e-06, "loss": -0.0033, "step": 1134 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.32024863362312317, "epoch": 1.8189102564102564, "grad_norm": 0.024035094305872917, "learning_rate": 1e-06, "loss": -0.0005, "step": 1135 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3210635632276535, "epoch": 1.8205128205128205, "grad_norm": 0.004419194534420967, "learning_rate": 1e-06, "loss": -0.0015, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12108.0, "completions/max_terminated_length": 12108.0, "completions/mean_length": 7406.19140625, "completions/mean_terminated_length": 7406.19140625, "completions/min_length": 3186.0, "completions/min_terminated_length": 3186.0, "entropy": 0.30747954547405243, "epoch": 1.8221153846153846, "frac_reward_zero_std": 0.9375, "grad_norm": 0.027421489357948303, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 1434730626.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996419548988342, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069690227508545, "sampling/importance_sampling_ratio/min": 0.001751437084749341, "sampling/sampling_logp_difference/max": 6.347318649291992, "sampling/sampling_logp_difference/mean": 0.12143737822771072, "step": 1137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3087593764066696, "epoch": 1.8237179487179487, "grad_norm": 0.0031265048310160637, "learning_rate": 1e-06, "loss": -0.0087, "step": 1138 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3095775842666626, "epoch": 1.8253205128205128, "grad_norm": 0.0032832089345902205, "learning_rate": 1e-06, "loss": 0.0071, "step": 1139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3101077526807785, "epoch": 1.8269230769230769, "grad_norm": 0.002986360341310501, "learning_rate": 1e-06, "loss": -0.0087, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13108.0, "completions/max_terminated_length": 13108.0, "completions/mean_length": 7286.09375, "completions/mean_terminated_length": 7286.09375, "completions/min_length": 3523.0, "completions/min_terminated_length": 3523.0, "entropy": 0.30785150825977325, "epoch": 1.828525641025641, "frac_reward_zero_std": 0.9375, "grad_norm": 0.004749401472508907, "learning_rate": 1e-06, "loss": -0.0117, "num_tokens": 1439318514.0, "reward": 0.8963379263877869, "reward_std": 0.01155400462448597, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.006366382818669081, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0699366331100464, "sampling/importance_sampling_ratio/min": 0.0016831329558044672, "sampling/sampling_logp_difference/max": 6.38709831237793, "sampling/sampling_logp_difference/mean": 0.12186126410961151, "step": 1141 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3084726333618164, "epoch": 1.8301282051282053, "grad_norm": 0.003292299574241042, "learning_rate": 1e-06, "loss": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3109244406223297, "epoch": 1.8317307692307692, "grad_norm": 0.004434341564774513, "learning_rate": 1e-06, "loss": -0.0107, "step": 1143 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.31032970547676086, "epoch": 1.8333333333333335, "grad_norm": 0.003546637948602438, "learning_rate": 1e-06, "loss": 0.0189, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13205.0, "completions/max_terminated_length": 13205.0, "completions/mean_length": 7583.61328125, "completions/mean_terminated_length": 7583.61328125, "completions/min_length": 4315.0, "completions/min_terminated_length": 4315.0, "entropy": 0.3044735789299011, "epoch": 1.8349358974358974, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0023911476600915194, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 1444095996.0, "reward": 0.8987793326377869, "reward_std": 0.004882812965661287, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.068371295928955, "sampling/importance_sampling_ratio/min": 0.0025255377404391766, "sampling/sampling_logp_difference/max": 5.981301307678223, "sampling/sampling_logp_difference/mean": 0.11961719393730164, "step": 1145 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3037119656801224, "epoch": 1.8365384615384617, "grad_norm": 0.0018107750220224261, "learning_rate": 1e-06, "loss": -0.0045, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2995622456073761, "epoch": 1.8381410256410255, "grad_norm": 0.0025102372746914625, "learning_rate": 1e-06, "loss": -0.0041, "step": 1147 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30536913871765137, "epoch": 1.8397435897435899, "grad_norm": 0.0023733393754810095, "learning_rate": 1e-06, "loss": -0.0043, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14755.0, "completions/max_terminated_length": 14755.0, "completions/mean_length": 7358.2578125, "completions/mean_terminated_length": 7358.2578125, "completions/min_length": 3661.0, "completions/min_terminated_length": 3661.0, "entropy": 0.3135741055011749, "epoch": 1.8413461538461537, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034127088729292154, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 1448658560.0, "reward": 0.8975586295127869, "reward_std": 0.009765625, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071128010749817, "sampling/importance_sampling_ratio/min": 0.0019161163363605738, "sampling/sampling_logp_difference/max": 6.257454872131348, "sampling/sampling_logp_difference/mean": 0.12349528074264526, "step": 1149 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31672754883766174, "epoch": 1.842948717948718, "grad_norm": 0.023975789546966553, "learning_rate": 1e-06, "loss": 0.007, "step": 1150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31721872091293335, "epoch": 1.844551282051282, "grad_norm": 0.0023351602721959352, "learning_rate": 1e-06, "loss": -0.0086, "step": 1151 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31144554913043976, "epoch": 1.8461538461538463, "grad_norm": 0.025021545588970184, "learning_rate": 1e-06, "loss": 0.0078, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13811.0, "completions/max_terminated_length": 13811.0, "completions/mean_length": 7853.33203125, "completions/mean_terminated_length": 7853.33203125, "completions/min_length": 4144.0, "completions/min_terminated_length": 4144.0, "entropy": 0.3009275645017624, "epoch": 1.8477564102564101, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005614398512989283, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 1453604122.0, "reward": 0.8940234184265137, "reward_std": 0.0202627032995224, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9957031011581421, "rewards/symbolic_reward_partial_score/std": 0.06272586435079575, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0684130191802979, "sampling/importance_sampling_ratio/min": 0.0020829718559980392, "sampling/sampling_logp_difference/max": 6.173959732055664, "sampling/sampling_logp_difference/mean": 0.11914074420928955, "step": 1153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.3018946796655655, "epoch": 1.8493589743589745, "grad_norm": 0.026210306212306023, "learning_rate": 1e-06, "loss": 0.0008, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3021729290485382, "epoch": 1.8509615384615383, "grad_norm": 0.004779895767569542, "learning_rate": 1e-06, "loss": -0.003, "step": 1155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.30005723237991333, "epoch": 1.8525641025641026, "grad_norm": 0.0037132585421204567, "learning_rate": 1e-06, "loss": 0.0172, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 12805.0, "completions/max_terminated_length": 12805.0, "completions/mean_length": 7567.783203125, "completions/mean_terminated_length": 7567.783203125, "completions/min_length": 4338.0, "completions/min_terminated_length": 4338.0, "entropy": 0.3199533224105835, "epoch": 1.8541666666666665, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004679365549236536, "learning_rate": 1e-06, "loss": -0.0136, "num_tokens": 1458243931.0, "reward": 0.8961426019668579, "reward_std": 0.01542968861758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9988607168197632, "rewards/symbolic_reward_partial_score/std": 0.016854895278811455, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071823239326477, "sampling/importance_sampling_ratio/min": 0.0027870051562786102, "sampling/sampling_logp_difference/max": 5.882787704467773, "sampling/sampling_logp_difference/mean": 0.12532271444797516, "step": 1157 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.318808451294899, "epoch": 1.8557692307692308, "grad_norm": 0.004111420828849077, "learning_rate": 1e-06, "loss": -0.0134, "step": 1158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3141668140888214, "epoch": 1.8573717948717947, "grad_norm": 0.027966901659965515, "learning_rate": 1e-06, "loss": 0.0258, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3200962245464325, "epoch": 1.858974358974359, "grad_norm": 0.004104991443455219, "learning_rate": 1e-06, "loss": 0.0024, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13143.0, "completions/max_terminated_length": 13143.0, "completions/mean_length": 7944.10546875, "completions/mean_terminated_length": 7944.10546875, "completions/min_length": 4646.0, "completions/min_terminated_length": 4646.0, "entropy": 0.30335795879364014, "epoch": 1.8605769230769231, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004546971060335636, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 1463237121.0, "reward": 0.8958008289337158, "reward_std": 0.01679687574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0681650638580322, "sampling/importance_sampling_ratio/min": 0.002749611856415868, "sampling/sampling_logp_difference/max": 5.896295547485352, "sampling/sampling_logp_difference/mean": 0.11904764920473099, "step": 1161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3040137141942978, "epoch": 1.8621794871794872, "grad_norm": 0.004086529370397329, "learning_rate": 1e-06, "loss": -0.0139, "step": 1162 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30111585557460785, "epoch": 1.8637820512820513, "grad_norm": 0.004171317908912897, "learning_rate": 1e-06, "loss": -0.0122, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29825007915496826, "epoch": 1.8653846153846154, "grad_norm": 0.003949691541492939, "learning_rate": 1e-06, "loss": 0.0259, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14200.0, "completions/max_terminated_length": 14200.0, "completions/mean_length": 7925.0078125, "completions/mean_terminated_length": 7925.0078125, "completions/min_length": 4500.0, "completions/min_terminated_length": 4500.0, "entropy": 0.29860930144786835, "epoch": 1.8669871794871795, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005291419103741646, "learning_rate": 1e-06, "loss": -0.0159, "num_tokens": 1468163589.0, "reward": 0.8951172232627869, "reward_std": 0.013342385180294514, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067738652229309, "sampling/importance_sampling_ratio/min": 0.0019308445043861866, "sampling/sampling_logp_difference/max": 6.249797821044922, "sampling/sampling_logp_difference/mean": 0.11880922317504883, "step": 1165 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29944686591625214, "epoch": 1.8685897435897436, "grad_norm": 0.022908741608262062, "learning_rate": 1e-06, "loss": 0.0343, "step": 1166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3004586547613144, "epoch": 1.8701923076923077, "grad_norm": 0.016763806343078613, "learning_rate": 1e-06, "loss": -0.0032, "step": 1167 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.29962702095508575, "epoch": 1.8717948717948718, "grad_norm": 0.004018325824290514, "learning_rate": 1e-06, "loss": -0.015, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14060.0, "completions/max_terminated_length": 14060.0, "completions/mean_length": 7686.337890625, "completions/mean_terminated_length": 7686.337890625, "completions/min_length": 3536.0, "completions/min_terminated_length": 3536.0, "entropy": 0.30295367538928986, "epoch": 1.873397435897436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1472935090.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0685579776763916, "sampling/importance_sampling_ratio/min": 0.0032049978617578745, "sampling/sampling_logp_difference/max": 5.743043899536133, "sampling/sampling_logp_difference/mean": 0.11951278150081635, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30429041385650635, "epoch": 1.875, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30121228098869324, "epoch": 1.876602564102564, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30381765961647034, "epoch": 1.8782051282051282, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14807.0, "completions/max_terminated_length": 14807.0, "completions/mean_length": 8211.1796875, "completions/mean_terminated_length": 8211.1796875, "completions/min_length": 2933.0, "completions/min_terminated_length": 2933.0, "entropy": 0.2926924079656601, "epoch": 1.8798076923076923, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003593733301386237, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 1478097134.0, "reward": 0.8975586295127869, "reward_std": 0.009765625931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996744990348816, "rewards/symbolic_reward_partial_score/std": 0.005203233566135168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0667604207992554, "sampling/importance_sampling_ratio/min": 0.0008619053987786174, "sampling/sampling_logp_difference/max": 7.056365013122559, "sampling/sampling_logp_difference/mean": 0.11660229414701462, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29721662402153015, "epoch": 1.8814102564102564, "grad_norm": 0.003629385493695736, "learning_rate": 1e-06, "loss": -0.0099, "step": 1174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29691576957702637, "epoch": 1.8830128205128205, "grad_norm": 0.0030263112857937813, "learning_rate": 1e-06, "loss": 0.0374, "step": 1175 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29324012994766235, "epoch": 1.8846153846153846, "grad_norm": 0.0032665154431015253, "learning_rate": 1e-06, "loss": -0.0088, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15435.0, "completions/max_terminated_length": 15435.0, "completions/mean_length": 8228.052734375, "completions/mean_terminated_length": 8228.052734375, "completions/min_length": 4305.0, "completions/min_terminated_length": 4305.0, "entropy": 0.29094359278678894, "epoch": 1.8862179487179487, "frac_reward_zero_std": 0.875, "grad_norm": 0.007459859363734722, "learning_rate": 1e-06, "loss": -0.0249, "num_tokens": 1483287497.0, "reward": 0.8914356231689453, "reward_std": 0.023461204022169113, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9987956285476685, "rewards/symbolic_reward_partial_score/std": 0.010277888737618923, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0661139488220215, "sampling/importance_sampling_ratio/min": 5.513464174100591e-08, "sampling/sampling_logp_difference/max": 16.71348762512207, "sampling/sampling_logp_difference/mean": 0.11546041816473007, "step": 1177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2922461926937103, "epoch": 1.8878205128205128, "grad_norm": 0.007165617309510708, "learning_rate": 1e-06, "loss": -0.0021, "step": 1178 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28839120268821716, "epoch": 1.8894230769230769, "grad_norm": 0.0291578508913517, "learning_rate": 1e-06, "loss": 0.0343, "step": 1179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2906486839056015, "epoch": 1.891025641025641, "grad_norm": 0.0069350446574389935, "learning_rate": 1e-06, "loss": -0.0131, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13589.0, "completions/max_terminated_length": 13589.0, "completions/mean_length": 7907.95703125, "completions/mean_terminated_length": 7907.95703125, "completions/min_length": 3527.0, "completions/min_terminated_length": 3527.0, "entropy": 0.30357837677001953, "epoch": 1.8926282051282053, "frac_reward_zero_std": 0.875, "grad_norm": 0.005162107292562723, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 1488209347.0, "reward": 0.8948730826377869, "reward_std": 0.0205078125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.99853515625, "rewards/symbolic_reward_partial_score/std": 0.022975130006670952, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0685532093048096, "sampling/importance_sampling_ratio/min": 0.0008111695642583072, "sampling/sampling_logp_difference/max": 7.1170334815979, "sampling/sampling_logp_difference/mean": 0.11978597193956375, "step": 1181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.3027549088001251, "epoch": 1.8942307692307692, "grad_norm": 0.004358244594186544, "learning_rate": 1e-06, "loss": 0.001, "step": 1182 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.3017651438713074, "epoch": 1.8958333333333335, "grad_norm": 0.004373589064925909, "learning_rate": 1e-06, "loss": 0.0001, "step": 1183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.30212725698947906, "epoch": 1.8974358974358974, "grad_norm": 0.004225477576255798, "learning_rate": 1e-06, "loss": -0.0188, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13942.0, "completions/max_terminated_length": 13942.0, "completions/mean_length": 7953.171875, "completions/mean_terminated_length": 7953.171875, "completions/min_length": 2041.0, "completions/min_terminated_length": 2041.0, "entropy": 0.30399172008037567, "epoch": 1.8990384615384617, "frac_reward_zero_std": 0.9375, "grad_norm": 0.003516353666782379, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 1493164651.0, "reward": 0.8975489139556885, "reward_std": 0.009804687462747097, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0690739154815674, "sampling/importance_sampling_ratio/min": 0.004113218747079372, "sampling/sampling_logp_difference/max": 5.493549346923828, "sampling/sampling_logp_difference/mean": 0.12033951282501221, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30357593297958374, "epoch": 1.9006410256410255, "grad_norm": 0.0033840613905340433, "learning_rate": 1e-06, "loss": 0.0093, "step": 1186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3051948994398117, "epoch": 1.9022435897435899, "grad_norm": 0.0033332568127661943, "learning_rate": 1e-06, "loss": 0.0098, "step": 1187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.30435769259929657, "epoch": 1.9038461538461537, "grad_norm": 0.003013043198734522, "learning_rate": 1e-06, "loss": -0.0096, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14426.0, "completions/max_terminated_length": 14426.0, "completions/mean_length": 8251.572265625, "completions/mean_terminated_length": 8251.572265625, "completions/min_length": 3841.0, "completions/min_terminated_length": 3841.0, "entropy": 0.2953474074602127, "epoch": 1.905448717948718, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0061998371966183186, "learning_rate": 1e-06, "loss": -0.0125, "num_tokens": 1498310864.0, "reward": 0.8951172232627869, "reward_std": 0.008734640665352345, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9993489980697632, "rewards/symbolic_reward_partial_score/std": 0.0073440405540168285, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0669368505477905, "sampling/importance_sampling_ratio/min": 0.003785366890951991, "sampling/sampling_logp_difference/max": 5.57661247253418, "sampling/sampling_logp_difference/mean": 0.11687429249286652, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.294635072350502, "epoch": 1.907051282051282, "grad_norm": 0.005789327435195446, "learning_rate": 1e-06, "loss": 0.001, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.29555289447307587, "epoch": 1.9086538461538463, "grad_norm": 0.012939794920384884, "learning_rate": 1e-06, "loss": 0.0127, "step": 1191 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29336051642894745, "epoch": 1.9102564102564101, "grad_norm": 0.013760295696556568, "learning_rate": 1e-06, "loss": 0.0024, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15622.0, "completions/max_terminated_length": 15622.0, "completions/mean_length": 8120.173828125, "completions/mean_terminated_length": 8120.173828125, "completions/min_length": 3226.0, "completions/min_terminated_length": 3226.0, "entropy": 0.2943793833255768, "epoch": 1.9118589743589745, "frac_reward_zero_std": 0.90625, "grad_norm": 0.03283732756972313, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 1503395177.0, "reward": 0.8958008289337158, "reward_std": 0.01679687574505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9977213740348816, "rewards/symbolic_reward_partial_score/std": 0.04448510333895683, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0672670602798462, "sampling/importance_sampling_ratio/min": 0.002507069380953908, "sampling/sampling_logp_difference/max": 5.988640785217285, "sampling/sampling_logp_difference/mean": 0.11740481853485107, "step": 1193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2965329438447952, "epoch": 1.9134615384615383, "grad_norm": 0.0039680697955191135, "learning_rate": 1e-06, "loss": 0.0089, "step": 1194 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2973887622356415, "epoch": 1.9150641025641026, "grad_norm": 0.0037179056089371443, "learning_rate": 1e-06, "loss": 0.0066, "step": 1195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2941938638687134, "epoch": 1.9166666666666665, "grad_norm": 0.0037544008810073137, "learning_rate": 1e-06, "loss": -0.0129, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14247.0, "completions/mean_length": 7902.625, "completions/mean_terminated_length": 7886.02734375, "completions/min_length": 3220.0, "completions/min_terminated_length": 3220.0, "entropy": 0.3004665970802307, "epoch": 1.9182692307692308, "frac_reward_zero_std": 0.9375, "grad_norm": 0.00423093605786562, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 1508268089.0, "reward": 0.8950684070587158, "reward_std": 0.01596381887793541, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9959310293197632, "rewards/symbolic_reward_partial_score/std": 0.06253714859485626, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0682998895645142, "sampling/importance_sampling_ratio/min": 0.0023962194100022316, "sampling/sampling_logp_difference/max": 6.033863067626953, "sampling/sampling_logp_difference/mean": 0.11888313293457031, "step": 1197 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2986510396003723, "epoch": 1.9198717948717947, "grad_norm": 0.014576694928109646, "learning_rate": 1e-06, "loss": 0.0284, "step": 1198 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3021872192621231, "epoch": 1.921474358974359, "grad_norm": 0.004513179417699575, "learning_rate": 1e-06, "loss": -0.0113, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2986437827348709, "epoch": 1.9230769230769231, "grad_norm": 0.004633089527487755, "learning_rate": 1e-06, "loss": -0.0111, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15059.0, "completions/max_terminated_length": 15059.0, "completions/mean_length": 8173.046875, "completions/mean_terminated_length": 8173.046875, "completions/min_length": 3649.0, "completions/min_terminated_length": 3649.0, "entropy": 0.29754653573036194, "epoch": 1.9246794871794872, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0024362760595977306, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 1513281009.0, "reward": 0.8987793326377869, "reward_std": 0.0048828125, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.998046875, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.9998372793197632, "rewards/symbolic_reward_partial_score/std": 0.003682846901938319, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0677549839019775, "sampling/importance_sampling_ratio/min": 0.004164539277553558, "sampling/sampling_logp_difference/max": 5.481149673461914, "sampling/sampling_logp_difference/mean": 0.11856520175933838, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30047936737537384, "epoch": 1.9262820512820513, "grad_norm": 0.002458572620525956, "learning_rate": 1e-06, "loss": -0.006, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2958284020423889, "epoch": 1.9278846153846154, "grad_norm": 0.0025684263091534376, "learning_rate": 1e-06, "loss": -0.0043, "step": 1203 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.30148808658123016, "epoch": 1.9294871794871795, "grad_norm": 0.0025260718539357185, "learning_rate": 1e-06, "loss": 0.0233, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14282.0, "completions/max_terminated_length": 14282.0, "completions/mean_length": 8364.19140625, "completions/mean_terminated_length": 8364.19140625, "completions/min_length": 3836.0, "completions/min_terminated_length": 3836.0, "entropy": 0.2892305999994278, "epoch": 1.9310897435897436, "frac_reward_zero_std": 0.875, "grad_norm": 0.019499700516462326, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 1518456547.0, "reward": 0.8886035680770874, "reward_std": 0.028310654684901237, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.9971679449081421, "rewards/symbolic_reward_partial_score/std": 0.027863482013344765, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065371036529541, "sampling/importance_sampling_ratio/min": 0.0013542938977479935, "sampling/sampling_logp_difference/max": 6.604475021362305, "sampling/sampling_logp_difference/mean": 0.11470746994018555, "step": 1205 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.29128527641296387, "epoch": 1.9326923076923077, "grad_norm": 0.01882844790816307, "learning_rate": 1e-06, "loss": 0.0367, "step": 1206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2861212342977524, "epoch": 1.9342948717948718, "grad_norm": 0.022711176425218582, "learning_rate": 1e-06, "loss": -0.0139, "step": 1207 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.281126469373703, "epoch": 1.935897435897436, "grad_norm": 0.008646919392049313, "learning_rate": 1e-06, "loss": -0.0146, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16226.0, "completions/mean_length": 8800.673828125, "completions/mean_terminated_length": 8785.833984375, "completions/min_length": 3895.0, "completions/min_terminated_length": 3895.0, "entropy": 0.2832132428884506, "epoch": 1.9375, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005954642314463854, "learning_rate": 1e-06, "loss": -0.0179, "num_tokens": 1523900092.0, "reward": 0.8942675590515137, "reward_std": 0.01977486163377762, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9971679449081421, "rewards/symbolic_reward_partial_score/std": 0.04577268287539482, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0643864870071411, "sampling/importance_sampling_ratio/min": 0.0024164393544197083, "sampling/sampling_logp_difference/max": 6.025460243225098, "sampling/sampling_logp_difference/mean": 0.11290614306926727, "step": 1209 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27907827496528625, "epoch": 1.939102564102564, "grad_norm": 0.03196370601654053, "learning_rate": 1e-06, "loss": 0.0323, "step": 1210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2788865268230438, "epoch": 1.9407051282051282, "grad_norm": 0.004880078136920929, "learning_rate": 1e-06, "loss": 0.0012, "step": 1211 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28488609194755554, "epoch": 1.9423076923076923, "grad_norm": 0.018956732004880905, "learning_rate": 1e-06, "loss": -0.0029, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14333.0, "completions/max_terminated_length": 14333.0, "completions/mean_length": 8613.55859375, "completions/mean_terminated_length": 8613.55859375, "completions/min_length": 3934.0, "completions/min_terminated_length": 3934.0, "entropy": 0.28035739064216614, "epoch": 1.9439102564102564, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0046940273605287075, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 1529260330.0, "reward": 0.8960742950439453, "reward_std": 0.015703124925494194, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9986327886581421, "rewards/symbolic_reward_partial_score/std": 0.022945649921894073, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0646021366119385, "sampling/importance_sampling_ratio/min": 0.0013317377306520939, "sampling/sampling_logp_difference/max": 6.621270656585693, "sampling/sampling_logp_difference/mean": 0.11306291818618774, "step": 1213 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2839740216732025, "epoch": 1.9455128205128205, "grad_norm": 0.0036105273757129908, "learning_rate": 1e-06, "loss": 0.019, "step": 1214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2793775945901871, "epoch": 1.9471153846153846, "grad_norm": 0.025335954502224922, "learning_rate": 1e-06, "loss": 0.0055, "step": 1215 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2792014926671982, "epoch": 1.9487179487179487, "grad_norm": 0.003647672478109598, "learning_rate": 1e-06, "loss": -0.0137, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15800.0, "completions/max_terminated_length": 15800.0, "completions/mean_length": 8183.126953125, "completions/mean_terminated_length": 8183.126953125, "completions/min_length": 3387.0, "completions/min_terminated_length": 3387.0, "entropy": 0.2911984920501709, "epoch": 1.9503205128205128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1534237771.0, "reward": 0.9000000357627869, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0660669803619385, "sampling/importance_sampling_ratio/min": 0.00016737951955292374, "sampling/sampling_logp_difference/max": 8.695246696472168, "sampling/sampling_logp_difference/mean": 0.11629128456115723, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2953929454088211, "epoch": 1.9519230769230769, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2897518128156662, "epoch": 1.953525641025641, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.28839822113513947, "epoch": 1.9551282051282053, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13799.0, "completions/max_terminated_length": 13799.0, "completions/mean_length": 8616.69921875, "completions/mean_terminated_length": 8616.69921875, "completions/min_length": 4475.0, "completions/min_terminated_length": 4475.0, "entropy": 0.2832952290773392, "epoch": 1.9567307692307692, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0034733996726572514, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 1539519377.0, "reward": 0.8975489139556885, "reward_std": 0.009804688394069672, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9996418952941895, "rewards/symbolic_reward_partial_score/std": 0.0057472530752420425, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.064860224723816, "sampling/importance_sampling_ratio/min": 0.0020989603362977505, "sampling/sampling_logp_difference/max": 6.166313171386719, "sampling/sampling_logp_difference/mean": 0.1138753816485405, "step": 1221 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28177329897880554, "epoch": 1.9583333333333335, "grad_norm": 0.003140608314424753, "learning_rate": 1e-06, "loss": -0.0095, "step": 1222 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2827134430408478, "epoch": 1.9599358974358974, "grad_norm": 0.003823231440037489, "learning_rate": 1e-06, "loss": -0.0103, "step": 1223 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2860657125711441, "epoch": 1.9615384615384617, "grad_norm": 0.003579829353839159, "learning_rate": 1e-06, "loss": -0.0095, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14100.0, "completions/max_terminated_length": 14100.0, "completions/mean_length": 8672.375, "completions/mean_terminated_length": 8672.375, "completions/min_length": 4399.0, "completions/min_terminated_length": 4399.0, "entropy": 0.2804241478443146, "epoch": 1.9631410256410255, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0035035342443734407, "learning_rate": 1e-06, "loss": -0.01, "num_tokens": 1544860417.0, "reward": 0.8970215320587158, "reward_std": 0.01191406324505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9978841543197632, "rewards/symbolic_reward_partial_score/std": 0.04434017464518547, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0639734268188477, "sampling/importance_sampling_ratio/min": 9.44249695749022e-05, "sampling/sampling_logp_difference/max": 9.267704963684082, "sampling/sampling_logp_difference/mean": 0.11255315691232681, "step": 1225 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2808882147073746, "epoch": 1.9647435897435899, "grad_norm": 0.0034705237485468388, "learning_rate": 1e-06, "loss": -0.0088, "step": 1226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2766978591680527, "epoch": 1.9663461538461537, "grad_norm": 0.027924956753849983, "learning_rate": 1e-06, "loss": 0.0309, "step": 1227 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2793623208999634, "epoch": 1.967948717948718, "grad_norm": 0.0028724130243062973, "learning_rate": 1e-06, "loss": -0.0095, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 16123.0, "completions/max_terminated_length": 16123.0, "completions/mean_length": 8708.259765625, "completions/mean_terminated_length": 8708.259765625, "completions/min_length": 4282.0, "completions/min_terminated_length": 4282.0, "entropy": 0.2766948789358139, "epoch": 1.969551282051282, "frac_reward_zero_std": 0.875, "grad_norm": 0.005033229012042284, "learning_rate": 1e-06, "loss": -0.0205, "num_tokens": 1550202678.0, "reward": 0.8943262100219727, "reward_std": 0.02269531413912773, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.997363269329071, "rewards/symbolic_reward_partial_score/std": 0.045138679444789886, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0631859302520752, "sampling/importance_sampling_ratio/min": 0.003191123716533184, "sampling/sampling_logp_difference/max": 5.747382164001465, "sampling/sampling_logp_difference/mean": 0.11148595809936523, "step": 1229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.27636095881462097, "epoch": 1.9711538461538463, "grad_norm": 0.004540975205600262, "learning_rate": 1e-06, "loss": 0.0021, "step": 1230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2746479660272598, "epoch": 1.9727564102564101, "grad_norm": 0.004312835168093443, "learning_rate": 1e-06, "loss": 0.0013, "step": 1231 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2745707780122757, "epoch": 1.9743589743589745, "grad_norm": 0.004494936671108007, "learning_rate": 1e-06, "loss": 0.0129, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15025.0, "completions/mean_length": 7987.767578125, "completions/mean_terminated_length": 7971.33642578125, "completions/min_length": 3112.0, "completions/min_terminated_length": 3112.0, "entropy": 0.28349705040454865, "epoch": 1.9759615384615383, "frac_reward_zero_std": 0.9375, "grad_norm": 0.00874620396643877, "learning_rate": 1e-06, "loss": -0.0211, "num_tokens": 1555131423.0, "reward": 0.8898340463638306, "reward_std": 0.01714194566011429, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.986328125, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.9941080808639526, "rewards/symbolic_reward_partial_score/std": 0.06613869965076447, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0643033981323242, "sampling/importance_sampling_ratio/min": 0.0017722464399412274, "sampling/sampling_logp_difference/max": 6.335507392883301, "sampling/sampling_logp_difference/mean": 0.11349356174468994, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2810690402984619, "epoch": 1.9775641025641026, "grad_norm": 0.008442613296210766, "learning_rate": 1e-06, "loss": -0.0023, "step": 1234 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.28290465474128723, "epoch": 1.9791666666666665, "grad_norm": 0.002465788973495364, "learning_rate": 1e-06, "loss": 0.0125, "step": 1235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.28290964663028717, "epoch": 1.9807692307692308, "grad_norm": 0.026566775515675545, "learning_rate": 1e-06, "loss": 0.0143, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13867.0, "completions/max_terminated_length": 13867.0, "completions/mean_length": 8091.439453125, "completions/mean_terminated_length": 8091.439453125, "completions/min_length": 3847.0, "completions/min_terminated_length": 3847.0, "entropy": 0.2784264087677002, "epoch": 1.9823717948717947, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004467155318707228, "learning_rate": 1e-06, "loss": -0.0132, "num_tokens": 1560154784.0, "reward": 0.895751953125, "reward_std": 0.01699218899011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.99755859375, "rewards/symbolic_reward_partial_score/std": 0.044932443648576736, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0629961490631104, "sampling/importance_sampling_ratio/min": 1.6737449186621234e-05, "sampling/sampling_logp_difference/max": 10.997861862182617, "sampling/sampling_logp_difference/mean": 0.11135035753250122, "step": 1237 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2753297984600067, "epoch": 1.983974358974359, "grad_norm": 0.0024451538920402527, "learning_rate": 1e-06, "loss": -0.0016, "step": 1238 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.27506595849990845, "epoch": 1.9855769230769231, "grad_norm": 0.003434085752815008, "learning_rate": 1e-06, "loss": 0.0044, "step": 1239 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27488943934440613, "epoch": 1.9871794871794872, "grad_norm": 0.0032492363825440407, "learning_rate": 1e-06, "loss": 0.0037, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14333.0, "completions/max_terminated_length": 14333.0, "completions/mean_length": 8140.427734375, "completions/mean_terminated_length": 8140.427734375, "completions/min_length": 3333.0, "completions/min_terminated_length": 3333.0, "entropy": 0.28159183263778687, "epoch": 1.9887820512820513, "frac_reward_zero_std": 0.875, "grad_norm": 0.00511184660717845, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 1565177275.0, "reward": 0.8945459127426147, "reward_std": 0.021816406399011612, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9921875, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.9974446296691895, "rewards/symbolic_reward_partial_score/std": 0.044879186898469925, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0642945766448975, "sampling/importance_sampling_ratio/min": 0.0013277892721816897, "sampling/sampling_logp_difference/max": 6.624239921569824, "sampling/sampling_logp_difference/mean": 0.11332094669342041, "step": 1241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.28187714517116547, "epoch": 1.9903846153846154, "grad_norm": 0.004039302468299866, "learning_rate": 1e-06, "loss": 0.0025, "step": 1242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2771865576505661, "epoch": 1.9919871794871795, "grad_norm": 0.003922486677765846, "learning_rate": 1e-06, "loss": -0.0177, "step": 1243 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28154416382312775, "epoch": 1.9935897435897436, "grad_norm": 0.004652918316423893, "learning_rate": 1e-06, "loss": -0.0016, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14042.0, "completions/max_terminated_length": 14042.0, "completions/mean_length": 8255.6796875, "completions/mean_terminated_length": 8255.6796875, "completions/min_length": 3448.0, "completions/min_terminated_length": 3448.0, "entropy": 0.27853433787822723, "epoch": 1.9951923076923077, "frac_reward_zero_std": 0.90625, "grad_norm": 0.030363189056515694, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 1570280631.0, "reward": 0.8975096940994263, "reward_std": 0.009961274452507496, "rewards/progression_diversity/mean": -8.409509973716922e-06, "rewards/progression_diversity/std": 0.0001902854855870828, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.99951171875, "rewards/symbolic_reward_partial_score/std": 0.008228649385273457, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.063688039779663, "sampling/importance_sampling_ratio/min": 0.0021940346341580153, "sampling/sampling_logp_difference/max": 6.122013092041016, "sampling/sampling_logp_difference/mean": 0.11205926537513733, "step": 1245 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27532532811164856, "epoch": 1.9967948717948718, "grad_norm": 0.004188434686511755, "learning_rate": 1e-06, "loss": -0.0088, "step": 1246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2833784818649292, "epoch": 1.998397435897436, "grad_norm": 0.029332676902413368, "learning_rate": 1e-06, "loss": 0.0118, "step": 1247 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.27936534583568573, "epoch": 2.0, "grad_norm": 0.0042526316829025745, "learning_rate": 1e-06, "loss": -0.0109, "step": 1248 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000244140625, "eval_completions/max_length": 13595.1875, "eval_completions/max_terminated_length": 13568.59375, "eval_completions/mean_length": 8149.333740234375, "eval_completions/mean_terminated_length": 8147.492645263672, "eval_completions/min_length": 4439.53125, "eval_completions/min_terminated_length": 4439.53125, "eval_entropy": 0.27844913955777884, "eval_frac_reward_zero_std": 0.8984375, "eval_loss": 0.0007266444154083729, "eval_num_tokens": 1570280631.0, "eval_reward": 0.8951080553233624, "eval_reward_std": 0.01795433840015903, "eval_rewards/progression_diversity/mean": -1.194192577713693e-06, "eval_rewards/progression_diversity/std": 1.3510747521650046e-05, "eval_rewards/symbolic_reward_accuracy/mean": 0.992919921875, "eval_rewards/symbolic_reward_accuracy/std": 0.057395454961806536, "eval_rewards/symbolic_reward_partial_score/mean": 0.9978536050766706, "eval_rewards/symbolic_reward_partial_score/std": 0.018804883409757167, "eval_rewards/tag_count_reward/mean": 0.0, "eval_rewards/tag_count_reward/std": 0.0, "eval_runtime": 8401.4648, "eval_samples_per_second": 0.03, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0635985918343067, "eval_sampling/importance_sampling_ratio/min": 0.003701142359204823, "eval_sampling/sampling_logp_difference/max": 5.673463404178619, "eval_sampling/sampling_logp_difference/mean": 0.11233945097774267, "eval_steps_per_second": 0.0, "step": 1248 }, { "epoch": 2.0, "step": 1248, "total_flos": 0.0, "train_loss": 0.0006463412022966599, "train_runtime": 115411.6286, "train_samples_per_second": 0.087, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 1248, "num_input_tokens_seen": 1570280631, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }