leonMW's picture
Model save
f785fd3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 348,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 255.0,
"completions/max_terminated_length": 255.0,
"completions/mean_length": 79.640625,
"completions/mean_terminated_length": 79.640625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.5482596457004547,
"epoch": 0.0028735632183908046,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.004432788118720055,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 442920.0,
"reward": 0.8734374046325684,
"reward_std": 0.13520026206970215,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": -1.1421783710829914e-05,
"rewards/ngram_repetition3/std": 0.00025844547781161964,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.841796875,
"rewards/symbolic_reward_accuracy/std": 0.36528825759887695,
"rewards/symbolic_reward_partial_score/mean": 0.947265625,
"rewards/symbolic_reward_partial_score/std": 0.18198402225971222,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2040585279464722,
"sampling/importance_sampling_ratio/min": 0.007793497759848833,
"sampling/sampling_logp_difference/max": 4.854465484619141,
"sampling/sampling_logp_difference/mean": 0.27342236042022705,
"step": 1
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.5987788438796997,
"epoch": 0.005747126436781609,
"grad_norm": 0.0029553379863500595,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 2
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.5281001031398773,
"epoch": 0.008620689655172414,
"grad_norm": 0.0020463597029447556,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 3
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.6004053950309753,
"epoch": 0.011494252873563218,
"grad_norm": 0.002862541936337948,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 221.0,
"completions/max_terminated_length": 221.0,
"completions/mean_length": 81.396484375,
"completions/mean_terminated_length": 81.396484375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.6153657734394073,
"epoch": 0.014367816091954023,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.004800648894160986,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 874227.0,
"reward": 0.831931471824646,
"reward_std": 0.10583364963531494,
"rewards/ngram_repetition2/mean": -0.00010850694525288418,
"rewards/ngram_repetition2/std": 0.001905819051899016,
"rewards/ngram_repetition3/mean": -0.00010186366125708446,
"rewards/ngram_repetition3/std": 0.0017140271374955773,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.787109375,
"rewards/symbolic_reward_accuracy/std": 0.409751296043396,
"rewards/symbolic_reward_partial_score/mean": 0.9365234375,
"rewards/symbolic_reward_partial_score/std": 0.17081154882907867,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2208325862884521,
"sampling/importance_sampling_ratio/min": 0.004927594680339098,
"sampling/sampling_logp_difference/max": 5.312904357910156,
"sampling/sampling_logp_difference/mean": 0.27739793062210083,
"step": 5
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.6263215243816376,
"epoch": 0.017241379310344827,
"grad_norm": 0.001903409487567842,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 6
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.5939083099365234,
"epoch": 0.020114942528735632,
"grad_norm": 0.0020359489135444164,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 7
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.5956400632858276,
"epoch": 0.022988505747126436,
"grad_norm": 0.002406664891168475,
"learning_rate": 1e-05,
"loss": 0.0006,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 223.0,
"completions/max_terminated_length": 223.0,
"completions/mean_length": 88.787109375,
"completions/mean_terminated_length": 88.787109375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.6508974432945251,
"epoch": 0.02586206896551724,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.00618784548714757,
"learning_rate": 1e-05,
"loss": -0.0005,
"num_tokens": 1318982.0,
"reward": 0.8049274682998657,
"reward_std": 0.11365753412246704,
"rewards/ngram_repetition2/mean": -9.494357800576836e-05,
"rewards/ngram_repetition2/std": 0.0014728810638189316,
"rewards/ngram_repetition3/mean": -0.0003199847706127912,
"rewards/ngram_repetition3/std": 0.004649759270250797,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.759765625,
"rewards/symbolic_reward_accuracy/std": 0.4276435375213623,
"rewards/symbolic_reward_partial_score/mean": 0.9103189706802368,
"rewards/symbolic_reward_partial_score/std": 0.21451303362846375,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2356998920440674,
"sampling/importance_sampling_ratio/min": 0.002650650916621089,
"sampling/sampling_logp_difference/max": 5.932950019836426,
"sampling/sampling_logp_difference/mean": 0.3045163154602051,
"step": 9
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.6686981022357941,
"epoch": 0.028735632183908046,
"grad_norm": 0.002366194501519203,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 10
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.6524442732334137,
"epoch": 0.031609195402298854,
"grad_norm": 0.004010654054582119,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 11
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.6432266533374786,
"epoch": 0.034482758620689655,
"grad_norm": 0.004558259155601263,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 277.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 101.388671875,
"completions/mean_terminated_length": 101.388671875,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.7190418243408203,
"epoch": 0.03735632183908046,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.010927229188382626,
"learning_rate": 1e-05,
"loss": 0.0009,
"num_tokens": 1792205.0,
"reward": 0.7988025546073914,
"reward_std": 0.1612505316734314,
"rewards/ngram_repetition2/mean": -0.0011130181374028325,
"rewards/ngram_repetition2/std": 0.013989781960844994,
"rewards/ngram_repetition3/mean": -0.0014404850080609322,
"rewards/ngram_repetition3/std": 0.013817409984767437,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.755859375,
"rewards/symbolic_reward_accuracy/std": 0.42999663949012756,
"rewards/symbolic_reward_partial_score/mean": 0.90234375,
"rewards/symbolic_reward_partial_score/std": 0.21886694431304932,
"rewards/tag_count_reward/mean": -0.009765625,
"rewards/tag_count_reward/std": 0.09843364357948303,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.267106056213379,
"sampling/importance_sampling_ratio/min": 0.0028255321085453033,
"sampling/sampling_logp_difference/max": 5.869058609008789,
"sampling/sampling_logp_difference/mean": 0.335277795791626,
"step": 13
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.7461837530136108,
"epoch": 0.040229885057471264,
"grad_norm": 0.004749669693410397,
"learning_rate": 1e-05,
"loss": 0.0008,
"step": 14
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.706035703420639,
"epoch": 0.04310344827586207,
"grad_norm": 0.0036822864785790443,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 15
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.6989518105983734,
"epoch": 0.04597701149425287,
"grad_norm": 0.0051618898287415504,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 298.0,
"completions/max_terminated_length": 298.0,
"completions/mean_length": 102.732421875,
"completions/mean_terminated_length": 102.732421875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.7355360388755798,
"epoch": 0.04885057471264368,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004863899666815996,
"learning_rate": 1e-05,
"loss": -0.0005,
"num_tokens": 2268804.0,
"reward": 0.7467195391654968,
"reward_std": 0.20338207483291626,
"rewards/ngram_repetition2/mean": -0.005328277125954628,
"rewards/ngram_repetition2/std": 0.021549751982092857,
"rewards/ngram_repetition3/mean": -0.005334243178367615,
"rewards/ngram_repetition3/std": 0.020070303231477737,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.703125,
"rewards/symbolic_reward_accuracy/std": 0.45732781291007996,
"rewards/symbolic_reward_partial_score/mean": 0.86767578125,
"rewards/symbolic_reward_partial_score/std": 0.28104299306869507,
"rewards/tag_count_reward/mean": -0.056640625,
"rewards/tag_count_reward/std": 0.23138070106506348,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2863795757293701,
"sampling/importance_sampling_ratio/min": 0.005591566674411297,
"sampling/sampling_logp_difference/max": 5.186495780944824,
"sampling/sampling_logp_difference/mean": 0.3546299636363983,
"step": 17
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.7249170541763306,
"epoch": 0.05172413793103448,
"grad_norm": 0.008876707404851913,
"learning_rate": 1e-05,
"loss": 0.0013,
"step": 18
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.7589452862739563,
"epoch": 0.05459770114942529,
"grad_norm": 0.006985923275351524,
"learning_rate": 1e-05,
"loss": 0.0017,
"step": 19
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.7788407206535339,
"epoch": 0.05747126436781609,
"grad_norm": 0.0038319623563438654,
"learning_rate": 1e-05,
"loss": 0.0011,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 107.83984375,
"completions/mean_terminated_length": 107.83984375,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.735305517911911,
"epoch": 0.0603448275862069,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004826927557587624,
"learning_rate": 1e-05,
"loss": -0.0006,
"num_tokens": 2732178.0,
"reward": 0.8216053247451782,
"reward_std": 0.14511626958847046,
"rewards/ngram_repetition2/mean": -0.00868980959057808,
"rewards/ngram_repetition2/std": 0.03146844357252121,
"rewards/ngram_repetition3/mean": -0.00836949236690998,
"rewards/ngram_repetition3/std": 0.029039273038506508,
"rewards/sentence_repetition/mean": -0.00014195645053405315,
"rewards/sentence_repetition/std": 0.0032121078111231327,
"rewards/symbolic_reward_accuracy/mean": 0.78125,
"rewards/symbolic_reward_accuracy/std": 0.41380295157432556,
"rewards/symbolic_reward_partial_score/mean": 0.9169921875,
"rewards/symbolic_reward_partial_score/std": 0.21696852147579193,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.287536382675171,
"sampling/importance_sampling_ratio/min": 0.0035747073125094175,
"sampling/sampling_logp_difference/max": 5.633872032165527,
"sampling/sampling_logp_difference/mean": 0.35755419731140137,
"step": 21
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.7287033498287201,
"epoch": 0.06321839080459771,
"grad_norm": 0.008059649728238583,
"learning_rate": 1e-05,
"loss": 0.0013,
"step": 22
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.7331990599632263,
"epoch": 0.06609195402298851,
"grad_norm": 0.0054933661594986916,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 23
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.7370081543922424,
"epoch": 0.06896551724137931,
"grad_norm": 0.004683576058596373,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 248.0,
"completions/max_terminated_length": 248.0,
"completions/mean_length": 100.126953125,
"completions/mean_terminated_length": 100.126953125,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.7627054750919342,
"epoch": 0.07183908045977011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007382780313491821,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 3204435.0,
"reward": 0.8065550923347473,
"reward_std": 0.11376181244850159,
"rewards/ngram_repetition2/mean": -0.007007577456533909,
"rewards/ngram_repetition2/std": 0.028018856421113014,
"rewards/ngram_repetition3/mean": -0.006428225431591272,
"rewards/ngram_repetition3/std": 0.024606449529528618,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.767578125,
"rewards/symbolic_reward_accuracy/std": 0.42278963327407837,
"rewards/symbolic_reward_partial_score/mean": 0.8986002206802368,
"rewards/symbolic_reward_partial_score/std": 0.23486287891864777,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2815171480178833,
"sampling/importance_sampling_ratio/min": 0.003166247857734561,
"sampling/sampling_logp_difference/max": 5.7552080154418945,
"sampling/sampling_logp_difference/mean": 0.34828951954841614,
"step": 25
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.733154833316803,
"epoch": 0.07471264367816093,
"grad_norm": 0.010408902540802956,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.7431564331054688,
"epoch": 0.07758620689655173,
"grad_norm": 0.006061589810997248,
"learning_rate": 1e-05,
"loss": 0.0017,
"step": 27
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.6956618130207062,
"epoch": 0.08045977011494253,
"grad_norm": 0.004011357668787241,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 200.0,
"completions/mean_length": 111.146484375,
"completions/mean_terminated_length": 79.3013687133789,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.6434568762779236,
"epoch": 0.08333333333333333,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.006815092638134956,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 3704190.0,
"reward": 0.7337323427200317,
"reward_std": 0.15399469435214996,
"rewards/ngram_repetition2/mean": -0.004595820792019367,
"rewards/ngram_repetition2/std": 0.04047611355781555,
"rewards/ngram_repetition3/mean": -0.0048026395961642265,
"rewards/ngram_repetition3/std": 0.04022197425365448,
"rewards/sentence_repetition/mean": -0.001153680495917797,
"rewards/sentence_repetition/std": 0.02610480971634388,
"rewards/symbolic_reward_accuracy/mean": 0.677734375,
"rewards/symbolic_reward_accuracy/std": 0.46780112385749817,
"rewards/symbolic_reward_partial_score/mean": 0.86474609375,
"rewards/symbolic_reward_partial_score/std": 0.22939814627170563,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2359492778778076,
"sampling/importance_sampling_ratio/min": 0.005792928393930197,
"sampling/sampling_logp_difference/max": 5.151117324829102,
"sampling/sampling_logp_difference/mean": 0.2986485958099365,
"step": 29
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.6564987599849701,
"epoch": 0.08620689655172414,
"grad_norm": 0.006035366095602512,
"learning_rate": 1e-05,
"loss": 0.0294,
"step": 30
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.6078056395053864,
"epoch": 0.08908045977011494,
"grad_norm": 0.005147899966686964,
"learning_rate": 1e-05,
"loss": 0.0007,
"step": 31
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.6026411354541779,
"epoch": 0.09195402298850575,
"grad_norm": 0.0064353933557868,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 245.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 73.185546875,
"completions/mean_terminated_length": 73.185546875,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.6158567667007446,
"epoch": 0.09482758620689655,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.005984546151012182,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 4162653.0,
"reward": 0.7928154468536377,
"reward_std": 0.13436806201934814,
"rewards/ngram_repetition2/mean": -0.002782913390547037,
"rewards/ngram_repetition2/std": 0.01602020114660263,
"rewards/ngram_repetition3/mean": -0.0027794050984084606,
"rewards/ngram_repetition3/std": 0.015124209225177765,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.7421875,
"rewards/symbolic_reward_accuracy/std": 0.43785804510116577,
"rewards/symbolic_reward_partial_score/mean": 0.9111328125,
"rewards/symbolic_reward_partial_score/std": 0.19450142979621887,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2359068393707275,
"sampling/importance_sampling_ratio/min": 0.005337143782526255,
"sampling/sampling_logp_difference/max": 5.233064651489258,
"sampling/sampling_logp_difference/mean": 0.2945653796195984,
"step": 33
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.5790786743164062,
"epoch": 0.09770114942528736,
"grad_norm": 0.004169682040810585,
"learning_rate": 1e-05,
"loss": 0.0007,
"step": 34
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.5978860259056091,
"epoch": 0.10057471264367816,
"grad_norm": 0.0042377435602247715,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.6040017604827881,
"epoch": 0.10344827586206896,
"grad_norm": 0.003600814612582326,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 174.0,
"completions/max_terminated_length": 174.0,
"completions/mean_length": 63.58984375,
"completions/mean_terminated_length": 63.58984375,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.5243661403656006,
"epoch": 0.10632183908045977,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.0034763177391141653,
"learning_rate": 1e-05,
"loss": 0.0005,
"num_tokens": 4622699.0,
"reward": 0.8271001577377319,
"reward_std": 0.09751666337251663,
"rewards/ngram_repetition2/mean": -0.0021677776239812374,
"rewards/ngram_repetition2/std": 0.014099263586103916,
"rewards/ngram_repetition3/mean": -0.0023514076601713896,
"rewards/ngram_repetition3/std": 0.015108847990632057,
"rewards/sentence_repetition/mean": -0.00030838814564049244,
"rewards/sentence_repetition/std": 0.006978027056902647,
"rewards/symbolic_reward_accuracy/mean": 0.791015625,
"rewards/symbolic_reward_accuracy/std": 0.40698084235191345,
"rewards/symbolic_reward_partial_score/mean": 0.912109375,
"rewards/symbolic_reward_partial_score/std": 0.21082071959972382,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2076072692871094,
"sampling/importance_sampling_ratio/min": 0.0033999995794147253,
"sampling/sampling_logp_difference/max": 5.6839799880981445,
"sampling/sampling_logp_difference/mean": 0.26283755898475647,
"step": 37
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.49053676426410675,
"epoch": 0.10919540229885058,
"grad_norm": 0.002470890525728464,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 38
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.4919590651988983,
"epoch": 0.11206896551724138,
"grad_norm": 0.0052912612445652485,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 39
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.5164909660816193,
"epoch": 0.11494252873563218,
"grad_norm": 0.002411817666143179,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 60.0703125,
"completions/mean_terminated_length": 60.0703125,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 0.4661720544099808,
"epoch": 0.11781609195402298,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0034640042576938868,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 5025199.0,
"reward": 0.9066269397735596,
"reward_std": 0.12120135873556137,
"rewards/ngram_repetition2/mean": -0.0005607319180853665,
"rewards/ngram_repetition2/std": 0.00634410185739398,
"rewards/ngram_repetition3/mean": -0.0008070581243373454,
"rewards/ngram_repetition3/std": 0.007949295453727245,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9576822519302368,
"rewards/symbolic_reward_partial_score/std": 0.17431758344173431,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1969958543777466,
"sampling/importance_sampling_ratio/min": 0.005194148980081081,
"sampling/sampling_logp_difference/max": 5.260222434997559,
"sampling/sampling_logp_difference/mean": 0.23839689791202545,
"step": 41
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.42921097576618195,
"epoch": 0.1206896551724138,
"grad_norm": 0.0021978251170367002,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 42
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.5135487914085388,
"epoch": 0.1235632183908046,
"grad_norm": 0.004084162879735231,
"learning_rate": 1e-05,
"loss": 0.0008,
"step": 43
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.43603725731372833,
"epoch": 0.12643678160919541,
"grad_norm": 0.0016240986296907067,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 166.0,
"completions/mean_length": 91.220703125,
"completions/mean_terminated_length": 59.33659362792969,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.4012712836265564,
"epoch": 0.12931034482758622,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0030578728765249252,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 5486880.0,
"reward": 0.6342004537582397,
"reward_std": 0.12287883460521698,
"rewards/ngram_repetition2/mean": -0.0027574566192924976,
"rewards/ngram_repetition2/std": 0.04059094563126564,
"rewards/ngram_repetition3/mean": -0.002955373842269182,
"rewards/ngram_repetition3/std": 0.04163341596722603,
"rewards/sentence_repetition/mean": -0.001980098430067301,
"rewards/sentence_repetition/std": 0.03845130279660225,
"rewards/symbolic_reward_accuracy/mean": 0.564453125,
"rewards/symbolic_reward_accuracy/std": 0.49631330370903015,
"rewards/symbolic_reward_partial_score/mean": 0.7985026240348816,
"rewards/symbolic_reward_partial_score/std": 0.2610504925251007,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.162497878074646,
"sampling/importance_sampling_ratio/min": 0.003198559395968914,
"sampling/sampling_logp_difference/max": 5.745054721832275,
"sampling/sampling_logp_difference/mean": 0.20148907601833344,
"step": 45
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.39941859245300293,
"epoch": 0.13218390804597702,
"grad_norm": 0.0021550978999584913,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 46
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.39996063709259033,
"epoch": 0.13505747126436782,
"grad_norm": 0.00215929769910872,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 47
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.3763733506202698,
"epoch": 0.13793103448275862,
"grad_norm": 0.003114216960966587,
"learning_rate": 1e-05,
"loss": 0.0102,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 4079.0,
"completions/max_terminated_length": 4079.0,
"completions/mean_length": 65.365234375,
"completions/mean_terminated_length": 65.365234375,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.4029734432697296,
"epoch": 0.14080459770114942,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.004060815088450909,
"learning_rate": 1e-05,
"loss": 0.0071,
"num_tokens": 5925819.0,
"reward": 0.790141224861145,
"reward_std": 0.09819010645151138,
"rewards/ngram_repetition2/mean": -0.002248897682875395,
"rewards/ngram_repetition2/std": 0.042518578469753265,
"rewards/ngram_repetition3/mean": -0.0021793104242533445,
"rewards/ngram_repetition3/std": 0.04257462918758392,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.73828125,
"rewards/symbolic_reward_accuracy/std": 0.44000017642974854,
"rewards/symbolic_reward_partial_score/mean": 0.9112955331802368,
"rewards/symbolic_reward_partial_score/std": 0.1929275393486023,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1678524017333984,
"sampling/importance_sampling_ratio/min": 0.008369643241167068,
"sampling/sampling_logp_difference/max": 4.783143997192383,
"sampling/sampling_logp_difference/mean": 0.21218247711658478,
"step": 49
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.41800519824028015,
"epoch": 0.14367816091954022,
"grad_norm": 0.001978965476155281,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.39291223883628845,
"epoch": 0.14655172413793102,
"grad_norm": 0.0015093119582161307,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 51
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.4259650707244873,
"epoch": 0.14942528735632185,
"grad_norm": 0.0014180849539116025,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 90.54296875,
"completions/mean_terminated_length": 58.657535552978516,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.4426834136247635,
"epoch": 0.15229885057471265,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.004828969016671181,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 6408689.0,
"reward": 0.7719333171844482,
"reward_std": 0.18412092328071594,
"rewards/ngram_repetition2/mean": -0.0019203309202566743,
"rewards/ngram_repetition2/std": 0.04211832955479622,
"rewards/ngram_repetition3/mean": -0.00201208028011024,
"rewards/ngram_repetition3/std": 0.04237477108836174,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.724609375,
"rewards/symbolic_reward_accuracy/std": 0.44714778661727905,
"rewards/symbolic_reward_partial_score/mean": 0.8824869394302368,
"rewards/symbolic_reward_partial_score/std": 0.23320595920085907,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1707079410552979,
"sampling/importance_sampling_ratio/min": 0.005764464382082224,
"sampling/sampling_logp_difference/max": 5.15604305267334,
"sampling/sampling_logp_difference/mean": 0.21329498291015625,
"step": 53
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.43557606637477875,
"epoch": 0.15517241379310345,
"grad_norm": 0.002126704901456833,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 54
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.15625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.4388166666030884,
"epoch": 0.15804597701149425,
"grad_norm": 0.0041025117971003056,
"learning_rate": 1e-05,
"loss": 0.0165,
"step": 55
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.40535806119441986,
"epoch": 0.16091954022988506,
"grad_norm": 0.0017564742593094707,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 175.0,
"completions/max_terminated_length": 175.0,
"completions/mean_length": 57.52734375,
"completions/mean_terminated_length": 57.52734375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.4543343782424927,
"epoch": 0.16379310344827586,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0039548370987176895,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 6834111.0,
"reward": 0.8162930011749268,
"reward_std": 0.15883183479309082,
"rewards/ngram_repetition2/mean": -0.0005367818521335721,
"rewards/ngram_repetition2/std": 0.009957203641533852,
"rewards/ngram_repetition3/mean": -0.0006075998535379767,
"rewards/ngram_repetition3/std": 0.01245367806404829,
"rewards/sentence_repetition/mean": -0.0004145588318351656,
"rewards/sentence_repetition/std": 0.009304200299084187,
"rewards/symbolic_reward_accuracy/mean": 0.7734375,
"rewards/symbolic_reward_accuracy/std": 0.4190165400505066,
"rewards/symbolic_reward_partial_score/mean": 0.9176431894302368,
"rewards/symbolic_reward_partial_score/std": 0.2047864943742752,
"rewards/tag_count_reward/mean": -0.00390625,
"rewards/tag_count_reward/std": 0.06243881583213806,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1725012063980103,
"sampling/importance_sampling_ratio/min": 0.008290642872452736,
"sampling/sampling_logp_difference/max": 4.792627811431885,
"sampling/sampling_logp_difference/mean": 0.22002115845680237,
"step": 57
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.44487476348876953,
"epoch": 0.16666666666666666,
"grad_norm": 0.0029834641609340906,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 58
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.4447477012872696,
"epoch": 0.16954022988505746,
"grad_norm": 0.0029372554272413254,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 59
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.4102463573217392,
"epoch": 0.1724137931034483,
"grad_norm": 0.0020892529282718897,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 114.0,
"completions/max_terminated_length": 114.0,
"completions/mean_length": 54.142578125,
"completions/mean_terminated_length": 54.142578125,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.4056634455919266,
"epoch": 0.1752873563218391,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0013282729778438807,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 7295496.0,
"reward": 0.7713481187820435,
"reward_std": 0.08887413889169693,
"rewards/ngram_repetition2/mean": -0.0016605097334831953,
"rewards/ngram_repetition2/std": 0.0266667939722538,
"rewards/ngram_repetition3/mean": -0.0018864045850932598,
"rewards/ngram_repetition3/std": 0.028560085222125053,
"rewards/sentence_repetition/mean": -0.00030838814564049244,
"rewards/sentence_repetition/std": 0.006978027056902647,
"rewards/symbolic_reward_accuracy/mean": 0.72265625,
"rewards/symbolic_reward_accuracy/std": 0.4481254518032074,
"rewards/symbolic_reward_partial_score/mean": 0.8857421875,
"rewards/symbolic_reward_partial_score/std": 0.2284853160381317,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1588588953018188,
"sampling/importance_sampling_ratio/min": 0.0037964945659041405,
"sampling/sampling_logp_difference/max": 5.573677062988281,
"sampling/sampling_logp_difference/mean": 0.20629723370075226,
"step": 61
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.125,
"entropy": 0.40721358358860016,
"epoch": 0.1781609195402299,
"grad_norm": 0.002758313436061144,
"learning_rate": 1e-05,
"loss": 0.0006,
"step": 62
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.38856589794158936,
"epoch": 0.1810344827586207,
"grad_norm": 0.0010847699595615268,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 63
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.40965311229228973,
"epoch": 0.1839080459770115,
"grad_norm": 0.0023630608338862658,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 107.0,
"completions/max_terminated_length": 107.0,
"completions/mean_length": 56.3203125,
"completions/mean_terminated_length": 56.3203125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.4272766709327698,
"epoch": 0.1867816091954023,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0028679470997303724,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 7754348.0,
"reward": 0.7936002016067505,
"reward_std": 0.130602166056633,
"rewards/ngram_repetition2/mean": -0.0001220703125,
"rewards/ngram_repetition2/std": 0.0027621358167380095,
"rewards/ngram_repetition3/mean": -0.0002109008200932294,
"rewards/ngram_repetition3/std": 0.004772140644490719,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.755859375,
"rewards/symbolic_reward_accuracy/std": 0.42999663949012756,
"rewards/symbolic_reward_partial_score/mean": 0.8816731572151184,
"rewards/symbolic_reward_partial_score/std": 0.2536083459854126,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1688048839569092,
"sampling/importance_sampling_ratio/min": 0.00808299146592617,
"sampling/sampling_logp_difference/max": 4.8179931640625,
"sampling/sampling_logp_difference/mean": 0.21086427569389343,
"step": 65
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.42733363807201385,
"epoch": 0.1896551724137931,
"grad_norm": 0.0032221204601228237,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 66
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.41568654775619507,
"epoch": 0.1925287356321839,
"grad_norm": 0.002042518462985754,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 67
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.42494598031044006,
"epoch": 0.19540229885057472,
"grad_norm": 0.001341567374765873,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 57.2265625,
"completions/mean_terminated_length": 57.2265625,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.48594728112220764,
"epoch": 0.19827586206896552,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.00340810464695096,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 8192288.0,
"reward": 0.7913572788238525,
"reward_std": 0.1352584958076477,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": -1.1421783710829914e-05,
"rewards/ngram_repetition3/std": 0.00025844547781161964,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.7421875,
"rewards/symbolic_reward_accuracy/std": 0.43785804510116577,
"rewards/symbolic_reward_partial_score/mean": 0.9060872793197632,
"rewards/symbolic_reward_partial_score/std": 0.20613642036914825,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1853888034820557,
"sampling/importance_sampling_ratio/min": 0.003909863531589508,
"sampling/sampling_logp_difference/max": 5.544252872467041,
"sampling/sampling_logp_difference/mean": 0.23184895515441895,
"step": 69
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.46007974445819855,
"epoch": 0.20114942528735633,
"grad_norm": 0.0018378323875367641,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 70
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.449898436665535,
"epoch": 0.20402298850574713,
"grad_norm": 0.0010113210882991552,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 71
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.4371645599603653,
"epoch": 0.20689655172413793,
"grad_norm": 0.0015104720368981361,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.0,
"completions/max_terminated_length": 315.0,
"completions/mean_length": 58.287109375,
"completions/mean_terminated_length": 58.287109375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.46672871708869934,
"epoch": 0.20977011494252873,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0020025873091071844,
"learning_rate": 1e-05,
"loss": -0.0005,
"num_tokens": 8631251.0,
"reward": 0.8614984750747681,
"reward_std": 0.11169316619634628,
"rewards/ngram_repetition2/mean": -0.0012671099975705147,
"rewards/ngram_repetition2/std": 0.028671424835920334,
"rewards/ngram_repetition3/mean": -0.0012333698105067015,
"rewards/ngram_repetition3/std": 0.02623102255165577,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.826171875,
"rewards/symbolic_reward_accuracy/std": 0.3793322443962097,
"rewards/symbolic_reward_partial_score/mean": 0.9440103769302368,
"rewards/symbolic_reward_partial_score/std": 0.18596942722797394,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1941895484924316,
"sampling/importance_sampling_ratio/min": 0.002809051424264908,
"sampling/sampling_logp_difference/max": 5.874908447265625,
"sampling/sampling_logp_difference/mean": 0.24140113592147827,
"step": 73
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.4615946561098099,
"epoch": 0.21264367816091953,
"grad_norm": 0.0036014795769006014,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 74
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.50718954205513,
"epoch": 0.21551724137931033,
"grad_norm": 0.002646596170961857,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 75
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.464016318321228,
"epoch": 0.21839080459770116,
"grad_norm": 0.002083716681227088,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 58.1015625,
"completions/mean_terminated_length": 58.1015625,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.45281049609184265,
"epoch": 0.22126436781609196,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0050986045971512794,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 9085159.0,
"reward": 0.851704478263855,
"reward_std": 0.1377110630273819,
"rewards/ngram_repetition2/mean": -0.0002638691512402147,
"rewards/ngram_repetition2/std": 0.0059706768952310085,
"rewards/ngram_repetition3/mean": -0.0001890077255666256,
"rewards/ngram_repetition3/std": 0.003657597815617919,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.818359375,
"rewards/symbolic_reward_accuracy/std": 0.38592514395713806,
"rewards/symbolic_reward_partial_score/mean": 0.9295247197151184,
"rewards/symbolic_reward_partial_score/std": 0.20194244384765625,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1843516826629639,
"sampling/importance_sampling_ratio/min": 0.007268482819199562,
"sampling/sampling_logp_difference/max": 4.92420768737793,
"sampling/sampling_logp_difference/mean": 0.22934292256832123,
"step": 77
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.45783525705337524,
"epoch": 0.22413793103448276,
"grad_norm": 0.0012645251117646694,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 78
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.4066694378852844,
"epoch": 0.22701149425287356,
"grad_norm": 0.0011590078938752413,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 79
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.44288161396980286,
"epoch": 0.22988505747126436,
"grad_norm": 0.0016866448568180203,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 121.0,
"completions/max_terminated_length": 121.0,
"completions/mean_length": 58.046875,
"completions/mean_terminated_length": 58.046875,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.49261198937892914,
"epoch": 0.23275862068965517,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.0027653626166284084,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 9526847.0,
"reward": 0.8708001375198364,
"reward_std": 0.13339784741401672,
"rewards/ngram_repetition2/mean": -2.26501560973702e-05,
"rewards/ngram_repetition2/std": 0.00040280655957758427,
"rewards/ngram_repetition3/mean": -4.4448628614190966e-05,
"rewards/ngram_repetition3/std": 0.0010057577164843678,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.845703125,
"rewards/symbolic_reward_accuracy/std": 0.36158639192581177,
"rewards/symbolic_reward_partial_score/mean": 0.9293619394302368,
"rewards/symbolic_reward_partial_score/std": 0.23605132102966309,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1911598443984985,
"sampling/importance_sampling_ratio/min": 0.00648106262087822,
"sampling/sampling_logp_difference/max": 5.038870811462402,
"sampling/sampling_logp_difference/mean": 0.23612119257450104,
"step": 81
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.441509410738945,
"epoch": 0.23563218390804597,
"grad_norm": 0.004880265332758427,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 82
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.4431619942188263,
"epoch": 0.23850574712643677,
"grad_norm": 0.0035018131602555513,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 83
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.4725227504968643,
"epoch": 0.2413793103448276,
"grad_norm": 0.0032173239160329103,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 132.0,
"completions/max_terminated_length": 132.0,
"completions/mean_length": 56.25390625,
"completions/mean_terminated_length": 56.25390625,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.4439745992422104,
"epoch": 0.2442528735632184,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0022925150115042925,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 9979969.0,
"reward": 0.7895450592041016,
"reward_std": 0.11284206807613373,
"rewards/ngram_repetition2/mean": -0.0002489655453246087,
"rewards/ngram_repetition2/std": 0.005007500294595957,
"rewards/ngram_repetition3/mean": -0.00032552084303461015,
"rewards/ngram_repetition3/std": 0.007365696132183075,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.73046875,
"rewards/symbolic_reward_accuracy/std": 0.44415023922920227,
"rewards/symbolic_reward_partial_score/mean": 0.9274088144302368,
"rewards/symbolic_reward_partial_score/std": 0.17746692895889282,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.18569016456604,
"sampling/importance_sampling_ratio/min": 0.009709770791232586,
"sampling/sampling_logp_difference/max": 4.634622573852539,
"sampling/sampling_logp_difference/mean": 0.23130828142166138,
"step": 85
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.45062993466854095,
"epoch": 0.2471264367816092,
"grad_norm": 0.002243473893031478,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 86
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.48541413247585297,
"epoch": 0.25,
"grad_norm": 0.002420612843707204,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 87
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.4482133090496063,
"epoch": 0.25287356321839083,
"grad_norm": 0.0013555807527154684,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 55.689453125,
"completions/mean_terminated_length": 55.689453125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.40743130445480347,
"epoch": 0.2557471264367816,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.004098931793123484,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 10420770.0,
"reward": 0.64892578125,
"reward_std": 0.11810654401779175,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.5703125,
"rewards/symbolic_reward_accuracy/std": 0.4955156147480011,
"rewards/symbolic_reward_partial_score/mean": 0.8323567509651184,
"rewards/symbolic_reward_partial_score/std": 0.27279549837112427,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.178039789199829,
"sampling/importance_sampling_ratio/min": 0.0019275805680081248,
"sampling/sampling_logp_difference/max": 6.251489639282227,
"sampling/sampling_logp_difference/mean": 0.21803206205368042,
"step": 89
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.41286924481391907,
"epoch": 0.25862068965517243,
"grad_norm": 0.0021748561412096024,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 90
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.4246648848056793,
"epoch": 0.2614942528735632,
"grad_norm": 0.0018133048433810472,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 91
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.4122104048728943,
"epoch": 0.26436781609195403,
"grad_norm": 0.002096879994496703,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 100.0,
"completions/max_terminated_length": 100.0,
"completions/mean_length": 56.546875,
"completions/mean_terminated_length": 56.546875,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.42113952338695526,
"epoch": 0.2672413793103448,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0021440223790705204,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 10848858.0,
"reward": 0.8250000476837158,
"reward_std": 0.11331714689731598,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.791015625,
"rewards/symbolic_reward_accuracy/std": 0.40698084235191345,
"rewards/symbolic_reward_partial_score/mean": 0.904296875,
"rewards/symbolic_reward_partial_score/std": 0.24538351595401764,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1647822856903076,
"sampling/importance_sampling_ratio/min": 0.00526386359706521,
"sampling/sampling_logp_difference/max": 5.246890068054199,
"sampling/sampling_logp_difference/mean": 0.20634809136390686,
"step": 93
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.4066851735115051,
"epoch": 0.27011494252873564,
"grad_norm": 0.0021633717697113752,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 94
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3979349434375763,
"epoch": 0.27298850574712646,
"grad_norm": 0.0010904585942626,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 95
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.40239329636096954,
"epoch": 0.27586206896551724,
"grad_norm": 0.0020343970973044634,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 93.0,
"completions/max_terminated_length": 93.0,
"completions/mean_length": 55.3828125,
"completions/mean_terminated_length": 55.3828125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.41389578580856323,
"epoch": 0.27873563218390807,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.0023732734844088554,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 11286334.0,
"reward": 0.762939453125,
"reward_std": 0.13904184103012085,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.708984375,
"rewards/symbolic_reward_accuracy/std": 0.45467492938041687,
"rewards/symbolic_reward_partial_score/mean": 0.8888345956802368,
"rewards/symbolic_reward_partial_score/std": 0.23015637695789337,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1618776321411133,
"sampling/importance_sampling_ratio/min": 0.004022667650133371,
"sampling/sampling_logp_difference/max": 5.515810012817383,
"sampling/sampling_logp_difference/mean": 0.2021733969449997,
"step": 97
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.3943755477666855,
"epoch": 0.28160919540229884,
"grad_norm": 0.0024514112155884504,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 98
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.374920517206192,
"epoch": 0.28448275862068967,
"grad_norm": 0.0016932882135733962,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 99
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.41551171243190765,
"epoch": 0.28735632183908044,
"grad_norm": 0.001992169301956892,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 97.0,
"completions/max_terminated_length": 97.0,
"completions/mean_length": 56.2109375,
"completions/mean_terminated_length": 56.2109375,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.39603303372859955,
"epoch": 0.29022988505747127,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0020668664947152138,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 11720906.0,
"reward": 0.751953125,
"reward_std": 0.11652664840221405,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.701171875,
"rewards/symbolic_reward_accuracy/std": 0.45819199085235596,
"rewards/symbolic_reward_partial_score/mean": 0.8704427480697632,
"rewards/symbolic_reward_partial_score/std": 0.24692247807979584,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.166778326034546,
"sampling/importance_sampling_ratio/min": 0.005492182448506355,
"sampling/sampling_logp_difference/max": 5.204429626464844,
"sampling/sampling_logp_difference/mean": 0.20504958927631378,
"step": 101
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.42469410598278046,
"epoch": 0.29310344827586204,
"grad_norm": 0.0026059469673782587,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 102
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.4196999818086624,
"epoch": 0.2959770114942529,
"grad_norm": 0.0022485863883048296,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 103
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.4103551357984543,
"epoch": 0.2988505747126437,
"grad_norm": 0.0018114866688847542,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 58.57421875,
"completions/mean_terminated_length": 58.57421875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.46819332242012024,
"epoch": 0.3017241379310345,
"frac_reward_zero_std": 0.59375,
"grad_norm": 0.0030483694281429052,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 12168720.0,
"reward": 0.741650402545929,
"reward_std": 0.09308037161827087,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.6875,
"rewards/symbolic_reward_accuracy/std": 0.4639657139778137,
"rewards/symbolic_reward_partial_score/mean": 0.8680012822151184,
"rewards/symbolic_reward_partial_score/std": 0.25573408603668213,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1807539463043213,
"sampling/importance_sampling_ratio/min": 0.0036130903754383326,
"sampling/sampling_logp_difference/max": 5.623191833496094,
"sampling/sampling_logp_difference/mean": 0.22634021937847137,
"step": 105
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.447001576423645,
"epoch": 0.3045977011494253,
"grad_norm": 0.0014942652778699994,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 106
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.420550674200058,
"epoch": 0.3074712643678161,
"grad_norm": 0.001921923947520554,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 107
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.4742784798145294,
"epoch": 0.3103448275862069,
"grad_norm": 0.0013419737806543708,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 108.0,
"completions/max_terminated_length": 108.0,
"completions/mean_length": 58.876953125,
"completions/mean_terminated_length": 58.876953125,
"completions/min_length": 40.0,
"completions/min_terminated_length": 40.0,
"entropy": 0.4446108043193817,
"epoch": 0.3132183908045977,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.0027625032234936953,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 12607825.0,
"reward": 0.7533203363418579,
"reward_std": 0.1126197874546051,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.70703125,
"rewards/symbolic_reward_accuracy/std": 0.455569326877594,
"rewards/symbolic_reward_partial_score/mean": 0.861328125,
"rewards/symbolic_reward_partial_score/std": 0.29597893357276917,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1799287796020508,
"sampling/importance_sampling_ratio/min": 0.006384609267115593,
"sampling/sampling_logp_difference/max": 5.0538649559021,
"sampling/sampling_logp_difference/mean": 0.22297433018684387,
"step": 109
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.43454255163669586,
"epoch": 0.3160919540229885,
"grad_norm": 0.0020271167159080505,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 110
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.4507320821285248,
"epoch": 0.31896551724137934,
"grad_norm": 0.0015110382810235023,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 111
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.4569307267665863,
"epoch": 0.3218390804597701,
"grad_norm": 0.002051499206572771,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 61.46484375,
"completions/mean_terminated_length": 61.46484375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.5046162307262421,
"epoch": 0.32471264367816094,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.003583670826628804,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 13056959.0,
"reward": 0.7070310115814209,
"reward_std": 0.09193491190671921,
"rewards/ngram_repetition2/mean": -1.3152356586942915e-05,
"rewards/ngram_repetition2/std": 0.00029760386678390205,
"rewards/ngram_repetition3/mean": -1.181027982966043e-05,
"rewards/ngram_repetition3/std": 0.0002672361151780933,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.646484375,
"rewards/symbolic_reward_accuracy/std": 0.47852855920791626,
"rewards/symbolic_reward_partial_score/mean": 0.8483072519302368,
"rewards/symbolic_reward_partial_score/std": 0.29286909103393555,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1988115310668945,
"sampling/importance_sampling_ratio/min": 0.0016131963348016143,
"sampling/sampling_logp_difference/max": 6.429537773132324,
"sampling/sampling_logp_difference/mean": 0.24746036529541016,
"step": 113
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1015625,
"entropy": 0.49996377527713776,
"epoch": 0.3275862068965517,
"grad_norm": 0.001245135790668428,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 114
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.5030421316623688,
"epoch": 0.33045977011494254,
"grad_norm": 0.003247750224545598,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 115
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.5156670063734055,
"epoch": 0.3333333333333333,
"grad_norm": 0.00133526383433491,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 10776.0,
"completions/max_terminated_length": 10776.0,
"completions/mean_length": 86.06640625,
"completions/mean_terminated_length": 86.06640625,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 0.5009976476430893,
"epoch": 0.33620689655172414,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.009771936573088169,
"learning_rate": 1e-05,
"loss": 0.0198,
"num_tokens": 13515681.0,
"reward": 0.8136388063430786,
"reward_std": 0.09995077550411224,
"rewards/ngram_repetition2/mean": -0.001687212847173214,
"rewards/ngram_repetition2/std": 0.03696763888001442,
"rewards/ngram_repetition3/mean": -0.0016231336630880833,
"rewards/ngram_repetition3/std": 0.036507681012153625,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.7734375,
"rewards/symbolic_reward_accuracy/std": 0.4190165400505066,
"rewards/symbolic_reward_partial_score/mean": 0.908203125,
"rewards/symbolic_reward_partial_score/std": 0.21997693181037903,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2038259506225586,
"sampling/importance_sampling_ratio/min": 8.807555423118174e-05,
"sampling/sampling_logp_difference/max": 9.337315559387207,
"sampling/sampling_logp_difference/mean": 0.246160626411438,
"step": 117
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.5377438366413116,
"epoch": 0.3390804597701149,
"grad_norm": 0.0017847833223640919,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 118
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.5264367163181305,
"epoch": 0.34195402298850575,
"grad_norm": 0.003602989250794053,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 119
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.5324976742267609,
"epoch": 0.3448275862068966,
"grad_norm": 0.002521625952795148,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 170.0,
"completions/max_terminated_length": 170.0,
"completions/mean_length": 70.751953125,
"completions/mean_terminated_length": 70.751953125,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.6152905225753784,
"epoch": 0.34770114942528735,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0035737582948058844,
"learning_rate": 1e-05,
"loss": -0.0,
"num_tokens": 13938850.0,
"reward": 0.9204086065292358,
"reward_std": 0.13115757703781128,
"rewards/ngram_repetition2/mean": -0.0001035748136928305,
"rewards/ngram_repetition2/std": 0.0023436304181814194,
"rewards/ngram_repetition3/mean": -5.3146257414482534e-05,
"rewards/ngram_repetition3/std": 0.001106699462980032,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.8984375,
"rewards/symbolic_reward_accuracy/std": 0.30236753821372986,
"rewards/symbolic_reward_partial_score/mean": 0.9716796875,
"rewards/symbolic_reward_partial_score/std": 0.1532812863588333,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2282681465148926,
"sampling/importance_sampling_ratio/min": 0.0026613196823745966,
"sampling/sampling_logp_difference/max": 5.928933143615723,
"sampling/sampling_logp_difference/mean": 0.27953964471817017,
"step": 121
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.6114228069782257,
"epoch": 0.3505747126436782,
"grad_norm": 0.0028497313614934683,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.6111920773983002,
"epoch": 0.35344827586206895,
"grad_norm": 0.0034871206153184175,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 123
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.5921016335487366,
"epoch": 0.3563218390804598,
"grad_norm": 0.0030272386502474546,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 166.0,
"completions/max_terminated_length": 166.0,
"completions/mean_length": 73.875,
"completions/mean_terminated_length": 73.875,
"completions/min_length": 43.0,
"completions/min_terminated_length": 43.0,
"entropy": 0.5922946929931641,
"epoch": 0.35919540229885055,
"frac_reward_zero_std": 0.4375,
"grad_norm": 0.005635711830109358,
"learning_rate": 1e-05,
"loss": 0.0008,
"num_tokens": 14404162.0,
"reward": 0.766205906867981,
"reward_std": 0.08414055407047272,
"rewards/ngram_repetition2/mean": -0.0003033262328244746,
"rewards/ngram_repetition2/std": 0.004093645140528679,
"rewards/ngram_repetition3/mean": -0.000197924004169181,
"rewards/ngram_repetition3/std": 0.0027138942386955023,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.716796875,
"rewards/symbolic_reward_accuracy/std": 0.4509948492050171,
"rewards/symbolic_reward_partial_score/mean": 0.8815103769302368,
"rewards/symbolic_reward_partial_score/std": 0.22402197122573853,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.238568663597107,
"sampling/importance_sampling_ratio/min": 0.0008153519011102617,
"sampling/sampling_logp_difference/max": 7.11189079284668,
"sampling/sampling_logp_difference/mean": 0.29386207461357117,
"step": 125
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.6237926781177521,
"epoch": 0.3620689655172414,
"grad_norm": 0.0012159041361883283,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 126
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.6169875264167786,
"epoch": 0.3649425287356322,
"grad_norm": 0.0013761859154328704,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 127
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.6234523355960846,
"epoch": 0.367816091954023,
"grad_norm": 0.0022137663327157497,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 72.220703125,
"completions/mean_terminated_length": 72.220703125,
"completions/min_length": 41.0,
"completions/min_terminated_length": 41.0,
"entropy": 0.642326831817627,
"epoch": 0.3706896551724138,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.004551479127258062,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 14868307.0,
"reward": 0.7379156351089478,
"reward_std": 0.1207510381937027,
"rewards/ngram_repetition2/mean": -0.0011725000804290175,
"rewards/ngram_repetition2/std": 0.011009343899786472,
"rewards/ngram_repetition3/mean": -0.001207483932375908,
"rewards/ngram_repetition3/std": 0.011307465843856335,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.677734375,
"rewards/symbolic_reward_accuracy/std": 0.46780112385749817,
"rewards/symbolic_reward_partial_score/mean": 0.87841796875,
"rewards/symbolic_reward_partial_score/std": 0.2543152868747711,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2485902309417725,
"sampling/importance_sampling_ratio/min": 0.005005154758691788,
"sampling/sampling_logp_difference/max": 5.2972869873046875,
"sampling/sampling_logp_difference/mean": 0.30063968896865845,
"step": 129
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.6359215378761292,
"epoch": 0.3735632183908046,
"grad_norm": 0.003405208932235837,
"learning_rate": 1e-05,
"loss": 0.0007,
"step": 130
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.6392008662223816,
"epoch": 0.3764367816091954,
"grad_norm": 0.0017990770284086466,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 131
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.5917164087295532,
"epoch": 0.3793103448275862,
"grad_norm": 0.0019928663969039917,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.0,
"completions/max_terminated_length": 173.0,
"completions/mean_length": 73.037109375,
"completions/mean_terminated_length": 73.037109375,
"completions/min_length": 42.0,
"completions/min_terminated_length": 42.0,
"entropy": 0.6280420422554016,
"epoch": 0.382183908045977,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.005952598061412573,
"learning_rate": 1e-05,
"loss": 0.0005,
"num_tokens": 15327014.0,
"reward": 0.7588248252868652,
"reward_std": 0.10919959098100662,
"rewards/ngram_repetition2/mean": -0.0005840057274326682,
"rewards/ngram_repetition2/std": 0.008051936514675617,
"rewards/ngram_repetition3/mean": -0.0007190246833488345,
"rewards/ngram_repetition3/std": 0.008538857102394104,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.712890625,
"rewards/symbolic_reward_accuracy/std": 0.45285552740097046,
"rewards/symbolic_reward_partial_score/mean": 0.86669921875,
"rewards/symbolic_reward_partial_score/std": 0.27006658911705017,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.249870777130127,
"sampling/importance_sampling_ratio/min": 0.0022422403562813997,
"sampling/sampling_logp_difference/max": 6.100279808044434,
"sampling/sampling_logp_difference/mean": 0.2998029589653015,
"step": 133
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.6212248504161835,
"epoch": 0.3850574712643678,
"grad_norm": 0.003057986032217741,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 134
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.6091723740100861,
"epoch": 0.3879310344827586,
"grad_norm": 0.0012814750662073493,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 135
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.6359426975250244,
"epoch": 0.39080459770114945,
"grad_norm": 0.0027223494835197926,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 129.0,
"completions/max_terminated_length": 129.0,
"completions/mean_length": 68.91015625,
"completions/mean_terminated_length": 68.91015625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.6074822843074799,
"epoch": 0.3936781609195402,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.0033188818488270044,
"learning_rate": 1e-05,
"loss": 0.0005,
"num_tokens": 15783608.0,
"reward": 0.7168921232223511,
"reward_std": 0.11317743360996246,
"rewards/ngram_repetition2/mean": -0.00015996501315385103,
"rewards/ngram_repetition2/std": 0.001963542541489005,
"rewards/ngram_repetition3/mean": -8.46942639327608e-05,
"rewards/ngram_repetition3/std": 0.001818765769712627,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.646484375,
"rewards/symbolic_reward_accuracy/std": 0.47852855920791626,
"rewards/symbolic_reward_partial_score/mean": 0.8811848759651184,
"rewards/symbolic_reward_partial_score/std": 0.21943449974060059,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2335309982299805,
"sampling/importance_sampling_ratio/min": 0.00211329129524529,
"sampling/sampling_logp_difference/max": 6.15950870513916,
"sampling/sampling_logp_difference/mean": 0.281631737947464,
"step": 137
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.6190885007381439,
"epoch": 0.39655172413793105,
"grad_norm": 0.001330671482719481,
"learning_rate": 1e-05,
"loss": -0.0006,
"step": 138
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.6273294687271118,
"epoch": 0.3994252873563218,
"grad_norm": 0.001271542045287788,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 139
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.6308173537254333,
"epoch": 0.40229885057471265,
"grad_norm": 0.0031829047948122025,
"learning_rate": 1e-05,
"loss": 0.0006,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 177.0,
"completions/max_terminated_length": 177.0,
"completions/mean_length": 68.35546875,
"completions/mean_terminated_length": 68.35546875,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.6132214367389679,
"epoch": 0.4051724137931034,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.003619483206421137,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 16214894.0,
"reward": 0.9032166004180908,
"reward_std": 0.11951880156993866,
"rewards/ngram_repetition2/mean": -0.00033016284578479826,
"rewards/ngram_repetition2/std": 0.005208797287195921,
"rewards/ngram_repetition3/mean": -0.00027478154515847564,
"rewards/ngram_repetition3/std": 0.004721499979496002,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.884765625,
"rewards/symbolic_reward_accuracy/std": 0.3196168541908264,
"rewards/symbolic_reward_partial_score/mean": 0.9462890625,
"rewards/symbolic_reward_partial_score/std": 0.1916707307100296,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2291423082351685,
"sampling/importance_sampling_ratio/min": 0.003642383264377713,
"sampling/sampling_logp_difference/max": 5.615117073059082,
"sampling/sampling_logp_difference/mean": 0.28253355622291565,
"step": 141
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.6177562177181244,
"epoch": 0.40804597701149425,
"grad_norm": 0.002994804410263896,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 142
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.6419652104377747,
"epoch": 0.4109195402298851,
"grad_norm": 0.00205876212567091,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 143
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.6255612373352051,
"epoch": 0.41379310344827586,
"grad_norm": 0.0019418266601860523,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 125.0,
"completions/max_terminated_length": 125.0,
"completions/mean_length": 67.984375,
"completions/mean_terminated_length": 67.984375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.6153721511363983,
"epoch": 0.4166666666666667,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.0032779767643660307,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 16680358.0,
"reward": 0.7255349159240723,
"reward_std": 0.09427813440561295,
"rewards/ngram_repetition2/mean": -5.109919948154129e-05,
"rewards/ngram_repetition2/std": 0.0011562429135665298,
"rewards/ngram_repetition3/mean": -0.00016613237676210701,
"rewards/ngram_repetition3/std": 0.0023371358402073383,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.65234375,
"rewards/symbolic_reward_accuracy/std": 0.47669193148612976,
"rewards/symbolic_reward_partial_score/mean": 0.8963215947151184,
"rewards/symbolic_reward_partial_score/std": 0.2021331936120987,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2260642051696777,
"sampling/importance_sampling_ratio/min": 0.0031624305993318558,
"sampling/sampling_logp_difference/max": 5.756414413452148,
"sampling/sampling_logp_difference/mean": 0.2714301347732544,
"step": 145
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.5929703116416931,
"epoch": 0.41954022988505746,
"grad_norm": 0.0029908197466284037,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 146
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.5892519950866699,
"epoch": 0.4224137931034483,
"grad_norm": 0.0029139199759811163,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 147
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.6122852563858032,
"epoch": 0.42528735632183906,
"grad_norm": 0.0019851247780025005,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.001953125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 151.0,
"completions/mean_length": 98.384765625,
"completions/mean_terminated_length": 66.51467895507812,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.5971274673938751,
"epoch": 0.4281609195402299,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.005040820688009262,
"learning_rate": 1e-05,
"loss": -0.0004,
"num_tokens": 17148555.0,
"reward": 0.7448210120201111,
"reward_std": 0.1062232255935669,
"rewards/ngram_repetition2/mean": -0.0026194120291620493,
"rewards/ngram_repetition2/std": 0.042374733835458755,
"rewards/ngram_repetition3/mean": -0.002586992457509041,
"rewards/ngram_repetition3/std": 0.04207787662744522,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.697265625,
"rewards/symbolic_reward_accuracy/std": 0.45989060401916504,
"rewards/symbolic_reward_partial_score/mean": 0.85595703125,
"rewards/symbolic_reward_partial_score/std": 0.25713518261909485,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2251850366592407,
"sampling/importance_sampling_ratio/min": 0.0008897649240680039,
"sampling/sampling_logp_difference/max": 7.024553298950195,
"sampling/sampling_logp_difference/mean": 0.27212345600128174,
"step": 149
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.5970511436462402,
"epoch": 0.43103448275862066,
"grad_norm": 0.0023280696477741003,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 150
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.6188189387321472,
"epoch": 0.4339080459770115,
"grad_norm": 0.003590863663703203,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 151
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.619499146938324,
"epoch": 0.4367816091954023,
"grad_norm": 0.0035510375164449215,
"learning_rate": 1e-05,
"loss": 0.0295,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0078125,
"completions/max_length": 16384.0,
"completions/max_terminated_length": 3966.0,
"completions/mean_length": 199.205078125,
"completions/mean_terminated_length": 71.7657470703125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.5717917680740356,
"epoch": 0.4396551724137931,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.004968932364135981,
"learning_rate": 1e-05,
"loss": -0.001,
"num_tokens": 17665524.0,
"reward": 0.7633627653121948,
"reward_std": 0.09578961879014969,
"rewards/ngram_repetition2/mean": -0.010589256882667542,
"rewards/ngram_repetition2/std": 0.09524083882570267,
"rewards/ngram_repetition3/mean": -0.010552708059549332,
"rewards/ngram_repetition3/std": 0.0955950990319252,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.712890625,
"rewards/symbolic_reward_accuracy/std": 0.45285552740097046,
"rewards/symbolic_reward_partial_score/mean": 0.8818359375,
"rewards/symbolic_reward_partial_score/std": 0.2471448928117752,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1987650394439697,
"sampling/importance_sampling_ratio/min": 0.002070514252409339,
"sampling/sampling_logp_difference/max": 6.179958343505859,
"sampling/sampling_logp_difference/mean": 0.23549169301986694,
"step": 153
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.57914999127388,
"epoch": 0.4425287356321839,
"grad_norm": 0.0037953564897179604,
"learning_rate": 1e-05,
"loss": 0.0279,
"step": 154
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.6249216198921204,
"epoch": 0.4454022988505747,
"grad_norm": 0.0024858491960912943,
"learning_rate": 1e-05,
"loss": 0.0066,
"step": 155
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.5942817330360413,
"epoch": 0.4482758620689655,
"grad_norm": 0.005163618829101324,
"learning_rate": 1e-05,
"loss": 0.0294,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 60.4375,
"completions/mean_terminated_length": 60.4375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.5676226019859314,
"epoch": 0.4511494252873563,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.003438665997236967,
"learning_rate": 1e-05,
"loss": -0.0,
"num_tokens": 18102260.0,
"reward": 0.7589313983917236,
"reward_std": 0.0913117527961731,
"rewards/ngram_repetition2/mean": -0.00015224021626636386,
"rewards/ngram_repetition2/std": 0.0017761130584403872,
"rewards/ngram_repetition3/mean": -0.0002660206810105592,
"rewards/ngram_repetition3/std": 0.0028565367683768272,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.6953125,
"rewards/symbolic_reward_accuracy/std": 0.4607250988483429,
"rewards/symbolic_reward_partial_score/mean": 0.9073892831802368,
"rewards/symbolic_reward_partial_score/std": 0.19136381149291992,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2238843441009521,
"sampling/importance_sampling_ratio/min": 0.004014967940747738,
"sampling/sampling_logp_difference/max": 5.517725944519043,
"sampling/sampling_logp_difference/mean": 0.2637117803096771,
"step": 157
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.5621353983879089,
"epoch": 0.4540229885057471,
"grad_norm": 0.00312551436945796,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 158
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.5782199203968048,
"epoch": 0.45689655172413796,
"grad_norm": 0.0011367530096322298,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 159
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.5718804597854614,
"epoch": 0.45977011494252873,
"grad_norm": 0.0023252142127603292,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 56.208984375,
"completions/mean_terminated_length": 56.208984375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.4680989980697632,
"epoch": 0.46264367816091956,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.004628314170986414,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 18558047.0,
"reward": 0.7402796745300293,
"reward_std": 0.11135189980268478,
"rewards/ngram_repetition2/mean": -0.00023176189279183745,
"rewards/ngram_repetition2/std": 0.003024648642167449,
"rewards/ngram_repetition3/mean": -0.00011768023250624537,
"rewards/ngram_repetition3/std": 0.00259247119538486,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.6796875,
"rewards/symbolic_reward_accuracy/std": 0.4670529365539551,
"rewards/symbolic_reward_partial_score/mean": 0.8816731572151184,
"rewards/symbolic_reward_partial_score/std": 0.23182804882526398,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1849548816680908,
"sampling/importance_sampling_ratio/min": 0.0010554436594247818,
"sampling/sampling_logp_difference/max": 6.853794097900391,
"sampling/sampling_logp_difference/mean": 0.2266506552696228,
"step": 161
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.4583146572113037,
"epoch": 0.46551724137931033,
"grad_norm": 0.0011460937093943357,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 162
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.4420630782842636,
"epoch": 0.46839080459770116,
"grad_norm": 0.0012833460932597518,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 163
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.0546875,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.4771575480699539,
"epoch": 0.47126436781609193,
"grad_norm": 0.0012788056628778577,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 54.5,
"completions/mean_terminated_length": 54.5,
"completions/min_length": 6.0,
"completions/min_terminated_length": 6.0,
"entropy": 0.46562887728214264,
"epoch": 0.47413793103448276,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.0049250805750489235,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 19006783.0,
"reward": 0.7197023630142212,
"reward_std": 0.1307801455259323,
"rewards/ngram_repetition2/mean": -0.001037043984979391,
"rewards/ngram_repetition2/std": 0.012370138429105282,
"rewards/ngram_repetition3/mean": -0.0013850650284439325,
"rewards/ngram_repetition3/std": 0.013737454079091549,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.66015625,
"rewards/symbolic_reward_accuracy/std": 0.4741191864013672,
"rewards/symbolic_reward_partial_score/mean": 0.8645833134651184,
"rewards/symbolic_reward_partial_score/std": 0.25641369819641113,
"rewards/tag_count_reward/mean": -0.017578125,
"rewards/tag_count_reward/std": 0.13154059648513794,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1873273849487305,
"sampling/importance_sampling_ratio/min": 0.0031211727764457464,
"sampling/sampling_logp_difference/max": 5.7695465087890625,
"sampling/sampling_logp_difference/mean": 0.22203099727630615,
"step": 165
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.4372178316116333,
"epoch": 0.47701149425287354,
"grad_norm": 0.0017901709070429206,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 166
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1015625,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.43625424802303314,
"epoch": 0.47988505747126436,
"grad_norm": 0.0014502947451546788,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 167
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.4362707883119583,
"epoch": 0.4827586206896552,
"grad_norm": 0.002049398375675082,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11593.0,
"completions/max_terminated_length": 11593.0,
"completions/mean_length": 79.595703125,
"completions/mean_terminated_length": 79.595703125,
"completions/min_length": 23.0,
"completions/min_terminated_length": 23.0,
"entropy": 0.4336452931165695,
"epoch": 0.48563218390804597,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.001363643677905202,
"learning_rate": 1e-05,
"loss": 0.0174,
"num_tokens": 19437648.0,
"reward": 0.7321650981903076,
"reward_std": 0.10649190843105316,
"rewards/ngram_repetition2/mean": -0.0030163757037371397,
"rewards/ngram_repetition2/std": 0.04440012574195862,
"rewards/ngram_repetition3/mean": -0.003132551908493042,
"rewards/ngram_repetition3/std": 0.04444865137338638,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.677734375,
"rewards/symbolic_reward_accuracy/std": 0.46780112385749817,
"rewards/symbolic_reward_partial_score/mean": 0.861328125,
"rewards/symbolic_reward_partial_score/std": 0.2618214786052704,
"rewards/tag_count_reward/mean": -0.005859375,
"rewards/tag_count_reward/std": 0.07639661431312561,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1761358976364136,
"sampling/importance_sampling_ratio/min": 0.0009067317587323487,
"sampling/sampling_logp_difference/max": 7.005663871765137,
"sampling/sampling_logp_difference/mean": 0.20624347031116486,
"step": 169
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.4565078616142273,
"epoch": 0.4885057471264368,
"grad_norm": 0.003753960132598877,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 170
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.4344637244939804,
"epoch": 0.49137931034482757,
"grad_norm": 0.004187124315649271,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 171
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.296875,
"entropy": 0.4768785983324051,
"epoch": 0.4942528735632184,
"grad_norm": 0.0035138626117259264,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 92.0,
"completions/max_terminated_length": 92.0,
"completions/mean_length": 55.111328125,
"completions/mean_terminated_length": 55.111328125,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"entropy": 0.4357730895280838,
"epoch": 0.49712643678160917,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.004127317573875189,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 19877673.0,
"reward": 0.6550204753875732,
"reward_std": 0.09477680921554565,
"rewards/ngram_repetition2/mean": -0.000270573771558702,
"rewards/ngram_repetition2/std": 0.0034710762556642294,
"rewards/ngram_repetition3/mean": -0.0006142433849163353,
"rewards/ngram_repetition3/std": 0.006228008773177862,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.587890625,
"rewards/symbolic_reward_accuracy/std": 0.49269601702690125,
"rewards/symbolic_reward_partial_score/mean": 0.8142903447151184,
"rewards/symbolic_reward_partial_score/std": 0.26848551630973816,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1787736415863037,
"sampling/importance_sampling_ratio/min": 0.002628255868330598,
"sampling/sampling_logp_difference/max": 5.941434860229492,
"sampling/sampling_logp_difference/mean": 0.2137116938829422,
"step": 173
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.42396125197410583,
"epoch": 0.5,
"grad_norm": 0.001028429134748876,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 174
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.4160853624343872,
"epoch": 0.5028735632183908,
"grad_norm": 0.0022454692516475916,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 175
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.457041934132576,
"epoch": 0.5057471264367817,
"grad_norm": 0.001310663647018373,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 6001.0,
"completions/max_terminated_length": 6001.0,
"completions/mean_length": 63.87109375,
"completions/mean_terminated_length": 63.87109375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.3979407250881195,
"epoch": 0.5086206896551724,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.0026170641649514437,
"learning_rate": 1e-05,
"loss": -0.0004,
"num_tokens": 20334055.0,
"reward": 0.754780113697052,
"reward_std": 0.11056315898895264,
"rewards/ngram_repetition2/mean": -0.0019408154767006636,
"rewards/ngram_repetition2/std": 0.043002985417842865,
"rewards/ngram_repetition3/mean": -0.003445713547989726,
"rewards/ngram_repetition3/std": 0.04354758933186531,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.697265625,
"rewards/symbolic_reward_accuracy/std": 0.45989060401916504,
"rewards/symbolic_reward_partial_score/mean": 0.8898111581802368,
"rewards/symbolic_reward_partial_score/std": 0.21633607149124146,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1619974374771118,
"sampling/importance_sampling_ratio/min": 0.004802882205694914,
"sampling/sampling_logp_difference/max": 5.338539123535156,
"sampling/sampling_logp_difference/mean": 0.20070970058441162,
"step": 177
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.3868762254714966,
"epoch": 0.5114942528735632,
"grad_norm": 0.002501050941646099,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 178
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.40972326695919037,
"epoch": 0.514367816091954,
"grad_norm": 0.00537771126255393,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 179
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.3921160250902176,
"epoch": 0.5172413793103449,
"grad_norm": 0.0022035855799913406,
"learning_rate": 1e-05,
"loss": 0.007,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 11699.0,
"completions/max_terminated_length": 11699.0,
"completions/mean_length": 75.41015625,
"completions/mean_terminated_length": 75.41015625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.3657400608062744,
"epoch": 0.5201149425287356,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.004235206637531519,
"learning_rate": 1e-05,
"loss": 0.0004,
"num_tokens": 20765945.0,
"reward": 0.7215181589126587,
"reward_std": 0.15164119005203247,
"rewards/ngram_repetition2/mean": -0.0021292921155691147,
"rewards/ngram_repetition2/std": 0.042043447494506836,
"rewards/ngram_repetition3/mean": -0.004256935324519873,
"rewards/ngram_repetition3/std": 0.04291826859116554,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.6640625,
"rewards/symbolic_reward_accuracy/std": 0.4727790653705597,
"rewards/symbolic_reward_partial_score/mean": 0.8564453125,
"rewards/symbolic_reward_partial_score/std": 0.25408750772476196,
"rewards/tag_count_reward/mean": -0.001953125,
"rewards/tag_count_reward/std": 0.04419417306780815,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.154313325881958,
"sampling/importance_sampling_ratio/min": 0.004486561752855778,
"sampling/sampling_logp_difference/max": 5.406668663024902,
"sampling/sampling_logp_difference/mean": 0.19018931686878204,
"step": 181
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3666221499443054,
"epoch": 0.5229885057471264,
"grad_norm": 0.014725334011018276,
"learning_rate": 1e-05,
"loss": 0.0176,
"step": 182
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.1640625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.39625655114650726,
"epoch": 0.5258620689655172,
"grad_norm": 0.00213233451358974,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 183
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.078125,
"clip_ratio/low_mean": 0.1875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3968651592731476,
"epoch": 0.5287356321839081,
"grad_norm": 0.0036237018648535013,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 86.0,
"completions/max_terminated_length": 86.0,
"completions/mean_length": 51.23828125,
"completions/mean_terminated_length": 51.23828125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.36812902987003326,
"epoch": 0.5316091954022989,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.004530012607574463,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 21201139.0,
"reward": 0.6796539425849915,
"reward_std": 0.1844376027584076,
"rewards/ngram_repetition2/mean": -0.005348730832338333,
"rewards/ngram_repetition2/std": 0.026414524763822556,
"rewards/ngram_repetition3/mean": -0.012658631429076195,
"rewards/ngram_repetition3/std": 0.03263530880212784,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.609375,
"rewards/symbolic_reward_accuracy/std": 0.48836761713027954,
"rewards/symbolic_reward_partial_score/mean": 0.8468424081802368,
"rewards/symbolic_reward_partial_score/std": 0.24985168874263763,
"rewards/tag_count_reward/mean": -0.0078125,
"rewards/tag_count_reward/std": 0.08812850713729858,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.15095853805542,
"sampling/importance_sampling_ratio/min": 0.0017468180740252137,
"sampling/sampling_logp_difference/max": 6.349959373474121,
"sampling/sampling_logp_difference/mean": 0.1853640079498291,
"step": 185
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.2734375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.3317708671092987,
"epoch": 0.5344827586206896,
"grad_norm": 0.00515081686899066,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 186
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.28125,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.3512876033782959,
"epoch": 0.5373563218390804,
"grad_norm": 0.0033993138931691647,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 187
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.358001247048378,
"epoch": 0.5402298850574713,
"grad_norm": 0.0027523390017449856,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 84.0,
"completions/max_terminated_length": 84.0,
"completions/mean_length": 53.28125,
"completions/mean_terminated_length": 53.28125,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.37513381242752075,
"epoch": 0.5431034482758621,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.003202933119609952,
"learning_rate": 1e-05,
"loss": 0.0005,
"num_tokens": 21661603.0,
"reward": 0.660548985004425,
"reward_std": 0.18187561631202698,
"rewards/ngram_repetition2/mean": -0.011021820828318596,
"rewards/ngram_repetition2/std": 0.03202351555228233,
"rewards/ngram_repetition3/mean": -0.018067045137286186,
"rewards/ngram_repetition3/std": 0.03626594319939613,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.587890625,
"rewards/symbolic_reward_accuracy/std": 0.49269601702690125,
"rewards/symbolic_reward_partial_score/mean": 0.8310546875,
"rewards/symbolic_reward_partial_score/std": 0.25592610239982605,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1602153778076172,
"sampling/importance_sampling_ratio/min": 0.003239927114918828,
"sampling/sampling_logp_difference/max": 5.732204437255859,
"sampling/sampling_logp_difference/mean": 0.19722561538219452,
"step": 189
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3125,
"clip_ratio/low_mean": 0.125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4375,
"entropy": 0.37337982654571533,
"epoch": 0.5459770114942529,
"grad_norm": 0.0023845031391829252,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 190
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.2421875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4140625,
"entropy": 0.3779737800359726,
"epoch": 0.5488505747126436,
"grad_norm": 0.002789959777146578,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 191
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.359375,
"entropy": 0.38622401654720306,
"epoch": 0.5517241379310345,
"grad_norm": 0.003167940303683281,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 77.0,
"completions/max_terminated_length": 77.0,
"completions/mean_length": 51.712890625,
"completions/mean_terminated_length": 51.712890625,
"completions/min_length": 31.0,
"completions/min_terminated_length": 31.0,
"entropy": 0.36182406544685364,
"epoch": 0.5545977011494253,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.00419881846755743,
"learning_rate": 1e-05,
"loss": 0.0004,
"num_tokens": 22103056.0,
"reward": 0.6468621492385864,
"reward_std": 0.18368908762931824,
"rewards/ngram_repetition2/mean": -0.008758383803069592,
"rewards/ngram_repetition2/std": 0.02697998657822609,
"rewards/ngram_repetition3/mean": -0.016943732276558876,
"rewards/ngram_repetition3/std": 0.03183072432875633,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.56640625,
"rewards/symbolic_reward_accuracy/std": 0.4960552453994751,
"rewards/symbolic_reward_partial_score/mean": 0.83544921875,
"rewards/symbolic_reward_partial_score/std": 0.24576953053474426,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1562879085540771,
"sampling/importance_sampling_ratio/min": 0.004593458957970142,
"sampling/sampling_logp_difference/max": 5.383121967315674,
"sampling/sampling_logp_difference/mean": 0.1905713975429535,
"step": 193
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.2578125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4453125,
"entropy": 0.3760426491498947,
"epoch": 0.5574712643678161,
"grad_norm": 0.002429383806884289,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 194
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.25,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4765625,
"entropy": 0.36310867965221405,
"epoch": 0.5603448275862069,
"grad_norm": 0.0015612218994647264,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 195
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.4609375,
"entropy": 0.3646097779273987,
"epoch": 0.5632183908045977,
"grad_norm": 0.0017141635762527585,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.0,
"completions/max_terminated_length": 79.0,
"completions/mean_length": 49.515625,
"completions/mean_terminated_length": 49.515625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.3320048302412033,
"epoch": 0.5660919540229885,
"frac_reward_zero_std": 0.03125,
"grad_norm": 0.0032437844201922417,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 22549560.0,
"reward": 0.7433836460113525,
"reward_std": 0.15558896958827972,
"rewards/ngram_repetition2/mean": -0.0033277005422860384,
"rewards/ngram_repetition2/std": 0.01725313626229763,
"rewards/ngram_repetition3/mean": -0.004009343683719635,
"rewards/ngram_repetition3/std": 0.01900562271475792,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.689453125,
"rewards/symbolic_reward_accuracy/std": 0.46317005157470703,
"rewards/symbolic_reward_partial_score/mean": 0.8694661259651184,
"rewards/symbolic_reward_partial_score/std": 0.24413488805294037,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1475906372070312,
"sampling/importance_sampling_ratio/min": 0.003707862924784422,
"sampling/sampling_logp_difference/max": 5.597299575805664,
"sampling/sampling_logp_difference/mean": 0.16953125596046448,
"step": 197
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3272063732147217,
"epoch": 0.5689655172413793,
"grad_norm": 0.0022684920113533735,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 198
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.390625,
"entropy": 0.3217940479516983,
"epoch": 0.5718390804597702,
"grad_norm": 0.0012801972916349769,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 199
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.34869106113910675,
"epoch": 0.5747126436781609,
"grad_norm": 0.0017063482664525509,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 50.125,
"completions/mean_terminated_length": 50.125,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"entropy": 0.3502783924341202,
"epoch": 0.5775862068965517,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.0024014730006456375,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 23005560.0,
"reward": 0.6850454807281494,
"reward_std": 0.1775665283203125,
"rewards/ngram_repetition2/mean": -0.0029550609178841114,
"rewards/ngram_repetition2/std": 0.015238610096275806,
"rewards/ngram_repetition3/mean": -0.0032362965866923332,
"rewards/ngram_repetition3/std": 0.016220975667238235,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.607421875,
"rewards/symbolic_reward_accuracy/std": 0.4888018071651459,
"rewards/symbolic_reward_partial_score/mean": 0.8663736581802368,
"rewards/symbolic_reward_partial_score/std": 0.2243141382932663,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1440577507019043,
"sampling/importance_sampling_ratio/min": 0.0059717451222240925,
"sampling/sampling_logp_difference/max": 5.120716094970703,
"sampling/sampling_logp_difference/mean": 0.1660330891609192,
"step": 201
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3343355357646942,
"epoch": 0.5804597701149425,
"grad_norm": 0.0018374222563579679,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 202
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.3133174031972885,
"epoch": 0.5833333333333334,
"grad_norm": 0.0014621627051383257,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 203
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1328125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.32208897173404694,
"epoch": 0.5862068965517241,
"grad_norm": 0.0016267385799437761,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 204
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 75.0,
"completions/max_terminated_length": 75.0,
"completions/mean_length": 49.19921875,
"completions/mean_terminated_length": 49.19921875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.34746497869491577,
"epoch": 0.5890804597701149,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.0025201276876032352,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 23451742.0,
"reward": 0.7662777900695801,
"reward_std": 0.15987689793109894,
"rewards/ngram_repetition2/mean": -0.0014059185050427914,
"rewards/ngram_repetition2/std": 0.009861784987151623,
"rewards/ngram_repetition3/mean": -0.001674062223173678,
"rewards/ngram_repetition3/std": 0.012159745208919048,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.70703125,
"rewards/symbolic_reward_accuracy/std": 0.455569326877594,
"rewards/symbolic_reward_partial_score/mean": 0.9046223759651184,
"rewards/symbolic_reward_partial_score/std": 0.21026679873466492,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1511237621307373,
"sampling/importance_sampling_ratio/min": 0.012976454570889473,
"sampling/sampling_logp_difference/max": 4.344618797302246,
"sampling/sampling_logp_difference/mean": 0.1697186529636383,
"step": 205
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.318589448928833,
"epoch": 0.5919540229885057,
"grad_norm": 0.0021791488397866488,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 206
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.109375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.35446639358997345,
"epoch": 0.5948275862068966,
"grad_norm": 0.002254737773910165,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 207
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.3369624614715576,
"epoch": 0.5977011494252874,
"grad_norm": 0.0016745221801102161,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 208
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 83.0,
"completions/max_terminated_length": 83.0,
"completions/mean_length": 49.271484375,
"completions/mean_terminated_length": 49.271484375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.33122432231903076,
"epoch": 0.6005747126436781,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.0019423977937549353,
"learning_rate": 1e-05,
"loss": -0.0,
"num_tokens": 23888617.0,
"reward": 0.6938353776931763,
"reward_std": 0.15832862257957458,
"rewards/ngram_repetition2/mean": -0.003065573051571846,
"rewards/ngram_repetition2/std": 0.01647561974823475,
"rewards/ngram_repetition3/mean": -0.0030463775619864464,
"rewards/ngram_repetition3/std": 0.016763268038630486,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.625,
"rewards/symbolic_reward_accuracy/std": 0.4845963716506958,
"rewards/symbolic_reward_partial_score/mean": 0.8546549081802368,
"rewards/symbolic_reward_partial_score/std": 0.24149714410305023,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1502861976623535,
"sampling/importance_sampling_ratio/min": 0.010833137668669224,
"sampling/sampling_logp_difference/max": 4.525145530700684,
"sampling/sampling_logp_difference/mean": 0.17161469161510468,
"step": 209
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.3186955451965332,
"epoch": 0.603448275862069,
"grad_norm": 0.0018531373934820294,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 210
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.31654396653175354,
"epoch": 0.6063218390804598,
"grad_norm": 0.0016878793248906732,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 211
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3515625,
"entropy": 0.33084215223789215,
"epoch": 0.6091954022988506,
"grad_norm": 0.00255804555490613,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 107.0,
"completions/max_terminated_length": 107.0,
"completions/mean_length": 50.869140625,
"completions/mean_terminated_length": 50.869140625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.35066109895706177,
"epoch": 0.6120689655172413,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004320996347814798,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 24329318.0,
"reward": 0.8296270966529846,
"reward_std": 0.12831971049308777,
"rewards/ngram_repetition2/mean": -0.004883305169641972,
"rewards/ngram_repetition2/std": 0.02222960814833641,
"rewards/ngram_repetition3/mean": -0.0060415808111429214,
"rewards/ngram_repetition3/std": 0.02252221666276455,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.787109375,
"rewards/symbolic_reward_accuracy/std": 0.409751296043396,
"rewards/symbolic_reward_partial_score/mean": 0.92919921875,
"rewards/symbolic_reward_partial_score/std": 0.1828947216272354,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1664955615997314,
"sampling/importance_sampling_ratio/min": 0.010721365921199322,
"sampling/sampling_logp_difference/max": 4.535516738891602,
"sampling/sampling_logp_difference/mean": 0.17997923493385315,
"step": 213
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2734375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.375,
"entropy": 0.3535380959510803,
"epoch": 0.6149425287356322,
"grad_norm": 0.0023528970777988434,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 214
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.40625,
"entropy": 0.36174456775188446,
"epoch": 0.617816091954023,
"grad_norm": 0.0027345679700374603,
"learning_rate": 1e-05,
"loss": 0.0007,
"step": 215
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3671875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.421875,
"entropy": 0.3348846435546875,
"epoch": 0.6206896551724138,
"grad_norm": 0.0029091956093907356,
"learning_rate": 1e-05,
"loss": -0.0006,
"step": 216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 75.0,
"completions/max_terminated_length": 75.0,
"completions/mean_length": 48.234375,
"completions/mean_terminated_length": 48.234375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.3045446276664734,
"epoch": 0.6235632183908046,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.0026498916558921337,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 24790206.0,
"reward": 0.7498259544372559,
"reward_std": 0.17582294344902039,
"rewards/ngram_repetition2/mean": -0.001370269455946982,
"rewards/ngram_repetition2/std": 0.00850055180490017,
"rewards/ngram_repetition3/mean": -0.0013840529136359692,
"rewards/ngram_repetition3/std": 0.008392706513404846,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.689453125,
"rewards/symbolic_reward_accuracy/std": 0.46317005157470703,
"rewards/symbolic_reward_partial_score/mean": 0.8907877206802368,
"rewards/symbolic_reward_partial_score/std": 0.21418313682079315,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1454708576202393,
"sampling/importance_sampling_ratio/min": 0.005403801798820496,
"sampling/sampling_logp_difference/max": 5.2206525802612305,
"sampling/sampling_logp_difference/mean": 0.16215276718139648,
"step": 217
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.25,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.34375,
"entropy": 0.31783026456832886,
"epoch": 0.6264367816091954,
"grad_norm": 0.0027007993776351213,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 218
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.2987368553876877,
"epoch": 0.6293103448275862,
"grad_norm": 0.0020821229554712772,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 219
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1171875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3359375,
"entropy": 0.31727874279022217,
"epoch": 0.632183908045977,
"grad_norm": 0.0024530631490051746,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 48.341796875,
"completions/mean_terminated_length": 48.341796875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.29757460951805115,
"epoch": 0.6350574712643678,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.002699502045288682,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 25245613.0,
"reward": 0.7274583578109741,
"reward_std": 0.12696774303913116,
"rewards/ngram_repetition2/mean": -0.0015846589813008904,
"rewards/ngram_repetition2/std": 0.010053995065391064,
"rewards/ngram_repetition3/mean": -0.0016071019927039742,
"rewards/ngram_repetition3/std": 0.009995999746024609,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.66015625,
"rewards/symbolic_reward_accuracy/std": 0.4741191864013672,
"rewards/symbolic_reward_partial_score/mean": 0.8846029043197632,
"rewards/symbolic_reward_partial_score/std": 0.21321582794189453,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1418908834457397,
"sampling/importance_sampling_ratio/min": 0.003111095167696476,
"sampling/sampling_logp_difference/max": 5.772780418395996,
"sampling/sampling_logp_difference/mean": 0.15935096144676208,
"step": 221
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.3056052029132843,
"epoch": 0.6379310344827587,
"grad_norm": 0.002731436863541603,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 222
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.140625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.2877749502658844,
"epoch": 0.6408045977011494,
"grad_norm": 0.0023609516210854053,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 223
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.2868718057870865,
"epoch": 0.6436781609195402,
"grad_norm": 0.002323941560462117,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 47.828125,
"completions/mean_terminated_length": 47.828125,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.3106960952281952,
"epoch": 0.646551724137931,
"frac_reward_zero_std": 0.28125,
"grad_norm": 0.003546177176758647,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 25691093.0,
"reward": 0.7983278036117554,
"reward_std": 0.13130336999893188,
"rewards/ngram_repetition2/mean": -0.000689077889546752,
"rewards/ngram_repetition2/std": 0.006285086274147034,
"rewards/ngram_repetition3/mean": -0.0005111521459184587,
"rewards/ngram_repetition3/std": 0.004265496972948313,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.744140625,
"rewards/symbolic_reward_accuracy/std": 0.43676990270614624,
"rewards/symbolic_reward_partial_score/mean": 0.9248046875,
"rewards/symbolic_reward_partial_score/std": 0.16897927224636078,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1400320529937744,
"sampling/importance_sampling_ratio/min": 0.0077552772127091885,
"sampling/sampling_logp_difference/max": 4.859381675720215,
"sampling/sampling_logp_difference/mean": 0.1513688564300537,
"step": 225
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.30004067718982697,
"epoch": 0.6494252873563219,
"grad_norm": 0.0019598742946982384,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 226
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.29398113489151,
"epoch": 0.6522988505747126,
"grad_norm": 0.0021716589108109474,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 227
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2966929078102112,
"epoch": 0.6551724137931034,
"grad_norm": 0.0018855092348530889,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 47.287109375,
"completions/mean_terminated_length": 47.287109375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.3022087961435318,
"epoch": 0.6580459770114943,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.0015829600160941482,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 26148808.0,
"reward": 0.8019521832466125,
"reward_std": 0.11994894593954086,
"rewards/ngram_repetition2/mean": -3.474125696811825e-05,
"rewards/ngram_repetition2/std": 0.0006160694174468517,
"rewards/ngram_repetition3/mean": -5.918560782447457e-05,
"rewards/ngram_repetition3/std": 0.0013392174150794744,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.75,
"rewards/symbolic_reward_accuracy/std": 0.43343618512153625,
"rewards/symbolic_reward_partial_score/mean": 0.9231770634651184,
"rewards/symbolic_reward_partial_score/std": 0.19579437375068665,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1348984241485596,
"sampling/importance_sampling_ratio/min": 0.009495556354522705,
"sampling/sampling_logp_difference/max": 4.656931400299072,
"sampling/sampling_logp_difference/mean": 0.14863896369934082,
"step": 229
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.28135547041893005,
"epoch": 0.6609195402298851,
"grad_norm": 0.0008100973791442811,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 230
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.28573939204216003,
"epoch": 0.6637931034482759,
"grad_norm": 0.0018312670290470123,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 231
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2705778628587723,
"epoch": 0.6666666666666666,
"grad_norm": 0.002459563547745347,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 232
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 71.0,
"completions/max_terminated_length": 71.0,
"completions/mean_length": 48.6171875,
"completions/mean_terminated_length": 48.6171875,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.30483564734458923,
"epoch": 0.6695402298850575,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0017376919277012348,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 26573476.0,
"reward": 0.7558529376983643,
"reward_std": 0.10200260579586029,
"rewards/ngram_repetition2/mean": -0.0003638204070739448,
"rewards/ngram_repetition2/std": 0.004353736061602831,
"rewards/ngram_repetition3/mean": -0.00028086977545171976,
"rewards/ngram_repetition3/std": 0.005314893089234829,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.69140625,
"rewards/symbolic_reward_accuracy/std": 0.4623647928237915,
"rewards/symbolic_reward_partial_score/mean": 0.90625,
"rewards/symbolic_reward_partial_score/std": 0.1864483654499054,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1364421844482422,
"sampling/importance_sampling_ratio/min": 0.006919211242347956,
"sampling/sampling_logp_difference/max": 4.973453521728516,
"sampling/sampling_logp_difference/mean": 0.15489694476127625,
"step": 233
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.29285167157649994,
"epoch": 0.6724137931034483,
"grad_norm": 0.0008845807751640677,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 234
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0859375,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.2890046238899231,
"epoch": 0.6752873563218391,
"grad_norm": 0.002121682045981288,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 235
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.2796691060066223,
"epoch": 0.6781609195402298,
"grad_norm": 0.0009406171157024801,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.0,
"completions/max_terminated_length": 79.0,
"completions/mean_length": 52.1953125,
"completions/mean_terminated_length": 52.1953125,
"completions/min_length": 39.0,
"completions/min_terminated_length": 39.0,
"entropy": 0.29087191820144653,
"epoch": 0.6810344827586207,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0022152920719236135,
"learning_rate": 1e-05,
"loss": -0.0,
"num_tokens": 27027368.0,
"reward": 0.7920734286308289,
"reward_std": 0.08740311861038208,
"rewards/ngram_repetition2/mean": -0.0009828612674027681,
"rewards/ngram_repetition2/std": 0.0061480761505663395,
"rewards/ngram_repetition3/mean": -0.0006573445862159133,
"rewards/ngram_repetition3/std": 0.004901508800685406,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.751953125,
"rewards/symbolic_reward_accuracy/std": 0.4323015511035919,
"rewards/symbolic_reward_partial_score/mean": 0.8857421875,
"rewards/symbolic_reward_partial_score/std": 0.24903303384780884,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1425862312316895,
"sampling/importance_sampling_ratio/min": 0.0047275903634727,
"sampling/sampling_logp_difference/max": 5.354339599609375,
"sampling/sampling_logp_difference/mean": 0.15625977516174316,
"step": 237
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2970282882452011,
"epoch": 0.6839080459770115,
"grad_norm": 0.0018691563745960593,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 238
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.3026685267686844,
"epoch": 0.6867816091954023,
"grad_norm": 0.0018776139477267861,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 239
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2863199859857559,
"epoch": 0.6896551724137931,
"grad_norm": 0.001305580255575478,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.0,
"completions/max_terminated_length": 79.0,
"completions/mean_length": 50.650390625,
"completions/mean_terminated_length": 50.650390625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.2873764932155609,
"epoch": 0.6925287356321839,
"frac_reward_zero_std": 0.21875,
"grad_norm": 0.002372318645939231,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 27474613.0,
"reward": 0.7715722322463989,
"reward_std": 0.11197517067193985,
"rewards/ngram_repetition2/mean": -0.0005445921560749412,
"rewards/ngram_repetition2/std": 0.004746263846755028,
"rewards/ngram_repetition3/mean": -0.0004385022330097854,
"rewards/ngram_repetition3/std": 0.005005388054996729,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.72265625,
"rewards/symbolic_reward_accuracy/std": 0.4481254518032074,
"rewards/symbolic_reward_partial_score/mean": 0.8857421875,
"rewards/symbolic_reward_partial_score/std": 0.23940622806549072,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1354875564575195,
"sampling/importance_sampling_ratio/min": 0.01270719151943922,
"sampling/sampling_logp_difference/max": 4.36558723449707,
"sampling/sampling_logp_difference/mean": 0.1487276554107666,
"step": 241
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.285337433218956,
"epoch": 0.6954022988505747,
"grad_norm": 0.001999761676415801,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 242
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.28375518321990967,
"epoch": 0.6982758620689655,
"grad_norm": 0.0021911542862653732,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 243
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.29059699177742004,
"epoch": 0.7011494252873564,
"grad_norm": 0.001007542130537331,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 50.28125,
"completions/mean_terminated_length": 50.28125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.2930067479610443,
"epoch": 0.7040229885057471,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.0017507770098745823,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 27893637.0,
"reward": 0.8494104146957397,
"reward_std": 0.09027449786663055,
"rewards/ngram_repetition2/mean": -7.109075522748753e-05,
"rewards/ngram_repetition2/std": 0.00128466309979558,
"rewards/ngram_repetition3/mean": -0.00029177218675613403,
"rewards/ngram_repetition3/std": 0.0030360899399966,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.81640625,
"rewards/symbolic_reward_accuracy/std": 0.3875311613082886,
"rewards/symbolic_reward_partial_score/mean": 0.9264322519302368,
"rewards/symbolic_reward_partial_score/std": 0.20362317562103271,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1433947086334229,
"sampling/importance_sampling_ratio/min": 0.0034871636889874935,
"sampling/sampling_logp_difference/max": 5.658666610717773,
"sampling/sampling_logp_difference/mean": 0.1622186154127121,
"step": 245
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.109375,
"entropy": 0.3093741685152054,
"epoch": 0.7068965517241379,
"grad_norm": 0.0025900655891746283,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 246
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.303500235080719,
"epoch": 0.7097701149425287,
"grad_norm": 0.0009350689360871911,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 247
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.31663645803928375,
"epoch": 0.7126436781609196,
"grad_norm": 0.0009929202497005463,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 77.0,
"completions/max_terminated_length": 77.0,
"completions/mean_length": 53.07421875,
"completions/mean_terminated_length": 53.07421875,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.30330008268356323,
"epoch": 0.7155172413793104,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.0020502391271293163,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 28344811.0,
"reward": 0.746130108833313,
"reward_std": 0.10843535512685776,
"rewards/ngram_repetition2/mean": -0.0006376801757141948,
"rewards/ngram_repetition2/std": 0.004766174126416445,
"rewards/ngram_repetition3/mean": -0.0006113115232437849,
"rewards/ngram_repetition3/std": 0.005070660263299942,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.68359375,
"rewards/symbolic_reward_accuracy/std": 0.46552830934524536,
"rewards/symbolic_reward_partial_score/mean": 0.89208984375,
"rewards/symbolic_reward_partial_score/std": 0.22636321187019348,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1441969871520996,
"sampling/importance_sampling_ratio/min": 0.009571997448801994,
"sampling/sampling_logp_difference/max": 4.648913383483887,
"sampling/sampling_logp_difference/mean": 0.16198162734508514,
"step": 249
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.309006005525589,
"epoch": 0.7183908045977011,
"grad_norm": 0.0022520306520164013,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 250
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3168264776468277,
"epoch": 0.7212643678160919,
"grad_norm": 0.0034188460558652878,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 251
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.30942320823669434,
"epoch": 0.7241379310344828,
"grad_norm": 0.002158515155315399,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 252
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 82.0,
"completions/max_terminated_length": 82.0,
"completions/mean_length": 55.02734375,
"completions/mean_terminated_length": 55.02734375,
"completions/min_length": 34.0,
"completions/min_terminated_length": 34.0,
"entropy": 0.310179203748703,
"epoch": 0.7270114942528736,
"frac_reward_zero_std": 0.0625,
"grad_norm": 0.0023198144044727087,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 28806489.0,
"reward": 0.7360048294067383,
"reward_std": 0.11768031865358353,
"rewards/ngram_repetition2/mean": -0.001742619788274169,
"rewards/ngram_repetition2/std": 0.008961454033851624,
"rewards/ngram_repetition3/mean": -0.0012919665314257145,
"rewards/ngram_repetition3/std": 0.006052871700376272,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.673828125,
"rewards/symbolic_reward_accuracy/std": 0.4692695140838623,
"rewards/symbolic_reward_partial_score/mean": 0.8811848759651184,
"rewards/symbolic_reward_partial_score/std": 0.22542209923267365,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1476696729660034,
"sampling/importance_sampling_ratio/min": 0.006753196474164724,
"sampling/sampling_logp_difference/max": 4.997739315032959,
"sampling/sampling_logp_difference/mean": 0.16210368275642395,
"step": 253
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.32601065933704376,
"epoch": 0.7298850574712644,
"grad_norm": 0.0027695144526660442,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 254
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3046875,
"entropy": 0.3259432166814804,
"epoch": 0.7327586206896551,
"grad_norm": 0.0019364689942449331,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 255
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.1484375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.30629634857177734,
"epoch": 0.735632183908046,
"grad_norm": 0.0017040437087416649,
"learning_rate": 1e-05,
"loss": 0.0005,
"step": 256
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 76.0,
"completions/max_terminated_length": 76.0,
"completions/mean_length": 53.3359375,
"completions/mean_terminated_length": 53.3359375,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.31905922293663025,
"epoch": 0.7385057471264368,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.0018301898380741477,
"learning_rate": 1e-05,
"loss": -0.0003,
"num_tokens": 29245925.0,
"reward": 0.8677599430084229,
"reward_std": 0.13781705498695374,
"rewards/ngram_repetition2/mean": -0.0006793971406295896,
"rewards/ngram_repetition2/std": 0.007922603748738766,
"rewards/ngram_repetition3/mean": -0.0006691771559417248,
"rewards/ngram_repetition3/std": 0.006458070129156113,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.83203125,
"rewards/symbolic_reward_accuracy/std": 0.374204158782959,
"rewards/symbolic_reward_partial_score/mean": 0.951171875,
"rewards/symbolic_reward_partial_score/std": 0.1681368052959442,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1527073383331299,
"sampling/importance_sampling_ratio/min": 0.004750695079565048,
"sampling/sampling_logp_difference/max": 5.349464416503906,
"sampling/sampling_logp_difference/mean": 0.17114776372909546,
"step": 257
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.30993255972862244,
"epoch": 0.7413793103448276,
"grad_norm": 0.002674890449270606,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 258
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.3146713227033615,
"epoch": 0.7442528735632183,
"grad_norm": 0.0010953254532068968,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 259
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.30896949768066406,
"epoch": 0.7471264367816092,
"grad_norm": 0.0015775602078065276,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 80.0,
"completions/max_terminated_length": 80.0,
"completions/mean_length": 51.8671875,
"completions/mean_terminated_length": 51.8671875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.3116368055343628,
"epoch": 0.75,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.002235675696283579,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 29702817.0,
"reward": 0.8898277282714844,
"reward_std": 0.11886290460824966,
"rewards/ngram_repetition2/mean": -0.0007994142360985279,
"rewards/ngram_repetition2/std": 0.006934627424925566,
"rewards/ngram_repetition3/mean": -0.0007997690699994564,
"rewards/ngram_repetition3/std": 0.005978343077003956,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.8671875,
"rewards/symbolic_reward_accuracy/std": 0.33970388770103455,
"rewards/symbolic_reward_partial_score/mean": 0.9427083730697632,
"rewards/symbolic_reward_partial_score/std": 0.1916244775056839,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.151458740234375,
"sampling/importance_sampling_ratio/min": 0.005481026601046324,
"sampling/sampling_logp_difference/max": 5.206462860107422,
"sampling/sampling_logp_difference/mean": 0.17337773740291595,
"step": 261
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.1015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3125,
"entropy": 0.308078333735466,
"epoch": 0.7528735632183908,
"grad_norm": 0.0009869755012914538,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 262
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.30821652710437775,
"epoch": 0.7557471264367817,
"grad_norm": 0.0018410662887617946,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 263
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.31607021391391754,
"epoch": 0.7586206896551724,
"grad_norm": 0.001890576328150928,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 264
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 53.72265625,
"completions/mean_terminated_length": 53.72265625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.32434844970703125,
"epoch": 0.7614942528735632,
"frac_reward_zero_std": 0.15625,
"grad_norm": 0.005114416126161814,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 30135635.0,
"reward": 0.8375043272972107,
"reward_std": 0.11283920705318451,
"rewards/ngram_repetition2/mean": -0.002288718707859516,
"rewards/ngram_repetition2/std": 0.011774125508964062,
"rewards/ngram_repetition3/mean": -0.002161826938390732,
"rewards/ngram_repetition3/std": 0.010816823691129684,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.79296875,
"rewards/symbolic_reward_accuracy/std": 0.40557438135147095,
"rewards/symbolic_reward_partial_score/mean": 0.9415690302848816,
"rewards/symbolic_reward_partial_score/std": 0.16040416061878204,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.159005880355835,
"sampling/importance_sampling_ratio/min": 0.0024109813384711742,
"sampling/sampling_logp_difference/max": 6.027721405029297,
"sampling/sampling_logp_difference/mean": 0.17788560688495636,
"step": 265
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.32901355624198914,
"epoch": 0.764367816091954,
"grad_norm": 0.0023172965738922358,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 266
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.31494955718517303,
"epoch": 0.7672413793103449,
"grad_norm": 0.0014608411584049463,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 267
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0703125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.3259265422821045,
"epoch": 0.7701149425287356,
"grad_norm": 0.0023232330568134785,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 77.0,
"completions/max_terminated_length": 77.0,
"completions/mean_length": 51.58203125,
"completions/mean_terminated_length": 51.58203125,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.3111993819475174,
"epoch": 0.7729885057471264,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.00332874758169055,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 30564669.0,
"reward": 0.8610060214996338,
"reward_std": 0.1156771183013916,
"rewards/ngram_repetition2/mean": -0.0010844022035598755,
"rewards/ngram_repetition2/std": 0.008614128455519676,
"rewards/ngram_repetition3/mean": -0.0018316828645765781,
"rewards/ngram_repetition3/std": 0.011513025499880314,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.828125,
"rewards/symbolic_reward_accuracy/std": 0.3776407241821289,
"rewards/symbolic_reward_partial_score/mean": 0.9378255009651184,
"rewards/symbolic_reward_partial_score/std": 0.19194307923316956,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1578052043914795,
"sampling/importance_sampling_ratio/min": 0.0028162214439362288,
"sampling/sampling_logp_difference/max": 5.872359275817871,
"sampling/sampling_logp_difference/mean": 0.17754799127578735,
"step": 269
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.31480634212493896,
"epoch": 0.7758620689655172,
"grad_norm": 0.001611092360690236,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 270
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3828125,
"entropy": 0.3181573897600174,
"epoch": 0.7787356321839081,
"grad_norm": 0.0019503665389493108,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 271
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.3203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3671875,
"entropy": 0.32164546847343445,
"epoch": 0.7816091954022989,
"grad_norm": 0.00171377370133996,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 272
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 82.0,
"completions/max_terminated_length": 82.0,
"completions/mean_length": 49.7265625,
"completions/mean_terminated_length": 49.7265625,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.3150666356086731,
"epoch": 0.7844827586206896,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0014554295921698213,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 31020305.0,
"reward": 0.8767499327659607,
"reward_std": 0.10860107094049454,
"rewards/ngram_repetition2/mean": -0.00034998581395484507,
"rewards/ngram_repetition2/std": 0.004037702456116676,
"rewards/ngram_repetition3/mean": -0.0004340244340710342,
"rewards/ngram_repetition3/std": 0.003480604151263833,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.85546875,
"rewards/symbolic_reward_accuracy/std": 0.35197147727012634,
"rewards/symbolic_reward_partial_score/mean": 0.9264322519302368,
"rewards/symbolic_reward_partial_score/std": 0.21918009221553802,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.15395188331604,
"sampling/importance_sampling_ratio/min": 0.003653094405308366,
"sampling/sampling_logp_difference/max": 5.612180709838867,
"sampling/sampling_logp_difference/mean": 0.17280669510364532,
"step": 273
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.3282313942909241,
"epoch": 0.7873563218390804,
"grad_norm": 0.0014400951331481338,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 274
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.3243577927350998,
"epoch": 0.7902298850574713,
"grad_norm": 0.0014871220337226987,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 275
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2578125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3203125,
"entropy": 0.319719135761261,
"epoch": 0.7931034482758621,
"grad_norm": 0.0018327397992834449,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 276
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 78.0,
"completions/max_terminated_length": 78.0,
"completions/mean_length": 50.357421875,
"completions/mean_terminated_length": 50.357421875,
"completions/min_length": 37.0,
"completions/min_terminated_length": 37.0,
"entropy": 0.3244573473930359,
"epoch": 0.7959770114942529,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.002220498863607645,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 31451880.0,
"reward": 0.892960786819458,
"reward_std": 0.08757635951042175,
"rewards/ngram_repetition2/mean": -0.00032840867061167955,
"rewards/ngram_repetition2/std": 0.0031765031162649393,
"rewards/ngram_repetition3/mean": -0.00047303378232754767,
"rewards/ngram_repetition3/std": 0.00411380548030138,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.87109375,
"rewards/symbolic_reward_accuracy/std": 0.33542385697364807,
"rewards/symbolic_reward_partial_score/mean": 0.9440103769302368,
"rewards/symbolic_reward_partial_score/std": 0.1815319061279297,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.15175461769104,
"sampling/importance_sampling_ratio/min": 0.008527955040335655,
"sampling/sampling_logp_difference/max": 4.764405727386475,
"sampling/sampling_logp_difference/mean": 0.17153030633926392,
"step": 277
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.31425492465496063,
"epoch": 0.7988505747126436,
"grad_norm": 0.0010231384076178074,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 278
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2109375,
"entropy": 0.3150269687175751,
"epoch": 0.8017241379310345,
"grad_norm": 0.0016076903557404876,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 279
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.31596778333187103,
"epoch": 0.8045977011494253,
"grad_norm": 0.0005898877861909568,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 49.728515625,
"completions/mean_terminated_length": 49.728515625,
"completions/min_length": 38.0,
"completions/min_terminated_length": 38.0,
"entropy": 0.3190753608942032,
"epoch": 0.8074712643678161,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0013339928118512034,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 31901181.0,
"reward": 0.7412005066871643,
"reward_std": 0.08384595811367035,
"rewards/ngram_repetition2/mean": -0.0005108925397507846,
"rewards/ngram_repetition2/std": 0.004918646067380905,
"rewards/ngram_repetition3/mean": -0.0005344899836927652,
"rewards/ngram_repetition3/std": 0.004946576897054911,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.68359375,
"rewards/symbolic_reward_accuracy/std": 0.46552830934524536,
"rewards/symbolic_reward_partial_score/mean": 0.8756510019302368,
"rewards/symbolic_reward_partial_score/std": 0.22855894267559052,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1574604511260986,
"sampling/importance_sampling_ratio/min": 0.007844127714633942,
"sampling/sampling_logp_difference/max": 4.847990036010742,
"sampling/sampling_logp_difference/mean": 0.17656563222408295,
"step": 281
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.3263961225748062,
"epoch": 0.8103448275862069,
"grad_norm": 0.001344437012448907,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 282
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.31043170392513275,
"epoch": 0.8132183908045977,
"grad_norm": 0.0022367271594703197,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 283
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.30807800590991974,
"epoch": 0.8160919540229885,
"grad_norm": 0.0007553516770713031,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 284
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 76.0,
"completions/max_terminated_length": 76.0,
"completions/mean_length": 49.177734375,
"completions/mean_terminated_length": 49.177734375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.3095483332872391,
"epoch": 0.8189655172413793,
"frac_reward_zero_std": 0.1875,
"grad_norm": 0.002113133668899536,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 32350680.0,
"reward": 0.8011506795883179,
"reward_std": 0.09198400378227234,
"rewards/ngram_repetition2/mean": -0.0008190472144633532,
"rewards/ngram_repetition2/std": 0.005462025757879019,
"rewards/ngram_repetition3/mean": -0.0013033249415457249,
"rewards/ngram_repetition3/std": 0.007873849011957645,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.751953125,
"rewards/symbolic_reward_accuracy/std": 0.4323015511035919,
"rewards/symbolic_reward_partial_score/mean": 0.916015625,
"rewards/symbolic_reward_partial_score/std": 0.17717863619327545,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.157573938369751,
"sampling/importance_sampling_ratio/min": 0.0037520925980061293,
"sampling/sampling_logp_difference/max": 5.585441589355469,
"sampling/sampling_logp_difference/mean": 0.17748284339904785,
"step": 285
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.3058888614177704,
"epoch": 0.8218390804597702,
"grad_norm": 0.0015753849875181913,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 286
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2421875,
"clip_ratio/low_mean": 0.0859375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.318478599190712,
"epoch": 0.8247126436781609,
"grad_norm": 0.0023167389445006847,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 287
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.3046875,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.3984375,
"entropy": 0.30891837179660797,
"epoch": 0.8275862068965517,
"grad_norm": 0.0011400324292480946,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 288
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 47.052734375,
"completions/mean_terminated_length": 47.052734375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.31216710805892944,
"epoch": 0.8304597701149425,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0013744381722062826,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 32771219.0,
"reward": 0.8360303640365601,
"reward_std": 0.1082114726305008,
"rewards/ngram_repetition2/mean": -0.00015832216013222933,
"rewards/ngram_repetition2/std": 0.0017230219673365355,
"rewards/ngram_repetition3/mean": -0.00032279096194542944,
"rewards/ngram_repetition3/std": 0.003777548670768738,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.796875,
"rewards/symbolic_reward_accuracy/std": 0.4027182459831238,
"rewards/symbolic_reward_partial_score/mean": 0.9274088740348816,
"rewards/symbolic_reward_partial_score/std": 0.19053959846496582,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1560266017913818,
"sampling/importance_sampling_ratio/min": 0.0048217857256531715,
"sampling/sampling_logp_difference/max": 5.334610939025879,
"sampling/sampling_logp_difference/mean": 0.1788705289363861,
"step": 289
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1640625,
"entropy": 0.29697035253047943,
"epoch": 0.8333333333333334,
"grad_norm": 0.0019398077856749296,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 290
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.234375,
"entropy": 0.28635063767433167,
"epoch": 0.8362068965517241,
"grad_norm": 0.0011315299198031425,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 291
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.2855062931776047,
"epoch": 0.8390804597701149,
"grad_norm": 0.0018408946925774217,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 292
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 65.0,
"completions/max_terminated_length": 65.0,
"completions/mean_length": 47.859375,
"completions/mean_terminated_length": 47.859375,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.2754402905702591,
"epoch": 0.8419540229885057,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.002310027601197362,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 33213227.0,
"reward": 0.8567273616790771,
"reward_std": 0.1125885546207428,
"rewards/ngram_repetition2/mean": -0.0004661846614908427,
"rewards/ngram_repetition2/std": 0.005311489105224609,
"rewards/ngram_repetition3/mean": -0.0006299333763308823,
"rewards/ngram_repetition3/std": 0.006063228473067284,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.818359375,
"rewards/symbolic_reward_accuracy/std": 0.38592514395713806,
"rewards/symbolic_reward_partial_score/mean": 0.9462890625,
"rewards/symbolic_reward_partial_score/std": 0.162509486079216,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1567089557647705,
"sampling/importance_sampling_ratio/min": 0.009652514941990376,
"sampling/sampling_logp_difference/max": 4.640536785125732,
"sampling/sampling_logp_difference/mean": 0.17354559898376465,
"step": 293
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.28624600172042847,
"epoch": 0.8448275862068966,
"grad_norm": 0.0017463957192376256,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 294
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2421875,
"entropy": 0.2811366319656372,
"epoch": 0.8477011494252874,
"grad_norm": 0.001280653988942504,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 295
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.26673395931720734,
"epoch": 0.8505747126436781,
"grad_norm": 0.0017503536073490977,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 47.21484375,
"completions/mean_terminated_length": 47.21484375,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.24163145571947098,
"epoch": 0.853448275862069,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.001637010253034532,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 33664569.0,
"reward": 0.9082980751991272,
"reward_std": 0.10888613760471344,
"rewards/ngram_repetition2/mean": -9.320468234363943e-05,
"rewards/ngram_repetition2/std": 0.0017233911203220487,
"rewards/ngram_repetition3/mean": -0.00018059475405607373,
"rewards/ngram_repetition3/std": 0.002074119634926319,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.890625,
"rewards/symbolic_reward_accuracy/std": 0.31241437792778015,
"rewards/symbolic_reward_partial_score/mean": 0.9495442509651184,
"rewards/symbolic_reward_partial_score/std": 0.19056856632232666,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.144517421722412,
"sampling/importance_sampling_ratio/min": 0.0013258950784802437,
"sampling/sampling_logp_difference/max": 6.625667572021484,
"sampling/sampling_logp_difference/mean": 0.16017059981822968,
"step": 297
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.24898061156272888,
"epoch": 0.8563218390804598,
"grad_norm": 0.0015583988279104233,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 298
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.140625,
"entropy": 0.2613293379545212,
"epoch": 0.8591954022988506,
"grad_norm": 0.0013029163237661123,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 299
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.25456735491752625,
"epoch": 0.8620689655172413,
"grad_norm": 0.0016972015146166086,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 47.791015625,
"completions/mean_terminated_length": 47.791015625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.26080329716205597,
"epoch": 0.8649425287356322,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.0011522574350237846,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 34103534.0,
"reward": 0.8019988536834717,
"reward_std": 0.12343436479568481,
"rewards/ngram_repetition2/mean": -0.00018372925114817917,
"rewards/ngram_repetition2/std": 0.0026741281617432833,
"rewards/ngram_repetition3/mean": -0.0001245810417458415,
"rewards/ngram_repetition3/std": 0.001959472196176648,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.74609375,
"rewards/symbolic_reward_accuracy/std": 0.43567025661468506,
"rewards/symbolic_reward_partial_score/mean": 0.9324544072151184,
"rewards/symbolic_reward_partial_score/std": 0.16472379863262177,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1413416862487793,
"sampling/importance_sampling_ratio/min": 0.011449605226516724,
"sampling/sampling_logp_difference/max": 4.469799995422363,
"sampling/sampling_logp_difference/mean": 0.16359643638134003,
"step": 301
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.27188640832901,
"epoch": 0.867816091954023,
"grad_norm": 0.0026438417844474316,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 302
},
{
"clip_ratio/high_max": 0.5,
"clip_ratio/high_mean": 0.1171875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.25686506927013397,
"epoch": 0.8706896551724138,
"grad_norm": 0.002449192339554429,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 303
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.26332540810108185,
"epoch": 0.8735632183908046,
"grad_norm": 0.001110375509597361,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 304
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 60.0,
"completions/max_terminated_length": 60.0,
"completions/mean_length": 45.3671875,
"completions/mean_terminated_length": 45.3671875,
"completions/min_length": 33.0,
"completions/min_terminated_length": 33.0,
"entropy": 0.26433664560317993,
"epoch": 0.8764367816091954,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.0027283106464892626,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 34535562.0,
"reward": 0.8602952361106873,
"reward_std": 0.11447380483150482,
"rewards/ngram_repetition2/mean": -0.00026701093884184957,
"rewards/ngram_repetition2/std": 0.0053100138902664185,
"rewards/ngram_repetition3/mean": -0.00048113608499988914,
"rewards/ngram_repetition3/std": 0.00627403799444437,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.82421875,
"rewards/symbolic_reward_accuracy/std": 0.3810062110424042,
"rewards/symbolic_reward_partial_score/mean": 0.9444986581802368,
"rewards/symbolic_reward_partial_score/std": 0.17329798638820648,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1487350463867188,
"sampling/importance_sampling_ratio/min": 0.007460631895810366,
"sampling/sampling_logp_difference/max": 4.898115158081055,
"sampling/sampling_logp_difference/mean": 0.16636380553245544,
"step": 305
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.25559763610363007,
"epoch": 0.8793103448275862,
"grad_norm": 0.0008761414792388678,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 306
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.25650885701179504,
"epoch": 0.882183908045977,
"grad_norm": 0.0019666922744363546,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 307
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1328125,
"entropy": 0.25443021953105927,
"epoch": 0.8850574712643678,
"grad_norm": 0.0010442298371344805,
"learning_rate": 1e-05,
"loss": -0.0004,
"step": 308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 46.666015625,
"completions/mean_terminated_length": 46.666015625,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.2645321190357208,
"epoch": 0.8879310344827587,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.001618504524230957,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 34973791.0,
"reward": 0.7723127603530884,
"reward_std": 0.09737245738506317,
"rewards/ngram_repetition2/mean": -5.219543163548224e-05,
"rewards/ngram_repetition2/std": 0.0008356897160410881,
"rewards/ngram_repetition3/mean": -0.00011343907681293786,
"rewards/ngram_repetition3/std": 0.0018149681854993105,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.71875,
"rewards/symbolic_reward_accuracy/std": 0.45004892349243164,
"rewards/symbolic_reward_partial_score/mean": 0.8972981572151184,
"rewards/symbolic_reward_partial_score/std": 0.21487106382846832,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1448173522949219,
"sampling/importance_sampling_ratio/min": 0.008199482224881649,
"sampling/sampling_logp_difference/max": 4.803684234619141,
"sampling/sampling_logp_difference/mean": 0.16441011428833008,
"step": 309
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.21875,
"entropy": 0.26343706250190735,
"epoch": 0.8908045977011494,
"grad_norm": 0.0015610281843692064,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 310
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2810946851968765,
"epoch": 0.8936781609195402,
"grad_norm": 0.00144387932959944,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 311
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2265625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.265625,
"entropy": 0.2614471912384033,
"epoch": 0.896551724137931,
"grad_norm": 0.0014948392054066062,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 312
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 62.0,
"completions/max_terminated_length": 62.0,
"completions/mean_length": 45.744140625,
"completions/mean_terminated_length": 45.744140625,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.2560636028647423,
"epoch": 0.8994252873563219,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0025938425678759813,
"learning_rate": 1e-05,
"loss": -0.0001,
"num_tokens": 35424380.0,
"reward": 0.7669895887374878,
"reward_std": 0.12247426062822342,
"rewards/ngram_repetition2/mean": -8.339614578289911e-05,
"rewards/ngram_repetition2/std": 0.0017096961382776499,
"rewards/ngram_repetition3/mean": -0.00017701656906865537,
"rewards/ngram_repetition3/std": 0.002198555273935199,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.712890625,
"rewards/symbolic_reward_accuracy/std": 0.45285552740097046,
"rewards/symbolic_reward_partial_score/mean": 0.8932291269302368,
"rewards/symbolic_reward_partial_score/std": 0.21745367348194122,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1427170038223267,
"sampling/importance_sampling_ratio/min": 0.006228272803127766,
"sampling/sampling_logp_difference/max": 5.078656196594238,
"sampling/sampling_logp_difference/mean": 0.1635725051164627,
"step": 313
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2608746290206909,
"epoch": 0.9022988505747126,
"grad_norm": 0.0004902381915599108,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 314
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.15625,
"clip_ratio/low_mean": 0.09375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2573833763599396,
"epoch": 0.9051724137931034,
"grad_norm": 0.0018413531361147761,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 315
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.24609864503145218,
"epoch": 0.9080459770114943,
"grad_norm": 0.0009675182518549263,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 63.0,
"completions/max_terminated_length": 63.0,
"completions/mean_length": 45.85546875,
"completions/mean_terminated_length": 45.85546875,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.259112149477005,
"epoch": 0.9109195402298851,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0020838521886616945,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 35850002.0,
"reward": 0.838620662689209,
"reward_std": 0.11595729738473892,
"rewards/ngram_repetition2/mean": -0.00015648298722226173,
"rewards/ngram_repetition2/std": 0.0020981046836823225,
"rewards/ngram_repetition3/mean": -8.584936585975811e-05,
"rewards/ngram_repetition3/std": 0.0013526281109079719,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.794921875,
"rewards/symbolic_reward_accuracy/std": 0.4041535556316376,
"rewards/symbolic_reward_partial_score/mean": 0.9405924081802368,
"rewards/symbolic_reward_partial_score/std": 0.17341189086437225,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1437469720840454,
"sampling/importance_sampling_ratio/min": 0.0033797782380133867,
"sampling/sampling_logp_difference/max": 5.689945220947266,
"sampling/sampling_logp_difference/mean": 0.16323210299015045,
"step": 317
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1484375,
"entropy": 0.25699713826179504,
"epoch": 0.9137931034482759,
"grad_norm": 0.0027148413937538862,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 318
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.2601219713687897,
"epoch": 0.9166666666666666,
"grad_norm": 0.0011547203175723553,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 319
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.171875,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.2641301900148392,
"epoch": 0.9195402298850575,
"grad_norm": 0.002280124928802252,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.0,
"completions/max_terminated_length": 70.0,
"completions/mean_length": 47.744140625,
"completions/mean_terminated_length": 47.744140625,
"completions/min_length": 32.0,
"completions/min_terminated_length": 32.0,
"entropy": 0.27469751238822937,
"epoch": 0.9224137931034483,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.001370429527014494,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 36301935.0,
"reward": 0.8414936661720276,
"reward_std": 0.08875171840190887,
"rewards/ngram_repetition2/mean": -0.000355370226316154,
"rewards/ngram_repetition2/std": 0.003095061983913183,
"rewards/ngram_repetition3/mean": -0.0006652825977653265,
"rewards/ngram_repetition3/std": 0.003885059617459774,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.80859375,
"rewards/symbolic_reward_accuracy/std": 0.3937928080558777,
"rewards/symbolic_reward_partial_score/mean": 0.9182942509651184,
"rewards/symbolic_reward_partial_score/std": 0.21086378395557404,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1547727584838867,
"sampling/importance_sampling_ratio/min": 0.002118661068379879,
"sampling/sampling_logp_difference/max": 6.156970977783203,
"sampling/sampling_logp_difference/mean": 0.17685817182064056,
"step": 321
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.171875,
"entropy": 0.2789808511734009,
"epoch": 0.9252873563218391,
"grad_norm": 0.0013959211064502597,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 322
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.2980600446462631,
"epoch": 0.9281609195402298,
"grad_norm": 0.002104496583342552,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 323
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.125,
"clip_ratio/low_mean": 0.0625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.26825977861881256,
"epoch": 0.9310344827586207,
"grad_norm": 0.00133817782625556,
"learning_rate": 1e-05,
"loss": 0.0003,
"step": 324
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 69.0,
"completions/max_terminated_length": 69.0,
"completions/mean_length": 48.28125,
"completions/mean_terminated_length": 48.28125,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.2749357968568802,
"epoch": 0.9339080459770115,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.0011246444191783667,
"learning_rate": 1e-05,
"loss": -0.0002,
"num_tokens": 36785023.0,
"reward": 0.7912914156913757,
"reward_std": 0.11614967882633209,
"rewards/ngram_repetition2/mean": -0.0005333342123776674,
"rewards/ngram_repetition2/std": 0.003764254041016102,
"rewards/ngram_repetition3/mean": -0.001186647918075323,
"rewards/ngram_repetition3/std": 0.00689814705401659,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.734375,
"rewards/symbolic_reward_accuracy/std": 0.44209739565849304,
"rewards/symbolic_reward_partial_score/mean": 0.9241536855697632,
"rewards/symbolic_reward_partial_score/std": 0.17832158505916595,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1506119966506958,
"sampling/importance_sampling_ratio/min": 0.005779067520052195,
"sampling/sampling_logp_difference/max": 5.153512954711914,
"sampling/sampling_logp_difference/mean": 0.17207130789756775,
"step": 325
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.203125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.2859587073326111,
"epoch": 0.9367816091954023,
"grad_norm": 0.001755884732119739,
"learning_rate": 1e-05,
"loss": -0.0002,
"step": 326
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2890625,
"clip_ratio/low_mean": 0.0390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.328125,
"entropy": 0.2725732624530792,
"epoch": 0.9396551724137931,
"grad_norm": 0.002358856610953808,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 327
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2890625,
"entropy": 0.28172941505908966,
"epoch": 0.9425287356321839,
"grad_norm": 0.0033431202173233032,
"learning_rate": 1e-05,
"loss": 0.0004,
"step": 328
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 47.630859375,
"completions/mean_terminated_length": 47.630859375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.27418090403079987,
"epoch": 0.9454022988505747,
"frac_reward_zero_std": 0.3125,
"grad_norm": 0.0034468630328774452,
"learning_rate": 1e-05,
"loss": 0.0001,
"num_tokens": 37230082.0,
"reward": 0.8493610620498657,
"reward_std": 0.12724488973617554,
"rewards/ngram_repetition2/mean": -0.00019963737577199936,
"rewards/ngram_repetition2/std": 0.003859966993331909,
"rewards/ngram_repetition3/mean": -0.00022095959866419435,
"rewards/ngram_repetition3/std": 0.004134077113121748,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.81640625,
"rewards/symbolic_reward_accuracy/std": 0.3875311613082886,
"rewards/symbolic_reward_partial_score/mean": 0.92626953125,
"rewards/symbolic_reward_partial_score/std": 0.20346400141716003,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.14650559425354,
"sampling/importance_sampling_ratio/min": 0.0031521148048341274,
"sampling/sampling_logp_difference/max": 5.759681701660156,
"sampling/sampling_logp_difference/mean": 0.16905644536018372,
"step": 329
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2265625,
"entropy": 0.28660906851291656,
"epoch": 0.9482758620689655,
"grad_norm": 0.0015979736344888806,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 330
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.234375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.28125,
"entropy": 0.28045378625392914,
"epoch": 0.9511494252873564,
"grad_norm": 0.0019632827024906874,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 331
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.2773226499557495,
"epoch": 0.9540229885057471,
"grad_norm": 0.0015298571670427918,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 332
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 60.0,
"completions/max_terminated_length": 60.0,
"completions/mean_length": 44.287109375,
"completions/mean_terminated_length": 44.287109375,
"completions/min_length": 28.0,
"completions/min_terminated_length": 28.0,
"entropy": 0.2780776619911194,
"epoch": 0.9568965517241379,
"frac_reward_zero_std": 0.40625,
"grad_norm": 0.002085136715322733,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 37667573.0,
"reward": 0.89306640625,
"reward_std": 0.1382032036781311,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.857421875,
"rewards/symbolic_reward_accuracy/std": 0.3499840497970581,
"rewards/symbolic_reward_partial_score/mean": 0.9762369394302368,
"rewards/symbolic_reward_partial_score/std": 0.1040315255522728,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1428526639938354,
"sampling/importance_sampling_ratio/min": 0.009135945700109005,
"sampling/sampling_logp_difference/max": 4.695538520812988,
"sampling/sampling_logp_difference/mean": 0.1694696843624115,
"step": 333
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.1796875,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.28392770886421204,
"epoch": 0.9597701149425287,
"grad_norm": 0.0020160162821412086,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 334
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1640625,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1953125,
"entropy": 0.28121981024742126,
"epoch": 0.9626436781609196,
"grad_norm": 0.0019810826051980257,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 335
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.2109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2578125,
"entropy": 0.27402184903621674,
"epoch": 0.9655172413793104,
"grad_norm": 0.0008822702220641077,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 63.0,
"completions/max_terminated_length": 63.0,
"completions/mean_length": 45.30078125,
"completions/mean_terminated_length": 45.30078125,
"completions/min_length": 35.0,
"completions/min_terminated_length": 35.0,
"entropy": 0.2705724239349365,
"epoch": 0.9683908045977011,
"frac_reward_zero_std": 0.34375,
"grad_norm": 0.0026577531825751066,
"learning_rate": 1e-05,
"loss": 0.0002,
"num_tokens": 38093551.0,
"reward": 0.815625011920929,
"reward_std": 0.09714089334011078,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.779296875,
"rewards/symbolic_reward_accuracy/std": 0.4151262938976288,
"rewards/symbolic_reward_partial_score/mean": 0.900390625,
"rewards/symbolic_reward_partial_score/std": 0.23409590125083923,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1370817422866821,
"sampling/importance_sampling_ratio/min": 0.003680461086332798,
"sampling/sampling_logp_difference/max": 5.604717254638672,
"sampling/sampling_logp_difference/mean": 0.16051676869392395,
"step": 337
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1484375,
"clip_ratio/low_mean": 0.0546875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.203125,
"entropy": 0.25970782339572906,
"epoch": 0.9712643678160919,
"grad_norm": 0.0025209251325577497,
"learning_rate": 1e-05,
"loss": 0.0002,
"step": 338
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1953125,
"clip_ratio/low_mean": 0.078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.2734375,
"entropy": 0.25134529173374176,
"epoch": 0.9741379310344828,
"grad_norm": 0.0011691589606925845,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 339
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.21875,
"clip_ratio/low_mean": 0.03125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.25,
"entropy": 0.25292646884918213,
"epoch": 0.9770114942528736,
"grad_norm": 0.0009160270565189421,
"learning_rate": 1e-05,
"loss": -0.0005,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 67.0,
"completions/max_terminated_length": 67.0,
"completions/mean_length": 45.47265625,
"completions/mean_terminated_length": 45.47265625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.24236022680997849,
"epoch": 0.9798850574712644,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.002754463814198971,
"learning_rate": 1e-05,
"loss": 0.0003,
"num_tokens": 38537985.0,
"reward": 0.790576159954071,
"reward_std": 0.10548844188451767,
"rewards/ngram_repetition2/mean": 0.0,
"rewards/ngram_repetition2/std": 0.0,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.740234375,
"rewards/symbolic_reward_accuracy/std": 0.4389347732067108,
"rewards/symbolic_reward_partial_score/mean": 0.9080403447151184,
"rewards/symbolic_reward_partial_score/std": 0.20210063457489014,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1354217529296875,
"sampling/importance_sampling_ratio/min": 0.005959612783044577,
"sampling/sampling_logp_difference/max": 5.1227498054504395,
"sampling/sampling_logp_difference/mean": 0.16084396839141846,
"step": 341
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.109375,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.15625,
"entropy": 0.2414681240916252,
"epoch": 0.9827586206896551,
"grad_norm": 0.0012340678367763758,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 342
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0234375,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.23134589940309525,
"epoch": 0.985632183908046,
"grad_norm": 0.0013804087648168206,
"learning_rate": 1e-05,
"loss": -0.0003,
"step": 343
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.09375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.09375,
"entropy": 0.23478808999061584,
"epoch": 0.9885057471264368,
"grad_norm": 0.0015636914176866412,
"learning_rate": 1e-05,
"loss": -0.0001,
"step": 344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 45.44140625,
"completions/mean_terminated_length": 45.44140625,
"completions/min_length": 36.0,
"completions/min_terminated_length": 36.0,
"entropy": 0.2531556040048599,
"epoch": 0.9913793103448276,
"frac_reward_zero_std": 0.46875,
"grad_norm": 0.001333514112047851,
"learning_rate": 1e-05,
"loss": 0.0,
"num_tokens": 38975747.0,
"reward": 0.8312982320785522,
"reward_std": 0.09679631888866425,
"rewards/ngram_repetition2/mean": -6.133001443231478e-05,
"rewards/ngram_repetition2/std": 0.00123770406935364,
"rewards/ngram_repetition3/mean": 0.0,
"rewards/ngram_repetition3/std": 0.0,
"rewards/sentence_repetition/mean": 0.0,
"rewards/sentence_repetition/std": 0.0,
"rewards/symbolic_reward_accuracy/mean": 0.787109375,
"rewards/symbolic_reward_accuracy/std": 0.409751296043396,
"rewards/symbolic_reward_partial_score/mean": 0.9344075322151184,
"rewards/symbolic_reward_partial_score/std": 0.172744482755661,
"rewards/tag_count_reward/mean": 0.0,
"rewards/tag_count_reward/std": 0.0,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1468303203582764,
"sampling/importance_sampling_ratio/min": 0.009383895434439182,
"sampling/sampling_logp_difference/max": 4.668760299682617,
"sampling/sampling_logp_difference/mean": 0.16860270500183105,
"step": 345
},
{
"clip_ratio/high_max": 0.75,
"clip_ratio/high_mean": 0.0703125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1171875,
"entropy": 0.2506842166185379,
"epoch": 0.9942528735632183,
"grad_norm": 0.0016360621666535735,
"learning_rate": 1e-05,
"loss": 0.0001,
"step": 346
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.140625,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1875,
"entropy": 0.25267085433006287,
"epoch": 0.9971264367816092,
"grad_norm": 0.0012759178644046187,
"learning_rate": 1e-05,
"loss": -0.0,
"step": 347
},
{
"clip_ratio/high_max": 1.0,
"clip_ratio/high_mean": 0.1328125,
"clip_ratio/low_mean": 0.046875,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.1796875,
"entropy": 0.2591947913169861,
"epoch": 1.0,
"grad_norm": 0.0011629678774625063,
"learning_rate": 1e-05,
"loss": 0.0,
"step": 348
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 60.78947368421053,
"eval_completions/max_terminated_length": 60.78947368421053,
"eval_completions/mean_length": 46.36307565789474,
"eval_completions/mean_terminated_length": 46.36307565789474,
"eval_completions/min_length": 30.05263157894737,
"eval_completions/min_terminated_length": 30.05263157894737,
"eval_entropy": 0.254553884267807,
"eval_frac_reward_zero_std": 0.3815789473684211,
"eval_loss": -4.107596851099515e-06,
"eval_num_tokens": 38975747.0,
"eval_reward": 0.7022716889255926,
"eval_reward_std": 0.12830137265355965,
"eval_rewards/ngram_repetition2/mean": -5.1669975571138295e-06,
"eval_rewards/ngram_repetition2/std": 5.8457904838417706e-05,
"eval_rewards/ngram_repetition3/mean": -6.034150015023586e-06,
"eval_rewards/ngram_repetition3/std": 6.826861614459439e-05,
"eval_rewards/sentence_repetition/mean": 0.0,
"eval_rewards/sentence_repetition/std": 0.0,
"eval_rewards/symbolic_reward_accuracy/mean": 0.6480263157894737,
"eval_rewards/symbolic_reward_accuracy/std": 0.44470502357733876,
"eval_rewards/symbolic_reward_partial_score/mean": 0.8326822895752756,
"eval_rewards/symbolic_reward_partial_score/std": 0.25731427104849564,
"eval_rewards/tag_count_reward/mean": -0.011513157894736841,
"eval_rewards/tag_count_reward/std": 0.05738983381735651,
"eval_runtime": 281.7844,
"eval_samples_per_second": 0.532,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.1433405562451011,
"eval_sampling/importance_sampling_ratio/min": 0.010514126441674418,
"eval_sampling/sampling_logp_difference/max": 15.934183672854775,
"eval_sampling/sampling_logp_difference/mean": 0.18365594979963804,
"eval_steps_per_second": 0.007,
"step": 348
},
{
"epoch": 1.0,
"step": 348,
"total_flos": 0.0,
"train_loss": 0.0006693344619362641,
"train_runtime": 4432.3723,
"train_samples_per_second": 0.63,
"train_steps_per_second": 0.079
}
],
"logging_steps": 1,
"max_steps": 348,
"num_input_tokens_seen": 38975747,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}