leonMW's picture
Add files using upload-large-folder tool
364e5d2 verified
{
"best_global_step": 60,
"best_metric": 0.000557390449102968,
"best_model_checkpoint": "data/DeepSeek-R1-Distill-Qwen-7B-Staged-2/checkpoint-60",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 120,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 431.0,
"completions/max_terminated_length": 431.0,
"completions/mean_length": 253.26416015625,
"completions/mean_terminated_length": 253.26416015625,
"completions/min_length": 137.0,
"completions/min_terminated_length": 137.0,
"entropy": 0.38996245712041855,
"epoch": 0.016666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04174759238958359,
"learning_rate": 1e-05,
"loss": 0.0019,
"num_tokens": 2067997.0,
"reward": 3.603942394256592,
"reward_std": 0.22614431381225586,
"rewards/ngram_repetition2/mean": 0.9632381200790405,
"rewards/ngram_repetition2/std": 0.020422853529453278,
"rewards/ngram_repetition3/mean": 0.9955652356147766,
"rewards/ngram_repetition3/std": 0.006543538998812437,
"rewards/symbolic_reward_accuracy/mean": 0.81787109375,
"rewards/symbolic_reward_accuracy/std": 0.386044979095459,
"rewards/symbolic_reward_partial_score/mean": 0.9390869140625,
"rewards/symbolic_reward_partial_score/std": 0.1457148790359497,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9525362849235535,
"rewards/thinking_answer_ratio_reward/std": 0.009070757776498795,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.095194935798645,
"sampling/importance_sampling_ratio/min": 0.0016489994013682008,
"sampling/sampling_logp_difference/max": 6.407586574554443,
"sampling/sampling_logp_difference/mean": 0.16560198366641998,
"step": 1
},
{
"clip_ratio/high_max": 0.16666666666666666,
"clip_ratio/high_mean": 0.07291666666666667,
"clip_ratio/low_mean": 0.3567708333333333,
"clip_ratio/low_min": 0.17708333333333334,
"clip_ratio/region_mean": 0.4296875,
"entropy": 0.3816731671492259,
"epoch": 0.06666666666666667,
"grad_norm": 0.03443225473165512,
"learning_rate": 1e-05,
"loss": 0.0008,
"step": 4
},
{
"clip_ratio/high_max": 0.14453125,
"clip_ratio/high_mean": 0.06396484375,
"clip_ratio/low_mean": 0.28955078125,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.353515625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 624.0,
"completions/max_terminated_length": 624.0,
"completions/mean_length": 248.92529296875,
"completions/mean_terminated_length": 248.92529296875,
"completions/min_length": 157.0,
"completions/min_terminated_length": 157.0,
"entropy": 0.39708597399294376,
"epoch": 0.13333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0413547120988369,
"learning_rate": 1e-05,
"loss": 0.0014,
"num_tokens": 4147140.0,
"reward": 3.5625743865966797,
"reward_std": 0.3117659091949463,
"rewards/ngram_repetition2/mean": 0.965080976486206,
"rewards/ngram_repetition2/std": 0.021233825013041496,
"rewards/ngram_repetition3/mean": 0.9958759546279907,
"rewards/ngram_repetition3/std": 0.006623170338571072,
"rewards/symbolic_reward_accuracy/mean": 0.8046875,
"rewards/symbolic_reward_accuracy/std": 0.3965378999710083,
"rewards/symbolic_reward_partial_score/mean": 0.924072265625,
"rewards/symbolic_reward_partial_score/std": 0.1713024228811264,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9517685174942017,
"rewards/thinking_answer_ratio_reward/std": 0.00964184757322073,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.0969865322113037,
"sampling/importance_sampling_ratio/min": 0.005477577913552523,
"sampling/sampling_logp_difference/max": 5.20709228515625,
"sampling/sampling_logp_difference/mean": 0.16714993119239807,
"step": 8
},
{
"clip_ratio/high_max": 0.13671875,
"clip_ratio/high_mean": 0.06396484375,
"clip_ratio/low_mean": 0.2880859375,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.35205078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 449.0,
"completions/max_terminated_length": 449.0,
"completions/mean_length": 266.74072265625,
"completions/mean_terminated_length": 266.74072265625,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"entropy": 0.41098711267113686,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04287375509738922,
"learning_rate": 1e-05,
"loss": 0.0015,
"num_tokens": 6299505.0,
"reward": 3.46004581451416,
"reward_std": 0.3250929117202759,
"rewards/ngram_repetition2/mean": 0.9622728824615479,
"rewards/ngram_repetition2/std": 0.02167431451380253,
"rewards/ngram_repetition3/mean": 0.9954921007156372,
"rewards/ngram_repetition3/std": 0.007107834331691265,
"rewards/symbolic_reward_accuracy/mean": 0.75927734375,
"rewards/symbolic_reward_accuracy/std": 0.4276266396045685,
"rewards/symbolic_reward_partial_score/mean": 0.912353515625,
"rewards/symbolic_reward_partial_score/std": 0.1758050173521042,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9559780359268188,
"rewards/thinking_answer_ratio_reward/std": 0.007843377068638802,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1009982824325562,
"sampling/importance_sampling_ratio/min": 0.0013655413640663028,
"sampling/sampling_logp_difference/max": 6.5962042808532715,
"sampling/sampling_logp_difference/mean": 0.17280443012714386,
"step": 12
},
{
"clip_ratio/high_max": 0.1640625,
"clip_ratio/high_mean": 0.0791015625,
"clip_ratio/low_mean": 0.2392578125,
"clip_ratio/low_min": 0.12109375,
"clip_ratio/region_mean": 0.318359375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 472.0,
"completions/max_terminated_length": 472.0,
"completions/mean_length": 274.34619140625,
"completions/mean_terminated_length": 274.34619140625,
"completions/min_length": 167.0,
"completions/min_terminated_length": 167.0,
"entropy": 0.41036204621195793,
"epoch": 0.26666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04127402976155281,
"learning_rate": 1e-05,
"loss": 0.0014,
"num_tokens": 8395638.0,
"reward": 3.7558231353759766,
"reward_std": 0.32028520107269287,
"rewards/ngram_repetition2/mean": 0.9618723392486572,
"rewards/ngram_repetition2/std": 0.022064488381147385,
"rewards/ngram_repetition3/mean": 0.9952791929244995,
"rewards/ngram_repetition3/std": 0.006986668799072504,
"rewards/symbolic_reward_accuracy/mean": 0.88330078125,
"rewards/symbolic_reward_accuracy/std": 0.32113996148109436,
"rewards/symbolic_reward_partial_score/mean": 0.9600830078125,
"rewards/symbolic_reward_partial_score/std": 0.12365403771400452,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9566828608512878,
"rewards/thinking_answer_ratio_reward/std": 0.008525538258254528,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1039669513702393,
"sampling/importance_sampling_ratio/min": 8.078159589786083e-05,
"sampling/sampling_logp_difference/max": 9.423761367797852,
"sampling/sampling_logp_difference/mean": 0.17331793904304504,
"step": 16
},
{
"clip_ratio/high_max": 0.1640625,
"clip_ratio/high_mean": 0.0947265625,
"clip_ratio/low_mean": 0.23095703125,
"clip_ratio/low_min": 0.12109375,
"clip_ratio/region_mean": 0.32568359375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 546.0,
"completions/max_terminated_length": 546.0,
"completions/mean_length": 280.68359375,
"completions/mean_terminated_length": 280.68359375,
"completions/min_length": 147.0,
"completions/min_terminated_length": 147.0,
"entropy": 0.411681417375803,
"epoch": 0.3333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0389348641037941,
"learning_rate": 1e-05,
"loss": 0.0023,
"num_tokens": 10517102.0,
"reward": 3.778409242630005,
"reward_std": 0.297868549823761,
"rewards/ngram_repetition2/mean": 0.9617278575897217,
"rewards/ngram_repetition2/std": 0.022929087281227112,
"rewards/ngram_repetition3/mean": 0.9948153495788574,
"rewards/ngram_repetition3/std": 0.007431842386722565,
"rewards/symbolic_reward_accuracy/mean": 0.8955078125,
"rewards/symbolic_reward_accuracy/std": 0.3059726655483246,
"rewards/symbolic_reward_partial_score/mean": 0.958251953125,
"rewards/symbolic_reward_partial_score/std": 0.134090393781662,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9576171636581421,
"rewards/thinking_answer_ratio_reward/std": 0.007972889579832554,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1057276725769043,
"sampling/importance_sampling_ratio/min": 0.0023463296238332987,
"sampling/sampling_logp_difference/max": 6.054903030395508,
"sampling/sampling_logp_difference/mean": 0.17559605836868286,
"step": 20
},
{
"clip_ratio/high_max": 0.1796875,
"clip_ratio/high_mean": 0.091796875,
"clip_ratio/low_mean": 0.23193359375,
"clip_ratio/low_min": 0.10546875,
"clip_ratio/region_mean": 0.32373046875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 475.0,
"completions/max_terminated_length": 475.0,
"completions/mean_length": 281.13232421875,
"completions/mean_terminated_length": 281.13232421875,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"entropy": 0.41807421669363976,
"epoch": 0.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03686573728919029,
"learning_rate": 1e-05,
"loss": 0.0022,
"num_tokens": 12666877.0,
"reward": 3.802722454071045,
"reward_std": 0.280770868062973,
"rewards/ngram_repetition2/mean": 0.9632822871208191,
"rewards/ngram_repetition2/std": 0.02198909968137741,
"rewards/ngram_repetition3/mean": 0.995023250579834,
"rewards/ngram_repetition3/std": 0.007552496623247862,
"rewards/symbolic_reward_accuracy/mean": 0.9052734375,
"rewards/symbolic_reward_accuracy/std": 0.2929084002971649,
"rewards/symbolic_reward_partial_score/mean": 0.9630126953125,
"rewards/symbolic_reward_partial_score/std": 0.12233622372150421,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.957996129989624,
"rewards/thinking_answer_ratio_reward/std": 0.007673850283026695,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.109450101852417,
"sampling/importance_sampling_ratio/min": 0.0014090462354943156,
"sampling/sampling_logp_difference/max": 6.564842224121094,
"sampling/sampling_logp_difference/mean": 0.17978055775165558,
"step": 24
},
{
"clip_ratio/high_max": 0.22265625,
"clip_ratio/high_mean": 0.1142578125,
"clip_ratio/low_mean": 0.21435546875,
"clip_ratio/low_min": 0.08984375,
"clip_ratio/region_mean": 0.32861328125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 786.0,
"completions/max_terminated_length": 786.0,
"completions/mean_length": 295.8984375,
"completions/mean_terminated_length": 295.8984375,
"completions/min_length": 169.0,
"completions/min_terminated_length": 169.0,
"entropy": 0.4283344931900501,
"epoch": 0.4666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04093848541378975,
"learning_rate": 1e-05,
"loss": 0.002,
"num_tokens": 14831853.0,
"reward": 3.85359525680542,
"reward_std": 0.2568969428539276,
"rewards/ngram_repetition2/mean": 0.9597057700157166,
"rewards/ngram_repetition2/std": 0.024175945669412613,
"rewards/ngram_repetition3/mean": 0.9941831827163696,
"rewards/ngram_repetition3/std": 0.008336109109222889,
"rewards/symbolic_reward_accuracy/mean": 0.92578125,
"rewards/symbolic_reward_accuracy/std": 0.2621905505657196,
"rewards/symbolic_reward_partial_score/mean": 0.972900390625,
"rewards/symbolic_reward_partial_score/std": 0.10685121268033981,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9593456387519836,
"rewards/thinking_answer_ratio_reward/std": 0.007558419369161129,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1124420166015625,
"sampling/importance_sampling_ratio/min": 0.0036084535531699657,
"sampling/sampling_logp_difference/max": 5.624475955963135,
"sampling/sampling_logp_difference/mean": 0.1830388605594635,
"step": 28
},
{
"clip_ratio/high_max": 0.203125,
"clip_ratio/high_mean": 0.1083984375,
"clip_ratio/low_mean": 0.2470703125,
"clip_ratio/low_min": 0.12890625,
"clip_ratio/region_mean": 0.35546875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 547.0,
"completions/max_terminated_length": 547.0,
"completions/mean_length": 298.046875,
"completions/mean_terminated_length": 298.046875,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"entropy": 0.431485241279006,
"epoch": 0.5333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03429108485579491,
"learning_rate": 1e-05,
"loss": 0.0025,
"num_tokens": 16993549.0,
"reward": 3.9130945205688477,
"reward_std": 0.15431927144527435,
"rewards/ngram_repetition2/mean": 0.9620877504348755,
"rewards/ngram_repetition2/std": 0.0209511611610651,
"rewards/ngram_repetition3/mean": 0.995103120803833,
"rewards/ngram_repetition3/std": 0.0067704287357628345,
"rewards/symbolic_reward_accuracy/mean": 0.9521484375,
"rewards/symbolic_reward_accuracy/std": 0.21350421011447906,
"rewards/symbolic_reward_partial_score/mean": 0.9796142578125,
"rewards/symbolic_reward_partial_score/std": 0.10119316726922989,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9611598253250122,
"rewards/thinking_answer_ratio_reward/std": 0.006729719694703817,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1149195432662964,
"sampling/importance_sampling_ratio/min": 0.001943365903571248,
"sampling/sampling_logp_difference/max": 6.24333381652832,
"sampling/sampling_logp_difference/mean": 0.18532943725585938,
"step": 32
},
{
"clip_ratio/high_max": 0.203125,
"clip_ratio/high_mean": 0.08837890625,
"clip_ratio/low_mean": 0.2294921875,
"clip_ratio/low_min": 0.1015625,
"clip_ratio/region_mean": 0.31787109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 582.0,
"completions/max_terminated_length": 582.0,
"completions/mean_length": 298.3740234375,
"completions/mean_terminated_length": 298.3740234375,
"completions/min_length": 192.0,
"completions/min_terminated_length": 192.0,
"entropy": 0.4310462474822998,
"epoch": 0.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03929264098405838,
"learning_rate": 1e-05,
"loss": 0.0026,
"num_tokens": 19169931.0,
"reward": 3.8662376403808594,
"reward_std": 0.21625936031341553,
"rewards/ngram_repetition2/mean": 0.9633911848068237,
"rewards/ngram_repetition2/std": 0.020250339061021805,
"rewards/ngram_repetition3/mean": 0.99515700340271,
"rewards/ngram_repetition3/std": 0.006854338105767965,
"rewards/symbolic_reward_accuracy/mean": 0.931640625,
"rewards/symbolic_reward_accuracy/std": 0.2524232268333435,
"rewards/symbolic_reward_partial_score/mean": 0.9737548828125,
"rewards/symbolic_reward_partial_score/std": 0.10834010690450668,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9616156816482544,
"rewards/thinking_answer_ratio_reward/std": 0.006748661864548922,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1172317266464233,
"sampling/importance_sampling_ratio/min": 0.0015663664089515805,
"sampling/sampling_logp_difference/max": 6.458996772766113,
"sampling/sampling_logp_difference/mean": 0.18730762600898743,
"step": 36
},
{
"clip_ratio/high_max": 0.16796875,
"clip_ratio/high_mean": 0.09033203125,
"clip_ratio/low_mean": 0.25537109375,
"clip_ratio/low_min": 0.12890625,
"clip_ratio/region_mean": 0.345703125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 909.0,
"completions/max_terminated_length": 909.0,
"completions/mean_length": 299.11181640625,
"completions/mean_terminated_length": 299.11181640625,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"entropy": 0.4281404986977577,
"epoch": 0.6666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.035706695169210434,
"learning_rate": 1e-05,
"loss": 0.0028,
"num_tokens": 21357168.0,
"reward": 3.8661556243896484,
"reward_std": 0.10097986459732056,
"rewards/ngram_repetition2/mean": 0.9660295248031616,
"rewards/ngram_repetition2/std": 0.021356722339987755,
"rewards/ngram_repetition3/mean": 0.9956341981887817,
"rewards/ngram_repetition3/std": 0.007333936635404825,
"rewards/symbolic_reward_accuracy/mean": 0.93017578125,
"rewards/symbolic_reward_accuracy/std": 0.254912793636322,
"rewards/symbolic_reward_partial_score/mean": 0.9765625,
"rewards/symbolic_reward_partial_score/std": 0.09246132522821426,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9624710083007812,
"rewards/thinking_answer_ratio_reward/std": 0.005544988438487053,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1155461072921753,
"sampling/importance_sampling_ratio/min": 0.0017055901698768139,
"sampling/sampling_logp_difference/max": 6.373844146728516,
"sampling/sampling_logp_difference/mean": 0.18562009930610657,
"step": 40
},
{
"clip_ratio/high_max": 0.15625,
"clip_ratio/high_mean": 0.07568359375,
"clip_ratio/low_mean": 0.26416015625,
"clip_ratio/low_min": 0.1328125,
"clip_ratio/region_mean": 0.33984375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 718.0,
"completions/max_terminated_length": 718.0,
"completions/mean_length": 297.33154296875,
"completions/mean_terminated_length": 297.33154296875,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"entropy": 0.4222128689289093,
"epoch": 0.7333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027303706854581833,
"learning_rate": 1e-05,
"loss": 0.0018,
"num_tokens": 23521431.0,
"reward": 3.8894946575164795,
"reward_std": 0.110419362783432,
"rewards/ngram_repetition2/mean": 0.9679132699966431,
"rewards/ngram_repetition2/std": 0.020372966304421425,
"rewards/ngram_repetition3/mean": 0.9960880875587463,
"rewards/ngram_repetition3/std": 0.0068605802953243256,
"rewards/symbolic_reward_accuracy/mean": 0.93994140625,
"rewards/symbolic_reward_accuracy/std": 0.23765340447425842,
"rewards/symbolic_reward_partial_score/mean": 0.9803466796875,
"rewards/symbolic_reward_partial_score/std": 0.08386269956827164,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9625216722488403,
"rewards/thinking_answer_ratio_reward/std": 0.004792334046214819,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.114952802658081,
"sampling/importance_sampling_ratio/min": 0.002068887697532773,
"sampling/sampling_logp_difference/max": 6.180744171142578,
"sampling/sampling_logp_difference/mean": 0.18375319242477417,
"step": 44
},
{
"clip_ratio/high_max": 0.20703125,
"clip_ratio/high_mean": 0.099609375,
"clip_ratio/low_mean": 0.2666015625,
"clip_ratio/low_min": 0.14453125,
"clip_ratio/region_mean": 0.3662109375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 558.0,
"completions/max_terminated_length": 558.0,
"completions/mean_length": 317.89453125,
"completions/mean_terminated_length": 317.89453125,
"completions/min_length": 220.0,
"completions/min_terminated_length": 220.0,
"entropy": 0.4387316107749939,
"epoch": 0.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03348841145634651,
"learning_rate": 1e-05,
"loss": 0.0023,
"num_tokens": 25740159.0,
"reward": 3.916114091873169,
"reward_std": 0.11179815232753754,
"rewards/ngram_repetition2/mean": 0.9658024311065674,
"rewards/ngram_repetition2/std": 0.016465168446302414,
"rewards/ngram_repetition3/mean": 0.9962982535362244,
"rewards/ngram_repetition3/std": 0.005400184541940689,
"rewards/symbolic_reward_accuracy/mean": 0.951171875,
"rewards/symbolic_reward_accuracy/std": 0.21556119620800018,
"rewards/symbolic_reward_partial_score/mean": 0.9844970703125,
"rewards/symbolic_reward_partial_score/std": 0.07353945821523666,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9652310609817505,
"rewards/thinking_answer_ratio_reward/std": 0.00431449618190527,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1190249919891357,
"sampling/importance_sampling_ratio/min": 0.001858800882473588,
"sampling/sampling_logp_difference/max": 6.287823677062988,
"sampling/sampling_logp_difference/mean": 0.18923214077949524,
"step": 48
},
{
"clip_ratio/high_max": 0.15234375,
"clip_ratio/high_mean": 0.0849609375,
"clip_ratio/low_mean": 0.26806640625,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.35302734375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 558.0,
"completions/max_terminated_length": 558.0,
"completions/mean_length": 317.11865234375,
"completions/mean_terminated_length": 317.11865234375,
"completions/min_length": 204.0,
"completions/min_terminated_length": 204.0,
"entropy": 0.4330588784068823,
"epoch": 0.8666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.031064650043845177,
"learning_rate": 1e-05,
"loss": 0.0013,
"num_tokens": 27939250.0,
"reward": 3.918456554412842,
"reward_std": 0.07680399715900421,
"rewards/ngram_repetition2/mean": 0.9678490161895752,
"rewards/ngram_repetition2/std": 0.016260815784335136,
"rewards/ngram_repetition3/mean": 0.9966074824333191,
"rewards/ngram_repetition3/std": 0.005310139153152704,
"rewards/symbolic_reward_accuracy/mean": 0.9521484375,
"rewards/symbolic_reward_accuracy/std": 0.21350421011447906,
"rewards/symbolic_reward_partial_score/mean": 0.98486328125,
"rewards/symbolic_reward_partial_score/std": 0.07214950025081635,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9651836156845093,
"rewards/thinking_answer_ratio_reward/std": 0.004080226644873619,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1175155639648438,
"sampling/importance_sampling_ratio/min": 0.002885101828724146,
"sampling/sampling_logp_difference/max": 5.8481950759887695,
"sampling/sampling_logp_difference/mean": 0.18709684908390045,
"step": 52
},
{
"clip_ratio/high_max": 0.18359375,
"clip_ratio/high_mean": 0.09423828125,
"clip_ratio/low_mean": 0.2568359375,
"clip_ratio/low_min": 0.1171875,
"clip_ratio/region_mean": 0.35107421875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 518.0,
"completions/max_terminated_length": 518.0,
"completions/mean_length": 330.2392578125,
"completions/mean_terminated_length": 330.2392578125,
"completions/min_length": 214.0,
"completions/min_terminated_length": 214.0,
"entropy": 0.4368158672004938,
"epoch": 0.9333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0355696976184845,
"learning_rate": 1e-05,
"loss": 0.0019,
"num_tokens": 30189276.0,
"reward": 3.906731128692627,
"reward_std": 0.1086559072136879,
"rewards/ngram_repetition2/mean": 0.9662027359008789,
"rewards/ngram_repetition2/std": 0.015879785642027855,
"rewards/ngram_repetition3/mean": 0.996091365814209,
"rewards/ngram_repetition3/std": 0.005460316780954599,
"rewards/symbolic_reward_accuracy/mean": 0.94873046875,
"rewards/symbolic_reward_accuracy/std": 0.22060084342956543,
"rewards/symbolic_reward_partial_score/mean": 0.97998046875,
"rewards/symbolic_reward_partial_score/std": 0.09358829259872437,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9666886925697327,
"rewards/thinking_answer_ratio_reward/std": 0.004018484149128199,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1205523014068604,
"sampling/importance_sampling_ratio/min": 0.0029683702159672976,
"sampling/sampling_logp_difference/max": 5.819742202758789,
"sampling/sampling_logp_difference/mean": 0.19034436345100403,
"step": 56
},
{
"clip_ratio/high_max": 0.16015625,
"clip_ratio/high_mean": 0.0791015625,
"clip_ratio/low_mean": 0.26416015625,
"clip_ratio/low_min": 0.1484375,
"clip_ratio/region_mean": 0.34326171875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 616.0,
"completions/max_terminated_length": 616.0,
"completions/mean_length": 335.173828125,
"completions/mean_terminated_length": 335.173828125,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"entropy": 0.4378073513507843,
"epoch": 1.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03477643057703972,
"learning_rate": 1e-05,
"loss": 0.0014,
"num_tokens": 32438720.0,
"reward": 3.88734769821167,
"reward_std": 0.06660252809524536,
"rewards/ngram_repetition2/mean": 0.9677587747573853,
"rewards/ngram_repetition2/std": 0.017478443682193756,
"rewards/ngram_repetition3/mean": 0.9965513944625854,
"rewards/ngram_repetition3/std": 0.005568178836256266,
"rewards/symbolic_reward_accuracy/mean": 0.93994140625,
"rewards/symbolic_reward_accuracy/std": 0.23765340447425842,
"rewards/symbolic_reward_partial_score/mean": 0.9781494140625,
"rewards/symbolic_reward_partial_score/std": 0.09496273845434189,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9672399759292603,
"rewards/thinking_answer_ratio_reward/std": 0.003744090674445033,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1204638481140137,
"sampling/importance_sampling_ratio/min": 0.002623903099447489,
"sampling/sampling_logp_difference/max": 5.943092346191406,
"sampling/sampling_logp_difference/mean": 0.1902998983860016,
"step": 60
},
{
"epoch": 1.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 436.36842105263156,
"eval_completions/max_terminated_length": 436.36842105263156,
"eval_completions/mean_length": 328.875,
"eval_completions/mean_terminated_length": 328.875,
"eval_completions/min_length": 248.89473684210526,
"eval_completions/min_terminated_length": 248.89473684210526,
"eval_entropy": 0.44058212324192647,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.000557390449102968,
"eval_num_tokens": 32438720.0,
"eval_reward": 4.020364962126079,
"eval_reward_std": 0.030778637624232368,
"eval_rewards/ngram_repetition2/mean": 0.9677612467816001,
"eval_rewards/ngram_repetition2/std": 0.014938431252774439,
"eval_rewards/ngram_repetition3/mean": 0.9964643026653089,
"eval_rewards/ngram_repetition3/std": 0.004579812300538546,
"eval_rewards/symbolic_reward_accuracy/mean": 0.9962993421052632,
"eval_rewards/symbolic_reward_accuracy/std": 0.03590594467363859,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9984580592105263,
"eval_rewards/symbolic_reward_partial_score/std": 0.015954513494905672,
"eval_rewards/tag_count_reward/mean": 1.0,
"eval_rewards/tag_count_reward/std": 0.0,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9665957563801816,
"eval_rewards/thinking_answer_ratio_reward/std": 0.003913806052878499,
"eval_runtime": 432.901,
"eval_samples_per_second": 0.346,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.1209219003978528,
"eval_sampling/importance_sampling_ratio/min": 0.010366919444334743,
"eval_sampling/sampling_logp_difference/max": 4.7942703146683545,
"eval_sampling/sampling_logp_difference/mean": 0.1907721946113988,
"eval_steps_per_second": 0.005,
"step": 60
},
{
"clip_ratio/high_max": 0.16015625,
"clip_ratio/high_mean": 0.0712890625,
"clip_ratio/low_mean": 0.2685546875,
"clip_ratio/low_min": 0.14453125,
"clip_ratio/region_mean": 0.33984375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 602.0,
"completions/max_terminated_length": 602.0,
"completions/mean_length": 336.6640625,
"completions/mean_terminated_length": 336.6640625,
"completions/min_length": 238.0,
"completions/min_terminated_length": 238.0,
"entropy": 0.43299879133701324,
"epoch": 1.0666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03558320552110672,
"learning_rate": 1e-05,
"loss": 0.0019,
"num_tokens": 34676176.0,
"reward": 3.9476747512817383,
"reward_std": 0.08613574504852295,
"rewards/ngram_repetition2/mean": 0.9695593118667603,
"rewards/ngram_repetition2/std": 0.01567245088517666,
"rewards/ngram_repetition3/mean": 0.9968794584274292,
"rewards/ngram_repetition3/std": 0.004831254947930574,
"rewards/symbolic_reward_accuracy/mean": 0.96630859375,
"rewards/symbolic_reward_accuracy/std": 0.18047769367694855,
"rewards/symbolic_reward_partial_score/mean": 0.9857177734375,
"rewards/symbolic_reward_partial_score/std": 0.08201702684164047,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9675599336624146,
"rewards/thinking_answer_ratio_reward/std": 0.003663764800876379,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1207427978515625,
"sampling/importance_sampling_ratio/min": 0.0029126314911991358,
"sampling/sampling_logp_difference/max": 5.838698387145996,
"sampling/sampling_logp_difference/mean": 0.18947181105613708,
"step": 64
},
{
"clip_ratio/high_max": 0.1484375,
"clip_ratio/high_mean": 0.07958984375,
"clip_ratio/low_mean": 0.26123046875,
"clip_ratio/low_min": 0.12890625,
"clip_ratio/region_mean": 0.3408203125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 622.0,
"completions/max_terminated_length": 622.0,
"completions/mean_length": 347.7255859375,
"completions/mean_terminated_length": 347.7255859375,
"completions/min_length": 230.0,
"completions/min_terminated_length": 230.0,
"entropy": 0.43613532558083534,
"epoch": 1.1333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03729822859168053,
"learning_rate": 1e-05,
"loss": 0.002,
"num_tokens": 36963358.0,
"reward": 3.978569269180298,
"reward_std": 0.0661315992474556,
"rewards/ngram_repetition2/mean": 0.9695651531219482,
"rewards/ngram_repetition2/std": 0.015225501731038094,
"rewards/ngram_repetition3/mean": 0.9969500303268433,
"rewards/ngram_repetition3/std": 0.004695891868323088,
"rewards/symbolic_reward_accuracy/mean": 0.978515625,
"rewards/symbolic_reward_accuracy/std": 0.14502781629562378,
"rewards/symbolic_reward_partial_score/mean": 0.9921875,
"rewards/symbolic_reward_partial_score/std": 0.05580603703856468,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9685267210006714,
"rewards/thinking_answer_ratio_reward/std": 0.0036575142294168472,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1233885288238525,
"sampling/importance_sampling_ratio/min": 0.0019186872523277998,
"sampling/sampling_logp_difference/max": 6.2561140060424805,
"sampling/sampling_logp_difference/mean": 0.19366593658924103,
"step": 68
},
{
"clip_ratio/high_max": 0.1796875,
"clip_ratio/high_mean": 0.080078125,
"clip_ratio/low_mean": 0.2646484375,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.3447265625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 619.0,
"completions/max_terminated_length": 619.0,
"completions/mean_length": 360.83251953125,
"completions/mean_terminated_length": 360.83251953125,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.44109424389898777,
"epoch": 1.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.037427984178066254,
"learning_rate": 1e-05,
"loss": 0.0021,
"num_tokens": 39281671.0,
"reward": 3.9073996543884277,
"reward_std": 0.047650568187236786,
"rewards/ngram_repetition2/mean": 0.9681066274642944,
"rewards/ngram_repetition2/std": 0.015408935956656933,
"rewards/ngram_repetition3/mean": 0.9968547224998474,
"rewards/ngram_repetition3/std": 0.004804851021617651,
"rewards/symbolic_reward_accuracy/mean": 0.94775390625,
"rewards/symbolic_reward_accuracy/std": 0.22257724404335022,
"rewards/symbolic_reward_partial_score/mean": 0.9825439453125,
"rewards/symbolic_reward_partial_score/std": 0.0814003199338913,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9698253870010376,
"rewards/thinking_answer_ratio_reward/std": 0.0032405967358499765,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1255016326904297,
"sampling/importance_sampling_ratio/min": 0.001506246393546462,
"sampling/sampling_logp_difference/max": 6.498134613037109,
"sampling/sampling_logp_difference/mean": 0.19701868295669556,
"step": 72
},
{
"clip_ratio/high_max": 0.16796875,
"clip_ratio/high_mean": 0.0751953125,
"clip_ratio/low_mean": 0.2666015625,
"clip_ratio/low_min": 0.125,
"clip_ratio/region_mean": 0.341796875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 692.0,
"completions/max_terminated_length": 692.0,
"completions/mean_length": 365.287109375,
"completions/mean_terminated_length": 365.287109375,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"entropy": 0.44637384451925755,
"epoch": 1.2666666666666666,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03856123611330986,
"learning_rate": 1e-05,
"loss": 0.0023,
"num_tokens": 41580755.0,
"reward": 3.949028491973877,
"reward_std": 0.0856785699725151,
"rewards/ngram_repetition2/mean": 0.9681618213653564,
"rewards/ngram_repetition2/std": 0.01548727136105299,
"rewards/ngram_repetition3/mean": 0.9968844652175903,
"rewards/ngram_repetition3/std": 0.004666218534111977,
"rewards/symbolic_reward_accuracy/mean": 0.96630859375,
"rewards/symbolic_reward_accuracy/std": 0.18047769367694855,
"rewards/symbolic_reward_partial_score/mean": 0.987060546875,
"rewards/symbolic_reward_partial_score/std": 0.07464982569217682,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.970026969909668,
"rewards/thinking_answer_ratio_reward/std": 0.003470814088359475,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1290401220321655,
"sampling/importance_sampling_ratio/min": 0.0027039332780987024,
"sampling/sampling_logp_difference/max": 5.913047790527344,
"sampling/sampling_logp_difference/mean": 0.2006792426109314,
"step": 76
},
{
"clip_ratio/high_max": 0.17578125,
"clip_ratio/high_mean": 0.0830078125,
"clip_ratio/low_mean": 0.244140625,
"clip_ratio/low_min": 0.13671875,
"clip_ratio/region_mean": 0.3271484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 611.0,
"completions/max_terminated_length": 611.0,
"completions/mean_length": 380.34423828125,
"completions/mean_terminated_length": 380.34423828125,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.47546265460550785,
"epoch": 1.3333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03867423161864281,
"learning_rate": 1e-05,
"loss": 0.003,
"num_tokens": 43912980.0,
"reward": 3.915666103363037,
"reward_std": 0.1401577889919281,
"rewards/ngram_repetition2/mean": 0.9641463160514832,
"rewards/ngram_repetition2/std": 0.016802899539470673,
"rewards/ngram_repetition3/mean": 0.99601149559021,
"rewards/ngram_repetition3/std": 0.0052982522174716,
"rewards/symbolic_reward_accuracy/mean": 0.95263671875,
"rewards/symbolic_reward_accuracy/std": 0.21246656775474548,
"rewards/symbolic_reward_partial_score/mean": 0.9810791015625,
"rewards/symbolic_reward_partial_score/std": 0.08897262811660767,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9711929559707642,
"rewards/thinking_answer_ratio_reward/std": 0.0034950862172991037,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1358217000961304,
"sampling/importance_sampling_ratio/min": 0.0015437521506100893,
"sampling/sampling_logp_difference/max": 6.473539352416992,
"sampling/sampling_logp_difference/mean": 0.2107161283493042,
"step": 80
},
{
"clip_ratio/high_max": 0.1796875,
"clip_ratio/high_mean": 0.07666015625,
"clip_ratio/low_mean": 0.2470703125,
"clip_ratio/low_min": 0.12109375,
"clip_ratio/region_mean": 0.32373046875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 685.0,
"completions/max_terminated_length": 685.0,
"completions/mean_length": 390.22705078125,
"completions/mean_terminated_length": 390.22705078125,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"entropy": 0.47949288971722126,
"epoch": 1.4,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.03864947333931923,
"learning_rate": 1e-05,
"loss": 0.0029,
"num_tokens": 46284517.0,
"reward": 3.903589963912964,
"reward_std": 0.1544736623764038,
"rewards/ngram_repetition2/mean": 0.9642166495323181,
"rewards/ngram_repetition2/std": 0.01607567071914673,
"rewards/ngram_repetition3/mean": 0.9960227608680725,
"rewards/ngram_repetition3/std": 0.00498650036752224,
"rewards/symbolic_reward_accuracy/mean": 0.94873046875,
"rewards/symbolic_reward_accuracy/std": 0.22060084342956543,
"rewards/symbolic_reward_partial_score/mean": 0.976806640625,
"rewards/symbolic_reward_partial_score/std": 0.10547613352537155,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9719994068145752,
"rewards/thinking_answer_ratio_reward/std": 0.0034010012168437243,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.138932466506958,
"sampling/importance_sampling_ratio/min": 0.0013370462693274021,
"sampling/sampling_logp_difference/max": 6.617292404174805,
"sampling/sampling_logp_difference/mean": 0.2153591811656952,
"step": 84
},
{
"clip_ratio/high_max": 0.1640625,
"clip_ratio/high_mean": 0.07421875,
"clip_ratio/low_mean": 0.23486328125,
"clip_ratio/low_min": 0.1484375,
"clip_ratio/region_mean": 0.30908203125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 615.0,
"completions/max_terminated_length": 615.0,
"completions/mean_length": 394.150390625,
"completions/mean_terminated_length": 394.150390625,
"completions/min_length": 271.0,
"completions/min_terminated_length": 271.0,
"entropy": 0.5021626558154821,
"epoch": 1.4666666666666668,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.05966160073876381,
"learning_rate": 1e-05,
"loss": 0.003,
"num_tokens": 48626649.0,
"reward": 3.880753517150879,
"reward_std": 0.12686511874198914,
"rewards/ngram_repetition2/mean": 0.9631878733634949,
"rewards/ngram_repetition2/std": 0.015505960211157799,
"rewards/ngram_repetition3/mean": 0.995897650718689,
"rewards/ngram_repetition3/std": 0.0049866680055856705,
"rewards/symbolic_reward_accuracy/mean": 0.93701171875,
"rewards/symbolic_reward_accuracy/std": 0.24300122261047363,
"rewards/symbolic_reward_partial_score/mean": 0.9774169921875,
"rewards/symbolic_reward_partial_score/std": 0.09701967239379883,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9722087383270264,
"rewards/thinking_answer_ratio_reward/std": 0.0038304529152810574,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1457538604736328,
"sampling/importance_sampling_ratio/min": 0.001241249148733914,
"sampling/sampling_logp_difference/max": 6.69163703918457,
"sampling/sampling_logp_difference/mean": 0.22615361213684082,
"step": 88
},
{
"clip_ratio/high_max": 0.08984375,
"clip_ratio/high_mean": 0.03173828125,
"clip_ratio/low_mean": 0.2744140625,
"clip_ratio/low_min": 0.109375,
"clip_ratio/region_mean": 0.30615234375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 608.0,
"completions/max_terminated_length": 608.0,
"completions/mean_length": 402.93798828125,
"completions/mean_terminated_length": 402.93798828125,
"completions/min_length": 267.0,
"completions/min_terminated_length": 267.0,
"entropy": 0.5079601276665926,
"epoch": 1.5333333333333332,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06473611295223236,
"learning_rate": 1e-05,
"loss": 0.0032,
"num_tokens": 51035546.0,
"reward": 3.862441062927246,
"reward_std": 0.14964377880096436,
"rewards/ngram_repetition2/mean": 0.9625787734985352,
"rewards/ngram_repetition2/std": 0.015453252010047436,
"rewards/ngram_repetition3/mean": 0.9958549737930298,
"rewards/ngram_repetition3/std": 0.0049506500363349915,
"rewards/symbolic_reward_accuracy/mean": 0.931640625,
"rewards/symbolic_reward_accuracy/std": 0.2524232268333435,
"rewards/symbolic_reward_partial_score/mean": 0.9698486328125,
"rewards/symbolic_reward_partial_score/std": 0.12419875711202621,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9726754426956177,
"rewards/thinking_answer_ratio_reward/std": 0.0038639178965240717,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.149343729019165,
"sampling/importance_sampling_ratio/min": 0.0010150460293516517,
"sampling/sampling_logp_difference/max": 6.892821311950684,
"sampling/sampling_logp_difference/mean": 0.23112602531909943,
"step": 92
},
{
"clip_ratio/high_max": 0.15625,
"clip_ratio/high_mean": 0.07275390625,
"clip_ratio/low_mean": 0.2626953125,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.33544921875,
"completions/clipped_ratio": 0.0,
"completions/max_length": 667.0,
"completions/max_terminated_length": 667.0,
"completions/mean_length": 426.3505859375,
"completions/mean_terminated_length": 426.3505859375,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"entropy": 0.5987689010798931,
"epoch": 1.6,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0717100203037262,
"learning_rate": 1e-05,
"loss": 0.0048,
"num_tokens": 53520488.0,
"reward": 3.874765634536743,
"reward_std": 0.17516621947288513,
"rewards/ngram_repetition2/mean": 0.9517251253128052,
"rewards/ngram_repetition2/std": 0.01955697126686573,
"rewards/ngram_repetition3/mean": 0.9931901693344116,
"rewards/ngram_repetition3/std": 0.006746932398527861,
"rewards/symbolic_reward_accuracy/mean": 0.9345703125,
"rewards/symbolic_reward_accuracy/std": 0.24734291434288025,
"rewards/symbolic_reward_partial_score/mean": 0.9764404296875,
"rewards/symbolic_reward_partial_score/std": 0.09835170954465866,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9735493659973145,
"rewards/thinking_answer_ratio_reward/std": 0.004703040700405836,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1692607402801514,
"sampling/importance_sampling_ratio/min": 0.0016325593460351229,
"sampling/sampling_logp_difference/max": 6.417606353759766,
"sampling/sampling_logp_difference/mean": 0.25711047649383545,
"step": 96
},
{
"clip_ratio/high_max": 0.08984375,
"clip_ratio/high_mean": 0.0322265625,
"clip_ratio/low_mean": 0.294921875,
"clip_ratio/low_min": 0.16015625,
"clip_ratio/region_mean": 0.3271484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 730.0,
"completions/max_terminated_length": 730.0,
"completions/mean_length": 446.4931640625,
"completions/mean_terminated_length": 446.4931640625,
"completions/min_length": 248.0,
"completions/min_terminated_length": 248.0,
"entropy": 0.6369560994207859,
"epoch": 1.6666666666666665,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08183422684669495,
"learning_rate": 1e-05,
"loss": 0.0057,
"num_tokens": 55986906.0,
"reward": 3.8245627880096436,
"reward_std": 0.161258727312088,
"rewards/ngram_repetition2/mean": 0.9479941129684448,
"rewards/ngram_repetition2/std": 0.02070331759750843,
"rewards/ngram_repetition3/mean": 0.9925798177719116,
"rewards/ngram_repetition3/std": 0.006819311063736677,
"rewards/symbolic_reward_accuracy/mean": 0.912109375,
"rewards/symbolic_reward_accuracy/std": 0.28320491313934326,
"rewards/symbolic_reward_partial_score/mean": 0.97119140625,
"rewards/symbolic_reward_partial_score/std": 0.10378827154636383,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9746991991996765,
"rewards/thinking_answer_ratio_reward/std": 0.004666423425078392,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.1847954988479614,
"sampling/importance_sampling_ratio/min": 0.002613254589959979,
"sampling/sampling_logp_difference/max": 5.9471588134765625,
"sampling/sampling_logp_difference/mean": 0.27063658833503723,
"step": 100
},
{
"clip_ratio/high_max": 0.12109375,
"clip_ratio/high_mean": 0.052734375,
"clip_ratio/low_mean": 0.2841796875,
"clip_ratio/low_min": 0.15234375,
"clip_ratio/region_mean": 0.3369140625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 888.0,
"completions/max_terminated_length": 888.0,
"completions/mean_length": 464.78515625,
"completions/mean_terminated_length": 464.78515625,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"entropy": 0.6995432414114475,
"epoch": 1.7333333333333334,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09580597281455994,
"learning_rate": 1e-05,
"loss": 0.0072,
"num_tokens": 58507106.0,
"reward": 3.8571386337280273,
"reward_std": 0.15944623947143555,
"rewards/ngram_repetition2/mean": 0.9362776875495911,
"rewards/ngram_repetition2/std": 0.02282153069972992,
"rewards/ngram_repetition3/mean": 0.98973149061203,
"rewards/ngram_repetition3/std": 0.008181481622159481,
"rewards/symbolic_reward_accuracy/mean": 0.9267578125,
"rewards/symbolic_reward_accuracy/std": 0.26059725880622864,
"rewards/symbolic_reward_partial_score/mean": 0.974609375,
"rewards/symbolic_reward_partial_score/std": 0.09773869067430496,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9753755927085876,
"rewards/thinking_answer_ratio_reward/std": 0.004951382987201214,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2027469873428345,
"sampling/importance_sampling_ratio/min": 0.0011123154545202851,
"sampling/sampling_logp_difference/max": 6.801311492919922,
"sampling/sampling_logp_difference/mean": 0.287597119808197,
"step": 104
},
{
"clip_ratio/high_max": 0.0859375,
"clip_ratio/high_mean": 0.02734375,
"clip_ratio/low_mean": 0.28125,
"clip_ratio/low_min": 0.1484375,
"clip_ratio/region_mean": 0.30859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 844.0,
"completions/max_terminated_length": 844.0,
"completions/mean_length": 476.17138671875,
"completions/mean_terminated_length": 476.17138671875,
"completions/min_length": 290.0,
"completions/min_terminated_length": 290.0,
"entropy": 0.7207919657230377,
"epoch": 1.8,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.08035453408956528,
"learning_rate": 1e-05,
"loss": 0.0083,
"num_tokens": 61044993.0,
"reward": 3.845712184906006,
"reward_std": 0.15924084186553955,
"rewards/ngram_repetition2/mean": 0.9297587871551514,
"rewards/ngram_repetition2/std": 0.025162160396575928,
"rewards/ngram_repetition3/mean": 0.9881495237350464,
"rewards/ngram_repetition3/std": 0.009133792482316494,
"rewards/symbolic_reward_accuracy/mean": 0.92138671875,
"rewards/symbolic_reward_accuracy/std": 0.2691999673843384,
"rewards/symbolic_reward_partial_score/mean": 0.9739990234375,
"rewards/symbolic_reward_partial_score/std": 0.09710752964019775,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9760768413543701,
"rewards/thinking_answer_ratio_reward/std": 0.005398593842983246,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2120659351348877,
"sampling/importance_sampling_ratio/min": 0.002052898984402418,
"sampling/sampling_logp_difference/max": 6.188502311706543,
"sampling/sampling_logp_difference/mean": 0.2954035997390747,
"step": 108
},
{
"clip_ratio/high_max": 0.1484375,
"clip_ratio/high_mean": 0.06103515625,
"clip_ratio/low_mean": 0.26611328125,
"clip_ratio/low_min": 0.13671875,
"clip_ratio/region_mean": 0.3271484375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 844.0,
"completions/max_terminated_length": 844.0,
"completions/mean_length": 489.7314453125,
"completions/mean_terminated_length": 489.7314453125,
"completions/min_length": 308.0,
"completions/min_terminated_length": 308.0,
"entropy": 0.7733415886759758,
"epoch": 1.8666666666666667,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10550207644701004,
"learning_rate": 1e-05,
"loss": 0.008,
"num_tokens": 63600603.0,
"reward": 3.808783531188965,
"reward_std": 0.21294765174388885,
"rewards/ngram_repetition2/mean": 0.9247410297393799,
"rewards/ngram_repetition2/std": 0.02524981088936329,
"rewards/ngram_repetition3/mean": 0.9873223304748535,
"rewards/ngram_repetition3/std": 0.009321006014943123,
"rewards/symbolic_reward_accuracy/mean": 0.90478515625,
"rewards/symbolic_reward_accuracy/std": 0.2935831546783447,
"rewards/symbolic_reward_partial_score/mean": 0.9703369140625,
"rewards/symbolic_reward_partial_score/std": 0.10009879618883133,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9755770564079285,
"rewards/thinking_answer_ratio_reward/std": 0.007946600206196308,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2247238159179688,
"sampling/importance_sampling_ratio/min": 0.0008502666023559868,
"sampling/sampling_logp_difference/max": 7.069960594177246,
"sampling/sampling_logp_difference/mean": 0.30872130393981934,
"step": 112
},
{
"clip_ratio/high_max": 0.109375,
"clip_ratio/high_mean": 0.03759765625,
"clip_ratio/low_mean": 0.27392578125,
"clip_ratio/low_min": 0.14453125,
"clip_ratio/region_mean": 0.3115234375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 827.0,
"completions/max_terminated_length": 827.0,
"completions/mean_length": 477.89501953125,
"completions/mean_terminated_length": 477.89501953125,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"entropy": 0.7558131814002991,
"epoch": 1.9333333333333333,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.11424785852432251,
"learning_rate": 1e-05,
"loss": 0.0062,
"num_tokens": 66125636.0,
"reward": 3.8450608253479004,
"reward_std": 0.21053476631641388,
"rewards/ngram_repetition2/mean": 0.9281049966812134,
"rewards/ngram_repetition2/std": 0.02371094562113285,
"rewards/ngram_repetition3/mean": 0.987943172454834,
"rewards/ngram_repetition3/std": 0.008886902593076229,
"rewards/symbolic_reward_accuracy/mean": 0.92138671875,
"rewards/symbolic_reward_accuracy/std": 0.2691999673843384,
"rewards/symbolic_reward_partial_score/mean": 0.973388671875,
"rewards/symbolic_reward_partial_score/std": 0.09772618114948273,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9738227128982544,
"rewards/thinking_answer_ratio_reward/std": 0.007113671861588955,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2223323583602905,
"sampling/importance_sampling_ratio/min": 0.0015530625823885202,
"sampling/sampling_logp_difference/max": 6.467526435852051,
"sampling/sampling_logp_difference/mean": 0.3047195076942444,
"step": 116
},
{
"clip_ratio/high_max": 0.1171875,
"clip_ratio/high_mean": 0.04541015625,
"clip_ratio/low_mean": 0.30029296875,
"clip_ratio/low_min": 0.16796875,
"clip_ratio/region_mean": 0.345703125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 490.2041015625,
"completions/mean_terminated_length": 490.2041015625,
"completions/min_length": 284.0,
"completions/min_terminated_length": 284.0,
"entropy": 0.8120047375559807,
"epoch": 2.0,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1353643834590912,
"learning_rate": 1e-05,
"loss": 0.0081,
"num_tokens": 68691558.0,
"reward": 3.777482748031616,
"reward_std": 0.2158791422843933,
"rewards/ngram_repetition2/mean": 0.9211949110031128,
"rewards/ngram_repetition2/std": 0.025232266634702682,
"rewards/ngram_repetition3/mean": 0.9863015413284302,
"rewards/ngram_repetition3/std": 0.010167215950787067,
"rewards/symbolic_reward_accuracy/mean": 0.89208984375,
"rewards/symbolic_reward_accuracy/std": 0.3103426992893219,
"rewards/symbolic_reward_partial_score/mean": 0.9644775390625,
"rewards/symbolic_reward_partial_score/std": 0.11238247156143188,
"rewards/tag_count_reward/mean": 1.0,
"rewards/tag_count_reward/std": 0.0,
"rewards/thinking_answer_ratio_reward/mean": 0.9750710725784302,
"rewards/thinking_answer_ratio_reward/std": 0.006316343788057566,
"sampling/importance_sampling_ratio/max": 2.0,
"sampling/importance_sampling_ratio/mean": 1.2388031482696533,
"sampling/importance_sampling_ratio/min": 0.0007285280153155327,
"sampling/sampling_logp_difference/max": 7.224484443664551,
"sampling/sampling_logp_difference/mean": 0.3197594881057739,
"step": 120
},
{
"epoch": 2.0,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 724.9473684210526,
"eval_completions/max_terminated_length": 724.9473684210526,
"eval_completions/mean_length": 479.13075657894734,
"eval_completions/mean_terminated_length": 479.13075657894734,
"eval_completions/min_length": 320.7368421052632,
"eval_completions/min_terminated_length": 320.7368421052632,
"eval_entropy": 0.8422083133145383,
"eval_frac_reward_zero_std": 0.0,
"eval_loss": 0.002333071082830429,
"eval_num_tokens": 68691558.0,
"eval_reward": 3.9059901237487793,
"eval_reward_std": 0.15592951225852103,
"eval_rewards/ngram_repetition2/mean": 0.916603662465748,
"eval_rewards/ngram_repetition2/std": 0.02568632119188183,
"eval_rewards/ngram_repetition3/mean": 0.985195959869184,
"eval_rewards/ngram_repetition3/std": 0.00971777170994564,
"eval_rewards/symbolic_reward_accuracy/mean": 0.9469572368421053,
"eval_rewards/symbolic_reward_accuracy/std": 0.18719423405433955,
"eval_rewards/symbolic_reward_partial_score/mean": 0.9833470394736842,
"eval_rewards/symbolic_reward_partial_score/std": 0.06975230809889342,
"eval_rewards/tag_count_reward/mean": 1.0,
"eval_rewards/tag_count_reward/std": 0.0,
"eval_rewards/thinking_answer_ratio_reward/mean": 0.9710588988504911,
"eval_rewards/thinking_answer_ratio_reward/std": 0.008188823862981639,
"eval_runtime": 516.789,
"eval_samples_per_second": 0.29,
"eval_sampling/importance_sampling_ratio/max": 2.0,
"eval_sampling/importance_sampling_ratio/mean": 1.248612272111993,
"eval_sampling/importance_sampling_ratio/min": 0.0039012981946335025,
"eval_sampling/sampling_logp_difference/max": 5.646010674928364,
"eval_sampling/sampling_logp_difference/mean": 0.32886375722132233,
"eval_steps_per_second": 0.004,
"step": 120
},
{
"epoch": 2.0,
"step": 120,
"total_flos": 0.0,
"train_loss": 0.0032316646189428865,
"train_runtime": 11600.051,
"train_samples_per_second": 0.349,
"train_steps_per_second": 0.01
}
],
"logging_steps": 4,
"max_steps": 120,
"num_input_tokens_seen": 68691558,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}