grpo01 / trainer_state.json
aaaarpittttt's picture
Upload final GRPO checkpoint (checkpoint-100)
6c568a6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0782472613458529,
"eval_steps": 5,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 505.6333343505859,
"completions/clipped_ratio": 0.9833333333333334,
"completions/max_length": 512.0,
"completions/max_terminated_length": 13.0,
"completions/mean_length": 505.6333343505859,
"completions/mean_terminated_length": 13.0,
"completions/min_length": 473.8,
"completions/min_terminated_length": 13.0,
"epoch": 0.00782472613458529,
"frac_reward_zero_std": 0.0,
"grad_norm": 3.736088514328003,
"kl": 0.6088978718966246,
"learning_rate": 2.25e-07,
"loss": 0.0227,
"num_tokens": 45938.0,
"reward": 0.6808711916208268,
"reward_std": 0.3632167547941208,
"rewards/clinical_similarity_reward/mean": 0.15434668213129044,
"rewards/clinical_similarity_reward/std": 0.1388381078839302,
"rewards/reasoning_coverage_reward/mean": 0.026524468092247844,
"rewards/reasoning_coverage_reward/std": 0.024786698445677758,
"rewards/structural_reward/mean": 0.49999999403953554,
"rewards/structural_reward/std": 0.27526672184467316,
"step": 10
},
{
"completion_length": 499.23333435058595,
"completions/clipped_ratio": 0.9666666666666666,
"completions/max_length": 512.0,
"completions/max_terminated_length": 19.3,
"completions/mean_length": 499.23333435058595,
"completions/mean_terminated_length": 12.9,
"completions/min_length": 467.3,
"completions/min_terminated_length": 6.5,
"epoch": 0.01564945226917058,
"frac_reward_zero_std": 0.05,
"grad_norm": 9.93607234954834,
"kl": 0.27236317191272974,
"learning_rate": 4.7499999999999995e-07,
"loss": -0.0332,
"num_tokens": 91492.0,
"reward": 0.6439534038305282,
"reward_std": 0.3386003218591213,
"rewards/clinical_similarity_reward/mean": 0.15627755597233772,
"rewards/clinical_similarity_reward/std": 0.1460672415792942,
"rewards/reasoning_coverage_reward/mean": 0.021009179577231408,
"rewards/reasoning_coverage_reward/std": 0.019322671368718146,
"rewards/structural_reward/mean": 0.46666666567325593,
"rewards/structural_reward/std": 0.29707907438278197,
"step": 20
},
{
"completion_length": 486.8333343505859,
"completions/clipped_ratio": 0.95,
"completions/max_length": 512.0,
"completions/max_terminated_length": 2.5,
"completions/mean_length": 486.8333343505859,
"completions/mean_terminated_length": 1.35,
"completions/min_length": 409.8,
"completions/min_terminated_length": 0.2,
"epoch": 0.023474178403755867,
"frac_reward_zero_std": 0.0,
"grad_norm": 8.582813262939453,
"kl": 0.22508624009788036,
"learning_rate": 7.249999999999999e-07,
"loss": -0.0334,
"num_tokens": 136302.0,
"reward": 0.7573434472084045,
"reward_std": 0.34938589334487913,
"rewards/clinical_similarity_reward/mean": 0.1784390039741993,
"rewards/clinical_similarity_reward/std": 0.1555245727300644,
"rewards/reasoning_coverage_reward/mean": 0.028904422465711833,
"rewards/reasoning_coverage_reward/std": 0.02461537951603532,
"rewards/structural_reward/mean": 0.5499999940395355,
"rewards/structural_reward/std": 0.27120388448238375,
"step": 30
},
{
"completion_length": 512.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03129890453834116,
"frac_reward_zero_std": 0.1,
"grad_norm": 5.6369709968566895,
"kl": 0.2943277703016065,
"learning_rate": 9.75e-07,
"loss": 0.0006,
"num_tokens": 182622.0,
"reward": 0.667632919549942,
"reward_std": 0.3562010109424591,
"rewards/clinical_similarity_reward/mean": 0.14395808503031732,
"rewards/clinical_similarity_reward/std": 0.1386837735772133,
"rewards/reasoning_coverage_reward/mean": 0.02367481905966997,
"rewards/reasoning_coverage_reward/std": 0.026593181863427164,
"rewards/structural_reward/mean": 0.4999999970197678,
"rewards/structural_reward/std": 0.3440804839134216,
"step": 40
},
{
"completion_length": 512.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.03912363067292645,
"frac_reward_zero_std": 0.1,
"grad_norm": 2.707392692565918,
"kl": 0.2239656963472953,
"learning_rate": 9.98458666866564e-07,
"loss": 0.0004,
"num_tokens": 228942.0,
"reward": 0.6695286631584167,
"reward_std": 0.28249876499176024,
"rewards/clinical_similarity_reward/mean": 0.1714836047962308,
"rewards/clinical_similarity_reward/std": 0.11788879204541444,
"rewards/reasoning_coverage_reward/mean": 0.0230450589209795,
"rewards/reasoning_coverage_reward/std": 0.02328432989306748,
"rewards/structural_reward/mean": 0.4750000037252903,
"rewards/structural_reward/std": 0.26233516484498975,
"step": 50
},
{
"completion_length": 512.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.046948356807511735,
"frac_reward_zero_std": 0.0,
"grad_norm": 29.3859806060791,
"kl": 0.23591715623624623,
"learning_rate": 9.931428007686156e-07,
"loss": 0.0005,
"num_tokens": 275262.0,
"reward": 0.7382774829864502,
"reward_std": 0.4251179203391075,
"rewards/clinical_similarity_reward/mean": 0.1646916825324297,
"rewards/clinical_similarity_reward/std": 0.16600224673748015,
"rewards/reasoning_coverage_reward/mean": 0.023585778661072254,
"rewards/reasoning_coverage_reward/std": 0.02774114878848195,
"rewards/structural_reward/mean": 0.55,
"rewards/structural_reward/std": 0.29995107650756836,
"step": 60
},
{
"completion_length": 498.1000015258789,
"completions/clipped_ratio": 0.9666666666666666,
"completions/max_length": 512.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 498.1000030517578,
"completions/mean_terminated_length": 19.0,
"completions/min_length": 428.6,
"completions/min_terminated_length": 19.0,
"epoch": 0.054773082942097026,
"frac_reward_zero_std": 0.0,
"grad_norm": 7.979323387145996,
"kl": 0.8969629426021128,
"learning_rate": 9.840738201890539e-07,
"loss": -0.0103,
"num_tokens": 320748.0,
"reward": 0.7410261273384094,
"reward_std": 0.4414085656404495,
"rewards/clinical_similarity_reward/mean": 0.17917022854089737,
"rewards/clinical_similarity_reward/std": 0.1667772613465786,
"rewards/reasoning_coverage_reward/mean": 0.028522509895265103,
"rewards/reasoning_coverage_reward/std": 0.03220408074557781,
"rewards/structural_reward/mean": 0.5333333283662796,
"rewards/structural_reward/std": 0.3038123190402985,
"step": 70
},
{
"completion_length": 508.8833343505859,
"completions/clipped_ratio": 0.9833333333333334,
"completions/max_length": 512.0,
"completions/max_terminated_length": 32.5,
"completions/mean_length": 508.8833343505859,
"completions/mean_terminated_length": 32.5,
"completions/min_length": 493.3,
"completions/min_terminated_length": 32.5,
"epoch": 0.06259780907668232,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5743541717529297,
"kl": 0.16917975191026927,
"learning_rate": 9.713207455460892e-07,
"loss": -0.0071,
"num_tokens": 366881.0,
"reward": 0.8143157064914703,
"reward_std": 0.3965289264917374,
"rewards/clinical_similarity_reward/mean": 0.1812053605914116,
"rewards/clinical_similarity_reward/std": 0.150646311044693,
"rewards/reasoning_coverage_reward/mean": 0.0247769545763731,
"rewards/reasoning_coverage_reward/std": 0.02672698013484478,
"rewards/structural_reward/mean": 0.6083333313465118,
"rewards/structural_reward/std": 0.2841934531927109,
"step": 80
},
{
"completion_length": 508.28333435058596,
"completions/clipped_ratio": 0.9833333333333334,
"completions/max_length": 512.0,
"completions/max_terminated_length": 28.9,
"completions/mean_length": 508.28333435058596,
"completions/mean_terminated_length": 28.9,
"completions/min_length": 489.7,
"completions/min_terminated_length": 28.9,
"epoch": 0.07042253521126761,
"frac_reward_zero_std": 0.0,
"grad_norm": 81.7920150756836,
"kl": 2.0460782941430806,
"learning_rate": 9.549806354382715e-07,
"loss": -0.0094,
"num_tokens": 412978.0,
"reward": 0.8557363629341126,
"reward_std": 0.30585863888263704,
"rewards/clinical_similarity_reward/mean": 0.19592185989022254,
"rewards/clinical_similarity_reward/std": 0.14855169430375098,
"rewards/reasoning_coverage_reward/mean": 0.03481445591896772,
"rewards/reasoning_coverage_reward/std": 0.02430342249572277,
"rewards/structural_reward/mean": 0.625,
"rewards/structural_reward/std": 0.22509194910526276,
"step": 90
},
{
"completion_length": 512.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 512.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 512.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 512.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.0782472613458529,
"frac_reward_zero_std": 0.0,
"grad_norm": 5.595300197601318,
"kl": 0.35413739252835513,
"learning_rate": 9.351778479699498e-07,
"loss": 0.0007,
"num_tokens": 459298.0,
"reward": 0.7952392637729645,
"reward_std": 0.2889839544892311,
"rewards/clinical_similarity_reward/mean": 0.1676620215177536,
"rewards/clinical_similarity_reward/std": 0.13793348520994186,
"rewards/reasoning_coverage_reward/mean": 0.027577185444533824,
"rewards/reasoning_coverage_reward/std": 0.025368471443653107,
"rewards/structural_reward/mean": 0.5999999940395355,
"rewards/structural_reward/std": 0.22518454492092133,
"step": 100
}
],
"logging_steps": 10,
"max_steps": 400,
"num_input_tokens_seen": 459298,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}