{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0782472613458529, "eval_steps": 5, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 505.6333343505859, "completions/clipped_ratio": 0.9833333333333334, "completions/max_length": 512.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 505.6333343505859, "completions/mean_terminated_length": 13.0, "completions/min_length": 473.8, "completions/min_terminated_length": 13.0, "epoch": 0.00782472613458529, "frac_reward_zero_std": 0.0, "grad_norm": 3.736088514328003, "kl": 0.6088978718966246, "learning_rate": 2.25e-07, "loss": 0.0227, "num_tokens": 45938.0, "reward": 0.6808711916208268, "reward_std": 0.3632167547941208, "rewards/clinical_similarity_reward/mean": 0.15434668213129044, "rewards/clinical_similarity_reward/std": 0.1388381078839302, "rewards/reasoning_coverage_reward/mean": 0.026524468092247844, "rewards/reasoning_coverage_reward/std": 0.024786698445677758, "rewards/structural_reward/mean": 0.49999999403953554, "rewards/structural_reward/std": 0.27526672184467316, "step": 10 }, { "completion_length": 499.23333435058595, "completions/clipped_ratio": 0.9666666666666666, "completions/max_length": 512.0, "completions/max_terminated_length": 19.3, "completions/mean_length": 499.23333435058595, "completions/mean_terminated_length": 12.9, "completions/min_length": 467.3, "completions/min_terminated_length": 6.5, "epoch": 0.01564945226917058, "frac_reward_zero_std": 0.05, "grad_norm": 9.93607234954834, "kl": 0.27236317191272974, "learning_rate": 4.7499999999999995e-07, "loss": -0.0332, "num_tokens": 91492.0, "reward": 0.6439534038305282, "reward_std": 0.3386003218591213, "rewards/clinical_similarity_reward/mean": 0.15627755597233772, "rewards/clinical_similarity_reward/std": 0.1460672415792942, "rewards/reasoning_coverage_reward/mean": 0.021009179577231408, "rewards/reasoning_coverage_reward/std": 0.019322671368718146, "rewards/structural_reward/mean": 0.46666666567325593, "rewards/structural_reward/std": 0.29707907438278197, "step": 20 }, { "completion_length": 486.8333343505859, "completions/clipped_ratio": 0.95, "completions/max_length": 512.0, "completions/max_terminated_length": 2.5, "completions/mean_length": 486.8333343505859, "completions/mean_terminated_length": 1.35, "completions/min_length": 409.8, "completions/min_terminated_length": 0.2, "epoch": 0.023474178403755867, "frac_reward_zero_std": 0.0, "grad_norm": 8.582813262939453, "kl": 0.22508624009788036, "learning_rate": 7.249999999999999e-07, "loss": -0.0334, "num_tokens": 136302.0, "reward": 0.7573434472084045, "reward_std": 0.34938589334487913, "rewards/clinical_similarity_reward/mean": 0.1784390039741993, "rewards/clinical_similarity_reward/std": 0.1555245727300644, "rewards/reasoning_coverage_reward/mean": 0.028904422465711833, "rewards/reasoning_coverage_reward/std": 0.02461537951603532, "rewards/structural_reward/mean": 0.5499999940395355, "rewards/structural_reward/std": 0.27120388448238375, "step": 30 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03129890453834116, "frac_reward_zero_std": 0.1, "grad_norm": 5.6369709968566895, "kl": 0.2943277703016065, "learning_rate": 9.75e-07, "loss": 0.0006, "num_tokens": 182622.0, "reward": 0.667632919549942, "reward_std": 0.3562010109424591, "rewards/clinical_similarity_reward/mean": 0.14395808503031732, "rewards/clinical_similarity_reward/std": 0.1386837735772133, "rewards/reasoning_coverage_reward/mean": 0.02367481905966997, "rewards/reasoning_coverage_reward/std": 0.026593181863427164, "rewards/structural_reward/mean": 0.4999999970197678, "rewards/structural_reward/std": 0.3440804839134216, "step": 40 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.03912363067292645, "frac_reward_zero_std": 0.1, "grad_norm": 2.707392692565918, "kl": 0.2239656963472953, "learning_rate": 9.98458666866564e-07, "loss": 0.0004, "num_tokens": 228942.0, "reward": 0.6695286631584167, "reward_std": 0.28249876499176024, "rewards/clinical_similarity_reward/mean": 0.1714836047962308, "rewards/clinical_similarity_reward/std": 0.11788879204541444, "rewards/reasoning_coverage_reward/mean": 0.0230450589209795, "rewards/reasoning_coverage_reward/std": 0.02328432989306748, "rewards/structural_reward/mean": 0.4750000037252903, "rewards/structural_reward/std": 0.26233516484498975, "step": 50 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.046948356807511735, "frac_reward_zero_std": 0.0, "grad_norm": 29.3859806060791, "kl": 0.23591715623624623, "learning_rate": 9.931428007686156e-07, "loss": 0.0005, "num_tokens": 275262.0, "reward": 0.7382774829864502, "reward_std": 0.4251179203391075, "rewards/clinical_similarity_reward/mean": 0.1646916825324297, "rewards/clinical_similarity_reward/std": 0.16600224673748015, "rewards/reasoning_coverage_reward/mean": 0.023585778661072254, "rewards/reasoning_coverage_reward/std": 0.02774114878848195, "rewards/structural_reward/mean": 0.55, "rewards/structural_reward/std": 0.29995107650756836, "step": 60 }, { "completion_length": 498.1000015258789, "completions/clipped_ratio": 0.9666666666666666, "completions/max_length": 512.0, "completions/max_terminated_length": 19.0, "completions/mean_length": 498.1000030517578, "completions/mean_terminated_length": 19.0, "completions/min_length": 428.6, "completions/min_terminated_length": 19.0, "epoch": 0.054773082942097026, "frac_reward_zero_std": 0.0, "grad_norm": 7.979323387145996, "kl": 0.8969629426021128, "learning_rate": 9.840738201890539e-07, "loss": -0.0103, "num_tokens": 320748.0, "reward": 0.7410261273384094, "reward_std": 0.4414085656404495, "rewards/clinical_similarity_reward/mean": 0.17917022854089737, "rewards/clinical_similarity_reward/std": 0.1667772613465786, "rewards/reasoning_coverage_reward/mean": 0.028522509895265103, "rewards/reasoning_coverage_reward/std": 0.03220408074557781, "rewards/structural_reward/mean": 0.5333333283662796, "rewards/structural_reward/std": 0.3038123190402985, "step": 70 }, { "completion_length": 508.8833343505859, "completions/clipped_ratio": 0.9833333333333334, "completions/max_length": 512.0, "completions/max_terminated_length": 32.5, "completions/mean_length": 508.8833343505859, "completions/mean_terminated_length": 32.5, "completions/min_length": 493.3, "completions/min_terminated_length": 32.5, "epoch": 0.06259780907668232, "frac_reward_zero_std": 0.0, "grad_norm": 2.5743541717529297, "kl": 0.16917975191026927, "learning_rate": 9.713207455460892e-07, "loss": -0.0071, "num_tokens": 366881.0, "reward": 0.8143157064914703, "reward_std": 0.3965289264917374, "rewards/clinical_similarity_reward/mean": 0.1812053605914116, "rewards/clinical_similarity_reward/std": 0.150646311044693, "rewards/reasoning_coverage_reward/mean": 0.0247769545763731, "rewards/reasoning_coverage_reward/std": 0.02672698013484478, "rewards/structural_reward/mean": 0.6083333313465118, "rewards/structural_reward/std": 0.2841934531927109, "step": 80 }, { "completion_length": 508.28333435058596, "completions/clipped_ratio": 0.9833333333333334, "completions/max_length": 512.0, "completions/max_terminated_length": 28.9, "completions/mean_length": 508.28333435058596, "completions/mean_terminated_length": 28.9, "completions/min_length": 489.7, "completions/min_terminated_length": 28.9, "epoch": 0.07042253521126761, "frac_reward_zero_std": 0.0, "grad_norm": 81.7920150756836, "kl": 2.0460782941430806, "learning_rate": 9.549806354382715e-07, "loss": -0.0094, "num_tokens": 412978.0, "reward": 0.8557363629341126, "reward_std": 0.30585863888263704, "rewards/clinical_similarity_reward/mean": 0.19592185989022254, "rewards/clinical_similarity_reward/std": 0.14855169430375098, "rewards/reasoning_coverage_reward/mean": 0.03481445591896772, "rewards/reasoning_coverage_reward/std": 0.02430342249572277, "rewards/structural_reward/mean": 0.625, "rewards/structural_reward/std": 0.22509194910526276, "step": 90 }, { "completion_length": 512.0, "completions/clipped_ratio": 1.0, "completions/max_length": 512.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 512.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 512.0, "completions/min_terminated_length": 0.0, "epoch": 0.0782472613458529, "frac_reward_zero_std": 0.0, "grad_norm": 5.595300197601318, "kl": 0.35413739252835513, "learning_rate": 9.351778479699498e-07, "loss": 0.0007, "num_tokens": 459298.0, "reward": 0.7952392637729645, "reward_std": 0.2889839544892311, "rewards/clinical_similarity_reward/mean": 0.1676620215177536, "rewards/clinical_similarity_reward/std": 0.13793348520994186, "rewards/reasoning_coverage_reward/mean": 0.027577185444533824, "rewards/reasoning_coverage_reward/std": 0.025368471443653107, "rewards/structural_reward/mean": 0.5999999940395355, "rewards/structural_reward/std": 0.22518454492092133, "step": 100 } ], "logging_steps": 10, "max_steps": 400, "num_input_tokens_seen": 459298, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }