Supernova-0.6B / trainer_state.json
Ashima's picture
Upload checkpoint-600
44b6aad verified
{
"best_global_step": 600,
"best_metric": 0.10775862068965517,
"best_model_checkpoint": "/local2/asuvarna31/distractors/Qwen3-0.6B_Mar18-0256_qwen3_0.6b_micro_top2_Mar18-0250_10000/checkpoint-600",
"epoch": 0.12,
"eval_steps": 100,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0475,
"completions/max_length": 1907.96,
"completions/max_terminated_length": 1542.92,
"completions/mean_length": 794.835,
"completions/mean_terminated_length": 635.4152294921874,
"completions/min_length": 231.32,
"completions/min_terminated_length": 231.32,
"entropy": 0.39757278442382815,
"epoch": 0.005,
"frac_reward_zero_std": 0.26,
"grad_norm": 0.7908472990123143,
"learning_rate": 9.952e-07,
"loss": -0.0219,
"num_tokens": 370918.0,
"reward": 0.37,
"reward_std": 0.42105737447738645,
"rewards/combined_reward/mean": 0.37,
"rewards/combined_reward/std": 0.4210573983192444,
"sampling/importance_sampling_ratio/max": 2.198445258140564,
"sampling/importance_sampling_ratio/mean": 0.6045227408409118,
"sampling/importance_sampling_ratio/min": 0.022674707409983057,
"sampling/sampling_logp_difference/max": 0.698764111995697,
"sampling/sampling_logp_difference/mean": 0.02563132591545582,
"step": 25,
"step_time": 7.23048153122887
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0225,
"completions/max_length": 1275.04,
"completions/max_terminated_length": 1168.96,
"completions/mean_length": 563.5925,
"completions/mean_terminated_length": 512.9527490234375,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"entropy": 0.41257245182991026,
"epoch": 0.01,
"frac_reward_zero_std": 0.34,
"grad_norm": 1.1362625413839582,
"learning_rate": 9.901999999999999e-07,
"loss": 0.0233,
"num_tokens": 649035.0,
"reward": 0.5125,
"reward_std": 0.3963914489746094,
"rewards/combined_reward/mean": 0.5125,
"rewards/combined_reward/std": 0.39639145612716675,
"sampling/importance_sampling_ratio/max": 2.1132223176956177,
"sampling/importance_sampling_ratio/mean": 0.6526571130752563,
"sampling/importance_sampling_ratio/min": 0.029447911735951494,
"sampling/sampling_logp_difference/max": 0.6200782227516174,
"sampling/sampling_logp_difference/mean": 0.026439733169972897,
"step": 50,
"step_time": 4.926513614905998
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0575,
"completions/max_length": 2291.2,
"completions/max_terminated_length": 1943.84,
"completions/mean_length": 961.34,
"completions/mean_terminated_length": 788.9747546386719,
"completions/min_length": 298.44,
"completions/min_terminated_length": 298.44,
"entropy": 0.38926577985286714,
"epoch": 0.015,
"frac_reward_zero_std": 0.24,
"grad_norm": 1.8509531292907888,
"learning_rate": 9.852e-07,
"loss": 0.0555,
"num_tokens": 1089027.0,
"reward": 0.535,
"reward_std": 0.44107813596725465,
"rewards/combined_reward/mean": 0.535,
"rewards/combined_reward/std": 0.44107815861701966,
"sampling/importance_sampling_ratio/max": 1.9247741317749023,
"sampling/importance_sampling_ratio/mean": 0.593499310016632,
"sampling/importance_sampling_ratio/min": 0.025702737645042363,
"sampling/sampling_logp_difference/max": 0.7170420527458191,
"sampling/sampling_logp_difference/mean": 0.02496741436421871,
"step": 75,
"step_time": 8.62744049324654
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03,
"completions/max_length": 1365.76,
"completions/max_terminated_length": 1239.48,
"completions/mean_length": 614.215,
"completions/mean_terminated_length": 525.4255004882813,
"completions/min_length": 196.08,
"completions/min_terminated_length": 196.08,
"entropy": 0.42757334232330324,
"epoch": 0.02,
"frac_reward_zero_std": 0.32,
"grad_norm": 2.7107245292264266,
"learning_rate": 9.802e-07,
"loss": -0.016,
"num_tokens": 1389617.0,
"reward": 0.625,
"reward_std": 0.4001571249961853,
"rewards/combined_reward/mean": 0.625,
"rewards/combined_reward/std": 0.400157151222229,
"sampling/importance_sampling_ratio/max": 2.0216542720794677,
"sampling/importance_sampling_ratio/mean": 0.6695219826698303,
"sampling/importance_sampling_ratio/min": 0.031132260655103893,
"sampling/sampling_logp_difference/max": 0.7458893251419068,
"sampling/sampling_logp_difference/mean": 0.027404132559895517,
"step": 100,
"step_time": 5.374540434898808
},
{
"epoch": 0.02,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.3599137931034483,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3399.1724137931033,
"eval_bbeh_mini_completions/mean_length": 2556.969827586207,
"eval_bbeh_mini_completions/mean_terminated_length": 1690.4230199353449,
"eval_bbeh_mini_completions/min_length": 389.13793103448273,
"eval_bbeh_mini_completions/min_terminated_length": 389.13793103448273,
"eval_bbeh_mini_entropy": 0.33320264775177527,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 1389617.0,
"eval_bbeh_mini_reward": 0.0625,
"eval_bbeh_mini_reward_std": 0.1932970984228726,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.0625,
"eval_bbeh_mini_rewards/combined_reward/std": 0.19329710458887034,
"eval_bbeh_mini_runtime": 493.3587,
"eval_bbeh_mini_samples_per_second": 0.932,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.5544916828130853,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.24806074290696917,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 4.2029642265184464e-05,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.1536410635915297,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.022588976112932993,
"eval_bbeh_mini_steps_per_second": 0.059,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0925,
"completions/max_length": 2125.76,
"completions/max_terminated_length": 1810.64,
"completions/mean_length": 1089.9775,
"completions/mean_terminated_length": 842.0292138671875,
"completions/min_length": 302.92,
"completions/min_terminated_length": 302.92,
"entropy": 0.381637504696846,
"epoch": 0.025,
"frac_reward_zero_std": 0.44,
"grad_norm": 0.8500470780968633,
"learning_rate": 9.752e-07,
"loss": 0.0054,
"num_tokens": 1879680.0,
"reward": 0.5525,
"reward_std": 0.3752482628822327,
"rewards/combined_reward/mean": 0.5525,
"rewards/combined_reward/std": 0.3752482759952545,
"sampling/importance_sampling_ratio/max": 2.1725155782699583,
"sampling/importance_sampling_ratio/mean": 0.5781891107559204,
"sampling/importance_sampling_ratio/min": 0.014303099608951016,
"sampling/sampling_logp_difference/max": 0.7780866742134094,
"sampling/sampling_logp_difference/mean": 0.02495789147913456,
"step": 125,
"step_time": 8.32747592881322
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1443.16,
"completions/max_terminated_length": 1350.84,
"completions/mean_length": 602.2575,
"completions/mean_terminated_length": 526.3002880859375,
"completions/min_length": 181.92,
"completions/min_terminated_length": 181.92,
"entropy": 0.3935022366046905,
"epoch": 0.03,
"frac_reward_zero_std": 0.46,
"grad_norm": 3.9983544495385286,
"learning_rate": 9.701999999999998e-07,
"loss": -0.0266,
"num_tokens": 2173791.0,
"reward": 0.675,
"reward_std": 0.4132915163040161,
"rewards/combined_reward/mean": 0.675,
"rewards/combined_reward/std": 0.41329152703285216,
"sampling/importance_sampling_ratio/max": 1.974114351272583,
"sampling/importance_sampling_ratio/mean": 0.6311233282089234,
"sampling/importance_sampling_ratio/min": 0.03929259464435745,
"sampling/sampling_logp_difference/max": 0.6166027712821961,
"sampling/sampling_logp_difference/mean": 0.02564044661819935,
"step": 150,
"step_time": 5.59985625769943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0475,
"completions/max_length": 1882.6,
"completions/max_terminated_length": 1692.64,
"completions/mean_length": 824.66,
"completions/mean_terminated_length": 694.9667797851563,
"completions/min_length": 231.4,
"completions/min_terminated_length": 231.4,
"entropy": 0.38833099365234375,
"epoch": 0.035,
"frac_reward_zero_std": 0.44,
"grad_norm": 0.0,
"learning_rate": 9.651999999999999e-07,
"loss": -0.0042,
"num_tokens": 2556495.0,
"reward": 0.615,
"reward_std": 0.3986561942100525,
"rewards/combined_reward/mean": 0.615,
"rewards/combined_reward/std": 0.39865620732307433,
"sampling/importance_sampling_ratio/max": 2.0362929487228394,
"sampling/importance_sampling_ratio/mean": 0.6166802150011063,
"sampling/importance_sampling_ratio/min": 0.03415079687081743,
"sampling/sampling_logp_difference/max": 0.9840151739120483,
"sampling/sampling_logp_difference/mean": 0.025189833119511604,
"step": 175,
"step_time": 7.186449560681358
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0475,
"completions/max_length": 2255.96,
"completions/max_terminated_length": 1958.16,
"completions/mean_length": 876.6025,
"completions/mean_terminated_length": 734.1006811523438,
"completions/min_length": 209.52,
"completions/min_terminated_length": 209.52,
"entropy": 0.3978144943714142,
"epoch": 0.04,
"frac_reward_zero_std": 0.34,
"grad_norm": 2.407379133784635,
"learning_rate": 9.602e-07,
"loss": 0.0184,
"num_tokens": 2959120.0,
"reward": 0.5975,
"reward_std": 0.4489552092552185,
"rewards/combined_reward/mean": 0.5975,
"rewards/combined_reward/std": 0.4489552354812622,
"sampling/importance_sampling_ratio/max": 1.9040312719345094,
"sampling/importance_sampling_ratio/mean": 0.5416174274682999,
"sampling/importance_sampling_ratio/min": 0.018405352871644087,
"sampling/sampling_logp_difference/max": 0.8134938716888428,
"sampling/sampling_logp_difference/mean": 0.025685913935303686,
"step": 200,
"step_time": 8.387271651169286
},
{
"epoch": 0.04,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.31896551724137934,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3501.3793103448274,
"eval_bbeh_mini_completions/mean_length": 2493.7543103448274,
"eval_bbeh_mini_completions/mean_terminated_length": 1744.4683248585668,
"eval_bbeh_mini_completions/min_length": 402.0689655172414,
"eval_bbeh_mini_completions/min_terminated_length": 402.0689655172414,
"eval_bbeh_mini_entropy": 0.33658467079031057,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 2959120.0,
"eval_bbeh_mini_reward": 0.07327586206896551,
"eval_bbeh_mini_reward_std": 0.22547399586644665,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.07327586206896551,
"eval_bbeh_mini_rewards/combined_reward/std": 0.22547400408777699,
"eval_bbeh_mini_runtime": 485.7912,
"eval_bbeh_mini_samples_per_second": 0.947,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.4247941641971982,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.21224350610683704,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 1.9089912678375065e-05,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.4790474669686677,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.022729232650378656,
"eval_bbeh_mini_steps_per_second": 0.06,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.045,
"completions/max_length": 2272.24,
"completions/max_terminated_length": 1942.2,
"completions/mean_length": 935.3925,
"completions/mean_terminated_length": 812.9290356445313,
"completions/min_length": 275.64,
"completions/min_terminated_length": 275.64,
"entropy": 0.4116176557540894,
"epoch": 0.045,
"frac_reward_zero_std": 0.42,
"grad_norm": 3.5037476412669593,
"learning_rate": 9.552e-07,
"loss": 0.0173,
"num_tokens": 3383261.0,
"reward": 0.6625,
"reward_std": 0.41886321067810056,
"rewards/combined_reward/mean": 0.6625,
"rewards/combined_reward/std": 0.41886322021484373,
"sampling/importance_sampling_ratio/max": 2.0795044040679933,
"sampling/importance_sampling_ratio/mean": 0.6006859976053238,
"sampling/importance_sampling_ratio/min": 0.014239588570781052,
"sampling/sampling_logp_difference/max": 0.7338527536392212,
"sampling/sampling_logp_difference/mean": 0.02595676988363266,
"step": 225,
"step_time": 8.487474374789745
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03,
"completions/max_length": 1640.28,
"completions/max_terminated_length": 1389.36,
"completions/mean_length": 653.0075,
"completions/mean_terminated_length": 565.9887109375,
"completions/min_length": 255.52,
"completions/min_terminated_length": 255.52,
"entropy": 0.4170915710926056,
"epoch": 0.05,
"frac_reward_zero_std": 0.42,
"grad_norm": 2.1638076114867757,
"learning_rate": 9.502e-07,
"loss": -0.0148,
"num_tokens": 3698864.0,
"reward": 0.705,
"reward_std": 0.38157126426696775,
"rewards/combined_reward/mean": 0.705,
"rewards/combined_reward/std": 0.38157127737998964,
"sampling/importance_sampling_ratio/max": 1.9969594597816467,
"sampling/importance_sampling_ratio/mean": 0.628093301653862,
"sampling/importance_sampling_ratio/min": 0.018454841256149202,
"sampling/sampling_logp_difference/max": 0.659376904964447,
"sampling/sampling_logp_difference/mean": 0.026758400201797487,
"step": 250,
"step_time": 6.111578326625749
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.055,
"completions/max_length": 2229.44,
"completions/max_terminated_length": 1948.52,
"completions/mean_length": 942.01,
"completions/mean_terminated_length": 788.1107543945312,
"completions/min_length": 249.4,
"completions/min_terminated_length": 249.4,
"entropy": 0.4110789692401886,
"epoch": 0.055,
"frac_reward_zero_std": 0.5,
"grad_norm": 4.667525978474157,
"learning_rate": 9.452e-07,
"loss": -0.0242,
"num_tokens": 4129764.0,
"reward": 0.6575,
"reward_std": 0.4181215476989746,
"rewards/combined_reward/mean": 0.6575,
"rewards/combined_reward/std": 0.41812155723571776,
"sampling/importance_sampling_ratio/max": 1.8673054933547975,
"sampling/importance_sampling_ratio/mean": 0.5125843727588654,
"sampling/importance_sampling_ratio/min": 0.01608145761529954,
"sampling/sampling_logp_difference/max": 0.8164897656440735,
"sampling/sampling_logp_difference/mean": 0.02648505486547947,
"step": 275,
"step_time": 8.40903925454244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04,
"completions/max_length": 2057.4,
"completions/max_terminated_length": 1949.44,
"completions/mean_length": 919.8125,
"completions/mean_terminated_length": 816.5338427734375,
"completions/min_length": 261.0,
"completions/min_terminated_length": 261.0,
"entropy": 0.41194292187690734,
"epoch": 0.06,
"frac_reward_zero_std": 0.42,
"grad_norm": 1.1934433538781786,
"learning_rate": 9.402e-07,
"loss": 0.0465,
"num_tokens": 4550697.0,
"reward": 0.6275,
"reward_std": 0.417396194934845,
"rewards/combined_reward/mean": 0.6275,
"rewards/combined_reward/std": 0.41739620923995974,
"sampling/importance_sampling_ratio/max": 2.1660247945785525,
"sampling/importance_sampling_ratio/mean": 0.6122340059280396,
"sampling/importance_sampling_ratio/min": 0.014834024702245853,
"sampling/sampling_logp_difference/max": 0.8179536938667298,
"sampling/sampling_logp_difference/mean": 0.025995058193802833,
"step": 300,
"step_time": 7.779672881411389
},
{
"epoch": 0.06,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.36637931034482757,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3524.7241379310344,
"eval_bbeh_mini_completions/mean_length": 2685.9525862068967,
"eval_bbeh_mini_completions/mean_terminated_length": 1877.7948924097523,
"eval_bbeh_mini_completions/min_length": 571.0689655172414,
"eval_bbeh_mini_completions/min_terminated_length": 571.0689655172414,
"eval_bbeh_mini_entropy": 0.35236285369971704,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 4550697.0,
"eval_bbeh_mini_reward": 0.10344827586206896,
"eval_bbeh_mini_reward_std": 0.2699657020897701,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.10344827586206896,
"eval_bbeh_mini_rewards/combined_reward/std": 0.26996571339409925,
"eval_bbeh_mini_runtime": 494.6737,
"eval_bbeh_mini_samples_per_second": 0.93,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.2562965689034298,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.1846869053511784,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 8.863040915959077e-06,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.5924758705599555,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.02370224494872422,
"eval_bbeh_mini_steps_per_second": 0.059,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.07,
"completions/max_length": 2060.08,
"completions/max_terminated_length": 1839.84,
"completions/mean_length": 893.005,
"completions/mean_terminated_length": 669.4148620605469,
"completions/min_length": 229.84,
"completions/min_terminated_length": 229.84,
"entropy": 0.40538407385349273,
"epoch": 0.065,
"frac_reward_zero_std": 0.52,
"grad_norm": 5.036384229761231,
"learning_rate": 9.352e-07,
"loss": 0.0219,
"num_tokens": 4961307.0,
"reward": 0.61,
"reward_std": 0.39049545288085935,
"rewards/combined_reward/mean": 0.61,
"rewards/combined_reward/std": 0.3904954707622528,
"sampling/importance_sampling_ratio/max": 2.171120972633362,
"sampling/importance_sampling_ratio/mean": 0.6055274987220765,
"sampling/importance_sampling_ratio/min": 0.031129270781748347,
"sampling/sampling_logp_difference/max": 0.7485472440719605,
"sampling/sampling_logp_difference/mean": 0.02611954465508461,
"step": 325,
"step_time": 7.8838804703298955
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0075,
"completions/max_length": 1540.16,
"completions/max_terminated_length": 1477.4,
"completions/mean_length": 649.275,
"completions/mean_terminated_length": 629.781923828125,
"completions/min_length": 224.08,
"completions/min_terminated_length": 224.08,
"entropy": 0.44817891597747805,
"epoch": 0.07,
"frac_reward_zero_std": 0.4,
"grad_norm": 1.262847377601464,
"learning_rate": 9.302e-07,
"loss": -0.0023,
"num_tokens": 5272369.0,
"reward": 0.63,
"reward_std": 0.40423260688781737,
"rewards/combined_reward/mean": 0.63,
"rewards/combined_reward/std": 0.4042326259613037,
"sampling/importance_sampling_ratio/max": 1.9462181329727173,
"sampling/importance_sampling_ratio/mean": 0.5410465461015701,
"sampling/importance_sampling_ratio/min": 0.007918943986296653,
"sampling/sampling_logp_difference/max": 0.658315749168396,
"sampling/sampling_logp_difference/mean": 0.028008888587355615,
"step": 350,
"step_time": 5.602628886112943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.055,
"completions/max_length": 1894.84,
"completions/max_terminated_length": 1623.76,
"completions/mean_length": 823.715,
"completions/mean_terminated_length": 638.96466796875,
"completions/min_length": 199.48,
"completions/min_terminated_length": 199.48,
"entropy": 0.4437406814098358,
"epoch": 0.075,
"frac_reward_zero_std": 0.42,
"grad_norm": 1.416585270355939,
"learning_rate": 9.251999999999999e-07,
"loss": -0.0063,
"num_tokens": 5651599.0,
"reward": 0.6375,
"reward_std": 0.4141005539894104,
"rewards/combined_reward/mean": 0.6375,
"rewards/combined_reward/std": 0.41410056471824647,
"sampling/importance_sampling_ratio/max": 2.079793190956116,
"sampling/importance_sampling_ratio/mean": 0.6110782504081727,
"sampling/importance_sampling_ratio/min": 0.012589475377462805,
"sampling/sampling_logp_difference/max": 0.7504757452011108,
"sampling/sampling_logp_difference/mean": 0.027451810911297798,
"step": 375,
"step_time": 7.127676211716607
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04,
"completions/max_length": 1728.44,
"completions/max_terminated_length": 1628.96,
"completions/mean_length": 761.1325,
"completions/mean_terminated_length": 657.4637036132813,
"completions/min_length": 202.8,
"completions/min_terminated_length": 202.8,
"entropy": 0.40143827855587005,
"epoch": 0.08,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0,
"learning_rate": 9.202e-07,
"loss": 0.0175,
"num_tokens": 6009092.0,
"reward": 0.5775,
"reward_std": 0.38390336275100706,
"rewards/combined_reward/mean": 0.5775,
"rewards/combined_reward/std": 0.38390338063240054,
"sampling/importance_sampling_ratio/max": 2.096400910615921,
"sampling/importance_sampling_ratio/mean": 0.6684997088462115,
"sampling/importance_sampling_ratio/min": 0.027380506457557203,
"sampling/sampling_logp_difference/max": 0.6964709210395813,
"sampling/sampling_logp_difference/mean": 0.02609425738453865,
"step": 400,
"step_time": 6.615593434758484
},
{
"epoch": 0.08,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.33620689655172414,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3580.5172413793102,
"eval_bbeh_mini_completions/mean_length": 2591.877155172414,
"eval_bbeh_mini_completions/mean_terminated_length": 1825.3767721241918,
"eval_bbeh_mini_completions/min_length": 451.62068965517244,
"eval_bbeh_mini_completions/min_terminated_length": 451.62068965517244,
"eval_bbeh_mini_entropy": 0.3378912878447565,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 6009092.0,
"eval_bbeh_mini_reward": 0.10560344827586207,
"eval_bbeh_mini_reward_std": 0.25904289196277486,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.10560344827586207,
"eval_bbeh_mini_rewards/combined_reward/std": 0.2590429012117715,
"eval_bbeh_mini_runtime": 491.0126,
"eval_bbeh_mini_samples_per_second": 0.937,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.1502898427946815,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.16786166884261985,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 5.115552837594649e-05,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 2.436232163988311,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023024646565318108,
"eval_bbeh_mini_steps_per_second": 0.059,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0725,
"completions/max_length": 2179.08,
"completions/max_terminated_length": 2003.44,
"completions/mean_length": 953.6125,
"completions/mean_terminated_length": 736.740048828125,
"completions/min_length": 218.08,
"completions/min_terminated_length": 218.08,
"entropy": 0.4012847465276718,
"epoch": 0.085,
"frac_reward_zero_std": 0.42,
"grad_norm": 0.5398184196341114,
"learning_rate": 9.151999999999999e-07,
"loss": -0.0237,
"num_tokens": 6441177.0,
"reward": 0.695,
"reward_std": 0.3829051518440247,
"rewards/combined_reward/mean": 0.695,
"rewards/combined_reward/std": 0.382905170917511,
"sampling/importance_sampling_ratio/max": 1.9372561621665954,
"sampling/importance_sampling_ratio/mean": 0.5673770725727081,
"sampling/importance_sampling_ratio/min": 0.03025458916346361,
"sampling/sampling_logp_difference/max": 0.7728560495376587,
"sampling/sampling_logp_difference/mean": 0.02587804526090622,
"step": 425,
"step_time": 8.324159880718216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03,
"completions/max_length": 1513.88,
"completions/max_terminated_length": 1419.48,
"completions/mean_length": 615.395,
"completions/mean_terminated_length": 520.6183935546875,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"entropy": 0.41070492506027223,
"epoch": 0.09,
"frac_reward_zero_std": 0.52,
"grad_norm": 1.5270349642075316,
"learning_rate": 9.102e-07,
"loss": -0.0022,
"num_tokens": 6739631.0,
"reward": 0.735,
"reward_std": 0.33413492679595946,
"rewards/combined_reward/mean": 0.735,
"rewards/combined_reward/std": 0.3341349399089813,
"sampling/importance_sampling_ratio/max": 2.027468514442444,
"sampling/importance_sampling_ratio/mean": 0.6084394156932831,
"sampling/importance_sampling_ratio/min": 0.02255286922645837,
"sampling/sampling_logp_difference/max": 0.5962899017333985,
"sampling/sampling_logp_difference/mean": 0.02680632047355175,
"step": 450,
"step_time": 5.793452201336622
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1631.92,
"completions/max_terminated_length": 1465.8,
"completions/mean_length": 605.425,
"completions/mean_terminated_length": 531.262001953125,
"completions/min_length": 168.24,
"completions/min_terminated_length": 168.24,
"entropy": 0.4006887876987457,
"epoch": 0.095,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.0,
"learning_rate": 9.051999999999999e-07,
"loss": 0.051,
"num_tokens": 7030409.0,
"reward": 0.685,
"reward_std": 0.3645173120498657,
"rewards/combined_reward/mean": 0.685,
"rewards/combined_reward/std": 0.3645173156261444,
"sampling/importance_sampling_ratio/max": 1.9704220461845399,
"sampling/importance_sampling_ratio/mean": 0.62828111410141,
"sampling/importance_sampling_ratio/min": 0.012673925184790278,
"sampling/sampling_logp_difference/max": 0.7063542747497559,
"sampling/sampling_logp_difference/mean": 0.026762503162026406,
"step": 475,
"step_time": 6.15127008873038
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0275,
"completions/max_length": 1725.32,
"completions/max_terminated_length": 1643.52,
"completions/mean_length": 700.63,
"completions/mean_terminated_length": 630.799814453125,
"completions/min_length": 195.16,
"completions/min_terminated_length": 195.16,
"entropy": 0.40278098702430726,
"epoch": 0.1,
"frac_reward_zero_std": 0.46,
"grad_norm": 0.0,
"learning_rate": 9.002e-07,
"loss": 0.0015,
"num_tokens": 7360269.0,
"reward": 0.7075,
"reward_std": 0.32035229206085203,
"rewards/combined_reward/mean": 0.7075,
"rewards/combined_reward/std": 0.3203523027896881,
"sampling/importance_sampling_ratio/max": 1.9510860872268676,
"sampling/importance_sampling_ratio/mean": 0.5961837387084961,
"sampling/importance_sampling_ratio/min": 0.02190992054884191,
"sampling/sampling_logp_difference/max": 0.6417275023460388,
"sampling/sampling_logp_difference/mean": 0.026219148635864258,
"step": 500,
"step_time": 6.48362862716429
},
{
"epoch": 0.1,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.3017241379310345,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3449.0689655172414,
"eval_bbeh_mini_completions/mean_length": 2359.5905172413795,
"eval_bbeh_mini_completions/mean_terminated_length": 1620.2735132677801,
"eval_bbeh_mini_completions/min_length": 336.0689655172414,
"eval_bbeh_mini_completions/min_terminated_length": 336.0689655172414,
"eval_bbeh_mini_entropy": 0.338118112292783,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 7360269.0,
"eval_bbeh_mini_reward": 0.10344827586206896,
"eval_bbeh_mini_reward_std": 0.27482735083020965,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.10344827586206896,
"eval_bbeh_mini_rewards/combined_reward/std": 0.27482736316220513,
"eval_bbeh_mini_runtime": 478.438,
"eval_bbeh_mini_samples_per_second": 0.961,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.6468630525572547,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.2634151639609501,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 2.0942759789977906e-05,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.216358838410213,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023251901348603183,
"eval_bbeh_mini_steps_per_second": 0.061,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03,
"completions/max_length": 1828.68,
"completions/max_terminated_length": 1601.08,
"completions/mean_length": 683.7675,
"completions/mean_terminated_length": 593.3429418945312,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"entropy": 0.4082434940338135,
"epoch": 0.105,
"frac_reward_zero_std": 0.44,
"grad_norm": 0.0,
"learning_rate": 8.951999999999999e-07,
"loss": 0.0051,
"num_tokens": 7685824.0,
"reward": 0.6175,
"reward_std": 0.4563824462890625,
"rewards/combined_reward/mean": 0.6175,
"rewards/combined_reward/std": 0.4563824665546417,
"sampling/importance_sampling_ratio/max": 2.16779132604599,
"sampling/importance_sampling_ratio/mean": 0.6069332575798034,
"sampling/importance_sampling_ratio/min": 0.019685860924364532,
"sampling/sampling_logp_difference/max": 0.6934374618530273,
"sampling/sampling_logp_difference/mean": 0.026688539236783982,
"step": 525,
"step_time": 6.72239572064951
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03,
"completions/max_length": 1910.32,
"completions/max_terminated_length": 1841.88,
"completions/mean_length": 776.53,
"completions/mean_terminated_length": 694.096181640625,
"completions/min_length": 208.48,
"completions/min_terminated_length": 208.48,
"entropy": 0.4136668372154236,
"epoch": 0.11,
"frac_reward_zero_std": 0.52,
"grad_norm": 1.4963386849443154,
"learning_rate": 8.902e-07,
"loss": 0.0071,
"num_tokens": 8047172.0,
"reward": 0.705,
"reward_std": 0.37073400259017947,
"rewards/combined_reward/mean": 0.705,
"rewards/combined_reward/std": 0.3707340121269226,
"sampling/importance_sampling_ratio/max": 2.074637842178345,
"sampling/importance_sampling_ratio/mean": 0.5935343915224075,
"sampling/importance_sampling_ratio/min": 0.033500756603752964,
"sampling/sampling_logp_difference/max": 0.6976750469207764,
"sampling/sampling_logp_difference/mean": 0.02682505249977112,
"step": 550,
"step_time": 7.161583522455767
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04,
"completions/max_length": 2021.56,
"completions/max_terminated_length": 1834.32,
"completions/mean_length": 893.8875,
"completions/mean_terminated_length": 778.9953515625,
"completions/min_length": 246.56,
"completions/min_terminated_length": 246.56,
"entropy": 0.38500270128250125,
"epoch": 0.115,
"frac_reward_zero_std": 0.3,
"grad_norm": 0.0,
"learning_rate": 8.851999999999999e-07,
"loss": 0.0164,
"num_tokens": 8457615.0,
"reward": 0.555,
"reward_std": 0.4233776617050171,
"rewards/combined_reward/mean": 0.555,
"rewards/combined_reward/std": 0.42337767004966737,
"sampling/importance_sampling_ratio/max": 2.0310398817062376,
"sampling/importance_sampling_ratio/mean": 0.5933060163259506,
"sampling/importance_sampling_ratio/min": 0.04317198476808926,
"sampling/sampling_logp_difference/max": 0.821718487739563,
"sampling/sampling_logp_difference/mean": 0.02523998260498047,
"step": 575,
"step_time": 7.77931922157295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 1184.28,
"completions/max_terminated_length": 1105.92,
"completions/mean_length": 524.98,
"completions/mean_terminated_length": 489.793486328125,
"completions/min_length": 213.0,
"completions/min_terminated_length": 213.0,
"entropy": 0.3971246266365051,
"epoch": 0.12,
"frac_reward_zero_std": 0.46,
"grad_norm": 0.0,
"learning_rate": 8.802e-07,
"loss": -0.0002,
"num_tokens": 8715415.0,
"reward": 0.705,
"reward_std": 0.376721715927124,
"rewards/combined_reward/mean": 0.705,
"rewards/combined_reward/std": 0.3767217338085175,
"sampling/importance_sampling_ratio/max": 2.0127450704574583,
"sampling/importance_sampling_ratio/mean": 0.6475339376926422,
"sampling/importance_sampling_ratio/min": 0.04534088842570782,
"sampling/sampling_logp_difference/max": 0.7423757553100586,
"sampling/sampling_logp_difference/mean": 0.025974995717406274,
"step": 600,
"step_time": 4.796778986351565
},
{
"epoch": 0.12,
"eval_bbeh_mini_clip_ratio/high_max": 0.0,
"eval_bbeh_mini_clip_ratio/high_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_mean": 0.0,
"eval_bbeh_mini_clip_ratio/low_min": 0.0,
"eval_bbeh_mini_clip_ratio/region_mean": 0.0,
"eval_bbeh_mini_completions/clipped_ratio": 0.2650862068965517,
"eval_bbeh_mini_completions/max_length": 4096.0,
"eval_bbeh_mini_completions/max_terminated_length": 3541.310344827586,
"eval_bbeh_mini_completions/mean_length": 2310.0086206896553,
"eval_bbeh_mini_completions/mean_terminated_length": 1660.6524237271012,
"eval_bbeh_mini_completions/min_length": 390.48275862068965,
"eval_bbeh_mini_completions/min_terminated_length": 390.48275862068965,
"eval_bbeh_mini_entropy": 0.33623800195496656,
"eval_bbeh_mini_frac_reward_zero_std": 1.0,
"eval_bbeh_mini_loss": 0.0,
"eval_bbeh_mini_num_tokens": 8715415.0,
"eval_bbeh_mini_reward": 0.10775862068965517,
"eval_bbeh_mini_reward_std": 0.27093698238504343,
"eval_bbeh_mini_rewards/combined_reward/mean": 0.10775862068965517,
"eval_bbeh_mini_rewards/combined_reward/std": 0.27093699163404006,
"eval_bbeh_mini_runtime": 475.4716,
"eval_bbeh_mini_samples_per_second": 0.967,
"eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.5325407144324532,
"eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.24085120471387073,
"eval_bbeh_mini_sampling/importance_sampling_ratio/min": 5.8291943258501656e-05,
"eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.5541970894254487,
"eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023053794135821277,
"eval_bbeh_mini_steps_per_second": 0.061,
"step": 600
}
],
"logging_steps": 25,
"max_steps": 5000,
"num_input_tokens_seen": 8715415,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}