| { |
| "best_global_step": 600, |
| "best_metric": 0.10775862068965517, |
| "best_model_checkpoint": "/local2/asuvarna31/distractors/Qwen3-0.6B_Mar18-0256_qwen3_0.6b_micro_top2_Mar18-0250_10000/checkpoint-600", |
| "epoch": 0.12, |
| "eval_steps": 100, |
| "global_step": 600, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0475, |
| "completions/max_length": 1907.96, |
| "completions/max_terminated_length": 1542.92, |
| "completions/mean_length": 794.835, |
| "completions/mean_terminated_length": 635.4152294921874, |
| "completions/min_length": 231.32, |
| "completions/min_terminated_length": 231.32, |
| "entropy": 0.39757278442382815, |
| "epoch": 0.005, |
| "frac_reward_zero_std": 0.26, |
| "grad_norm": 0.7908472990123143, |
| "learning_rate": 9.952e-07, |
| "loss": -0.0219, |
| "num_tokens": 370918.0, |
| "reward": 0.37, |
| "reward_std": 0.42105737447738645, |
| "rewards/combined_reward/mean": 0.37, |
| "rewards/combined_reward/std": 0.4210573983192444, |
| "sampling/importance_sampling_ratio/max": 2.198445258140564, |
| "sampling/importance_sampling_ratio/mean": 0.6045227408409118, |
| "sampling/importance_sampling_ratio/min": 0.022674707409983057, |
| "sampling/sampling_logp_difference/max": 0.698764111995697, |
| "sampling/sampling_logp_difference/mean": 0.02563132591545582, |
| "step": 25, |
| "step_time": 7.23048153122887 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0225, |
| "completions/max_length": 1275.04, |
| "completions/max_terminated_length": 1168.96, |
| "completions/mean_length": 563.5925, |
| "completions/mean_terminated_length": 512.9527490234375, |
| "completions/min_length": 240.0, |
| "completions/min_terminated_length": 240.0, |
| "entropy": 0.41257245182991026, |
| "epoch": 0.01, |
| "frac_reward_zero_std": 0.34, |
| "grad_norm": 1.1362625413839582, |
| "learning_rate": 9.901999999999999e-07, |
| "loss": 0.0233, |
| "num_tokens": 649035.0, |
| "reward": 0.5125, |
| "reward_std": 0.3963914489746094, |
| "rewards/combined_reward/mean": 0.5125, |
| "rewards/combined_reward/std": 0.39639145612716675, |
| "sampling/importance_sampling_ratio/max": 2.1132223176956177, |
| "sampling/importance_sampling_ratio/mean": 0.6526571130752563, |
| "sampling/importance_sampling_ratio/min": 0.029447911735951494, |
| "sampling/sampling_logp_difference/max": 0.6200782227516174, |
| "sampling/sampling_logp_difference/mean": 0.026439733169972897, |
| "step": 50, |
| "step_time": 4.926513614905998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0575, |
| "completions/max_length": 2291.2, |
| "completions/max_terminated_length": 1943.84, |
| "completions/mean_length": 961.34, |
| "completions/mean_terminated_length": 788.9747546386719, |
| "completions/min_length": 298.44, |
| "completions/min_terminated_length": 298.44, |
| "entropy": 0.38926577985286714, |
| "epoch": 0.015, |
| "frac_reward_zero_std": 0.24, |
| "grad_norm": 1.8509531292907888, |
| "learning_rate": 9.852e-07, |
| "loss": 0.0555, |
| "num_tokens": 1089027.0, |
| "reward": 0.535, |
| "reward_std": 0.44107813596725465, |
| "rewards/combined_reward/mean": 0.535, |
| "rewards/combined_reward/std": 0.44107815861701966, |
| "sampling/importance_sampling_ratio/max": 1.9247741317749023, |
| "sampling/importance_sampling_ratio/mean": 0.593499310016632, |
| "sampling/importance_sampling_ratio/min": 0.025702737645042363, |
| "sampling/sampling_logp_difference/max": 0.7170420527458191, |
| "sampling/sampling_logp_difference/mean": 0.02496741436421871, |
| "step": 75, |
| "step_time": 8.62744049324654 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03, |
| "completions/max_length": 1365.76, |
| "completions/max_terminated_length": 1239.48, |
| "completions/mean_length": 614.215, |
| "completions/mean_terminated_length": 525.4255004882813, |
| "completions/min_length": 196.08, |
| "completions/min_terminated_length": 196.08, |
| "entropy": 0.42757334232330324, |
| "epoch": 0.02, |
| "frac_reward_zero_std": 0.32, |
| "grad_norm": 2.7107245292264266, |
| "learning_rate": 9.802e-07, |
| "loss": -0.016, |
| "num_tokens": 1389617.0, |
| "reward": 0.625, |
| "reward_std": 0.4001571249961853, |
| "rewards/combined_reward/mean": 0.625, |
| "rewards/combined_reward/std": 0.400157151222229, |
| "sampling/importance_sampling_ratio/max": 2.0216542720794677, |
| "sampling/importance_sampling_ratio/mean": 0.6695219826698303, |
| "sampling/importance_sampling_ratio/min": 0.031132260655103893, |
| "sampling/sampling_logp_difference/max": 0.7458893251419068, |
| "sampling/sampling_logp_difference/mean": 0.027404132559895517, |
| "step": 100, |
| "step_time": 5.374540434898808 |
| }, |
| { |
| "epoch": 0.02, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.3599137931034483, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3399.1724137931033, |
| "eval_bbeh_mini_completions/mean_length": 2556.969827586207, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1690.4230199353449, |
| "eval_bbeh_mini_completions/min_length": 389.13793103448273, |
| "eval_bbeh_mini_completions/min_terminated_length": 389.13793103448273, |
| "eval_bbeh_mini_entropy": 0.33320264775177527, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 1389617.0, |
| "eval_bbeh_mini_reward": 0.0625, |
| "eval_bbeh_mini_reward_std": 0.1932970984228726, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.0625, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.19329710458887034, |
| "eval_bbeh_mini_runtime": 493.3587, |
| "eval_bbeh_mini_samples_per_second": 0.932, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.5544916828130853, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.24806074290696917, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 4.2029642265184464e-05, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.1536410635915297, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.022588976112932993, |
| "eval_bbeh_mini_steps_per_second": 0.059, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0925, |
| "completions/max_length": 2125.76, |
| "completions/max_terminated_length": 1810.64, |
| "completions/mean_length": 1089.9775, |
| "completions/mean_terminated_length": 842.0292138671875, |
| "completions/min_length": 302.92, |
| "completions/min_terminated_length": 302.92, |
| "entropy": 0.381637504696846, |
| "epoch": 0.025, |
| "frac_reward_zero_std": 0.44, |
| "grad_norm": 0.8500470780968633, |
| "learning_rate": 9.752e-07, |
| "loss": 0.0054, |
| "num_tokens": 1879680.0, |
| "reward": 0.5525, |
| "reward_std": 0.3752482628822327, |
| "rewards/combined_reward/mean": 0.5525, |
| "rewards/combined_reward/std": 0.3752482759952545, |
| "sampling/importance_sampling_ratio/max": 2.1725155782699583, |
| "sampling/importance_sampling_ratio/mean": 0.5781891107559204, |
| "sampling/importance_sampling_ratio/min": 0.014303099608951016, |
| "sampling/sampling_logp_difference/max": 0.7780866742134094, |
| "sampling/sampling_logp_difference/mean": 0.02495789147913456, |
| "step": 125, |
| "step_time": 8.32747592881322 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1443.16, |
| "completions/max_terminated_length": 1350.84, |
| "completions/mean_length": 602.2575, |
| "completions/mean_terminated_length": 526.3002880859375, |
| "completions/min_length": 181.92, |
| "completions/min_terminated_length": 181.92, |
| "entropy": 0.3935022366046905, |
| "epoch": 0.03, |
| "frac_reward_zero_std": 0.46, |
| "grad_norm": 3.9983544495385286, |
| "learning_rate": 9.701999999999998e-07, |
| "loss": -0.0266, |
| "num_tokens": 2173791.0, |
| "reward": 0.675, |
| "reward_std": 0.4132915163040161, |
| "rewards/combined_reward/mean": 0.675, |
| "rewards/combined_reward/std": 0.41329152703285216, |
| "sampling/importance_sampling_ratio/max": 1.974114351272583, |
| "sampling/importance_sampling_ratio/mean": 0.6311233282089234, |
| "sampling/importance_sampling_ratio/min": 0.03929259464435745, |
| "sampling/sampling_logp_difference/max": 0.6166027712821961, |
| "sampling/sampling_logp_difference/mean": 0.02564044661819935, |
| "step": 150, |
| "step_time": 5.59985625769943 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0475, |
| "completions/max_length": 1882.6, |
| "completions/max_terminated_length": 1692.64, |
| "completions/mean_length": 824.66, |
| "completions/mean_terminated_length": 694.9667797851563, |
| "completions/min_length": 231.4, |
| "completions/min_terminated_length": 231.4, |
| "entropy": 0.38833099365234375, |
| "epoch": 0.035, |
| "frac_reward_zero_std": 0.44, |
| "grad_norm": 0.0, |
| "learning_rate": 9.651999999999999e-07, |
| "loss": -0.0042, |
| "num_tokens": 2556495.0, |
| "reward": 0.615, |
| "reward_std": 0.3986561942100525, |
| "rewards/combined_reward/mean": 0.615, |
| "rewards/combined_reward/std": 0.39865620732307433, |
| "sampling/importance_sampling_ratio/max": 2.0362929487228394, |
| "sampling/importance_sampling_ratio/mean": 0.6166802150011063, |
| "sampling/importance_sampling_ratio/min": 0.03415079687081743, |
| "sampling/sampling_logp_difference/max": 0.9840151739120483, |
| "sampling/sampling_logp_difference/mean": 0.025189833119511604, |
| "step": 175, |
| "step_time": 7.186449560681358 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0475, |
| "completions/max_length": 2255.96, |
| "completions/max_terminated_length": 1958.16, |
| "completions/mean_length": 876.6025, |
| "completions/mean_terminated_length": 734.1006811523438, |
| "completions/min_length": 209.52, |
| "completions/min_terminated_length": 209.52, |
| "entropy": 0.3978144943714142, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.34, |
| "grad_norm": 2.407379133784635, |
| "learning_rate": 9.602e-07, |
| "loss": 0.0184, |
| "num_tokens": 2959120.0, |
| "reward": 0.5975, |
| "reward_std": 0.4489552092552185, |
| "rewards/combined_reward/mean": 0.5975, |
| "rewards/combined_reward/std": 0.4489552354812622, |
| "sampling/importance_sampling_ratio/max": 1.9040312719345094, |
| "sampling/importance_sampling_ratio/mean": 0.5416174274682999, |
| "sampling/importance_sampling_ratio/min": 0.018405352871644087, |
| "sampling/sampling_logp_difference/max": 0.8134938716888428, |
| "sampling/sampling_logp_difference/mean": 0.025685913935303686, |
| "step": 200, |
| "step_time": 8.387271651169286 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.31896551724137934, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3501.3793103448274, |
| "eval_bbeh_mini_completions/mean_length": 2493.7543103448274, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1744.4683248585668, |
| "eval_bbeh_mini_completions/min_length": 402.0689655172414, |
| "eval_bbeh_mini_completions/min_terminated_length": 402.0689655172414, |
| "eval_bbeh_mini_entropy": 0.33658467079031057, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 2959120.0, |
| "eval_bbeh_mini_reward": 0.07327586206896551, |
| "eval_bbeh_mini_reward_std": 0.22547399586644665, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.07327586206896551, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.22547400408777699, |
| "eval_bbeh_mini_runtime": 485.7912, |
| "eval_bbeh_mini_samples_per_second": 0.947, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.4247941641971982, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.21224350610683704, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 1.9089912678375065e-05, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.4790474669686677, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.022729232650378656, |
| "eval_bbeh_mini_steps_per_second": 0.06, |
| "step": 200 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.045, |
| "completions/max_length": 2272.24, |
| "completions/max_terminated_length": 1942.2, |
| "completions/mean_length": 935.3925, |
| "completions/mean_terminated_length": 812.9290356445313, |
| "completions/min_length": 275.64, |
| "completions/min_terminated_length": 275.64, |
| "entropy": 0.4116176557540894, |
| "epoch": 0.045, |
| "frac_reward_zero_std": 0.42, |
| "grad_norm": 3.5037476412669593, |
| "learning_rate": 9.552e-07, |
| "loss": 0.0173, |
| "num_tokens": 3383261.0, |
| "reward": 0.6625, |
| "reward_std": 0.41886321067810056, |
| "rewards/combined_reward/mean": 0.6625, |
| "rewards/combined_reward/std": 0.41886322021484373, |
| "sampling/importance_sampling_ratio/max": 2.0795044040679933, |
| "sampling/importance_sampling_ratio/mean": 0.6006859976053238, |
| "sampling/importance_sampling_ratio/min": 0.014239588570781052, |
| "sampling/sampling_logp_difference/max": 0.7338527536392212, |
| "sampling/sampling_logp_difference/mean": 0.02595676988363266, |
| "step": 225, |
| "step_time": 8.487474374789745 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03, |
| "completions/max_length": 1640.28, |
| "completions/max_terminated_length": 1389.36, |
| "completions/mean_length": 653.0075, |
| "completions/mean_terminated_length": 565.9887109375, |
| "completions/min_length": 255.52, |
| "completions/min_terminated_length": 255.52, |
| "entropy": 0.4170915710926056, |
| "epoch": 0.05, |
| "frac_reward_zero_std": 0.42, |
| "grad_norm": 2.1638076114867757, |
| "learning_rate": 9.502e-07, |
| "loss": -0.0148, |
| "num_tokens": 3698864.0, |
| "reward": 0.705, |
| "reward_std": 0.38157126426696775, |
| "rewards/combined_reward/mean": 0.705, |
| "rewards/combined_reward/std": 0.38157127737998964, |
| "sampling/importance_sampling_ratio/max": 1.9969594597816467, |
| "sampling/importance_sampling_ratio/mean": 0.628093301653862, |
| "sampling/importance_sampling_ratio/min": 0.018454841256149202, |
| "sampling/sampling_logp_difference/max": 0.659376904964447, |
| "sampling/sampling_logp_difference/mean": 0.026758400201797487, |
| "step": 250, |
| "step_time": 6.111578326625749 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.055, |
| "completions/max_length": 2229.44, |
| "completions/max_terminated_length": 1948.52, |
| "completions/mean_length": 942.01, |
| "completions/mean_terminated_length": 788.1107543945312, |
| "completions/min_length": 249.4, |
| "completions/min_terminated_length": 249.4, |
| "entropy": 0.4110789692401886, |
| "epoch": 0.055, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 4.667525978474157, |
| "learning_rate": 9.452e-07, |
| "loss": -0.0242, |
| "num_tokens": 4129764.0, |
| "reward": 0.6575, |
| "reward_std": 0.4181215476989746, |
| "rewards/combined_reward/mean": 0.6575, |
| "rewards/combined_reward/std": 0.41812155723571776, |
| "sampling/importance_sampling_ratio/max": 1.8673054933547975, |
| "sampling/importance_sampling_ratio/mean": 0.5125843727588654, |
| "sampling/importance_sampling_ratio/min": 0.01608145761529954, |
| "sampling/sampling_logp_difference/max": 0.8164897656440735, |
| "sampling/sampling_logp_difference/mean": 0.02648505486547947, |
| "step": 275, |
| "step_time": 8.40903925454244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04, |
| "completions/max_length": 2057.4, |
| "completions/max_terminated_length": 1949.44, |
| "completions/mean_length": 919.8125, |
| "completions/mean_terminated_length": 816.5338427734375, |
| "completions/min_length": 261.0, |
| "completions/min_terminated_length": 261.0, |
| "entropy": 0.41194292187690734, |
| "epoch": 0.06, |
| "frac_reward_zero_std": 0.42, |
| "grad_norm": 1.1934433538781786, |
| "learning_rate": 9.402e-07, |
| "loss": 0.0465, |
| "num_tokens": 4550697.0, |
| "reward": 0.6275, |
| "reward_std": 0.417396194934845, |
| "rewards/combined_reward/mean": 0.6275, |
| "rewards/combined_reward/std": 0.41739620923995974, |
| "sampling/importance_sampling_ratio/max": 2.1660247945785525, |
| "sampling/importance_sampling_ratio/mean": 0.6122340059280396, |
| "sampling/importance_sampling_ratio/min": 0.014834024702245853, |
| "sampling/sampling_logp_difference/max": 0.8179536938667298, |
| "sampling/sampling_logp_difference/mean": 0.025995058193802833, |
| "step": 300, |
| "step_time": 7.779672881411389 |
| }, |
| { |
| "epoch": 0.06, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.36637931034482757, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3524.7241379310344, |
| "eval_bbeh_mini_completions/mean_length": 2685.9525862068967, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1877.7948924097523, |
| "eval_bbeh_mini_completions/min_length": 571.0689655172414, |
| "eval_bbeh_mini_completions/min_terminated_length": 571.0689655172414, |
| "eval_bbeh_mini_entropy": 0.35236285369971704, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 4550697.0, |
| "eval_bbeh_mini_reward": 0.10344827586206896, |
| "eval_bbeh_mini_reward_std": 0.2699657020897701, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.10344827586206896, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.26996571339409925, |
| "eval_bbeh_mini_runtime": 494.6737, |
| "eval_bbeh_mini_samples_per_second": 0.93, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.2562965689034298, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.1846869053511784, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 8.863040915959077e-06, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.5924758705599555, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.02370224494872422, |
| "eval_bbeh_mini_steps_per_second": 0.059, |
| "step": 300 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.07, |
| "completions/max_length": 2060.08, |
| "completions/max_terminated_length": 1839.84, |
| "completions/mean_length": 893.005, |
| "completions/mean_terminated_length": 669.4148620605469, |
| "completions/min_length": 229.84, |
| "completions/min_terminated_length": 229.84, |
| "entropy": 0.40538407385349273, |
| "epoch": 0.065, |
| "frac_reward_zero_std": 0.52, |
| "grad_norm": 5.036384229761231, |
| "learning_rate": 9.352e-07, |
| "loss": 0.0219, |
| "num_tokens": 4961307.0, |
| "reward": 0.61, |
| "reward_std": 0.39049545288085935, |
| "rewards/combined_reward/mean": 0.61, |
| "rewards/combined_reward/std": 0.3904954707622528, |
| "sampling/importance_sampling_ratio/max": 2.171120972633362, |
| "sampling/importance_sampling_ratio/mean": 0.6055274987220765, |
| "sampling/importance_sampling_ratio/min": 0.031129270781748347, |
| "sampling/sampling_logp_difference/max": 0.7485472440719605, |
| "sampling/sampling_logp_difference/mean": 0.02611954465508461, |
| "step": 325, |
| "step_time": 7.8838804703298955 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0075, |
| "completions/max_length": 1540.16, |
| "completions/max_terminated_length": 1477.4, |
| "completions/mean_length": 649.275, |
| "completions/mean_terminated_length": 629.781923828125, |
| "completions/min_length": 224.08, |
| "completions/min_terminated_length": 224.08, |
| "entropy": 0.44817891597747805, |
| "epoch": 0.07, |
| "frac_reward_zero_std": 0.4, |
| "grad_norm": 1.262847377601464, |
| "learning_rate": 9.302e-07, |
| "loss": -0.0023, |
| "num_tokens": 5272369.0, |
| "reward": 0.63, |
| "reward_std": 0.40423260688781737, |
| "rewards/combined_reward/mean": 0.63, |
| "rewards/combined_reward/std": 0.4042326259613037, |
| "sampling/importance_sampling_ratio/max": 1.9462181329727173, |
| "sampling/importance_sampling_ratio/mean": 0.5410465461015701, |
| "sampling/importance_sampling_ratio/min": 0.007918943986296653, |
| "sampling/sampling_logp_difference/max": 0.658315749168396, |
| "sampling/sampling_logp_difference/mean": 0.028008888587355615, |
| "step": 350, |
| "step_time": 5.602628886112943 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.055, |
| "completions/max_length": 1894.84, |
| "completions/max_terminated_length": 1623.76, |
| "completions/mean_length": 823.715, |
| "completions/mean_terminated_length": 638.96466796875, |
| "completions/min_length": 199.48, |
| "completions/min_terminated_length": 199.48, |
| "entropy": 0.4437406814098358, |
| "epoch": 0.075, |
| "frac_reward_zero_std": 0.42, |
| "grad_norm": 1.416585270355939, |
| "learning_rate": 9.251999999999999e-07, |
| "loss": -0.0063, |
| "num_tokens": 5651599.0, |
| "reward": 0.6375, |
| "reward_std": 0.4141005539894104, |
| "rewards/combined_reward/mean": 0.6375, |
| "rewards/combined_reward/std": 0.41410056471824647, |
| "sampling/importance_sampling_ratio/max": 2.079793190956116, |
| "sampling/importance_sampling_ratio/mean": 0.6110782504081727, |
| "sampling/importance_sampling_ratio/min": 0.012589475377462805, |
| "sampling/sampling_logp_difference/max": 0.7504757452011108, |
| "sampling/sampling_logp_difference/mean": 0.027451810911297798, |
| "step": 375, |
| "step_time": 7.127676211716607 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04, |
| "completions/max_length": 1728.44, |
| "completions/max_terminated_length": 1628.96, |
| "completions/mean_length": 761.1325, |
| "completions/mean_terminated_length": 657.4637036132813, |
| "completions/min_length": 202.8, |
| "completions/min_terminated_length": 202.8, |
| "entropy": 0.40143827855587005, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.0, |
| "learning_rate": 9.202e-07, |
| "loss": 0.0175, |
| "num_tokens": 6009092.0, |
| "reward": 0.5775, |
| "reward_std": 0.38390336275100706, |
| "rewards/combined_reward/mean": 0.5775, |
| "rewards/combined_reward/std": 0.38390338063240054, |
| "sampling/importance_sampling_ratio/max": 2.096400910615921, |
| "sampling/importance_sampling_ratio/mean": 0.6684997088462115, |
| "sampling/importance_sampling_ratio/min": 0.027380506457557203, |
| "sampling/sampling_logp_difference/max": 0.6964709210395813, |
| "sampling/sampling_logp_difference/mean": 0.02609425738453865, |
| "step": 400, |
| "step_time": 6.615593434758484 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.33620689655172414, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3580.5172413793102, |
| "eval_bbeh_mini_completions/mean_length": 2591.877155172414, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1825.3767721241918, |
| "eval_bbeh_mini_completions/min_length": 451.62068965517244, |
| "eval_bbeh_mini_completions/min_terminated_length": 451.62068965517244, |
| "eval_bbeh_mini_entropy": 0.3378912878447565, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 6009092.0, |
| "eval_bbeh_mini_reward": 0.10560344827586207, |
| "eval_bbeh_mini_reward_std": 0.25904289196277486, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.10560344827586207, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.2590429012117715, |
| "eval_bbeh_mini_runtime": 491.0126, |
| "eval_bbeh_mini_samples_per_second": 0.937, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.1502898427946815, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.16786166884261985, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 5.115552837594649e-05, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 2.436232163988311, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023024646565318108, |
| "eval_bbeh_mini_steps_per_second": 0.059, |
| "step": 400 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0725, |
| "completions/max_length": 2179.08, |
| "completions/max_terminated_length": 2003.44, |
| "completions/mean_length": 953.6125, |
| "completions/mean_terminated_length": 736.740048828125, |
| "completions/min_length": 218.08, |
| "completions/min_terminated_length": 218.08, |
| "entropy": 0.4012847465276718, |
| "epoch": 0.085, |
| "frac_reward_zero_std": 0.42, |
| "grad_norm": 0.5398184196341114, |
| "learning_rate": 9.151999999999999e-07, |
| "loss": -0.0237, |
| "num_tokens": 6441177.0, |
| "reward": 0.695, |
| "reward_std": 0.3829051518440247, |
| "rewards/combined_reward/mean": 0.695, |
| "rewards/combined_reward/std": 0.382905170917511, |
| "sampling/importance_sampling_ratio/max": 1.9372561621665954, |
| "sampling/importance_sampling_ratio/mean": 0.5673770725727081, |
| "sampling/importance_sampling_ratio/min": 0.03025458916346361, |
| "sampling/sampling_logp_difference/max": 0.7728560495376587, |
| "sampling/sampling_logp_difference/mean": 0.02587804526090622, |
| "step": 425, |
| "step_time": 8.324159880718216 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03, |
| "completions/max_length": 1513.88, |
| "completions/max_terminated_length": 1419.48, |
| "completions/mean_length": 615.395, |
| "completions/mean_terminated_length": 520.6183935546875, |
| "completions/min_length": 177.0, |
| "completions/min_terminated_length": 177.0, |
| "entropy": 0.41070492506027223, |
| "epoch": 0.09, |
| "frac_reward_zero_std": 0.52, |
| "grad_norm": 1.5270349642075316, |
| "learning_rate": 9.102e-07, |
| "loss": -0.0022, |
| "num_tokens": 6739631.0, |
| "reward": 0.735, |
| "reward_std": 0.33413492679595946, |
| "rewards/combined_reward/mean": 0.735, |
| "rewards/combined_reward/std": 0.3341349399089813, |
| "sampling/importance_sampling_ratio/max": 2.027468514442444, |
| "sampling/importance_sampling_ratio/mean": 0.6084394156932831, |
| "sampling/importance_sampling_ratio/min": 0.02255286922645837, |
| "sampling/sampling_logp_difference/max": 0.5962899017333985, |
| "sampling/sampling_logp_difference/mean": 0.02680632047355175, |
| "step": 450, |
| "step_time": 5.793452201336622 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1631.92, |
| "completions/max_terminated_length": 1465.8, |
| "completions/mean_length": 605.425, |
| "completions/mean_terminated_length": 531.262001953125, |
| "completions/min_length": 168.24, |
| "completions/min_terminated_length": 168.24, |
| "entropy": 0.4006887876987457, |
| "epoch": 0.095, |
| "frac_reward_zero_std": 0.5, |
| "grad_norm": 0.0, |
| "learning_rate": 9.051999999999999e-07, |
| "loss": 0.051, |
| "num_tokens": 7030409.0, |
| "reward": 0.685, |
| "reward_std": 0.3645173120498657, |
| "rewards/combined_reward/mean": 0.685, |
| "rewards/combined_reward/std": 0.3645173156261444, |
| "sampling/importance_sampling_ratio/max": 1.9704220461845399, |
| "sampling/importance_sampling_ratio/mean": 0.62828111410141, |
| "sampling/importance_sampling_ratio/min": 0.012673925184790278, |
| "sampling/sampling_logp_difference/max": 0.7063542747497559, |
| "sampling/sampling_logp_difference/mean": 0.026762503162026406, |
| "step": 475, |
| "step_time": 6.15127008873038 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0275, |
| "completions/max_length": 1725.32, |
| "completions/max_terminated_length": 1643.52, |
| "completions/mean_length": 700.63, |
| "completions/mean_terminated_length": 630.799814453125, |
| "completions/min_length": 195.16, |
| "completions/min_terminated_length": 195.16, |
| "entropy": 0.40278098702430726, |
| "epoch": 0.1, |
| "frac_reward_zero_std": 0.46, |
| "grad_norm": 0.0, |
| "learning_rate": 9.002e-07, |
| "loss": 0.0015, |
| "num_tokens": 7360269.0, |
| "reward": 0.7075, |
| "reward_std": 0.32035229206085203, |
| "rewards/combined_reward/mean": 0.7075, |
| "rewards/combined_reward/std": 0.3203523027896881, |
| "sampling/importance_sampling_ratio/max": 1.9510860872268676, |
| "sampling/importance_sampling_ratio/mean": 0.5961837387084961, |
| "sampling/importance_sampling_ratio/min": 0.02190992054884191, |
| "sampling/sampling_logp_difference/max": 0.6417275023460388, |
| "sampling/sampling_logp_difference/mean": 0.026219148635864258, |
| "step": 500, |
| "step_time": 6.48362862716429 |
| }, |
| { |
| "epoch": 0.1, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.3017241379310345, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3449.0689655172414, |
| "eval_bbeh_mini_completions/mean_length": 2359.5905172413795, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1620.2735132677801, |
| "eval_bbeh_mini_completions/min_length": 336.0689655172414, |
| "eval_bbeh_mini_completions/min_terminated_length": 336.0689655172414, |
| "eval_bbeh_mini_entropy": 0.338118112292783, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 7360269.0, |
| "eval_bbeh_mini_reward": 0.10344827586206896, |
| "eval_bbeh_mini_reward_std": 0.27482735083020965, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.10344827586206896, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.27482736316220513, |
| "eval_bbeh_mini_runtime": 478.438, |
| "eval_bbeh_mini_samples_per_second": 0.961, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.6468630525572547, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.2634151639609501, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 2.0942759789977906e-05, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.216358838410213, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023251901348603183, |
| "eval_bbeh_mini_steps_per_second": 0.061, |
| "step": 500 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03, |
| "completions/max_length": 1828.68, |
| "completions/max_terminated_length": 1601.08, |
| "completions/mean_length": 683.7675, |
| "completions/mean_terminated_length": 593.3429418945312, |
| "completions/min_length": 211.0, |
| "completions/min_terminated_length": 211.0, |
| "entropy": 0.4082434940338135, |
| "epoch": 0.105, |
| "frac_reward_zero_std": 0.44, |
| "grad_norm": 0.0, |
| "learning_rate": 8.951999999999999e-07, |
| "loss": 0.0051, |
| "num_tokens": 7685824.0, |
| "reward": 0.6175, |
| "reward_std": 0.4563824462890625, |
| "rewards/combined_reward/mean": 0.6175, |
| "rewards/combined_reward/std": 0.4563824665546417, |
| "sampling/importance_sampling_ratio/max": 2.16779132604599, |
| "sampling/importance_sampling_ratio/mean": 0.6069332575798034, |
| "sampling/importance_sampling_ratio/min": 0.019685860924364532, |
| "sampling/sampling_logp_difference/max": 0.6934374618530273, |
| "sampling/sampling_logp_difference/mean": 0.026688539236783982, |
| "step": 525, |
| "step_time": 6.72239572064951 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03, |
| "completions/max_length": 1910.32, |
| "completions/max_terminated_length": 1841.88, |
| "completions/mean_length": 776.53, |
| "completions/mean_terminated_length": 694.096181640625, |
| "completions/min_length": 208.48, |
| "completions/min_terminated_length": 208.48, |
| "entropy": 0.4136668372154236, |
| "epoch": 0.11, |
| "frac_reward_zero_std": 0.52, |
| "grad_norm": 1.4963386849443154, |
| "learning_rate": 8.902e-07, |
| "loss": 0.0071, |
| "num_tokens": 8047172.0, |
| "reward": 0.705, |
| "reward_std": 0.37073400259017947, |
| "rewards/combined_reward/mean": 0.705, |
| "rewards/combined_reward/std": 0.3707340121269226, |
| "sampling/importance_sampling_ratio/max": 2.074637842178345, |
| "sampling/importance_sampling_ratio/mean": 0.5935343915224075, |
| "sampling/importance_sampling_ratio/min": 0.033500756603752964, |
| "sampling/sampling_logp_difference/max": 0.6976750469207764, |
| "sampling/sampling_logp_difference/mean": 0.02682505249977112, |
| "step": 550, |
| "step_time": 7.161583522455767 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04, |
| "completions/max_length": 2021.56, |
| "completions/max_terminated_length": 1834.32, |
| "completions/mean_length": 893.8875, |
| "completions/mean_terminated_length": 778.9953515625, |
| "completions/min_length": 246.56, |
| "completions/min_terminated_length": 246.56, |
| "entropy": 0.38500270128250125, |
| "epoch": 0.115, |
| "frac_reward_zero_std": 0.3, |
| "grad_norm": 0.0, |
| "learning_rate": 8.851999999999999e-07, |
| "loss": 0.0164, |
| "num_tokens": 8457615.0, |
| "reward": 0.555, |
| "reward_std": 0.4233776617050171, |
| "rewards/combined_reward/mean": 0.555, |
| "rewards/combined_reward/std": 0.42337767004966737, |
| "sampling/importance_sampling_ratio/max": 2.0310398817062376, |
| "sampling/importance_sampling_ratio/mean": 0.5933060163259506, |
| "sampling/importance_sampling_ratio/min": 0.04317198476808926, |
| "sampling/sampling_logp_difference/max": 0.821718487739563, |
| "sampling/sampling_logp_difference/mean": 0.02523998260498047, |
| "step": 575, |
| "step_time": 7.77931922157295 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 1184.28, |
| "completions/max_terminated_length": 1105.92, |
| "completions/mean_length": 524.98, |
| "completions/mean_terminated_length": 489.793486328125, |
| "completions/min_length": 213.0, |
| "completions/min_terminated_length": 213.0, |
| "entropy": 0.3971246266365051, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.46, |
| "grad_norm": 0.0, |
| "learning_rate": 8.802e-07, |
| "loss": -0.0002, |
| "num_tokens": 8715415.0, |
| "reward": 0.705, |
| "reward_std": 0.376721715927124, |
| "rewards/combined_reward/mean": 0.705, |
| "rewards/combined_reward/std": 0.3767217338085175, |
| "sampling/importance_sampling_ratio/max": 2.0127450704574583, |
| "sampling/importance_sampling_ratio/mean": 0.6475339376926422, |
| "sampling/importance_sampling_ratio/min": 0.04534088842570782, |
| "sampling/sampling_logp_difference/max": 0.7423757553100586, |
| "sampling/sampling_logp_difference/mean": 0.025974995717406274, |
| "step": 600, |
| "step_time": 4.796778986351565 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_bbeh_mini_clip_ratio/high_max": 0.0, |
| "eval_bbeh_mini_clip_ratio/high_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_mean": 0.0, |
| "eval_bbeh_mini_clip_ratio/low_min": 0.0, |
| "eval_bbeh_mini_clip_ratio/region_mean": 0.0, |
| "eval_bbeh_mini_completions/clipped_ratio": 0.2650862068965517, |
| "eval_bbeh_mini_completions/max_length": 4096.0, |
| "eval_bbeh_mini_completions/max_terminated_length": 3541.310344827586, |
| "eval_bbeh_mini_completions/mean_length": 2310.0086206896553, |
| "eval_bbeh_mini_completions/mean_terminated_length": 1660.6524237271012, |
| "eval_bbeh_mini_completions/min_length": 390.48275862068965, |
| "eval_bbeh_mini_completions/min_terminated_length": 390.48275862068965, |
| "eval_bbeh_mini_entropy": 0.33623800195496656, |
| "eval_bbeh_mini_frac_reward_zero_std": 1.0, |
| "eval_bbeh_mini_loss": 0.0, |
| "eval_bbeh_mini_num_tokens": 8715415.0, |
| "eval_bbeh_mini_reward": 0.10775862068965517, |
| "eval_bbeh_mini_reward_std": 0.27093698238504343, |
| "eval_bbeh_mini_rewards/combined_reward/mean": 0.10775862068965517, |
| "eval_bbeh_mini_rewards/combined_reward/std": 0.27093699163404006, |
| "eval_bbeh_mini_runtime": 475.4716, |
| "eval_bbeh_mini_samples_per_second": 0.967, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/max": 1.5325407144324532, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/mean": 0.24085120471387073, |
| "eval_bbeh_mini_sampling/importance_sampling_ratio/min": 5.8291943258501656e-05, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/max": 3.5541970894254487, |
| "eval_bbeh_mini_sampling/sampling_logp_difference/mean": 0.023053794135821277, |
| "eval_bbeh_mini_steps_per_second": 0.061, |
| "step": 600 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 5000, |
| "num_input_tokens_seen": 8715415, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|