Liars_dice_final_memek / trainer_state.json
Gege24's picture
Upload task output 1
ce44c8c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00225,
"eval_steps": 500,
"global_step": 225,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 839.0,
"completions/max_terminated_length": 839.0,
"completions/mean_length": 265.75,
"completions/mean_terminated_length": 265.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.740012645721436,
"epoch": 1e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019221410155296326,
"kl": 0.0,
"learning_rate": 0.0,
"loss": -0.0006,
"num_tokens": 45751.0,
"reward": 0.816877007484436,
"reward_std": 1.4014036655426025,
"rewards/rollout_reward_func/mean": 0.816877007484436,
"rewards/rollout_reward_func/std": 1.6075319051742554,
"sampling/importance_sampling_ratio/max": 0.03914691507816315,
"sampling/importance_sampling_ratio/mean": 0.013615390285849571,
"sampling/importance_sampling_ratio/min": 1.1552421904970122e-15,
"sampling/sampling_logp_difference/max": 3.914313554763794,
"sampling/sampling_logp_difference/mean": 1.6371219158172607,
"step": 1,
"step_time": 9.72409536699979
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.740012645721436,
"epoch": 2e-05,
"grad_norm": 0.01974678784608841,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": -0.0006,
"step": 2,
"step_time": 4.797613267999623
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 760.0,
"completions/max_terminated_length": 760.0,
"completions/mean_length": 412.5625,
"completions/mean_terminated_length": 412.5625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.820557475090027,
"epoch": 3e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011553559452295303,
"kl": 0.0009907482417474966,
"learning_rate": 5.714285714285715e-07,
"loss": 0.0001,
"num_tokens": 99312.0,
"reward": 2.8992574214935303,
"reward_std": 1.8266513347625732,
"rewards/rollout_reward_func/mean": 2.8992574214935303,
"rewards/rollout_reward_func/std": 1.9147884845733643,
"sampling/importance_sampling_ratio/max": 0.0381130687892437,
"sampling/importance_sampling_ratio/mean": 0.009180868044495583,
"sampling/importance_sampling_ratio/min": 9.134832647250679e-12,
"sampling/sampling_logp_difference/max": 3.4724807739257812,
"sampling/sampling_logp_difference/mean": 1.698885440826416,
"step": 3,
"step_time": 8.947379413000363
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.818018198013306,
"epoch": 4e-05,
"grad_norm": 0.011284240521490574,
"kl": 0.0009902061865432188,
"learning_rate": 8.571428571428572e-07,
"loss": 0.0001,
"step": 4,
"step_time": 5.229739129000336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 571.0,
"completions/max_terminated_length": 571.0,
"completions/mean_length": 165.09375,
"completions/mean_terminated_length": 165.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.794174313545227,
"epoch": 5e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.028869854286313057,
"kl": 0.0008135313764796592,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0004,
"num_tokens": 141029.0,
"reward": 1.8564525842666626,
"reward_std": 2.077150344848633,
"rewards/rollout_reward_func/mean": 1.8564525842666626,
"rewards/rollout_reward_func/std": 2.0850281715393066,
"sampling/importance_sampling_ratio/max": 0.05348784476518631,
"sampling/importance_sampling_ratio/mean": 0.017640406265854836,
"sampling/importance_sampling_ratio/min": 0.00042824808042496443,
"sampling/sampling_logp_difference/max": 2.332674980163574,
"sampling/sampling_logp_difference/mean": 1.7570207118988037,
"step": 5,
"step_time": 8.192974794999373
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.788055181503296,
"epoch": 6e-05,
"grad_norm": 0.029661983251571655,
"kl": 0.0007373044900305104,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0005,
"step": 6,
"step_time": 4.266827427000862
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 541.6875,
"completions/mean_terminated_length": 541.6875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.803173184394836,
"epoch": 7e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004336625337600708,
"kl": 0.0010216275259153917,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0002,
"num_tokens": 199637.0,
"reward": 1.30655038356781,
"reward_std": 0.9892024993896484,
"rewards/rollout_reward_func/mean": 1.30655038356781,
"rewards/rollout_reward_func/std": 1.138155221939087,
"sampling/importance_sampling_ratio/max": 0.028351690620183945,
"sampling/importance_sampling_ratio/mean": 0.005052408203482628,
"sampling/importance_sampling_ratio/min": 2.671416343005633e-15,
"sampling/sampling_logp_difference/max": 4.701449394226074,
"sampling/sampling_logp_difference/mean": 1.7298243045806885,
"step": 7,
"step_time": 9.286785637000321
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.789715051651001,
"epoch": 8e-05,
"grad_norm": 0.004228756297379732,
"kl": 0.000888259346538689,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0002,
"step": 8,
"step_time": 4.969429294999827
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0026041667442768812,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0026041667442768812,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 686.0,
"completions/max_terminated_length": 686.0,
"completions/mean_length": 286.75,
"completions/mean_terminated_length": 295.4838562011719,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.3694349527359,
"epoch": 9e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010730103589594364,
"kl": 0.0009608958062017336,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0,
"num_tokens": 246380.0,
"reward": 2.1979100704193115,
"reward_std": 1.8867942094802856,
"rewards/rollout_reward_func/mean": 2.1979100704193115,
"rewards/rollout_reward_func/std": 2.1932425498962402,
"sampling/importance_sampling_ratio/max": 0.03839043155312538,
"sampling/importance_sampling_ratio/mean": 0.012242316268384457,
"sampling/importance_sampling_ratio/min": 3.078865162819966e-08,
"sampling/sampling_logp_difference/max": 3.9608242511749268,
"sampling/sampling_logp_difference/mean": 1.4708642959594727,
"step": 9,
"step_time": 8.68651038400003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.358309984207153,
"epoch": 0.0001,
"grad_norm": 0.0108121233060956,
"kl": 0.000685311508277664,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0,
"step": 10,
"step_time": 5.445634689999224
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 553.90625,
"completions/mean_terminated_length": 553.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.565176486968994,
"epoch": 0.00011,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0031674716155976057,
"kl": 0.0009231339645339176,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0001,
"num_tokens": 306137.0,
"reward": 1.9738842248916626,
"reward_std": 1.5114688873291016,
"rewards/rollout_reward_func/mean": 1.9738842248916626,
"rewards/rollout_reward_func/std": 1.8342463970184326,
"sampling/importance_sampling_ratio/max": 0.02065931260585785,
"sampling/importance_sampling_ratio/mean": 0.0028495141305029392,
"sampling/importance_sampling_ratio/min": 5.273884899763661e-19,
"sampling/sampling_logp_difference/max": 3.751443862915039,
"sampling/sampling_logp_difference/mean": 1.6439871788024902,
"step": 11,
"step_time": 9.134554064999975
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 8.561085760593414,
"epoch": 0.00012,
"grad_norm": 0.0026493030600249767,
"kl": 0.0009400276176165789,
"learning_rate": 3.142857142857143e-06,
"loss": 0.0001,
"step": 12,
"step_time": 4.8191932119998455
},
{
"clip_ratio/high_max": 0.010416666977107525,
"clip_ratio/high_mean": 0.0052083334885537624,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0052083334885537624,
"completions/clipped_ratio": 0.0,
"completions/max_length": 571.0,
"completions/max_terminated_length": 571.0,
"completions/mean_length": 162.4375,
"completions/mean_terminated_length": 162.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.512920141220093,
"epoch": 0.00013,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013727321289479733,
"kl": 0.0018250496359542012,
"learning_rate": 3.428571428571429e-06,
"loss": -0.0004,
"num_tokens": 349131.0,
"reward": 1.47185218334198,
"reward_std": 1.5472846031188965,
"rewards/rollout_reward_func/mean": 1.47185218334198,
"rewards/rollout_reward_func/std": 2.0390946865081787,
"sampling/importance_sampling_ratio/max": 0.049390941858291626,
"sampling/importance_sampling_ratio/mean": 0.020272064954042435,
"sampling/importance_sampling_ratio/min": 2.633779558891547e-06,
"sampling/sampling_logp_difference/max": 2.360596179962158,
"sampling/sampling_logp_difference/mean": 1.5058674812316895,
"step": 13,
"step_time": 7.890245483999934
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"entropy": 8.52327024936676,
"epoch": 0.00014,
"grad_norm": 0.01371886394917965,
"kl": 0.002251528945635073,
"learning_rate": 3.7142857142857146e-06,
"loss": -0.0004,
"step": 14,
"step_time": 4.277807705000669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 493.03125,
"completions/mean_terminated_length": 508.4193420410156,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.769269943237305,
"epoch": 0.00015,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.004188260994851589,
"kl": 0.0018109382945112884,
"learning_rate": 4.000000000000001e-06,
"loss": -0.0001,
"num_tokens": 405372.0,
"reward": 2.176278591156006,
"reward_std": 1.8223553895950317,
"rewards/rollout_reward_func/mean": 2.176278591156006,
"rewards/rollout_reward_func/std": 1.8436557054519653,
"sampling/importance_sampling_ratio/max": 0.021065089851617813,
"sampling/importance_sampling_ratio/mean": 0.0036145278718322515,
"sampling/importance_sampling_ratio/min": 4.2438622060991804e-13,
"sampling/sampling_logp_difference/max": 3.6341652870178223,
"sampling/sampling_logp_difference/mean": 1.674858808517456,
"step": 15,
"step_time": 9.12190689900035
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.767740607261658,
"epoch": 0.00016,
"grad_norm": 0.004105101805180311,
"kl": 0.002771631450741552,
"learning_rate": 4.2857142857142855e-06,
"loss": -0.0001,
"step": 16,
"step_time": 5.4121323870003835
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 733.0,
"completions/max_terminated_length": 733.0,
"completions/mean_length": 420.0,
"completions/mean_terminated_length": 421.7241516113281,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.60806268453598,
"epoch": 0.00017,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009108365513384342,
"kl": 0.004436066417838447,
"learning_rate": 4.571428571428572e-06,
"loss": 0.0001,
"num_tokens": 458204.0,
"reward": 2.0131897926330566,
"reward_std": 1.7921838760375977,
"rewards/rollout_reward_func/mean": 2.0131897926330566,
"rewards/rollout_reward_func/std": 1.917612910270691,
"sampling/importance_sampling_ratio/max": 0.03794016316533089,
"sampling/importance_sampling_ratio/mean": 0.00851379707455635,
"sampling/importance_sampling_ratio/min": 5.970022844210435e-30,
"sampling/sampling_logp_difference/max": 3.762781858444214,
"sampling/sampling_logp_difference/mean": 1.7653887271881104,
"step": 17,
"step_time": 8.833719273000042
},
{
"clip_ratio/high_max": 0.0034722222480922937,
"clip_ratio/high_mean": 0.0017361111240461469,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0017361111240461469,
"entropy": 8.605763673782349,
"epoch": 0.00018,
"grad_norm": 0.009003642946481705,
"kl": 0.004826090880669653,
"learning_rate": 4.857142857142858e-06,
"loss": 0.0001,
"step": 18,
"step_time": 4.669556054000623
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 356.1875,
"completions/mean_terminated_length": 356.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.589665651321411,
"epoch": 0.00019,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012068537063896656,
"kl": 0.007034957525320351,
"learning_rate": 5.142857142857142e-06,
"loss": 0.0003,
"num_tokens": 510714.0,
"reward": 1.6357378959655762,
"reward_std": 1.9768089056015015,
"rewards/rollout_reward_func/mean": 1.6357378959655762,
"rewards/rollout_reward_func/std": 1.9230132102966309,
"sampling/importance_sampling_ratio/max": 0.0377332866191864,
"sampling/importance_sampling_ratio/mean": 0.00977294985204935,
"sampling/importance_sampling_ratio/min": 1.5810989850706392e-08,
"sampling/sampling_logp_difference/max": 2.8323609828948975,
"sampling/sampling_logp_difference/mean": 1.57611083984375,
"step": 19,
"step_time": 8.702261807000923
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 8.588200807571411,
"epoch": 0.0002,
"grad_norm": 0.01012762077152729,
"kl": 0.008162530430126935,
"learning_rate": 5.428571428571429e-06,
"loss": 0.0003,
"step": 20,
"step_time": 4.793898748000174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 725.0,
"completions/max_terminated_length": 676.0,
"completions/mean_length": 169.8125,
"completions/mean_terminated_length": 156.43333435058594,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.763131499290466,
"epoch": 0.00021,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014919566921889782,
"kl": 0.021441200777189806,
"learning_rate": 5.7142857142857145e-06,
"loss": -0.0005,
"num_tokens": 554259.0,
"reward": 2.2263033390045166,
"reward_std": 1.681884765625,
"rewards/rollout_reward_func/mean": 2.2263033390045166,
"rewards/rollout_reward_func/std": 1.8072566986083984,
"sampling/importance_sampling_ratio/max": 0.08593336492776871,
"sampling/importance_sampling_ratio/mean": 0.022409576922655106,
"sampling/importance_sampling_ratio/min": 2.349876534956627e-22,
"sampling/sampling_logp_difference/max": 4.372560501098633,
"sampling/sampling_logp_difference/mean": 1.931687831878662,
"step": 21,
"step_time": 8.055510063999009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 8.737543225288391,
"epoch": 0.00022,
"grad_norm": 0.014876801520586014,
"kl": 0.029705224180361256,
"learning_rate": 6e-06,
"loss": -0.0005,
"step": 22,
"step_time": 5.419155312999919
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 758.0,
"completions/max_terminated_length": 758.0,
"completions/mean_length": 182.1875,
"completions/mean_terminated_length": 171.50001525878906,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.859729766845703,
"epoch": 0.00023,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.029221149161458015,
"kl": 0.0323223132872954,
"learning_rate": 6.285714285714286e-06,
"loss": -0.0011,
"num_tokens": 596571.0,
"reward": 1.5583102703094482,
"reward_std": 1.6529381275177002,
"rewards/rollout_reward_func/mean": 1.5583102703094482,
"rewards/rollout_reward_func/std": 1.7341761589050293,
"sampling/importance_sampling_ratio/max": 0.06399935483932495,
"sampling/importance_sampling_ratio/mean": 0.025177521631121635,
"sampling/importance_sampling_ratio/min": 1.0080106696608216e-18,
"sampling/sampling_logp_difference/max": 3.969541072845459,
"sampling/sampling_logp_difference/mean": 1.8030247688293457,
"step": 23,
"step_time": 8.169906658999935
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 8.80251955986023,
"epoch": 0.00024,
"grad_norm": 0.029200905933976173,
"kl": 0.04814133094623685,
"learning_rate": 6.571428571428572e-06,
"loss": -0.0012,
"step": 24,
"step_time": 4.54475868199961
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 517.65625,
"completions/mean_terminated_length": 530.7333374023438,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.177129030227661,
"epoch": 0.00025,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.008773105219006538,
"kl": 0.03207368147559464,
"learning_rate": 6.857142857142858e-06,
"loss": -0.0007,
"num_tokens": 653422.0,
"reward": 2.0930328369140625,
"reward_std": 1.470797061920166,
"rewards/rollout_reward_func/mean": 2.0930328369140625,
"rewards/rollout_reward_func/std": 1.5851061344146729,
"sampling/importance_sampling_ratio/max": 0.08151775598526001,
"sampling/importance_sampling_ratio/mean": 0.01129196584224701,
"sampling/importance_sampling_ratio/min": 2.138438081125682e-13,
"sampling/sampling_logp_difference/max": 3.773285388946533,
"sampling/sampling_logp_difference/mean": 1.4078741073608398,
"step": 25,
"step_time": 9.088412857000094
},
{
"clip_ratio/high_max": 0.043181818444281816,
"clip_ratio/high_mean": 0.021590909222140908,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.021590909222140908,
"entropy": 8.089222967624664,
"epoch": 0.00026,
"grad_norm": 0.008307461626827717,
"kl": 0.04364914959296584,
"learning_rate": 7.1428571428571436e-06,
"loss": -0.0007,
"step": 26,
"step_time": 4.877906865999648
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 821.0,
"completions/max_terminated_length": 821.0,
"completions/mean_length": 309.9375,
"completions/mean_terminated_length": 309.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.212523341178894,
"epoch": 0.00027,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014188895933330059,
"kl": 0.0853999936953187,
"learning_rate": 7.428571428571429e-06,
"loss": -0.001,
"num_tokens": 702104.0,
"reward": 1.7692888975143433,
"reward_std": 1.6422841548919678,
"rewards/rollout_reward_func/mean": 1.7692888975143433,
"rewards/rollout_reward_func/std": 1.989976406097412,
"sampling/importance_sampling_ratio/max": 0.11609657108783722,
"sampling/importance_sampling_ratio/mean": 0.027356663718819618,
"sampling/importance_sampling_ratio/min": 9.024407518154476e-06,
"sampling/sampling_logp_difference/max": 2.4522972106933594,
"sampling/sampling_logp_difference/mean": 1.4562647342681885,
"step": 27,
"step_time": 9.140366876000371
},
{
"clip_ratio/high_max": 0.12500000186264515,
"clip_ratio/high_mean": 0.06250000093132257,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.06250000093132257,
"entropy": 8.088905096054077,
"epoch": 0.00028,
"grad_norm": 0.014231563545763493,
"kl": 0.10766742378473282,
"learning_rate": 7.714285714285716e-06,
"loss": -0.0011,
"step": 28,
"step_time": 5.39836863499977
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 254.8125,
"completions/mean_terminated_length": 262.51611328125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.048985123634338,
"epoch": 0.00029,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015394963324069977,
"kl": 0.1432503336109221,
"learning_rate": 8.000000000000001e-06,
"loss": -0.001,
"num_tokens": 749391.0,
"reward": 1.818474292755127,
"reward_std": 1.1649471521377563,
"rewards/rollout_reward_func/mean": 1.818474292755127,
"rewards/rollout_reward_func/std": 1.8718231916427612,
"sampling/importance_sampling_ratio/max": 0.130377858877182,
"sampling/importance_sampling_ratio/mean": 0.036613546311855316,
"sampling/importance_sampling_ratio/min": 1.2765659362923287e-10,
"sampling/sampling_logp_difference/max": 3.609269380569458,
"sampling/sampling_logp_difference/mean": 1.4590439796447754,
"step": 29,
"step_time": 8.3333206719999
},
{
"clip_ratio/high_max": 0.0416666679084301,
"clip_ratio/high_mean": 0.02083333395421505,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.031250000931322575,
"entropy": 7.883777499198914,
"epoch": 0.0003,
"grad_norm": 0.01582338474690914,
"kl": 0.17406905256211758,
"learning_rate": 8.285714285714287e-06,
"loss": -0.0011,
"step": 30,
"step_time": 4.778778166000393
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 541.90625,
"completions/mean_terminated_length": 558.8709716796875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 8.091109991073608,
"epoch": 0.00031,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021641016006469727,
"kl": 0.12537556886672974,
"learning_rate": 8.571428571428571e-06,
"loss": 0.0003,
"num_tokens": 807902.0,
"reward": 1.436830997467041,
"reward_std": 1.0506994724273682,
"rewards/rollout_reward_func/mean": 1.436830997467041,
"rewards/rollout_reward_func/std": 1.2816261053085327,
"sampling/importance_sampling_ratio/max": 0.07519304007291794,
"sampling/importance_sampling_ratio/mean": 0.013723745942115784,
"sampling/importance_sampling_ratio/min": 1.7366062713998758e-16,
"sampling/sampling_logp_difference/max": 5.218207359313965,
"sampling/sampling_logp_difference/mean": 1.5592684745788574,
"step": 31,
"step_time": 9.165752515999884
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 8.013701796531677,
"epoch": 0.00032,
"grad_norm": 0.021766290068626404,
"kl": 0.1357055138796568,
"learning_rate": 8.857142857142858e-06,
"loss": 0.0003,
"step": 32,
"step_time": 4.949108714999966
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 663.0,
"completions/max_terminated_length": 662.0,
"completions/mean_length": 355.46875,
"completions/mean_terminated_length": 337.3000183105469,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.517293989658356,
"epoch": 0.00033,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.017694007605314255,
"kl": 0.20308297593146563,
"learning_rate": 9.142857142857144e-06,
"loss": -0.0019,
"num_tokens": 859075.0,
"reward": 2.665842056274414,
"reward_std": 2.0510294437408447,
"rewards/rollout_reward_func/mean": 2.665842056274414,
"rewards/rollout_reward_func/std": 2.058197498321533,
"sampling/importance_sampling_ratio/max": 0.1805843561887741,
"sampling/importance_sampling_ratio/mean": 0.033763326704502106,
"sampling/importance_sampling_ratio/min": 9.382639013877456e-15,
"sampling/sampling_logp_difference/max": 4.034926891326904,
"sampling/sampling_logp_difference/mean": 1.42521071434021,
"step": 33,
"step_time": 9.64415260199985
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 7.461727142333984,
"epoch": 0.00034,
"grad_norm": 0.01653335802257061,
"kl": 0.21375709865242243,
"learning_rate": 9.42857142857143e-06,
"loss": -0.002,
"step": 34,
"step_time": 4.6672892820001834
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 287.5,
"completions/mean_terminated_length": 272.1612854003906,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.252508640289307,
"epoch": 0.00035,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.04618426784873009,
"kl": 0.294549023732543,
"learning_rate": 9.714285714285715e-06,
"loss": -0.001,
"num_tokens": 908468.0,
"reward": 1.9603195190429688,
"reward_std": 1.7872586250305176,
"rewards/rollout_reward_func/mean": 1.9603195190429688,
"rewards/rollout_reward_func/std": 1.841855764389038,
"sampling/importance_sampling_ratio/max": 0.19450248777866364,
"sampling/importance_sampling_ratio/mean": 0.0430486798286438,
"sampling/importance_sampling_ratio/min": 1.1371217567557323e-07,
"sampling/sampling_logp_difference/max": 3.5069642066955566,
"sampling/sampling_logp_difference/mean": 1.187075138092041,
"step": 35,
"step_time": 8.733247314000437
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.026041666977107525,
"entropy": 7.227072596549988,
"epoch": 0.00036,
"grad_norm": 0.0316547267138958,
"kl": 0.26715745590627193,
"learning_rate": 1e-05,
"loss": -0.0011,
"step": 36,
"step_time": 4.828337421000015
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014756944496184587,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 390.0,
"completions/mean_terminated_length": 402.06451416015625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.85473507642746,
"epoch": 0.00037,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01456042192876339,
"kl": 0.20423047989606857,
"learning_rate": 9.999999999884322e-06,
"loss": -0.0029,
"num_tokens": 961096.0,
"reward": 3.3347973823547363,
"reward_std": 1.635354995727539,
"rewards/rollout_reward_func/mean": 3.3347973823547363,
"rewards/rollout_reward_func/std": 1.591873288154602,
"sampling/importance_sampling_ratio/max": 0.2037617266178131,
"sampling/importance_sampling_ratio/mean": 0.04216703772544861,
"sampling/importance_sampling_ratio/min": 3.60183348667997e-18,
"sampling/sampling_logp_difference/max": 4.788333415985107,
"sampling/sampling_logp_difference/mean": 1.3821991682052612,
"step": 37,
"step_time": 8.829268903999946
},
{
"clip_ratio/high_max": 0.013888888992369175,
"clip_ratio/high_mean": 0.0069444444961845875,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.014756944496184587,
"entropy": 6.8267329931259155,
"epoch": 0.00038,
"grad_norm": 0.012915832921862602,
"kl": 0.2027184907346964,
"learning_rate": 9.999999999537282e-06,
"loss": -0.003,
"step": 38,
"step_time": 5.279609900000196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 292.125,
"completions/mean_terminated_length": 291.9666748046875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.531160056591034,
"epoch": 0.00039,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015933845192193985,
"kl": 0.2402525246143341,
"learning_rate": 9.999999998958884e-06,
"loss": -0.0013,
"num_tokens": 1010524.0,
"reward": 1.7904164791107178,
"reward_std": 1.6788225173950195,
"rewards/rollout_reward_func/mean": 1.7904164791107178,
"rewards/rollout_reward_func/std": 1.8581712245941162,
"sampling/importance_sampling_ratio/max": 0.042524565011262894,
"sampling/importance_sampling_ratio/mean": 0.021317776292562485,
"sampling/importance_sampling_ratio/min": 5.162857177539051e-24,
"sampling/sampling_logp_difference/max": 12.552388191223145,
"sampling/sampling_logp_difference/mean": 1.622736930847168,
"step": 39,
"step_time": 9.170014825000635
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.470682978630066,
"epoch": 0.0004,
"grad_norm": 0.014897222630679607,
"kl": 0.2379161100834608,
"learning_rate": 9.999999998149125e-06,
"loss": -0.0013,
"step": 40,
"step_time": 4.626162753000699
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 785.0,
"completions/max_terminated_length": 785.0,
"completions/mean_length": 588.96875,
"completions/mean_terminated_length": 594.258056640625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.788816154003143,
"epoch": 0.00041,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.020633654668927193,
"kl": 0.17094908049330115,
"learning_rate": 9.99999999710801e-06,
"loss": -0.0025,
"num_tokens": 1070581.0,
"reward": 2.306253433227539,
"reward_std": 1.3609917163848877,
"rewards/rollout_reward_func/mean": 2.306253433227539,
"rewards/rollout_reward_func/std": 1.8414863348007202,
"sampling/importance_sampling_ratio/max": 0.05647118017077446,
"sampling/importance_sampling_ratio/mean": 0.019686147570610046,
"sampling/importance_sampling_ratio/min": 8.113022520378068e-17,
"sampling/sampling_logp_difference/max": 5.106669902801514,
"sampling/sampling_logp_difference/mean": 1.3194385766983032,
"step": 41,
"step_time": 9.059421735999877
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 6.697398841381073,
"epoch": 0.00042,
"grad_norm": 0.012766940519213676,
"kl": 0.16542547149583697,
"learning_rate": 9.999999995835533e-06,
"loss": -0.0026,
"step": 42,
"step_time": 4.873758401001396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 731.0,
"completions/max_terminated_length": 731.0,
"completions/mean_length": 368.4375,
"completions/mean_terminated_length": 368.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.104367315769196,
"epoch": 0.00043,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014938808046281338,
"kl": 0.23283970914781094,
"learning_rate": 9.999999994331697e-06,
"loss": -0.0029,
"num_tokens": 1121352.0,
"reward": 2.358142614364624,
"reward_std": 1.2651467323303223,
"rewards/rollout_reward_func/mean": 2.358142614364624,
"rewards/rollout_reward_func/std": 1.9181171655654907,
"sampling/importance_sampling_ratio/max": 0.26942965388298035,
"sampling/importance_sampling_ratio/mean": 0.06439891457557678,
"sampling/importance_sampling_ratio/min": 0.00032740956521593034,
"sampling/sampling_logp_difference/max": 2.4414572715759277,
"sampling/sampling_logp_difference/mean": 1.2698404788970947,
"step": 43,
"step_time": 8.609952049000185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.033655643463135,
"epoch": 0.00044,
"grad_norm": 0.014094019308686256,
"kl": 0.23375796806067228,
"learning_rate": 9.999999992596503e-06,
"loss": -0.003,
"step": 44,
"step_time": 5.138805102998958
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 769.0,
"completions/max_terminated_length": 769.0,
"completions/mean_length": 323.90625,
"completions/mean_terminated_length": 315.8333435058594,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 7.381109952926636,
"epoch": 0.00045,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015758151188492775,
"kl": 0.1762851346284151,
"learning_rate": 9.999999990629948e-06,
"loss": -0.0067,
"num_tokens": 1168932.0,
"reward": 2.3062658309936523,
"reward_std": 1.6873681545257568,
"rewards/rollout_reward_func/mean": 2.3062658309936523,
"rewards/rollout_reward_func/std": 2.015537738800049,
"sampling/importance_sampling_ratio/max": 0.28785669803619385,
"sampling/importance_sampling_ratio/mean": 0.07849664986133575,
"sampling/importance_sampling_ratio/min": 1.9031297972719374e-18,
"sampling/sampling_logp_difference/max": 4.624307155609131,
"sampling/sampling_logp_difference/mean": 1.4169467687606812,
"step": 45,
"step_time": 9.268211923999843
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 7.324137270450592,
"epoch": 0.00046,
"grad_norm": 0.016143618151545525,
"kl": 0.17720989137887955,
"learning_rate": 9.999999988432035e-06,
"loss": -0.0069,
"step": 46,
"step_time": 4.723417413000334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 640.0,
"completions/max_terminated_length": 640.0,
"completions/mean_length": 289.6875,
"completions/mean_terminated_length": 289.6875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.798922121524811,
"epoch": 0.00047,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.022382335737347603,
"kl": 0.45002966932952404,
"learning_rate": 9.999999986002761e-06,
"loss": -0.0059,
"num_tokens": 1217968.0,
"reward": 1.891412377357483,
"reward_std": 1.739563226699829,
"rewards/rollout_reward_func/mean": 1.891412377357483,
"rewards/rollout_reward_func/std": 2.1437840461730957,
"sampling/importance_sampling_ratio/max": 0.09762566536664963,
"sampling/importance_sampling_ratio/mean": 0.04518824815750122,
"sampling/importance_sampling_ratio/min": 4.220652438657879e-10,
"sampling/sampling_logp_difference/max": 4.6860880851745605,
"sampling/sampling_logp_difference/mean": 1.201680064201355,
"step": 47,
"step_time": 8.263996364999457
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 6.750116407871246,
"epoch": 0.00048,
"grad_norm": 0.015847016125917435,
"kl": 0.3995134783908725,
"learning_rate": 9.999999983342127e-06,
"loss": -0.0061,
"step": 48,
"step_time": 4.558645490999879
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 758.0,
"completions/max_terminated_length": 758.0,
"completions/mean_length": 548.90625,
"completions/mean_terminated_length": 548.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.013215720653534,
"epoch": 0.00049,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021171115338802338,
"kl": 0.3200267134234309,
"learning_rate": 9.999999980450137e-06,
"loss": -0.0033,
"num_tokens": 1277645.0,
"reward": 2.5847978591918945,
"reward_std": 0.8504736423492432,
"rewards/rollout_reward_func/mean": 2.5847978591918945,
"rewards/rollout_reward_func/std": 2.012620687484741,
"sampling/importance_sampling_ratio/max": 0.10617782175540924,
"sampling/importance_sampling_ratio/mean": 0.057969365268945694,
"sampling/importance_sampling_ratio/min": 0.0006385967135429382,
"sampling/sampling_logp_difference/max": 2.5639331340789795,
"sampling/sampling_logp_difference/mean": 0.9616080522537231,
"step": 49,
"step_time": 8.934477900000275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.897445023059845,
"epoch": 0.0005,
"grad_norm": 0.018860990181565285,
"kl": 0.33296194672584534,
"learning_rate": 9.999999977326787e-06,
"loss": -0.0034,
"step": 50,
"step_time": 5.351620988000377
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 537.46875,
"completions/mean_terminated_length": 537.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.633717179298401,
"epoch": 0.00051,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01536987628787756,
"kl": 0.2537247408181429,
"learning_rate": 9.999999973972076e-06,
"loss": -0.0022,
"num_tokens": 1337256.0,
"reward": 2.237421989440918,
"reward_std": 1.3681035041809082,
"rewards/rollout_reward_func/mean": 2.237421989440918,
"rewards/rollout_reward_func/std": 1.740581750869751,
"sampling/importance_sampling_ratio/max": 0.1205218955874443,
"sampling/importance_sampling_ratio/mean": 0.04655706137418747,
"sampling/importance_sampling_ratio/min": 0.001190877752378583,
"sampling/sampling_logp_difference/max": 2.396751880645752,
"sampling/sampling_logp_difference/mean": 1.094531774520874,
"step": 51,
"step_time": 9.431871319000038
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 6.529919326305389,
"epoch": 0.00052,
"grad_norm": 0.010579893365502357,
"kl": 0.26615126617252827,
"learning_rate": 9.999999970386004e-06,
"loss": -0.0023,
"step": 52,
"step_time": 4.8396601220001685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 646.0,
"completions/max_terminated_length": 646.0,
"completions/mean_length": 324.125,
"completions/mean_terminated_length": 324.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.771689236164093,
"epoch": 0.00053,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015091736800968647,
"kl": 0.2611819123849273,
"learning_rate": 9.999999966568576e-06,
"loss": -0.011,
"num_tokens": 1386134.0,
"reward": 2.217728853225708,
"reward_std": 2.024404287338257,
"rewards/rollout_reward_func/mean": 2.217728853225708,
"rewards/rollout_reward_func/std": 2.3452394008636475,
"sampling/importance_sampling_ratio/max": 0.3583432734012604,
"sampling/importance_sampling_ratio/mean": 0.08469430357217789,
"sampling/importance_sampling_ratio/min": 1.4002454964024292e-14,
"sampling/sampling_logp_difference/max": 4.249520301818848,
"sampling/sampling_logp_difference/mean": 1.3450038433074951,
"step": 53,
"step_time": 8.218689654000627
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.760413706302643,
"epoch": 0.00054,
"grad_norm": 0.015123301185667515,
"kl": 0.26832089852541685,
"learning_rate": 9.999999962519787e-06,
"loss": -0.0111,
"step": 54,
"step_time": 4.458178430999396
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 785.0,
"completions/max_terminated_length": 785.0,
"completions/mean_length": 401.15625,
"completions/mean_terminated_length": 393.1290283203125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.331356406211853,
"epoch": 0.00055,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.07398711144924164,
"kl": 0.45008302945643663,
"learning_rate": 9.999999958239642e-06,
"loss": -0.0078,
"num_tokens": 1439006.0,
"reward": 1.4235717058181763,
"reward_std": 1.2011210918426514,
"rewards/rollout_reward_func/mean": 1.4235717058181763,
"rewards/rollout_reward_func/std": 1.8165862560272217,
"sampling/importance_sampling_ratio/max": 0.37091660499572754,
"sampling/importance_sampling_ratio/mean": 0.08141454309225082,
"sampling/importance_sampling_ratio/min": 5.328838854689677e-16,
"sampling/sampling_logp_difference/max": 6.0843353271484375,
"sampling/sampling_logp_difference/mean": 1.2536330223083496,
"step": 55,
"step_time": 8.796601195999756
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 6.339613497257233,
"epoch": 0.00056,
"grad_norm": 0.026661040261387825,
"kl": 0.4182750675827265,
"learning_rate": 9.999999953728133e-06,
"loss": -0.0081,
"step": 56,
"step_time": 5.088884858999336
},
{
"clip_ratio/high_max": 0.02025462966412306,
"clip_ratio/high_mean": 0.01012731483206153,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.01012731483206153,
"completions/clipped_ratio": 0.0,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 436.9375,
"completions/mean_terminated_length": 436.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.944717228412628,
"epoch": 0.00057,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01468927413225174,
"kl": 0.3506404645740986,
"learning_rate": 9.999999948985266e-06,
"loss": -0.0092,
"num_tokens": 1493962.0,
"reward": 1.8320198059082031,
"reward_std": 0.8225011825561523,
"rewards/rollout_reward_func/mean": 1.8320198059082031,
"rewards/rollout_reward_func/std": 1.2535020112991333,
"sampling/importance_sampling_ratio/max": 0.3948986232280731,
"sampling/importance_sampling_ratio/mean": 0.11361236125230789,
"sampling/importance_sampling_ratio/min": 1.1934993257971545e-26,
"sampling/sampling_logp_difference/max": 11.387396812438965,
"sampling/sampling_logp_difference/mean": 1.256805658340454,
"step": 57,
"step_time": 9.449572570999862
},
{
"clip_ratio/high_max": 0.004629629664123058,
"clip_ratio/high_mean": 0.002314814832061529,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002314814832061529,
"entropy": 5.935160547494888,
"epoch": 0.00058,
"grad_norm": 0.015919912606477737,
"kl": 0.3587344065308571,
"learning_rate": 9.99999994401104e-06,
"loss": -0.0093,
"step": 58,
"step_time": 4.746457023000403
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 751.0,
"completions/max_terminated_length": 751.0,
"completions/mean_length": 480.59375,
"completions/mean_terminated_length": 480.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.848488986492157,
"epoch": 0.00059,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.011293532326817513,
"kl": 0.3660230152308941,
"learning_rate": 9.999999938805455e-06,
"loss": -0.0115,
"num_tokens": 1551299.0,
"reward": 2.3304104804992676,
"reward_std": 1.6922229528427124,
"rewards/rollout_reward_func/mean": 2.3304104804992676,
"rewards/rollout_reward_func/std": 1.9971063137054443,
"sampling/importance_sampling_ratio/max": 0.1643364131450653,
"sampling/importance_sampling_ratio/mean": 0.08756053447723389,
"sampling/importance_sampling_ratio/min": 0.0002673097769729793,
"sampling/sampling_logp_difference/max": 2.872664213180542,
"sampling/sampling_logp_difference/mean": 0.9991017580032349,
"step": 59,
"step_time": 8.483788936999645
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.8199068903923035,
"epoch": 0.0006,
"grad_norm": 0.009261633269488811,
"kl": 0.3585043679922819,
"learning_rate": 9.999999933368511e-06,
"loss": -0.0116,
"step": 60,
"step_time": 4.707603984000343
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 697.0,
"completions/max_terminated_length": 697.0,
"completions/mean_length": 475.6875,
"completions/mean_terminated_length": 470.774169921875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.473706513643265,
"epoch": 0.00061,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013257281854748726,
"kl": 0.32057628221809864,
"learning_rate": 9.999999927700208e-06,
"loss": -0.0136,
"num_tokens": 1608125.0,
"reward": 3.2187983989715576,
"reward_std": 1.597353458404541,
"rewards/rollout_reward_func/mean": 3.2187983989715576,
"rewards/rollout_reward_func/std": 2.062941551208496,
"sampling/importance_sampling_ratio/max": 0.17399519681930542,
"sampling/importance_sampling_ratio/mean": 0.09229454398155212,
"sampling/importance_sampling_ratio/min": 5.896088738771565e-13,
"sampling/sampling_logp_difference/max": 4.472219944000244,
"sampling/sampling_logp_difference/mean": 1.0552775859832764,
"step": 61,
"step_time": 9.438191732000632
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 5.456495136022568,
"epoch": 0.00062,
"grad_norm": 0.01240911427885294,
"kl": 0.32513533532619476,
"learning_rate": 9.999999921800544e-06,
"loss": -0.0137,
"step": 62,
"step_time": 4.807036896999307
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 733.0,
"completions/max_terminated_length": 733.0,
"completions/mean_length": 240.96875,
"completions/mean_terminated_length": 226.9677276611328,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.3534833788871765,
"epoch": 0.00063,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01809127815067768,
"kl": 0.36241104267537594,
"learning_rate": 9.999999915669521e-06,
"loss": -0.0133,
"num_tokens": 1652976.0,
"reward": 2.7536964416503906,
"reward_std": 1.9717084169387817,
"rewards/rollout_reward_func/mean": 2.7536964416503906,
"rewards/rollout_reward_func/std": 2.0205845832824707,
"sampling/importance_sampling_ratio/max": 0.43795350193977356,
"sampling/importance_sampling_ratio/mean": 0.12350660562515259,
"sampling/importance_sampling_ratio/min": 5.083219400958683e-10,
"sampling/sampling_logp_difference/max": 3.9410667419433594,
"sampling/sampling_logp_difference/mean": 1.179007887840271,
"step": 63,
"step_time": 8.806214823998744
},
{
"clip_ratio/high_max": 0.07859848625957966,
"clip_ratio/high_mean": 0.03929924312978983,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.03929924312978983,
"entropy": 6.309255123138428,
"epoch": 0.00064,
"grad_norm": 0.013196753337979317,
"kl": 0.3604668825864792,
"learning_rate": 9.99999990930714e-06,
"loss": -0.0134,
"step": 64,
"step_time": 4.585649621999437
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 749.0,
"completions/max_terminated_length": 749.0,
"completions/mean_length": 520.15625,
"completions/mean_terminated_length": 520.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.630438059568405,
"epoch": 0.00065,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009761711582541466,
"kl": 0.25976699963212013,
"learning_rate": 9.999999902713398e-06,
"loss": -0.0096,
"num_tokens": 1710411.0,
"reward": 2.439373731613159,
"reward_std": 1.6874518394470215,
"rewards/rollout_reward_func/mean": 2.439373731613159,
"rewards/rollout_reward_func/std": 1.8576425313949585,
"sampling/importance_sampling_ratio/max": 0.1951960325241089,
"sampling/importance_sampling_ratio/mean": 0.06827103346586227,
"sampling/importance_sampling_ratio/min": 1.1635305696700016e-07,
"sampling/sampling_logp_difference/max": 4.405527591705322,
"sampling/sampling_logp_difference/mean": 1.1796586513519287,
"step": 65,
"step_time": 8.827101629000026
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 6.571233093738556,
"epoch": 0.00066,
"grad_norm": 0.008613799698650837,
"kl": 0.2607234949246049,
"learning_rate": 9.999999895888298e-06,
"loss": -0.0096,
"step": 66,
"step_time": 4.809835060999376
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 679.0,
"completions/max_terminated_length": 679.0,
"completions/mean_length": 301.1875,
"completions/mean_terminated_length": 301.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.944601535797119,
"epoch": 0.00067,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012496919371187687,
"kl": 0.4631600920110941,
"learning_rate": 9.99999988883184e-06,
"loss": -0.0128,
"num_tokens": 1759738.0,
"reward": 1.9331152439117432,
"reward_std": 1.1334162950515747,
"rewards/rollout_reward_func/mean": 1.9331152439117432,
"rewards/rollout_reward_func/std": 2.0543787479400635,
"sampling/importance_sampling_ratio/max": 0.46345254778862,
"sampling/importance_sampling_ratio/mean": 0.1404380202293396,
"sampling/importance_sampling_ratio/min": 5.890969418942404e-07,
"sampling/sampling_logp_difference/max": 4.386819839477539,
"sampling/sampling_logp_difference/mean": 1.1129919290542603,
"step": 67,
"step_time": 8.765856677001011
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.008928571827709675,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008928571827709675,
"entropy": 5.895752668380737,
"epoch": 0.00068,
"grad_norm": 0.012568553909659386,
"kl": 0.46327478997409344,
"learning_rate": 9.999999881544019e-06,
"loss": -0.0128,
"step": 68,
"step_time": 5.110902637000436
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 452.34375,
"completions/mean_terminated_length": 452.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.041722625494003,
"epoch": 0.00069,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.07526283711194992,
"kl": 0.6064412295818329,
"learning_rate": 9.999999874024841e-06,
"loss": -0.0146,
"num_tokens": 1814594.0,
"reward": 3.077542781829834,
"reward_std": 1.2306993007659912,
"rewards/rollout_reward_func/mean": 3.077542781829834,
"rewards/rollout_reward_func/std": 1.810649037361145,
"sampling/importance_sampling_ratio/max": 0.4706134796142578,
"sampling/importance_sampling_ratio/mean": 0.165305495262146,
"sampling/importance_sampling_ratio/min": 0.0011951870983466506,
"sampling/sampling_logp_difference/max": 2.497037887573242,
"sampling/sampling_logp_difference/mean": 0.7777004837989807,
"step": 69,
"step_time": 8.851037519998954
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 4.988219231367111,
"epoch": 0.0007,
"grad_norm": 0.20997123420238495,
"kl": 0.8107541762292385,
"learning_rate": 9.999999866274303e-06,
"loss": -0.0143,
"step": 70,
"step_time": 4.902573127999858
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 697.0,
"completions/max_terminated_length": 697.0,
"completions/mean_length": 275.4375,
"completions/mean_terminated_length": 265.8709716796875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.022567570209503,
"epoch": 0.00071,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02304830029606819,
"kl": 0.34742444939911366,
"learning_rate": 9.999999858292407e-06,
"loss": -0.0139,
"num_tokens": 1861821.0,
"reward": 2.4483556747436523,
"reward_std": 1.4657219648361206,
"rewards/rollout_reward_func/mean": 2.4483556747436523,
"rewards/rollout_reward_func/std": 1.791189432144165,
"sampling/importance_sampling_ratio/max": 0.47685861587524414,
"sampling/importance_sampling_ratio/mean": 0.16110098361968994,
"sampling/importance_sampling_ratio/min": 1.8858786060560462e-11,
"sampling/sampling_logp_difference/max": 4.634000301361084,
"sampling/sampling_logp_difference/mean": 1.0669944286346436,
"step": 71,
"step_time": 7.997900912999285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.069274663925171,
"epoch": 0.00072,
"grad_norm": 0.02166938968002796,
"kl": 0.34302423894405365,
"learning_rate": 9.99999985007915e-06,
"loss": -0.0139,
"step": 72,
"step_time": 4.490313349000189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 805.0,
"completions/max_terminated_length": 805.0,
"completions/mean_length": 488.6875,
"completions/mean_terminated_length": 503.9354553222656,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.873573303222656,
"epoch": 0.00073,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01142603438347578,
"kl": 0.3447978002950549,
"learning_rate": 9.999999841634535e-06,
"loss": -0.0091,
"num_tokens": 1916473.0,
"reward": 2.781144857406616,
"reward_std": 0.9423757791519165,
"rewards/rollout_reward_func/mean": 2.781144857406616,
"rewards/rollout_reward_func/std": 1.5482388734817505,
"sampling/importance_sampling_ratio/max": 0.4879491329193115,
"sampling/importance_sampling_ratio/mean": 0.12894627451896667,
"sampling/importance_sampling_ratio/min": 5.110472808822486e-12,
"sampling/sampling_logp_difference/max": 3.6776225566864014,
"sampling/sampling_logp_difference/mean": 1.0262196063995361,
"step": 73,
"step_time": 8.777917193999201
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.9237218499183655,
"epoch": 0.00074,
"grad_norm": 0.011944163590669632,
"kl": 0.3441983833909035,
"learning_rate": 9.99999983295856e-06,
"loss": -0.0091,
"step": 74,
"step_time": 5.257215922000341
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 635.0,
"completions/max_terminated_length": 635.0,
"completions/mean_length": 217.28125,
"completions/mean_terminated_length": 206.09677124023438,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.533183515071869,
"epoch": 0.00075,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.04438586160540581,
"kl": 0.6188310664147139,
"learning_rate": 9.999999824051225e-06,
"loss": -0.0088,
"num_tokens": 1961274.0,
"reward": 2.8512258529663086,
"reward_std": 0.732064962387085,
"rewards/rollout_reward_func/mean": 2.8512258529663086,
"rewards/rollout_reward_func/std": 1.670456051826477,
"sampling/importance_sampling_ratio/max": 0.49958670139312744,
"sampling/importance_sampling_ratio/mean": 0.21123701333999634,
"sampling/importance_sampling_ratio/min": 3.237672987783241e-14,
"sampling/sampling_logp_difference/max": 5.040909767150879,
"sampling/sampling_logp_difference/mean": 1.135288953781128,
"step": 75,
"step_time": 7.92215164100071
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.562716752290726,
"epoch": 0.00076,
"grad_norm": 0.02424151450395584,
"kl": 0.565250052139163,
"learning_rate": 9.999999814912531e-06,
"loss": -0.0089,
"step": 76,
"step_time": 4.395662217000336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 291.0625,
"completions/mean_terminated_length": 289.93548583984375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.875417947769165,
"epoch": 0.00077,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015563178807497025,
"kl": 0.41501787677407265,
"learning_rate": 9.999999805542478e-06,
"loss": -0.0118,
"num_tokens": 2010634.0,
"reward": 1.0790379047393799,
"reward_std": 1.1985231637954712,
"rewards/rollout_reward_func/mean": 1.0790379047393799,
"rewards/rollout_reward_func/std": 1.5060786008834839,
"sampling/importance_sampling_ratio/max": 0.5051907300949097,
"sampling/importance_sampling_ratio/mean": 0.1884518414735794,
"sampling/importance_sampling_ratio/min": 1.3433022472142397e-09,
"sampling/sampling_logp_difference/max": 10.98376750946045,
"sampling/sampling_logp_difference/mean": 1.1490110158920288,
"step": 77,
"step_time": 8.915195299999596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"entropy": 5.885697066783905,
"epoch": 0.00078,
"grad_norm": 0.014125452376902103,
"kl": 0.4149230867624283,
"learning_rate": 9.999999795941065e-06,
"loss": -0.0119,
"step": 78,
"step_time": 5.149517803999061
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 291.9375,
"completions/mean_terminated_length": 300.8387145996094,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.519651710987091,
"epoch": 0.00079,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.10713054239749908,
"kl": 0.5323284231126308,
"learning_rate": 9.999999786108293e-06,
"loss": -0.0227,
"num_tokens": 2059737.0,
"reward": 1.5329793691635132,
"reward_std": 0.5013623237609863,
"rewards/rollout_reward_func/mean": 1.5329793691635132,
"rewards/rollout_reward_func/std": 1.3323529958724976,
"sampling/importance_sampling_ratio/max": 0.5639442801475525,
"sampling/importance_sampling_ratio/mean": 0.25227874517440796,
"sampling/importance_sampling_ratio/min": 2.7577478468139224e-14,
"sampling/sampling_logp_difference/max": 3.539957046508789,
"sampling/sampling_logp_difference/mean": 1.098191261291504,
"step": 79,
"step_time": 8.723111569000139
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.00631313119083643,
"clip_ratio/low_min": 0.005681818351149559,
"clip_ratio/region_mean": 0.01412563119083643,
"entropy": 5.470509052276611,
"epoch": 0.0008,
"grad_norm": 0.06396406888961792,
"kl": 0.5383136495947838,
"learning_rate": 9.999999776044163e-06,
"loss": -0.0233,
"step": 80,
"step_time": 4.934400118000212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 384.1875,
"completions/mean_terminated_length": 371.0322570800781,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.105532318353653,
"epoch": 0.00081,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014338403008878231,
"kl": 0.5314097702503204,
"learning_rate": 9.999999765748672e-06,
"loss": -0.0119,
"num_tokens": 2109547.0,
"reward": 3.0377540588378906,
"reward_std": 1.2538565397262573,
"rewards/rollout_reward_func/mean": 3.0377540588378906,
"rewards/rollout_reward_func/std": 1.6159894466400146,
"sampling/importance_sampling_ratio/max": 0.5120093822479248,
"sampling/importance_sampling_ratio/mean": 0.18446674942970276,
"sampling/importance_sampling_ratio/min": 3.6131722613852446e-11,
"sampling/sampling_logp_difference/max": 3.793142080307007,
"sampling/sampling_logp_difference/mean": 0.8700344562530518,
"step": 81,
"step_time": 8.634114757999669
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250000465661287,
"entropy": 5.07496240735054,
"epoch": 0.00082,
"grad_norm": 0.012873583473265171,
"kl": 0.5282706655561924,
"learning_rate": 9.999999755221823e-06,
"loss": -0.012,
"step": 82,
"step_time": 4.737106287001097
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 284.125,
"completions/mean_terminated_length": 284.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.206637322902679,
"epoch": 0.00083,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02513122372329235,
"kl": 0.5926351137459278,
"learning_rate": 9.999999744463613e-06,
"loss": -0.0183,
"num_tokens": 2156768.0,
"reward": 2.840292453765869,
"reward_std": 1.3401542901992798,
"rewards/rollout_reward_func/mean": 2.840292453765869,
"rewards/rollout_reward_func/std": 1.8102028369903564,
"sampling/importance_sampling_ratio/max": 0.520283579826355,
"sampling/importance_sampling_ratio/mean": 0.2087681144475937,
"sampling/importance_sampling_ratio/min": 2.49877535329901e-10,
"sampling/sampling_logp_difference/max": 4.733933448791504,
"sampling/sampling_logp_difference/mean": 0.9762767553329468,
"step": 83,
"step_time": 8.479427639000278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.208840876817703,
"epoch": 0.00084,
"grad_norm": 0.03026541694998741,
"kl": 0.6035371646285057,
"learning_rate": 9.999999733474045e-06,
"loss": -0.0183,
"step": 84,
"step_time": 5.254075981999904
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 450.0,
"completions/mean_terminated_length": 464.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.834830671548843,
"epoch": 0.00085,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012555217370390892,
"kl": 0.3606609106063843,
"learning_rate": 9.999999722253117e-06,
"loss": -0.0117,
"num_tokens": 2211652.0,
"reward": 1.648930311203003,
"reward_std": 1.215954065322876,
"rewards/rollout_reward_func/mean": 1.648930311203003,
"rewards/rollout_reward_func/std": 1.3405512571334839,
"sampling/importance_sampling_ratio/max": 0.5233410000801086,
"sampling/importance_sampling_ratio/mean": 0.13886834681034088,
"sampling/importance_sampling_ratio/min": 1.688752483300254e-16,
"sampling/sampling_logp_difference/max": 3.9144070148468018,
"sampling/sampling_logp_difference/mean": 1.1154017448425293,
"step": 85,
"step_time": 9.084167009999419
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 5.833049297332764,
"epoch": 0.00086,
"grad_norm": 0.012575902976095676,
"kl": 0.3626660779118538,
"learning_rate": 9.99999971080083e-06,
"loss": -0.0118,
"step": 86,
"step_time": 5.319575449999775
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 722.0,
"completions/max_terminated_length": 722.0,
"completions/mean_length": 235.9375,
"completions/mean_terminated_length": 243.03225708007812,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.763930678367615,
"epoch": 0.00087,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014682337641716003,
"kl": 0.39749570190906525,
"learning_rate": 9.999999699117184e-06,
"loss": -0.0146,
"num_tokens": 2257453.0,
"reward": 2.5003364086151123,
"reward_std": 1.9458928108215332,
"rewards/rollout_reward_func/mean": 2.5003364086151123,
"rewards/rollout_reward_func/std": 2.2692220211029053,
"sampling/importance_sampling_ratio/max": 0.5295130610466003,
"sampling/importance_sampling_ratio/mean": 0.11728590726852417,
"sampling/importance_sampling_ratio/min": 3.7461741065995813e-13,
"sampling/sampling_logp_difference/max": 3.0392837524414062,
"sampling/sampling_logp_difference/mean": 1.4054160118103027,
"step": 87,
"step_time": 8.034342769000887
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.757734656333923,
"epoch": 0.00088,
"grad_norm": 0.013116477057337761,
"kl": 0.38789064437150955,
"learning_rate": 9.999999687202177e-06,
"loss": -0.0146,
"step": 88,
"step_time": 4.444944719000432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 755.0,
"completions/max_terminated_length": 741.0,
"completions/mean_length": 447.34375,
"completions/mean_terminated_length": 451.4667053222656,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.2215496301651,
"epoch": 0.00089,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013803564012050629,
"kl": 0.5341093055903912,
"learning_rate": 9.999999675055814e-06,
"loss": -0.0177,
"num_tokens": 2312250.0,
"reward": 1.7047960758209229,
"reward_std": 1.1020748615264893,
"rewards/rollout_reward_func/mean": 1.7047960758209229,
"rewards/rollout_reward_func/std": 1.8067169189453125,
"sampling/importance_sampling_ratio/max": 0.5259312391281128,
"sampling/importance_sampling_ratio/mean": 0.17643724381923676,
"sampling/importance_sampling_ratio/min": 2.5637248400600665e-12,
"sampling/sampling_logp_difference/max": 5.182948589324951,
"sampling/sampling_logp_difference/mean": 1.0466536283493042,
"step": 89,
"step_time": 8.874949301000925
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.204747676849365,
"epoch": 0.0009,
"grad_norm": 0.01240418292582035,
"kl": 0.5262163020670414,
"learning_rate": 9.999999662678088e-06,
"loss": -0.0177,
"step": 90,
"step_time": 5.186326979000114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 558.34375,
"completions/mean_terminated_length": 558.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.52349066734314,
"epoch": 0.00091,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010751481167972088,
"kl": 0.408858347684145,
"learning_rate": 9.999999650069006e-06,
"loss": -0.0171,
"num_tokens": 2371581.0,
"reward": 1.3657488822937012,
"reward_std": 1.0520522594451904,
"rewards/rollout_reward_func/mean": 1.3657488822937012,
"rewards/rollout_reward_func/std": 1.2561410665512085,
"sampling/importance_sampling_ratio/max": 0.2838555574417114,
"sampling/importance_sampling_ratio/mean": 0.12384120374917984,
"sampling/importance_sampling_ratio/min": 4.310294829390493e-11,
"sampling/sampling_logp_difference/max": 3.8953909873962402,
"sampling/sampling_logp_difference/mean": 0.9854850769042969,
"step": 91,
"step_time": 9.148271317999388
},
{
"clip_ratio/high_max": 0.03125,
"clip_ratio/high_mean": 0.015625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"entropy": 5.50886458158493,
"epoch": 0.00092,
"grad_norm": 0.011888951063156128,
"kl": 0.4073612429201603,
"learning_rate": 9.999999637228563e-06,
"loss": -0.0171,
"step": 92,
"step_time": 5.326846187999763
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 751.0,
"completions/max_terminated_length": 751.0,
"completions/mean_length": 254.34375,
"completions/mean_terminated_length": 254.34375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.463173568248749,
"epoch": 0.00093,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.011303669773042202,
"kl": 0.4870901219546795,
"learning_rate": 9.99999962415676e-06,
"loss": -0.0089,
"num_tokens": 2416625.0,
"reward": 2.9061293601989746,
"reward_std": 1.1144578456878662,
"rewards/rollout_reward_func/mean": 2.9061293601989746,
"rewards/rollout_reward_func/std": 1.872575283050537,
"sampling/importance_sampling_ratio/max": 0.538914144039154,
"sampling/importance_sampling_ratio/mean": 0.23089367151260376,
"sampling/importance_sampling_ratio/min": 0.001189270755276084,
"sampling/sampling_logp_difference/max": 2.552368640899658,
"sampling/sampling_logp_difference/mean": 0.8883153200149536,
"step": 93,
"step_time": 8.310906286000318
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.453524351119995,
"epoch": 0.00094,
"grad_norm": 0.011560036800801754,
"kl": 0.49043361097574234,
"learning_rate": 9.999999610853598e-06,
"loss": -0.009,
"step": 94,
"step_time": 4.578007039998738
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 695.0,
"completions/max_terminated_length": 695.0,
"completions/mean_length": 323.0,
"completions/mean_terminated_length": 323.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.127210021018982,
"epoch": 0.00095,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.016390182077884674,
"kl": 0.40985831432044506,
"learning_rate": 9.999999597319077e-06,
"loss": -0.0144,
"num_tokens": 2467574.0,
"reward": 2.420485019683838,
"reward_std": 1.8477942943572998,
"rewards/rollout_reward_func/mean": 2.420485019683838,
"rewards/rollout_reward_func/std": 2.099099636077881,
"sampling/importance_sampling_ratio/max": 0.5398046970367432,
"sampling/importance_sampling_ratio/mean": 0.14346127212047577,
"sampling/importance_sampling_ratio/min": 7.899796268528991e-12,
"sampling/sampling_logp_difference/max": 3.005741596221924,
"sampling/sampling_logp_difference/mean": 1.1369695663452148,
"step": 95,
"step_time": 9.084458965998692
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.112871825695038,
"epoch": 0.00096,
"grad_norm": 0.015214670449495316,
"kl": 0.4128855764865875,
"learning_rate": 9.999999583553198e-06,
"loss": -0.0144,
"step": 96,
"step_time": 4.6464639670002725
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 404.1875,
"completions/mean_terminated_length": 413.0000305175781,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.4772584438323975,
"epoch": 0.00097,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013531319797039032,
"kl": 0.33689754270017147,
"learning_rate": 9.999999569555958e-06,
"loss": -0.0195,
"num_tokens": 2519297.0,
"reward": 1.2118922472000122,
"reward_std": 0.9071276187896729,
"rewards/rollout_reward_func/mean": 1.2118922472000122,
"rewards/rollout_reward_func/std": 1.6393996477127075,
"sampling/importance_sampling_ratio/max": 0.5455878376960754,
"sampling/importance_sampling_ratio/mean": 0.12733827531337738,
"sampling/importance_sampling_ratio/min": 3.602854342990569e-12,
"sampling/sampling_logp_difference/max": 4.47752046585083,
"sampling/sampling_logp_difference/mean": 1.2203165292739868,
"step": 97,
"step_time": 9.439923181000268
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.469627916812897,
"epoch": 0.00098,
"grad_norm": 0.011723111383616924,
"kl": 0.3316629286855459,
"learning_rate": 9.99999955532736e-06,
"loss": -0.0196,
"step": 98,
"step_time": 4.828754623999885
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 536.90625,
"completions/mean_terminated_length": 536.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.867543816566467,
"epoch": 0.00099,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00992999505251646,
"kl": 0.35577549040317535,
"learning_rate": 9.999999540867401e-06,
"loss": -0.0137,
"num_tokens": 2578290.0,
"reward": 2.7717137336730957,
"reward_std": 1.5454163551330566,
"rewards/rollout_reward_func/mean": 2.7717137336730957,
"rewards/rollout_reward_func/std": 2.1133692264556885,
"sampling/importance_sampling_ratio/max": 0.2941095232963562,
"sampling/importance_sampling_ratio/mean": 0.1273624747991562,
"sampling/importance_sampling_ratio/min": 3.663484793303695e-10,
"sampling/sampling_logp_difference/max": 2.9304747581481934,
"sampling/sampling_logp_difference/mean": 1.0121349096298218,
"step": 99,
"step_time": 9.026237381999636
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 5.866902112960815,
"epoch": 0.001,
"grad_norm": 0.008921584114432335,
"kl": 0.3485883306711912,
"learning_rate": 9.999999526176084e-06,
"loss": -0.0137,
"step": 100,
"step_time": 4.884540873000333
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 318.6875,
"completions/mean_terminated_length": 318.6875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.43700635433197,
"epoch": 0.00101,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006189876701682806,
"kl": 0.5527824461460114,
"learning_rate": 9.999999511253408e-06,
"loss": -0.0067,
"num_tokens": 2626871.0,
"reward": 2.009554862976074,
"reward_std": 0.6518650054931641,
"rewards/rollout_reward_func/mean": 2.009554862976074,
"rewards/rollout_reward_func/std": 1.5595077276229858,
"sampling/importance_sampling_ratio/max": 0.5488349199295044,
"sampling/importance_sampling_ratio/mean": 0.19652444124221802,
"sampling/importance_sampling_ratio/min": 0.0011575144017115235,
"sampling/sampling_logp_difference/max": 2.923558473587036,
"sampling/sampling_logp_difference/mean": 0.9920997619628906,
"step": 101,
"step_time": 9.15831846600031
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.427984565496445,
"epoch": 0.00102,
"grad_norm": 0.006101899314671755,
"kl": 0.556145828217268,
"learning_rate": 9.99999949609937e-06,
"loss": -0.0067,
"step": 102,
"step_time": 4.759649325001192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 721.0,
"completions/max_terminated_length": 721.0,
"completions/mean_length": 266.03125,
"completions/mean_terminated_length": 257.8709716796875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.900549799203873,
"epoch": 0.00103,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.007550720125436783,
"kl": 0.46073490381240845,
"learning_rate": 9.999999480713976e-06,
"loss": -0.0124,
"num_tokens": 2674005.0,
"reward": 3.7651147842407227,
"reward_std": 1.2847379446029663,
"rewards/rollout_reward_func/mean": 3.7651147842407227,
"rewards/rollout_reward_func/std": 1.5383661985397339,
"sampling/importance_sampling_ratio/max": 0.5506238341331482,
"sampling/importance_sampling_ratio/mean": 0.26077592372894287,
"sampling/importance_sampling_ratio/min": 9.013636733702646e-14,
"sampling/sampling_logp_difference/max": 3.8743910789489746,
"sampling/sampling_logp_difference/mean": 0.9717509150505066,
"step": 103,
"step_time": 8.535875374000625
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.909915745258331,
"epoch": 0.00104,
"grad_norm": 0.007524185813963413,
"kl": 0.45878610014915466,
"learning_rate": 9.99999946509722e-06,
"loss": -0.0124,
"step": 104,
"step_time": 4.519358364999334
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 694.0,
"completions/max_terminated_length": 694.0,
"completions/mean_length": 263.75,
"completions/mean_terminated_length": 271.7419128417969,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.765733242034912,
"epoch": 0.00105,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006037894636392593,
"kl": 0.5730921756476164,
"learning_rate": 9.999999449249107e-06,
"loss": -0.0164,
"num_tokens": 2719481.0,
"reward": 3.5972437858581543,
"reward_std": 1.0879021883010864,
"rewards/rollout_reward_func/mean": 3.5972437858581543,
"rewards/rollout_reward_func/std": 1.4054116010665894,
"sampling/importance_sampling_ratio/max": 0.5536695122718811,
"sampling/importance_sampling_ratio/mean": 0.25917497277259827,
"sampling/importance_sampling_ratio/min": 7.426475804095389e-06,
"sampling/sampling_logp_difference/max": 3.8071212768554688,
"sampling/sampling_logp_difference/mean": 0.7888709902763367,
"step": 105,
"step_time": 7.902388452000196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.764300674200058,
"epoch": 0.00106,
"grad_norm": 0.006110228598117828,
"kl": 0.5746921207755804,
"learning_rate": 9.999999433169634e-06,
"loss": -0.0165,
"step": 106,
"step_time": 4.451162388999819
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 796.0,
"completions/max_terminated_length": 796.0,
"completions/mean_length": 504.03125,
"completions/mean_terminated_length": 504.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.259044528007507,
"epoch": 0.00107,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.09284374862909317,
"kl": 0.5678669139742851,
"learning_rate": 9.999999416858801e-06,
"loss": -0.023,
"num_tokens": 2778110.0,
"reward": 3.0518195629119873,
"reward_std": 2.0326967239379883,
"rewards/rollout_reward_func/mean": 3.0518195629119873,
"rewards/rollout_reward_func/std": 2.1500182151794434,
"sampling/importance_sampling_ratio/max": 0.3023638129234314,
"sampling/importance_sampling_ratio/mean": 0.15813782811164856,
"sampling/importance_sampling_ratio/min": 0.0001912050211103633,
"sampling/sampling_logp_difference/max": 2.7088003158569336,
"sampling/sampling_logp_difference/mean": 0.8632931709289551,
"step": 107,
"step_time": 9.265566470999602
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.301291227340698,
"epoch": 0.00108,
"grad_norm": 0.0192717295140028,
"kl": 0.40367304906249046,
"learning_rate": 9.999999400316609e-06,
"loss": -0.0233,
"step": 108,
"step_time": 4.872972249000213
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 785.0,
"completions/max_terminated_length": 785.0,
"completions/mean_length": 453.8125,
"completions/mean_terminated_length": 454.06451416015625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.344372421503067,
"epoch": 0.00109,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.005915234796702862,
"kl": 0.5280559062957764,
"learning_rate": 9.999999383543059e-06,
"loss": -0.0117,
"num_tokens": 2832092.0,
"reward": 2.0643255710601807,
"reward_std": 0.36830055713653564,
"rewards/rollout_reward_func/mean": 2.0643255710601807,
"rewards/rollout_reward_func/std": 1.2795445919036865,
"sampling/importance_sampling_ratio/max": 0.5460331439971924,
"sampling/importance_sampling_ratio/mean": 0.2571074366569519,
"sampling/importance_sampling_ratio/min": 3.3921960648563643e-15,
"sampling/sampling_logp_difference/max": 4.154726505279541,
"sampling/sampling_logp_difference/mean": 0.7197229862213135,
"step": 109,
"step_time": 9.136753535999105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.370859324932098,
"epoch": 0.0011,
"grad_norm": 0.006120497360825539,
"kl": 0.5247279852628708,
"learning_rate": 9.999999366538148e-06,
"loss": -0.0117,
"step": 110,
"step_time": 4.750533235999228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 704.0,
"completions/max_terminated_length": 704.0,
"completions/mean_length": 468.09375,
"completions/mean_terminated_length": 468.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.781158804893494,
"epoch": 0.00111,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009389814920723438,
"kl": 0.40754758939146996,
"learning_rate": 9.999999349301878e-06,
"loss": -0.0128,
"num_tokens": 2887456.0,
"reward": 2.372213840484619,
"reward_std": 1.1804518699645996,
"rewards/rollout_reward_func/mean": 2.372213840484619,
"rewards/rollout_reward_func/std": 1.6503901481628418,
"sampling/importance_sampling_ratio/max": 0.5524845123291016,
"sampling/importance_sampling_ratio/mean": 0.1906067430973053,
"sampling/importance_sampling_ratio/min": 4.788781764827768e-16,
"sampling/sampling_logp_difference/max": 3.3403866291046143,
"sampling/sampling_logp_difference/mean": 1.0884251594543457,
"step": 111,
"step_time": 8.606538135000847
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.815384209156036,
"epoch": 0.00112,
"grad_norm": 0.01099073514342308,
"kl": 0.40177351236343384,
"learning_rate": 9.999999331834249e-06,
"loss": -0.0128,
"step": 112,
"step_time": 5.018896831999882
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 722.0,
"completions/max_terminated_length": 722.0,
"completions/mean_length": 444.90625,
"completions/mean_terminated_length": 444.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.825027763843536,
"epoch": 0.00113,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011304070241749287,
"kl": 0.35968483053147793,
"learning_rate": 9.99999931413526e-06,
"loss": -0.0059,
"num_tokens": 2943025.0,
"reward": 2.8332765102386475,
"reward_std": 1.7419302463531494,
"rewards/rollout_reward_func/mean": 2.8332765102386475,
"rewards/rollout_reward_func/std": 1.946202278137207,
"sampling/importance_sampling_ratio/max": 0.30486950278282166,
"sampling/importance_sampling_ratio/mean": 0.14108777046203613,
"sampling/importance_sampling_ratio/min": 0.00019812805112451315,
"sampling/sampling_logp_difference/max": 2.6881039142608643,
"sampling/sampling_logp_difference/mean": 1.0696028470993042,
"step": 113,
"step_time": 8.58645070500097
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.8290793895721436,
"epoch": 0.00114,
"grad_norm": 0.011425897479057312,
"kl": 0.3614608943462372,
"learning_rate": 9.999999296204912e-06,
"loss": -0.0059,
"step": 114,
"step_time": 5.139510378000523
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 235.75,
"completions/mean_terminated_length": 235.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.922475636005402,
"epoch": 0.00115,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.023028887808322906,
"kl": 0.4135777149349451,
"learning_rate": 9.999999278043205e-06,
"loss": -0.0119,
"num_tokens": 2988835.0,
"reward": 2.040597438812256,
"reward_std": 0.6493960618972778,
"rewards/rollout_reward_func/mean": 2.040597438812256,
"rewards/rollout_reward_func/std": 1.5975624322891235,
"sampling/importance_sampling_ratio/max": 0.5552006363868713,
"sampling/importance_sampling_ratio/mean": 0.2123653143644333,
"sampling/importance_sampling_ratio/min": 1.9730843694998335e-11,
"sampling/sampling_logp_difference/max": 3.120391845703125,
"sampling/sampling_logp_difference/mean": 1.0702314376831055,
"step": 115,
"step_time": 8.227128244000596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.934574216604233,
"epoch": 0.00116,
"grad_norm": 0.024487733840942383,
"kl": 0.4113778416067362,
"learning_rate": 9.99999925965014e-06,
"loss": -0.0118,
"step": 116,
"step_time": 4.603303872000197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031250000465661287,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0031250000465661287,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 446.0,
"completions/mean_terminated_length": 459.8709411621094,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.283336162567139,
"epoch": 0.00117,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06837444752454758,
"kl": 0.7454761080443859,
"learning_rate": 9.999999241025713e-06,
"loss": -0.0137,
"num_tokens": 3044815.0,
"reward": 1.6962122917175293,
"reward_std": 0.9731729030609131,
"rewards/rollout_reward_func/mean": 1.6962122917175293,
"rewards/rollout_reward_func/std": 1.472454309463501,
"sampling/importance_sampling_ratio/max": 0.5520793199539185,
"sampling/importance_sampling_ratio/mean": 0.1955501139163971,
"sampling/importance_sampling_ratio/min": 8.385171321124898e-15,
"sampling/sampling_logp_difference/max": 3.5139248371124268,
"sampling/sampling_logp_difference/mean": 1.0980048179626465,
"step": 117,
"step_time": 8.851413534999665
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.293146431446075,
"epoch": 0.00118,
"grad_norm": 0.025546826422214508,
"kl": 0.6223187446594238,
"learning_rate": 9.99999922216993e-06,
"loss": -0.014,
"step": 118,
"step_time": 5.2780849090008815
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 477.75,
"completions/mean_terminated_length": 498.20001220703125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.602368354797363,
"epoch": 0.00119,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01806759461760521,
"kl": 0.340764039196074,
"learning_rate": 9.999999203082784e-06,
"loss": -0.0204,
"num_tokens": 3099071.0,
"reward": 1.5769758224487305,
"reward_std": 1.3088091611862183,
"rewards/rollout_reward_func/mean": 1.5769758224487305,
"rewards/rollout_reward_func/std": 1.6474065780639648,
"sampling/importance_sampling_ratio/max": 0.5519512295722961,
"sampling/importance_sampling_ratio/mean": 0.15655829012393951,
"sampling/importance_sampling_ratio/min": 3.2326309328439702e-18,
"sampling/sampling_logp_difference/max": 3.693403959274292,
"sampling/sampling_logp_difference/mean": 1.4753286838531494,
"step": 119,
"step_time": 9.247204750000492
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.583022654056549,
"epoch": 0.0012,
"grad_norm": 0.016562430188059807,
"kl": 0.33369210083037615,
"learning_rate": 9.999999183764282e-06,
"loss": -0.0205,
"step": 120,
"step_time": 5.400903367999945
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 473.59375,
"completions/mean_terminated_length": 473.59375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.3363790810108185,
"epoch": 0.00121,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.005732807330787182,
"kl": 0.3513603312894702,
"learning_rate": 9.999999164214418e-06,
"loss": -0.015,
"num_tokens": 3154721.0,
"reward": 2.565702199935913,
"reward_std": 0.9166498184204102,
"rewards/rollout_reward_func/mean": 2.565702199935913,
"rewards/rollout_reward_func/std": 1.641287922859192,
"sampling/importance_sampling_ratio/max": 0.5513830184936523,
"sampling/importance_sampling_ratio/mean": 0.21081653237342834,
"sampling/importance_sampling_ratio/min": 9.868382777611373e-10,
"sampling/sampling_logp_difference/max": 4.323619365692139,
"sampling/sampling_logp_difference/mean": 0.9557435512542725,
"step": 121,
"step_time": 8.598964995000188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.306765913963318,
"epoch": 0.00122,
"grad_norm": 0.005753090605139732,
"kl": 0.3516113171353936,
"learning_rate": 9.999999144433197e-06,
"loss": -0.015,
"step": 122,
"step_time": 4.775643616999787
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 574.21875,
"completions/mean_terminated_length": 574.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.031126022338867,
"epoch": 0.00123,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.014242246747016907,
"kl": 0.27723702508956194,
"learning_rate": 9.999999124420615e-06,
"loss": -0.0165,
"num_tokens": 3214228.0,
"reward": 2.553722381591797,
"reward_std": 1.49879789352417,
"rewards/rollout_reward_func/mean": 2.553722381591797,
"rewards/rollout_reward_func/std": 1.8885005712509155,
"sampling/importance_sampling_ratio/max": 0.30776602029800415,
"sampling/importance_sampling_ratio/mean": 0.13252753019332886,
"sampling/importance_sampling_ratio/min": 1.6167473718305125e-13,
"sampling/sampling_logp_difference/max": 3.7081410884857178,
"sampling/sampling_logp_difference/mean": 1.0756418704986572,
"step": 123,
"step_time": 8.99680862499963
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 6.018238723278046,
"epoch": 0.00124,
"grad_norm": 0.008585439994931221,
"kl": 0.27489931136369705,
"learning_rate": 9.999999104176675e-06,
"loss": -0.0165,
"step": 124,
"step_time": 5.400279564999892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 787.0,
"completions/max_terminated_length": 787.0,
"completions/mean_length": 428.40625,
"completions/mean_terminated_length": 441.70965576171875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.530455976724625,
"epoch": 0.00125,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01981445588171482,
"kl": 0.5246058944612741,
"learning_rate": 9.999999083701375e-06,
"loss": -0.0115,
"num_tokens": 3268782.0,
"reward": 2.442720651626587,
"reward_std": 0.9873529672622681,
"rewards/rollout_reward_func/mean": 2.442720651626587,
"rewards/rollout_reward_func/std": 1.5368415117263794,
"sampling/importance_sampling_ratio/max": 0.5591750144958496,
"sampling/importance_sampling_ratio/mean": 0.1981470286846161,
"sampling/importance_sampling_ratio/min": 6.880031043303685e-15,
"sampling/sampling_logp_difference/max": 2.8299753665924072,
"sampling/sampling_logp_difference/mean": 1.1136442422866821,
"step": 125,
"step_time": 8.792223250000006
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.526207149028778,
"epoch": 0.00126,
"grad_norm": 0.07942020893096924,
"kl": 0.5783120766282082,
"learning_rate": 9.999999062994716e-06,
"loss": -0.0115,
"step": 126,
"step_time": 5.212345460999586
},
{
"clip_ratio/high_max": 0.011363636702299118,
"clip_ratio/high_mean": 0.005681818351149559,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005681818351149559,
"completions/clipped_ratio": 0.0,
"completions/max_length": 751.0,
"completions/max_terminated_length": 751.0,
"completions/mean_length": 355.09375,
"completions/mean_terminated_length": 355.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.586715936660767,
"epoch": 0.00127,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01894986629486084,
"kl": 0.36187045089900494,
"learning_rate": 9.999999042056698e-06,
"loss": -0.0093,
"num_tokens": 3320961.0,
"reward": 3.5986928939819336,
"reward_std": 1.4135947227478027,
"rewards/rollout_reward_func/mean": 3.5986928939819336,
"rewards/rollout_reward_func/std": 1.7381529808044434,
"sampling/importance_sampling_ratio/max": 0.5567580461502075,
"sampling/importance_sampling_ratio/mean": 0.18505819141864777,
"sampling/importance_sampling_ratio/min": 3.734913612349951e-18,
"sampling/sampling_logp_difference/max": 3.9289333820343018,
"sampling/sampling_logp_difference/mean": 1.1177394390106201,
"step": 127,
"step_time": 8.4797745940009
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003289473708719015,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.003289473708719015,
"entropy": 5.59991979598999,
"epoch": 0.00128,
"grad_norm": 0.015152394771575928,
"kl": 0.36163298040628433,
"learning_rate": 9.99999902088732e-06,
"loss": -0.0093,
"step": 128,
"step_time": 4.677577405001102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 812.0,
"completions/max_terminated_length": 812.0,
"completions/mean_length": 428.5,
"completions/mean_terminated_length": 428.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.128687739372253,
"epoch": 0.00129,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.005386632867157459,
"kl": 0.29446713998913765,
"learning_rate": 9.999998999486583e-06,
"loss": -0.0113,
"num_tokens": 3373920.0,
"reward": 3.073700428009033,
"reward_std": 1.0783164501190186,
"rewards/rollout_reward_func/mean": 3.073700428009033,
"rewards/rollout_reward_func/std": 1.8588871955871582,
"sampling/importance_sampling_ratio/max": 0.5572874546051025,
"sampling/importance_sampling_ratio/mean": 0.1692725121974945,
"sampling/importance_sampling_ratio/min": 0.00015941473247949034,
"sampling/sampling_logp_difference/max": 2.785651922225952,
"sampling/sampling_logp_difference/mean": 1.0858612060546875,
"step": 129,
"step_time": 8.688911320999068
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.14833676815033,
"epoch": 0.0013,
"grad_norm": 0.005992444232106209,
"kl": 0.2919249450787902,
"learning_rate": 9.999998977854486e-06,
"loss": -0.0113,
"step": 130,
"step_time": 5.196040378998532
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 425.46875,
"completions/mean_terminated_length": 425.46875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.23365318775177,
"epoch": 0.00131,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.019059518352150917,
"kl": 0.34147817455232143,
"learning_rate": 9.99999895599103e-06,
"loss": -0.0035,
"num_tokens": 3427401.0,
"reward": 2.473796844482422,
"reward_std": 1.8125438690185547,
"rewards/rollout_reward_func/mean": 2.473796844482422,
"rewards/rollout_reward_func/std": 1.8101236820220947,
"sampling/importance_sampling_ratio/max": 0.5523518323898315,
"sampling/importance_sampling_ratio/mean": 0.14591509103775024,
"sampling/importance_sampling_ratio/min": 0.00020133046200498939,
"sampling/sampling_logp_difference/max": 3.7536349296569824,
"sampling/sampling_logp_difference/mean": 1.2340688705444336,
"step": 131,
"step_time": 8.596268926000448
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.220755457878113,
"epoch": 0.00132,
"grad_norm": 0.018474310636520386,
"kl": 0.3436393868178129,
"learning_rate": 9.999998933896215e-06,
"loss": -0.0035,
"step": 132,
"step_time": 5.2319554900000185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 821.0,
"completions/max_terminated_length": 821.0,
"completions/mean_length": 448.71875,
"completions/mean_terminated_length": 448.71875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.855165183544159,
"epoch": 0.00133,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.021189430728554726,
"kl": 0.4099526349455118,
"learning_rate": 9.999998911570041e-06,
"loss": -0.0112,
"num_tokens": 3482090.0,
"reward": 1.9857467412948608,
"reward_std": 0.9780662655830383,
"rewards/rollout_reward_func/mean": 1.9857467412948608,
"rewards/rollout_reward_func/std": 1.4453747272491455,
"sampling/importance_sampling_ratio/max": 0.5519795417785645,
"sampling/importance_sampling_ratio/mean": 0.14946779608726501,
"sampling/importance_sampling_ratio/min": 4.809954772476128e-20,
"sampling/sampling_logp_difference/max": 13.282112121582031,
"sampling/sampling_logp_difference/mean": 1.1622436046600342,
"step": 133,
"step_time": 8.62497339599986
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 5.8229920864105225,
"epoch": 0.00134,
"grad_norm": 0.02182932198047638,
"kl": 0.4096921207383275,
"learning_rate": 9.999998889012509e-06,
"loss": -0.0112,
"step": 134,
"step_time": 4.7461931110005935
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 731.0,
"completions/max_terminated_length": 731.0,
"completions/mean_length": 405.90625,
"completions/mean_terminated_length": 405.90625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.744675666093826,
"epoch": 0.00135,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.020258011296391487,
"kl": 0.5242082104086876,
"learning_rate": 9.999998866223617e-06,
"loss": 0.0004,
"num_tokens": 3535997.0,
"reward": 2.09535551071167,
"reward_std": 0.801947832107544,
"rewards/rollout_reward_func/mean": 2.09535551071167,
"rewards/rollout_reward_func/std": 1.5733643770217896,
"sampling/importance_sampling_ratio/max": 0.5550893545150757,
"sampling/importance_sampling_ratio/mean": 0.21865856647491455,
"sampling/importance_sampling_ratio/min": 0.0004308926872909069,
"sampling/sampling_logp_difference/max": 2.2621819972991943,
"sampling/sampling_logp_difference/mean": 0.6881621479988098,
"step": 135,
"step_time": 8.649714739999581
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0078125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 4.7144655585289,
"epoch": 0.00136,
"grad_norm": 0.017525319010019302,
"kl": 0.5250850170850754,
"learning_rate": 9.999998843203364e-06,
"loss": 0.0004,
"step": 136,
"step_time": 5.175107476999074
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 373.84375,
"completions/mean_terminated_length": 373.84375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.857703149318695,
"epoch": 0.00137,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.00757873198017478,
"kl": 0.39663390070199966,
"learning_rate": 9.999998819951753e-06,
"loss": -0.0103,
"num_tokens": 3587640.0,
"reward": 3.1773321628570557,
"reward_std": 1.37308669090271,
"rewards/rollout_reward_func/mean": 3.1773321628570557,
"rewards/rollout_reward_func/std": 1.8172297477722168,
"sampling/importance_sampling_ratio/max": 0.5551076531410217,
"sampling/importance_sampling_ratio/mean": 0.21167081594467163,
"sampling/importance_sampling_ratio/min": 0.00045508454786613584,
"sampling/sampling_logp_difference/max": 3.646714210510254,
"sampling/sampling_logp_difference/mean": 0.8012743592262268,
"step": 137,
"step_time": 8.443151505000515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.867439925670624,
"epoch": 0.00138,
"grad_norm": 0.007302064914256334,
"kl": 0.39654075540602207,
"learning_rate": 9.999998796468782e-06,
"loss": -0.0102,
"step": 138,
"step_time": 5.160483013000885
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 496.53125,
"completions/mean_terminated_length": 512.0322265625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.3140488266944885,
"epoch": 0.00139,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.047983165830373764,
"kl": 0.43244970217347145,
"learning_rate": 9.999998772754452e-06,
"loss": -0.0136,
"num_tokens": 3642802.0,
"reward": 2.342683792114258,
"reward_std": 0.9078880548477173,
"rewards/rollout_reward_func/mean": 2.342683792114258,
"rewards/rollout_reward_func/std": 1.424059510231018,
"sampling/importance_sampling_ratio/max": 0.5550379753112793,
"sampling/importance_sampling_ratio/mean": 0.21025414764881134,
"sampling/importance_sampling_ratio/min": 4.1330454277109865e-14,
"sampling/sampling_logp_difference/max": 4.175216197967529,
"sampling/sampling_logp_difference/mean": 0.9738059639930725,
"step": 139,
"step_time": 8.921468903000005
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.2954131960868835,
"epoch": 0.0014,
"grad_norm": 0.03462895750999451,
"kl": 0.43446778878569603,
"learning_rate": 9.999998748808764e-06,
"loss": -0.0138,
"step": 140,
"step_time": 4.874726669999291
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 364.84375,
"completions/mean_terminated_length": 369.3333435058594,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.6628918051719666,
"epoch": 0.00141,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.048540182411670685,
"kl": 0.37702805921435356,
"learning_rate": 9.999998724631715e-06,
"loss": -0.0215,
"num_tokens": 3694811.0,
"reward": 2.8997886180877686,
"reward_std": 1.3242307901382446,
"rewards/rollout_reward_func/mean": 2.8997886180877686,
"rewards/rollout_reward_func/std": 1.9665751457214355,
"sampling/importance_sampling_ratio/max": 0.30592989921569824,
"sampling/importance_sampling_ratio/mean": 0.13573986291885376,
"sampling/importance_sampling_ratio/min": 5.063214828144886e-15,
"sampling/sampling_logp_difference/max": 4.425807476043701,
"sampling/sampling_logp_difference/mean": 1.0662561655044556,
"step": 141,
"step_time": 8.895234876999439
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.723241686820984,
"epoch": 0.00142,
"grad_norm": 0.044111333787441254,
"kl": 0.3669211324304342,
"learning_rate": 9.999998700223308e-06,
"loss": -0.0217,
"step": 142,
"step_time": 4.655782125000314
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 388.5625,
"completions/mean_terminated_length": 388.5625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.8627594113349915,
"epoch": 0.00143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.01214392390102148,
"kl": 0.37197905220091343,
"learning_rate": 9.999998675583542e-06,
"loss": -0.0186,
"num_tokens": 3748064.0,
"reward": 2.9884138107299805,
"reward_std": 1.394155740737915,
"rewards/rollout_reward_func/mean": 2.9884138107299805,
"rewards/rollout_reward_func/std": 1.8353135585784912,
"sampling/importance_sampling_ratio/max": 0.5596773028373718,
"sampling/importance_sampling_ratio/mean": 0.16319838166236877,
"sampling/importance_sampling_ratio/min": 3.1172959769065756e-09,
"sampling/sampling_logp_difference/max": 4.036569595336914,
"sampling/sampling_logp_difference/mean": 1.0934593677520752,
"step": 143,
"step_time": 9.0844559330003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.029166667722165585,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.029166667722165585,
"entropy": 5.961900234222412,
"epoch": 0.00144,
"grad_norm": 0.0157622080296278,
"kl": 0.3777771629393101,
"learning_rate": 9.999998650712415e-06,
"loss": -0.0185,
"step": 144,
"step_time": 5.404956216000301
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 385.5625,
"completions/mean_terminated_length": 389.3000183105469,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.166502773761749,
"epoch": 0.00145,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027631990611553192,
"kl": 0.5853241086006165,
"learning_rate": 9.999998625609931e-06,
"loss": -0.0256,
"num_tokens": 3801052.0,
"reward": 2.257948398590088,
"reward_std": 1.2924938201904297,
"rewards/rollout_reward_func/mean": 2.257948398590088,
"rewards/rollout_reward_func/std": 1.759012222290039,
"sampling/importance_sampling_ratio/max": 0.5587986707687378,
"sampling/importance_sampling_ratio/mean": 0.2040739357471466,
"sampling/importance_sampling_ratio/min": 1.3630546344400862e-12,
"sampling/sampling_logp_difference/max": 3.9767651557922363,
"sampling/sampling_logp_difference/mean": 0.9805353283882141,
"step": 145,
"step_time": 9.069786099000794
},
{
"clip_ratio/high_max": 0.005681818351149559,
"clip_ratio/high_mean": 0.0028409091755747795,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"entropy": 5.183136820793152,
"epoch": 0.00146,
"grad_norm": 0.026243364438414574,
"kl": 0.5845871120691299,
"learning_rate": 9.999998600276087e-06,
"loss": -0.0256,
"step": 146,
"step_time": 4.969717237999248
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.015625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.015625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 812.0,
"completions/max_terminated_length": 812.0,
"completions/mean_length": 382.4375,
"completions/mean_terminated_length": 382.4375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.606759130954742,
"epoch": 0.00147,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.06667429953813553,
"kl": 0.5389902517199516,
"learning_rate": 9.999998574710883e-06,
"loss": -0.0173,
"num_tokens": 3853406.0,
"reward": 1.033826231956482,
"reward_std": 0.8608077764511108,
"rewards/rollout_reward_func/mean": 1.033826231956482,
"rewards/rollout_reward_func/std": 1.2542723417282104,
"sampling/importance_sampling_ratio/max": 0.3122768998146057,
"sampling/importance_sampling_ratio/mean": 0.10800905525684357,
"sampling/importance_sampling_ratio/min": 1.0712759864892608e-17,
"sampling/sampling_logp_difference/max": 4.165693283081055,
"sampling/sampling_logp_difference/mean": 1.3520691394805908,
"step": 147,
"step_time": 8.949542632000885
},
{
"clip_ratio/high_max": 0.08159722294658422,
"clip_ratio/high_mean": 0.04079861147329211,
"clip_ratio/low_mean": 0.02434501238167286,
"clip_ratio/low_min": 0.013888888992369175,
"clip_ratio/region_mean": 0.06514362338930368,
"entropy": 6.569398641586304,
"epoch": 0.00148,
"grad_norm": 0.014427587389945984,
"kl": 0.3714602068066597,
"learning_rate": 9.999998548914318e-06,
"loss": -0.0175,
"step": 148,
"step_time": 4.723479898000278
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 414.875,
"completions/mean_terminated_length": 414.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.415063202381134,
"epoch": 0.00149,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.006415513344109058,
"kl": 0.3409431576728821,
"learning_rate": 9.999998522886397e-06,
"loss": -0.0151,
"num_tokens": 3907390.0,
"reward": 2.522676467895508,
"reward_std": 1.2102174758911133,
"rewards/rollout_reward_func/mean": 2.522676467895508,
"rewards/rollout_reward_func/std": 1.6199886798858643,
"sampling/importance_sampling_ratio/max": 0.5543047189712524,
"sampling/importance_sampling_ratio/mean": 0.20426242053508759,
"sampling/importance_sampling_ratio/min": 0.0003250113222748041,
"sampling/sampling_logp_difference/max": 2.426882266998291,
"sampling/sampling_logp_difference/mean": 0.9172008037567139,
"step": 149,
"step_time": 9.147840422000172
},
{
"clip_ratio/high_max": 0.1197916679084301,
"clip_ratio/high_mean": 0.05989583395421505,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.05989583395421505,
"entropy": 5.369048357009888,
"epoch": 0.0015,
"grad_norm": 0.009035247378051281,
"kl": 0.3433597218245268,
"learning_rate": 9.999998496627115e-06,
"loss": -0.0151,
"step": 150,
"step_time": 4.844408977000057
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 462.09375,
"completions/mean_terminated_length": 462.09375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.271999478340149,
"epoch": 0.00151,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009318442083895206,
"kl": 0.2838898357003927,
"learning_rate": 9.999998470136475e-06,
"loss": -0.0107,
"num_tokens": 3963989.0,
"reward": 2.4085025787353516,
"reward_std": 1.5739270448684692,
"rewards/rollout_reward_func/mean": 2.4085025787353516,
"rewards/rollout_reward_func/std": 2.0519044399261475,
"sampling/importance_sampling_ratio/max": 0.3041028082370758,
"sampling/importance_sampling_ratio/mean": 0.10673074424266815,
"sampling/importance_sampling_ratio/min": 3.432775631405483e-20,
"sampling/sampling_logp_difference/max": 3.6299948692321777,
"sampling/sampling_logp_difference/mean": 1.236303448677063,
"step": 151,
"step_time": 8.581440807000035
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.249309480190277,
"epoch": 0.00152,
"grad_norm": 0.009570055641233921,
"kl": 0.2857303377240896,
"learning_rate": 9.999998443414474e-06,
"loss": -0.0106,
"step": 152,
"step_time": 4.792721901000277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 661.0,
"completions/max_terminated_length": 661.0,
"completions/mean_length": 274.21875,
"completions/mean_terminated_length": 274.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.317317992448807,
"epoch": 0.00153,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.011901522986590862,
"kl": 0.47474728897213936,
"learning_rate": 9.999998416461115e-06,
"loss": -0.0159,
"num_tokens": 4010768.0,
"reward": 3.833049774169922,
"reward_std": 1.0186306238174438,
"rewards/rollout_reward_func/mean": 3.833049774169922,
"rewards/rollout_reward_func/std": 1.4386236667633057,
"sampling/importance_sampling_ratio/max": 0.5543802380561829,
"sampling/importance_sampling_ratio/mean": 0.2531720995903015,
"sampling/importance_sampling_ratio/min": 1.1614738034196326e-13,
"sampling/sampling_logp_difference/max": 4.201288223266602,
"sampling/sampling_logp_difference/mean": 1.0024373531341553,
"step": 153,
"step_time": 8.65931397399936
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.322837769985199,
"epoch": 0.00154,
"grad_norm": 0.011569921858608723,
"kl": 0.4757602885365486,
"learning_rate": 9.999998389276397e-06,
"loss": -0.016,
"step": 154,
"step_time": 4.628009893999661
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 608.03125,
"completions/mean_terminated_length": 608.03125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.3297582268714905,
"epoch": 0.00155,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012206662446260452,
"kl": 0.3360460437834263,
"learning_rate": 9.999998361860319e-06,
"loss": -0.0247,
"num_tokens": 4070767.0,
"reward": 1.6205885410308838,
"reward_std": 0.9136905670166016,
"rewards/rollout_reward_func/mean": 1.6205885410308838,
"rewards/rollout_reward_func/std": 1.4889774322509766,
"sampling/importance_sampling_ratio/max": 0.307391494512558,
"sampling/importance_sampling_ratio/mean": 0.12577039003372192,
"sampling/importance_sampling_ratio/min": 3.3397945867208456e-12,
"sampling/sampling_logp_difference/max": 3.970939874649048,
"sampling/sampling_logp_difference/mean": 1.1507443189620972,
"step": 155,
"step_time": 9.421827424999265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.354883968830109,
"epoch": 0.00156,
"grad_norm": 0.011644980870187283,
"kl": 0.3347685132175684,
"learning_rate": 9.99999833421288e-06,
"loss": -0.0247,
"step": 156,
"step_time": 4.989567845999318
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0034722222480922937,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0034722222480922937,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 848.0,
"completions/max_terminated_length": 848.0,
"completions/mean_length": 306.9375,
"completions/mean_terminated_length": 316.32257080078125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.295707732439041,
"epoch": 0.00157,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.006841656286269426,
"kl": 0.45711028575897217,
"learning_rate": 9.999998306334084e-06,
"loss": 0.0064,
"num_tokens": 4119100.0,
"reward": 3.2127416133880615,
"reward_std": 0.8213455677032471,
"rewards/rollout_reward_func/mean": 3.2127416133880615,
"rewards/rollout_reward_func/std": 1.3526753187179565,
"sampling/importance_sampling_ratio/max": 0.5526928901672363,
"sampling/importance_sampling_ratio/mean": 0.20098578929901123,
"sampling/importance_sampling_ratio/min": 6.629302141958338e-13,
"sampling/sampling_logp_difference/max": 3.455430030822754,
"sampling/sampling_logp_difference/mean": 1.1175944805145264,
"step": 157,
"step_time": 8.230952549999529
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.352945655584335,
"epoch": 0.00158,
"grad_norm": 0.0065981000661849976,
"kl": 0.4474712498486042,
"learning_rate": 9.99999827822393e-06,
"loss": 0.0064,
"step": 158,
"step_time": 5.171249369000179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 731.0,
"completions/max_terminated_length": 731.0,
"completions/mean_length": 282.75,
"completions/mean_terminated_length": 282.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.617429673671722,
"epoch": 0.00159,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.012413682416081429,
"kl": 0.44416868314146996,
"learning_rate": 9.999998249882414e-06,
"loss": -0.0191,
"num_tokens": 4166080.0,
"reward": 2.787449836730957,
"reward_std": 1.4347116947174072,
"rewards/rollout_reward_func/mean": 2.787449836730957,
"rewards/rollout_reward_func/std": 1.7297974824905396,
"sampling/importance_sampling_ratio/max": 0.5567501187324524,
"sampling/importance_sampling_ratio/mean": 0.2343512773513794,
"sampling/importance_sampling_ratio/min": 3.580179833845965e-16,
"sampling/sampling_logp_difference/max": 3.5624475479125977,
"sampling/sampling_logp_difference/mean": 1.0345864295959473,
"step": 159,
"step_time": 8.048230264000267
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.655277848243713,
"epoch": 0.0016,
"grad_norm": 0.011771933175623417,
"kl": 0.4446682333946228,
"learning_rate": 9.999998221309542e-06,
"loss": -0.0191,
"step": 160,
"step_time": 4.566467165000631
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 649.0,
"completions/max_terminated_length": 649.0,
"completions/mean_length": 233.40625,
"completions/mean_terminated_length": 233.40625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.265278160572052,
"epoch": 0.00161,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.013585682958364487,
"kl": 0.4800906181335449,
"learning_rate": 9.999998192505309e-06,
"loss": -0.0153,
"num_tokens": 4211781.0,
"reward": 2.0160932540893555,
"reward_std": 1.3408175706863403,
"rewards/rollout_reward_func/mean": 2.0160932540893555,
"rewards/rollout_reward_func/std": 1.7153640985488892,
"sampling/importance_sampling_ratio/max": 0.5554280281066895,
"sampling/importance_sampling_ratio/mean": 0.18753069639205933,
"sampling/importance_sampling_ratio/min": 8.604307595305727e-07,
"sampling/sampling_logp_difference/max": 4.6766486167907715,
"sampling/sampling_logp_difference/mean": 1.2480790615081787,
"step": 161,
"step_time": 8.229338431000087
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.281177341938019,
"epoch": 0.00162,
"grad_norm": 0.012755308300256729,
"kl": 0.4964945949614048,
"learning_rate": 9.999998163469716e-06,
"loss": -0.0154,
"step": 162,
"step_time": 4.38510970399966
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 557.375,
"completions/mean_terminated_length": 552.1290283203125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.4221086502075195,
"epoch": 0.00163,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007355245761573315,
"kl": 0.330900888890028,
"learning_rate": 9.999998134202764e-06,
"loss": -0.0155,
"num_tokens": 4269709.0,
"reward": 2.1687724590301514,
"reward_std": 1.417752981185913,
"rewards/rollout_reward_func/mean": 2.1687724590301514,
"rewards/rollout_reward_func/std": 1.9265196323394775,
"sampling/importance_sampling_ratio/max": 0.30897635221481323,
"sampling/importance_sampling_ratio/mean": 0.12372960150241852,
"sampling/importance_sampling_ratio/min": 2.4737660090288395e-17,
"sampling/sampling_logp_difference/max": 3.5442941188812256,
"sampling/sampling_logp_difference/mean": 1.2593567371368408,
"step": 163,
"step_time": 8.674119632999918
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.411408066749573,
"epoch": 0.00164,
"grad_norm": 0.006261439062654972,
"kl": 0.32452805899083614,
"learning_rate": 9.999998104704453e-06,
"loss": -0.0155,
"step": 164,
"step_time": 5.15820460599889
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 805.0,
"completions/max_terminated_length": 805.0,
"completions/mean_length": 324.21875,
"completions/mean_terminated_length": 324.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.876501500606537,
"epoch": 0.00165,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010572957806289196,
"kl": 0.314052717294544,
"learning_rate": 9.999998074974785e-06,
"loss": -0.0065,
"num_tokens": 4317340.0,
"reward": 2.824183702468872,
"reward_std": 1.7697460651397705,
"rewards/rollout_reward_func/mean": 2.824183702468872,
"rewards/rollout_reward_func/std": 1.7482030391693115,
"sampling/importance_sampling_ratio/max": 0.5577415227890015,
"sampling/importance_sampling_ratio/mean": 0.18741005659103394,
"sampling/importance_sampling_ratio/min": 0.0003228384011890739,
"sampling/sampling_logp_difference/max": 2.3370163440704346,
"sampling/sampling_logp_difference/mean": 1.2416086196899414,
"step": 165,
"step_time": 8.260381059999418
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.816007494926453,
"epoch": 0.00166,
"grad_norm": 0.010662592947483063,
"kl": 0.31538827205076814,
"learning_rate": 9.999998045013754e-06,
"loss": -0.0065,
"step": 166,
"step_time": 4.7127935759995125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 546.21875,
"completions/mean_terminated_length": 546.21875,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"entropy": 4.806870490312576,
"epoch": 0.00167,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.00802002102136612,
"kl": 0.3522371258586645,
"learning_rate": 9.999998014821366e-06,
"loss": -0.0125,
"num_tokens": 4376707.0,
"reward": 4.180271148681641,
"reward_std": 1.340552568435669,
"rewards/rollout_reward_func/mean": 4.180271148681641,
"rewards/rollout_reward_func/std": 1.3307310342788696,
"sampling/importance_sampling_ratio/max": 0.31147125363349915,
"sampling/importance_sampling_ratio/mean": 0.18969742953777313,
"sampling/importance_sampling_ratio/min": 2.4195236403606748e-17,
"sampling/sampling_logp_difference/max": 3.992608070373535,
"sampling/sampling_logp_difference/mean": 0.8682562112808228,
"step": 167,
"step_time": 9.083963717000188
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 4.7781093418598175,
"epoch": 0.00168,
"grad_norm": 0.008071971125900745,
"kl": 0.3539010286331177,
"learning_rate": 9.999997984397618e-06,
"loss": -0.0125,
"step": 168,
"step_time": 4.795684135000101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 740.0,
"completions/max_terminated_length": 740.0,
"completions/mean_length": 399.5,
"completions/mean_terminated_length": 399.5,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.353841185569763,
"epoch": 0.00169,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.010215525515377522,
"kl": 0.4602816812694073,
"learning_rate": 9.999997953742511e-06,
"loss": -0.0044,
"num_tokens": 4427943.0,
"reward": 2.107025146484375,
"reward_std": 1.3496294021606445,
"rewards/rollout_reward_func/mean": 2.107025146484375,
"rewards/rollout_reward_func/std": 1.8857003450393677,
"sampling/importance_sampling_ratio/max": 0.5592086315155029,
"sampling/importance_sampling_ratio/mean": 0.20410695672035217,
"sampling/importance_sampling_ratio/min": 3.2384990522604795e-11,
"sampling/sampling_logp_difference/max": 3.561155080795288,
"sampling/sampling_logp_difference/mean": 0.9430296421051025,
"step": 169,
"step_time": 8.44894137799929
},
{
"clip_ratio/high_max": 0.02083333395421505,
"clip_ratio/high_mean": 0.010416666977107525,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.303603112697601,
"epoch": 0.0017,
"grad_norm": 0.010156241245567799,
"kl": 0.4662858620285988,
"learning_rate": 9.999997922856044e-06,
"loss": -0.0044,
"step": 170,
"step_time": 5.110951400999511
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 733.0,
"completions/max_terminated_length": 733.0,
"completions/mean_length": 281.78125,
"completions/mean_terminated_length": 281.78125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.283334970474243,
"epoch": 0.00171,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014388482086360455,
"kl": 0.5276032239198685,
"learning_rate": 9.999997891738219e-06,
"loss": -0.0148,
"num_tokens": 4475664.0,
"reward": 3.1777913570404053,
"reward_std": 0.9462900757789612,
"rewards/rollout_reward_func/mean": 3.1777913570404053,
"rewards/rollout_reward_func/std": 1.581575870513916,
"sampling/importance_sampling_ratio/max": 0.5628533959388733,
"sampling/importance_sampling_ratio/mean": 0.24305231869220734,
"sampling/importance_sampling_ratio/min": 0.0005128039629198611,
"sampling/sampling_logp_difference/max": 2.273298501968384,
"sampling/sampling_logp_difference/mean": 0.8266535997390747,
"step": 171,
"step_time": 8.091583472000366
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.257599115371704,
"epoch": 0.00172,
"grad_norm": 0.014056864194571972,
"kl": 0.5337305925786495,
"learning_rate": 9.999997860389035e-06,
"loss": -0.0148,
"step": 172,
"step_time": 4.970710163999684
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 814.0,
"completions/max_terminated_length": 814.0,
"completions/mean_length": 548.75,
"completions/mean_terminated_length": 548.75,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.598077058792114,
"epoch": 0.00173,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007702630013227463,
"kl": 0.26329412683844566,
"learning_rate": 9.99999782880849e-06,
"loss": -0.0126,
"num_tokens": 4534968.0,
"reward": 3.599264144897461,
"reward_std": 1.2832717895507812,
"rewards/rollout_reward_func/mean": 3.599264144897461,
"rewards/rollout_reward_func/std": 1.7741010189056396,
"sampling/importance_sampling_ratio/max": 0.3131314814090729,
"sampling/importance_sampling_ratio/mean": 0.15425240993499756,
"sampling/importance_sampling_ratio/min": 0.0001230030320584774,
"sampling/sampling_logp_difference/max": 3.5723962783813477,
"sampling/sampling_logp_difference/mean": 0.9245375394821167,
"step": 173,
"step_time": 8.712299219000215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.617673218250275,
"epoch": 0.00174,
"grad_norm": 0.007035167887806892,
"kl": 0.2626843862235546,
"learning_rate": 9.999997796996588e-06,
"loss": -0.0126,
"step": 174,
"step_time": 4.817434919999414
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 866.0,
"completions/max_terminated_length": 866.0,
"completions/mean_length": 611.65625,
"completions/mean_terminated_length": 611.65625,
"completions/min_length": 18.0,
"completions/min_terminated_length": 18.0,
"entropy": 5.546254634857178,
"epoch": 0.00175,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007665017154067755,
"kl": 0.2765345424413681,
"learning_rate": 9.999997764953326e-06,
"loss": -0.002,
"num_tokens": 4596111.0,
"reward": 3.0970826148986816,
"reward_std": 1.2895809412002563,
"rewards/rollout_reward_func/mean": 3.0970826148986816,
"rewards/rollout_reward_func/std": 1.63455331325531,
"sampling/importance_sampling_ratio/max": 0.31010711193084717,
"sampling/importance_sampling_ratio/mean": 0.14346721768379211,
"sampling/importance_sampling_ratio/min": 2.920134970419719e-20,
"sampling/sampling_logp_difference/max": 14.014349937438965,
"sampling/sampling_logp_difference/mean": 1.0683830976486206,
"step": 175,
"step_time": 9.429678833000253
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.569147884845734,
"epoch": 0.00176,
"grad_norm": 0.007943429052829742,
"kl": 0.27362857572734356,
"learning_rate": 9.999997732678706e-06,
"loss": -0.002,
"step": 176,
"step_time": 4.9659347020001405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 595.0,
"completions/max_terminated_length": 595.0,
"completions/mean_length": 435.15625,
"completions/mean_terminated_length": 432.58062744140625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.725856065750122,
"epoch": 0.00177,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.009265062399208546,
"kl": 0.3029242120683193,
"learning_rate": 9.999997700172724e-06,
"loss": -0.0202,
"num_tokens": 4651072.0,
"reward": 3.301889181137085,
"reward_std": 1.8498287200927734,
"rewards/rollout_reward_func/mean": 3.301889181137085,
"rewards/rollout_reward_func/std": 2.1379148960113525,
"sampling/importance_sampling_ratio/max": 0.3099573254585266,
"sampling/importance_sampling_ratio/mean": 0.14696773886680603,
"sampling/importance_sampling_ratio/min": 5.326042341532999e-13,
"sampling/sampling_logp_difference/max": 4.241425037384033,
"sampling/sampling_logp_difference/mean": 1.0613305568695068,
"step": 177,
"step_time": 8.000326000999848
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.7441646456718445,
"epoch": 0.00178,
"grad_norm": 0.009017222560942173,
"kl": 0.2993904184550047,
"learning_rate": 9.999997667435383e-06,
"loss": -0.0202,
"step": 178,
"step_time": 4.911030336000749
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 821.0,
"completions/max_terminated_length": 821.0,
"completions/mean_length": 627.59375,
"completions/mean_terminated_length": 622.8709716796875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.733899414539337,
"epoch": 0.00179,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.015371856279671192,
"kl": 0.39605566393584013,
"learning_rate": 9.999997634466684e-06,
"loss": -0.0028,
"num_tokens": 4713163.0,
"reward": 2.408566951751709,
"reward_std": 1.2345832586288452,
"rewards/rollout_reward_func/mean": 2.408566951751709,
"rewards/rollout_reward_func/std": 1.5583417415618896,
"sampling/importance_sampling_ratio/max": 0.3045631945133209,
"sampling/importance_sampling_ratio/mean": 0.14088614284992218,
"sampling/importance_sampling_ratio/min": 3.260920675229406e-17,
"sampling/sampling_logp_difference/max": 12.893571853637695,
"sampling/sampling_logp_difference/mean": 1.2033199071884155,
"step": 179,
"step_time": 9.336971251999785
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.755895584821701,
"epoch": 0.0018,
"grad_norm": 0.013228918425738811,
"kl": 0.38902273029088974,
"learning_rate": 9.999997601266627e-06,
"loss": -0.0029,
"step": 180,
"step_time": 4.989180980000128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 803.0,
"completions/max_terminated_length": 803.0,
"completions/mean_length": 591.15625,
"completions/mean_terminated_length": 591.15625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.095756113529205,
"epoch": 0.00181,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007322733756154776,
"kl": 0.3757977671921253,
"learning_rate": 9.999997567835209e-06,
"loss": -0.0162,
"num_tokens": 4774032.0,
"reward": 2.0675711631774902,
"reward_std": 0.6423474550247192,
"rewards/rollout_reward_func/mean": 2.0675711631774902,
"rewards/rollout_reward_func/std": 1.5976166725158691,
"sampling/importance_sampling_ratio/max": 0.30949416756629944,
"sampling/importance_sampling_ratio/mean": 0.1696314811706543,
"sampling/importance_sampling_ratio/min": 0.0006361076375469565,
"sampling/sampling_logp_difference/max": 3.4188907146453857,
"sampling/sampling_logp_difference/mean": 0.8557475805282593,
"step": 181,
"step_time": 9.625972572999672
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.120969295501709,
"epoch": 0.00182,
"grad_norm": 0.006717793643474579,
"kl": 0.3749655243009329,
"learning_rate": 9.999997534172434e-06,
"loss": -0.0162,
"step": 182,
"step_time": 4.893760320000183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 662.0,
"completions/max_terminated_length": 662.0,
"completions/mean_length": 281.8125,
"completions/mean_terminated_length": 281.8125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.411740601062775,
"epoch": 0.00183,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.01275318581610918,
"kl": 0.4849829040467739,
"learning_rate": 9.999997500278298e-06,
"loss": -0.012,
"num_tokens": 4821524.0,
"reward": 3.822073459625244,
"reward_std": 1.2214256525039673,
"rewards/rollout_reward_func/mean": 3.822073459625244,
"rewards/rollout_reward_func/std": 1.457135558128357,
"sampling/importance_sampling_ratio/max": 0.5602318644523621,
"sampling/importance_sampling_ratio/mean": 0.2258593738079071,
"sampling/importance_sampling_ratio/min": 8.38409050629707e-06,
"sampling/sampling_logp_difference/max": 3.740847587585449,
"sampling/sampling_logp_difference/mean": 0.9152244925498962,
"step": 183,
"step_time": 7.8192755909999505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 5.4009730219841,
"epoch": 0.00184,
"grad_norm": 0.012373281642794609,
"kl": 0.49285488575696945,
"learning_rate": 9.999997466152803e-06,
"loss": -0.012,
"step": 184,
"step_time": 4.937281341000016
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 324.21875,
"completions/mean_terminated_length": 324.21875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.895534723997116,
"epoch": 0.00185,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.12174241244792938,
"kl": 1.1918272729963064,
"learning_rate": 9.999997431795949e-06,
"loss": -0.0079,
"num_tokens": 4872521.0,
"reward": 2.4634082317352295,
"reward_std": 0.7030278444290161,
"rewards/rollout_reward_func/mean": 2.4634082317352295,
"rewards/rollout_reward_func/std": 1.6452993154525757,
"sampling/importance_sampling_ratio/max": 0.5586066246032715,
"sampling/importance_sampling_ratio/mean": 0.22821499407291412,
"sampling/importance_sampling_ratio/min": 2.045808008557789e-24,
"sampling/sampling_logp_difference/max": 11.78732967376709,
"sampling/sampling_logp_difference/mean": 1.2645323276519775,
"step": 185,
"step_time": 8.363861161000386
},
{
"clip_ratio/high_max": 0.021875000093132257,
"clip_ratio/high_mean": 0.010937500046566129,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010937500046566129,
"entropy": 5.90417143702507,
"epoch": 0.00186,
"grad_norm": 0.06167863681912422,
"kl": 0.7666090168058872,
"learning_rate": 9.999997397207736e-06,
"loss": -0.0089,
"step": 186,
"step_time": 4.719343276000018
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 767.0,
"completions/max_terminated_length": 767.0,
"completions/mean_length": 563.84375,
"completions/mean_terminated_length": 581.51611328125,
"completions/min_length": 16.0,
"completions/min_terminated_length": 312.0,
"entropy": 4.6642705500125885,
"epoch": 0.00187,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011750214733183384,
"kl": 0.43587249517440796,
"learning_rate": 9.999997362388163e-06,
"loss": -0.0142,
"num_tokens": 4933144.0,
"reward": 2.8686366081237793,
"reward_std": 1.2804789543151855,
"rewards/rollout_reward_func/mean": 2.8686366081237793,
"rewards/rollout_reward_func/std": 1.7704367637634277,
"sampling/importance_sampling_ratio/max": 0.3205587863922119,
"sampling/importance_sampling_ratio/mean": 0.20105448365211487,
"sampling/importance_sampling_ratio/min": 1.161242035863097e-08,
"sampling/sampling_logp_difference/max": 3.013947010040283,
"sampling/sampling_logp_difference/mean": 0.8054625988006592,
"step": 187,
"step_time": 9.18670280699962
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.6675141751766205,
"epoch": 0.00188,
"grad_norm": 0.009789633564651012,
"kl": 0.4357186071574688,
"learning_rate": 9.999997327337232e-06,
"loss": -0.0142,
"step": 188,
"step_time": 4.775712918999034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 796.0,
"completions/max_terminated_length": 796.0,
"completions/mean_length": 487.1875,
"completions/mean_terminated_length": 487.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.183352828025818,
"epoch": 0.00189,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.007217842619866133,
"kl": 0.36203115805983543,
"learning_rate": 9.99999729205494e-06,
"loss": -0.0087,
"num_tokens": 4991196.0,
"reward": 2.807063102722168,
"reward_std": 1.7962517738342285,
"rewards/rollout_reward_func/mean": 2.807063102722168,
"rewards/rollout_reward_func/std": 2.033599853515625,
"sampling/importance_sampling_ratio/max": 0.3048250079154968,
"sampling/importance_sampling_ratio/mean": 0.13251593708992004,
"sampling/importance_sampling_ratio/min": 0.00014422468666452914,
"sampling/sampling_logp_difference/max": 2.6139421463012695,
"sampling/sampling_logp_difference/mean": 1.1694201231002808,
"step": 189,
"step_time": 8.603061843000887
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.190056622028351,
"epoch": 0.0019,
"grad_norm": 0.0067920200526714325,
"kl": 0.3644371014088392,
"learning_rate": 9.99999725654129e-06,
"loss": -0.0087,
"step": 190,
"step_time": 5.3103096270001515
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 857.0,
"completions/max_terminated_length": 857.0,
"completions/mean_length": 593.125,
"completions/mean_terminated_length": 585.0967407226562,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.34462833404541,
"epoch": 0.00191,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.0056811366230249405,
"kl": 0.34313227608799934,
"learning_rate": 9.999997220796281e-06,
"loss": -0.0185,
"num_tokens": 5052550.0,
"reward": 3.0586280822753906,
"reward_std": 1.8891185522079468,
"rewards/rollout_reward_func/mean": 3.0586280822753906,
"rewards/rollout_reward_func/std": 1.9580395221710205,
"sampling/importance_sampling_ratio/max": 0.3123818635940552,
"sampling/importance_sampling_ratio/mean": 0.17983976006507874,
"sampling/importance_sampling_ratio/min": 1.8176118621212876e-21,
"sampling/sampling_logp_difference/max": 3.9357526302337646,
"sampling/sampling_logp_difference/mean": 1.0673115253448486,
"step": 191,
"step_time": 9.136948834000577
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.350877106189728,
"epoch": 0.00192,
"grad_norm": 0.006298670079559088,
"kl": 0.3456762544810772,
"learning_rate": 9.999997184819913e-06,
"loss": -0.0185,
"step": 192,
"step_time": 5.421018731999538
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 805.0,
"completions/max_terminated_length": 805.0,
"completions/mean_length": 463.0625,
"completions/mean_terminated_length": 463.0625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.9372279047966,
"epoch": 0.00193,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.006436275318264961,
"kl": 0.4286567382514477,
"learning_rate": 9.999997148612186e-06,
"loss": -0.0093,
"num_tokens": 5107904.0,
"reward": 2.5031309127807617,
"reward_std": 0.7777051329612732,
"rewards/rollout_reward_func/mean": 2.5031309127807617,
"rewards/rollout_reward_func/std": 1.5234891176223755,
"sampling/importance_sampling_ratio/max": 0.556627631187439,
"sampling/importance_sampling_ratio/mean": 0.24726936221122742,
"sampling/importance_sampling_ratio/min": 1.4663258658697762e-21,
"sampling/sampling_logp_difference/max": 11.847373008728027,
"sampling/sampling_logp_difference/mean": 1.019608974456787,
"step": 193,
"step_time": 8.926773718000277
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.932731568813324,
"epoch": 0.00194,
"grad_norm": 0.006187028717249632,
"kl": 0.42864546552300453,
"learning_rate": 9.9999971121731e-06,
"loss": -0.0093,
"step": 194,
"step_time": 4.810186862000137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 796.0,
"completions/max_terminated_length": 796.0,
"completions/mean_length": 289.0,
"completions/mean_terminated_length": 289.0,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.024335443973541,
"epoch": 0.00195,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014793330803513527,
"kl": 0.5311180464923382,
"learning_rate": 9.999997075502653e-06,
"loss": -0.0,
"num_tokens": 5153943.0,
"reward": 2.680002212524414,
"reward_std": 1.0660200119018555,
"rewards/rollout_reward_func/mean": 2.680002212524414,
"rewards/rollout_reward_func/std": 1.608717441558838,
"sampling/importance_sampling_ratio/max": 0.556300938129425,
"sampling/importance_sampling_ratio/mean": 0.1869335174560547,
"sampling/importance_sampling_ratio/min": 8.39509803904695e-14,
"sampling/sampling_logp_difference/max": 3.1887903213500977,
"sampling/sampling_logp_difference/mean": 1.1006312370300293,
"step": 195,
"step_time": 8.258161678000306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.993339031934738,
"epoch": 0.00196,
"grad_norm": 0.01399671845138073,
"kl": 0.5270919986069202,
"learning_rate": 9.999997038600848e-06,
"loss": -0.0,
"step": 196,
"step_time": 5.157405102000212
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 810.0,
"completions/max_terminated_length": 776.0,
"completions/mean_length": 596.125,
"completions/mean_terminated_length": 586.7333374023438,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.484014600515366,
"epoch": 0.00197,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011650401167571545,
"kl": 0.25742718297988176,
"learning_rate": 9.999997001467682e-06,
"loss": -0.0218,
"num_tokens": 5214471.0,
"reward": 2.370987892150879,
"reward_std": 1.2418047189712524,
"rewards/rollout_reward_func/mean": 2.370987892150879,
"rewards/rollout_reward_func/std": 1.7970783710479736,
"sampling/importance_sampling_ratio/max": 0.30147287249565125,
"sampling/importance_sampling_ratio/mean": 0.12158507108688354,
"sampling/importance_sampling_ratio/min": 1.8635179788931512e-17,
"sampling/sampling_logp_difference/max": 4.204184055328369,
"sampling/sampling_logp_difference/mean": 1.3044867515563965,
"step": 197,
"step_time": 9.104788559999633
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.468224465847015,
"epoch": 0.00198,
"grad_norm": 0.011098474264144897,
"kl": 0.2528890473768115,
"learning_rate": 9.99999696410316e-06,
"loss": -0.0218,
"step": 198,
"step_time": 5.2834800850005195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 742.0,
"completions/max_terminated_length": 742.0,
"completions/mean_length": 425.25,
"completions/mean_terminated_length": 425.25,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.416600346565247,
"epoch": 0.00199,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.6569156646728516,
"kl": 1.3255293928086758,
"learning_rate": 9.999996926507279e-06,
"loss": -0.0115,
"num_tokens": 5267520.0,
"reward": 3.124060869216919,
"reward_std": 1.0585153102874756,
"rewards/rollout_reward_func/mean": 3.124060869216919,
"rewards/rollout_reward_func/std": 1.6291773319244385,
"sampling/importance_sampling_ratio/max": 0.5548077821731567,
"sampling/importance_sampling_ratio/mean": 0.2113969922065735,
"sampling/importance_sampling_ratio/min": 0.0004637441597878933,
"sampling/sampling_logp_difference/max": 2.2059574127197266,
"sampling/sampling_logp_difference/mean": 0.9068723320960999,
"step": 199,
"step_time": 8.44307382299985
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 5.432026922702789,
"epoch": 0.002,
"grad_norm": 0.008453106507658958,
"kl": 0.38647904247045517,
"learning_rate": 9.999996888680038e-06,
"loss": -0.014,
"step": 200,
"step_time": 4.681182854999861
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 712.0,
"completions/max_terminated_length": 712.0,
"completions/mean_length": 360.5625,
"completions/mean_terminated_length": 371.6773986816406,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.117161571979523,
"epoch": 0.00201,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.01611809805035591,
"kl": 0.4253620970994234,
"learning_rate": 9.999996850621436e-06,
"loss": -0.0152,
"num_tokens": 5318093.0,
"reward": 2.912040948867798,
"reward_std": 0.7651806473731995,
"rewards/rollout_reward_func/mean": 2.912040948867798,
"rewards/rollout_reward_func/std": 1.8777114152908325,
"sampling/importance_sampling_ratio/max": 0.556606113910675,
"sampling/importance_sampling_ratio/mean": 0.21097299456596375,
"sampling/importance_sampling_ratio/min": 2.815273847378563e-10,
"sampling/sampling_logp_difference/max": 4.575296401977539,
"sampling/sampling_logp_difference/mean": 0.9073153734207153,
"step": 201,
"step_time": 8.590784671000165
},
{
"clip_ratio/high_max": 0.015625,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"entropy": 5.127772510051727,
"epoch": 0.00202,
"grad_norm": 0.01754254475235939,
"kl": 0.42357040755450726,
"learning_rate": 9.999996812331476e-06,
"loss": -0.0152,
"step": 202,
"step_time": 4.51354878300026
},
{
"clip_ratio/high_max": 0.022727273404598236,
"clip_ratio/high_mean": 0.011363636702299118,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.011363636702299118,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 823.0,
"completions/max_terminated_length": 823.0,
"completions/mean_length": 246.34375,
"completions/mean_terminated_length": 251.36668395996094,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.282517075538635,
"epoch": 0.00203,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.014741458930075169,
"kl": 0.41361628845334053,
"learning_rate": 9.999996773810157e-06,
"loss": -0.009,
"num_tokens": 5363579.0,
"reward": 2.7251787185668945,
"reward_std": 0.8450720310211182,
"rewards/rollout_reward_func/mean": 2.7251787185668945,
"rewards/rollout_reward_func/std": 1.7295880317687988,
"sampling/importance_sampling_ratio/max": 0.5568343997001648,
"sampling/importance_sampling_ratio/mean": 0.1812913417816162,
"sampling/importance_sampling_ratio/min": 9.55414493809214e-19,
"sampling/sampling_logp_difference/max": 12.835467338562012,
"sampling/sampling_logp_difference/mean": 1.3004571199417114,
"step": 203,
"step_time": 8.776359550999132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.294387757778168,
"epoch": 0.00204,
"grad_norm": 0.018707161769270897,
"kl": 0.41007015481591225,
"learning_rate": 9.99999673505748e-06,
"loss": -0.0089,
"step": 204,
"step_time": 5.04047468599947
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 712.0,
"completions/max_terminated_length": 712.0,
"completions/mean_length": 226.71875,
"completions/mean_terminated_length": 233.51612854003906,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.579507499933243,
"epoch": 0.00205,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.015186581760644913,
"kl": 0.4859627615660429,
"learning_rate": 9.999996696073441e-06,
"loss": -0.0153,
"num_tokens": 5407892.0,
"reward": 3.182164192199707,
"reward_std": 1.4856499433517456,
"rewards/rollout_reward_func/mean": 3.182164192199707,
"rewards/rollout_reward_func/std": 1.7880464792251587,
"sampling/importance_sampling_ratio/max": 0.5547261238098145,
"sampling/importance_sampling_ratio/mean": 0.23533114790916443,
"sampling/importance_sampling_ratio/min": 1.2176733044458061e-14,
"sampling/sampling_logp_difference/max": 5.080104351043701,
"sampling/sampling_logp_difference/mean": 1.195305347442627,
"step": 205,
"step_time": 8.019790785999703
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0024999999441206455,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0024999999441206455,
"entropy": 5.567036747932434,
"epoch": 0.00206,
"grad_norm": 0.015953045338392258,
"kl": 0.4858721327036619,
"learning_rate": 9.999996656858045e-06,
"loss": -0.0153,
"step": 206,
"step_time": 4.494372066999858
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 694.0,
"completions/max_terminated_length": 694.0,
"completions/mean_length": 257.25,
"completions/mean_terminated_length": 265.0322570800781,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.651438236236572,
"epoch": 0.00207,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.02735012210905552,
"kl": 0.4778033494949341,
"learning_rate": 9.99999661741129e-06,
"loss": -0.0107,
"num_tokens": 5454600.0,
"reward": 3.8261237144470215,
"reward_std": 1.0173990726470947,
"rewards/rollout_reward_func/mean": 3.8261237144470215,
"rewards/rollout_reward_func/std": 1.3678010702133179,
"sampling/importance_sampling_ratio/max": 0.5530157089233398,
"sampling/importance_sampling_ratio/mean": 0.22838255763053894,
"sampling/importance_sampling_ratio/min": 6.239156959964021e-07,
"sampling/sampling_logp_difference/max": 3.2890069484710693,
"sampling/sampling_logp_difference/mean": 0.932339072227478,
"step": 207,
"step_time": 8.438589544000479
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.5857046246528625,
"epoch": 0.00208,
"grad_norm": 0.026330100372433662,
"kl": 0.4851069003343582,
"learning_rate": 9.999996577733175e-06,
"loss": -0.0107,
"step": 208,
"step_time": 4.460181868000291
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 830.0,
"completions/max_terminated_length": 830.0,
"completions/mean_length": 397.125,
"completions/mean_terminated_length": 397.125,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.61989688873291,
"epoch": 0.00209,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.03519469127058983,
"kl": 0.4516414776444435,
"learning_rate": 9.9999965378237e-06,
"loss": -0.0089,
"num_tokens": 5507788.0,
"reward": 2.7961456775665283,
"reward_std": 1.177910566329956,
"rewards/rollout_reward_func/mean": 2.7961456775665283,
"rewards/rollout_reward_func/std": 1.9247767925262451,
"sampling/importance_sampling_ratio/max": 0.5473094582557678,
"sampling/importance_sampling_ratio/mean": 0.18874725699424744,
"sampling/importance_sampling_ratio/min": 0.00020230493100825697,
"sampling/sampling_logp_difference/max": 2.5100324153900146,
"sampling/sampling_logp_difference/mean": 0.9597841501235962,
"step": 209,
"step_time": 9.165979287000027
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.536510765552521,
"epoch": 0.0021,
"grad_norm": 0.02714668959379196,
"kl": 0.45711198449134827,
"learning_rate": 9.999996497682868e-06,
"loss": -0.009,
"step": 210,
"step_time": 4.771343512000385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 695.15625,
"completions/mean_terminated_length": 695.15625,
"completions/min_length": 585.0,
"completions/min_terminated_length": 585.0,
"entropy": 5.806370496749878,
"epoch": 0.00211,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.02025657892227173,
"kl": 0.2787773534655571,
"learning_rate": 9.999996457310676e-06,
"loss": -0.0145,
"num_tokens": 5572137.0,
"reward": 2.248298168182373,
"reward_std": 0.9219829440116882,
"rewards/rollout_reward_func/mean": 2.248298168182373,
"rewards/rollout_reward_func/std": 1.4261958599090576,
"sampling/importance_sampling_ratio/max": 0.301658034324646,
"sampling/importance_sampling_ratio/mean": 0.13740620017051697,
"sampling/importance_sampling_ratio/min": 9.007880552580616e-10,
"sampling/sampling_logp_difference/max": 3.91507887840271,
"sampling/sampling_logp_difference/mean": 0.9571284055709839,
"step": 211,
"step_time": 9.053193517001091
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 5.7584188580513,
"epoch": 0.00212,
"grad_norm": 0.017554691061377525,
"kl": 0.28337166644632816,
"learning_rate": 9.999996416707125e-06,
"loss": -0.0145,
"step": 212,
"step_time": 4.910072835999927
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 794.0,
"completions/max_terminated_length": 794.0,
"completions/mean_length": 306.875,
"completions/mean_terminated_length": 306.875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.477221041917801,
"epoch": 0.00213,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.012003457173705101,
"kl": 0.6022003293037415,
"learning_rate": 9.999996375872214e-06,
"loss": -0.0063,
"num_tokens": 5618850.0,
"reward": 2.7603349685668945,
"reward_std": 0.35092732310295105,
"rewards/rollout_reward_func/mean": 2.7603349685668945,
"rewards/rollout_reward_func/std": 1.3781588077545166,
"sampling/importance_sampling_ratio/max": 0.5524585843086243,
"sampling/importance_sampling_ratio/mean": 0.3040264844894409,
"sampling/importance_sampling_ratio/min": 0.0008875139756128192,
"sampling/sampling_logp_difference/max": 2.6261608600616455,
"sampling/sampling_logp_difference/mean": 0.7000038623809814,
"step": 213,
"step_time": 8.743301202000112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.459619641304016,
"epoch": 0.00214,
"grad_norm": 0.010844763368368149,
"kl": 0.6048287376761436,
"learning_rate": 9.999996334805946e-06,
"loss": -0.0063,
"step": 214,
"step_time": 4.693870484000399
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 839.0,
"completions/max_terminated_length": 839.0,
"completions/mean_length": 359.09375,
"completions/mean_terminated_length": 370.1612854003906,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.843548268079758,
"epoch": 0.00215,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.047307442873716354,
"kl": 1.0721620507538319,
"learning_rate": 9.999996293508317e-06,
"loss": -0.012,
"num_tokens": 5670482.0,
"reward": 2.6061129570007324,
"reward_std": 0.6701341867446899,
"rewards/rollout_reward_func/mean": 2.6061129570007324,
"rewards/rollout_reward_func/std": 1.2390384674072266,
"sampling/importance_sampling_ratio/max": 0.5575221180915833,
"sampling/importance_sampling_ratio/mean": 0.2627297639846802,
"sampling/importance_sampling_ratio/min": 5.8882815068272976e-08,
"sampling/sampling_logp_difference/max": 2.7387423515319824,
"sampling/sampling_logp_difference/mean": 0.7991349101066589,
"step": 215,
"step_time": 9.773670517000028
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.866431772708893,
"epoch": 0.00216,
"grad_norm": 0.02840716764330864,
"kl": 0.9429246261715889,
"learning_rate": 9.999996251979329e-06,
"loss": -0.0121,
"step": 216,
"step_time": 5.003656359999695
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 715.0,
"completions/max_terminated_length": 715.0,
"completions/mean_length": 386.65625,
"completions/mean_terminated_length": 386.65625,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.9683555364608765,
"epoch": 0.00217,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.056511927396059036,
"kl": 0.435198824852705,
"learning_rate": 9.999996210218981e-06,
"loss": -0.0082,
"num_tokens": 5723431.0,
"reward": 2.3611326217651367,
"reward_std": 1.23284113407135,
"rewards/rollout_reward_func/mean": 2.3611326217651367,
"rewards/rollout_reward_func/std": 2.1599881649017334,
"sampling/importance_sampling_ratio/max": 0.5584315657615662,
"sampling/importance_sampling_ratio/mean": 0.23213379085063934,
"sampling/importance_sampling_ratio/min": 9.592416063242126e-06,
"sampling/sampling_logp_difference/max": 3.0070853233337402,
"sampling/sampling_logp_difference/mean": 0.7948013544082642,
"step": 217,
"step_time": 8.342223236000336
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.010416666977107525,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.010416666977107525,
"entropy": 4.992517560720444,
"epoch": 0.00218,
"grad_norm": 0.011323577724397182,
"kl": 0.43109437450766563,
"learning_rate": 9.999996168227277e-06,
"loss": -0.0084,
"step": 218,
"step_time": 4.607645754999339
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 832.0,
"completions/max_terminated_length": 832.0,
"completions/mean_length": 301.9375,
"completions/mean_terminated_length": 301.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 4.874771684408188,
"epoch": 0.00219,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.031172219663858414,
"kl": 0.6132681779563427,
"learning_rate": 9.999996126004213e-06,
"loss": -0.0065,
"num_tokens": 5770701.0,
"reward": 2.9851279258728027,
"reward_std": 0.8050676584243774,
"rewards/rollout_reward_func/mean": 2.9851279258728027,
"rewards/rollout_reward_func/std": 1.344331979751587,
"sampling/importance_sampling_ratio/max": 0.5544815063476562,
"sampling/importance_sampling_ratio/mean": 0.24593959748744965,
"sampling/importance_sampling_ratio/min": 4.177666784559164e-14,
"sampling/sampling_logp_difference/max": 4.385872840881348,
"sampling/sampling_logp_difference/mean": 0.8459481000900269,
"step": 219,
"step_time": 8.750391007999951
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 4.893818587064743,
"epoch": 0.0022,
"grad_norm": 0.03290743753314018,
"kl": 0.6108976900577545,
"learning_rate": 9.999996083549788e-06,
"loss": -0.0065,
"step": 220,
"step_time": 4.7656036020011925
},
{
"clip_ratio/high_max": 0.00657894741743803,
"clip_ratio/high_mean": 0.003289473708719015,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.009539473801851273,
"completions/clipped_ratio": 0.0,
"completions/max_length": 778.0,
"completions/max_terminated_length": 778.0,
"completions/mean_length": 569.1875,
"completions/mean_terminated_length": 569.1875,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.575940668582916,
"epoch": 0.00221,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.011595762334764004,
"kl": 0.347899217158556,
"learning_rate": 9.999996040864003e-06,
"loss": -0.0171,
"num_tokens": 5830699.0,
"reward": 3.000262498855591,
"reward_std": 1.67076575756073,
"rewards/rollout_reward_func/mean": 3.000262498855591,
"rewards/rollout_reward_func/std": 1.8007228374481201,
"sampling/importance_sampling_ratio/max": 0.3085794150829315,
"sampling/importance_sampling_ratio/mean": 0.15168991684913635,
"sampling/importance_sampling_ratio/min": 4.0518877315876e-12,
"sampling/sampling_logp_difference/max": 2.8902509212493896,
"sampling/sampling_logp_difference/mean": 0.9849369525909424,
"step": 221,
"step_time": 9.164814059999571
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0062500000931322575,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0062500000931322575,
"entropy": 5.600629568099976,
"epoch": 0.00222,
"grad_norm": 0.010892020538449287,
"kl": 0.3483094722032547,
"learning_rate": 9.999995997946861e-06,
"loss": -0.0171,
"step": 222,
"step_time": 4.744326772000022
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0028409091755747795,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0028409091755747795,
"completions/clipped_ratio": 0.0,
"completions/max_length": 671.0,
"completions/max_terminated_length": 671.0,
"completions/mean_length": 410.9375,
"completions/mean_terminated_length": 410.9375,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 6.076228618621826,
"epoch": 0.00223,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.007272793911397457,
"kl": 0.37392777390778065,
"learning_rate": 9.999995954798361e-06,
"loss": -0.0002,
"num_tokens": 5882689.0,
"reward": 2.952998161315918,
"reward_std": 1.2456332445144653,
"rewards/rollout_reward_func/mean": 2.952998161315918,
"rewards/rollout_reward_func/std": 1.8146508932113647,
"sampling/importance_sampling_ratio/max": 0.5566303133964539,
"sampling/importance_sampling_ratio/mean": 0.1916414350271225,
"sampling/importance_sampling_ratio/min": 5.414286691375744e-15,
"sampling/sampling_logp_difference/max": 3.779116630554199,
"sampling/sampling_logp_difference/mean": 1.1832430362701416,
"step": 223,
"step_time": 8.38111467600038
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 6.1000664830207825,
"epoch": 0.00224,
"grad_norm": 0.00639992905780673,
"kl": 0.37375164218246937,
"learning_rate": 9.9999959114185e-06,
"loss": -0.0002,
"step": 224,
"step_time": 5.022103202999915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 751.0,
"completions/max_terminated_length": 751.0,
"completions/mean_length": 251.3125,
"completions/mean_terminated_length": 258.9032287597656,
"completions/min_length": 2.0,
"completions/min_terminated_length": 2.0,
"entropy": 5.927689909934998,
"epoch": 0.00225,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.027866557240486145,
"kl": 0.44809701666235924,
"learning_rate": 9.999995867807281e-06,
"loss": -0.0124,
"num_tokens": 5928498.0,
"reward": 2.4831409454345703,
"reward_std": 0.8753278255462646,
"rewards/rollout_reward_func/mean": 2.4831409454345703,
"rewards/rollout_reward_func/std": 1.5412579774856567,
"sampling/importance_sampling_ratio/max": 0.5537245869636536,
"sampling/importance_sampling_ratio/mean": 0.21668727695941925,
"sampling/importance_sampling_ratio/min": 7.331964479995179e-10,
"sampling/sampling_logp_difference/max": 3.372708797454834,
"sampling/sampling_logp_difference/mean": 1.118770956993103,
"step": 225,
"step_time": 8.150611867999942
}
],
"logging_steps": 1.0,
"max_steps": 400000,
"num_input_tokens_seen": 5928498,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}