test_gin_rummy_np / trainer_state.json
Gege24's picture
Upload task output 1
cb62910 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.00026,
"eval_steps": 500,
"global_step": 13,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1779.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1698.46875,
"completions/mean_terminated_length": 1698.46875,
"completions/min_length": 1279.0,
"completions/min_terminated_length": 1279.0,
"entropy": 0.48651931062340736,
"epoch": 2e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.5409082174301147,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0112,
"num_tokens": 75031.0,
"reward": -9.59624195098877,
"reward_std": 5.939093589782715,
"rewards/rollout_reward_func/mean": -9.59624195098877,
"rewards/rollout_reward_func/std": 10.368197441101074,
"sampling/importance_sampling_ratio/max": 1.3440189361572266,
"sampling/importance_sampling_ratio/mean": 0.9953499436378479,
"sampling/importance_sampling_ratio/min": 0.564490556716919,
"sampling/sampling_logp_difference/max": 0.45447802543640137,
"sampling/sampling_logp_difference/mean": 0.016698362305760384,
"step": 1,
"step_time": 36.680761918001735
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.48651931062340736,
"epoch": 4e-05,
"grad_norm": 1.5392467975616455,
"kl": 0.0,
"learning_rate": 2.8571428571428575e-07,
"loss": 0.0112,
"step": 2,
"step_time": 5.709443093002847
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1816.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 1625.21875,
"completions/mean_terminated_length": 1625.21875,
"completions/min_length": 1159.0,
"completions/min_terminated_length": 1159.0,
"entropy": 0.48103801161050797,
"epoch": 6e-05,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.117859125137329,
"kl": 0.0010091230506077409,
"learning_rate": 5.714285714285715e-07,
"loss": -0.0237,
"num_tokens": 147721.0,
"reward": -7.404824733734131,
"reward_std": 11.744457244873047,
"rewards/rollout_reward_func/mean": -7.404824733734131,
"rewards/rollout_reward_func/std": 15.456405639648438,
"sampling/importance_sampling_ratio/max": 1.4090882539749146,
"sampling/importance_sampling_ratio/mean": 1.0395634174346924,
"sampling/importance_sampling_ratio/min": 0.7728875279426575,
"sampling/sampling_logp_difference/max": 0.2340834140777588,
"sampling/sampling_logp_difference/mean": 0.019678719341754913,
"step": 3,
"step_time": 35.33501763899767
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.005859375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"entropy": 0.48065420985221863,
"epoch": 8e-05,
"grad_norm": 2.1440107822418213,
"kl": 0.0009154866565950215,
"learning_rate": 8.571428571428572e-07,
"loss": -0.0232,
"step": 4,
"step_time": 5.808208025997374
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.005859375,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1766.0,
"completions/max_terminated_length": 1766.0,
"completions/mean_length": 1573.65625,
"completions/mean_terminated_length": 1573.65625,
"completions/min_length": 1107.0,
"completions/min_terminated_length": 1107.0,
"entropy": 0.43740712106227875,
"epoch": 0.0001,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.864342212677002,
"kl": 0.0005766874128312338,
"learning_rate": 1.142857142857143e-06,
"loss": -0.0206,
"num_tokens": 218674.0,
"reward": -14.006583213806152,
"reward_std": 12.985024452209473,
"rewards/rollout_reward_func/mean": -14.006583213806152,
"rewards/rollout_reward_func/std": 17.190784454345703,
"sampling/importance_sampling_ratio/max": 1.3863141536712646,
"sampling/importance_sampling_ratio/mean": 0.9954429864883423,
"sampling/importance_sampling_ratio/min": 0.6810365915298462,
"sampling/sampling_logp_difference/max": 0.2415471076965332,
"sampling/sampling_logp_difference/mean": 0.016646649688482285,
"step": 5,
"step_time": 34.27298692500153
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00390625,
"entropy": 0.4379243813455105,
"epoch": 0.00012,
"grad_norm": 1.9039454460144043,
"kl": 0.00071882207703311,
"learning_rate": 1.4285714285714286e-06,
"loss": -0.0202,
"step": 6,
"step_time": 5.641448482998385
},
{
"clip_ratio/high_max": 0.0078125,
"clip_ratio/high_mean": 0.00390625,
"clip_ratio/low_mean": 0.00390625,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1748.0,
"completions/max_terminated_length": 1748.0,
"completions/mean_length": 1575.5,
"completions/mean_terminated_length": 1575.5,
"completions/min_length": 1186.0,
"completions/min_terminated_length": 1186.0,
"entropy": 0.4470406360924244,
"epoch": 0.00014,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.4503103494644165,
"kl": 0.0008566801006963942,
"learning_rate": 1.7142857142857145e-06,
"loss": 0.0132,
"num_tokens": 289160.0,
"reward": -3.2668540477752686,
"reward_std": 10.61334228515625,
"rewards/rollout_reward_func/mean": -3.2668540477752686,
"rewards/rollout_reward_func/std": 16.216392517089844,
"sampling/importance_sampling_ratio/max": 1.3690364360809326,
"sampling/importance_sampling_ratio/mean": 1.0221995115280151,
"sampling/importance_sampling_ratio/min": 0.6548231840133667,
"sampling/sampling_logp_difference/max": 0.392575740814209,
"sampling/sampling_logp_difference/mean": 0.01853613555431366,
"step": 7,
"step_time": 34.67648999299854
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001953125,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.44634250551462173,
"epoch": 0.00016,
"grad_norm": 1.4721945524215698,
"kl": 0.0007410887337755412,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.014,
"step": 8,
"step_time": 5.566038421000485
},
{
"clip_ratio/high_max": 0.01171875,
"clip_ratio/high_mean": 0.0078125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0078125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1773.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 1650.5,
"completions/mean_terminated_length": 1650.5,
"completions/min_length": 1169.0,
"completions/min_terminated_length": 1169.0,
"entropy": 0.5013628304004669,
"epoch": 0.00018,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.5520236492156982,
"kl": 0.001372927552438341,
"learning_rate": 2.285714285714286e-06,
"loss": -0.0308,
"num_tokens": 362601.0,
"reward": -13.83917236328125,
"reward_std": 12.006336212158203,
"rewards/rollout_reward_func/mean": -13.83917236328125,
"rewards/rollout_reward_func/std": 14.237728118896484,
"sampling/importance_sampling_ratio/max": 1.3693691492080688,
"sampling/importance_sampling_ratio/mean": 0.9588738679885864,
"sampling/importance_sampling_ratio/min": 0.5098013281822205,
"sampling/sampling_logp_difference/max": 0.735576868057251,
"sampling/sampling_logp_difference/mean": 0.02071024850010872,
"step": 9,
"step_time": 34.420860370997616
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"entropy": 0.49781131744384766,
"epoch": 0.0002,
"grad_norm": 2.5958364009857178,
"kl": 0.0012885355827165768,
"learning_rate": 2.571428571428571e-06,
"loss": -0.0278,
"step": 10,
"step_time": 5.687426060998405
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1775.0,
"completions/max_terminated_length": 1775.0,
"completions/mean_length": 1532.0,
"completions/mean_terminated_length": 1532.0,
"completions/min_length": 264.0,
"completions/min_terminated_length": 264.0,
"entropy": 0.42485806345939636,
"epoch": 0.00022,
"frac_reward_zero_std": 0.0,
"grad_norm": 1.8452433347702026,
"kl": 0.000992896981188096,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0203,
"num_tokens": 431550.0,
"reward": -0.7765803337097168,
"reward_std": 14.750946044921875,
"rewards/rollout_reward_func/mean": -0.7765803337097168,
"rewards/rollout_reward_func/std": 21.5161190032959,
"sampling/importance_sampling_ratio/max": 1.3237504959106445,
"sampling/importance_sampling_ratio/mean": 1.0001271963119507,
"sampling/importance_sampling_ratio/min": 0.6408203840255737,
"sampling/sampling_logp_difference/max": 0.33285045623779297,
"sampling/sampling_logp_difference/mean": 0.01815984398126602,
"step": 11,
"step_time": 33.55919377600003
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"entropy": 0.4245442971587181,
"epoch": 0.00024,
"grad_norm": 1.6485886573791504,
"kl": 0.0013787990028504282,
"learning_rate": 3.142857142857143e-06,
"loss": 0.019,
"step": 12,
"step_time": 5.637909467997815
},
{
"clip_ratio/high_max": 0.00390625,
"clip_ratio/high_mean": 0.001953125,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001953125,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1791.0,
"completions/max_terminated_length": 1791.0,
"completions/mean_length": 1633.25,
"completions/mean_terminated_length": 1633.25,
"completions/min_length": 1195.0,
"completions/min_terminated_length": 1195.0,
"entropy": 0.478180218487978,
"epoch": 0.00026,
"frac_reward_zero_std": 0.0,
"grad_norm": 2.3034250736236572,
"kl": 0.0017315489676548168,
"learning_rate": 3.428571428571429e-06,
"loss": -0.0346,
"num_tokens": 504355.0,
"reward": -17.26664924621582,
"reward_std": 14.347229957580566,
"rewards/rollout_reward_func/mean": -17.26664924621582,
"rewards/rollout_reward_func/std": 18.007043838500977,
"sampling/importance_sampling_ratio/max": 1.7957122325897217,
"sampling/importance_sampling_ratio/mean": 1.0002973079681396,
"sampling/importance_sampling_ratio/min": 0.5741486549377441,
"sampling/sampling_logp_difference/max": 0.5055437088012695,
"sampling/sampling_logp_difference/mean": 0.024692352861166,
"step": 13,
"step_time": 32.15085511000143
}
],
"logging_steps": 1.0,
"max_steps": 100000,
"num_input_tokens_seen": 504355,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}