ee_gol_ep_746 / trainer_state.json
usr256864's picture
Initial model upload
b112fab verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 746,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05,
"completions/max_length": 246.3,
"completions/max_terminated_length": 240.26,
"completions/mean_length": 195.0275,
"completions/mean_terminated_length": 191.87969146728517,
"completions/min_length": 147.76,
"completions/min_terminated_length": 147.76,
"entropy": 0.06807037293910981,
"epoch": 0.06702412868632708,
"frac_reward_zero_std": 0.4475,
"grad_norm": 0.1978774070739746,
"learning_rate": 1e-05,
"loss": -0.0022,
"num_tokens": 6268258.0,
"reward": 12.489985446929932,
"reward_std": 1.05244723290205,
"rewards/event_reward_fn/mean": 11.62375,
"rewards/event_reward_fn/std": 7.598931360244751,
"rewards/format_reward_fn/mean": 0.8662354218959808,
"rewards/format_reward_fn/std": 0.24084076710045338,
"step": 50,
"step_time": 24.881226640827954
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.043125,
"completions/max_length": 249.38,
"completions/max_terminated_length": 244.26,
"completions/mean_length": 198.4925,
"completions/mean_terminated_length": 195.8674203491211,
"completions/min_length": 155.1,
"completions/min_terminated_length": 155.1,
"entropy": 0.07096008479595184,
"epoch": 0.13404825737265416,
"frac_reward_zero_std": 0.42,
"grad_norm": 0.31616032123565674,
"learning_rate": 1e-05,
"loss": -0.0052,
"num_tokens": 12603730.0,
"reward": 11.722552404403686,
"reward_std": 1.104598103761673,
"rewards/event_reward_fn/mean": 10.865,
"rewards/event_reward_fn/std": 7.203483366966248,
"rewards/format_reward_fn/mean": 0.8575523483753205,
"rewards/format_reward_fn/std": 0.25920433282852173,
"step": 100,
"step_time": 23.881343694739044
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.069375,
"completions/max_length": 251.04,
"completions/max_terminated_length": 245.08,
"completions/mean_length": 201.58125,
"completions/mean_terminated_length": 197.60445678710937,
"completions/min_length": 157.96,
"completions/min_terminated_length": 157.96,
"entropy": 0.07228697955608368,
"epoch": 0.20107238605898123,
"frac_reward_zero_std": 0.41,
"grad_norm": 0.1767224669456482,
"learning_rate": 1e-05,
"loss": 0.002,
"num_tokens": 19236102.0,
"reward": 11.989666719436645,
"reward_std": 1.2850025883316993,
"rewards/event_reward_fn/mean": 11.1225,
"rewards/event_reward_fn/std": 7.3152674865722656,
"rewards/format_reward_fn/mean": 0.8671666479110718,
"rewards/format_reward_fn/std": 0.24983404949307442,
"step": 150,
"step_time": 27.783113366477192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.068125,
"completions/max_length": 250.62,
"completions/max_terminated_length": 244.54,
"completions/mean_length": 201.1025,
"completions/mean_terminated_length": 197.25198516845703,
"completions/min_length": 156.4,
"completions/min_terminated_length": 156.4,
"entropy": 0.06773373357951641,
"epoch": 0.2680965147453083,
"frac_reward_zero_std": 0.415,
"grad_norm": 0.13261352479457855,
"learning_rate": 1e-05,
"loss": -0.0029,
"num_tokens": 25426958.0,
"reward": 12.467143926620484,
"reward_std": 1.1554639112949372,
"rewards/event_reward_fn/mean": 11.59875,
"rewards/event_reward_fn/std": 7.149877543449402,
"rewards/format_reward_fn/mean": 0.8683938610553742,
"rewards/format_reward_fn/std": 0.24253679752349855,
"step": 200,
"step_time": 24.421198091395198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.058125,
"completions/max_length": 250.22,
"completions/max_terminated_length": 243.8,
"completions/mean_length": 200.303125,
"completions/mean_terminated_length": 196.90248321533204,
"completions/min_length": 162.42,
"completions/min_terminated_length": 162.42,
"entropy": 0.06486415289342404,
"epoch": 0.3351206434316354,
"frac_reward_zero_std": 0.385,
"grad_norm": 0.49442073702812195,
"learning_rate": 1e-05,
"loss": -0.0036,
"num_tokens": 31582342.0,
"reward": 12.355808296203612,
"reward_std": 1.1142808997631073,
"rewards/event_reward_fn/mean": 11.48875,
"rewards/event_reward_fn/std": 7.448825697898865,
"rewards/format_reward_fn/mean": 0.8670582604408265,
"rewards/format_reward_fn/std": 0.24978963822126388,
"step": 250,
"step_time": 25.453000083304943
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.04875,
"completions/max_length": 248.68,
"completions/max_terminated_length": 244.22,
"completions/mean_length": 198.759375,
"completions/mean_terminated_length": 196.16592681884765,
"completions/min_length": 156.2,
"completions/min_terminated_length": 156.2,
"entropy": 0.0681518343836069,
"epoch": 0.40214477211796246,
"frac_reward_zero_std": 0.39,
"grad_norm": 0.48775437474250793,
"learning_rate": 1e-05,
"loss": -0.0057,
"num_tokens": 37800719.0,
"reward": 12.434584522247315,
"reward_std": 1.183589797616005,
"rewards/event_reward_fn/mean": 11.56375,
"rewards/event_reward_fn/std": 7.52141658782959,
"rewards/format_reward_fn/mean": 0.8708344352245331,
"rewards/format_reward_fn/std": 0.23306368254125118,
"step": 300,
"step_time": 25.360634116120636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.034375,
"completions/max_length": 248.34,
"completions/max_terminated_length": 245.28,
"completions/mean_length": 203.264375,
"completions/mean_terminated_length": 201.32774475097656,
"completions/min_length": 157.54,
"completions/min_terminated_length": 157.54,
"entropy": 0.06739457175135613,
"epoch": 0.4691689008042895,
"frac_reward_zero_std": 0.3525,
"grad_norm": 0.33356958627700806,
"learning_rate": 1e-05,
"loss": -0.004,
"num_tokens": 44150011.0,
"reward": 13.173797435760498,
"reward_std": 1.2946509444713592,
"rewards/event_reward_fn/mean": 12.28875,
"rewards/event_reward_fn/std": 7.145490102767944,
"rewards/format_reward_fn/mean": 0.885047378540039,
"rewards/format_reward_fn/std": 0.22108205765485764,
"step": 350,
"step_time": 26.940150288008155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.064375,
"completions/max_length": 252.9,
"completions/max_terminated_length": 247.8,
"completions/mean_length": 203.064375,
"completions/mean_terminated_length": 199.5350747680664,
"completions/min_length": 158.26,
"completions/min_terminated_length": 158.26,
"entropy": 0.0657703248411417,
"epoch": 0.5361930294906166,
"frac_reward_zero_std": 0.435,
"grad_norm": 0.26359474658966064,
"learning_rate": 1e-05,
"loss": -0.0021,
"num_tokens": 50384400.0,
"reward": 12.238037357330322,
"reward_std": 1.057584773004055,
"rewards/event_reward_fn/mean": 11.37,
"rewards/event_reward_fn/std": 7.154304637908935,
"rewards/format_reward_fn/mean": 0.8680373668670655,
"rewards/format_reward_fn/std": 0.26109003871679304,
"step": 400,
"step_time": 25.59800311360508
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.049375,
"completions/max_length": 249.06,
"completions/max_terminated_length": 244.9,
"completions/mean_length": 203.706875,
"completions/mean_terminated_length": 200.99220581054686,
"completions/min_length": 161.06,
"completions/min_terminated_length": 161.06,
"entropy": 0.06626586891710758,
"epoch": 0.6032171581769437,
"frac_reward_zero_std": 0.3775,
"grad_norm": 0.48660293221473694,
"learning_rate": 1e-05,
"loss": -0.004,
"num_tokens": 56771056.0,
"reward": 13.009743461608887,
"reward_std": 1.2429037857055665,
"rewards/event_reward_fn/mean": 12.130625,
"rewards/event_reward_fn/std": 7.234463820457458,
"rewards/format_reward_fn/mean": 0.8791184043884277,
"rewards/format_reward_fn/std": 0.23800445690751076,
"step": 450,
"step_time": 25.550446799769997
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.066875,
"completions/max_length": 251.92,
"completions/max_terminated_length": 246.12,
"completions/mean_length": 204.07625,
"completions/mean_terminated_length": 200.35590240478516,
"completions/min_length": 160.74,
"completions/min_terminated_length": 160.74,
"entropy": 0.06663089752197265,
"epoch": 0.6702412868632708,
"frac_reward_zero_std": 0.4025,
"grad_norm": 0.6319305300712585,
"learning_rate": 1e-05,
"loss": -0.0042,
"num_tokens": 63078757.0,
"reward": 12.313038005828858,
"reward_std": 1.1368902394175529,
"rewards/event_reward_fn/mean": 11.4575,
"rewards/event_reward_fn/std": 6.7143393945693965,
"rewards/format_reward_fn/mean": 0.8555380630493165,
"rewards/format_reward_fn/std": 0.2657873314619064,
"step": 500,
"step_time": 26.24973841637373
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.091875,
"completions/max_length": 252.82,
"completions/max_terminated_length": 246.88,
"completions/mean_length": 203.815,
"completions/mean_terminated_length": 198.69242126464843,
"completions/min_length": 161.16,
"completions/min_terminated_length": 161.16,
"entropy": 0.06187104433774948,
"epoch": 0.7372654155495979,
"frac_reward_zero_std": 0.425,
"grad_norm": 0.40395304560661316,
"learning_rate": 1e-05,
"loss": -0.0025,
"num_tokens": 69170452.0,
"reward": 12.482298536300659,
"reward_std": 1.0457301473617553,
"rewards/event_reward_fn/mean": 11.64625,
"rewards/event_reward_fn/std": 7.317771224975586,
"rewards/format_reward_fn/mean": 0.8360484623908997,
"rewards/format_reward_fn/std": 0.2895883430540562,
"step": 550,
"step_time": 24.193240740820766
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.110625,
"completions/max_length": 252.62,
"completions/max_terminated_length": 246.8,
"completions/mean_length": 208.275625,
"completions/mean_terminated_length": 202.49910614013672,
"completions/min_length": 165.54,
"completions/min_terminated_length": 165.54,
"entropy": 0.0649487990140915,
"epoch": 0.8042895442359249,
"frac_reward_zero_std": 0.38,
"grad_norm": 0.37119486927986145,
"learning_rate": 1e-05,
"loss": 0.0006,
"num_tokens": 75499314.0,
"reward": 12.80059557914734,
"reward_std": 1.1889909988641738,
"rewards/event_reward_fn/mean": 11.97375,
"rewards/event_reward_fn/std": 7.475857477188111,
"rewards/format_reward_fn/mean": 0.8268455564975739,
"rewards/format_reward_fn/std": 0.29714462146162984,
"step": 600,
"step_time": 24.3176869976148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.05625,
"completions/max_length": 249.28,
"completions/max_terminated_length": 244.8,
"completions/mean_length": 202.789375,
"completions/mean_terminated_length": 199.91522064208985,
"completions/min_length": 161.74,
"completions/min_terminated_length": 161.74,
"entropy": 0.06481640346348286,
"epoch": 0.871313672922252,
"frac_reward_zero_std": 0.3975,
"grad_norm": 0.08866075426340103,
"learning_rate": 1e-05,
"loss": -0.0023,
"num_tokens": 81673001.0,
"reward": 12.689926280975342,
"reward_std": 1.2458794575929641,
"rewards/event_reward_fn/mean": 11.815625,
"rewards/event_reward_fn/std": 7.275726590156555,
"rewards/format_reward_fn/mean": 0.8743013119697571,
"rewards/format_reward_fn/std": 0.23756251022219657,
"step": 650,
"step_time": 25.04028965227306
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.100625,
"completions/max_length": 253.72,
"completions/max_terminated_length": 248.28,
"completions/mean_length": 205.536875,
"completions/mean_terminated_length": 200.1349432373047,
"completions/min_length": 162.16,
"completions/min_terminated_length": 162.16,
"entropy": 0.0658975774794817,
"epoch": 0.938337801608579,
"frac_reward_zero_std": 0.3975,
"grad_norm": 0.2268964648246765,
"learning_rate": 1e-05,
"loss": -0.0008,
"num_tokens": 87934795.0,
"reward": 12.72035478591919,
"reward_std": 1.1722034803032875,
"rewards/event_reward_fn/mean": 11.888125,
"rewards/event_reward_fn/std": 7.583159003257752,
"rewards/format_reward_fn/mean": 0.8322297859191895,
"rewards/format_reward_fn/std": 0.29026631206274034,
"step": 700,
"step_time": 24.744350045956672
}
],
"logging_steps": 50,
"max_steps": 7460,
"num_input_tokens_seen": 93493541,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}