GRPO_adapters / trainer_state.json
SaitejaJate's picture
Upload 11 files
54d9cc1 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.927536231884058,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 131.3125,
"epoch": 0.0966183574879227,
"grad_norm": 1.5448709726333618,
"kl": 0.00043543790525291115,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.6250000067055226,
"reward_std": 0.29020712375640867,
"rewards/cbt_technique_reward_func": 0.1375000011175871,
"rewards/mmkay_speech_pattern_reward_func": 0.10625000018626451,
"rewards/question_asking_reward_func": 0.38124999925494196,
"step": 5
},
{
"completion_length": 116.3875,
"epoch": 0.1932367149758454,
"grad_norm": 1.3723770380020142,
"kl": 0.0007617953233420849,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": 0.7187500111758709,
"reward_std": 0.3299859084188938,
"rewards/cbt_technique_reward_func": 0.1400000021792948,
"rewards/mmkay_speech_pattern_reward_func": 0.18000000081956385,
"rewards/question_asking_reward_func": 0.39874999821186063,
"step": 10
},
{
"completion_length": 127.5,
"epoch": 0.2898550724637681,
"grad_norm": 1.2058902978897095,
"kl": 0.0008091913536190986,
"learning_rate": 9.555555555555556e-07,
"loss": 0.0,
"reward": 0.7375000104308128,
"reward_std": 0.29069150872528554,
"rewards/cbt_technique_reward_func": 0.14625000171363353,
"rewards/mmkay_speech_pattern_reward_func": 0.20250000078231095,
"rewards/question_asking_reward_func": 0.3887499958276749,
"step": 15
},
{
"completion_length": 118.6875,
"epoch": 0.3864734299516908,
"grad_norm": 1.1809375286102295,
"kl": 0.000778093043481931,
"learning_rate": 9e-07,
"loss": 0.0,
"reward": 0.8087500005960464,
"reward_std": 0.2881319634616375,
"rewards/cbt_technique_reward_func": 0.15375000424683094,
"rewards/mmkay_speech_pattern_reward_func": 0.21750000212341547,
"rewards/question_asking_reward_func": 0.4374999970197678,
"step": 20
},
{
"completion_length": 130.5375,
"epoch": 0.4830917874396135,
"grad_norm": 0.8987158536911011,
"kl": 0.0007172481535235419,
"learning_rate": 8.444444444444444e-07,
"loss": 0.0,
"reward": 0.690000006556511,
"reward_std": 0.22531789541244507,
"rewards/cbt_technique_reward_func": 0.13875000309199095,
"rewards/mmkay_speech_pattern_reward_func": 0.15124999973922967,
"rewards/question_asking_reward_func": 0.39999999701976774,
"step": 25
},
{
"completion_length": 135.575,
"epoch": 0.5797101449275363,
"grad_norm": 1.636861801147461,
"kl": 0.0007032989873550832,
"learning_rate": 7.888888888888889e-07,
"loss": 0.0,
"reward": 0.657500009611249,
"reward_std": 0.24160839468240738,
"rewards/cbt_technique_reward_func": 0.1337500031106174,
"rewards/mmkay_speech_pattern_reward_func": 0.11499999947845936,
"rewards/question_asking_reward_func": 0.40874999612569807,
"step": 30
},
{
"completion_length": 130.85,
"epoch": 0.6763285024154589,
"grad_norm": 1.004942536354065,
"kl": 0.0006958791636861861,
"learning_rate": 7.333333333333332e-07,
"loss": 0.0,
"reward": 0.6937500074505806,
"reward_std": 0.25641300678253176,
"rewards/cbt_technique_reward_func": 0.12000000299885868,
"rewards/mmkay_speech_pattern_reward_func": 0.17000000029802323,
"rewards/question_asking_reward_func": 0.4037499949336052,
"step": 35
},
{
"completion_length": 109.175,
"epoch": 0.7729468599033816,
"grad_norm": 1.200303077697754,
"kl": 0.0006988899374846369,
"learning_rate": 6.777777777777778e-07,
"loss": 0.0,
"reward": 0.5937500014901161,
"reward_std": 0.2189602989703417,
"rewards/cbt_technique_reward_func": 0.11375000216066837,
"rewards/mmkay_speech_pattern_reward_func": 0.10499999951571226,
"rewards/question_asking_reward_func": 0.37499999776482584,
"step": 40
},
{
"completion_length": 130.275,
"epoch": 0.8695652173913043,
"grad_norm": 1.5182042121887207,
"kl": 0.000668867253989447,
"learning_rate": 6.222222222222223e-07,
"loss": 0.0,
"reward": 0.6749999985098839,
"reward_std": 0.256393301486969,
"rewards/cbt_technique_reward_func": 0.14625000339001418,
"rewards/mmkay_speech_pattern_reward_func": 0.12875000070780515,
"rewards/question_asking_reward_func": 0.3999999985098839,
"step": 45
},
{
"completion_length": 116.8,
"epoch": 0.966183574879227,
"grad_norm": 1.9656929969787598,
"kl": 0.0007837369499611669,
"learning_rate": 5.666666666666666e-07,
"loss": 0.0,
"reward": 0.687499999254942,
"reward_std": 0.2334746764972806,
"rewards/cbt_technique_reward_func": 0.11000000266358256,
"rewards/mmkay_speech_pattern_reward_func": 0.19375000055879354,
"rewards/question_asking_reward_func": 0.38374999538064003,
"step": 50
},
{
"completion_length": 130.02631578947367,
"epoch": 1.0579710144927537,
"grad_norm": 1.276727557182312,
"kl": 0.0007787851224604406,
"learning_rate": 5.111111111111111e-07,
"loss": 0.0,
"reward": 0.7000000014116889,
"reward_std": 0.27186120026989985,
"rewards/cbt_technique_reward_func": 0.15789473929295414,
"rewards/mmkay_speech_pattern_reward_func": 0.1684210531805691,
"rewards/question_asking_reward_func": 0.37368421099687876,
"step": 55
},
{
"completion_length": 124.0375,
"epoch": 1.1545893719806763,
"grad_norm": 1.4976823329925537,
"kl": 0.0006855996092781424,
"learning_rate": 4.555555555555555e-07,
"loss": 0.0,
"reward": 0.731249999627471,
"reward_std": 0.2586277686059475,
"rewards/cbt_technique_reward_func": 0.1425000037997961,
"rewards/mmkay_speech_pattern_reward_func": 0.19749999791383743,
"rewards/question_asking_reward_func": 0.39124999567866325,
"step": 60
},
{
"completion_length": 136.325,
"epoch": 1.251207729468599,
"grad_norm": 2.217841863632202,
"kl": 0.0007303371050511487,
"learning_rate": 4e-07,
"loss": 0.0,
"reward": 0.7212500154972077,
"reward_std": 0.23724802657961847,
"rewards/cbt_technique_reward_func": 0.11375000402331352,
"rewards/mmkay_speech_pattern_reward_func": 0.17875000182539225,
"rewards/question_asking_reward_func": 0.4287499986588955,
"step": 65
},
{
"completion_length": 127.1,
"epoch": 1.3478260869565217,
"grad_norm": 1.2720558643341064,
"kl": 0.0008214380504796281,
"learning_rate": 3.4444444444444444e-07,
"loss": 0.0,
"reward": 0.6512500032782554,
"reward_std": 0.24096153806895018,
"rewards/cbt_technique_reward_func": 0.12250000247731804,
"rewards/mmkay_speech_pattern_reward_func": 0.13250000029802322,
"rewards/question_asking_reward_func": 0.39624999538064004,
"step": 70
},
{
"completion_length": 123.4,
"epoch": 1.4444444444444444,
"grad_norm": 1.5885432958602905,
"kl": 0.0008428851724602282,
"learning_rate": 2.8888888888888885e-07,
"loss": 0.0,
"reward": 0.6575000032782554,
"reward_std": 0.3075646057724953,
"rewards/cbt_technique_reward_func": 0.14500000271946192,
"rewards/mmkay_speech_pattern_reward_func": 0.14625000059604645,
"rewards/question_asking_reward_func": 0.3662499986588955,
"step": 75
},
{
"completion_length": 122.525,
"epoch": 1.541062801932367,
"grad_norm": 1.6414885520935059,
"kl": 0.0007202147302450612,
"learning_rate": 2.3333333333333333e-07,
"loss": 0.0,
"reward": 0.6725000083446503,
"reward_std": 0.2713221043348312,
"rewards/cbt_technique_reward_func": 0.1287500030361116,
"rewards/mmkay_speech_pattern_reward_func": 0.13625000175088645,
"rewards/question_asking_reward_func": 0.40749999806284903,
"step": 80
},
{
"completion_length": 113.65,
"epoch": 1.6376811594202898,
"grad_norm": 1.197077751159668,
"kl": 0.0007829821581253782,
"learning_rate": 1.7777777777777776e-07,
"loss": 0.0,
"reward": 0.6862500173039734,
"reward_std": 0.227858448587358,
"rewards/cbt_technique_reward_func": 0.12375000417232514,
"rewards/mmkay_speech_pattern_reward_func": 0.1512499988079071,
"rewards/question_asking_reward_func": 0.4112499952316284,
"step": 85
},
{
"completion_length": 121.9,
"epoch": 1.7342995169082127,
"grad_norm": 0.9502215385437012,
"kl": 0.000955963070737198,
"learning_rate": 1.2222222222222222e-07,
"loss": 0.0,
"reward": 0.6025000005960465,
"reward_std": 0.22266108132898807,
"rewards/cbt_technique_reward_func": 0.11750000119209289,
"rewards/mmkay_speech_pattern_reward_func": 0.09875000026077033,
"rewards/question_asking_reward_func": 0.3862499982118607,
"step": 90
},
{
"completion_length": 123.5125,
"epoch": 1.8309178743961354,
"grad_norm": 1.4943199157714844,
"kl": 0.0007219786857604049,
"learning_rate": 6.666666666666667e-08,
"loss": 0.0,
"reward": 0.6962499976158142,
"reward_std": 0.23088937066495419,
"rewards/cbt_technique_reward_func": 0.1312500026077032,
"rewards/mmkay_speech_pattern_reward_func": 0.14625000022351742,
"rewards/question_asking_reward_func": 0.41874999478459357,
"step": 95
},
{
"completion_length": 125.6,
"epoch": 1.927536231884058,
"grad_norm": 1.0279828310012817,
"kl": 0.000812371401116252,
"learning_rate": 1.111111111111111e-08,
"loss": 0.0,
"reward": 0.7137500122189522,
"reward_std": 0.27089230343699455,
"rewards/cbt_technique_reward_func": 0.12625000337138773,
"rewards/mmkay_speech_pattern_reward_func": 0.2049999987706542,
"rewards/question_asking_reward_func": 0.38249999582767485,
"step": 100
}
],
"logging_steps": 5,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}