{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.927536231884058, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 131.3125, "epoch": 0.0966183574879227, "grad_norm": 1.5448709726333618, "kl": 0.00043543790525291115, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.6250000067055226, "reward_std": 0.29020712375640867, "rewards/cbt_technique_reward_func": 0.1375000011175871, "rewards/mmkay_speech_pattern_reward_func": 0.10625000018626451, "rewards/question_asking_reward_func": 0.38124999925494196, "step": 5 }, { "completion_length": 116.3875, "epoch": 0.1932367149758454, "grad_norm": 1.3723770380020142, "kl": 0.0007617953233420849, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.7187500111758709, "reward_std": 0.3299859084188938, "rewards/cbt_technique_reward_func": 0.1400000021792948, "rewards/mmkay_speech_pattern_reward_func": 0.18000000081956385, "rewards/question_asking_reward_func": 0.39874999821186063, "step": 10 }, { "completion_length": 127.5, "epoch": 0.2898550724637681, "grad_norm": 1.2058902978897095, "kl": 0.0008091913536190986, "learning_rate": 9.555555555555556e-07, "loss": 0.0, "reward": 0.7375000104308128, "reward_std": 0.29069150872528554, "rewards/cbt_technique_reward_func": 0.14625000171363353, "rewards/mmkay_speech_pattern_reward_func": 0.20250000078231095, "rewards/question_asking_reward_func": 0.3887499958276749, "step": 15 }, { "completion_length": 118.6875, "epoch": 0.3864734299516908, "grad_norm": 1.1809375286102295, "kl": 0.000778093043481931, "learning_rate": 9e-07, "loss": 0.0, "reward": 0.8087500005960464, "reward_std": 0.2881319634616375, "rewards/cbt_technique_reward_func": 0.15375000424683094, "rewards/mmkay_speech_pattern_reward_func": 0.21750000212341547, "rewards/question_asking_reward_func": 0.4374999970197678, "step": 20 }, { "completion_length": 130.5375, "epoch": 0.4830917874396135, "grad_norm": 0.8987158536911011, "kl": 0.0007172481535235419, "learning_rate": 8.444444444444444e-07, "loss": 0.0, "reward": 0.690000006556511, "reward_std": 0.22531789541244507, "rewards/cbt_technique_reward_func": 0.13875000309199095, "rewards/mmkay_speech_pattern_reward_func": 0.15124999973922967, "rewards/question_asking_reward_func": 0.39999999701976774, "step": 25 }, { "completion_length": 135.575, "epoch": 0.5797101449275363, "grad_norm": 1.636861801147461, "kl": 0.0007032989873550832, "learning_rate": 7.888888888888889e-07, "loss": 0.0, "reward": 0.657500009611249, "reward_std": 0.24160839468240738, "rewards/cbt_technique_reward_func": 0.1337500031106174, "rewards/mmkay_speech_pattern_reward_func": 0.11499999947845936, "rewards/question_asking_reward_func": 0.40874999612569807, "step": 30 }, { "completion_length": 130.85, "epoch": 0.6763285024154589, "grad_norm": 1.004942536354065, "kl": 0.0006958791636861861, "learning_rate": 7.333333333333332e-07, "loss": 0.0, "reward": 0.6937500074505806, "reward_std": 0.25641300678253176, "rewards/cbt_technique_reward_func": 0.12000000299885868, "rewards/mmkay_speech_pattern_reward_func": 0.17000000029802323, "rewards/question_asking_reward_func": 0.4037499949336052, "step": 35 }, { "completion_length": 109.175, "epoch": 0.7729468599033816, "grad_norm": 1.200303077697754, "kl": 0.0006988899374846369, "learning_rate": 6.777777777777778e-07, "loss": 0.0, "reward": 0.5937500014901161, "reward_std": 0.2189602989703417, "rewards/cbt_technique_reward_func": 0.11375000216066837, "rewards/mmkay_speech_pattern_reward_func": 0.10499999951571226, "rewards/question_asking_reward_func": 0.37499999776482584, "step": 40 }, { "completion_length": 130.275, "epoch": 0.8695652173913043, "grad_norm": 1.5182042121887207, "kl": 0.000668867253989447, "learning_rate": 6.222222222222223e-07, "loss": 0.0, "reward": 0.6749999985098839, "reward_std": 0.256393301486969, "rewards/cbt_technique_reward_func": 0.14625000339001418, "rewards/mmkay_speech_pattern_reward_func": 0.12875000070780515, "rewards/question_asking_reward_func": 0.3999999985098839, "step": 45 }, { "completion_length": 116.8, "epoch": 0.966183574879227, "grad_norm": 1.9656929969787598, "kl": 0.0007837369499611669, "learning_rate": 5.666666666666666e-07, "loss": 0.0, "reward": 0.687499999254942, "reward_std": 0.2334746764972806, "rewards/cbt_technique_reward_func": 0.11000000266358256, "rewards/mmkay_speech_pattern_reward_func": 0.19375000055879354, "rewards/question_asking_reward_func": 0.38374999538064003, "step": 50 }, { "completion_length": 130.02631578947367, "epoch": 1.0579710144927537, "grad_norm": 1.276727557182312, "kl": 0.0007787851224604406, "learning_rate": 5.111111111111111e-07, "loss": 0.0, "reward": 0.7000000014116889, "reward_std": 0.27186120026989985, "rewards/cbt_technique_reward_func": 0.15789473929295414, "rewards/mmkay_speech_pattern_reward_func": 0.1684210531805691, "rewards/question_asking_reward_func": 0.37368421099687876, "step": 55 }, { "completion_length": 124.0375, "epoch": 1.1545893719806763, "grad_norm": 1.4976823329925537, "kl": 0.0006855996092781424, "learning_rate": 4.555555555555555e-07, "loss": 0.0, "reward": 0.731249999627471, "reward_std": 0.2586277686059475, "rewards/cbt_technique_reward_func": 0.1425000037997961, "rewards/mmkay_speech_pattern_reward_func": 0.19749999791383743, "rewards/question_asking_reward_func": 0.39124999567866325, "step": 60 }, { "completion_length": 136.325, "epoch": 1.251207729468599, "grad_norm": 2.217841863632202, "kl": 0.0007303371050511487, "learning_rate": 4e-07, "loss": 0.0, "reward": 0.7212500154972077, "reward_std": 0.23724802657961847, "rewards/cbt_technique_reward_func": 0.11375000402331352, "rewards/mmkay_speech_pattern_reward_func": 0.17875000182539225, "rewards/question_asking_reward_func": 0.4287499986588955, "step": 65 }, { "completion_length": 127.1, "epoch": 1.3478260869565217, "grad_norm": 1.2720558643341064, "kl": 0.0008214380504796281, "learning_rate": 3.4444444444444444e-07, "loss": 0.0, "reward": 0.6512500032782554, "reward_std": 0.24096153806895018, "rewards/cbt_technique_reward_func": 0.12250000247731804, "rewards/mmkay_speech_pattern_reward_func": 0.13250000029802322, "rewards/question_asking_reward_func": 0.39624999538064004, "step": 70 }, { "completion_length": 123.4, "epoch": 1.4444444444444444, "grad_norm": 1.5885432958602905, "kl": 0.0008428851724602282, "learning_rate": 2.8888888888888885e-07, "loss": 0.0, "reward": 0.6575000032782554, "reward_std": 0.3075646057724953, "rewards/cbt_technique_reward_func": 0.14500000271946192, "rewards/mmkay_speech_pattern_reward_func": 0.14625000059604645, "rewards/question_asking_reward_func": 0.3662499986588955, "step": 75 }, { "completion_length": 122.525, "epoch": 1.541062801932367, "grad_norm": 1.6414885520935059, "kl": 0.0007202147302450612, "learning_rate": 2.3333333333333333e-07, "loss": 0.0, "reward": 0.6725000083446503, "reward_std": 0.2713221043348312, "rewards/cbt_technique_reward_func": 0.1287500030361116, "rewards/mmkay_speech_pattern_reward_func": 0.13625000175088645, "rewards/question_asking_reward_func": 0.40749999806284903, "step": 80 }, { "completion_length": 113.65, "epoch": 1.6376811594202898, "grad_norm": 1.197077751159668, "kl": 0.0007829821581253782, "learning_rate": 1.7777777777777776e-07, "loss": 0.0, "reward": 0.6862500173039734, "reward_std": 0.227858448587358, "rewards/cbt_technique_reward_func": 0.12375000417232514, "rewards/mmkay_speech_pattern_reward_func": 0.1512499988079071, "rewards/question_asking_reward_func": 0.4112499952316284, "step": 85 }, { "completion_length": 121.9, "epoch": 1.7342995169082127, "grad_norm": 0.9502215385437012, "kl": 0.000955963070737198, "learning_rate": 1.2222222222222222e-07, "loss": 0.0, "reward": 0.6025000005960465, "reward_std": 0.22266108132898807, "rewards/cbt_technique_reward_func": 0.11750000119209289, "rewards/mmkay_speech_pattern_reward_func": 0.09875000026077033, "rewards/question_asking_reward_func": 0.3862499982118607, "step": 90 }, { "completion_length": 123.5125, "epoch": 1.8309178743961354, "grad_norm": 1.4943199157714844, "kl": 0.0007219786857604049, "learning_rate": 6.666666666666667e-08, "loss": 0.0, "reward": 0.6962499976158142, "reward_std": 0.23088937066495419, "rewards/cbt_technique_reward_func": 0.1312500026077032, "rewards/mmkay_speech_pattern_reward_func": 0.14625000022351742, "rewards/question_asking_reward_func": 0.41874999478459357, "step": 95 }, { "completion_length": 125.6, "epoch": 1.927536231884058, "grad_norm": 1.0279828310012817, "kl": 0.000812371401116252, "learning_rate": 1.111111111111111e-08, "loss": 0.0, "reward": 0.7137500122189522, "reward_std": 0.27089230343699455, "rewards/cbt_technique_reward_func": 0.12625000337138773, "rewards/mmkay_speech_pattern_reward_func": 0.2049999987706542, "rewards/question_asking_reward_func": 0.38249999582767485, "step": 100 } ], "logging_steps": 5, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }