ppo_trainer_model / checkpoint-100 /trainer_state.json
zlyngkhoi's picture
Upload folder using huggingface_hub
ff3096d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"episode": 1600,
"epoch": 2.0,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"episode": 16,
"epoch": 0.02,
"eps": 0,
"loss/policy_avg": 0.0004486599937081337,
"loss/value_avg": 1.2938655614852905,
"lr": 0.0,
"objective/entropy": 162.11053466796875,
"objective/kl": 4.939833164215088,
"objective/non_score_reward": -0.24699166417121887,
"objective/rlhf_reward": -0.29166939854621887,
"objective/scores": -0.044677734375,
"policy/approxkl_avg": 0.001968675758689642,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.5513076782226562,
"step": 1,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9633879661560059,
"val/ratio_var": 3.8020948522898834e-07
},
{
"episode": 32,
"epoch": 0.04,
"eps": 0,
"loss/policy_avg": -0.0011649401858448982,
"loss/value_avg": 1.1032419204711914,
"lr": 1.0000000000000001e-07,
"objective/entropy": 185.68942260742188,
"objective/kl": 5.192228317260742,
"objective/non_score_reward": -0.25961142778396606,
"objective/rlhf_reward": -0.6131270527839661,
"objective/scores": -0.353515625,
"policy/approxkl_avg": 0.002094644121825695,
"policy/clipfrac_avg": 0.00146484375,
"policy/entropy_avg": 1.7179677486419678,
"step": 2,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9616182446479797,
"val/ratio_var": 2.1992273104842752e-06
},
{
"episode": 48,
"epoch": 0.06,
"eps": 0,
"loss/policy_avg": -0.00016614759806543589,
"loss/value_avg": 1.5101345777511597,
"lr": 2.0000000000000002e-07,
"objective/entropy": 174.795654296875,
"objective/kl": 5.0596089363098145,
"objective/non_score_reward": -0.25298047065734863,
"objective/rlhf_reward": -0.013722658157348633,
"objective/scores": 0.2392578125,
"policy/approxkl_avg": 0.0020569346379488707,
"policy/clipfrac_avg": 0.0018310546875,
"policy/entropy_avg": 1.5935308933258057,
"step": 3,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9618856906890869,
"val/ratio_var": 3.546992275005323e-06
},
{
"episode": 64,
"epoch": 0.08,
"eps": 0,
"loss/policy_avg": -0.0001529245637357235,
"loss/value_avg": 1.3132586479187012,
"lr": 3.0000000000000004e-07,
"objective/entropy": 167.1978759765625,
"objective/kl": 5.142940044403076,
"objective/non_score_reward": -0.25714701414108276,
"objective/rlhf_reward": -0.31622904539108276,
"objective/scores": -0.05908203125,
"policy/approxkl_avg": 0.002137545496225357,
"policy/clipfrac_avg": 0.0025634765625,
"policy/entropy_avg": 1.6127209663391113,
"step": 4,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9616750478744507,
"val/ratio_var": 1.2533583912954782e-06
},
{
"episode": 80,
"epoch": 0.1,
"eps": 0,
"loss/policy_avg": -0.00246006203815341,
"loss/value_avg": 1.1861101388931274,
"lr": 4.0000000000000003e-07,
"objective/entropy": 161.64456176757812,
"objective/kl": 4.87198543548584,
"objective/non_score_reward": -0.24359926581382751,
"objective/rlhf_reward": -0.3009723126888275,
"objective/scores": -0.057373046875,
"policy/approxkl_avg": 0.001990006770938635,
"policy/clipfrac_avg": 0.002197265625,
"policy/entropy_avg": 1.506896734237671,
"step": 5,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9635031819343567,
"val/ratio_var": 3.90803052141564e-06
},
{
"episode": 96,
"epoch": 0.12,
"eps": 0,
"loss/policy_avg": -0.0034578712657094,
"loss/value_avg": 1.237225890159607,
"lr": 5.000000000000001e-07,
"objective/entropy": 176.53646850585938,
"objective/kl": 5.119318962097168,
"objective/non_score_reward": -0.2559659481048584,
"objective/rlhf_reward": -0.2571256160736084,
"objective/scores": -0.00115966796875,
"policy/approxkl_avg": 0.0020507299341261387,
"policy/clipfrac_avg": 0.002197265625,
"policy/entropy_avg": 1.5803486108779907,
"step": 6,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9626491069793701,
"val/ratio_var": 4.622608685167506e-06
},
{
"episode": 112,
"epoch": 0.14,
"eps": 0,
"loss/policy_avg": -0.0025506531819701195,
"loss/value_avg": 1.258737325668335,
"lr": 6.000000000000001e-07,
"objective/entropy": 177.9249725341797,
"objective/kl": 5.401371002197266,
"objective/non_score_reward": -0.27006858587265015,
"objective/rlhf_reward": -0.5435060858726501,
"objective/scores": -0.2734375,
"policy/approxkl_avg": 0.0022585245314985514,
"policy/clipfrac_avg": 0.0029296875,
"policy/entropy_avg": 1.6439510583877563,
"step": 7,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9604523181915283,
"val/ratio_var": 6.208483227965189e-06
},
{
"episode": 128,
"epoch": 0.16,
"eps": 0,
"loss/policy_avg": -0.0017832191661000252,
"loss/value_avg": 1.2353310585021973,
"lr": 7.000000000000001e-07,
"objective/entropy": 175.4106903076172,
"objective/kl": 4.9585371017456055,
"objective/non_score_reward": -0.24792686104774475,
"objective/rlhf_reward": -0.37683311104774475,
"objective/scores": -0.12890625,
"policy/approxkl_avg": 0.002244518604129553,
"policy/clipfrac_avg": 0.003173828125,
"policy/entropy_avg": 1.6177117824554443,
"step": 8,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9618170261383057,
"val/ratio_var": 4.459182491700631e-06
},
{
"episode": 144,
"epoch": 0.18,
"eps": 0,
"loss/policy_avg": -0.002314523793756962,
"loss/value_avg": 1.3316550254821777,
"lr": 8.000000000000001e-07,
"objective/entropy": 175.06866455078125,
"objective/kl": 4.954601764678955,
"objective/non_score_reward": -0.2477300763130188,
"objective/rlhf_reward": -0.0358160138130188,
"objective/scores": 0.2119140625,
"policy/approxkl_avg": 0.002068200148642063,
"policy/clipfrac_avg": 0.0015869140625,
"policy/entropy_avg": 1.619426965713501,
"step": 9,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.962709367275238,
"val/ratio_var": 3.7464862998604076e-06
},
{
"episode": 160,
"epoch": 0.2,
"eps": 0,
"loss/policy_avg": -0.0006005736067891121,
"loss/value_avg": 1.2744617462158203,
"lr": 9.000000000000001e-07,
"objective/entropy": 169.3418731689453,
"objective/kl": 4.965663909912109,
"objective/non_score_reward": -0.24828319251537323,
"objective/rlhf_reward": 0.03491993248462677,
"objective/scores": 0.283203125,
"policy/approxkl_avg": 0.002044553868472576,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.5893105268478394,
"step": 10,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9636451601982117,
"val/ratio_var": 1.0843526069947984e-05
},
{
"episode": 176,
"epoch": 0.22,
"eps": 0,
"loss/policy_avg": -0.0016839192248880863,
"loss/value_avg": 1.1354484558105469,
"lr": 1.0000000000000002e-06,
"objective/entropy": 167.39401245117188,
"objective/kl": 5.0550994873046875,
"objective/non_score_reward": -0.25275495648384094,
"objective/rlhf_reward": -0.08673933148384094,
"objective/scores": 0.166015625,
"policy/approxkl_avg": 0.002176377223804593,
"policy/clipfrac_avg": 0.001708984375,
"policy/entropy_avg": 1.5987792015075684,
"step": 11,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9604842066764832,
"val/ratio_var": 1.695754804131866e-06
},
{
"episode": 192,
"epoch": 0.24,
"eps": 0,
"loss/policy_avg": -0.0015829752665013075,
"loss/value_avg": 1.138115406036377,
"lr": 1.1e-06,
"objective/entropy": 169.50271606445312,
"objective/kl": 5.478184223175049,
"objective/non_score_reward": -0.27390921115875244,
"objective/rlhf_reward": -0.43992483615875244,
"objective/scores": -0.166015625,
"policy/approxkl_avg": 0.00216104369610548,
"policy/clipfrac_avg": 0.0020751953125,
"policy/entropy_avg": 1.6336290836334229,
"step": 12,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9616599678993225,
"val/ratio_var": 1.954093249878497e-06
},
{
"episode": 208,
"epoch": 0.26,
"eps": 0,
"loss/policy_avg": -0.0012321844696998596,
"loss/value_avg": 1.1720128059387207,
"lr": 1.2000000000000002e-06,
"objective/entropy": 181.92147827148438,
"objective/kl": 5.243380546569824,
"objective/non_score_reward": -0.2621690332889557,
"objective/rlhf_reward": -0.2769395411014557,
"objective/scores": -0.0147705078125,
"policy/approxkl_avg": 0.002162383636459708,
"policy/clipfrac_avg": 0.0008544921875,
"policy/entropy_avg": 1.696028470993042,
"step": 13,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9608445763587952,
"val/ratio_var": 3.191448058714741e-06
},
{
"episode": 224,
"epoch": 0.28,
"eps": 0,
"loss/policy_avg": -0.00338327931240201,
"loss/value_avg": 1.1387133598327637,
"lr": 1.3e-06,
"objective/entropy": 185.0263671875,
"objective/kl": 5.307041168212891,
"objective/non_score_reward": -0.2653520703315735,
"objective/rlhf_reward": -0.5329301953315735,
"objective/scores": -0.267578125,
"policy/approxkl_avg": 0.0022570325527340174,
"policy/clipfrac_avg": 0.001953125,
"policy/entropy_avg": 1.674648642539978,
"step": 14,
"val/clipfrac_avg": 0.000244140625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9600518941879272,
"val/ratio_var": 1.6098074411274865e-06
},
{
"episode": 240,
"epoch": 0.3,
"eps": 0,
"loss/policy_avg": -0.0031057698652148247,
"loss/value_avg": 1.1971783638000488,
"lr": 1.4000000000000001e-06,
"objective/entropy": 168.69589233398438,
"objective/kl": 4.939919471740723,
"objective/non_score_reward": -0.2469959706068039,
"objective/rlhf_reward": -0.4647694230079651,
"objective/scores": -0.2177734375,
"policy/approxkl_avg": 0.0019079549238085747,
"policy/clipfrac_avg": 0.00146484375,
"policy/entropy_avg": 1.599722981452942,
"step": 15,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9643268585205078,
"val/ratio_var": 3.971512342104688e-06
},
{
"episode": 256,
"epoch": 0.32,
"eps": 0,
"loss/policy_avg": -0.0029412589501589537,
"loss/value_avg": 1.1075444221496582,
"lr": 1.5e-06,
"objective/entropy": 155.5027618408203,
"objective/kl": 4.750798225402832,
"objective/non_score_reward": -0.23753991723060608,
"objective/rlhf_reward": -0.2956453859806061,
"objective/scores": -0.05810546875,
"policy/approxkl_avg": 0.0019890139810740948,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.4853081703186035,
"step": 16,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9637153148651123,
"val/ratio_var": 1.7876334368338576e-06
},
{
"episode": 272,
"epoch": 0.34,
"eps": 0,
"loss/policy_avg": -0.0014371530851349235,
"loss/value_avg": 1.2808384895324707,
"lr": 1.6000000000000001e-06,
"objective/entropy": 174.11500549316406,
"objective/kl": 5.128355503082275,
"objective/non_score_reward": -0.25641775131225586,
"objective/rlhf_reward": -0.43610525131225586,
"objective/scores": -0.1796875,
"policy/approxkl_avg": 0.0020958627574145794,
"policy/clipfrac_avg": 0.001220703125,
"policy/entropy_avg": 1.6312119960784912,
"step": 17,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9622437953948975,
"val/ratio_var": 2.562965846664156e-06
},
{
"episode": 288,
"epoch": 0.36,
"eps": 0,
"loss/policy_avg": -0.0013770293444395065,
"loss/value_avg": 1.1316921710968018,
"lr": 1.7000000000000002e-06,
"objective/entropy": 177.11965942382812,
"objective/kl": 4.7733917236328125,
"objective/non_score_reward": -0.23866958916187286,
"objective/rlhf_reward": -0.36562269926071167,
"objective/scores": -0.126953125,
"policy/approxkl_avg": 0.0020855318289250135,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.652011513710022,
"step": 18,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9627939462661743,
"val/ratio_var": 2.3717864223726792e-06
},
{
"episode": 304,
"epoch": 0.38,
"eps": 0,
"loss/policy_avg": -0.002823261544108391,
"loss/value_avg": 1.0748326778411865,
"lr": 1.8000000000000001e-06,
"objective/entropy": 173.44908142089844,
"objective/kl": 4.939946174621582,
"objective/non_score_reward": -0.24699731171131134,
"objective/rlhf_reward": -0.31145042181015015,
"objective/scores": -0.064453125,
"policy/approxkl_avg": 0.002068548696115613,
"policy/clipfrac_avg": 0.001953125,
"policy/entropy_avg": 1.6042956113815308,
"step": 19,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9631877541542053,
"val/ratio_var": 7.887209903856274e-06
},
{
"episode": 320,
"epoch": 0.4,
"eps": 0,
"loss/policy_avg": -0.0016451161354780197,
"loss/value_avg": 1.1182610988616943,
"lr": 1.9000000000000002e-06,
"objective/entropy": 161.34744262695312,
"objective/kl": 5.098767280578613,
"objective/non_score_reward": -0.25493836402893066,
"objective/rlhf_reward": -0.49419617652893066,
"objective/scores": -0.2392578125,
"policy/approxkl_avg": 0.001963268965482712,
"policy/clipfrac_avg": 0.0015869140625,
"policy/entropy_avg": 1.510756254196167,
"step": 20,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9648376107215881,
"val/ratio_var": 5.044429599365685e-06
},
{
"episode": 336,
"epoch": 0.42,
"eps": 0,
"loss/policy_avg": -0.0013257116079330444,
"loss/value_avg": 1.215050220489502,
"lr": 2.0000000000000003e-06,
"objective/entropy": 155.19345092773438,
"objective/kl": 5.083975791931152,
"objective/non_score_reward": -0.2541987895965576,
"objective/rlhf_reward": -0.3499019145965576,
"objective/scores": -0.095703125,
"policy/approxkl_avg": 0.002038246486335993,
"policy/clipfrac_avg": 0.0023193359375,
"policy/entropy_avg": 1.447933316230774,
"step": 21,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9646758437156677,
"val/ratio_var": 2.349487203900935e-06
},
{
"episode": 352,
"epoch": 0.44,
"eps": 0,
"loss/policy_avg": -0.0019622594118118286,
"loss/value_avg": 0.9825942516326904,
"lr": 2.1000000000000002e-06,
"objective/entropy": 184.58116149902344,
"objective/kl": 5.247394561767578,
"objective/non_score_reward": -0.26236969232559204,
"objective/rlhf_reward": -0.41471344232559204,
"objective/scores": -0.15234375,
"policy/approxkl_avg": 0.002102417405694723,
"policy/clipfrac_avg": 0.0015869140625,
"policy/entropy_avg": 1.712675929069519,
"step": 22,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9619762897491455,
"val/ratio_var": 2.346512928852462e-06
},
{
"episode": 368,
"epoch": 0.46,
"eps": 0,
"loss/policy_avg": -0.0008872179314494133,
"loss/value_avg": 1.2242329120635986,
"lr": 2.2e-06,
"objective/entropy": 158.87860107421875,
"objective/kl": 4.657800197601318,
"objective/non_score_reward": -0.23289000988006592,
"objective/rlhf_reward": -0.3383587598800659,
"objective/scores": -0.10546875,
"policy/approxkl_avg": 0.0019458475289866328,
"policy/clipfrac_avg": 0.0018310546875,
"policy/entropy_avg": 1.4549307823181152,
"step": 23,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.966155469417572,
"val/ratio_var": 2.9357854600675637e-06
},
{
"episode": 384,
"epoch": 0.48,
"eps": 0,
"loss/policy_avg": -0.0034685591235756874,
"loss/value_avg": 1.0394468307495117,
"lr": 2.3000000000000004e-06,
"objective/entropy": 164.78189086914062,
"objective/kl": 4.869457244873047,
"objective/non_score_reward": -0.2434728741645813,
"objective/rlhf_reward": -0.7161291241645813,
"objective/scores": -0.47265625,
"policy/approxkl_avg": 0.002021776745095849,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.4984092712402344,
"step": 24,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9638509750366211,
"val/ratio_var": 1.510236529611575e-06
},
{
"episode": 400,
"epoch": 0.5,
"eps": 0,
"loss/policy_avg": -0.0017958339303731918,
"loss/value_avg": 1.0017895698547363,
"lr": 2.4000000000000003e-06,
"objective/entropy": 155.18121337890625,
"objective/kl": 4.872950553894043,
"objective/non_score_reward": -0.2436475157737732,
"objective/rlhf_reward": 0.0004931092262268066,
"objective/scores": 0.244140625,
"policy/approxkl_avg": 0.002069193869829178,
"policy/clipfrac_avg": 0.001708984375,
"policy/entropy_avg": 1.506508469581604,
"step": 25,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9632376432418823,
"val/ratio_var": 2.6072054879477946e-06
},
{
"episode": 416,
"epoch": 0.52,
"eps": 0,
"loss/policy_avg": -0.0009807262104004622,
"loss/value_avg": 0.9018648862838745,
"lr": 2.5e-06,
"objective/entropy": 176.93988037109375,
"objective/kl": 5.13535213470459,
"objective/non_score_reward": -0.256767600774765,
"objective/rlhf_reward": -0.24211916327476501,
"objective/scores": 0.0146484375,
"policy/approxkl_avg": 0.0020367163233458996,
"policy/clipfrac_avg": 0.001953125,
"policy/entropy_avg": 1.5867078304290771,
"step": 26,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.962630569934845,
"val/ratio_var": 6.3079619394557085e-06
},
{
"episode": 432,
"epoch": 0.54,
"eps": 0,
"loss/policy_avg": -0.0008640075102448463,
"loss/value_avg": 1.1066097021102905,
"lr": 2.6e-06,
"objective/entropy": 162.58128356933594,
"objective/kl": 4.76553201675415,
"objective/non_score_reward": -0.23827658593654633,
"objective/rlhf_reward": -0.5585891008377075,
"objective/scores": -0.3203125,
"policy/approxkl_avg": 0.0020129948388785124,
"policy/clipfrac_avg": 0.0030517578125,
"policy/entropy_avg": 1.5091907978057861,
"step": 27,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9642030000686646,
"val/ratio_var": 3.215169272152707e-06
},
{
"episode": 448,
"epoch": 0.56,
"eps": 0,
"loss/policy_avg": -0.003287755884230137,
"loss/value_avg": 0.9977786540985107,
"lr": 2.7000000000000004e-06,
"objective/entropy": 156.29837036132812,
"objective/kl": 4.870822906494141,
"objective/non_score_reward": -0.2435411512851715,
"objective/rlhf_reward": -0.2541612684726715,
"objective/scores": -0.0106201171875,
"policy/approxkl_avg": 0.001957303611561656,
"policy/clipfrac_avg": 0.0013427734375,
"policy/entropy_avg": 1.4713962078094482,
"step": 28,
"val/clipfrac_avg": 0.000244140625,
"val/num_eos_tokens": 0,
"val/ratio": 0.965004563331604,
"val/ratio_var": 2.187839299949701e-06
},
{
"episode": 464,
"epoch": 0.58,
"eps": 0,
"loss/policy_avg": -0.0021433092188090086,
"loss/value_avg": 0.9118539094924927,
"lr": 2.8000000000000003e-06,
"objective/entropy": 163.495849609375,
"objective/kl": 4.88909912109375,
"objective/non_score_reward": -0.24445496499538422,
"objective/rlhf_reward": -0.017892464995384216,
"objective/scores": 0.2265625,
"policy/approxkl_avg": 0.0021847893949598074,
"policy/clipfrac_avg": 0.003173828125,
"policy/entropy_avg": 1.5781747102737427,
"step": 29,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9634232521057129,
"val/ratio_var": 4.35601987192058e-06
},
{
"episode": 480,
"epoch": 0.6,
"eps": 0,
"loss/policy_avg": -0.0023626741021871567,
"loss/value_avg": 0.9815002679824829,
"lr": 2.9e-06,
"objective/entropy": 167.91629028320312,
"objective/kl": 4.972644805908203,
"objective/non_score_reward": -0.2486322671175003,
"objective/rlhf_reward": -0.6919916272163391,
"objective/scores": -0.443359375,
"policy/approxkl_avg": 0.002145718550309539,
"policy/clipfrac_avg": 0.00146484375,
"policy/entropy_avg": 1.5622011423110962,
"step": 30,
"val/clipfrac_avg": 0.0006103515625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9621492028236389,
"val/ratio_var": 5.3428561841428746e-06
},
{
"episode": 496,
"epoch": 0.62,
"eps": 0,
"loss/policy_avg": -0.003964821808040142,
"loss/value_avg": 0.8878471851348877,
"lr": 3e-06,
"objective/entropy": 166.8218994140625,
"objective/kl": 5.199556827545166,
"objective/non_score_reward": -0.2599778473377228,
"objective/rlhf_reward": -0.4894700348377228,
"objective/scores": -0.2294921875,
"policy/approxkl_avg": 0.00210373941808939,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.568078875541687,
"step": 31,
"val/clipfrac_avg": 0.0006103515625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9622396230697632,
"val/ratio_var": 5.290597528073704e-06
},
{
"episode": 512,
"epoch": 0.64,
"eps": 0,
"loss/policy_avg": -0.0024933372624218464,
"loss/value_avg": 0.7922680377960205,
"lr": 3.1000000000000004e-06,
"objective/entropy": 176.921875,
"objective/kl": 5.756374359130859,
"objective/non_score_reward": -0.2878187298774719,
"objective/rlhf_reward": -0.3390882611274719,
"objective/scores": -0.05126953125,
"policy/approxkl_avg": 0.002244055736809969,
"policy/clipfrac_avg": 0.002197265625,
"policy/entropy_avg": 1.6759358644485474,
"step": 32,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9592071175575256,
"val/ratio_var": 4.2995147850888316e-06
},
{
"episode": 528,
"epoch": 0.66,
"eps": 0,
"loss/policy_avg": -0.0017838962376117706,
"loss/value_avg": 0.7303538918495178,
"lr": 3.2000000000000003e-06,
"objective/entropy": 175.94744873046875,
"objective/kl": 5.338512420654297,
"objective/non_score_reward": -0.2669256329536438,
"objective/rlhf_reward": -0.2965887188911438,
"objective/scores": -0.0296630859375,
"policy/approxkl_avg": 0.002238932531327009,
"policy/clipfrac_avg": 0.00244140625,
"policy/entropy_avg": 1.6415915489196777,
"step": 33,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9602410793304443,
"val/ratio_var": 1.0174014732911019e-06
},
{
"episode": 544,
"epoch": 0.68,
"eps": 0,
"loss/policy_avg": -0.002395186573266983,
"loss/value_avg": 0.9008848667144775,
"lr": 3.3000000000000006e-06,
"objective/entropy": 159.09799194335938,
"objective/kl": 4.972096920013428,
"objective/non_score_reward": -0.24860484898090363,
"objective/rlhf_reward": -0.012276723980903625,
"objective/scores": 0.236328125,
"policy/approxkl_avg": 0.002032281132414937,
"policy/clipfrac_avg": 0.0025634765625,
"policy/entropy_avg": 1.5187857151031494,
"step": 34,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.963579535484314,
"val/ratio_var": 1.3504248954632203e-06
},
{
"episode": 560,
"epoch": 0.7,
"eps": 0,
"loss/policy_avg": -0.0040529826655983925,
"loss/value_avg": 0.738201916217804,
"lr": 3.4000000000000005e-06,
"objective/entropy": 189.93099975585938,
"objective/kl": 5.9919657707214355,
"objective/non_score_reward": -0.2995982766151428,
"objective/rlhf_reward": -0.8152232766151428,
"objective/scores": -0.515625,
"policy/approxkl_avg": 0.0024048539344221354,
"policy/clipfrac_avg": 0.0037841796875,
"policy/entropy_avg": 1.7418715953826904,
"step": 35,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9581993222236633,
"val/ratio_var": 2.2299211650533834e-06
},
{
"episode": 576,
"epoch": 0.72,
"eps": 0,
"loss/policy_avg": -0.003544020466506481,
"loss/value_avg": 0.7707731127738953,
"lr": 3.5e-06,
"objective/entropy": 159.47125244140625,
"objective/kl": 4.8106207847595215,
"objective/non_score_reward": -0.2405310571193695,
"objective/rlhf_reward": -0.6545935869216919,
"objective/scores": -0.4140625,
"policy/approxkl_avg": 0.002112124115228653,
"policy/clipfrac_avg": 0.003173828125,
"policy/entropy_avg": 1.4905831813812256,
"step": 36,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9643076658248901,
"val/ratio_var": 7.036061106191482e-06
},
{
"episode": 592,
"epoch": 0.74,
"eps": 0,
"loss/policy_avg": -0.0030114920809865,
"loss/value_avg": 0.6595614552497864,
"lr": 3.6000000000000003e-06,
"objective/entropy": 180.1719970703125,
"objective/kl": 5.4066996574401855,
"objective/non_score_reward": -0.27033501863479614,
"objective/rlhf_reward": -0.31232720613479614,
"objective/scores": -0.0419921875,
"policy/approxkl_avg": 0.0021726740524172783,
"policy/clipfrac_avg": 0.001708984375,
"policy/entropy_avg": 1.7136811017990112,
"step": 37,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9603984355926514,
"val/ratio_var": 3.5773789477389073e-06
},
{
"episode": 608,
"epoch": 0.76,
"eps": 0,
"loss/policy_avg": -0.002592116594314575,
"loss/value_avg": 0.5918077230453491,
"lr": 3.7e-06,
"objective/entropy": 167.25192260742188,
"objective/kl": 4.988465309143066,
"objective/non_score_reward": -0.24942323565483093,
"objective/rlhf_reward": -0.19009706377983093,
"objective/scores": 0.059326171875,
"policy/approxkl_avg": 0.0021313969045877457,
"policy/clipfrac_avg": 0.003662109375,
"policy/entropy_avg": 1.5170129537582397,
"step": 38,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9635162353515625,
"val/ratio_var": 4.518392870522803e-06
},
{
"episode": 624,
"epoch": 0.78,
"eps": 0,
"loss/policy_avg": -0.0039001721888780594,
"loss/value_avg": 0.7430535554885864,
"lr": 3.8000000000000005e-06,
"objective/entropy": 180.25039672851562,
"objective/kl": 5.839836597442627,
"objective/non_score_reward": -0.29199182987213135,
"objective/rlhf_reward": -0.39160120487213135,
"objective/scores": -0.099609375,
"policy/approxkl_avg": 0.002172058681026101,
"policy/clipfrac_avg": 0.003662109375,
"policy/entropy_avg": 1.696380853652954,
"step": 39,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.962264895439148,
"val/ratio_var": 3.2267200822388986e-06
},
{
"episode": 640,
"epoch": 0.8,
"eps": 0,
"loss/policy_avg": -0.0031222254037857056,
"loss/value_avg": 0.6954755783081055,
"lr": 3.900000000000001e-06,
"objective/entropy": 147.38571166992188,
"objective/kl": 5.248948097229004,
"objective/non_score_reward": -0.26244741678237915,
"objective/rlhf_reward": -0.41576772928237915,
"objective/scores": -0.1533203125,
"policy/approxkl_avg": 0.0020004287362098694,
"policy/clipfrac_avg": 0.0028076171875,
"policy/entropy_avg": 1.4099265336990356,
"step": 40,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.964896559715271,
"val/ratio_var": 5.165110906091286e-06
},
{
"episode": 656,
"epoch": 0.82,
"eps": 0,
"loss/policy_avg": -0.0036731180734932423,
"loss/value_avg": 0.6531814336776733,
"lr": 4.000000000000001e-06,
"objective/entropy": 186.69369506835938,
"objective/kl": 5.787052154541016,
"objective/non_score_reward": -0.2893525958061218,
"objective/rlhf_reward": -0.15849322080612183,
"objective/scores": 0.130859375,
"policy/approxkl_avg": 0.002225282369181514,
"policy/clipfrac_avg": 0.0023193359375,
"policy/entropy_avg": 1.731877088546753,
"step": 41,
"val/clipfrac_avg": 0.0006103515625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9604864120483398,
"val/ratio_var": 2.4040532480285037e-06
},
{
"episode": 672,
"epoch": 0.84,
"eps": 0,
"loss/policy_avg": -0.003208685666322708,
"loss/value_avg": 0.4511321187019348,
"lr": 4.1e-06,
"objective/entropy": 193.63333129882812,
"objective/kl": 5.826928615570068,
"objective/non_score_reward": -0.2913464307785034,
"objective/rlhf_reward": -0.6956433057785034,
"objective/scores": -0.404296875,
"policy/approxkl_avg": 0.0021463697776198387,
"policy/clipfrac_avg": 0.0028076171875,
"policy/entropy_avg": 1.808129906654358,
"step": 42,
"val/clipfrac_avg": 0.000244140625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9606460928916931,
"val/ratio_var": 1.945327767316485e-06
},
{
"episode": 688,
"epoch": 0.86,
"eps": 0,
"loss/policy_avg": -0.00527562340721488,
"loss/value_avg": 0.5236003398895264,
"lr": 4.2000000000000004e-06,
"objective/entropy": 200.14642333984375,
"objective/kl": 6.3243584632873535,
"objective/non_score_reward": -0.31621789932250977,
"objective/rlhf_reward": -0.21416711807250977,
"objective/scores": 0.10205078125,
"policy/approxkl_avg": 0.002284294692799449,
"policy/clipfrac_avg": 0.002685546875,
"policy/entropy_avg": 1.8296796083450317,
"step": 43,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9591376781463623,
"val/ratio_var": 3.582605586416321e-06
},
{
"episode": 704,
"epoch": 0.88,
"eps": 0,
"loss/policy_avg": -0.004552279599010944,
"loss/value_avg": 0.44474709033966064,
"lr": 4.3e-06,
"objective/entropy": 198.2988739013672,
"objective/kl": 6.11703634262085,
"objective/non_score_reward": -0.3058518171310425,
"objective/rlhf_reward": -0.3976486921310425,
"objective/scores": -0.091796875,
"policy/approxkl_avg": 0.002352041658014059,
"policy/clipfrac_avg": 0.0030517578125,
"policy/entropy_avg": 1.8390659093856812,
"step": 44,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9594440460205078,
"val/ratio_var": 1.183144263450231e-06
},
{
"episode": 720,
"epoch": 0.9,
"eps": 0,
"loss/policy_avg": -0.0043452465906739235,
"loss/value_avg": 0.5411888360977173,
"lr": 4.4e-06,
"objective/entropy": 176.61441040039062,
"objective/kl": 6.0966949462890625,
"objective/non_score_reward": -0.30483478307724,
"objective/rlhf_reward": -0.35756915807724,
"objective/scores": -0.052734375,
"policy/approxkl_avg": 0.0021276341285556555,
"policy/clipfrac_avg": 0.0040283203125,
"policy/entropy_avg": 1.6279263496398926,
"step": 45,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9633927345275879,
"val/ratio_var": 3.1293473057303345e-06
},
{
"episode": 736,
"epoch": 0.92,
"eps": 0,
"loss/policy_avg": -0.004358572885394096,
"loss/value_avg": 0.42890092730522156,
"lr": 4.5e-06,
"objective/entropy": 184.73455810546875,
"objective/kl": 6.081456184387207,
"objective/non_score_reward": -0.3040727972984314,
"objective/rlhf_reward": -0.4119829535484314,
"objective/scores": -0.10791015625,
"policy/approxkl_avg": 0.0021303845569491386,
"policy/clipfrac_avg": 0.001953125,
"policy/entropy_avg": 1.743346929550171,
"step": 46,
"val/clipfrac_avg": 0.00048828125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9609779119491577,
"val/ratio_var": 2.9215109407232376e-06
},
{
"episode": 752,
"epoch": 0.94,
"eps": 0,
"loss/policy_avg": -0.00496285455301404,
"loss/value_avg": 0.4583837687969208,
"lr": 4.600000000000001e-06,
"objective/entropy": 193.1529541015625,
"objective/kl": 6.68864631652832,
"objective/non_score_reward": -0.33443236351013184,
"objective/rlhf_reward": -0.22603392601013184,
"objective/scores": 0.1083984375,
"policy/approxkl_avg": 0.002309663686901331,
"policy/clipfrac_avg": 0.0028076171875,
"policy/entropy_avg": 1.819280743598938,
"step": 47,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.958611011505127,
"val/ratio_var": 3.0761455036554253e-06
},
{
"episode": 768,
"epoch": 0.96,
"eps": 0,
"loss/policy_avg": -0.0036918839905411005,
"loss/value_avg": 0.5343762636184692,
"lr": 4.7e-06,
"objective/entropy": 172.60386657714844,
"objective/kl": 5.954087734222412,
"objective/non_score_reward": -0.29770439863204956,
"objective/rlhf_reward": -0.9109856486320496,
"objective/scores": -0.61328125,
"policy/approxkl_avg": 0.0021302001550793648,
"policy/clipfrac_avg": 0.0029296875,
"policy/entropy_avg": 1.6243486404418945,
"step": 48,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9628680944442749,
"val/ratio_var": 7.764373549434822e-06
},
{
"episode": 784,
"epoch": 0.98,
"eps": 0,
"loss/policy_avg": -0.0063984692096710205,
"loss/value_avg": 0.4495910108089447,
"lr": 4.800000000000001e-06,
"objective/entropy": 177.82730102539062,
"objective/kl": 6.320567607879639,
"objective/non_score_reward": -0.3160283863544464,
"objective/rlhf_reward": -0.685168981552124,
"objective/scores": -0.369140625,
"policy/approxkl_avg": 0.002262189518660307,
"policy/clipfrac_avg": 0.00341796875,
"policy/entropy_avg": 1.6700631380081177,
"step": 49,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9607704877853394,
"val/ratio_var": 6.711939931847155e-06
},
{
"episode": 800,
"epoch": 1.0,
"eps": 0,
"loss/policy_avg": -0.007105860859155655,
"loss/value_avg": 0.3898148536682129,
"lr": 4.9000000000000005e-06,
"objective/entropy": 202.3233184814453,
"objective/kl": 7.163693428039551,
"objective/non_score_reward": -0.35818469524383545,
"objective/rlhf_reward": -0.45291125774383545,
"objective/scores": -0.0947265625,
"policy/approxkl_avg": 0.0024232997093349695,
"policy/clipfrac_avg": 0.0042724609375,
"policy/entropy_avg": 1.8700368404388428,
"step": 50,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9586131572723389,
"val/ratio_var": 2.8943763936695177e-06
},
{
"episode": 816,
"epoch": 1.02,
"eps": 0,
"loss/policy_avg": -0.003659446258097887,
"loss/value_avg": 0.36625581979751587,
"lr": 5e-06,
"objective/entropy": 202.9866180419922,
"objective/kl": 8.40648078918457,
"objective/non_score_reward": -0.42032405734062195,
"objective/rlhf_reward": -0.7484490871429443,
"objective/scores": -0.328125,
"policy/approxkl_avg": 0.002428454579785466,
"policy/clipfrac_avg": 0.0023193359375,
"policy/entropy_avg": 1.9231579303741455,
"step": 51,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.958379864692688,
"val/ratio_var": 6.084300139264087e-07
},
{
"episode": 832,
"epoch": 1.04,
"eps": 0,
"loss/policy_avg": -0.006424235180020332,
"loss/value_avg": 0.4322627782821655,
"lr": 5.1e-06,
"objective/entropy": 188.87704467773438,
"objective/kl": 6.012404441833496,
"objective/non_score_reward": -0.30062025785446167,
"objective/rlhf_reward": -0.34749525785446167,
"objective/scores": -0.046875,
"policy/approxkl_avg": 0.0023025020491331816,
"policy/clipfrac_avg": 0.0025634765625,
"policy/entropy_avg": 1.7421900033950806,
"step": 52,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9606554508209229,
"val/ratio_var": 3.889582330884878e-06
},
{
"episode": 848,
"epoch": 1.06,
"eps": 0,
"loss/policy_avg": -0.0050460826605558395,
"loss/value_avg": 0.4796571731567383,
"lr": 5.2e-06,
"objective/entropy": 164.9066162109375,
"objective/kl": 5.981889724731445,
"objective/non_score_reward": -0.2990944981575012,
"objective/rlhf_reward": -0.6916726231575012,
"objective/scores": -0.392578125,
"policy/approxkl_avg": 0.0021395045332610607,
"policy/clipfrac_avg": 0.0048828125,
"policy/entropy_avg": 1.5888962745666504,
"step": 53,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9628995656967163,
"val/ratio_var": 4.597809947881615e-06
},
{
"episode": 864,
"epoch": 1.08,
"eps": 0,
"loss/policy_avg": -0.006381146609783173,
"loss/value_avg": 0.3751007616519928,
"lr": 5.300000000000001e-06,
"objective/entropy": 197.0960693359375,
"objective/kl": 6.693915843963623,
"objective/non_score_reward": -0.3346957862377167,
"objective/rlhf_reward": -0.7624301910400391,
"objective/scores": -0.427734375,
"policy/approxkl_avg": 0.0023175738751888275,
"policy/clipfrac_avg": 0.004150390625,
"policy/entropy_avg": 1.8260283470153809,
"step": 54,
"val/clipfrac_avg": 0.000244140625,
"val/num_eos_tokens": 0,
"val/ratio": 0.9607384204864502,
"val/ratio_var": 3.528241450112546e-06
},
{
"episode": 880,
"epoch": 1.1,
"eps": 0,
"loss/policy_avg": -0.005390330217778683,
"loss/value_avg": 0.3408937454223633,
"lr": 5.400000000000001e-06,
"objective/entropy": 189.24441528320312,
"objective/kl": 7.497405052185059,
"objective/non_score_reward": -0.374870240688324,
"objective/rlhf_reward": -0.493034303188324,
"objective/scores": -0.1181640625,
"policy/approxkl_avg": 0.0023757517337799072,
"policy/clipfrac_avg": 0.0042724609375,
"policy/entropy_avg": 1.729443073272705,
"step": 55,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9610804319381714,
"val/ratio_var": 5.716868599847658e-06
},
{
"episode": 896,
"epoch": 1.12,
"eps": 0,
"loss/policy_avg": -0.009562751278281212,
"loss/value_avg": 0.34088462591171265,
"lr": 5.500000000000001e-06,
"objective/entropy": 186.15057373046875,
"objective/kl": 7.024878025054932,
"objective/non_score_reward": -0.35124391317367554,
"objective/rlhf_reward": -0.31706422567367554,
"objective/scores": 0.0341796875,
"policy/approxkl_avg": 0.002572981407865882,
"policy/clipfrac_avg": 0.0069580078125,
"policy/entropy_avg": 1.7276947498321533,
"step": 56,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9590590000152588,
"val/ratio_var": 3.823240604106104e-06
},
{
"episode": 912,
"epoch": 1.14,
"eps": 0,
"loss/policy_avg": -0.006179399322718382,
"loss/value_avg": 0.3504621088504791,
"lr": 5.600000000000001e-06,
"objective/entropy": 203.68405151367188,
"objective/kl": 7.05451774597168,
"objective/non_score_reward": -0.3527258634567261,
"objective/rlhf_reward": -0.2843664884567261,
"objective/scores": 0.068359375,
"policy/approxkl_avg": 0.0024855139199644327,
"policy/clipfrac_avg": 0.0064697265625,
"policy/entropy_avg": 1.8781225681304932,
"step": 57,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9579586982727051,
"val/ratio_var": 6.8513472797349095e-06
},
{
"episode": 928,
"epoch": 1.16,
"eps": 0,
"loss/policy_avg": -0.006329299416393042,
"loss/value_avg": 0.2883201241493225,
"lr": 5.7e-06,
"objective/entropy": 188.50772094726562,
"objective/kl": 7.495998382568359,
"objective/non_score_reward": -0.3747999370098114,
"objective/rlhf_reward": -0.3398878276348114,
"objective/scores": 0.034912109375,
"policy/approxkl_avg": 0.002424823120236397,
"policy/clipfrac_avg": 0.0064697265625,
"policy/entropy_avg": 1.7669949531555176,
"step": 58,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9617897272109985,
"val/ratio_var": 3.0927415082260268e-06
},
{
"episode": 944,
"epoch": 1.18,
"eps": 0,
"loss/policy_avg": -0.008085895329713821,
"loss/value_avg": 0.30470138788223267,
"lr": 5.8e-06,
"objective/entropy": 190.35797119140625,
"objective/kl": 8.07366943359375,
"objective/non_score_reward": -0.40368348360061646,
"objective/rlhf_reward": -0.6810272336006165,
"objective/scores": -0.27734375,
"policy/approxkl_avg": 0.002423493890091777,
"policy/clipfrac_avg": 0.005859375,
"policy/entropy_avg": 1.745201826095581,
"step": 59,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9606545567512512,
"val/ratio_var": 2.6997802251571557e-06
},
{
"episode": 960,
"epoch": 1.2,
"eps": 0,
"loss/policy_avg": -0.0073398323729634285,
"loss/value_avg": 0.2474157214164734,
"lr": 5.9e-06,
"objective/entropy": 186.44775390625,
"objective/kl": 7.319401741027832,
"objective/non_score_reward": -0.36597010493278503,
"objective/rlhf_reward": -0.32300135493278503,
"objective/scores": 0.04296875,
"policy/approxkl_avg": 0.0022131437435746193,
"policy/clipfrac_avg": 0.0037841796875,
"policy/entropy_avg": 1.7592250108718872,
"step": 60,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9618268609046936,
"val/ratio_var": 1.1245646192037384e-06
},
{
"episode": 976,
"epoch": 1.22,
"eps": 0,
"loss/policy_avg": -0.008636923506855965,
"loss/value_avg": 0.3011922240257263,
"lr": 6e-06,
"objective/entropy": 194.01901245117188,
"objective/kl": 8.093948364257812,
"objective/non_score_reward": -0.404697448015213,
"objective/rlhf_reward": -0.5292091369628906,
"objective/scores": -0.12451171875,
"policy/approxkl_avg": 0.002539466368034482,
"policy/clipfrac_avg": 0.0064697265625,
"policy/entropy_avg": 1.7535340785980225,
"step": 61,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9611643552780151,
"val/ratio_var": 4.485158569877967e-06
},
{
"episode": 992,
"epoch": 1.24,
"eps": 0,
"loss/policy_avg": -0.008942335844039917,
"loss/value_avg": 0.314566433429718,
"lr": 6.1e-06,
"objective/entropy": 207.50445556640625,
"objective/kl": 9.318485260009766,
"objective/non_score_reward": -0.4659242033958435,
"objective/rlhf_reward": -0.6280335783958435,
"objective/scores": -0.162109375,
"policy/approxkl_avg": 0.002469500759616494,
"policy/clipfrac_avg": 0.0057373046875,
"policy/entropy_avg": 1.886934757232666,
"step": 62,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9599511027336121,
"val/ratio_var": 1.6062762142610154e-06
},
{
"episode": 1008,
"epoch": 1.26,
"eps": 0,
"loss/policy_avg": -0.0077853333204984665,
"loss/value_avg": 0.2659590542316437,
"lr": 6.200000000000001e-06,
"objective/entropy": 184.21324157714844,
"objective/kl": 8.50517463684082,
"objective/non_score_reward": -0.4252587556838989,
"objective/rlhf_reward": -0.6410790681838989,
"objective/scores": -0.2158203125,
"policy/approxkl_avg": 0.0023279902525246143,
"policy/clipfrac_avg": 0.0050048828125,
"policy/entropy_avg": 1.704209566116333,
"step": 63,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9618266820907593,
"val/ratio_var": 2.3681529910390964e-06
},
{
"episode": 1024,
"epoch": 1.28,
"eps": 0,
"loss/policy_avg": -0.008203014731407166,
"loss/value_avg": 0.24721446633338928,
"lr": 6.300000000000001e-06,
"objective/entropy": 167.58615112304688,
"objective/kl": 8.397467613220215,
"objective/non_score_reward": -0.4198733866214752,
"objective/rlhf_reward": -0.5258303880691528,
"objective/scores": -0.10595703125,
"policy/approxkl_avg": 0.0023372513242065907,
"policy/clipfrac_avg": 0.00634765625,
"policy/entropy_avg": 1.590405821800232,
"step": 64,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9629535675048828,
"val/ratio_var": 1.0923018862740719e-06
},
{
"episode": 1040,
"epoch": 1.3,
"eps": 0,
"loss/policy_avg": -0.010154301300644875,
"loss/value_avg": 0.255871057510376,
"lr": 6.4000000000000006e-06,
"objective/entropy": 187.9450225830078,
"objective/kl": 8.685728073120117,
"objective/non_score_reward": -0.4342864155769348,
"objective/rlhf_reward": -0.4543059468269348,
"objective/scores": -0.02001953125,
"policy/approxkl_avg": 0.0025775341782718897,
"policy/clipfrac_avg": 0.0072021484375,
"policy/entropy_avg": 1.717556118965149,
"step": 65,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9599798917770386,
"val/ratio_var": 3.966343228967162e-06
},
{
"episode": 1056,
"epoch": 1.32,
"eps": 0,
"loss/policy_avg": -0.006520813796669245,
"loss/value_avg": 0.20695430040359497,
"lr": 6.5000000000000004e-06,
"objective/entropy": 171.31948852539062,
"objective/kl": 8.512733459472656,
"objective/non_score_reward": -0.4256366491317749,
"objective/rlhf_reward": -0.7537616491317749,
"objective/scores": -0.328125,
"policy/approxkl_avg": 0.0022518050391227007,
"policy/clipfrac_avg": 0.0069580078125,
"policy/entropy_avg": 1.5771468877792358,
"step": 66,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9653177857398987,
"val/ratio_var": 5.141545898368349e-06
},
{
"episode": 1072,
"epoch": 1.34,
"eps": 0,
"loss/policy_avg": -0.007219092920422554,
"loss/value_avg": 0.21666979789733887,
"lr": 6.600000000000001e-06,
"objective/entropy": 182.22146606445312,
"objective/kl": 10.377399444580078,
"objective/non_score_reward": -0.518869936466217,
"objective/rlhf_reward": -1.4524636268615723,
"objective/scores": -0.93359375,
"policy/approxkl_avg": 0.0024627677630633116,
"policy/clipfrac_avg": 0.0068359375,
"policy/entropy_avg": 1.7197209596633911,
"step": 67,
"val/clipfrac_avg": 0.0040283203125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9610635042190552,
"val/ratio_var": 2.259886286992696e-06
},
{
"episode": 1088,
"epoch": 1.36,
"eps": 0,
"loss/policy_avg": -0.009395209141075611,
"loss/value_avg": 0.22811496257781982,
"lr": 6.700000000000001e-06,
"objective/entropy": 184.89671325683594,
"objective/kl": 9.966436386108398,
"objective/non_score_reward": -0.4983218014240265,
"objective/rlhf_reward": -0.5730288028717041,
"objective/scores": -0.07470703125,
"policy/approxkl_avg": 0.002558775944635272,
"policy/clipfrac_avg": 0.0076904296875,
"policy/entropy_avg": 1.7344484329223633,
"step": 68,
"val/clipfrac_avg": 0.0013427734375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9611660242080688,
"val/ratio_var": 4.669778263632907e-06
},
{
"episode": 1104,
"epoch": 1.38,
"eps": 0,
"loss/policy_avg": -0.011558989062905312,
"loss/value_avg": 0.2556169629096985,
"lr": 6.800000000000001e-06,
"objective/entropy": 185.43399047851562,
"objective/kl": 9.309539794921875,
"objective/non_score_reward": -0.46547698974609375,
"objective/rlhf_reward": -0.16078948974609375,
"objective/scores": 0.3046875,
"policy/approxkl_avg": 0.0025009019300341606,
"policy/clipfrac_avg": 0.0067138671875,
"policy/entropy_avg": 1.7306885719299316,
"step": 69,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9632130265235901,
"val/ratio_var": 8.589087883592583e-06
},
{
"episode": 1120,
"epoch": 1.4,
"eps": 0,
"loss/policy_avg": -0.008942339569330215,
"loss/value_avg": 0.19310247898101807,
"lr": 6.9e-06,
"objective/entropy": 186.04307556152344,
"objective/kl": 8.404040336608887,
"objective/non_score_reward": -0.4202020466327667,
"objective/rlhf_reward": -0.7112176418304443,
"objective/scores": -0.291015625,
"policy/approxkl_avg": 0.0025021119508892298,
"policy/clipfrac_avg": 0.0072021484375,
"policy/entropy_avg": 1.6525442600250244,
"step": 70,
"val/clipfrac_avg": 0.0003662109375,
"val/num_eos_tokens": 0,
"val/ratio": 0.9628069400787354,
"val/ratio_var": 1.0831280405909638e-06
},
{
"episode": 1136,
"epoch": 1.42,
"eps": 0,
"loss/policy_avg": -0.011153987608850002,
"loss/value_avg": 0.1662825345993042,
"lr": 7e-06,
"objective/entropy": 180.92037963867188,
"objective/kl": 10.789932250976562,
"objective/non_score_reward": -0.539496660232544,
"objective/rlhf_reward": -0.670356035232544,
"objective/scores": -0.130859375,
"policy/approxkl_avg": 0.002762062707915902,
"policy/clipfrac_avg": 0.01025390625,
"policy/entropy_avg": 1.6263654232025146,
"step": 71,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9615206122398376,
"val/ratio_var": 3.7071347378514474e-06
},
{
"episode": 1152,
"epoch": 1.44,
"eps": 0,
"loss/policy_avg": -0.00964896660298109,
"loss/value_avg": 0.1657545566558838,
"lr": 7.100000000000001e-06,
"objective/entropy": 199.1170654296875,
"objective/kl": 9.041791915893555,
"objective/non_score_reward": -0.4520896077156067,
"objective/rlhf_reward": -0.7138083577156067,
"objective/scores": -0.26171875,
"policy/approxkl_avg": 0.002562435809522867,
"policy/clipfrac_avg": 0.0069580078125,
"policy/entropy_avg": 1.7946536540985107,
"step": 72,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9602845907211304,
"val/ratio_var": 6.241244591365103e-06
},
{
"episode": 1168,
"epoch": 1.46,
"eps": 0,
"loss/policy_avg": -0.011143850162625313,
"loss/value_avg": 0.15495336055755615,
"lr": 7.2000000000000005e-06,
"objective/entropy": 177.09274291992188,
"objective/kl": 10.396652221679688,
"objective/non_score_reward": -0.5198326110839844,
"objective/rlhf_reward": -0.6262779235839844,
"objective/scores": -0.1064453125,
"policy/approxkl_avg": 0.002627840731292963,
"policy/clipfrac_avg": 0.0086669921875,
"policy/entropy_avg": 1.652453899383545,
"step": 73,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9642380475997925,
"val/ratio_var": 2.231507551186951e-06
},
{
"episode": 1184,
"epoch": 1.48,
"eps": 0,
"loss/policy_avg": -0.011983148753643036,
"loss/value_avg": 0.1448766589164734,
"lr": 7.3e-06,
"objective/entropy": 184.20877075195312,
"objective/kl": 9.668464660644531,
"objective/non_score_reward": -0.48342326283454895,
"objective/rlhf_reward": -0.7168216705322266,
"objective/scores": -0.2333984375,
"policy/approxkl_avg": 0.0025414335541427135,
"policy/clipfrac_avg": 0.0081787109375,
"policy/entropy_avg": 1.7255767583847046,
"step": 74,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9632611274719238,
"val/ratio_var": 8.115292075672187e-06
},
{
"episode": 1200,
"epoch": 1.5,
"eps": 0,
"loss/policy_avg": -0.007804821245372295,
"loss/value_avg": 0.16503594815731049,
"lr": 7.4e-06,
"objective/entropy": 160.60726928710938,
"objective/kl": 10.010700225830078,
"objective/non_score_reward": -0.5005350112915039,
"objective/rlhf_reward": -0.7358865737915039,
"objective/scores": -0.2353515625,
"policy/approxkl_avg": 0.002399697434157133,
"policy/clipfrac_avg": 0.0081787109375,
"policy/entropy_avg": 1.5353442430496216,
"step": 75,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9654887914657593,
"val/ratio_var": 7.977667337399907e-06
},
{
"episode": 1216,
"epoch": 1.52,
"eps": 0,
"loss/policy_avg": -0.012328230775892735,
"loss/value_avg": 0.1142549216747284,
"lr": 7.500000000000001e-06,
"objective/entropy": 200.383544921875,
"objective/kl": 9.883283615112305,
"objective/non_score_reward": -0.49416422843933105,
"objective/rlhf_reward": -0.875023603439331,
"objective/scores": -0.380859375,
"policy/approxkl_avg": 0.0030652470886707306,
"policy/clipfrac_avg": 0.0091552734375,
"policy/entropy_avg": 1.8219434022903442,
"step": 76,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9619243144989014,
"val/ratio_var": 9.272188435716089e-06
},
{
"episode": 1232,
"epoch": 1.54,
"eps": 0,
"loss/policy_avg": -0.009849337860941887,
"loss/value_avg": 0.09608270227909088,
"lr": 7.600000000000001e-06,
"objective/entropy": 194.978759765625,
"objective/kl": 9.415257453918457,
"objective/non_score_reward": -0.47076287865638733,
"objective/rlhf_reward": -0.6123644113540649,
"objective/scores": -0.1416015625,
"policy/approxkl_avg": 0.002732700901106,
"policy/clipfrac_avg": 0.009033203125,
"policy/entropy_avg": 1.7990806102752686,
"step": 77,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.959631085395813,
"val/ratio_var": 1.135609363700496e-05
},
{
"episode": 1248,
"epoch": 1.56,
"eps": 0,
"loss/policy_avg": -0.010330687277019024,
"loss/value_avg": 0.10260234773159027,
"lr": 7.7e-06,
"objective/entropy": 195.35775756835938,
"objective/kl": 11.809048652648926,
"objective/non_score_reward": -0.5904524326324463,
"objective/rlhf_reward": -1.0259993076324463,
"objective/scores": -0.435546875,
"policy/approxkl_avg": 0.0028561949729919434,
"policy/clipfrac_avg": 0.0106201171875,
"policy/entropy_avg": 1.775145173072815,
"step": 78,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9603179097175598,
"val/ratio_var": 7.384338687188574e-07
},
{
"episode": 1264,
"epoch": 1.58,
"eps": 0,
"loss/policy_avg": -0.008724691346287727,
"loss/value_avg": 0.1127011626958847,
"lr": 7.800000000000002e-06,
"objective/entropy": 192.0875701904297,
"objective/kl": 11.521632194519043,
"objective/non_score_reward": -0.5760816335678101,
"objective/rlhf_reward": -0.7323316335678101,
"objective/scores": -0.15625,
"policy/approxkl_avg": 0.0027797226794064045,
"policy/clipfrac_avg": 0.009765625,
"policy/entropy_avg": 1.8669971227645874,
"step": 79,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9582688808441162,
"val/ratio_var": 3.075662561968784e-06
},
{
"episode": 1280,
"epoch": 1.6,
"eps": 0,
"loss/policy_avg": -0.010857291519641876,
"loss/value_avg": 0.09611570090055466,
"lr": 7.9e-06,
"objective/entropy": 205.4402313232422,
"objective/kl": 10.072277069091797,
"objective/non_score_reward": -0.5036138296127319,
"objective/rlhf_reward": -0.8102544546127319,
"objective/scores": -0.306640625,
"policy/approxkl_avg": 0.002693342510610819,
"policy/clipfrac_avg": 0.0098876953125,
"policy/entropy_avg": 1.8436152935028076,
"step": 80,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9618000984191895,
"val/ratio_var": 3.328791990497848e-06
},
{
"episode": 1296,
"epoch": 1.62,
"eps": 0,
"loss/policy_avg": -0.011511455290019512,
"loss/value_avg": 0.15145719051361084,
"lr": 8.000000000000001e-06,
"objective/entropy": 179.18080139160156,
"objective/kl": 9.748101234436035,
"objective/non_score_reward": -0.48740506172180176,
"objective/rlhf_reward": -0.7432644367218018,
"objective/scores": -0.255859375,
"policy/approxkl_avg": 0.0032036681659519672,
"policy/clipfrac_avg": 0.0150146484375,
"policy/entropy_avg": 1.6454665660858154,
"step": 81,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9626948237419128,
"val/ratio_var": 3.5850334825227037e-06
},
{
"episode": 1312,
"epoch": 1.64,
"eps": 0,
"loss/policy_avg": -0.013651885092258453,
"loss/value_avg": 0.11120277643203735,
"lr": 8.1e-06,
"objective/entropy": 210.29153442382812,
"objective/kl": 12.029438018798828,
"objective/non_score_reward": -0.6014719009399414,
"objective/rlhf_reward": -0.6336984634399414,
"objective/scores": -0.0322265625,
"policy/approxkl_avg": 0.00316830538213253,
"policy/clipfrac_avg": 0.01513671875,
"policy/entropy_avg": 1.9450147151947021,
"step": 82,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9584970474243164,
"val/ratio_var": 1.1431648999860045e-05
},
{
"episode": 1328,
"epoch": 1.66,
"eps": 0,
"loss/policy_avg": -0.012597035616636276,
"loss/value_avg": 0.0916157066822052,
"lr": 8.2e-06,
"objective/entropy": 191.00723266601562,
"objective/kl": 12.672134399414062,
"objective/non_score_reward": -0.6336066722869873,
"objective/rlhf_reward": -1.0515754222869873,
"objective/scores": -0.41796875,
"policy/approxkl_avg": 0.002965549472719431,
"policy/clipfrac_avg": 0.0135498046875,
"policy/entropy_avg": 1.7200603485107422,
"step": 83,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9628971815109253,
"val/ratio_var": 8.506719495926518e-06
},
{
"episode": 1344,
"epoch": 1.68,
"eps": 0,
"loss/policy_avg": -0.011588087305426598,
"loss/value_avg": 0.08295813202857971,
"lr": 8.3e-06,
"objective/entropy": 171.20484924316406,
"objective/kl": 11.827956199645996,
"objective/non_score_reward": -0.5913978219032288,
"objective/rlhf_reward": -0.6612220406532288,
"objective/scores": -0.06982421875,
"policy/approxkl_avg": 0.0028498598840087652,
"policy/clipfrac_avg": 0.01220703125,
"policy/entropy_avg": 1.6282272338867188,
"step": 84,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9648880958557129,
"val/ratio_var": 2.407702822893043e-06
},
{
"episode": 1360,
"epoch": 1.7,
"eps": 0,
"loss/policy_avg": -0.011325595900416374,
"loss/value_avg": 0.11857910454273224,
"lr": 8.400000000000001e-06,
"objective/entropy": 210.7764434814453,
"objective/kl": 11.930746078491211,
"objective/non_score_reward": -0.5965373516082764,
"objective/rlhf_reward": -0.7498576641082764,
"objective/scores": -0.1533203125,
"policy/approxkl_avg": 0.0030916037503629923,
"policy/clipfrac_avg": 0.01220703125,
"policy/entropy_avg": 1.8590654134750366,
"step": 85,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9609018564224243,
"val/ratio_var": 2.87379384644737e-06
},
{
"episode": 1376,
"epoch": 1.72,
"eps": 0,
"loss/policy_avg": -0.009613092988729477,
"loss/value_avg": 0.08760805428028107,
"lr": 8.5e-06,
"objective/entropy": 177.6520233154297,
"objective/kl": 11.344982147216797,
"objective/non_score_reward": -0.567249059677124,
"objective/rlhf_reward": -0.786975622177124,
"objective/scores": -0.2197265625,
"policy/approxkl_avg": 0.0025511980056762695,
"policy/clipfrac_avg": 0.010009765625,
"policy/entropy_avg": 1.7009525299072266,
"step": 86,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9633545875549316,
"val/ratio_var": 4.815346528630471e-06
},
{
"episode": 1392,
"epoch": 1.74,
"eps": 0,
"loss/policy_avg": -0.012850948609411716,
"loss/value_avg": 0.0762033462524414,
"lr": 8.6e-06,
"objective/entropy": 177.36361694335938,
"objective/kl": 11.560365676879883,
"objective/non_score_reward": -0.578018307685852,
"objective/rlhf_reward": -0.583633542060852,
"objective/scores": -0.005615234375,
"policy/approxkl_avg": 0.002776096574962139,
"policy/clipfrac_avg": 0.0123291015625,
"policy/entropy_avg": 1.6667418479919434,
"step": 87,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9627068042755127,
"val/ratio_var": 1.2500585398811381e-05
},
{
"episode": 1408,
"epoch": 1.76,
"eps": 0,
"loss/policy_avg": -0.009959004819393158,
"loss/value_avg": 0.0731486976146698,
"lr": 8.700000000000001e-06,
"objective/entropy": 173.85479736328125,
"objective/kl": 11.528783798217773,
"objective/non_score_reward": -0.5764391422271729,
"objective/rlhf_reward": -0.29714226722717285,
"objective/scores": 0.279296875,
"policy/approxkl_avg": 0.002866474213078618,
"policy/clipfrac_avg": 0.011962890625,
"policy/entropy_avg": 1.6390188932418823,
"step": 88,
"val/clipfrac_avg": 0.0028076171875,
"val/num_eos_tokens": 0,
"val/ratio": 0.9629536867141724,
"val/ratio_var": 1.2676916412601713e-05
},
{
"episode": 1424,
"epoch": 1.78,
"eps": 0,
"loss/policy_avg": -0.00972694717347622,
"loss/value_avg": 0.09233620762825012,
"lr": 8.8e-06,
"objective/entropy": 185.50747680664062,
"objective/kl": 9.55545425415039,
"objective/non_score_reward": -0.4777727723121643,
"objective/rlhf_reward": -0.6418352723121643,
"objective/scores": -0.1640625,
"policy/approxkl_avg": 0.0028601475059986115,
"policy/clipfrac_avg": 0.0111083984375,
"policy/entropy_avg": 1.7339318990707397,
"step": 89,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9640640616416931,
"val/ratio_var": 1.0825136087078135e-05
},
{
"episode": 1440,
"epoch": 1.8,
"eps": 0,
"loss/policy_avg": -0.014528412371873856,
"loss/value_avg": 0.06530416011810303,
"lr": 8.900000000000001e-06,
"objective/entropy": 179.24249267578125,
"objective/kl": 11.102625846862793,
"objective/non_score_reward": -0.5551312565803528,
"objective/rlhf_reward": -0.5868695378303528,
"objective/scores": -0.03173828125,
"policy/approxkl_avg": 0.0030379812233150005,
"policy/clipfrac_avg": 0.0152587890625,
"policy/entropy_avg": 1.651671051979065,
"step": 90,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9644126296043396,
"val/ratio_var": 2.1015594029449858e-05
},
{
"episode": 1456,
"epoch": 1.82,
"eps": 0,
"loss/policy_avg": -0.014452735893428326,
"loss/value_avg": 0.08553409576416016,
"lr": 9e-06,
"objective/entropy": 175.57264709472656,
"objective/kl": 11.622018814086914,
"objective/non_score_reward": -0.5811009407043457,
"objective/rlhf_reward": -0.3135228157043457,
"objective/scores": 0.267578125,
"policy/approxkl_avg": 0.0030906922183930874,
"policy/clipfrac_avg": 0.0172119140625,
"policy/entropy_avg": 1.63707435131073,
"step": 91,
"val/clipfrac_avg": 0.00048828125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9620192646980286,
"val/ratio_var": 7.649259714526124e-06
},
{
"episode": 1472,
"epoch": 1.84,
"eps": 0,
"loss/policy_avg": -0.013859651982784271,
"loss/value_avg": 0.07400637865066528,
"lr": 9.100000000000001e-06,
"objective/entropy": 194.52926635742188,
"objective/kl": 9.147865295410156,
"objective/non_score_reward": -0.45739322900772095,
"objective/rlhf_reward": -0.27086979150772095,
"objective/scores": 0.1865234375,
"policy/approxkl_avg": 0.004359320737421513,
"policy/clipfrac_avg": 0.019775390625,
"policy/entropy_avg": 1.7847541570663452,
"step": 92,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9608691930770874,
"val/ratio_var": 6.1120294958527666e-06
},
{
"episode": 1488,
"epoch": 1.86,
"eps": 0,
"loss/policy_avg": -0.013310099020600319,
"loss/value_avg": 0.06840085983276367,
"lr": 9.200000000000002e-06,
"objective/entropy": 200.57981872558594,
"objective/kl": 13.42363452911377,
"objective/non_score_reward": -0.6711816787719727,
"objective/rlhf_reward": -0.8000879287719727,
"objective/scores": -0.12890625,
"policy/approxkl_avg": 0.0038994555361568928,
"policy/clipfrac_avg": 0.017333984375,
"policy/entropy_avg": 1.8697346448898315,
"step": 93,
"val/clipfrac_avg": 0.000244140625,
"val/num_eos_tokens": 0,
"val/ratio": 0.961440920829773,
"val/ratio_var": 1.7970425687963143e-05
},
{
"episode": 1504,
"epoch": 1.88,
"eps": 0,
"loss/policy_avg": -0.010019056499004364,
"loss/value_avg": 0.0840313732624054,
"lr": 9.3e-06,
"objective/entropy": 157.54547119140625,
"objective/kl": 11.56136703491211,
"objective/non_score_reward": -0.5780683755874634,
"objective/rlhf_reward": -0.9257246255874634,
"objective/scores": -0.34765625,
"policy/approxkl_avg": 0.0030462387949228287,
"policy/clipfrac_avg": 0.0146484375,
"policy/entropy_avg": 1.48014235496521,
"step": 94,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9657242298126221,
"val/ratio_var": 2.893652663260582e-06
},
{
"episode": 1520,
"epoch": 1.9,
"eps": 0,
"loss/policy_avg": -0.011758793145418167,
"loss/value_avg": 0.05380406230688095,
"lr": 9.4e-06,
"objective/entropy": 158.7347412109375,
"objective/kl": 12.345122337341309,
"objective/non_score_reward": -0.6172561049461365,
"objective/rlhf_reward": -0.4190139174461365,
"objective/scores": 0.1982421875,
"policy/approxkl_avg": 0.003418966196477413,
"policy/clipfrac_avg": 0.021728515625,
"policy/entropy_avg": 1.4787856340408325,
"step": 95,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.964798092842102,
"val/ratio_var": 7.247604116855655e-06
},
{
"episode": 1536,
"epoch": 1.92,
"eps": 0,
"loss/policy_avg": -0.011863401159644127,
"loss/value_avg": 0.06334728747606277,
"lr": 9.5e-06,
"objective/entropy": 159.30589294433594,
"objective/kl": 14.144343376159668,
"objective/non_score_reward": -0.7072170972824097,
"objective/rlhf_reward": -0.6832913160324097,
"objective/scores": 0.02392578125,
"policy/approxkl_avg": 0.005127988290041685,
"policy/clipfrac_avg": 0.01611328125,
"policy/entropy_avg": 1.5055749416351318,
"step": 96,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9649109840393066,
"val/ratio_var": 7.781473868817557e-06
},
{
"episode": 1552,
"epoch": 1.94,
"eps": 0,
"loss/policy_avg": -0.017321571707725525,
"loss/value_avg": 0.08850554376840591,
"lr": 9.600000000000001e-06,
"objective/entropy": 159.99664306640625,
"objective/kl": 13.361457824707031,
"objective/non_score_reward": -0.6680729389190674,
"objective/rlhf_reward": -0.9053776264190674,
"objective/scores": -0.2373046875,
"policy/approxkl_avg": 0.0042739748023450375,
"policy/clipfrac_avg": 0.02294921875,
"policy/entropy_avg": 1.5079615116119385,
"step": 97,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9678010940551758,
"val/ratio_var": 1.4525655387842562e-05
},
{
"episode": 1568,
"epoch": 1.96,
"eps": 0,
"loss/policy_avg": -0.007993525825440884,
"loss/value_avg": 0.0631803646683693,
"lr": 9.7e-06,
"objective/entropy": 131.73757934570312,
"objective/kl": 13.03054428100586,
"objective/non_score_reward": -0.6515272259712219,
"objective/rlhf_reward": -0.4103162884712219,
"objective/scores": 0.2412109375,
"policy/approxkl_avg": 0.002947921399027109,
"policy/clipfrac_avg": 0.013427734375,
"policy/entropy_avg": 1.2414445877075195,
"step": 98,
"val/clipfrac_avg": 0.0001220703125,
"val/num_eos_tokens": 0,
"val/ratio": 0.9706454277038574,
"val/ratio_var": 3.9927599573275074e-05
},
{
"episode": 1584,
"epoch": 1.98,
"eps": 0,
"loss/policy_avg": -0.010491937398910522,
"loss/value_avg": 0.05786694213747978,
"lr": 9.800000000000001e-06,
"objective/entropy": 161.06646728515625,
"objective/kl": 12.394622802734375,
"objective/non_score_reward": -0.6197311282157898,
"objective/rlhf_reward": -0.7295944094657898,
"objective/scores": -0.10986328125,
"policy/approxkl_avg": 0.003401440568268299,
"policy/clipfrac_avg": 0.0164794921875,
"policy/entropy_avg": 1.5170197486877441,
"step": 99,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9651945233345032,
"val/ratio_var": 9.938579751178622e-06
},
{
"episode": 1600,
"epoch": 2.0,
"eps": 0,
"loss/policy_avg": -0.011365102604031563,
"loss/value_avg": 0.04688062146306038,
"lr": 9.9e-06,
"objective/entropy": 138.40078735351562,
"objective/kl": 13.543630599975586,
"objective/non_score_reward": -0.6771814823150635,
"objective/rlhf_reward": -0.5014002323150635,
"objective/scores": 0.17578125,
"policy/approxkl_avg": 0.0028700516559183598,
"policy/clipfrac_avg": 0.0155029296875,
"policy/entropy_avg": 1.324121117591858,
"step": 100,
"val/clipfrac_avg": 0.0,
"val/num_eos_tokens": 0,
"val/ratio": 0.9678511619567871,
"val/ratio_var": 7.740309229120612e-06
}
],
"logging_steps": 5,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 2.0,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": true,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0,
"train_batch_size": null,
"trial_name": null,
"trial_params": null
}