{ "best_metric": null, "best_model_checkpoint": null, "episode": 20032, "epoch": 3.7803359124363087, "eval_steps": 20, "global_step": 313, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 64, "epoch": 0.012077750518965842, "eps": 0, "loss/policy_avg": -0.0021184529177844524, "loss/value_avg": 0.9311372637748718, "lr": 0.0, "objective/entropy": -600.715087890625, "objective/kl": 0.46257561445236206, "objective/non_score_reward": -0.013877267949283123, "objective/rlhf_reward": 0.384560227394104, "objective/scores": 0.3984375, "policy/approxkl_avg": 0.00044652423821389675, "policy/clipfrac_avg": 0.005666394717991352, "policy/entropy_avg": 0.21374297142028809, "step": 1, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.000056505203247, "val/ratio_var": 4.3476825339894276e-07 }, { "episode": 128, "epoch": 0.024155501037931685, "eps": 0, "loss/policy_avg": -0.0030812141485512257, "loss/value_avg": 0.8693833351135254, "lr": 3.125e-08, "objective/entropy": -595.1883544921875, "objective/kl": 0.6688432097434998, "objective/non_score_reward": -0.020065294578671455, "objective/rlhf_reward": 0.3813018798828125, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0004370739625301212, "policy/clipfrac_avg": 0.006835754029452801, "policy/entropy_avg": 0.21932220458984375, "step": 2, "val/clipfrac_avg": 0.00023904947738628834, "val/num_eos_tokens": 49, "val/ratio": 0.9999904632568359, "val/ratio_var": 5.597846097771253e-07 }, { "episode": 192, "epoch": 0.03623325155689753, "eps": 0, "loss/policy_avg": -0.0009602411882951856, "loss/value_avg": 0.9131457209587097, "lr": 6.25e-08, "objective/entropy": -561.6600341796875, "objective/kl": 0.7238848805427551, "objective/non_score_reward": -0.021716546267271042, "objective/rlhf_reward": 0.39674046635627747, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00045788957504555583, "policy/clipfrac_avg": 0.007044724188745022, "policy/entropy_avg": 0.2180023193359375, "step": 3, "val/clipfrac_avg": 0.00015014366363175213, "val/num_eos_tokens": 50, "val/ratio": 1.0000163316726685, "val/ratio_var": 5.647702892019879e-07 }, { "episode": 256, "epoch": 0.04831100207586337, "eps": 0, "loss/policy_avg": -0.0013294187374413013, "loss/value_avg": 0.9107441902160645, "lr": 9.375e-08, "objective/entropy": -489.9579162597656, "objective/kl": 0.710690438747406, "objective/non_score_reward": -0.021320713683962822, "objective/rlhf_reward": 0.2943531274795532, "objective/scores": 0.31640625, "policy/approxkl_avg": 0.0008129056077450514, "policy/clipfrac_avg": 0.0068802861496806145, "policy/entropy_avg": 0.20705923438072205, "step": 4, "val/clipfrac_avg": 0.00046966708032414317, "val/num_eos_tokens": 36, "val/ratio": 0.999975860118866, "val/ratio_var": 7.655679041818075e-07 }, { "episode": 320, "epoch": 0.06038875259482921, "eps": 0, "loss/policy_avg": -0.0032467995770275593, "loss/value_avg": 0.8995952606201172, "lr": 1.25e-07, "objective/entropy": -685.2054443359375, "objective/kl": 0.3006611764431, "objective/non_score_reward": -0.00901983492076397, "objective/rlhf_reward": 0.4690075218677521, "objective/scores": 0.478515625, "policy/approxkl_avg": 0.0003806678578257561, "policy/clipfrac_avg": 0.006184540688991547, "policy/entropy_avg": 0.2066497802734375, "step": 5, "val/clipfrac_avg": 0.0001254370145034045, "val/num_eos_tokens": 42, "val/ratio": 1.0000379085540771, "val/ratio_var": 4.958400268151308e-07 }, { "episode": 384, "epoch": 0.07246650311379506, "eps": 0, "loss/policy_avg": -0.00029869808349758387, "loss/value_avg": 0.9305676221847534, "lr": 1.5624999999999999e-07, "objective/entropy": -588.39697265625, "objective/kl": 0.5641751885414124, "objective/non_score_reward": -0.016925256699323654, "objective/rlhf_reward": 0.39762550592422485, "objective/scores": 0.4140625, "policy/approxkl_avg": 0.00041988492012023926, "policy/clipfrac_avg": 0.006766438018530607, "policy/entropy_avg": 0.20317253470420837, "step": 6, "val/clipfrac_avg": 0.00019526462710928172, "val/num_eos_tokens": 45, "val/ratio": 0.9999103546142578, "val/ratio_var": 6.99273925874877e-07 }, { "episode": 448, "epoch": 0.0845442536327609, "eps": 0, "loss/policy_avg": -0.0019068828551098704, "loss/value_avg": 0.8919577598571777, "lr": 1.875e-07, "objective/entropy": -614.7843017578125, "objective/kl": 0.33637887239456177, "objective/non_score_reward": -0.010091365315020084, "objective/rlhf_reward": 0.3663734793663025, "objective/scores": 0.376953125, "policy/approxkl_avg": 0.0004250165948178619, "policy/clipfrac_avg": 0.0070974379777908325, "policy/entropy_avg": 0.2131398618221283, "step": 7, "val/clipfrac_avg": 0.00019152543973177671, "val/num_eos_tokens": 43, "val/ratio": 1.000044822692871, "val/ratio_var": 5.280454047351668e-07 }, { "episode": 512, "epoch": 0.09662200415172674, "eps": 0, "loss/policy_avg": -0.003216695738956332, "loss/value_avg": 0.8838874101638794, "lr": 2.1875e-07, "objective/entropy": -576.663330078125, "objective/kl": 0.7862333059310913, "objective/non_score_reward": -0.023586997762322426, "objective/rlhf_reward": 0.3934051990509033, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.000438332324847579, "policy/clipfrac_avg": 0.006604321300983429, "policy/entropy_avg": 0.20785841345787048, "step": 8, "val/clipfrac_avg": 0.0003107336815446615, "val/num_eos_tokens": 36, "val/ratio": 0.9998782873153687, "val/ratio_var": 6.208914555827505e-07 }, { "episode": 576, "epoch": 0.10869975467069258, "eps": 0, "loss/policy_avg": -0.002265141811221838, "loss/value_avg": 0.869255542755127, "lr": 2.5e-07, "objective/entropy": -627.076171875, "objective/kl": 0.32534003257751465, "objective/non_score_reward": -0.009760200046002865, "objective/rlhf_reward": 0.39697808027267456, "objective/scores": 0.40625, "policy/approxkl_avg": 0.00039745302638038993, "policy/clipfrac_avg": 0.006331109441816807, "policy/entropy_avg": 0.19233450293540955, "step": 9, "val/clipfrac_avg": 0.00015899499703664333, "val/num_eos_tokens": 41, "val/ratio": 1.0001146793365479, "val/ratio_var": 7.010530111983826e-07 }, { "episode": 640, "epoch": 0.12077750518965842, "eps": 0, "loss/policy_avg": -0.0026860858779400587, "loss/value_avg": 0.8342069387435913, "lr": 2.8125e-07, "objective/entropy": -670.0674438476562, "objective/kl": 0.7622801661491394, "objective/non_score_reward": -0.022868404164910316, "objective/rlhf_reward": 0.38533473014831543, "objective/scores": 0.408203125, "policy/approxkl_avg": 0.0003712985198944807, "policy/clipfrac_avg": 0.006433566566556692, "policy/entropy_avg": 0.20289739966392517, "step": 10, "val/clipfrac_avg": 0.027581503614783287, "val/num_eos_tokens": 45, "val/ratio": 0.999953031539917, "val/ratio_var": 6.136452270766313e-07 }, { "episode": 704, "epoch": 0.13285525570862428, "eps": 0, "loss/policy_avg": -0.008414413779973984, "loss/value_avg": 0.7493016123771667, "lr": 3.1249999999999997e-07, "objective/entropy": -643.5463256835938, "objective/kl": 0.8814950585365295, "objective/non_score_reward": -0.026444854214787483, "objective/rlhf_reward": 0.42033249139785767, "objective/scores": 0.447265625, "policy/approxkl_avg": 0.0004149469896219671, "policy/clipfrac_avg": 0.0073681240901350975, "policy/entropy_avg": 0.21221670508384705, "step": 11, "val/clipfrac_avg": 0.0004212568746879697, "val/num_eos_tokens": 36, "val/ratio": 0.9999436736106873, "val/ratio_var": 6.905478926455544e-07 }, { "episode": 768, "epoch": 0.14493300622759012, "eps": 0, "loss/policy_avg": -0.008181717246770859, "loss/value_avg": 0.7314225435256958, "lr": 3.4375e-07, "objective/entropy": -599.110595703125, "objective/kl": 0.7627489566802979, "objective/non_score_reward": -0.02288246899843216, "objective/rlhf_reward": 0.44879722595214844, "objective/scores": 0.47265625, "policy/approxkl_avg": 0.00046156253665685654, "policy/clipfrac_avg": 0.0068002426996827126, "policy/entropy_avg": 0.2235361784696579, "step": 12, "val/clipfrac_avg": 0.00017142172146122903, "val/num_eos_tokens": 41, "val/ratio": 0.9999423027038574, "val/ratio_var": 8.461731226816482e-07 }, { "episode": 832, "epoch": 0.15701075674655596, "eps": 0, "loss/policy_avg": -0.006054941564798355, "loss/value_avg": 0.7129493951797485, "lr": 3.75e-07, "objective/entropy": -625.335693359375, "objective/kl": 0.9311845898628235, "objective/non_score_reward": -0.027935536578297615, "objective/rlhf_reward": 0.3563418388366699, "objective/scores": 0.384765625, "policy/approxkl_avg": 0.0003990530385635793, "policy/clipfrac_avg": 0.006634948309510946, "policy/entropy_avg": 0.20673498511314392, "step": 13, "val/clipfrac_avg": 0.00031250983010977507, "val/num_eos_tokens": 46, "val/ratio": 0.9999412298202515, "val/ratio_var": 6.676891075585445e-07 }, { "episode": 896, "epoch": 0.1690885072655218, "eps": 0, "loss/policy_avg": -0.007185523398220539, "loss/value_avg": 0.663692831993103, "lr": 4.0625e-07, "objective/entropy": -611.6224365234375, "objective/kl": 0.598267674446106, "objective/non_score_reward": -0.017948029562830925, "objective/rlhf_reward": 0.3819543123245239, "objective/scores": 0.400390625, "policy/approxkl_avg": 0.0004264025192242116, "policy/clipfrac_avg": 0.0068409196101129055, "policy/entropy_avg": 0.20851516723632812, "step": 14, "val/clipfrac_avg": 0.17968440055847168, "val/num_eos_tokens": 45, "val/ratio": 0.9998716115951538, "val/ratio_var": 8.126443162836949e-07 }, { "episode": 960, "epoch": 0.18116625778448764, "eps": 0, "loss/policy_avg": -0.01342801284044981, "loss/value_avg": 0.5221339464187622, "lr": 4.375e-07, "objective/entropy": -577.73486328125, "objective/kl": 1.043047547340393, "objective/non_score_reward": -0.031291425228118896, "objective/rlhf_reward": 0.3266187310218811, "objective/scores": 0.357421875, "policy/approxkl_avg": 0.00048208641237579286, "policy/clipfrac_avg": 0.00789344497025013, "policy/entropy_avg": 0.21881103515625, "step": 15, "val/clipfrac_avg": 0.015999414026737213, "val/num_eos_tokens": 44, "val/ratio": 1.0000479221343994, "val/ratio_var": 7.270523383340333e-07 }, { "episode": 1024, "epoch": 0.19324400830345348, "eps": 0, "loss/policy_avg": -0.014559760689735413, "loss/value_avg": 0.4795520305633545, "lr": 4.6874999999999996e-07, "objective/entropy": -704.927978515625, "objective/kl": 1.3000061511993408, "objective/non_score_reward": -0.03900018334388733, "objective/rlhf_reward": 0.39117559790611267, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.00035720731830224395, "policy/clipfrac_avg": 0.006962340325117111, "policy/entropy_avg": 0.19104096293449402, "step": 16, "val/clipfrac_avg": 0.0004542362876236439, "val/num_eos_tokens": 46, "val/ratio": 0.9999203681945801, "val/ratio_var": 6.327572918962687e-07 }, { "episode": 1088, "epoch": 0.20532175882241932, "eps": 0, "loss/policy_avg": -0.017483970150351524, "loss/value_avg": 0.4525485634803772, "lr": 5e-07, "objective/entropy": -632.856689453125, "objective/kl": 1.6142749786376953, "objective/non_score_reward": -0.0484282523393631, "objective/rlhf_reward": 0.3153412640094757, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0004206376615911722, "policy/clipfrac_avg": 0.007758093532174826, "policy/entropy_avg": 0.20397186279296875, "step": 17, "val/clipfrac_avg": 0.001352960942313075, "val/num_eos_tokens": 55, "val/ratio": 1.0000765323638916, "val/ratio_var": 6.663805720563687e-07 }, { "episode": 1152, "epoch": 0.21739950934138516, "eps": 0, "loss/policy_avg": -0.01164332777261734, "loss/value_avg": 0.414880633354187, "lr": 4.983164983164983e-07, "objective/entropy": -660.91943359375, "objective/kl": 2.686311721801758, "objective/non_score_reward": -0.08058934658765793, "objective/rlhf_reward": 0.3207778334617615, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0004323392640799284, "policy/clipfrac_avg": 0.008179331198334694, "policy/entropy_avg": 0.19593684375286102, "step": 18, "val/clipfrac_avg": 0.0011506883893162012, "val/num_eos_tokens": 51, "val/ratio": 0.9998383522033691, "val/ratio_var": 7.198556772891607e-07 }, { "episode": 1216, "epoch": 0.229477259860351, "eps": 0, "loss/policy_avg": -0.011872556060552597, "loss/value_avg": 0.38459596037864685, "lr": 4.966329966329966e-07, "objective/entropy": -632.638916015625, "objective/kl": 3.21876859664917, "objective/non_score_reward": -0.09656305611133575, "objective/rlhf_reward": 0.23790958523750305, "objective/scores": 0.333984375, "policy/approxkl_avg": 0.0004764531913679093, "policy/clipfrac_avg": 0.008287805132567883, "policy/entropy_avg": 0.2162272185087204, "step": 19, "val/clipfrac_avg": 0.00905265286564827, "val/num_eos_tokens": 37, "val/ratio": 1.0000991821289062, "val/ratio_var": 7.565479904769745e-07 }, { "episode": 1280, "epoch": 0.24155501037931684, "eps": 0, "loss/policy_avg": -0.014671847224235535, "loss/value_avg": 0.3181418478488922, "lr": 4.949494949494949e-07, "objective/entropy": -706.05078125, "objective/kl": 3.669142007827759, "objective/non_score_reward": -0.11007425934076309, "objective/rlhf_reward": 0.3664882481098175, "objective/scores": 0.4765625, "policy/approxkl_avg": 0.0003931926330551505, "policy/clipfrac_avg": 0.008725658059120178, "policy/entropy_avg": 0.17680613696575165, "step": 20, "val/clipfrac_avg": 0.03898124024271965, "val/num_eos_tokens": 54, "val/ratio": 1.0001307725906372, "val/ratio_var": 6.557406777574215e-07 }, { "episode": 1344, "epoch": 0.2536327608982827, "eps": 0, "loss/policy_avg": -0.014831740409135818, "loss/value_avg": 0.2612203061580658, "lr": 4.932659932659932e-07, "objective/entropy": -704.768310546875, "objective/kl": 3.722026824951172, "objective/non_score_reward": -0.11166080832481384, "objective/rlhf_reward": 0.33413997292518616, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.00046230730367824435, "policy/clipfrac_avg": 0.008072879165410995, "policy/entropy_avg": 0.18310165405273438, "step": 21, "val/clipfrac_avg": 0.006685478147119284, "val/num_eos_tokens": 47, "val/ratio": 0.9999584555625916, "val/ratio_var": 6.261999487833236e-07 }, { "episode": 1408, "epoch": 0.26571051141724855, "eps": 0, "loss/policy_avg": -0.015546409413218498, "loss/value_avg": 0.22234514355659485, "lr": 4.915824915824915e-07, "objective/entropy": -686.92041015625, "objective/kl": 5.413008689880371, "objective/non_score_reward": -0.16239026188850403, "objective/rlhf_reward": 0.30050036311149597, "objective/scores": 0.462890625, "policy/approxkl_avg": 0.00041679860441945493, "policy/clipfrac_avg": 0.00833301804959774, "policy/entropy_avg": 0.1929423063993454, "step": 22, "val/clipfrac_avg": 0.009531511925160885, "val/num_eos_tokens": 48, "val/ratio": 0.999933660030365, "val/ratio_var": 7.41119151825842e-07 }, { "episode": 1472, "epoch": 0.27778826193621436, "eps": 0, "loss/policy_avg": -0.013549113646149635, "loss/value_avg": 0.20537236332893372, "lr": 4.898989898989898e-07, "objective/entropy": -710.58544921875, "objective/kl": 5.630204200744629, "objective/non_score_reward": -0.16890612244606018, "objective/rlhf_reward": 0.2529688775539398, "objective/scores": 0.421875, "policy/approxkl_avg": 0.00040828564669936895, "policy/clipfrac_avg": 0.009321928024291992, "policy/entropy_avg": 0.18656031787395477, "step": 23, "val/clipfrac_avg": 0.0010966494446620345, "val/num_eos_tokens": 45, "val/ratio": 1.0000791549682617, "val/ratio_var": 8.043146522140887e-07 }, { "episode": 1536, "epoch": 0.28986601245518023, "eps": 0, "loss/policy_avg": -0.014127358794212341, "loss/value_avg": 0.184538334608078, "lr": 4.882154882154882e-07, "objective/entropy": -721.2072143554688, "objective/kl": 7.244493007659912, "objective/non_score_reward": -0.2173347771167755, "objective/rlhf_reward": 0.2309074103832245, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.00042275150190107524, "policy/clipfrac_avg": 0.009146442636847496, "policy/entropy_avg": 0.19444656372070312, "step": 24, "val/clipfrac_avg": 0.00022602800163440406, "val/num_eos_tokens": 48, "val/ratio": 0.9999423027038574, "val/ratio_var": 6.538029424518754e-07 }, { "episode": 1600, "epoch": 0.30194376297414605, "eps": 0, "loss/policy_avg": -0.013546439819037914, "loss/value_avg": 0.15871131420135498, "lr": 4.865319865319866e-07, "objective/entropy": -625.06640625, "objective/kl": 6.906096935272217, "objective/non_score_reward": -0.20718291401863098, "objective/rlhf_reward": 0.14267060160636902, "objective/scores": 0.349609375, "policy/approxkl_avg": 0.0004575018538162112, "policy/clipfrac_avg": 0.009727372787892818, "policy/entropy_avg": 0.2004598081111908, "step": 25, "val/clipfrac_avg": 9.441343718208373e-05, "val/num_eos_tokens": 33, "val/ratio": 1.0002121925354004, "val/ratio_var": 6.285458766797092e-07 }, { "episode": 1664, "epoch": 0.3140215134931119, "eps": 0, "loss/policy_avg": -0.011472932994365692, "loss/value_avg": 0.15153326094150543, "lr": 4.848484848484849e-07, "objective/entropy": -657.1741333007812, "objective/kl": 8.73617172241211, "objective/non_score_reward": -0.26208510994911194, "objective/rlhf_reward": 0.11779768764972687, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00045209572999738157, "policy/clipfrac_avg": 0.0087648406624794, "policy/entropy_avg": 0.20401255786418915, "step": 26, "val/clipfrac_avg": 0.00012975589197594672, "val/num_eos_tokens": 49, "val/ratio": 1.0000041723251343, "val/ratio_var": 9.090064736483328e-07 }, { "episode": 1728, "epoch": 0.3260992640120777, "eps": 0, "loss/policy_avg": -0.018957365304231644, "loss/value_avg": 0.1270497739315033, "lr": 4.831649831649832e-07, "objective/entropy": -725.923828125, "objective/kl": 9.15277099609375, "objective/non_score_reward": -0.2745831608772278, "objective/rlhf_reward": 0.1878191977739334, "objective/scores": 0.462890625, "policy/approxkl_avg": 0.00039935283712111413, "policy/clipfrac_avg": 0.009701108559966087, "policy/entropy_avg": 0.18258032202720642, "step": 27, "val/clipfrac_avg": 0.0003692947211675346, "val/num_eos_tokens": 50, "val/ratio": 0.9999105334281921, "val/ratio_var": 6.027379413353628e-07 }, { "episode": 1792, "epoch": 0.3381770145310436, "eps": 0, "loss/policy_avg": -0.015594224445521832, "loss/value_avg": 0.11441653966903687, "lr": 4.814814814814814e-07, "objective/entropy": -735.0335693359375, "objective/kl": 9.175653457641602, "objective/non_score_reward": -0.27526962757110596, "objective/rlhf_reward": 0.13391008973121643, "objective/scores": 0.41015625, "policy/approxkl_avg": 0.0004158214433118701, "policy/clipfrac_avg": 0.008673434145748615, "policy/entropy_avg": 0.17068862915039062, "step": 28, "val/clipfrac_avg": 0.0001518530771136284, "val/num_eos_tokens": 52, "val/ratio": 0.9999884366989136, "val/ratio_var": 8.806293294583156e-07 }, { "episode": 1856, "epoch": 0.3502547650500094, "eps": 0, "loss/policy_avg": -0.011532934382557869, "loss/value_avg": 0.09116180986166, "lr": 4.797979797979798e-07, "objective/entropy": -773.42919921875, "objective/kl": 10.120838165283203, "objective/non_score_reward": -0.3036251366138458, "objective/rlhf_reward": 0.17586705088615417, "objective/scores": 0.48046875, "policy/approxkl_avg": 0.00036347960121929646, "policy/clipfrac_avg": 0.009156275540590286, "policy/entropy_avg": 0.16425704956054688, "step": 29, "val/clipfrac_avg": 3.780891711357981e-05, "val/num_eos_tokens": 50, "val/ratio": 1.0000102519989014, "val/ratio_var": 5.752245328949357e-07 }, { "episode": 1920, "epoch": 0.3623325155689753, "eps": 0, "loss/policy_avg": -0.01278415322303772, "loss/value_avg": 0.08662945032119751, "lr": 4.781144781144781e-07, "objective/entropy": -718.801025390625, "objective/kl": 10.949674606323242, "objective/non_score_reward": -0.3284902274608612, "objective/rlhf_reward": 0.0650644600391388, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.0004170535539742559, "policy/clipfrac_avg": 0.009168609045445919, "policy/entropy_avg": 0.17079035937786102, "step": 30, "val/clipfrac_avg": 0.00021001597633585334, "val/num_eos_tokens": 53, "val/ratio": 0.9999901652336121, "val/ratio_var": 7.901667800069845e-07 }, { "episode": 1984, "epoch": 0.37441026608794115, "eps": 0, "loss/policy_avg": -0.013262813910841942, "loss/value_avg": 0.07429289817810059, "lr": 4.7643097643097643e-07, "objective/entropy": -708.4739379882812, "objective/kl": 11.724746704101562, "objective/non_score_reward": -0.35174238681793213, "objective/rlhf_reward": -0.014340057969093323, "objective/scores": 0.337890625, "policy/approxkl_avg": 0.0004177941009402275, "policy/clipfrac_avg": 0.009081902913749218, "policy/entropy_avg": 0.18209967017173767, "step": 31, "val/clipfrac_avg": 0.0003269795561209321, "val/num_eos_tokens": 49, "val/ratio": 0.999956488609314, "val/ratio_var": 6.545072324115608e-07 }, { "episode": 2048, "epoch": 0.38648801660690696, "eps": 0, "loss/policy_avg": -0.018486851826310158, "loss/value_avg": 0.06496821343898773, "lr": 4.7474747474747474e-07, "objective/entropy": -730.5189819335938, "objective/kl": 13.40658187866211, "objective/non_score_reward": -0.4021974205970764, "objective/rlhf_reward": -0.03842787444591522, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.00040673528565093875, "policy/clipfrac_avg": 0.008559602312743664, "policy/entropy_avg": 0.16196060180664062, "step": 32, "val/clipfrac_avg": 8.302954665850848e-05, "val/num_eos_tokens": 51, "val/ratio": 0.9999591708183289, "val/ratio_var": 6.225269544302137e-07 }, { "episode": 2112, "epoch": 0.3985657671258728, "eps": 0, "loss/policy_avg": -0.022785823792219162, "loss/value_avg": 0.054975174367427826, "lr": 4.7306397306397305e-07, "objective/entropy": -750.11865234375, "objective/kl": 12.143804550170898, "objective/non_score_reward": -0.36431413888931274, "objective/rlhf_reward": 0.09711165726184845, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00039204792119562626, "policy/clipfrac_avg": 0.009267458692193031, "policy/entropy_avg": 0.15810012817382812, "step": 33, "val/clipfrac_avg": 6.448548811022192e-05, "val/num_eos_tokens": 51, "val/ratio": 1.0001405477523804, "val/ratio_var": 6.74541126954864e-07 }, { "episode": 2176, "epoch": 0.41064351764483864, "eps": 0, "loss/policy_avg": -0.021774116903543472, "loss/value_avg": 0.048116378486156464, "lr": 4.7138047138047136e-07, "objective/entropy": -726.4152221679688, "objective/kl": 12.474294662475586, "objective/non_score_reward": -0.3742288053035736, "objective/rlhf_reward": 0.02713838219642639, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0006013754173181951, "policy/clipfrac_avg": 0.009044626727700233, "policy/entropy_avg": 0.17004776000976562, "step": 34, "val/clipfrac_avg": 1.9868224626407027e-05, "val/num_eos_tokens": 57, "val/ratio": 1.0000003576278687, "val/ratio_var": 5.519239607565396e-07 }, { "episode": 2240, "epoch": 0.4227212681638045, "eps": 0, "loss/policy_avg": -0.011024661362171173, "loss/value_avg": 0.04245440661907196, "lr": 4.696969696969697e-07, "objective/entropy": -765.597900390625, "objective/kl": 12.814224243164062, "objective/non_score_reward": -0.38442671298980713, "objective/rlhf_reward": -0.020657174289226532, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.00036188805825076997, "policy/clipfrac_avg": 0.009150207042694092, "policy/entropy_avg": 0.14880117774009705, "step": 35, "val/clipfrac_avg": 6.972333721932955e-06, "val/num_eos_tokens": 54, "val/ratio": 0.9999343752861023, "val/ratio_var": 4.806460651707312e-07 }, { "episode": 2304, "epoch": 0.4347990186827703, "eps": 0, "loss/policy_avg": -0.018790725618600845, "loss/value_avg": 0.038986437022686005, "lr": 4.68013468013468e-07, "objective/entropy": -748.5819091796875, "objective/kl": 13.179786682128906, "objective/non_score_reward": -0.39539361000061035, "objective/rlhf_reward": 0.02013372629880905, "objective/scores": 0.416015625, "policy/approxkl_avg": 0.00038032219163142145, "policy/clipfrac_avg": 0.009533729404211044, "policy/entropy_avg": 0.15250270068645477, "step": 36, "val/clipfrac_avg": 2.8229449526406825e-05, "val/num_eos_tokens": 55, "val/ratio": 1.0000063180923462, "val/ratio_var": 7.776847610330151e-07 }, { "episode": 2368, "epoch": 0.4468767692017362, "eps": 0, "loss/policy_avg": -0.018214058130979538, "loss/value_avg": 0.034576669335365295, "lr": 4.663299663299663e-07, "objective/entropy": -702.28369140625, "objective/kl": 15.299257278442383, "objective/non_score_reward": -0.45897769927978516, "objective/rlhf_reward": -0.15282535552978516, "objective/scores": 0.306640625, "policy/approxkl_avg": 0.00047967006685212255, "policy/clipfrac_avg": 0.009605104103684425, "policy/entropy_avg": 0.16119003295898438, "step": 37, "val/clipfrac_avg": 1.4692055628984235e-05, "val/num_eos_tokens": 44, "val/ratio": 1.0000842809677124, "val/ratio_var": 6.511489800686832e-07 }, { "episode": 2432, "epoch": 0.458954519720702, "eps": 0, "loss/policy_avg": -0.022558456286787987, "loss/value_avg": 0.032565370202064514, "lr": 4.646464646464646e-07, "objective/entropy": -743.125732421875, "objective/kl": 15.134292602539062, "objective/non_score_reward": -0.45402878522872925, "objective/rlhf_reward": -0.12102095782756805, "objective/scores": 0.33203125, "policy/approxkl_avg": 0.0005570814246311784, "policy/clipfrac_avg": 0.009378625079989433, "policy/entropy_avg": 0.15362422168254852, "step": 38, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 1.0000710487365723, "val/ratio_var": 7.270390369740198e-07 }, { "episode": 2496, "epoch": 0.47103227023966787, "eps": 0, "loss/policy_avg": -0.029226083308458328, "loss/value_avg": 0.029515882954001427, "lr": 4.6296296296296297e-07, "objective/entropy": -698.0175170898438, "objective/kl": 15.121957778930664, "objective/non_score_reward": -0.4536587595939636, "objective/rlhf_reward": -0.06596343964338303, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.00044198459363542497, "policy/clipfrac_avg": 0.008979331701993942, "policy/entropy_avg": 0.172088623046875, "step": 39, "val/clipfrac_avg": 1.966176751011517e-05, "val/num_eos_tokens": 48, "val/ratio": 1.0001137256622314, "val/ratio_var": 9.834893717197701e-07 }, { "episode": 2560, "epoch": 0.4831100207586337, "eps": 0, "loss/policy_avg": -0.025167806074023247, "loss/value_avg": 0.027836887165904045, "lr": 4.612794612794613e-07, "objective/entropy": -725.21875, "objective/kl": 14.953241348266602, "objective/non_score_reward": -0.4485971927642822, "objective/rlhf_reward": -0.04869486391544342, "objective/scores": 0.400390625, "policy/approxkl_avg": 0.0005358229391276836, "policy/clipfrac_avg": 0.008523606695234776, "policy/entropy_avg": 0.15728633105754852, "step": 40, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.00004243850708, "val/ratio_var": 5.402997089731798e-07 }, { "episode": 2624, "epoch": 0.49518777127759955, "eps": 0, "loss/policy_avg": -0.030176600441336632, "loss/value_avg": 0.026436101645231247, "lr": 4.595959595959596e-07, "objective/entropy": -781.7913818359375, "objective/kl": 14.04931926727295, "objective/non_score_reward": -0.42147958278656006, "objective/rlhf_reward": 0.09707509726285934, "objective/scores": 0.51953125, "policy/approxkl_avg": 0.0003690444864332676, "policy/clipfrac_avg": 0.008865940384566784, "policy/entropy_avg": 0.14566168189048767, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 62, "val/ratio": 1.0000038146972656, "val/ratio_var": 8.146014920384914e-07 }, { "episode": 2688, "epoch": 0.5072655217965654, "eps": 0, "loss/policy_avg": -0.02603175863623619, "loss/value_avg": 0.026311784982681274, "lr": 4.579124579124579e-07, "objective/entropy": -753.4428100585938, "objective/kl": 15.049712181091309, "objective/non_score_reward": -0.4514913558959961, "objective/rlhf_reward": -0.0354757234454155, "objective/scores": 0.416015625, "policy/approxkl_avg": 0.0003813736548181623, "policy/clipfrac_avg": 0.009573683142662048, "policy/entropy_avg": 0.17486445605754852, "step": 42, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 57, "val/ratio": 1.0002083778381348, "val/ratio_var": 6.971042694203788e-07 }, { "episode": 2752, "epoch": 0.5193432723155312, "eps": 0, "loss/policy_avg": -0.027423618361353874, "loss/value_avg": 0.024181833490729332, "lr": 4.562289562289562e-07, "objective/entropy": -720.7274780273438, "objective/kl": 17.52047348022461, "objective/non_score_reward": -0.5256141424179077, "objective/rlhf_reward": -0.1222938597202301, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.00042545428732410073, "policy/clipfrac_avg": 0.00958210788667202, "policy/entropy_avg": 0.1733601987361908, "step": 43, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 54, "val/ratio": 0.9999567866325378, "val/ratio_var": 7.062473628138832e-07 }, { "episode": 2816, "epoch": 0.5314210228344971, "eps": 0, "loss/policy_avg": -0.033347710967063904, "loss/value_avg": 0.023924967274069786, "lr": 4.545454545454545e-07, "objective/entropy": -753.5906982421875, "objective/kl": 16.592561721801758, "objective/non_score_reward": -0.4977768063545227, "objective/rlhf_reward": -0.04953461140394211, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.000394497939851135, "policy/clipfrac_avg": 0.009626075625419617, "policy/entropy_avg": 0.14611563086509705, "step": 44, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000271797180176, "val/ratio_var": 8.373976356779167e-07 }, { "episode": 2880, "epoch": 0.543498773353463, "eps": 0, "loss/policy_avg": -0.03707805275917053, "loss/value_avg": 0.02223985455930233, "lr": 4.5286195286195283e-07, "objective/entropy": -725.1084594726562, "objective/kl": 15.50640869140625, "objective/non_score_reward": -0.4651922583580017, "objective/rlhf_reward": -0.021344579756259918, "objective/scores": 0.443359375, "policy/approxkl_avg": 0.0004309536307118833, "policy/clipfrac_avg": 0.009066203609108925, "policy/entropy_avg": 0.16333135962486267, "step": 45, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.00018310546875, "val/ratio_var": 1.699194172033458e-06 }, { "episode": 2944, "epoch": 0.5555765238724287, "eps": 0, "loss/policy_avg": -0.03546188026666641, "loss/value_avg": 0.021326132118701935, "lr": 4.5117845117845114e-07, "objective/entropy": -742.35107421875, "objective/kl": 15.387621879577637, "objective/non_score_reward": -0.46162867546081543, "objective/rlhf_reward": -0.03047630935907364, "objective/scores": 0.431640625, "policy/approxkl_avg": 0.0004147663130424917, "policy/clipfrac_avg": 0.009872214868664742, "policy/entropy_avg": 0.15110652148723602, "step": 46, "val/clipfrac_avg": 9.790101103135385e-06, "val/num_eos_tokens": 43, "val/ratio": 1.000044822692871, "val/ratio_var": 8.873777233020519e-07 }, { "episode": 3008, "epoch": 0.5676542743913946, "eps": 0, "loss/policy_avg": -0.022155379876494408, "loss/value_avg": 0.021301649510860443, "lr": 4.494949494949495e-07, "objective/entropy": -754.4105834960938, "objective/kl": 14.502567291259766, "objective/non_score_reward": -0.4350770115852356, "objective/rlhf_reward": -0.0415223091840744, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.0004166339640505612, "policy/clipfrac_avg": 0.008984292857348919, "policy/entropy_avg": 0.14285914599895477, "step": 47, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.0002052783966064, "val/ratio_var": 1.7462091363995569e-06 }, { "episode": 3072, "epoch": 0.5797320249103605, "eps": 0, "loss/policy_avg": -0.027517154812812805, "loss/value_avg": 0.020472221076488495, "lr": 4.478114478114478e-07, "objective/entropy": -718.2733154296875, "objective/kl": 16.20379638671875, "objective/non_score_reward": -0.48611387610435486, "objective/rlhf_reward": -0.15676817297935486, "objective/scores": 0.330078125, "policy/approxkl_avg": 0.00043310271576046944, "policy/clipfrac_avg": 0.009795701131224632, "policy/entropy_avg": 0.16522979736328125, "step": 48, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 1.000115156173706, "val/ratio_var": 8.753415272622078e-07 }, { "episode": 3136, "epoch": 0.5918097754293263, "eps": 0, "loss/policy_avg": -0.037540458142757416, "loss/value_avg": 0.01891172304749489, "lr": 4.461279461279461e-07, "objective/entropy": -738.3416748046875, "objective/kl": 15.657567977905273, "objective/non_score_reward": -0.4697270393371582, "objective/rlhf_reward": 0.01708938181400299, "objective/scores": 0.486328125, "policy/approxkl_avg": 0.00037665231502614915, "policy/clipfrac_avg": 0.008994007483124733, "policy/entropy_avg": 0.14120499789714813, "step": 49, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 56, "val/ratio": 1.000009536743164, "val/ratio_var": 6.900321523062303e-07 }, { "episode": 3200, "epoch": 0.6038875259482921, "eps": 0, "loss/policy_avg": -0.03546880930662155, "loss/value_avg": 0.01827932894229889, "lr": 4.444444444444444e-07, "objective/entropy": -700.5809936523438, "objective/kl": 16.617774963378906, "objective/non_score_reward": -0.4985332190990448, "objective/rlhf_reward": -0.1123027503490448, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.00040262818220071495, "policy/clipfrac_avg": 0.009067821316421032, "policy/entropy_avg": 0.13315296173095703, "step": 50, "val/clipfrac_avg": 6.10590086580487e-06, "val/num_eos_tokens": 53, "val/ratio": 1.0000566244125366, "val/ratio_var": 8.310821044688055e-07 }, { "episode": 3264, "epoch": 0.615965276467258, "eps": 0, "loss/policy_avg": -0.022362984716892242, "loss/value_avg": 0.01790526881814003, "lr": 4.4276094276094275e-07, "objective/entropy": -797.2921752929688, "objective/kl": 14.45788860321045, "objective/non_score_reward": -0.4337366223335266, "objective/rlhf_reward": 0.025736041367053986, "objective/scores": 0.458984375, "policy/approxkl_avg": 0.00034859025618061423, "policy/clipfrac_avg": 0.008412575349211693, "policy/entropy_avg": 0.12615332007408142, "step": 51, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 58, "val/ratio": 1.000011682510376, "val/ratio_var": 5.029750127505395e-07 }, { "episode": 3328, "epoch": 0.6280430269862238, "eps": 0, "loss/policy_avg": -0.03401505947113037, "loss/value_avg": 0.018212419003248215, "lr": 4.4107744107744106e-07, "objective/entropy": -690.7019653320312, "objective/kl": 15.584673881530762, "objective/non_score_reward": -0.4675402045249939, "objective/rlhf_reward": -0.03858514130115509, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.0003729221352841705, "policy/clipfrac_avg": 0.00865244958549738, "policy/entropy_avg": 0.13912074267864227, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.000160574913025, "val/ratio_var": 6.19857644323929e-07 }, { "episode": 3392, "epoch": 0.6401207775051897, "eps": 0, "loss/policy_avg": -0.012439190410077572, "loss/value_avg": 0.016416631639003754, "lr": 4.3939393939393937e-07, "objective/entropy": -722.3007202148438, "objective/kl": 15.750506401062012, "objective/non_score_reward": -0.47251516580581665, "objective/rlhf_reward": -0.16929252445697784, "objective/scores": 0.302734375, "policy/approxkl_avg": 0.00040123704820871353, "policy/clipfrac_avg": 0.009440924972295761, "policy/entropy_avg": 0.14617919921875, "step": 53, "val/clipfrac_avg": 6.367155947373249e-06, "val/num_eos_tokens": 51, "val/ratio": 0.9999165534973145, "val/ratio_var": 5.078387061985268e-07 }, { "episode": 3456, "epoch": 0.6521985280241555, "eps": 0, "loss/policy_avg": -0.026336457580327988, "loss/value_avg": 0.01662503555417061, "lr": 4.377104377104377e-07, "objective/entropy": -792.0408935546875, "objective/kl": 15.416799545288086, "objective/non_score_reward": -0.4625040292739868, "objective/rlhf_reward": -0.012308701872825623, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0010212509660050273, "policy/clipfrac_avg": 0.008898193016648293, "policy/entropy_avg": 0.13633601367473602, "step": 54, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 54, "val/ratio": 0.9999532699584961, "val/ratio_var": 4.915744966638158e-07 }, { "episode": 3520, "epoch": 0.6642762785431213, "eps": 0, "loss/policy_avg": -0.03656713292002678, "loss/value_avg": 0.016763746738433838, "lr": 4.3602693602693604e-07, "objective/entropy": -728.6891479492188, "objective/kl": 15.019660949707031, "objective/non_score_reward": -0.4505898356437683, "objective/rlhf_reward": -0.0018593519926071167, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0011785384267568588, "policy/clipfrac_avg": 0.009183433838188648, "policy/entropy_avg": 0.15262095630168915, "step": 55, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.0000860691070557, "val/ratio_var": 8.0383756539959e-07 }, { "episode": 3584, "epoch": 0.6763540290620872, "eps": 0, "loss/policy_avg": -0.022373374551534653, "loss/value_avg": 0.016118954867124557, "lr": 4.3434343434343435e-07, "objective/entropy": -780.7221069335938, "objective/kl": 14.14107608795166, "objective/non_score_reward": -0.4242323040962219, "objective/rlhf_reward": -0.03507213294506073, "objective/scores": 0.388671875, "policy/approxkl_avg": 0.0003644491662271321, "policy/clipfrac_avg": 0.009697480127215385, "policy/entropy_avg": 0.1477101743221283, "step": 56, "val/clipfrac_avg": 4.006410563306417e-06, "val/num_eos_tokens": 58, "val/ratio": 1.0000145435333252, "val/ratio_var": 4.715680574918224e-07 }, { "episode": 3648, "epoch": 0.6884317795810531, "eps": 0, "loss/policy_avg": -0.02042277157306671, "loss/value_avg": 0.015200886875391006, "lr": 4.326599326599326e-07, "objective/entropy": -730.958740234375, "objective/kl": 15.971136093139648, "objective/non_score_reward": -0.47913408279418945, "objective/rlhf_reward": -0.11194658279418945, "objective/scores": 0.3671875, "policy/approxkl_avg": 0.0004102127568330616, "policy/clipfrac_avg": 0.009771636687219143, "policy/entropy_avg": 0.14384841918945312, "step": 57, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 54, "val/ratio": 1.0000948905944824, "val/ratio_var": 5.781259915238479e-07 }, { "episode": 3712, "epoch": 0.7005095301000188, "eps": 0, "loss/policy_avg": -0.04674074053764343, "loss/value_avg": 0.015770789235830307, "lr": 4.309764309764309e-07, "objective/entropy": -689.0645141601562, "objective/kl": 15.481245994567871, "objective/non_score_reward": -0.4644373655319214, "objective/rlhf_reward": -0.0034998655319213867, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00042656456935219467, "policy/clipfrac_avg": 0.009159904904663563, "policy/entropy_avg": 0.15465545654296875, "step": 58, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.000055193901062, "val/ratio_var": 6.497099320768029e-07 }, { "episode": 3776, "epoch": 0.7125872806189847, "eps": 0, "loss/policy_avg": -0.018530046567320824, "loss/value_avg": 0.014360702596604824, "lr": 4.292929292929293e-07, "objective/entropy": -639.9326782226562, "objective/kl": 17.16830825805664, "objective/non_score_reward": -0.5150492787361145, "objective/rlhf_reward": -0.2088969349861145, "objective/scores": 0.306640625, "policy/approxkl_avg": 0.00047735171392560005, "policy/clipfrac_avg": 0.010190478526055813, "policy/entropy_avg": 0.17077922821044922, "step": 59, "val/clipfrac_avg": 8.251181498053484e-06, "val/num_eos_tokens": 47, "val/ratio": 1.0000905990600586, "val/ratio_var": 1.1262843599979533e-06 }, { "episode": 3840, "epoch": 0.7246650311379506, "eps": 0, "loss/policy_avg": -0.03477172553539276, "loss/value_avg": 0.014889835380017757, "lr": 4.276094276094276e-07, "objective/entropy": -693.819091796875, "objective/kl": 14.728355407714844, "objective/non_score_reward": -0.4418506622314453, "objective/rlhf_reward": -0.03706549108028412, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.00047620845725759864, "policy/clipfrac_avg": 0.00950541626662016, "policy/entropy_avg": 0.15099716186523438, "step": 60, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.0001107454299927, "val/ratio_var": 7.359848268606584e-07 }, { "episode": 3904, "epoch": 0.7367427816569164, "eps": 0, "loss/policy_avg": -0.02886144444346428, "loss/value_avg": 0.014064384624361992, "lr": 4.259259259259259e-07, "objective/entropy": -732.0101318359375, "objective/kl": 16.070905685424805, "objective/non_score_reward": -0.48212718963623047, "objective/rlhf_reward": -0.09003733098506927, "objective/scores": 0.392578125, "policy/approxkl_avg": 0.0004534229519777, "policy/clipfrac_avg": 0.008900020271539688, "policy/entropy_avg": 0.15126292407512665, "step": 61, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999483823776245, "val/ratio_var": 5.890042302780785e-07 }, { "episode": 3968, "epoch": 0.7488205321758823, "eps": 0, "loss/policy_avg": -0.03744254261255264, "loss/value_avg": 0.015733784064650536, "lr": 4.242424242424242e-07, "objective/entropy": -729.4462890625, "objective/kl": 15.581929206848145, "objective/non_score_reward": -0.4674578905105591, "objective/rlhf_reward": 0.05256165564060211, "objective/scores": 0.51953125, "policy/approxkl_avg": 0.0003785984590649605, "policy/clipfrac_avg": 0.008160373196005821, "policy/entropy_avg": 0.14229774475097656, "step": 62, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 58, "val/ratio": 1.0000548362731934, "val/ratio_var": 4.930154204885184e-07 }, { "episode": 4032, "epoch": 0.760898282694848, "eps": 0, "loss/policy_avg": -0.030014147982001305, "loss/value_avg": 0.014109417796134949, "lr": 4.225589225589226e-07, "objective/entropy": -747.200439453125, "objective/kl": 15.121429443359375, "objective/non_score_reward": -0.453642874956131, "objective/rlhf_reward": -0.04592801630496979, "objective/scores": 0.408203125, "policy/approxkl_avg": 0.0004679747798945755, "policy/clipfrac_avg": 0.008998386561870575, "policy/entropy_avg": 0.15067800879478455, "step": 63, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 55, "val/ratio": 1.0000910758972168, "val/ratio_var": 5.872569204257161e-07 }, { "episode": 4096, "epoch": 0.7729760332138139, "eps": 0, "loss/policy_avg": -0.02917386218905449, "loss/value_avg": 0.013095545582473278, "lr": 4.208754208754209e-07, "objective/entropy": -728.0292358398438, "objective/kl": 15.903536796569824, "objective/non_score_reward": -0.4771060347557068, "objective/rlhf_reward": -0.11333651840686798, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.000516406842507422, "policy/clipfrac_avg": 0.008886368945240974, "policy/entropy_avg": 0.1517333984375, "step": 64, "val/clipfrac_avg": 5.552594302571379e-06, "val/num_eos_tokens": 51, "val/ratio": 0.9999017119407654, "val/ratio_var": 7.202033316389134e-07 }, { "episode": 4160, "epoch": 0.7850537837327798, "eps": 0, "loss/policy_avg": -0.026137467473745346, "loss/value_avg": 0.012687700800597668, "lr": 4.1919191919191915e-07, "objective/entropy": -767.3764038085938, "objective/kl": 15.278279304504395, "objective/non_score_reward": -0.4583483934402466, "objective/rlhf_reward": -0.008641347289085388, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0003777016536332667, "policy/clipfrac_avg": 0.009344375692307949, "policy/entropy_avg": 0.14270401000976562, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999732971191406, "val/ratio_var": 6.31260888894758e-07 }, { "episode": 4224, "epoch": 0.7971315342517457, "eps": 0, "loss/policy_avg": -0.02155480347573757, "loss/value_avg": 0.012883363291621208, "lr": 4.1750841750841746e-07, "objective/entropy": -701.712890625, "objective/kl": 15.514598846435547, "objective/non_score_reward": -0.4654379189014435, "objective/rlhf_reward": -0.08604338765144348, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00044276120024733245, "policy/clipfrac_avg": 0.00973192136734724, "policy/entropy_avg": 0.15912756323814392, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.0000464916229248, "val/ratio_var": 7.58379371745832e-07 }, { "episode": 4288, "epoch": 0.8092092847707114, "eps": 0, "loss/policy_avg": -0.03506336733698845, "loss/value_avg": 0.014060527086257935, "lr": 4.158249158249158e-07, "objective/entropy": -723.3513793945312, "objective/kl": 16.13052749633789, "objective/non_score_reward": -0.4839158058166504, "objective/rlhf_reward": -0.03518534451723099, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0006458936259150505, "policy/clipfrac_avg": 0.009050115011632442, "policy/entropy_avg": 0.14800135791301727, "step": 67, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.9999250173568726, "val/ratio_var": 6.238656169443857e-07 }, { "episode": 4352, "epoch": 0.8212870352896773, "eps": 0, "loss/policy_avg": -0.03492492437362671, "loss/value_avg": 0.013395547866821289, "lr": 4.1414141414141413e-07, "objective/entropy": -756.134765625, "objective/kl": 14.228071212768555, "objective/non_score_reward": -0.4268421530723572, "objective/rlhf_reward": 0.005775056779384613, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00040148766129277647, "policy/clipfrac_avg": 0.008605660870671272, "policy/entropy_avg": 0.14455795288085938, "step": 68, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9998807907104492, "val/ratio_var": 5.104772071717889e-07 }, { "episode": 4416, "epoch": 0.8333647858086431, "eps": 0, "loss/policy_avg": -0.03622628003358841, "loss/value_avg": 0.012129010632634163, "lr": 4.1245791245791244e-07, "objective/entropy": -692.700439453125, "objective/kl": 15.215728759765625, "objective/non_score_reward": -0.45647183060646057, "objective/rlhf_reward": -0.03947964310646057, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00043082493357360363, "policy/clipfrac_avg": 0.009177702479064465, "policy/entropy_avg": 0.15514373779296875, "step": 69, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000324249267578, "val/ratio_var": 8.417625281254004e-07 }, { "episode": 4480, "epoch": 0.845442536327609, "eps": 0, "loss/policy_avg": -0.0306796133518219, "loss/value_avg": 0.011692370288074017, "lr": 4.1077441077441075e-07, "objective/entropy": -700.46533203125, "objective/kl": 14.929758071899414, "objective/non_score_reward": -0.4478927254676819, "objective/rlhf_reward": -0.05726771056652069, "objective/scores": 0.390625, "policy/approxkl_avg": 0.000412381486967206, "policy/clipfrac_avg": 0.008844866417348385, "policy/entropy_avg": 0.15450796484947205, "step": 70, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 1.0001542568206787, "val/ratio_var": 7.975154403538909e-07 }, { "episode": 4544, "epoch": 0.8575202868465748, "eps": 0, "loss/policy_avg": -0.03419748693704605, "loss/value_avg": 0.012416936457157135, "lr": 4.090909090909091e-07, "objective/entropy": -772.2770385742188, "objective/kl": 12.517084121704102, "objective/non_score_reward": -0.3755125403404236, "objective/rlhf_reward": 0.14401870965957642, "objective/scores": 0.51953125, "policy/approxkl_avg": 0.00036138106952421367, "policy/clipfrac_avg": 0.008941545151174068, "policy/entropy_avg": 0.13918177783489227, "step": 71, "val/clipfrac_avg": 9.753433914738707e-06, "val/num_eos_tokens": 55, "val/ratio": 0.9999549388885498, "val/ratio_var": 4.947730758431135e-07 }, { "episode": 4608, "epoch": 0.8695980373655406, "eps": 0, "loss/policy_avg": -0.0443786196410656, "loss/value_avg": 0.011243673972785473, "lr": 4.0740740740740737e-07, "objective/entropy": -702.0181884765625, "objective/kl": 14.902286529541016, "objective/non_score_reward": -0.44706863164901733, "objective/rlhf_reward": -0.01152174174785614, "objective/scores": 0.435546875, "policy/approxkl_avg": 0.00046824943274259567, "policy/clipfrac_avg": 0.009215106256306171, "policy/entropy_avg": 0.15385818481445312, "step": 72, "val/clipfrac_avg": 4.2761357690324076e-06, "val/num_eos_tokens": 51, "val/ratio": 0.9999047517776489, "val/ratio_var": 8.828073418953863e-07 }, { "episode": 4672, "epoch": 0.8816757878845065, "eps": 0, "loss/policy_avg": -0.044579483568668365, "loss/value_avg": 0.013477655127644539, "lr": 4.057239057239057e-07, "objective/entropy": -750.3492431640625, "objective/kl": 12.742916107177734, "objective/non_score_reward": -0.38228750228881836, "objective/rlhf_reward": 0.16458749771118164, "objective/scores": 0.546875, "policy/approxkl_avg": 0.00037200923543423414, "policy/clipfrac_avg": 0.008558372035622597, "policy/entropy_avg": 0.14228439331054688, "step": 73, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.999910831451416, "val/ratio_var": 6.942154300304537e-07 }, { "episode": 4736, "epoch": 0.8937535384034724, "eps": 0, "loss/policy_avg": -0.020492155104875565, "loss/value_avg": 0.01326964795589447, "lr": 4.04040404040404e-07, "objective/entropy": -776.6446533203125, "objective/kl": 14.296285629272461, "objective/non_score_reward": -0.42888855934143066, "objective/rlhf_reward": 0.007146604359149933, "objective/scores": 0.435546875, "policy/approxkl_avg": 0.00041618672548793256, "policy/clipfrac_avg": 0.009593900293111801, "policy/entropy_avg": 0.15248744189739227, "step": 74, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999758005142212, "val/ratio_var": 6.57985367524816e-07 }, { "episode": 4800, "epoch": 0.9058312889224382, "eps": 0, "loss/policy_avg": -0.03961968421936035, "loss/value_avg": 0.013151820749044418, "lr": 4.0235690235690236e-07, "objective/entropy": -710.4595336914062, "objective/kl": 14.758489608764648, "objective/non_score_reward": -0.44275468587875366, "objective/rlhf_reward": -0.023809373378753662, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.0004320571315474808, "policy/clipfrac_avg": 0.009666088968515396, "policy/entropy_avg": 0.16128668189048767, "step": 75, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000278949737549, "val/ratio_var": 7.166216846599127e-07 }, { "episode": 4864, "epoch": 0.917909039441404, "eps": 0, "loss/policy_avg": -0.04150720685720444, "loss/value_avg": 0.013188062235713005, "lr": 4.0067340067340067e-07, "objective/entropy": -744.156005859375, "objective/kl": 15.219215393066406, "objective/non_score_reward": -0.4565764367580414, "objective/rlhf_reward": 0.07613840699195862, "objective/scores": 0.53125, "policy/approxkl_avg": 0.0003768115711864084, "policy/clipfrac_avg": 0.00844576582312584, "policy/entropy_avg": 0.15825144946575165, "step": 76, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999443292617798, "val/ratio_var": 8.649810183669615e-07 }, { "episode": 4928, "epoch": 0.9299867899603699, "eps": 0, "loss/policy_avg": -0.025156065821647644, "loss/value_avg": 0.011967229656875134, "lr": 3.98989898989899e-07, "objective/entropy": -674.7507934570312, "objective/kl": 14.667293548583984, "objective/non_score_reward": -0.44001880288124084, "objective/rlhf_reward": -0.10603442043066025, "objective/scores": 0.333984375, "policy/approxkl_avg": 0.00046505866339430213, "policy/clipfrac_avg": 0.009665473364293575, "policy/entropy_avg": 0.15473303198814392, "step": 77, "val/clipfrac_avg": 7.867573003750294e-06, "val/num_eos_tokens": 47, "val/ratio": 0.9999678134918213, "val/ratio_var": 6.291583645179344e-07 }, { "episode": 4992, "epoch": 0.9420645404793357, "eps": 0, "loss/policy_avg": -0.04044274613261223, "loss/value_avg": 0.012631962075829506, "lr": 3.973063973063973e-07, "objective/entropy": -680.3353271484375, "objective/kl": 14.307917594909668, "objective/non_score_reward": -0.4292375147342682, "objective/rlhf_reward": -0.0073625147342681885, "objective/scores": 0.421875, "policy/approxkl_avg": 0.0005386772681958973, "policy/clipfrac_avg": 0.008763562887907028, "policy/entropy_avg": 0.16646194458007812, "step": 78, "val/clipfrac_avg": 1.0013714927481487e-05, "val/num_eos_tokens": 53, "val/ratio": 0.9999792575836182, "val/ratio_var": 7.390693212983024e-07 }, { "episode": 5056, "epoch": 0.9541422909983016, "eps": 0, "loss/policy_avg": -0.048411011695861816, "loss/value_avg": 0.011281365528702736, "lr": 3.956228956228956e-07, "objective/entropy": -662.266845703125, "objective/kl": 15.492654800415039, "objective/non_score_reward": -0.46477964520454407, "objective/rlhf_reward": -0.049252308905124664, "objective/scores": 0.416015625, "policy/approxkl_avg": 0.0005287184612825513, "policy/clipfrac_avg": 0.009335631504654884, "policy/entropy_avg": 0.18247604370117188, "step": 79, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.0000967979431152, "val/ratio_var": 5.371284714783542e-07 }, { "episode": 5120, "epoch": 0.9662200415172674, "eps": 0, "loss/policy_avg": -0.05055360123515129, "loss/value_avg": 0.01074596494436264, "lr": 3.939393939393939e-07, "objective/entropy": -696.76025390625, "objective/kl": 13.393022537231445, "objective/non_score_reward": -0.40179064869880676, "objective/rlhf_reward": 0.028385117650032043, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.00045566324843093753, "policy/clipfrac_avg": 0.01011097151786089, "policy/entropy_avg": 0.16834895312786102, "step": 80, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 58, "val/ratio": 1.0001335144042969, "val/ratio_var": 8.958918442658614e-07 }, { "episode": 5184, "epoch": 0.9782977920362332, "eps": 0, "loss/policy_avg": -0.03783099725842476, "loss/value_avg": 0.010885774157941341, "lr": 3.922558922558922e-07, "objective/entropy": -701.8968505859375, "objective/kl": 14.081245422363281, "objective/non_score_reward": -0.4224373698234558, "objective/rlhf_reward": -0.01716393232345581, "objective/scores": 0.40625, "policy/approxkl_avg": 0.00042904424481093884, "policy/clipfrac_avg": 0.008891528472304344, "policy/entropy_avg": 0.16935603320598602, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 55, "val/ratio": 0.9999715089797974, "val/ratio_var": 8.756766760598111e-07 }, { "episode": 5248, "epoch": 0.9903755425551991, "eps": 0, "loss/policy_avg": -0.03562987968325615, "loss/value_avg": 0.010367941111326218, "lr": 3.9057239057239053e-07, "objective/entropy": -733.0692749023438, "objective/kl": 12.260353088378906, "objective/non_score_reward": -0.36781054735183716, "objective/rlhf_reward": 0.10777536779642105, "objective/scores": 0.4765625, "policy/approxkl_avg": 0.00047890731366351247, "policy/clipfrac_avg": 0.008688896894454956, "policy/entropy_avg": 0.1473541259765625, "step": 82, "val/clipfrac_avg": 4.130320121475961e-06, "val/num_eos_tokens": 54, "val/ratio": 1.000084400177002, "val/ratio_var": 5.904771569475997e-07 }, { "episode": 5312, "epoch": 1.002453293074165, "eps": 0, "loss/policy_avg": -0.03139907121658325, "loss/value_avg": 0.010464398190379143, "lr": 3.888888888888889e-07, "objective/entropy": -696.6053466796875, "objective/kl": 12.150976181030273, "objective/non_score_reward": -0.36452925205230713, "objective/rlhf_reward": -0.008083932101726532, "objective/scores": 0.35546875, "policy/approxkl_avg": 0.00038306796341203153, "policy/clipfrac_avg": 0.009303595870733261, "policy/entropy_avg": 0.17142996191978455, "step": 83, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 1.0002813339233398, "val/ratio_var": 6.230750955182884e-07 }, { "episode": 5376, "epoch": 1.0145310435931307, "eps": 0, "loss/policy_avg": -0.03326902911067009, "loss/value_avg": 0.009499987587332726, "lr": 3.872053872053872e-07, "objective/entropy": -681.0773315429688, "objective/kl": 13.791988372802734, "objective/non_score_reward": -0.41375964879989624, "objective/rlhf_reward": 0.0037208348512649536, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00044912411249242723, "policy/clipfrac_avg": 0.009229006245732307, "policy/entropy_avg": 0.16622290015220642, "step": 84, "val/clipfrac_avg": 1.3086264516459778e-05, "val/num_eos_tokens": 50, "val/ratio": 1.0001062154769897, "val/ratio_var": 6.348414558488003e-07 }, { "episode": 5440, "epoch": 1.0266087941120967, "eps": 0, "loss/policy_avg": -0.024213604629039764, "loss/value_avg": 0.009874923154711723, "lr": 3.855218855218855e-07, "objective/entropy": -694.9255981445312, "objective/kl": 12.345937728881836, "objective/non_score_reward": -0.37037813663482666, "objective/rlhf_reward": 0.06370390206575394, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0010469364933669567, "policy/clipfrac_avg": 0.007577784359455109, "policy/entropy_avg": 0.17229843139648438, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 35, "val/ratio": 1.000024676322937, "val/ratio_var": 6.738481488355319e-07 }, { "episode": 5504, "epoch": 1.0386865446310625, "eps": 0, "loss/policy_avg": -0.024558693170547485, "loss/value_avg": 0.00916454941034317, "lr": 3.8383838383838377e-07, "objective/entropy": -758.008544921875, "objective/kl": 11.96615219116211, "objective/non_score_reward": -0.3589845895767212, "objective/rlhf_reward": 0.05996072292327881, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.0004611395706888288, "policy/clipfrac_avg": 0.009094095788896084, "policy/entropy_avg": 0.1533660888671875, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000550746917725, "val/ratio_var": 5.709296146960696e-07 }, { "episode": 5568, "epoch": 1.0507642951500282, "eps": 0, "loss/policy_avg": -0.03639592230319977, "loss/value_avg": 0.010131916962563992, "lr": 3.8215488215488214e-07, "objective/entropy": -700.35693359375, "objective/kl": 13.447929382324219, "objective/non_score_reward": -0.40343791246414185, "objective/rlhf_reward": -0.0006058886647224426, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.00040239907684735954, "policy/clipfrac_avg": 0.00921421404927969, "policy/entropy_avg": 0.17262396216392517, "step": 87, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000275373458862, "val/ratio_var": 6.002900931889599e-07 }, { "episode": 5632, "epoch": 1.0628420456689942, "eps": 0, "loss/policy_avg": -0.01775004342198372, "loss/value_avg": 0.010766441933810711, "lr": 3.8047138047138045e-07, "objective/entropy": -757.93701171875, "objective/kl": 12.896736145019531, "objective/non_score_reward": -0.3869020938873291, "objective/rlhf_reward": 0.0408322811126709, "objective/scores": 0.427734375, "policy/approxkl_avg": 0.0003993526042904705, "policy/clipfrac_avg": 0.00875895470380783, "policy/entropy_avg": 0.158355712890625, "step": 88, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 61, "val/ratio": 0.9999960660934448, "val/ratio_var": 5.071574946668989e-07 }, { "episode": 5696, "epoch": 1.07491979618796, "eps": 0, "loss/policy_avg": -0.019584549590945244, "loss/value_avg": 0.009444857016205788, "lr": 3.7878787878787876e-07, "objective/entropy": -716.8355102539062, "objective/kl": 12.824501037597656, "objective/non_score_reward": -0.3847350478172302, "objective/rlhf_reward": 0.03274543583393097, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00036994865513406694, "policy/clipfrac_avg": 0.008531251922249794, "policy/entropy_avg": 0.1638692319393158, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.000080943107605, "val/ratio_var": 4.814820613319171e-07 }, { "episode": 5760, "epoch": 1.086997546706926, "eps": 0, "loss/policy_avg": -0.03423365205526352, "loss/value_avg": 0.009232178330421448, "lr": 3.7710437710437707e-07, "objective/entropy": -693.8837280273438, "objective/kl": 14.44405746459961, "objective/non_score_reward": -0.4333217144012451, "objective/rlhf_reward": 0.0012486129999160767, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0004389469395391643, "policy/clipfrac_avg": 0.009360795840620995, "policy/entropy_avg": 0.17364120483398438, "step": 90, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 55, "val/ratio": 1.000040054321289, "val/ratio_var": 6.926705395926547e-07 }, { "episode": 5824, "epoch": 1.0990752972258917, "eps": 0, "loss/policy_avg": -0.032273001968860626, "loss/value_avg": 0.008706326596438885, "lr": 3.7542087542087543e-07, "objective/entropy": -689.3602294921875, "objective/kl": 12.720474243164062, "objective/non_score_reward": -0.38161420822143555, "objective/rlhf_reward": 0.004616260528564453, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.0006273721810430288, "policy/clipfrac_avg": 0.009181271307170391, "policy/entropy_avg": 0.18628311157226562, "step": 91, "val/clipfrac_avg": 1.6534391761524603e-05, "val/num_eos_tokens": 41, "val/ratio": 1.0000529289245605, "val/ratio_var": 8.713162742424174e-07 }, { "episode": 5888, "epoch": 1.1111530477448575, "eps": 0, "loss/policy_avg": -0.02137349173426628, "loss/value_avg": 0.009642090648412704, "lr": 3.7373737373737374e-07, "objective/entropy": -692.1001586914062, "objective/kl": 14.837923049926758, "objective/non_score_reward": -0.4451376795768738, "objective/rlhf_reward": -0.06671970337629318, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00043404646567068994, "policy/clipfrac_avg": 0.009237932972609997, "policy/entropy_avg": 0.17236074805259705, "step": 92, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 56, "val/ratio": 1.0002210140228271, "val/ratio_var": 5.661090085595788e-07 }, { "episode": 5952, "epoch": 1.1232307982638234, "eps": 0, "loss/policy_avg": -0.02585110068321228, "loss/value_avg": 0.008299533277750015, "lr": 3.7205387205387205e-07, "objective/entropy": -694.7158813476562, "objective/kl": 13.083290100097656, "objective/non_score_reward": -0.3924986720085144, "objective/rlhf_reward": -0.0001646876335144043, "objective/scores": 0.392578125, "policy/approxkl_avg": 0.00045136193512007594, "policy/clipfrac_avg": 0.009170552715659142, "policy/entropy_avg": 0.1658935546875, "step": 93, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000104904174805, "val/ratio_var": 6.102125666984648e-07 }, { "episode": 6016, "epoch": 1.1353085487827892, "eps": 0, "loss/policy_avg": -0.009908072650432587, "loss/value_avg": 0.008584199473261833, "lr": 3.703703703703703e-07, "objective/entropy": -704.2706298828125, "objective/kl": 13.47339916229248, "objective/non_score_reward": -0.4042019844055176, "objective/rlhf_reward": -0.041409000754356384, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0004217799287289381, "policy/clipfrac_avg": 0.009384381584823132, "policy/entropy_avg": 0.15645718574523926, "step": 94, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999314546585083, "val/ratio_var": 5.486367058438191e-07 }, { "episode": 6080, "epoch": 1.147386299301755, "eps": 0, "loss/policy_avg": -0.017584126442670822, "loss/value_avg": 0.008460859768092632, "lr": 3.686868686868687e-07, "objective/entropy": -668.5169677734375, "objective/kl": 13.884815216064453, "objective/non_score_reward": -0.41654446721076965, "objective/rlhf_reward": -0.041544459760189056, "objective/scores": 0.375, "policy/approxkl_avg": 0.00046371493954211473, "policy/clipfrac_avg": 0.0087856724858284, "policy/entropy_avg": 0.17994818091392517, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000855922698975, "val/ratio_var": 8.110731641863822e-07 }, { "episode": 6144, "epoch": 1.159464049820721, "eps": 0, "loss/policy_avg": -0.044272422790527344, "loss/value_avg": 0.008409342728555202, "lr": 3.67003367003367e-07, "objective/entropy": -681.5213623046875, "objective/kl": 12.985597610473633, "objective/non_score_reward": -0.38956788182258606, "objective/rlhf_reward": 0.06941649317741394, "objective/scores": 0.458984375, "policy/approxkl_avg": 0.0004158633528277278, "policy/clipfrac_avg": 0.008287884294986725, "policy/entropy_avg": 0.17576727271080017, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.9998923540115356, "val/ratio_var": 7.354411764026736e-07 }, { "episode": 6208, "epoch": 1.1715418003396867, "eps": 0, "loss/policy_avg": -0.021865837275981903, "loss/value_avg": 0.008920140564441681, "lr": 3.653198653198653e-07, "objective/entropy": -668.6990966796875, "objective/kl": 13.717606544494629, "objective/non_score_reward": -0.41152817010879517, "objective/rlhf_reward": -0.03311019390821457, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0005852892645634711, "policy/clipfrac_avg": 0.009737148880958557, "policy/entropy_avg": 0.17390570044517517, "step": 97, "val/clipfrac_avg": 5.780614628747571e-06, "val/num_eos_tokens": 49, "val/ratio": 1.0000433921813965, "val/ratio_var": 7.945446327539685e-07 }, { "episode": 6272, "epoch": 1.1836195508586527, "eps": 0, "loss/policy_avg": -0.01734349876642227, "loss/value_avg": 0.008430123329162598, "lr": 3.636363636363636e-07, "objective/entropy": -680.8672485351562, "objective/kl": 13.248254776000977, "objective/non_score_reward": -0.3974476158618927, "objective/rlhf_reward": -0.0346546545624733, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0004206518060527742, "policy/clipfrac_avg": 0.008977975696325302, "policy/entropy_avg": 0.18427658081054688, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999732971191406, "val/ratio_var": 7.201407470347476e-07 }, { "episode": 6336, "epoch": 1.1956973013776184, "eps": 0, "loss/policy_avg": -0.01861773617565632, "loss/value_avg": 0.007797658443450928, "lr": 3.6195286195286197e-07, "objective/entropy": -750.1210327148438, "objective/kl": 12.454809188842773, "objective/non_score_reward": -0.37364426255226135, "objective/rlhf_reward": 0.08729323744773865, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00039422509144060314, "policy/clipfrac_avg": 0.008532920852303505, "policy/entropy_avg": 0.15262095630168915, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.0000927448272705, "val/ratio_var": 6.317893621599069e-07 }, { "episode": 6400, "epoch": 1.2077750518965842, "eps": 0, "loss/policy_avg": -0.018518339842557907, "loss/value_avg": 0.008531475439667702, "lr": 3.602693602693603e-07, "objective/entropy": -659.4075317382812, "objective/kl": 14.026962280273438, "objective/non_score_reward": -0.4208088517189026, "objective/rlhf_reward": -0.014070577919483185, "objective/scores": 0.40625, "policy/approxkl_avg": 0.0004532616585493088, "policy/clipfrac_avg": 0.009551241993904114, "policy/entropy_avg": 0.1776987761259079, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0001015663146973, "val/ratio_var": 7.798981869200361e-07 }, { "episode": 6464, "epoch": 1.2198528024155502, "eps": 0, "loss/policy_avg": -0.016511594876646996, "loss/value_avg": 0.007756595965474844, "lr": 3.5858585858585854e-07, "objective/entropy": -679.53662109375, "objective/kl": 11.8580322265625, "objective/non_score_reward": -0.3557409346103668, "objective/rlhf_reward": 0.003634057939052582, "objective/scores": 0.359375, "policy/approxkl_avg": 0.000443882163381204, "policy/clipfrac_avg": 0.008907586336135864, "policy/entropy_avg": 0.16290760040283203, "step": 101, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000029802322388, "val/ratio_var": 6.948280315555166e-07 }, { "episode": 6528, "epoch": 1.231930552934516, "eps": 0, "loss/policy_avg": -0.0391608364880085, "loss/value_avg": 0.00805463083088398, "lr": 3.5690235690235685e-07, "objective/entropy": -706.1781616210938, "objective/kl": 12.108580589294434, "objective/non_score_reward": -0.36325740814208984, "objective/rlhf_reward": 0.11525820195674896, "objective/scores": 0.478515625, "policy/approxkl_avg": 0.00041044512181542814, "policy/clipfrac_avg": 0.009424544870853424, "policy/entropy_avg": 0.18370692431926727, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 0.9999408721923828, "val/ratio_var": 7.074789323269215e-07 }, { "episode": 6592, "epoch": 1.2440083034534817, "eps": 0, "loss/policy_avg": -0.028079848736524582, "loss/value_avg": 0.007725001312792301, "lr": 3.552188552188552e-07, "objective/entropy": -759.4559326171875, "objective/kl": 10.86036491394043, "objective/non_score_reward": -0.3258109390735626, "objective/rlhf_reward": 0.17907187342643738, "objective/scores": 0.50390625, "policy/approxkl_avg": 0.0003799691912718117, "policy/clipfrac_avg": 0.009264815598726273, "policy/entropy_avg": 0.16744868457317352, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 57, "val/ratio": 1.0000367164611816, "val/ratio_var": 8.271580327345873e-07 }, { "episode": 6656, "epoch": 1.2560860539724477, "eps": 0, "loss/policy_avg": -0.029258184134960175, "loss/value_avg": 0.008032194338738918, "lr": 3.535353535353535e-07, "objective/entropy": -731.3373413085938, "objective/kl": 12.31856632232666, "objective/non_score_reward": -0.3695569932460785, "objective/rlhf_reward": 0.0728258341550827, "objective/scores": 0.44140625, "policy/approxkl_avg": 0.00038161594420671463, "policy/clipfrac_avg": 0.008145595900714397, "policy/entropy_avg": 0.15793482959270477, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9998757839202881, "val/ratio_var": 5.09608128140826e-07 }, { "episode": 6720, "epoch": 1.2681638044914134, "eps": 0, "loss/policy_avg": -0.02511785924434662, "loss/value_avg": 0.007496064528822899, "lr": 3.5185185185185183e-07, "objective/entropy": -748.223388671875, "objective/kl": 10.859156608581543, "objective/non_score_reward": -0.3257746994495392, "objective/rlhf_reward": 0.12442061305046082, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.00035941399983130395, "policy/clipfrac_avg": 0.007973343133926392, "policy/entropy_avg": 0.1588083952665329, "step": 105, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999687075614929, "val/ratio_var": 5.749298566115613e-07 }, { "episode": 6784, "epoch": 1.2802415550103794, "eps": 0, "loss/policy_avg": -0.016160570085048676, "loss/value_avg": 0.0075116828083992004, "lr": 3.5016835016835014e-07, "objective/entropy": -734.2767333984375, "objective/kl": 10.657108306884766, "objective/non_score_reward": -0.3197132647037506, "objective/rlhf_reward": 0.08360705524682999, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0003991243429481983, "policy/clipfrac_avg": 0.008354730904102325, "policy/entropy_avg": 0.17373785376548767, "step": 106, "val/clipfrac_avg": 6.916777692822507e-06, "val/num_eos_tokens": 48, "val/ratio": 0.9999769926071167, "val/ratio_var": 2.0136023977102013e-06 }, { "episode": 6848, "epoch": 1.2923193055293452, "eps": 0, "loss/policy_avg": -0.031159965321421623, "loss/value_avg": 0.007707377430051565, "lr": 3.484848484848485e-07, "objective/entropy": -751.6048583984375, "objective/kl": 10.399733543395996, "objective/non_score_reward": -0.31199198961257935, "objective/rlhf_reward": 0.16603532433509827, "objective/scores": 0.478515625, "policy/approxkl_avg": 0.0003584410878829658, "policy/clipfrac_avg": 0.0083873700350523, "policy/entropy_avg": 0.17502593994140625, "step": 107, "val/clipfrac_avg": 4.006410563306417e-06, "val/num_eos_tokens": 59, "val/ratio": 1.0000163316726685, "val/ratio_var": 4.1057577959691116e-07 }, { "episode": 6912, "epoch": 1.304397056048311, "eps": 0, "loss/policy_avg": -0.028454942628741264, "loss/value_avg": 0.007068981416523457, "lr": 3.4680134680134676e-07, "objective/entropy": -743.0848388671875, "objective/kl": 11.543351173400879, "objective/non_score_reward": -0.34630051255226135, "objective/rlhf_reward": 0.08485182374715805, "objective/scores": 0.431640625, "policy/approxkl_avg": 0.00046511981054209173, "policy/clipfrac_avg": 0.00906567182391882, "policy/entropy_avg": 0.17058055102825165, "step": 108, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0001753568649292, "val/ratio_var": 5.996980121381057e-07 }, { "episode": 6976, "epoch": 1.316474806567277, "eps": 0, "loss/policy_avg": -0.025691799819469452, "loss/value_avg": 0.007134787738323212, "lr": 3.451178451178451e-07, "objective/entropy": -738.3368530273438, "objective/kl": 12.694337844848633, "objective/non_score_reward": -0.38083016872406006, "objective/rlhf_reward": 0.022490166127681732, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0003700514789670706, "policy/clipfrac_avg": 0.008202668279409409, "policy/entropy_avg": 0.16368231177330017, "step": 109, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999978542327881, "val/ratio_var": 7.537715305261372e-07 }, { "episode": 7040, "epoch": 1.3285525570862426, "eps": 0, "loss/policy_avg": -0.02102075144648552, "loss/value_avg": 0.0069115618243813515, "lr": 3.434343434343434e-07, "objective/entropy": -674.031494140625, "objective/kl": 11.79364013671875, "objective/non_score_reward": -0.3538092076778412, "objective/rlhf_reward": 0.0021478235721588135, "objective/scores": 0.35546875, "policy/approxkl_avg": 0.0004363355692476034, "policy/clipfrac_avg": 0.009443921968340874, "policy/entropy_avg": 0.18636958301067352, "step": 110, "val/clipfrac_avg": 1.0153373295906931e-05, "val/num_eos_tokens": 40, "val/ratio": 1.0000288486480713, "val/ratio_var": 8.098888883978361e-07 }, { "episode": 7104, "epoch": 1.3406303076052086, "eps": 0, "loss/policy_avg": -0.032738231122493744, "loss/value_avg": 0.006995133124291897, "lr": 3.4175084175084175e-07, "objective/entropy": -694.0885620117188, "objective/kl": 11.100361824035645, "objective/non_score_reward": -0.33301082253456116, "objective/rlhf_reward": 0.11034853756427765, "objective/scores": 0.443359375, "policy/approxkl_avg": 0.0004124289262108505, "policy/clipfrac_avg": 0.009696437045931816, "policy/entropy_avg": 0.16990280151367188, "step": 111, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.000012993812561, "val/ratio_var": 6.493988848887966e-07 }, { "episode": 7168, "epoch": 1.3527080581241744, "eps": 0, "loss/policy_avg": -0.014131966978311539, "loss/value_avg": 0.0068663340061903, "lr": 3.4006734006734006e-07, "objective/entropy": -708.8604125976562, "objective/kl": 12.0945405960083, "objective/non_score_reward": -0.36283618211746216, "objective/rlhf_reward": 0.07319895178079605, "objective/scores": 0.435546875, "policy/approxkl_avg": 0.0003713433106895536, "policy/clipfrac_avg": 0.008440559729933739, "policy/entropy_avg": 0.17221546173095703, "step": 112, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9999561309814453, "val/ratio_var": 7.880616976763122e-07 }, { "episode": 7232, "epoch": 1.3647858086431401, "eps": 0, "loss/policy_avg": -0.025116082280874252, "loss/value_avg": 0.006925811991095543, "lr": 3.3838383838383837e-07, "objective/entropy": -764.6071166992188, "objective/kl": 9.985912322998047, "objective/non_score_reward": -0.29957738518714905, "objective/rlhf_reward": 0.20384058356285095, "objective/scores": 0.50390625, "policy/approxkl_avg": 0.00035295903217047453, "policy/clipfrac_avg": 0.008869624696671963, "policy/entropy_avg": 0.14903895556926727, "step": 113, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 56, "val/ratio": 0.9999246597290039, "val/ratio_var": 6.762850830455136e-07 }, { "episode": 7296, "epoch": 1.3768635591621061, "eps": 0, "loss/policy_avg": -0.03208712860941887, "loss/value_avg": 0.006576072424650192, "lr": 3.3670033670033673e-07, "objective/entropy": -727.884521484375, "objective/kl": 10.056183815002441, "objective/non_score_reward": -0.30168551206588745, "objective/rlhf_reward": 0.20173244178295135, "objective/scores": 0.50390625, "policy/approxkl_avg": 0.00036313707823865116, "policy/clipfrac_avg": 0.008787401020526886, "policy/entropy_avg": 0.16502508521080017, "step": 114, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 55, "val/ratio": 1.000067114830017, "val/ratio_var": 6.396308549483365e-07 }, { "episode": 7360, "epoch": 1.3889413096810719, "eps": 0, "loss/policy_avg": -0.012382835149765015, "loss/value_avg": 0.007463869638741016, "lr": 3.35016835016835e-07, "objective/entropy": -742.7560424804688, "objective/kl": 11.383095741271973, "objective/non_score_reward": -0.3414928615093231, "objective/rlhf_reward": 0.06329229474067688, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.0003729486488737166, "policy/clipfrac_avg": 0.008608178235590458, "policy/entropy_avg": 0.15799586474895477, "step": 115, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000958442687988, "val/ratio_var": 5.714954340874101e-07 }, { "episode": 7424, "epoch": 1.4010190602000376, "eps": 0, "loss/policy_avg": -0.025664834305644035, "loss/value_avg": 0.007761640008538961, "lr": 3.333333333333333e-07, "objective/entropy": -698.7653198242188, "objective/kl": 11.45352554321289, "objective/non_score_reward": -0.34360575675964355, "objective/rlhf_reward": 0.14565207064151764, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.0004009153926745057, "policy/clipfrac_avg": 0.009290624409914017, "policy/entropy_avg": 0.17402777075767517, "step": 116, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000454187393188, "val/ratio_var": 5.306055754772387e-07 }, { "episode": 7488, "epoch": 1.4130968107190036, "eps": 0, "loss/policy_avg": 0.002953662071377039, "loss/value_avg": 0.008123742416501045, "lr": 3.316498316498316e-07, "objective/entropy": -682.9542236328125, "objective/kl": 11.626581192016602, "objective/non_score_reward": -0.348797470331192, "objective/rlhf_reward": -0.012859970331192017, "objective/scores": 0.3359375, "policy/approxkl_avg": 0.00040126274689100683, "policy/clipfrac_avg": 0.008363310247659683, "policy/entropy_avg": 0.17609915137290955, "step": 117, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0000150203704834, "val/ratio_var": 5.960429234619369e-07 }, { "episode": 7552, "epoch": 1.4251745612379694, "eps": 0, "loss/policy_avg": -0.024843420833349228, "loss/value_avg": 0.008115454576909542, "lr": 3.2996632996633e-07, "objective/entropy": -706.5005493164062, "objective/kl": 10.046842575073242, "objective/non_score_reward": -0.30140525102615356, "objective/rlhf_reward": 0.17369240522384644, "objective/scores": 0.474609375, "policy/approxkl_avg": 0.0003913644468411803, "policy/clipfrac_avg": 0.008209867402911186, "policy/entropy_avg": 0.16899840533733368, "step": 118, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.000002145767212, "val/ratio_var": 4.6929091013225843e-07 }, { "episode": 7616, "epoch": 1.4372523117569354, "eps": 0, "loss/policy_avg": -0.027740802615880966, "loss/value_avg": 0.008931613527238369, "lr": 3.282828282828283e-07, "objective/entropy": -657.9352416992188, "objective/kl": 13.040338516235352, "objective/non_score_reward": -0.39121013879776, "objective/rlhf_reward": -0.047460153698921204, "objective/scores": 0.34375, "policy/approxkl_avg": 0.0005606787162832916, "policy/clipfrac_avg": 0.010201944038271904, "policy/entropy_avg": 0.19644801318645477, "step": 119, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999899864196777, "val/ratio_var": 7.442695277859457e-07 }, { "episode": 7680, "epoch": 1.449330062275901, "eps": 0, "loss/policy_avg": -0.020683161914348602, "loss/value_avg": 0.009700208902359009, "lr": 3.265993265993266e-07, "objective/entropy": -690.8984375, "objective/kl": 11.422820091247559, "objective/non_score_reward": -0.3426845967769623, "objective/rlhf_reward": 0.07919040322303772, "objective/scores": 0.421875, "policy/approxkl_avg": 0.0004005637892987579, "policy/clipfrac_avg": 0.008436471223831177, "policy/entropy_avg": 0.17758052051067352, "step": 120, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.00006103515625, "val/ratio_var": 6.303826580733585e-07 }, { "episode": 7744, "epoch": 1.4614078127948669, "eps": 0, "loss/policy_avg": -0.013201544992625713, "loss/value_avg": 0.008843690156936646, "lr": 3.249158249158249e-07, "objective/entropy": -661.7539672851562, "objective/kl": 11.296271324157715, "objective/non_score_reward": -0.33888810873031616, "objective/rlhf_reward": 0.023416556417942047, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.00045110570499673486, "policy/clipfrac_avg": 0.008799891918897629, "policy/entropy_avg": 0.1949361264705658, "step": 121, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.9999641180038452, "val/ratio_var": 8.055627063185966e-07 }, { "episode": 7808, "epoch": 1.4734855633138328, "eps": 0, "loss/policy_avg": -0.017284566536545753, "loss/value_avg": 0.011503017507493496, "lr": 3.2323232323232327e-07, "objective/entropy": -659.644287109375, "objective/kl": 11.284911155700684, "objective/non_score_reward": -0.33854734897613525, "objective/rlhf_reward": 0.04646243155002594, "objective/scores": 0.384765625, "policy/approxkl_avg": 0.00042937506805174053, "policy/clipfrac_avg": 0.008618071675300598, "policy/entropy_avg": 0.19123205542564392, "step": 122, "val/clipfrac_avg": 0.00022032562992535532, "val/num_eos_tokens": 43, "val/ratio": 1.0000629425048828, "val/ratio_var": 6.566247634509637e-07 }, { "episode": 7872, "epoch": 1.4855633138327986, "eps": 0, "loss/policy_avg": -0.021937822923064232, "loss/value_avg": 0.009018287062644958, "lr": 3.2154882154882153e-07, "objective/entropy": -657.0106811523438, "objective/kl": 10.135122299194336, "objective/non_score_reward": -0.3040536642074585, "objective/rlhf_reward": 0.10219632089138031, "objective/scores": 0.40625, "policy/approxkl_avg": 0.0003951935505028814, "policy/clipfrac_avg": 0.008559124544262886, "policy/entropy_avg": 0.17734527587890625, "step": 123, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0001037120819092, "val/ratio_var": 6.916661732248031e-07 }, { "episode": 7936, "epoch": 1.4976410643517646, "eps": 0, "loss/policy_avg": -0.013917829841375351, "loss/value_avg": 0.009599328972399235, "lr": 3.1986531986531984e-07, "objective/entropy": -703.0108642578125, "objective/kl": 11.34697151184082, "objective/non_score_reward": -0.34040915966033936, "objective/rlhf_reward": 0.048262715339660645, "objective/scores": 0.388671875, "policy/approxkl_avg": 0.0003819286357611418, "policy/clipfrac_avg": 0.008146028965711594, "policy/entropy_avg": 0.1836903989315033, "step": 124, "val/clipfrac_avg": 8.00051202531904e-06, "val/num_eos_tokens": 60, "val/ratio": 1.0001184940338135, "val/ratio_var": 8.107883218144707e-07 }, { "episode": 8000, "epoch": 1.5097188148707303, "eps": 0, "loss/policy_avg": -0.02694123610854149, "loss/value_avg": 0.00868251547217369, "lr": 3.1818181818181815e-07, "objective/entropy": -613.013671875, "objective/kl": 13.164669036865234, "objective/non_score_reward": -0.394940048456192, "objective/rlhf_reward": -0.043865837156772614, "objective/scores": 0.3515625, "policy/approxkl_avg": 0.0005462130066007376, "policy/clipfrac_avg": 0.008205180056393147, "policy/entropy_avg": 0.20569229125976562, "step": 125, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000652074813843, "val/ratio_var": 7.626629781043448e-07 }, { "episode": 8064, "epoch": 1.521796565389696, "eps": 0, "loss/policy_avg": -0.021035056561231613, "loss/value_avg": 0.008920412510633469, "lr": 3.164983164983165e-07, "objective/entropy": -658.7114868164062, "objective/kl": 11.970376968383789, "objective/non_score_reward": -0.35911130905151367, "objective/rlhf_reward": 0.07545901089906693, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00043813008232973516, "policy/clipfrac_avg": 0.00897503923624754, "policy/entropy_avg": 0.1892774999141693, "step": 126, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 0.9998860359191895, "val/ratio_var": 5.986601081531262e-07 }, { "episode": 8128, "epoch": 1.533874315908662, "eps": 0, "loss/policy_avg": -0.009907988831400871, "loss/value_avg": 0.007388514932245016, "lr": 3.148148148148148e-07, "objective/entropy": -683.1361083984375, "objective/kl": 11.096467971801758, "objective/non_score_reward": -0.3328940272331238, "objective/rlhf_reward": 0.006461434066295624, "objective/scores": 0.33984375, "policy/approxkl_avg": 0.0004163893754594028, "policy/clipfrac_avg": 0.00852059293538332, "policy/entropy_avg": 0.1878509521484375, "step": 127, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.000105381011963, "val/ratio_var": 7.102952963577991e-07 }, { "episode": 8192, "epoch": 1.5459520664276278, "eps": 0, "loss/policy_avg": -0.026054969057440758, "loss/value_avg": 0.0072658974677324295, "lr": 3.1313131313131313e-07, "objective/entropy": -708.2806396484375, "objective/kl": 10.58319091796875, "objective/non_score_reward": -0.31749576330184937, "objective/rlhf_reward": 0.10144957154989243, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.0004081852675881237, "policy/clipfrac_avg": 0.008715375326573849, "policy/entropy_avg": 0.17965063452720642, "step": 128, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000120401382446, "val/ratio_var": 7.822923180356156e-07 }, { "episode": 8256, "epoch": 1.5580298169465936, "eps": 0, "loss/policy_avg": -0.0229483749717474, "loss/value_avg": 0.007459428161382675, "lr": 3.1144781144781144e-07, "objective/entropy": -659.6595458984375, "objective/kl": 10.650728225708008, "objective/non_score_reward": -0.31952184438705444, "objective/rlhf_reward": 0.08672817051410675, "objective/scores": 0.40625, "policy/approxkl_avg": 0.00047252658987417817, "policy/clipfrac_avg": 0.008922006003558636, "policy/entropy_avg": 0.18110594153404236, "step": 129, "val/clipfrac_avg": 4.006410563306417e-06, "val/num_eos_tokens": 41, "val/ratio": 1.0000056028366089, "val/ratio_var": 6.670750281045912e-07 }, { "episode": 8320, "epoch": 1.5701075674655596, "eps": 0, "loss/policy_avg": -0.02617826871573925, "loss/value_avg": 0.007010661065578461, "lr": 3.0976430976430975e-07, "objective/entropy": -678.2298583984375, "objective/kl": 12.300437927246094, "objective/non_score_reward": -0.36901313066482544, "objective/rlhf_reward": 0.03870171308517456, "objective/scores": 0.408203125, "policy/approxkl_avg": 0.0006277774227783084, "policy/clipfrac_avg": 0.009144840762019157, "policy/entropy_avg": 0.19200897216796875, "step": 130, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.0000146627426147, "val/ratio_var": 6.612851848331047e-07 }, { "episode": 8384, "epoch": 1.5821853179845253, "eps": 0, "loss/policy_avg": -0.010964921675622463, "loss/value_avg": 0.006414837669581175, "lr": 3.0808080808080806e-07, "objective/entropy": -716.4149169921875, "objective/kl": 10.809772491455078, "objective/non_score_reward": -0.3242931365966797, "objective/rlhf_reward": 0.038011543452739716, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0003922901814803481, "policy/clipfrac_avg": 0.00839821808040142, "policy/entropy_avg": 0.18431854248046875, "step": 131, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 0.9999863505363464, "val/ratio_var": 6.125242180132773e-07 }, { "episode": 8448, "epoch": 1.5942630685034913, "eps": 0, "loss/policy_avg": -0.02889040671288967, "loss/value_avg": 0.00650613009929657, "lr": 3.063973063973064e-07, "objective/entropy": -713.104736328125, "objective/kl": 10.613985061645508, "objective/non_score_reward": -0.31841954588890076, "objective/rlhf_reward": 0.11712733656167984, "objective/scores": 0.435546875, "policy/approxkl_avg": 0.00041002966463565826, "policy/clipfrac_avg": 0.008486878126859665, "policy/entropy_avg": 0.1799418181180954, "step": 132, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0001921653747559, "val/ratio_var": 7.679034865759604e-07 }, { "episode": 8512, "epoch": 1.606340819022457, "eps": 0, "loss/policy_avg": -0.017285965383052826, "loss/value_avg": 0.005914734210819006, "lr": 3.047138047138047e-07, "objective/entropy": -597.8193359375, "objective/kl": 10.843616485595703, "objective/non_score_reward": -0.3253084719181061, "objective/rlhf_reward": 0.024300895631313324, "objective/scores": 0.349609375, "policy/approxkl_avg": 0.0005316430469974875, "policy/clipfrac_avg": 0.008745117112994194, "policy/entropy_avg": 0.18398921191692352, "step": 133, "val/clipfrac_avg": 1.3069845408608671e-05, "val/num_eos_tokens": 43, "val/ratio": 1.0001111030578613, "val/ratio_var": 1.0886411700994358e-06 }, { "episode": 8576, "epoch": 1.6184185695414228, "eps": 0, "loss/policy_avg": -0.0031869204249233007, "loss/value_avg": 0.0062155104242265224, "lr": 3.0303030303030305e-07, "objective/entropy": -668.35791015625, "objective/kl": 11.458172798156738, "objective/non_score_reward": -0.3437451720237732, "objective/rlhf_reward": 0.018315374851226807, "objective/scores": 0.361328125, "policy/approxkl_avg": 0.00043556466698646545, "policy/clipfrac_avg": 0.009600733406841755, "policy/entropy_avg": 0.1877404898405075, "step": 134, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000466108322144, "val/ratio_var": 5.28957230017113e-07 }, { "episode": 8640, "epoch": 1.6304963200603888, "eps": 0, "loss/policy_avg": -0.02297011762857437, "loss/value_avg": 0.005568277090787888, "lr": 3.0134680134680136e-07, "objective/entropy": -664.0026245117188, "objective/kl": 10.111173629760742, "objective/non_score_reward": -0.30333518981933594, "objective/rlhf_reward": 0.15467262268066406, "objective/scores": 0.45703125, "policy/approxkl_avg": 0.0005884866695851088, "policy/clipfrac_avg": 0.008443444035947323, "policy/entropy_avg": 0.1873784065246582, "step": 135, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999123811721802, "val/ratio_var": 6.365415856635082e-07 }, { "episode": 8704, "epoch": 1.6425740705793546, "eps": 0, "loss/policy_avg": -0.032759517431259155, "loss/value_avg": 0.005268210079520941, "lr": 2.9966329966329967e-07, "objective/entropy": -600.26708984375, "objective/kl": 11.405153274536133, "objective/non_score_reward": -0.3421545624732971, "objective/rlhf_reward": 0.05628291517496109, "objective/scores": 0.3984375, "policy/approxkl_avg": 0.0004980469821020961, "policy/clipfrac_avg": 0.00921311229467392, "policy/entropy_avg": 0.1832377165555954, "step": 136, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000090599060059, "val/ratio_var": 8.463471772302e-07 }, { "episode": 8768, "epoch": 1.6546518210983205, "eps": 0, "loss/policy_avg": -0.02364802360534668, "loss/value_avg": 0.005613654851913452, "lr": 2.9797979797979793e-07, "objective/entropy": -710.4251708984375, "objective/kl": 10.040770530700684, "objective/non_score_reward": -0.30122309923171997, "objective/rlhf_reward": 0.18656986951828003, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.00038759977906011045, "policy/clipfrac_avg": 0.008407797664403915, "policy/entropy_avg": 0.17165374755859375, "step": 137, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0000735521316528, "val/ratio_var": 5.960384896752657e-07 }, { "episode": 8832, "epoch": 1.6667295716172863, "eps": 0, "loss/policy_avg": -0.02203012816607952, "loss/value_avg": 0.00595112843438983, "lr": 2.962962962962963e-07, "objective/entropy": -672.9989624023438, "objective/kl": 10.036592483520508, "objective/non_score_reward": -0.3010977804660797, "objective/rlhf_reward": 0.07243738323450089, "objective/scores": 0.373046875, "policy/approxkl_avg": 0.00040325592271983624, "policy/clipfrac_avg": 0.008593715727329254, "policy/entropy_avg": 0.17470209300518036, "step": 138, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999663233757019, "val/ratio_var": 5.66750088637491e-07 }, { "episode": 8896, "epoch": 1.678807322136252, "eps": 0, "loss/policy_avg": -0.026858514174818993, "loss/value_avg": 0.005622117780148983, "lr": 2.946127946127946e-07, "objective/entropy": -661.4876708984375, "objective/kl": 11.446596145629883, "objective/non_score_reward": -0.343397855758667, "objective/rlhf_reward": 0.11412166804075241, "objective/scores": 0.45703125, "policy/approxkl_avg": 0.00048249890096485615, "policy/clipfrac_avg": 0.008328979834914207, "policy/entropy_avg": 0.18089675903320312, "step": 139, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000536441802979, "val/ratio_var": 7.740272849332541e-07 }, { "episode": 8960, "epoch": 1.690885072655218, "eps": 0, "loss/policy_avg": -0.012071679346263409, "loss/value_avg": 0.005673854611814022, "lr": 2.929292929292929e-07, "objective/entropy": -716.2989501953125, "objective/kl": 9.62091064453125, "objective/non_score_reward": -0.2886272668838501, "objective/rlhf_reward": 0.07660708576440811, "objective/scores": 0.365234375, "policy/approxkl_avg": 0.0004490650608204305, "policy/clipfrac_avg": 0.008078145794570446, "policy/entropy_avg": 0.17590078711509705, "step": 140, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 0.9998876452445984, "val/ratio_var": 7.121066687432176e-07 }, { "episode": 9024, "epoch": 1.7029628231741838, "eps": 0, "loss/policy_avg": -0.01407882571220398, "loss/value_avg": 0.005721048917621374, "lr": 2.912457912457912e-07, "objective/entropy": -635.1033325195312, "objective/kl": 11.491706848144531, "objective/non_score_reward": -0.34475117921829224, "objective/rlhf_reward": 0.012670673429965973, "objective/scores": 0.357421875, "policy/approxkl_avg": 0.00046376598766073585, "policy/clipfrac_avg": 0.008426757529377937, "policy/entropy_avg": 0.18351492285728455, "step": 141, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9998941421508789, "val/ratio_var": 7.447699772455962e-07 }, { "episode": 9088, "epoch": 1.7150405736931496, "eps": 0, "loss/policy_avg": -0.026968976482748985, "loss/value_avg": 0.005361597985029221, "lr": 2.895622895622896e-07, "objective/entropy": -688.4439086914062, "objective/kl": 10.791741371154785, "objective/non_score_reward": -0.323752224445343, "objective/rlhf_reward": 0.10886494815349579, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00042141028097830713, "policy/clipfrac_avg": 0.008808997459709644, "policy/entropy_avg": 0.18024954199790955, "step": 142, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000135898590088, "val/ratio_var": 5.709226229555497e-07 }, { "episode": 9152, "epoch": 1.7271183242121155, "eps": 0, "loss/policy_avg": -0.027867591008543968, "loss/value_avg": 0.005742911249399185, "lr": 2.878787878787879e-07, "objective/entropy": -663.561279296875, "objective/kl": 9.091588973999023, "objective/non_score_reward": -0.272747665643692, "objective/rlhf_reward": 0.17207655310630798, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.00043330591870471835, "policy/clipfrac_avg": 0.008460369892418385, "policy/entropy_avg": 0.19001516699790955, "step": 143, "val/clipfrac_avg": 6.127450888016028e-06, "val/num_eos_tokens": 39, "val/ratio": 1.0000602006912231, "val/ratio_var": 6.067602953407913e-07 }, { "episode": 9216, "epoch": 1.7391960747310813, "eps": 0, "loss/policy_avg": -0.048003654927015305, "loss/value_avg": 0.004765670746564865, "lr": 2.8619528619528615e-07, "objective/entropy": -609.2054443359375, "objective/kl": 11.087499618530273, "objective/non_score_reward": -0.33262500166893005, "objective/rlhf_reward": 0.15663282573223114, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.00043542124330997467, "policy/clipfrac_avg": 0.008409352041780949, "policy/entropy_avg": 0.18917052447795868, "step": 144, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9998970031738281, "val/ratio_var": 7.055933224364708e-07 }, { "episode": 9280, "epoch": 1.7512738252500473, "eps": 0, "loss/policy_avg": -0.022776823490858078, "loss/value_avg": 0.005445465445518494, "lr": 2.8451178451178446e-07, "objective/entropy": -649.3797607421875, "objective/kl": 11.225455284118652, "objective/non_score_reward": -0.3367636799812317, "objective/rlhf_reward": 0.056791022419929504, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.00043666589772328734, "policy/clipfrac_avg": 0.008527948521077633, "policy/entropy_avg": 0.1882273405790329, "step": 145, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000823736190796, "val/ratio_var": 7.544516051893879e-07 }, { "episode": 9344, "epoch": 1.763351575769013, "eps": 0, "loss/policy_avg": -0.02756238356232643, "loss/value_avg": 0.005087685771286488, "lr": 2.8282828282828283e-07, "objective/entropy": -622.69384765625, "objective/kl": 10.821589469909668, "objective/non_score_reward": -0.3246476650238037, "objective/rlhf_reward": 0.08306717872619629, "objective/scores": 0.408203125, "policy/approxkl_avg": 0.00043378135887905955, "policy/clipfrac_avg": 0.008366484194993973, "policy/entropy_avg": 0.18392562866210938, "step": 146, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999205470085144, "val/ratio_var": 7.217399229375587e-07 }, { "episode": 9408, "epoch": 1.7754293262879788, "eps": 0, "loss/policy_avg": -0.01545548252761364, "loss/value_avg": 0.005610906984657049, "lr": 2.8114478114478114e-07, "objective/entropy": -710.6151733398438, "objective/kl": 10.56408977508545, "objective/non_score_reward": -0.316922664642334, "objective/rlhf_reward": 0.09079217165708542, "objective/scores": 0.408203125, "policy/approxkl_avg": 0.00047942213132046163, "policy/clipfrac_avg": 0.008930440992116928, "policy/entropy_avg": 0.1723581999540329, "step": 147, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.9999326467514038, "val/ratio_var": 5.841548045282252e-07 }, { "episode": 9472, "epoch": 1.7875070768069448, "eps": 0, "loss/policy_avg": -0.030797995626926422, "loss/value_avg": 0.005243232008069754, "lr": 2.7946127946127945e-07, "objective/entropy": -729.7215576171875, "objective/kl": 7.677038192749023, "objective/non_score_reward": -0.23031114041805267, "objective/rlhf_reward": 0.26822400093078613, "objective/scores": 0.498046875, "policy/approxkl_avg": 0.0003217764897271991, "policy/clipfrac_avg": 0.007711475715041161, "policy/entropy_avg": 0.15406641364097595, "step": 148, "val/clipfrac_avg": 2.40384615608491e-05, "val/num_eos_tokens": 51, "val/ratio": 0.9999207854270935, "val/ratio_var": 6.207331466612231e-07 }, { "episode": 9536, "epoch": 1.7995848273259105, "eps": 0, "loss/policy_avg": -0.013873748481273651, "loss/value_avg": 0.005824836902320385, "lr": 2.7777777777777776e-07, "objective/entropy": -724.1422729492188, "objective/kl": 9.410755157470703, "objective/non_score_reward": -0.28232264518737793, "objective/rlhf_reward": 0.10390782356262207, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.00034999821218661964, "policy/clipfrac_avg": 0.00804916676133871, "policy/entropy_avg": 0.1864827573299408, "step": 149, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000414848327637, "val/ratio_var": 6.957430400689191e-07 }, { "episode": 9600, "epoch": 1.8116625778448765, "eps": 0, "loss/policy_avg": -0.03579283133149147, "loss/value_avg": 0.0054255155846476555, "lr": 2.760942760942761e-07, "objective/entropy": -673.954833984375, "objective/kl": 11.36821174621582, "objective/non_score_reward": -0.3410463333129883, "objective/rlhf_reward": 0.10768412053585052, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.00042447494342923164, "policy/clipfrac_avg": 0.00855330191552639, "policy/entropy_avg": 0.19662603735923767, "step": 150, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 1.0001200437545776, "val/ratio_var": 7.517347171415167e-07 }, { "episode": 9664, "epoch": 1.8237403283638423, "eps": 0, "loss/policy_avg": -0.008654760196805, "loss/value_avg": 0.005138866137713194, "lr": 2.7441077441077443e-07, "objective/entropy": -660.7220458984375, "objective/kl": 11.443527221679688, "objective/non_score_reward": -0.3433057963848114, "objective/rlhf_reward": 0.031938336789608, "objective/scores": 0.375, "policy/approxkl_avg": 0.00044237799011170864, "policy/clipfrac_avg": 0.008529680781066418, "policy/entropy_avg": 0.18802008032798767, "step": 151, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000139474868774, "val/ratio_var": 7.119359111129597e-07 }, { "episode": 9728, "epoch": 1.835818078882808, "eps": 0, "loss/policy_avg": -0.02010141685605049, "loss/value_avg": 0.0053723035380244255, "lr": 2.727272727272727e-07, "objective/entropy": -687.9390869140625, "objective/kl": 9.019115447998047, "objective/non_score_reward": -0.2705734670162201, "objective/rlhf_reward": 0.0761062279343605, "objective/scores": 0.34765625, "policy/approxkl_avg": 0.00040762912249192595, "policy/clipfrac_avg": 0.008179357275366783, "policy/entropy_avg": 0.18950526416301727, "step": 152, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.9998996257781982, "val/ratio_var": 6.454416165979637e-07 }, { "episode": 9792, "epoch": 1.847895829401774, "eps": 0, "loss/policy_avg": -0.003955461550503969, "loss/value_avg": 0.0057320622727274895, "lr": 2.71043771043771e-07, "objective/entropy": -709.93115234375, "objective/kl": 9.436538696289062, "objective/non_score_reward": -0.2830961346626282, "objective/rlhf_reward": 0.09629838913679123, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00038871431024745107, "policy/clipfrac_avg": 0.0074014379642903805, "policy/entropy_avg": 0.18319067358970642, "step": 153, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0000646114349365, "val/ratio_var": 5.15572935455566e-07 }, { "episode": 9856, "epoch": 1.8599735799207398, "eps": 0, "loss/policy_avg": -0.02044486068189144, "loss/value_avg": 0.00510798767209053, "lr": 2.6936026936026936e-07, "objective/entropy": -618.0216064453125, "objective/kl": 10.58153247833252, "objective/non_score_reward": -0.3174459636211395, "objective/rlhf_reward": 0.05462434142827988, "objective/scores": 0.37109375, "policy/approxkl_avg": 0.00045742533984594047, "policy/clipfrac_avg": 0.008799334987998009, "policy/entropy_avg": 0.2110799252986908, "step": 154, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9999656677246094, "val/ratio_var": 6.85510656239785e-07 }, { "episode": 9920, "epoch": 1.8720513304397055, "eps": 0, "loss/policy_avg": -0.030187513679265976, "loss/value_avg": 0.0049795545637607574, "lr": 2.676767676767677e-07, "objective/entropy": -632.8074951171875, "objective/kl": 10.401787757873535, "objective/non_score_reward": -0.3120536208152771, "objective/rlhf_reward": 0.12495807558298111, "objective/scores": 0.4375, "policy/approxkl_avg": 0.0004211895284242928, "policy/clipfrac_avg": 0.007907616905868053, "policy/entropy_avg": 0.20108795166015625, "step": 155, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.999963641166687, "val/ratio_var": 6.760049586773675e-07 }, { "episode": 9984, "epoch": 1.8841290809586715, "eps": 0, "loss/policy_avg": -0.029912468045949936, "loss/value_avg": 0.005053409840911627, "lr": 2.65993265993266e-07, "objective/entropy": -617.66943359375, "objective/kl": 11.245973587036133, "objective/non_score_reward": -0.33737921714782715, "objective/rlhf_reward": 0.07375361770391464, "objective/scores": 0.41015625, "policy/approxkl_avg": 0.00048606080235913396, "policy/clipfrac_avg": 0.00850730575621128, "policy/entropy_avg": 0.2064310759305954, "step": 156, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.000071406364441, "val/ratio_var": 9.804023193282774e-07 }, { "episode": 10048, "epoch": 1.8962068314776372, "eps": 0, "loss/policy_avg": -0.04771365970373154, "loss/value_avg": 0.004816756118088961, "lr": 2.643097643097643e-07, "objective/entropy": -655.9871826171875, "objective/kl": 9.88708782196045, "objective/non_score_reward": -0.2966126501560211, "objective/rlhf_reward": 0.19313344359397888, "objective/scores": 0.490234375, "policy/approxkl_avg": 0.00041826663073152304, "policy/clipfrac_avg": 0.00776095874607563, "policy/entropy_avg": 0.18035888671875, "step": 157, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.0000017881393433, "val/ratio_var": 6.541795869452471e-07 }, { "episode": 10112, "epoch": 1.9082845819966032, "eps": 0, "loss/policy_avg": -0.015434409491717815, "loss/value_avg": 0.004857035353779793, "lr": 2.6262626262626266e-07, "objective/entropy": -639.9647216796875, "objective/kl": 9.765108108520508, "objective/non_score_reward": -0.2929532527923584, "objective/rlhf_reward": 0.09498622268438339, "objective/scores": 0.388671875, "policy/approxkl_avg": 0.00043778051622211933, "policy/clipfrac_avg": 0.009346296079456806, "policy/entropy_avg": 0.1973876953125, "step": 158, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000224113464355, "val/ratio_var": 8.540955036551168e-07 }, { "episode": 10176, "epoch": 1.920362332515569, "eps": 0, "loss/policy_avg": -0.029195090755820274, "loss/value_avg": 0.004645572509616613, "lr": 2.609427609427609e-07, "objective/entropy": -655.7578125, "objective/kl": 11.296720504760742, "objective/non_score_reward": -0.3389016389846802, "objective/rlhf_reward": 0.10152805596590042, "objective/scores": 0.44140625, "policy/approxkl_avg": 0.0004221507697366178, "policy/clipfrac_avg": 0.008524060249328613, "policy/entropy_avg": 0.1910552978515625, "step": 159, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.999944806098938, "val/ratio_var": 5.19986315339338e-07 }, { "episode": 10240, "epoch": 1.9324400830345347, "eps": 0, "loss/policy_avg": -0.026694564148783684, "loss/value_avg": 0.0052330996841192245, "lr": 2.5925925925925923e-07, "objective/entropy": -705.4793701171875, "objective/kl": 9.390169143676758, "objective/non_score_reward": -0.2817050516605377, "objective/rlhf_reward": 0.17459377646446228, "objective/scores": 0.45703125, "policy/approxkl_avg": 0.000462901167338714, "policy/clipfrac_avg": 0.00837808009237051, "policy/entropy_avg": 0.18092474341392517, "step": 160, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.0000178813934326, "val/ratio_var": 6.675589929727721e-07 }, { "episode": 10304, "epoch": 1.9445178335535007, "eps": 0, "loss/policy_avg": -0.03287532925605774, "loss/value_avg": 0.004949102643877268, "lr": 2.5757575757575754e-07, "objective/entropy": -731.1031494140625, "objective/kl": 9.570659637451172, "objective/non_score_reward": -0.2871198058128357, "objective/rlhf_reward": 0.2748919129371643, "objective/scores": 0.5625, "policy/approxkl_avg": 0.00036840554093942046, "policy/clipfrac_avg": 0.008850205689668655, "policy/entropy_avg": 0.17508062720298767, "step": 161, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999874830245972, "val/ratio_var": 6.138056392046565e-07 }, { "episode": 10368, "epoch": 1.9565955840724665, "eps": 0, "loss/policy_avg": -0.028138628229498863, "loss/value_avg": 0.004971574060618877, "lr": 2.558922558922559e-07, "objective/entropy": -692.5169677734375, "objective/kl": 9.982763290405273, "objective/non_score_reward": -0.29948288202285767, "objective/rlhf_reward": 0.18977493047714233, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.0013259215047582984, "policy/clipfrac_avg": 0.007515524979680777, "policy/entropy_avg": 0.18165206909179688, "step": 162, "val/clipfrac_avg": 4.633748631022172e-06, "val/num_eos_tokens": 40, "val/ratio": 0.9999215006828308, "val/ratio_var": 6.52264759537502e-07 }, { "episode": 10432, "epoch": 1.9686733345914325, "eps": 0, "loss/policy_avg": -0.007500559091567993, "loss/value_avg": 0.005513847805559635, "lr": 2.542087542087542e-07, "objective/entropy": -709.478515625, "objective/kl": 8.840448379516602, "objective/non_score_reward": -0.2652134299278259, "objective/rlhf_reward": 0.15617327392101288, "objective/scores": 0.421875, "policy/approxkl_avg": 0.00044637074461206794, "policy/clipfrac_avg": 0.007994147948920727, "policy/entropy_avg": 0.185394287109375, "step": 163, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000178813934326, "val/ratio_var": 6.560539986821823e-07 }, { "episode": 10496, "epoch": 1.9807510851103982, "eps": 0, "loss/policy_avg": -0.012414928525686264, "loss/value_avg": 0.004942988511174917, "lr": 2.525252525252525e-07, "objective/entropy": -699.0042724609375, "objective/kl": 9.015774726867676, "objective/non_score_reward": -0.2704732418060303, "objective/rlhf_reward": 0.17386269569396973, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.0003955226275138557, "policy/clipfrac_avg": 0.00792492926120758, "policy/entropy_avg": 0.18563461303710938, "step": 164, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0000154972076416, "val/ratio_var": 7.629991500834876e-07 }, { "episode": 10560, "epoch": 1.992828835629364, "eps": 0, "loss/policy_avg": -0.009898051619529724, "loss/value_avg": 0.004960807505995035, "lr": 2.5084175084175083e-07, "objective/entropy": -661.8831787109375, "objective/kl": 9.828740119934082, "objective/non_score_reward": -0.29486221075057983, "objective/rlhf_reward": 0.04888780415058136, "objective/scores": 0.34375, "policy/approxkl_avg": 0.00043535567237995565, "policy/clipfrac_avg": 0.008342267014086246, "policy/entropy_avg": 0.1956939697265625, "step": 165, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9997992515563965, "val/ratio_var": 8.213489763875259e-07 }, { "episode": 10624, "epoch": 2.00490658614833, "eps": 0, "loss/policy_avg": 0.0006142702768556774, "loss/value_avg": 0.0057089244946837425, "lr": 2.4915824915824914e-07, "objective/entropy": -616.60546875, "objective/kl": 11.599992752075195, "objective/non_score_reward": -0.34799978137016296, "objective/rlhf_reward": -0.018409937620162964, "objective/scores": 0.330078125, "policy/approxkl_avg": 0.000447861006250605, "policy/clipfrac_avg": 0.008485405705869198, "policy/entropy_avg": 0.19893011450767517, "step": 166, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 35, "val/ratio": 0.9999897480010986, "val/ratio_var": 8.826224302538321e-07 }, { "episode": 10688, "epoch": 2.0169843366672957, "eps": 0, "loss/policy_avg": -0.029148969799280167, "loss/value_avg": 0.0051203519105911255, "lr": 2.4747474747474745e-07, "objective/entropy": -707.4173583984375, "objective/kl": 9.282992362976074, "objective/non_score_reward": -0.27848976850509644, "objective/rlhf_reward": 0.21418601274490356, "objective/scores": 0.4921875, "policy/approxkl_avg": 0.00035875054891221225, "policy/clipfrac_avg": 0.007683487143367529, "policy/entropy_avg": 0.15778478980064392, "step": 167, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999899864196777, "val/ratio_var": 5.554063591262093e-07 }, { "episode": 10752, "epoch": 2.0290620871862615, "eps": 0, "loss/policy_avg": -0.020282533019781113, "loss/value_avg": 0.00484459986910224, "lr": 2.4579124579124576e-07, "objective/entropy": -612.2288818359375, "objective/kl": 10.996728897094727, "objective/non_score_reward": -0.32990187406539917, "objective/rlhf_reward": 0.05437546968460083, "objective/scores": 0.384765625, "policy/approxkl_avg": 0.0004697911790572107, "policy/clipfrac_avg": 0.00856415368616581, "policy/entropy_avg": 0.19417700171470642, "step": 168, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000133514404297, "val/ratio_var": 9.346218803329975e-07 }, { "episode": 10816, "epoch": 2.0411398377052272, "eps": 0, "loss/policy_avg": -0.0368703156709671, "loss/value_avg": 0.005465418100357056, "lr": 2.441077441077441e-07, "objective/entropy": -655.9403076171875, "objective/kl": 10.055724143981934, "objective/non_score_reward": -0.30167171359062195, "objective/rlhf_reward": 0.18416813015937805, "objective/scores": 0.486328125, "policy/approxkl_avg": 0.00042185792699456215, "policy/clipfrac_avg": 0.00785021297633648, "policy/entropy_avg": 0.17223486304283142, "step": 169, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.0000656843185425, "val/ratio_var": 6.886172627673659e-07 }, { "episode": 10880, "epoch": 2.0532175882241934, "eps": 0, "loss/policy_avg": -0.05005396902561188, "loss/value_avg": 0.00523067032918334, "lr": 2.4242424242424244e-07, "objective/entropy": -686.6971435546875, "objective/kl": 10.104284286499023, "objective/non_score_reward": -0.30312851071357727, "objective/rlhf_reward": 0.20663711428642273, "objective/scores": 0.5078125, "policy/approxkl_avg": 0.0010287510231137276, "policy/clipfrac_avg": 0.0077257584780454636, "policy/entropy_avg": 0.16867446899414062, "step": 170, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000211000442505, "val/ratio_var": 8.372552429136704e-07 }, { "episode": 10944, "epoch": 2.065295338743159, "eps": 0, "loss/policy_avg": -0.025773359462618828, "loss/value_avg": 0.004905715584754944, "lr": 2.407407407407407e-07, "objective/entropy": -692.7886352539062, "objective/kl": 8.267000198364258, "objective/non_score_reward": -0.24800997972488403, "objective/rlhf_reward": 0.18021267652511597, "objective/scores": 0.427734375, "policy/approxkl_avg": 0.0003900658048223704, "policy/clipfrac_avg": 0.008449830114841461, "policy/entropy_avg": 0.16828536987304688, "step": 171, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.000011682510376, "val/ratio_var": 7.061549354148156e-07 }, { "episode": 11008, "epoch": 2.077373089262125, "eps": 0, "loss/policy_avg": -0.013189585879445076, "loss/value_avg": 0.005143987946212292, "lr": 2.3905723905723906e-07, "objective/entropy": -726.6845703125, "objective/kl": 9.84701156616211, "objective/non_score_reward": -0.2954103648662567, "objective/rlhf_reward": 0.2207029163837433, "objective/scores": 0.515625, "policy/approxkl_avg": 0.000674139242619276, "policy/clipfrac_avg": 0.008698908612132072, "policy/entropy_avg": 0.17170843482017517, "step": 172, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.000104308128357, "val/ratio_var": 6.811029038544802e-07 }, { "episode": 11072, "epoch": 2.0894508397810907, "eps": 0, "loss/policy_avg": -0.015965035185217857, "loss/value_avg": 0.004617646336555481, "lr": 2.3737373737373737e-07, "objective/entropy": -650.63671875, "objective/kl": 11.198604583740234, "objective/non_score_reward": -0.3359581232070923, "objective/rlhf_reward": 0.009256713092327118, "objective/scores": 0.345703125, "policy/approxkl_avg": 0.0004264616873115301, "policy/clipfrac_avg": 0.008644884452223778, "policy/entropy_avg": 0.1950274407863617, "step": 173, "val/clipfrac_avg": 5.36388643013197e-06, "val/num_eos_tokens": 48, "val/ratio": 1.000180721282959, "val/ratio_var": 1.0091738431583508e-06 }, { "episode": 11136, "epoch": 2.1015285903000565, "eps": 0, "loss/policy_avg": -0.034316565841436386, "loss/value_avg": 0.004706418141722679, "lr": 2.3569023569023568e-07, "objective/entropy": -705.0397338867188, "objective/kl": 8.905685424804688, "objective/non_score_reward": -0.26717060804367065, "objective/rlhf_reward": 0.27775126695632935, "objective/scores": 0.546875, "policy/approxkl_avg": 0.0004036706523038447, "policy/clipfrac_avg": 0.007710058242082596, "policy/entropy_avg": 0.1774342954158783, "step": 174, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 0.9999071955680847, "val/ratio_var": 6.414345534722088e-07 }, { "episode": 11200, "epoch": 2.1136063408190227, "eps": 0, "loss/policy_avg": -0.02365967631340027, "loss/value_avg": 0.004380353260785341, "lr": 2.34006734006734e-07, "objective/entropy": -633.821044921875, "objective/kl": 9.5961332321167, "objective/non_score_reward": -0.287883996963501, "objective/rlhf_reward": 0.11690116673707962, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.00041495892219245434, "policy/clipfrac_avg": 0.008334731683135033, "policy/entropy_avg": 0.18314361572265625, "step": 175, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.0000516176223755, "val/ratio_var": 5.880065145902336e-07 }, { "episode": 11264, "epoch": 2.1256840913379884, "eps": 0, "loss/policy_avg": -0.012720011174678802, "loss/value_avg": 0.004893209785223007, "lr": 2.323232323232323e-07, "objective/entropy": -651.2650146484375, "objective/kl": 9.453258514404297, "objective/non_score_reward": -0.28359776735305786, "objective/rlhf_reward": 0.10409756004810333, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.0007613954949192703, "policy/clipfrac_avg": 0.007399224676191807, "policy/entropy_avg": 0.17717742919921875, "step": 176, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000202655792236, "val/ratio_var": 6.240634888854402e-07 }, { "episode": 11328, "epoch": 2.137761841856954, "eps": 0, "loss/policy_avg": -0.027373038232326508, "loss/value_avg": 0.0046966951340436935, "lr": 2.3063973063973064e-07, "objective/entropy": -717.2123413085938, "objective/kl": 8.953085899353027, "objective/non_score_reward": -0.26859256625175476, "objective/rlhf_reward": 0.21578243374824524, "objective/scores": 0.484375, "policy/approxkl_avg": 0.00035203900188207626, "policy/clipfrac_avg": 0.007572174072265625, "policy/entropy_avg": 0.159637451171875, "step": 177, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 54, "val/ratio": 1.0000574588775635, "val/ratio_var": 5.12606447955477e-07 }, { "episode": 11392, "epoch": 2.14983959237592, "eps": 0, "loss/policy_avg": -0.006380847655236721, "loss/value_avg": 0.0049251774325966835, "lr": 2.2895622895622895e-07, "objective/entropy": -712.037841796875, "objective/kl": 8.245269775390625, "objective/non_score_reward": -0.24735809862613678, "objective/rlhf_reward": 0.13057157397270203, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0003549880930222571, "policy/clipfrac_avg": 0.007925866171717644, "policy/entropy_avg": 0.1665090024471283, "step": 178, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0001332759857178, "val/ratio_var": 5.663550268764084e-07 }, { "episode": 11456, "epoch": 2.1619173428948857, "eps": 0, "loss/policy_avg": -0.0014782699290663004, "loss/value_avg": 0.004644377622753382, "lr": 2.2727272727272726e-07, "objective/entropy": -662.6466064453125, "objective/kl": 9.573324203491211, "objective/non_score_reward": -0.2871997356414795, "objective/rlhf_reward": 0.0892651155591011, "objective/scores": 0.376953125, "policy/approxkl_avg": 0.0003882443706970662, "policy/clipfrac_avg": 0.007613882422447205, "policy/entropy_avg": 0.17233356833457947, "step": 179, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000505447387695, "val/ratio_var": 5.133804279466858e-07 }, { "episode": 11520, "epoch": 2.173995093413852, "eps": 0, "loss/policy_avg": -0.01099494006484747, "loss/value_avg": 0.00442532729357481, "lr": 2.2558922558922557e-07, "objective/entropy": -641.4156494140625, "objective/kl": 9.477804183959961, "objective/non_score_reward": -0.28433412313461304, "objective/rlhf_reward": 0.05966003239154816, "objective/scores": 0.34375, "policy/approxkl_avg": 0.0004196200461592525, "policy/clipfrac_avg": 0.008064374327659607, "policy/entropy_avg": 0.1929067075252533, "step": 180, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 0.9999755620956421, "val/ratio_var": 6.541202992593753e-07 }, { "episode": 11584, "epoch": 2.1860728439328176, "eps": 0, "loss/policy_avg": -0.0352904237806797, "loss/value_avg": 0.004265302326530218, "lr": 2.239057239057239e-07, "objective/entropy": -642.61669921875, "objective/kl": 10.230189323425293, "objective/non_score_reward": -0.30690568685531616, "objective/rlhf_reward": 0.11008650809526443, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.0004299771972000599, "policy/clipfrac_avg": 0.008415701799094677, "policy/entropy_avg": 0.18710581958293915, "step": 181, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000258684158325, "val/ratio_var": 7.682069735892583e-07 }, { "episode": 11648, "epoch": 2.1981505944517834, "eps": 0, "loss/policy_avg": -0.018043210729956627, "loss/value_avg": 0.004413206595927477, "lr": 2.222222222222222e-07, "objective/entropy": -659.050537109375, "objective/kl": 9.415782928466797, "objective/non_score_reward": -0.28247350454330444, "objective/rlhf_reward": 0.09106165170669556, "objective/scores": 0.373046875, "policy/approxkl_avg": 0.00045144298928789794, "policy/clipfrac_avg": 0.008088944479823112, "policy/entropy_avg": 0.18143844604492188, "step": 182, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 39, "val/ratio": 1.000122308731079, "val/ratio_var": 6.821082365604525e-07 }, { "episode": 11712, "epoch": 2.210228344970749, "eps": 0, "loss/policy_avg": -0.01957491599023342, "loss/value_avg": 0.004746252205222845, "lr": 2.2053872053872053e-07, "objective/entropy": -673.1300659179688, "objective/kl": 10.322259902954102, "objective/non_score_reward": -0.30966776609420776, "objective/rlhf_reward": 0.12490253895521164, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00038717369898222387, "policy/clipfrac_avg": 0.007530445232987404, "policy/entropy_avg": 0.18404261767864227, "step": 183, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.000051498413086, "val/ratio_var": 7.273123401319026e-07 }, { "episode": 11776, "epoch": 2.222306095489715, "eps": 0, "loss/policy_avg": 0.01077682338654995, "loss/value_avg": 0.004722831770777702, "lr": 2.1885521885521884e-07, "objective/entropy": -616.521484375, "objective/kl": 10.416351318359375, "objective/non_score_reward": -0.3124905228614807, "objective/rlhf_reward": -0.03417021036148071, "objective/scores": 0.27734375, "policy/approxkl_avg": 0.0005021474789828062, "policy/clipfrac_avg": 0.008746202103793621, "policy/entropy_avg": 0.2025197446346283, "step": 184, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9999576807022095, "val/ratio_var": 6.454907293118595e-07 }, { "episode": 11840, "epoch": 2.2343838460086807, "eps": 0, "loss/policy_avg": -0.0027505457401275635, "loss/value_avg": 0.0048131197690963745, "lr": 2.1717171717171718e-07, "objective/entropy": -694.5650634765625, "objective/kl": 9.623019218444824, "objective/non_score_reward": -0.28869056701660156, "objective/rlhf_reward": 0.10217857360839844, "objective/scores": 0.390625, "policy/approxkl_avg": 0.0003909420920535922, "policy/clipfrac_avg": 0.009193172678351402, "policy/entropy_avg": 0.17480087280273438, "step": 185, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.00017249584198, "val/ratio_var": 6.435092814172094e-07 }, { "episode": 11904, "epoch": 2.246461596527647, "eps": 0, "loss/policy_avg": -0.02041536569595337, "loss/value_avg": 0.005415412597358227, "lr": 2.1548821548821546e-07, "objective/entropy": -636.8676147460938, "objective/kl": 10.659719467163086, "objective/non_score_reward": -0.3197915852069855, "objective/rlhf_reward": 0.11331389844417572, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00040606613038107753, "policy/clipfrac_avg": 0.008012184873223305, "policy/entropy_avg": 0.18035762012004852, "step": 186, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999368190765381, "val/ratio_var": 5.389789521359489e-07 }, { "episode": 11968, "epoch": 2.2585393470466126, "eps": 0, "loss/policy_avg": -0.00026329857064411044, "loss/value_avg": 0.0059758638963103294, "lr": 2.138047138047138e-07, "objective/entropy": -671.1956787109375, "objective/kl": 10.020427703857422, "objective/non_score_reward": -0.30061283707618713, "objective/rlhf_reward": 0.10319577157497406, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.0004317883576732129, "policy/clipfrac_avg": 0.007914026267826557, "policy/entropy_avg": 0.19041061401367188, "step": 187, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000234842300415, "val/ratio_var": 6.567967147930176e-07 }, { "episode": 12032, "epoch": 2.2706170975655784, "eps": 0, "loss/policy_avg": -0.016351381316781044, "loss/value_avg": 0.004221225157380104, "lr": 2.121212121212121e-07, "objective/entropy": -581.1964721679688, "objective/kl": 11.140705108642578, "objective/non_score_reward": -0.3342211842536926, "objective/rlhf_reward": 0.006599150598049164, "objective/scores": 0.33984375, "policy/approxkl_avg": 0.0005099625559523702, "policy/clipfrac_avg": 0.008483211509883404, "policy/entropy_avg": 0.2137502133846283, "step": 188, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.00002121925354, "val/ratio_var": 7.082234105837415e-07 }, { "episode": 12096, "epoch": 2.282694848084544, "eps": 0, "loss/policy_avg": -0.004529799334704876, "loss/value_avg": 0.00449700653553009, "lr": 2.1043771043771044e-07, "objective/entropy": -733.6138305664062, "objective/kl": 9.21728229522705, "objective/non_score_reward": -0.27651846408843994, "objective/rlhf_reward": 0.08773934096097946, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.00036096826079301536, "policy/clipfrac_avg": 0.0077320970594882965, "policy/entropy_avg": 0.16572698950767517, "step": 189, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.9999610781669617, "val/ratio_var": 5.408356287261995e-07 }, { "episode": 12160, "epoch": 2.29477259860351, "eps": 0, "loss/policy_avg": -0.011266498826444149, "loss/value_avg": 0.004761071410030127, "lr": 2.0875420875420873e-07, "objective/entropy": -695.8765869140625, "objective/kl": 10.30718994140625, "objective/non_score_reward": -0.3092157244682312, "objective/rlhf_reward": 0.1111944392323494, "objective/scores": 0.419921875, "policy/approxkl_avg": 0.0003880340082105249, "policy/clipfrac_avg": 0.008478551171720028, "policy/entropy_avg": 0.1890207976102829, "step": 190, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999769926071167, "val/ratio_var": 6.441308642024524e-07 }, { "episode": 12224, "epoch": 2.306850349122476, "eps": 0, "loss/policy_avg": -0.020808562636375427, "loss/value_avg": 0.00449121231213212, "lr": 2.0707070707070707e-07, "objective/entropy": -632.7673950195312, "objective/kl": 9.880966186523438, "objective/non_score_reward": -0.29642897844314575, "objective/rlhf_reward": 0.15034836530685425, "objective/scores": 0.447265625, "policy/approxkl_avg": 0.0005901949480175972, "policy/clipfrac_avg": 0.007830065675079823, "policy/entropy_avg": 0.1974080502986908, "step": 191, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9999300241470337, "val/ratio_var": 5.447765261124005e-07 }, { "episode": 12288, "epoch": 2.318928099641442, "eps": 0, "loss/policy_avg": -0.01821664161980152, "loss/value_avg": 0.004452117718756199, "lr": 2.0538720538720538e-07, "objective/entropy": -589.8839721679688, "objective/kl": 11.97592544555664, "objective/non_score_reward": -0.35927775502204895, "objective/rlhf_reward": -0.04506877064704895, "objective/scores": 0.314453125, "policy/approxkl_avg": 0.000494088395498693, "policy/clipfrac_avg": 0.008692565374076366, "policy/entropy_avg": 0.20995458960533142, "step": 192, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.000139594078064, "val/ratio_var": 7.803449193488632e-07 }, { "episode": 12352, "epoch": 2.3310058501604076, "eps": 0, "loss/policy_avg": 0.004255164880305529, "loss/value_avg": 0.004434296861290932, "lr": 2.0370370370370369e-07, "objective/entropy": -708.4127197265625, "objective/kl": 9.581413269042969, "objective/non_score_reward": -0.2874424159526825, "objective/rlhf_reward": 0.0768154114484787, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0004709034110419452, "policy/clipfrac_avg": 0.007526259869337082, "policy/entropy_avg": 0.19600550830364227, "step": 193, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 0.9999573230743408, "val/ratio_var": 6.966297405597288e-07 }, { "episode": 12416, "epoch": 2.3430836006793734, "eps": 0, "loss/policy_avg": -0.030452650040388107, "loss/value_avg": 0.004531817510724068, "lr": 2.02020202020202e-07, "objective/entropy": -637.3931274414062, "objective/kl": 10.20181941986084, "objective/non_score_reward": -0.30605456233024597, "objective/rlhf_reward": 0.14560559391975403, "objective/scores": 0.451171875, "policy/approxkl_avg": 0.0004737515118904412, "policy/clipfrac_avg": 0.00787600688636303, "policy/entropy_avg": 0.1870168149471283, "step": 194, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000483989715576, "val/ratio_var": 8.018863582037739e-07 }, { "episode": 12480, "epoch": 2.355161351198339, "eps": 0, "loss/policy_avg": -0.014566441997885704, "loss/value_avg": 0.004195361863821745, "lr": 2.0033670033670033e-07, "objective/entropy": -724.8233642578125, "objective/kl": 8.487796783447266, "objective/non_score_reward": -0.25463390350341797, "objective/rlhf_reward": 0.19556137919425964, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.00040911376709118485, "policy/clipfrac_avg": 0.008261054754257202, "policy/entropy_avg": 0.1738535612821579, "step": 195, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000691413879395, "val/ratio_var": 5.904955173718918e-07 }, { "episode": 12544, "epoch": 2.3672391017173053, "eps": 0, "loss/policy_avg": -0.022108733654022217, "loss/value_avg": 0.0041517410427331924, "lr": 1.9865319865319864e-07, "objective/entropy": -605.9757080078125, "objective/kl": 10.502775192260742, "objective/non_score_reward": -0.31508326530456543, "objective/rlhf_reward": 0.07261204719543457, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.00047133685438893735, "policy/clipfrac_avg": 0.007839309982955456, "policy/entropy_avg": 0.19527944922447205, "step": 196, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999372959136963, "val/ratio_var": 8.094798431557138e-07 }, { "episode": 12608, "epoch": 2.379316852236271, "eps": 0, "loss/policy_avg": -0.02246645651757717, "loss/value_avg": 0.004199231043457985, "lr": 1.9696969696969696e-07, "objective/entropy": -639.3452758789062, "objective/kl": 9.353677749633789, "objective/non_score_reward": -0.2806103229522705, "objective/rlhf_reward": 0.0772998258471489, "objective/scores": 0.357421875, "policy/approxkl_avg": 0.0004011366399936378, "policy/clipfrac_avg": 0.007909499108791351, "policy/entropy_avg": 0.1853407323360443, "step": 197, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999942779541016, "val/ratio_var": 5.837881076331541e-07 }, { "episode": 12672, "epoch": 2.391394602755237, "eps": 0, "loss/policy_avg": -0.035098060965538025, "loss/value_avg": 0.004067492671310902, "lr": 1.9528619528619527e-07, "objective/entropy": -698.6280517578125, "objective/kl": 8.177114486694336, "objective/non_score_reward": -0.24531343579292297, "objective/rlhf_reward": 0.18925687670707703, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00038718507857993245, "policy/clipfrac_avg": 0.008279062807559967, "policy/entropy_avg": 0.18160438537597656, "step": 198, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000050067901611, "val/ratio_var": 5.738119170928258e-07 }, { "episode": 12736, "epoch": 2.4034723532742026, "eps": 0, "loss/policy_avg": -0.022469520568847656, "loss/value_avg": 0.00446331687271595, "lr": 1.936026936026936e-07, "objective/entropy": -676.8287963867188, "objective/kl": 7.68222713470459, "objective/non_score_reward": -0.2304668128490448, "objective/rlhf_reward": 0.1865253746509552, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00038739325827918947, "policy/clipfrac_avg": 0.008100518956780434, "policy/entropy_avg": 0.17807134985923767, "step": 199, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999980926513672, "val/ratio_var": 7.030320716694405e-07 }, { "episode": 12800, "epoch": 2.4155501037931684, "eps": 0, "loss/policy_avg": -0.008693840354681015, "loss/value_avg": 0.004308072850108147, "lr": 1.9191919191919189e-07, "objective/entropy": -678.0347900390625, "objective/kl": 8.073586463928223, "objective/non_score_reward": -0.2422075867652893, "objective/rlhf_reward": 0.1515912413597107, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.00046054236008785665, "policy/clipfrac_avg": 0.008118792437016964, "policy/entropy_avg": 0.19109344482421875, "step": 200, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.999969482421875, "val/ratio_var": 5.471772510645678e-07 }, { "episode": 12864, "epoch": 2.4276278543121346, "eps": 0, "loss/policy_avg": -0.02215806394815445, "loss/value_avg": 0.00444747693836689, "lr": 1.9023569023569022e-07, "objective/entropy": -715.016357421875, "objective/kl": 8.792640686035156, "objective/non_score_reward": -0.2637792229652405, "objective/rlhf_reward": 0.22645515203475952, "objective/scores": 0.490234375, "policy/approxkl_avg": 0.0003587045648600906, "policy/clipfrac_avg": 0.0076547968201339245, "policy/entropy_avg": 0.1849416196346283, "step": 201, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.000044584274292, "val/ratio_var": 5.405810838965408e-07 }, { "episode": 12928, "epoch": 2.4397056048311003, "eps": 0, "loss/policy_avg": 0.004962640814483166, "loss/value_avg": 0.004433006979525089, "lr": 1.8855218855218853e-07, "objective/entropy": -646.1973876953125, "objective/kl": 9.110560417175293, "objective/non_score_reward": -0.27331680059432983, "objective/rlhf_reward": 0.10168319940567017, "objective/scores": 0.375, "policy/approxkl_avg": 0.0004163091944064945, "policy/clipfrac_avg": 0.007584965787827969, "policy/entropy_avg": 0.18780645728111267, "step": 202, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999338984489441, "val/ratio_var": 9.266473739444336e-07 }, { "episode": 12992, "epoch": 2.451783355350066, "eps": 0, "loss/policy_avg": -0.011171831749379635, "loss/value_avg": 0.0047411127015948296, "lr": 1.8686868686868687e-07, "objective/entropy": -690.5891723632812, "objective/kl": 8.7337646484375, "objective/non_score_reward": -0.26201295852661133, "objective/rlhf_reward": 0.13154172897338867, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.00038152310298755765, "policy/clipfrac_avg": 0.007856165990233421, "policy/entropy_avg": 0.1933492124080658, "step": 203, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.9999629855155945, "val/ratio_var": 6.461675070568162e-07 }, { "episode": 13056, "epoch": 2.463861105869032, "eps": 0, "loss/policy_avg": -0.03274771571159363, "loss/value_avg": 0.004181091673672199, "lr": 1.8518518518518516e-07, "objective/entropy": -734.372802734375, "objective/kl": 6.88037109375, "objective/non_score_reward": -0.20641113817691803, "objective/rlhf_reward": 0.25159668922424316, "objective/scores": 0.45703125, "policy/approxkl_avg": 0.0003489043447189033, "policy/clipfrac_avg": 0.007448253687471151, "policy/entropy_avg": 0.17730967700481415, "step": 204, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 1.0000073909759521, "val/ratio_var": 6.07091635629331e-07 }, { "episode": 13120, "epoch": 2.4759388563879976, "eps": 0, "loss/policy_avg": -0.016816487535834312, "loss/value_avg": 0.0042554219253361225, "lr": 1.835016835016835e-07, "objective/entropy": -794.4978637695312, "objective/kl": 5.602289199829102, "objective/non_score_reward": -0.16806866228580475, "objective/rlhf_reward": 0.39443135261535645, "objective/scores": 0.5625, "policy/approxkl_avg": 0.0002819629153236747, "policy/clipfrac_avg": 0.006934004835784435, "policy/entropy_avg": 0.15758514404296875, "step": 205, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.9999727010726929, "val/ratio_var": 4.5193257847131463e-07 }, { "episode": 13184, "epoch": 2.4880166069069634, "eps": 0, "loss/policy_avg": -0.017609162256121635, "loss/value_avg": 0.00427992781624198, "lr": 1.818181818181818e-07, "objective/entropy": -712.7396850585938, "objective/kl": 8.044666290283203, "objective/non_score_reward": -0.24133998155593872, "objective/rlhf_reward": 0.19616001844406128, "objective/scores": 0.4375, "policy/approxkl_avg": 0.000342213868862018, "policy/clipfrac_avg": 0.006520026829093695, "policy/entropy_avg": 0.17752330005168915, "step": 206, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.9999613165855408, "val/ratio_var": 5.999673931000871e-07 }, { "episode": 13248, "epoch": 2.5000943574259296, "eps": 0, "loss/policy_avg": -0.03822872042655945, "loss/value_avg": 0.0038146479055285454, "lr": 1.8013468013468014e-07, "objective/entropy": -676.6316528320312, "objective/kl": 8.43276596069336, "objective/non_score_reward": -0.2529829740524292, "objective/rlhf_reward": 0.2226029485464096, "objective/scores": 0.4765625, "policy/approxkl_avg": 0.00040029053343459964, "policy/clipfrac_avg": 0.007741398643702269, "policy/entropy_avg": 0.18157577514648438, "step": 207, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 38, "val/ratio": 1.0000382661819458, "val/ratio_var": 7.777077826176537e-07 }, { "episode": 13312, "epoch": 2.5121721079448953, "eps": 0, "loss/policy_avg": -0.017532743513584137, "loss/value_avg": 0.004521360620856285, "lr": 1.7845117845117842e-07, "objective/entropy": -642.2452392578125, "objective/kl": 9.599320411682129, "objective/non_score_reward": -0.2879796028137207, "objective/rlhf_reward": 0.0982508733868599, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.0005058823153376579, "policy/clipfrac_avg": 0.007621736731380224, "policy/entropy_avg": 0.19717535376548767, "step": 208, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999377727508545, "val/ratio_var": 6.733653208357282e-07 }, { "episode": 13376, "epoch": 2.524249858463861, "eps": 0, "loss/policy_avg": -0.012456863187253475, "loss/value_avg": 0.004330017603933811, "lr": 1.7676767676767676e-07, "objective/entropy": -684.7198486328125, "objective/kl": 8.890013694763184, "objective/non_score_reward": -0.2667003870010376, "objective/rlhf_reward": 0.1131824254989624, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0003934592823497951, "policy/clipfrac_avg": 0.008775051683187485, "policy/entropy_avg": 0.18945693969726562, "step": 209, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0001251697540283, "val/ratio_var": 6.748607574991183e-07 }, { "episode": 13440, "epoch": 2.536327608982827, "eps": 0, "loss/policy_avg": -0.01945299468934536, "loss/value_avg": 0.0042161582969129086, "lr": 1.7508417508417507e-07, "objective/entropy": -704.7137451171875, "objective/kl": 9.013429641723633, "objective/non_score_reward": -0.27040284872055054, "objective/rlhf_reward": 0.13438229262828827, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.0003603402874432504, "policy/clipfrac_avg": 0.008057435974478722, "policy/entropy_avg": 0.18650183081626892, "step": 210, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000133514404297, "val/ratio_var": 5.646013505611336e-07 }, { "episode": 13504, "epoch": 2.5484053595017926, "eps": 0, "loss/policy_avg": -0.01828945055603981, "loss/value_avg": 0.004183897748589516, "lr": 1.7340067340067338e-07, "objective/entropy": -715.1771240234375, "objective/kl": 7.647032737731934, "objective/non_score_reward": -0.22941097617149353, "objective/rlhf_reward": 0.21931949257850647, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0005932983476668596, "policy/clipfrac_avg": 0.006059914827346802, "policy/entropy_avg": 0.17059580981731415, "step": 211, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 0.9999650716781616, "val/ratio_var": 4.5039382712275255e-07 }, { "episode": 13568, "epoch": 2.560483110020759, "eps": 0, "loss/policy_avg": -0.01205519214272499, "loss/value_avg": 0.004262065049260855, "lr": 1.717171717171717e-07, "objective/entropy": -680.666015625, "objective/kl": 9.011266708374023, "objective/non_score_reward": -0.2703379988670349, "objective/rlhf_reward": 0.14860734343528748, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.0003913857217412442, "policy/clipfrac_avg": 0.008144761435687542, "policy/entropy_avg": 0.1854146420955658, "step": 212, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.000110387802124, "val/ratio_var": 4.5980641516507603e-07 }, { "episode": 13632, "epoch": 2.5725608605397245, "eps": 0, "loss/policy_avg": -0.0006327772280201316, "loss/value_avg": 0.004324252717196941, "lr": 1.7003367003367003e-07, "objective/entropy": -680.5126953125, "objective/kl": 8.441967010498047, "objective/non_score_reward": -0.25325900316238403, "objective/rlhf_reward": 0.11051052808761597, "objective/scores": 0.36328125, "policy/approxkl_avg": 0.0003816906246356666, "policy/clipfrac_avg": 0.007939379662275314, "policy/entropy_avg": 0.19525527954101562, "step": 213, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000630617141724, "val/ratio_var": 6.067442654966726e-07 }, { "episode": 13696, "epoch": 2.5846386110586903, "eps": 0, "loss/policy_avg": -0.03070930764079094, "loss/value_avg": 0.0039781369268894196, "lr": 1.6835016835016837e-07, "objective/entropy": -648.4881591796875, "objective/kl": 8.808910369873047, "objective/non_score_reward": -0.26426729559898376, "objective/rlhf_reward": 0.16542020440101624, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.0004112160240765661, "policy/clipfrac_avg": 0.008430849760770798, "policy/entropy_avg": 0.18458303809165955, "step": 214, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.0000827312469482, "val/ratio_var": 5.764974844169046e-07 }, { "episode": 13760, "epoch": 2.596716361577656, "eps": 0, "loss/policy_avg": -0.01275802031159401, "loss/value_avg": 0.004106822889298201, "lr": 1.6666666666666665e-07, "objective/entropy": -706.819580078125, "objective/kl": 8.268199920654297, "objective/non_score_reward": -0.24804598093032837, "objective/rlhf_reward": 0.18847745656967163, "objective/scores": 0.4375, "policy/approxkl_avg": 0.0003638950875028968, "policy/clipfrac_avg": 0.00869040098041296, "policy/entropy_avg": 0.18547821044921875, "step": 215, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000922679901123, "val/ratio_var": 5.714567805625848e-07 }, { "episode": 13824, "epoch": 2.608794112096622, "eps": 0, "loss/policy_avg": -0.014732430689036846, "loss/value_avg": 0.004122150130569935, "lr": 1.64983164983165e-07, "objective/entropy": -678.539306640625, "objective/kl": 8.51124382019043, "objective/non_score_reward": -0.25533732771873474, "objective/rlhf_reward": 0.13821735978126526, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.0003934210108127445, "policy/clipfrac_avg": 0.008303534239530563, "policy/entropy_avg": 0.19069163501262665, "step": 216, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0001001358032227, "val/ratio_var": 6.961653866710549e-07 }, { "episode": 13888, "epoch": 2.620871862615588, "eps": 0, "loss/policy_avg": -0.0114736994728446, "loss/value_avg": 0.004468954633921385, "lr": 1.632996632996633e-07, "objective/entropy": -760.7581787109375, "objective/kl": 6.764780044555664, "objective/non_score_reward": -0.20294338464736938, "objective/rlhf_reward": 0.2609238028526306, "objective/scores": 0.46484375, "policy/approxkl_avg": 0.000336907512973994, "policy/clipfrac_avg": 0.007351381238549948, "policy/entropy_avg": 0.17353948950767517, "step": 217, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.000031590461731, "val/ratio_var": 5.147477963873826e-07 }, { "episode": 13952, "epoch": 2.632949613134554, "eps": 0, "loss/policy_avg": -0.025034895166754723, "loss/value_avg": 0.004216045141220093, "lr": 1.6161616161616163e-07, "objective/entropy": -698.8699951171875, "objective/kl": 7.570901870727539, "objective/non_score_reward": -0.2271270602941513, "objective/rlhf_reward": 0.2767791748046875, "objective/scores": 0.50390625, "policy/approxkl_avg": 0.0004006924282293767, "policy/clipfrac_avg": 0.007912924513220787, "policy/entropy_avg": 0.17928314208984375, "step": 218, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 1.0000429153442383, "val/ratio_var": 5.616009843834036e-07 }, { "episode": 14016, "epoch": 2.6450273636535195, "eps": 0, "loss/policy_avg": -0.010850876569747925, "loss/value_avg": 0.004759899340569973, "lr": 1.5993265993265992e-07, "objective/entropy": -677.7619018554688, "objective/kl": 9.129847526550293, "objective/non_score_reward": -0.27389541268348694, "objective/rlhf_reward": 0.10403427481651306, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00039389575249515474, "policy/clipfrac_avg": 0.008119095116853714, "policy/entropy_avg": 0.19026947021484375, "step": 219, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9999043345451355, "val/ratio_var": 6.120080797700211e-07 }, { "episode": 14080, "epoch": 2.6571051141724853, "eps": 0, "loss/policy_avg": 0.0019542532972991467, "loss/value_avg": 0.004348535090684891, "lr": 1.5824915824915826e-07, "objective/entropy": -694.8416748046875, "objective/kl": 7.038139343261719, "objective/non_score_reward": -0.21114417910575867, "objective/rlhf_reward": 0.17752769589424133, "objective/scores": 0.388671875, "policy/approxkl_avg": 0.0003559057950042188, "policy/clipfrac_avg": 0.007189783733338118, "policy/entropy_avg": 0.16869863867759705, "step": 220, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 38, "val/ratio": 0.9999446868896484, "val/ratio_var": 6.206274747455609e-07 }, { "episode": 14144, "epoch": 2.669182864691451, "eps": 0, "loss/policy_avg": -0.013687599450349808, "loss/value_avg": 0.004020463675260544, "lr": 1.5656565656565657e-07, "objective/entropy": -699.5345458984375, "objective/kl": 8.55117416381836, "objective/non_score_reward": -0.25653523206710815, "objective/rlhf_reward": 0.11699993908405304, "objective/scores": 0.373046875, "policy/approxkl_avg": 0.00037346064345911145, "policy/clipfrac_avg": 0.008171791210770607, "policy/entropy_avg": 0.19686762988567352, "step": 221, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000358819961548, "val/ratio_var": 5.536150524676486e-07 }, { "episode": 14208, "epoch": 2.6812606152104173, "eps": 0, "loss/policy_avg": -0.016610831022262573, "loss/value_avg": 0.003959144465625286, "lr": 1.5488215488215488e-07, "objective/entropy": -733.2947998046875, "objective/kl": 7.419867515563965, "objective/non_score_reward": -0.22259601950645447, "objective/rlhf_reward": 0.25543132424354553, "objective/scores": 0.478515625, "policy/approxkl_avg": 0.0003535112482495606, "policy/clipfrac_avg": 0.007308521773666143, "policy/entropy_avg": 0.1968231201171875, "step": 222, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.9999250173568726, "val/ratio_var": 4.929243573315034e-07 }, { "episode": 14272, "epoch": 2.693338365729383, "eps": 0, "loss/policy_avg": -0.03384992107748985, "loss/value_avg": 0.003912989050149918, "lr": 1.531986531986532e-07, "objective/entropy": -646.734130859375, "objective/kl": 8.461997985839844, "objective/non_score_reward": -0.25385990738868713, "objective/rlhf_reward": 0.16215571761131287, "objective/scores": 0.416015625, "policy/approxkl_avg": 0.0005014871712774038, "policy/clipfrac_avg": 0.00828520953655243, "policy/entropy_avg": 0.19067637622356415, "step": 223, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000280141830444, "val/ratio_var": 7.016036533968872e-07 }, { "episode": 14336, "epoch": 2.7054161162483488, "eps": 0, "loss/policy_avg": -0.0035064732655882835, "loss/value_avg": 0.003849966451525688, "lr": 1.5151515151515152e-07, "objective/entropy": -686.4854736328125, "objective/kl": 8.041788101196289, "objective/non_score_reward": -0.24125364422798157, "objective/rlhf_reward": 0.12544558942317963, "objective/scores": 0.3671875, "policy/approxkl_avg": 0.00039225397631525993, "policy/clipfrac_avg": 0.00783010758459568, "policy/entropy_avg": 0.19433467090129852, "step": 224, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9999794960021973, "val/ratio_var": 5.356313295123982e-07 }, { "episode": 14400, "epoch": 2.7174938667673145, "eps": 0, "loss/policy_avg": -0.005889839958399534, "loss/value_avg": 0.004009313881397247, "lr": 1.4983164983164983e-07, "objective/entropy": -688.8871459960938, "objective/kl": 7.107503890991211, "objective/non_score_reward": -0.2132251262664795, "objective/rlhf_reward": 0.1681225299835205, "objective/scores": 0.380859375, "policy/approxkl_avg": 0.00035966013092547655, "policy/clipfrac_avg": 0.0072424449026584625, "policy/entropy_avg": 0.1872762143611908, "step": 225, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000863075256348, "val/ratio_var": 6.394271281351394e-07 }, { "episode": 14464, "epoch": 2.7295716172862803, "eps": 0, "loss/policy_avg": -0.03164386376738548, "loss/value_avg": 0.003974028863012791, "lr": 1.4814814814814815e-07, "objective/entropy": -636.7869873046875, "objective/kl": 9.086867332458496, "objective/non_score_reward": -0.2726060152053833, "objective/rlhf_reward": 0.1607435941696167, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0004679976846091449, "policy/clipfrac_avg": 0.007549212779849768, "policy/entropy_avg": 0.1928914487361908, "step": 226, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9999264478683472, "val/ratio_var": 5.871561938874947e-07 }, { "episode": 14528, "epoch": 2.7416493678052465, "eps": 0, "loss/policy_avg": -0.029706722125411034, "loss/value_avg": 0.003999405540525913, "lr": 1.4646464646464646e-07, "objective/entropy": -714.131103515625, "objective/kl": 8.953620910644531, "objective/non_score_reward": -0.26860859990119934, "objective/rlhf_reward": 0.22016091644763947, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.0004997976357117295, "policy/clipfrac_avg": 0.0073518408462405205, "policy/entropy_avg": 0.19288381934165955, "step": 227, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.999913215637207, "val/ratio_var": 6.02664385951357e-07 }, { "episode": 14592, "epoch": 2.7537271183242122, "eps": 0, "loss/policy_avg": -0.024763260036706924, "loss/value_avg": 0.003725615097209811, "lr": 1.447811447811448e-07, "objective/entropy": -598.0142822265625, "objective/kl": 8.618000030517578, "objective/non_score_reward": -0.2585400342941284, "objective/rlhf_reward": 0.17554199695587158, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.00044109029113315046, "policy/clipfrac_avg": 0.008174901828169823, "policy/entropy_avg": 0.207733154296875, "step": 228, "val/clipfrac_avg": 1.2475050425564405e-05, "val/num_eos_tokens": 35, "val/ratio": 0.9999794363975525, "val/ratio_var": 7.014916718617314e-07 }, { "episode": 14656, "epoch": 2.765804868843178, "eps": 0, "loss/policy_avg": -0.025254826992750168, "loss/value_avg": 0.004053793381899595, "lr": 1.4309764309764308e-07, "objective/entropy": -670.363037109375, "objective/kl": 7.579680442810059, "objective/non_score_reward": -0.22739042341709137, "objective/rlhf_reward": 0.23550020158290863, "objective/scores": 0.462890625, "policy/approxkl_avg": 0.00038262922316789627, "policy/clipfrac_avg": 0.007922390475869179, "policy/entropy_avg": 0.18486277759075165, "step": 229, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 36, "val/ratio": 1.0000662803649902, "val/ratio_var": 6.002863983667339e-07 }, { "episode": 14720, "epoch": 2.7778826193621438, "eps": 0, "loss/policy_avg": -0.009143723174929619, "loss/value_avg": 0.004357549827545881, "lr": 1.4141414141414141e-07, "objective/entropy": -646.9794921875, "objective/kl": 9.054327964782715, "objective/non_score_reward": -0.27162984013557434, "objective/rlhf_reward": 0.09873150289058685, "objective/scores": 0.37109375, "policy/approxkl_avg": 0.00039568787906318903, "policy/clipfrac_avg": 0.007754582446068525, "policy/entropy_avg": 0.1911672055721283, "step": 230, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.000026822090149, "val/ratio_var": 7.080183195284917e-07 }, { "episode": 14784, "epoch": 2.7899603698811095, "eps": 0, "loss/policy_avg": -0.03357026353478432, "loss/value_avg": 0.00450306199491024, "lr": 1.3973063973063972e-07, "objective/entropy": -694.3631591796875, "objective/kl": 7.8914995193481445, "objective/non_score_reward": -0.2367449700832367, "objective/rlhf_reward": 0.2432354986667633, "objective/scores": 0.48046875, "policy/approxkl_avg": 0.00037845989572815597, "policy/clipfrac_avg": 0.007996518164873123, "policy/entropy_avg": 0.19372813403606415, "step": 231, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9998430013656616, "val/ratio_var": 5.800552003165649e-07 }, { "episode": 14848, "epoch": 2.8020381204000753, "eps": 0, "loss/policy_avg": 0.001023156102746725, "loss/value_avg": 0.004101088736206293, "lr": 1.3804713804713806e-07, "objective/entropy": -682.6402587890625, "objective/kl": 9.124707221984863, "objective/non_score_reward": -0.273741215467453, "objective/rlhf_reward": 0.102479487657547, "objective/scores": 0.376953125, "policy/approxkl_avg": 0.0006178760668262839, "policy/clipfrac_avg": 0.007577064447104931, "policy/entropy_avg": 0.19171142578125, "step": 232, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 0.9999958276748657, "val/ratio_var": 4.763281822306453e-07 }, { "episode": 14912, "epoch": 2.8141158709190415, "eps": 0, "loss/policy_avg": -0.011374952271580696, "loss/value_avg": 0.004087153356522322, "lr": 1.3636363636363635e-07, "objective/entropy": -650.131103515625, "objective/kl": 9.105212211608887, "objective/non_score_reward": -0.27315637469291687, "objective/rlhf_reward": 0.15897253155708313, "objective/scores": 0.431640625, "policy/approxkl_avg": 0.00038859708001837134, "policy/clipfrac_avg": 0.007399349473416805, "policy/entropy_avg": 0.17907333374023438, "step": 233, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.0000715255737305, "val/ratio_var": 5.53652057533327e-07 }, { "episode": 14976, "epoch": 2.8261936214380072, "eps": 0, "loss/policy_avg": -0.01846359483897686, "loss/value_avg": 0.003997947089374065, "lr": 1.3468013468013468e-07, "objective/entropy": -721.1434326171875, "objective/kl": 7.176656723022461, "objective/non_score_reward": -0.21529969573020935, "objective/rlhf_reward": 0.24759092926979065, "objective/scores": 0.462890625, "policy/approxkl_avg": 0.0003538678865879774, "policy/clipfrac_avg": 0.00723436800763011, "policy/entropy_avg": 0.17639541625976562, "step": 234, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 51, "val/ratio": 1.0001486539840698, "val/ratio_var": 5.433992669168219e-07 }, { "episode": 15040, "epoch": 2.838271371956973, "eps": 0, "loss/policy_avg": -0.003030909225344658, "loss/value_avg": 0.0041738273575901985, "lr": 1.32996632996633e-07, "objective/entropy": -593.6438598632812, "objective/kl": 10.389270782470703, "objective/non_score_reward": -0.31167811155319214, "objective/rlhf_reward": 0.017911747097969055, "objective/scores": 0.330078125, "policy/approxkl_avg": 0.00048703886568546295, "policy/clipfrac_avg": 0.007803687360137701, "policy/entropy_avg": 0.20614878833293915, "step": 235, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 0.9999415278434753, "val/ratio_var": 8.815354703983758e-07 }, { "episode": 15104, "epoch": 2.8503491224759387, "eps": 0, "loss/policy_avg": -0.028734426945447922, "loss/value_avg": 0.00377194257453084, "lr": 1.3131313131313133e-07, "objective/entropy": -750.08837890625, "objective/kl": 7.2935791015625, "objective/non_score_reward": -0.2188073694705963, "objective/rlhf_reward": 0.2689855992794037, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.0003561212797649205, "policy/clipfrac_avg": 0.007725189905613661, "policy/entropy_avg": 0.1868082731962204, "step": 236, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 54, "val/ratio": 0.9999493360519409, "val/ratio_var": 5.775621048087487e-07 }, { "episode": 15168, "epoch": 2.8624268729949045, "eps": 0, "loss/policy_avg": -0.02059931308031082, "loss/value_avg": 0.004160116892307997, "lr": 1.2962962962962961e-07, "objective/entropy": -689.4764404296875, "objective/kl": 7.785543441772461, "objective/non_score_reward": -0.2335663139820099, "objective/rlhf_reward": 0.1804961860179901, "objective/scores": 0.4140625, "policy/approxkl_avg": 0.0003638736379798502, "policy/clipfrac_avg": 0.007504904642701149, "policy/entropy_avg": 0.19126257300376892, "step": 237, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.000117540359497, "val/ratio_var": 5.907762670176453e-07 }, { "episode": 15232, "epoch": 2.8745046235138707, "eps": 0, "loss/policy_avg": -0.002399355173110962, "loss/value_avg": 0.003902244148775935, "lr": 1.2794612794612795e-07, "objective/entropy": -693.060791015625, "objective/kl": 6.487215042114258, "objective/non_score_reward": -0.19461645185947418, "objective/rlhf_reward": 0.18038354814052582, "objective/scores": 0.375, "policy/approxkl_avg": 0.00036661222111433744, "policy/clipfrac_avg": 0.007351069711148739, "policy/entropy_avg": 0.1860555112361908, "step": 238, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000566244125366, "val/ratio_var": 6.291702447924763e-07 }, { "episode": 15296, "epoch": 2.8865823740328365, "eps": 0, "loss/policy_avg": -0.007437670137733221, "loss/value_avg": 0.004013408906757832, "lr": 1.2626262626262626e-07, "objective/entropy": -704.9505615234375, "objective/kl": 6.939170837402344, "objective/non_score_reward": -0.20817512273788452, "objective/rlhf_reward": 0.17024284601211548, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.00033546844497323036, "policy/clipfrac_avg": 0.007042970508337021, "policy/entropy_avg": 0.1695149838924408, "step": 239, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000756978988647, "val/ratio_var": 6.734092039550887e-07 }, { "episode": 15360, "epoch": 2.898660124551802, "eps": 0, "loss/policy_avg": -0.028576653450727463, "loss/value_avg": 0.004010652657598257, "lr": 1.2457912457912457e-07, "objective/entropy": -654.0175170898438, "objective/kl": 8.195779800415039, "objective/non_score_reward": -0.24587342143058777, "objective/rlhf_reward": 0.18869690597057343, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0004402039048727602, "policy/clipfrac_avg": 0.007189431693404913, "policy/entropy_avg": 0.17950567603111267, "step": 240, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9998716115951538, "val/ratio_var": 6.682057573925704e-07 }, { "episode": 15424, "epoch": 2.910737875070768, "eps": 0, "loss/policy_avg": -0.005296625196933746, "loss/value_avg": 0.0039635319262743, "lr": 1.2289562289562288e-07, "objective/entropy": -656.5180053710938, "objective/kl": 8.551060676574707, "objective/non_score_reward": -0.2565317749977112, "objective/rlhf_reward": 0.13604632019996643, "objective/scores": 0.392578125, "policy/approxkl_avg": 0.00038327404763549566, "policy/clipfrac_avg": 0.007787951733916998, "policy/entropy_avg": 0.17977142333984375, "step": 241, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0001020431518555, "val/ratio_var": 5.759193868470902e-07 }, { "episode": 15488, "epoch": 2.9228156255897337, "eps": 0, "loss/policy_avg": -0.009168568067252636, "loss/value_avg": 0.004365907050669193, "lr": 1.2121212121212122e-07, "objective/entropy": -702.2899169921875, "objective/kl": 7.23637056350708, "objective/non_score_reward": -0.21709111332893372, "objective/rlhf_reward": 0.22870966792106628, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.0003652493469417095, "policy/clipfrac_avg": 0.007043258287012577, "policy/entropy_avg": 0.1999460905790329, "step": 242, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 0.9999537467956543, "val/ratio_var": 4.7287392135331174e-07 }, { "episode": 15552, "epoch": 2.9348933761087, "eps": 0, "loss/policy_avg": -0.03705034777522087, "loss/value_avg": 0.0035443564411252737, "lr": 1.1952861952861953e-07, "objective/entropy": -629.838623046875, "objective/kl": 8.425390243530273, "objective/non_score_reward": -0.25276172161102295, "objective/rlhf_reward": 0.22135938704013824, "objective/scores": 0.474609375, "policy/approxkl_avg": 0.00040427930071018636, "policy/clipfrac_avg": 0.00702214939519763, "policy/entropy_avg": 0.19646072387695312, "step": 243, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.0001170635223389, "val/ratio_var": 6.153204026304593e-07 }, { "episode": 15616, "epoch": 2.9469711266276657, "eps": 0, "loss/policy_avg": -0.03318122774362564, "loss/value_avg": 0.003984754905104637, "lr": 1.1784511784511784e-07, "objective/entropy": -658.1822509765625, "objective/kl": 8.162740707397461, "objective/non_score_reward": -0.2448822408914566, "objective/rlhf_reward": 0.2568267583847046, "objective/scores": 0.5, "policy/approxkl_avg": 0.00037857884308323264, "policy/clipfrac_avg": 0.0069054896011948586, "policy/entropy_avg": 0.18754324316978455, "step": 244, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 37, "val/ratio": 0.9999241828918457, "val/ratio_var": 6.279624926719407e-07 }, { "episode": 15680, "epoch": 2.9590488771466315, "eps": 0, "loss/policy_avg": -0.019931495189666748, "loss/value_avg": 0.003651971695944667, "lr": 1.1616161616161615e-07, "objective/entropy": -606.1823120117188, "objective/kl": 7.644600868225098, "objective/non_score_reward": -0.22933802008628845, "objective/rlhf_reward": 0.14859166741371155, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0005081476992927492, "policy/clipfrac_avg": 0.007128735538572073, "policy/entropy_avg": 0.19517645239830017, "step": 245, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.0000133514404297, "val/ratio_var": 6.433290309360018e-07 }, { "episode": 15744, "epoch": 2.971126627665597, "eps": 0, "loss/policy_avg": -0.044989436864852905, "loss/value_avg": 0.004129257518798113, "lr": 1.1447811447811447e-07, "objective/entropy": -718.4443359375, "objective/kl": 6.58664608001709, "objective/non_score_reward": -0.197599396109581, "objective/rlhf_reward": 0.3522053062915802, "objective/scores": 0.55078125, "policy/approxkl_avg": 0.00035908090649172664, "policy/clipfrac_avg": 0.006561779882758856, "policy/entropy_avg": 0.17469915747642517, "step": 246, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 1.000040888786316, "val/ratio_var": 6.177034492793609e-07 }, { "episode": 15808, "epoch": 2.983204378184563, "eps": 0, "loss/policy_avg": -0.006306433584541082, "loss/value_avg": 0.00377975357696414, "lr": 1.1279461279461279e-07, "objective/entropy": -669.0179443359375, "objective/kl": 7.104648590087891, "objective/non_score_reward": -0.21313944458961487, "objective/rlhf_reward": 0.19604022800922394, "objective/scores": 0.41015625, "policy/approxkl_avg": 0.0003768262395169586, "policy/clipfrac_avg": 0.007034813519567251, "policy/entropy_avg": 0.1779836118221283, "step": 247, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.999952495098114, "val/ratio_var": 5.565264018514426e-07 }, { "episode": 15872, "epoch": 2.995282128703529, "eps": 0, "loss/policy_avg": -0.030956070870161057, "loss/value_avg": 0.0035534966737031937, "lr": 1.111111111111111e-07, "objective/entropy": -611.2562255859375, "objective/kl": 8.399555206298828, "objective/non_score_reward": -0.25198665261268616, "objective/rlhf_reward": 0.16500553488731384, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00039667676901444793, "policy/clipfrac_avg": 0.007198335137218237, "policy/entropy_avg": 0.19819514453411102, "step": 248, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.999969482421875, "val/ratio_var": 4.946619469592406e-07 }, { "episode": 15936, "epoch": 3.007359879222495, "eps": 0, "loss/policy_avg": -0.017340319231152534, "loss/value_avg": 0.004106181673705578, "lr": 1.0942760942760942e-07, "objective/entropy": -661.902099609375, "objective/kl": 7.087057113647461, "objective/non_score_reward": -0.21261171996593475, "objective/rlhf_reward": 0.24832576513290405, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00036943715531378984, "policy/clipfrac_avg": 0.0072855958715081215, "policy/entropy_avg": 0.19068431854248047, "step": 249, "val/clipfrac_avg": 5.44804743185523e-06, "val/num_eos_tokens": 46, "val/ratio": 1.0000860691070557, "val/ratio_var": 6.541467314491456e-07 }, { "episode": 16000, "epoch": 3.0194376297414607, "eps": 0, "loss/policy_avg": -0.006075289100408554, "loss/value_avg": 0.0037019317969679832, "lr": 1.0774410774410773e-07, "objective/entropy": -622.1962890625, "objective/kl": 8.35627555847168, "objective/non_score_reward": -0.25068825483322144, "objective/rlhf_reward": 0.055464085191488266, "objective/scores": 0.306640625, "policy/approxkl_avg": 0.0004216305387672037, "policy/clipfrac_avg": 0.00783085823059082, "policy/entropy_avg": 0.2043101042509079, "step": 250, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 33, "val/ratio": 0.9999951124191284, "val/ratio_var": 7.26553423646692e-07 }, { "episode": 16064, "epoch": 3.0315153802604264, "eps": 0, "loss/policy_avg": -0.02081982046365738, "loss/value_avg": 0.0035106416326016188, "lr": 1.0606060606060605e-07, "objective/entropy": -641.5849609375, "objective/kl": 8.316513061523438, "objective/non_score_reward": -0.24949535727500916, "objective/rlhf_reward": 0.19825854897499084, "objective/scores": 0.447265625, "policy/approxkl_avg": 0.00040991941932588816, "policy/clipfrac_avg": 0.007065493613481522, "policy/entropy_avg": 0.19249090552330017, "step": 251, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.999900221824646, "val/ratio_var": 8.348190476681339e-07 }, { "episode": 16128, "epoch": 3.043593130779392, "eps": 0, "loss/policy_avg": -0.03310645744204521, "loss/value_avg": 0.003968073055148125, "lr": 1.0437710437710436e-07, "objective/entropy": -636.48974609375, "objective/kl": 7.7286529541015625, "objective/non_score_reward": -0.23185959458351135, "objective/rlhf_reward": 0.17146071791648865, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.00042435387149453163, "policy/clipfrac_avg": 0.007897584699094296, "policy/entropy_avg": 0.20297622680664062, "step": 252, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 0.99993896484375, "val/ratio_var": 8.238701525442593e-07 }, { "episode": 16192, "epoch": 3.0556708812983584, "eps": 0, "loss/policy_avg": -0.01020126324146986, "loss/value_avg": 0.0036760650109499693, "lr": 1.0269360269360269e-07, "objective/entropy": -679.7139892578125, "objective/kl": 7.155096054077148, "objective/non_score_reward": -0.2146528959274292, "objective/rlhf_reward": 0.216499462723732, "objective/scores": 0.431640625, "policy/approxkl_avg": 0.0003815985983237624, "policy/clipfrac_avg": 0.007189772091805935, "policy/entropy_avg": 0.19544348120689392, "step": 253, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999157190322876, "val/ratio_var": 5.615696636596113e-07 }, { "episode": 16256, "epoch": 3.067748631817324, "eps": 0, "loss/policy_avg": -0.005997738800942898, "loss/value_avg": 0.004006213508546352, "lr": 1.01010101010101e-07, "objective/entropy": -635.3106689453125, "objective/kl": 8.364971160888672, "objective/non_score_reward": -0.2509491443634033, "objective/rlhf_reward": 0.12844537198543549, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.000423733436036855, "policy/clipfrac_avg": 0.006593957543373108, "policy/entropy_avg": 0.2101338803768158, "step": 254, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999582171440125, "val/ratio_var": 5.447363378152659e-07 }, { "episode": 16320, "epoch": 3.07982638233629, "eps": 0, "loss/policy_avg": -0.016920043155550957, "loss/value_avg": 0.0037770867347717285, "lr": 9.932659932659932e-08, "objective/entropy": -681.9376220703125, "objective/kl": 7.062074661254883, "objective/non_score_reward": -0.2118622362613678, "objective/rlhf_reward": 0.2236846387386322, "objective/scores": 0.435546875, "policy/approxkl_avg": 0.00037374263047240674, "policy/clipfrac_avg": 0.0069709960371255875, "policy/entropy_avg": 0.18284988403320312, "step": 255, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 1.0000779628753662, "val/ratio_var": 6.587182497241884e-07 }, { "episode": 16384, "epoch": 3.0919041328552557, "eps": 0, "loss/policy_avg": -0.01712847873568535, "loss/value_avg": 0.0041097188368439674, "lr": 9.764309764309763e-08, "objective/entropy": -683.023193359375, "objective/kl": 6.437891960144043, "objective/non_score_reward": -0.19313675165176392, "objective/rlhf_reward": 0.18479293584823608, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0003696854109875858, "policy/clipfrac_avg": 0.007503229193389416, "policy/entropy_avg": 0.18361155688762665, "step": 256, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000218152999878, "val/ratio_var": 5.72627527617442e-07 }, { "episode": 16448, "epoch": 3.1039818833742214, "eps": 0, "loss/policy_avg": -0.03213302046060562, "loss/value_avg": 0.0037582411896437407, "lr": 9.595959595959594e-08, "objective/entropy": -648.9036254882812, "objective/kl": 7.319805145263672, "objective/non_score_reward": -0.21959413588047028, "objective/rlhf_reward": 0.22278867661952972, "objective/scores": 0.44140625, "policy/approxkl_avg": 0.0003951989929191768, "policy/clipfrac_avg": 0.006181157194077969, "policy/entropy_avg": 0.18743896484375, "step": 257, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000240802764893, "val/ratio_var": 6.705196256007184e-07 }, { "episode": 16512, "epoch": 3.116059633893187, "eps": 0, "loss/policy_avg": -0.026423348113894463, "loss/value_avg": 0.00365483108907938, "lr": 9.427609427609427e-08, "objective/entropy": -658.4329223632812, "objective/kl": 6.591666221618652, "objective/non_score_reward": -0.19774997234344482, "objective/rlhf_reward": 0.23535549640655518, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0003852533991448581, "policy/clipfrac_avg": 0.006929041352123022, "policy/entropy_avg": 0.18116506934165955, "step": 258, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.000004768371582, "val/ratio_var": 6.329533448479197e-07 }, { "episode": 16576, "epoch": 3.1281373844121534, "eps": 0, "loss/policy_avg": -0.01229805313050747, "loss/value_avg": 0.004078000318259001, "lr": 9.259259259259258e-08, "objective/entropy": -706.8790893554688, "objective/kl": 6.781729698181152, "objective/non_score_reward": -0.20345187187194824, "objective/rlhf_reward": 0.25748562812805176, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00035077554639428854, "policy/clipfrac_avg": 0.00699991500005126, "policy/entropy_avg": 0.18130874633789062, "step": 259, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999758005142212, "val/ratio_var": 5.301695296111575e-07 }, { "episode": 16640, "epoch": 3.140215134931119, "eps": 0, "loss/policy_avg": -0.03154832124710083, "loss/value_avg": 0.003632882609963417, "lr": 9.09090909090909e-08, "objective/entropy": -613.3504638671875, "objective/kl": 7.657683372497559, "objective/non_score_reward": -0.2297305017709732, "objective/rlhf_reward": 0.148199200630188, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0005249691312201321, "policy/clipfrac_avg": 0.007619872223585844, "policy/entropy_avg": 0.18793997168540955, "step": 260, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.000046730041504, "val/ratio_var": 7.239053161356424e-07 }, { "episode": 16704, "epoch": 3.152292885450085, "eps": 0, "loss/policy_avg": -0.01213142555207014, "loss/value_avg": 0.0035843513906002045, "lr": 8.922558922558921e-08, "objective/entropy": -649.6549072265625, "objective/kl": 9.2880277633667, "objective/non_score_reward": -0.27864083647727966, "objective/rlhf_reward": 0.12467947602272034, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.00038886774564161897, "policy/clipfrac_avg": 0.006718775257468224, "policy/entropy_avg": 0.18620681762695312, "step": 261, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.999856173992157, "val/ratio_var": 4.936819664180803e-07 }, { "episode": 16768, "epoch": 3.1643706359690507, "eps": 0, "loss/policy_avg": -0.007179616950452328, "loss/value_avg": 0.0035246573388576508, "lr": 8.754208754208754e-08, "objective/entropy": -623.578857421875, "objective/kl": 8.410058975219727, "objective/non_score_reward": -0.25230175256729126, "objective/rlhf_reward": 0.08363573253154755, "objective/scores": 0.3359375, "policy/approxkl_avg": 0.00044147943845018744, "policy/clipfrac_avg": 0.007476884871721268, "policy/entropy_avg": 0.20351791381835938, "step": 262, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.0000280141830444, "val/ratio_var": 6.352038326440379e-07 }, { "episode": 16832, "epoch": 3.1764483864880164, "eps": 0, "loss/policy_avg": -0.003754607168957591, "loss/value_avg": 0.003930999897420406, "lr": 8.585858585858585e-08, "objective/entropy": -696.397705078125, "objective/kl": 8.027302742004395, "objective/non_score_reward": -0.2408190667629242, "objective/rlhf_reward": 0.0904797613620758, "objective/scores": 0.33203125, "policy/approxkl_avg": 0.0003533074341248721, "policy/clipfrac_avg": 0.00765426829457283, "policy/entropy_avg": 0.18895339965820312, "step": 263, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.99993896484375, "val/ratio_var": 4.5711240659329633e-07 }, { "episode": 16896, "epoch": 3.1885261370069826, "eps": 0, "loss/policy_avg": -0.017033755779266357, "loss/value_avg": 0.003304037032648921, "lr": 8.417508417508418e-08, "objective/entropy": -598.6932373046875, "objective/kl": 7.669593811035156, "objective/non_score_reward": -0.23008780181407928, "objective/rlhf_reward": 0.13075204193592072, "objective/scores": 0.361328125, "policy/approxkl_avg": 0.00047474654274992645, "policy/clipfrac_avg": 0.007650506682693958, "policy/entropy_avg": 0.20692571997642517, "step": 264, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 39, "val/ratio": 1.0001184940338135, "val/ratio_var": 9.710288395581301e-07 }, { "episode": 16960, "epoch": 3.2006038875259484, "eps": 0, "loss/policy_avg": -0.02458018623292446, "loss/value_avg": 0.00406844075769186, "lr": 8.24915824915825e-08, "objective/entropy": -673.213134765625, "objective/kl": 6.648907661437988, "objective/non_score_reward": -0.1994672268629074, "objective/rlhf_reward": 0.2600054144859314, "objective/scores": 0.458984375, "policy/approxkl_avg": 0.0003697554930113256, "policy/clipfrac_avg": 0.0070952074602246284, "policy/entropy_avg": 0.1852405071258545, "step": 265, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 1.0000247955322266, "val/ratio_var": 5.141792485119367e-07 }, { "episode": 17024, "epoch": 3.212681638044914, "eps": 0, "loss/policy_avg": -0.009238027967512608, "loss/value_avg": 0.004165910184383392, "lr": 8.080808080808082e-08, "objective/entropy": -735.2806396484375, "objective/kl": 6.235321998596191, "objective/non_score_reward": -0.1870596557855606, "objective/rlhf_reward": 0.2616708278656006, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.00032986787846311927, "policy/clipfrac_avg": 0.006603958085179329, "policy/entropy_avg": 0.17215602099895477, "step": 266, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9999458193778992, "val/ratio_var": 5.301350824993278e-07 }, { "episode": 17088, "epoch": 3.22475938856388, "eps": 0, "loss/policy_avg": -0.01698639616370201, "loss/value_avg": 0.0037531605921685696, "lr": 7.912457912457913e-08, "objective/entropy": -652.947265625, "objective/kl": 7.257780075073242, "objective/non_score_reward": -0.21773339807987213, "objective/rlhf_reward": 0.21537207067012787, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.000403220416046679, "policy/clipfrac_avg": 0.006984008476138115, "policy/entropy_avg": 0.1990203857421875, "step": 267, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0000309944152832, "val/ratio_var": 6.781556862733851e-07 }, { "episode": 17152, "epoch": 3.2368371390828456, "eps": 0, "loss/policy_avg": -0.028039943426847458, "loss/value_avg": 0.003373341169208288, "lr": 7.744107744107744e-08, "objective/entropy": -569.8634033203125, "objective/kl": 7.230891227722168, "objective/non_score_reward": -0.21692675352096558, "objective/rlhf_reward": 0.14879590272903442, "objective/scores": 0.365234375, "policy/approxkl_avg": 0.0004610806645359844, "policy/clipfrac_avg": 0.007782801054418087, "policy/entropy_avg": 0.21935272216796875, "step": 268, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 38, "val/ratio": 0.9999889135360718, "val/ratio_var": 6.67467986659176e-07 }, { "episode": 17216, "epoch": 3.248914889601812, "eps": 0, "loss/policy_avg": 0.012302754446864128, "loss/value_avg": 0.0036267773248255253, "lr": 7.575757575757576e-08, "objective/entropy": -655.1212768554688, "objective/kl": 8.060359954833984, "objective/non_score_reward": -0.24181079864501953, "objective/rlhf_reward": 0.03614329546689987, "objective/scores": 0.27734375, "policy/approxkl_avg": 0.0003768009482882917, "policy/clipfrac_avg": 0.0069151511415839195, "policy/entropy_avg": 0.20404815673828125, "step": 269, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999961256980896, "val/ratio_var": 5.263832463242579e-07 }, { "episode": 17280, "epoch": 3.2609926401207776, "eps": 0, "loss/policy_avg": -0.004253086168318987, "loss/value_avg": 0.0037821203004568815, "lr": 7.407407407407407e-08, "objective/entropy": -601.41015625, "objective/kl": 7.641479015350342, "objective/non_score_reward": -0.2292443811893463, "objective/rlhf_reward": 0.1254919469356537, "objective/scores": 0.35546875, "policy/approxkl_avg": 0.0004039438790641725, "policy/clipfrac_avg": 0.0064240507781505585, "policy/entropy_avg": 0.19796499609947205, "step": 270, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 0.9998639822006226, "val/ratio_var": 5.743943347624736e-07 }, { "episode": 17344, "epoch": 3.2730703906397434, "eps": 0, "loss/policy_avg": -0.03380737453699112, "loss/value_avg": 0.0034328820183873177, "lr": 7.23905723905724e-08, "objective/entropy": -647.0220947265625, "objective/kl": 7.08984899520874, "objective/non_score_reward": -0.21269546449184418, "objective/rlhf_reward": 0.27851545810699463, "objective/scores": 0.4921875, "policy/approxkl_avg": 0.00040110870031639934, "policy/clipfrac_avg": 0.007048811763525009, "policy/entropy_avg": 0.19513702392578125, "step": 271, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9998549222946167, "val/ratio_var": 6.118988267189707e-07 }, { "episode": 17408, "epoch": 3.285148141158709, "eps": 0, "loss/policy_avg": -0.014850600622594357, "loss/value_avg": 0.0036329745780676603, "lr": 7.070707070707071e-08, "objective/entropy": -586.4075317382812, "objective/kl": 8.249979019165039, "objective/non_score_reward": -0.24749934673309326, "objective/rlhf_reward": 0.15679752826690674, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.00042898274841718376, "policy/clipfrac_avg": 0.0069151753559708595, "policy/entropy_avg": 0.19608816504478455, "step": 272, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.0001020431518555, "val/ratio_var": 8.127553314807301e-07 }, { "episode": 17472, "epoch": 3.297225891677675, "eps": 0, "loss/policy_avg": -0.01862741820514202, "loss/value_avg": 0.0038365069776773453, "lr": 6.902356902356903e-08, "objective/entropy": -696.0003051757812, "objective/kl": 7.6856584548950195, "objective/non_score_reward": -0.2305697500705719, "objective/rlhf_reward": 0.2308560311794281, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.00037589838029816747, "policy/clipfrac_avg": 0.007162098772823811, "policy/entropy_avg": 0.186614990234375, "step": 273, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000542402267456, "val/ratio_var": 5.687811608368065e-07 }, { "episode": 17536, "epoch": 3.309303642196641, "eps": 0, "loss/policy_avg": -0.02315349504351616, "loss/value_avg": 0.0040380991995334625, "lr": 6.734006734006734e-08, "objective/entropy": -722.7449951171875, "objective/kl": 6.226775646209717, "objective/non_score_reward": -0.18680325150489807, "objective/rlhf_reward": 0.24581393599510193, "objective/scores": 0.43359375, "policy/approxkl_avg": 0.0003334844659548253, "policy/clipfrac_avg": 0.0064841099083423615, "policy/entropy_avg": 0.18703460693359375, "step": 274, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000557899475098, "val/ratio_var": 5.662852800014662e-07 }, { "episode": 17600, "epoch": 3.321381392715607, "eps": 0, "loss/policy_avg": -0.02682194858789444, "loss/value_avg": 0.0037449407391250134, "lr": 6.565656565656566e-08, "objective/entropy": -635.6344604492188, "objective/kl": 7.442312240600586, "objective/non_score_reward": -0.22326935827732086, "objective/rlhf_reward": 0.22106656432151794, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.00040927709778770804, "policy/clipfrac_avg": 0.006474938243627548, "policy/entropy_avg": 0.19021479785442352, "step": 275, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0002195835113525, "val/ratio_var": 6.032373676134739e-07 }, { "episode": 17664, "epoch": 3.3334591432345726, "eps": 0, "loss/policy_avg": -0.005783136934041977, "loss/value_avg": 0.003839105134829879, "lr": 6.397306397306398e-08, "objective/entropy": -667.8414306640625, "objective/kl": 6.772984504699707, "objective/non_score_reward": -0.20318952202796936, "objective/rlhf_reward": 0.18743547797203064, "objective/scores": 0.390625, "policy/approxkl_avg": 0.0003651580773293972, "policy/clipfrac_avg": 0.007003391161561012, "policy/entropy_avg": 0.19083023071289062, "step": 276, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000221729278564, "val/ratio_var": 5.736771981901256e-07 }, { "episode": 17728, "epoch": 3.3455368937535384, "eps": 0, "loss/policy_avg": -0.006362794898450375, "loss/value_avg": 0.003617867361754179, "lr": 6.228956228956229e-08, "objective/entropy": -669.930419921875, "objective/kl": 7.383747100830078, "objective/non_score_reward": -0.22151240706443787, "objective/rlhf_reward": 0.18522588908672333, "objective/scores": 0.40625, "policy/approxkl_avg": 0.00036430690670385957, "policy/clipfrac_avg": 0.007155537139624357, "policy/entropy_avg": 0.18641917407512665, "step": 277, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999706745147705, "val/ratio_var": 6.842114999017213e-07 }, { "episode": 17792, "epoch": 3.357614644272504, "eps": 0, "loss/policy_avg": -0.03245866298675537, "loss/value_avg": 0.003861590987071395, "lr": 6.060606060606061e-08, "objective/entropy": -712.3450927734375, "objective/kl": 6.383450984954834, "objective/non_score_reward": -0.19150352478027344, "objective/rlhf_reward": 0.31337928771972656, "objective/scores": 0.50390625, "policy/approxkl_avg": 0.00036257284227758646, "policy/clipfrac_avg": 0.006432620342820883, "policy/entropy_avg": 0.18203863501548767, "step": 278, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999188184738159, "val/ratio_var": 3.687934793106251e-07 }, { "episode": 17856, "epoch": 3.36969239479147, "eps": 0, "loss/policy_avg": -0.025677090510725975, "loss/value_avg": 0.003610721556469798, "lr": 5.892255892255892e-08, "objective/entropy": -635.7150268554688, "objective/kl": 6.59326171875, "objective/non_score_reward": -0.19779784977436066, "objective/rlhf_reward": 0.20503418147563934, "objective/scores": 0.40234375, "policy/approxkl_avg": 0.0003899303264915943, "policy/clipfrac_avg": 0.006962133105844259, "policy/entropy_avg": 0.19839096069335938, "step": 279, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 47, "val/ratio": 1.0000261068344116, "val/ratio_var": 5.185344775782141e-07 }, { "episode": 17920, "epoch": 3.381770145310436, "eps": 0, "loss/policy_avg": -0.028471484780311584, "loss/value_avg": 0.0032405236270278692, "lr": 5.723905723905724e-08, "objective/entropy": -642.4448852539062, "objective/kl": 6.222779750823975, "objective/non_score_reward": -0.18668338656425476, "objective/rlhf_reward": 0.26058220863342285, "objective/scores": 0.447265625, "policy/approxkl_avg": 0.0004811930702999234, "policy/clipfrac_avg": 0.007410434540361166, "policy/entropy_avg": 0.201324462890625, "step": 280, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 0.9999183416366577, "val/ratio_var": 6.145901920717733e-07 }, { "episode": 17984, "epoch": 3.393847895829402, "eps": 0, "loss/policy_avg": -0.018460756167769432, "loss/value_avg": 0.0036118782591074705, "lr": 5.555555555555555e-08, "objective/entropy": -647.02392578125, "objective/kl": 7.956416130065918, "objective/non_score_reward": -0.2386924773454666, "objective/rlhf_reward": 0.1660926640033722, "objective/scores": 0.404296875, "policy/approxkl_avg": 0.00041467935079708695, "policy/clipfrac_avg": 0.007317427080124617, "policy/entropy_avg": 0.20193736255168915, "step": 281, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000567436218262, "val/ratio_var": 5.499433655131725e-07 }, { "episode": 18048, "epoch": 3.4059256463483676, "eps": 0, "loss/policy_avg": -0.012457543984055519, "loss/value_avg": 0.0038994457572698593, "lr": 5.3872053872053865e-08, "objective/entropy": -716.0316162109375, "objective/kl": 5.913008689880371, "objective/non_score_reward": -0.17739026248455048, "objective/rlhf_reward": 0.30991441011428833, "objective/scores": 0.48828125, "policy/approxkl_avg": 0.0003301530086901039, "policy/clipfrac_avg": 0.0064779892563819885, "policy/entropy_avg": 0.18742243945598602, "step": 282, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000253915786743, "val/ratio_var": 4.4198901605341234e-07 }, { "episode": 18112, "epoch": 3.4180033968673333, "eps": 0, "loss/policy_avg": -0.013042353093624115, "loss/value_avg": 0.0037472937256097794, "lr": 5.218855218855218e-08, "objective/entropy": -752.298828125, "objective/kl": 6.022947311401367, "objective/non_score_reward": -0.18068841099739075, "objective/rlhf_reward": 0.35056155920028687, "objective/scores": 0.53125, "policy/approxkl_avg": 0.0003260195953771472, "policy/clipfrac_avg": 0.0073262769728899, "policy/entropy_avg": 0.193359375, "step": 283, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 58, "val/ratio": 1.00008225440979, "val/ratio_var": 5.816585826323717e-07 }, { "episode": 18176, "epoch": 3.430081147386299, "eps": 0, "loss/policy_avg": -0.022922195494174957, "loss/value_avg": 0.003924447111785412, "lr": 5.05050505050505e-08, "objective/entropy": -727.8096923828125, "objective/kl": 6.566383361816406, "objective/non_score_reward": -0.19699150323867798, "objective/rlhf_reward": 0.274688184261322, "objective/scores": 0.47265625, "policy/approxkl_avg": 0.0003402878064662218, "policy/clipfrac_avg": 0.006386288907378912, "policy/entropy_avg": 0.17614874243736267, "step": 284, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 56, "val/ratio": 1.0001291036605835, "val/ratio_var": 7.245762958518753e-07 }, { "episode": 18240, "epoch": 3.4421588979052653, "eps": 0, "loss/policy_avg": -0.005379597656428814, "loss/value_avg": 0.003668400924652815, "lr": 4.8821548821548816e-08, "objective/entropy": -657.02294921875, "objective/kl": 7.11133337020874, "objective/non_score_reward": -0.21333999931812286, "objective/rlhf_reward": 0.26126939058303833, "objective/scores": 0.474609375, "policy/approxkl_avg": 0.0003993379359599203, "policy/clipfrac_avg": 0.00656835176050663, "policy/entropy_avg": 0.19129817187786102, "step": 285, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 39, "val/ratio": 1.0000909566879272, "val/ratio_var": 4.280297787317977e-07 }, { "episode": 18304, "epoch": 3.454236648424231, "eps": 0, "loss/policy_avg": -0.0007903016521595418, "loss/value_avg": 0.003940465394407511, "lr": 4.7138047138047134e-08, "objective/entropy": -678.7791748046875, "objective/kl": 7.29849100112915, "objective/non_score_reward": -0.21895474195480347, "objective/rlhf_reward": 0.15433627367019653, "objective/scores": 0.373046875, "policy/approxkl_avg": 0.0003655221953522414, "policy/clipfrac_avg": 0.006983469240367413, "policy/entropy_avg": 0.19457626342773438, "step": 286, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 1.0000255107879639, "val/ratio_var": 6.261829526010843e-07 }, { "episode": 18368, "epoch": 3.466314398943197, "eps": 0, "loss/policy_avg": -0.013323694467544556, "loss/value_avg": 0.003565411549061537, "lr": 4.545454545454545e-08, "objective/entropy": -666.1871948242188, "objective/kl": 6.453955173492432, "objective/non_score_reward": -0.19361865520477295, "objective/rlhf_reward": 0.23753370344638824, "objective/scores": 0.431640625, "policy/approxkl_avg": 0.0003667096607387066, "policy/clipfrac_avg": 0.0066523477435112, "policy/entropy_avg": 0.19250616431236267, "step": 287, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 41, "val/ratio": 0.9998884201049805, "val/ratio_var": 4.2977848124792217e-07 }, { "episode": 18432, "epoch": 3.4783921494621626, "eps": 0, "loss/policy_avg": -0.01416645385324955, "loss/value_avg": 0.0035817499738186598, "lr": 4.377104377104377e-08, "objective/entropy": -700.9827270507812, "objective/kl": 6.070189476013184, "objective/non_score_reward": -0.1821056753396988, "objective/rlhf_reward": 0.2661365270614624, "objective/scores": 0.44921875, "policy/approxkl_avg": 0.0003335383953526616, "policy/clipfrac_avg": 0.006237865425646305, "policy/entropy_avg": 0.18711933493614197, "step": 288, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.000044584274292, "val/ratio_var": 4.85237990233145e-07 }, { "episode": 18496, "epoch": 3.4904698999811283, "eps": 0, "loss/policy_avg": -0.022722497582435608, "loss/value_avg": 0.0033240667544305325, "lr": 4.208754208754209e-08, "objective/entropy": -738.82373046875, "objective/kl": 5.70413064956665, "objective/non_score_reward": -0.1711239218711853, "objective/rlhf_reward": 0.3274112343788147, "objective/scores": 0.498046875, "policy/approxkl_avg": 0.0003393371298443526, "policy/clipfrac_avg": 0.007512836717069149, "policy/entropy_avg": 0.1853078305721283, "step": 289, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 0.9999982714653015, "val/ratio_var": 4.772057877744373e-07 }, { "episode": 18560, "epoch": 3.5025476505000945, "eps": 0, "loss/policy_avg": -0.0007028168765828013, "loss/value_avg": 0.0038722134195268154, "lr": 4.040404040404041e-08, "objective/entropy": -688.8043212890625, "objective/kl": 7.602431297302246, "objective/non_score_reward": -0.2280729115009308, "objective/rlhf_reward": 0.1908724009990692, "objective/scores": 0.41796875, "policy/approxkl_avg": 0.00037353829247877, "policy/clipfrac_avg": 0.007045034319162369, "policy/entropy_avg": 0.19284312427043915, "step": 290, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 0.9999067187309265, "val/ratio_var": 6.341031166812172e-07 }, { "episode": 18624, "epoch": 3.5146254010190603, "eps": 0, "loss/policy_avg": -0.013677339069545269, "loss/value_avg": 0.0034771780483424664, "lr": 3.872053872053872e-08, "objective/entropy": -623.31884765625, "objective/kl": 6.28436803817749, "objective/non_score_reward": -0.1885310411453247, "objective/rlhf_reward": 0.1708439588546753, "objective/scores": 0.359375, "policy/approxkl_avg": 0.00041524547850713134, "policy/clipfrac_avg": 0.006799499504268169, "policy/entropy_avg": 0.182159423828125, "step": 291, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 35, "val/ratio": 1.0000431537628174, "val/ratio_var": 6.771080052203615e-07 }, { "episode": 18688, "epoch": 3.526703151538026, "eps": 0, "loss/policy_avg": -0.02629309892654419, "loss/value_avg": 0.003640729933977127, "lr": 3.7037037037037036e-08, "objective/entropy": -668.0806884765625, "objective/kl": 7.632940292358398, "objective/non_score_reward": -0.22898820042610168, "objective/rlhf_reward": 0.24610945582389832, "objective/scores": 0.474609375, "policy/approxkl_avg": 0.00038289197254925966, "policy/clipfrac_avg": 0.0064846850000321865, "policy/entropy_avg": 0.19228872656822205, "step": 292, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000791549682617, "val/ratio_var": 6.841331696705311e-07 }, { "episode": 18752, "epoch": 3.538780902056992, "eps": 0, "loss/policy_avg": -0.012860393151640892, "loss/value_avg": 0.004311088938266039, "lr": 3.5353535353535353e-08, "objective/entropy": -699.3877563476562, "objective/kl": 6.807313919067383, "objective/non_score_reward": -0.20421940088272095, "objective/rlhf_reward": 0.27234309911727905, "objective/scores": 0.4765625, "policy/approxkl_avg": 0.00037193228490650654, "policy/clipfrac_avg": 0.006834262516349554, "policy/entropy_avg": 0.17558543384075165, "step": 293, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 40, "val/ratio": 0.9999904632568359, "val/ratio_var": 5.597553922598308e-07 }, { "episode": 18816, "epoch": 3.5508586525759576, "eps": 0, "loss/policy_avg": -0.03740035742521286, "loss/value_avg": 0.003467664122581482, "lr": 3.367003367003367e-08, "objective/entropy": -682.0465698242188, "objective/kl": 7.4768147468566895, "objective/non_score_reward": -0.2243044376373291, "objective/rlhf_reward": 0.2590939998626709, "objective/scores": 0.484375, "policy/approxkl_avg": 0.0004095996846444905, "policy/clipfrac_avg": 0.006977352779358625, "policy/entropy_avg": 0.19226329028606415, "step": 294, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 39, "val/ratio": 0.9999417066574097, "val/ratio_var": 6.161255896586226e-07 }, { "episode": 18880, "epoch": 3.5629364030949238, "eps": 0, "loss/policy_avg": -0.02602003514766693, "loss/value_avg": 0.003619457595050335, "lr": 3.198653198653199e-08, "objective/entropy": -725.4286499023438, "objective/kl": 6.233606338500977, "objective/non_score_reward": -0.18700820207595825, "objective/rlhf_reward": 0.21924179792404175, "objective/scores": 0.40625, "policy/approxkl_avg": 0.00038076151395216584, "policy/clipfrac_avg": 0.006789907813072205, "policy/entropy_avg": 0.1863047331571579, "step": 295, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 44, "val/ratio": 1.0000081062316895, "val/ratio_var": 6.039576874172781e-07 }, { "episode": 18944, "epoch": 3.5750141536138895, "eps": 0, "loss/policy_avg": -0.02581647038459778, "loss/value_avg": 0.003386137541383505, "lr": 3.0303030303030305e-08, "objective/entropy": -664.8191528320312, "objective/kl": 7.328958988189697, "objective/non_score_reward": -0.21986877918243408, "objective/rlhf_reward": 0.17368590831756592, "objective/scores": 0.39453125, "policy/approxkl_avg": 0.0003837857802864164, "policy/clipfrac_avg": 0.006847723387181759, "policy/entropy_avg": 0.2102610319852829, "step": 296, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 0.9997448921203613, "val/ratio_var": 6.460118697759754e-07 }, { "episode": 19008, "epoch": 3.5870919041328553, "eps": 0, "loss/policy_avg": -0.01845662109553814, "loss/value_avg": 0.0036777183413505554, "lr": 2.861952861952862e-08, "objective/entropy": -644.3194580078125, "objective/kl": 6.178158760070801, "objective/non_score_reward": -0.18534475564956665, "objective/rlhf_reward": 0.24483102560043335, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.0003848365449812263, "policy/clipfrac_avg": 0.007219640072435141, "policy/entropy_avg": 0.18966802954673767, "step": 297, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 34, "val/ratio": 1.0000991821289062, "val/ratio_var": 4.2051803461617965e-07 }, { "episode": 19072, "epoch": 3.599169654651821, "eps": 0, "loss/policy_avg": -0.03636598959565163, "loss/value_avg": 0.0033596050925552845, "lr": 2.6936026936026933e-08, "objective/entropy": -675.494384765625, "objective/kl": 6.287810802459717, "objective/non_score_reward": -0.18863432109355927, "objective/rlhf_reward": 0.34456878900527954, "objective/scores": 0.53125, "policy/approxkl_avg": 0.000387437641620636, "policy/clipfrac_avg": 0.006644147448241711, "policy/entropy_avg": 0.2019907683134079, "step": 298, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 0.9999553561210632, "val/ratio_var": 5.573217549681431e-07 }, { "episode": 19136, "epoch": 3.611247405170787, "eps": 0, "loss/policy_avg": -0.00651153177022934, "loss/value_avg": 0.003799569560214877, "lr": 2.525252525252525e-08, "objective/entropy": -662.1578369140625, "objective/kl": 6.4900946617126465, "objective/non_score_reward": -0.1947028487920761, "objective/rlhf_reward": 0.1832268387079239, "objective/scores": 0.37890625, "policy/approxkl_avg": 0.0003824663581326604, "policy/clipfrac_avg": 0.006935178767889738, "policy/entropy_avg": 0.18524932861328125, "step": 299, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999604225158691, "val/ratio_var": 5.727119400944503e-07 }, { "episode": 19200, "epoch": 3.623325155689753, "eps": 0, "loss/policy_avg": 4.80624566989718e-06, "loss/value_avg": 0.003957442473620176, "lr": 2.3569023569023567e-08, "objective/entropy": -613.0647583007812, "objective/kl": 7.1408891677856445, "objective/non_score_reward": -0.21422669291496277, "objective/rlhf_reward": 0.14368346333503723, "objective/scores": 0.357421875, "policy/approxkl_avg": 0.0004018023028038442, "policy/clipfrac_avg": 0.007539949379861355, "policy/entropy_avg": 0.18967437744140625, "step": 300, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 1.0000495910644531, "val/ratio_var": 7.146343250497011e-07 }, { "episode": 19264, "epoch": 3.6354029062087188, "eps": 0, "loss/policy_avg": -0.019861344248056412, "loss/value_avg": 0.0036555491387844086, "lr": 2.1885521885521884e-08, "objective/entropy": -695.1436767578125, "objective/kl": 5.617988586425781, "objective/non_score_reward": -0.16853967308998108, "objective/rlhf_reward": 0.2938627004623413, "objective/scores": 0.462890625, "policy/approxkl_avg": 0.00036370521411299706, "policy/clipfrac_avg": 0.00675880815833807, "policy/entropy_avg": 0.19301223754882812, "step": 301, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 53, "val/ratio": 0.9999930262565613, "val/ratio_var": 4.901943952972942e-07 }, { "episode": 19328, "epoch": 3.6474806567276845, "eps": 0, "loss/policy_avg": 6.916312031535199e-06, "loss/value_avg": 0.003553304821252823, "lr": 2.0202020202020204e-08, "objective/entropy": -603.779541015625, "objective/kl": 7.373934745788574, "objective/non_score_reward": -0.2212180495262146, "objective/rlhf_reward": 0.1664772629737854, "objective/scores": 0.38671875, "policy/approxkl_avg": 0.0004275983665138483, "policy/clipfrac_avg": 0.006720641162246466, "policy/entropy_avg": 0.20763906836509705, "step": 302, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.0000828504562378, "val/ratio_var": 5.606044624073547e-07 }, { "episode": 19392, "epoch": 3.6595584072466503, "eps": 0, "loss/policy_avg": -0.016103968024253845, "loss/value_avg": 0.0041246358305215836, "lr": 1.8518518518518518e-08, "objective/entropy": -669.2377319335938, "objective/kl": 5.863403797149658, "objective/non_score_reward": -0.1759021282196045, "objective/rlhf_reward": 0.2698986530303955, "objective/scores": 0.4453125, "policy/approxkl_avg": 0.00045897456584498286, "policy/clipfrac_avg": 0.005576698109507561, "policy/entropy_avg": 0.17554092407226562, "step": 303, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000157356262207, "val/ratio_var": 6.06298158345453e-07 }, { "episode": 19456, "epoch": 3.671636157765616, "eps": 0, "loss/policy_avg": -0.014837692491710186, "loss/value_avg": 0.0034644536208361387, "lr": 1.6835016835016835e-08, "objective/entropy": -638.3107299804688, "objective/kl": 6.990595817565918, "objective/non_score_reward": -0.20971786975860596, "objective/rlhf_reward": 0.19067275524139404, "objective/scores": 0.400390625, "policy/approxkl_avg": 0.00039926738827489316, "policy/clipfrac_avg": 0.0072073861956596375, "policy/entropy_avg": 0.19615554809570312, "step": 304, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 55, "val/ratio": 0.9998650550842285, "val/ratio_var": 6.70040776640235e-07 }, { "episode": 19520, "epoch": 3.683713908284582, "eps": 0, "loss/policy_avg": -0.03533481806516647, "loss/value_avg": 0.003643455682322383, "lr": 1.5151515151515152e-08, "objective/entropy": -708.4891357421875, "objective/kl": 6.058835506439209, "objective/non_score_reward": -0.18176504969596863, "objective/rlhf_reward": 0.36803963780403137, "objective/scores": 0.55078125, "policy/approxkl_avg": 0.00035262128221802413, "policy/clipfrac_avg": 0.006895636674016714, "policy/entropy_avg": 0.18742243945598602, "step": 305, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 49, "val/ratio": 1.0000662803649902, "val/ratio_var": 4.963605420016393e-07 }, { "episode": 19584, "epoch": 3.695791658803548, "eps": 0, "loss/policy_avg": -0.02189006470143795, "loss/value_avg": 0.0033562961034476757, "lr": 1.3468013468013466e-08, "objective/entropy": -635.7755737304688, "objective/kl": 7.733546257019043, "objective/non_score_reward": -0.23200638592243195, "objective/rlhf_reward": 0.15861861407756805, "objective/scores": 0.390625, "policy/approxkl_avg": 0.00040599549538455904, "policy/clipfrac_avg": 0.006881616078317165, "policy/entropy_avg": 0.19137954711914062, "step": 306, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000026226043701, "val/ratio_var": 6.138251933407446e-07 }, { "episode": 19648, "epoch": 3.7078694093225137, "eps": 0, "loss/policy_avg": -0.03934434801340103, "loss/value_avg": 0.004052530974149704, "lr": 1.1784511784511783e-08, "objective/entropy": -738.7037353515625, "objective/kl": 6.419657230377197, "objective/non_score_reward": -0.19258970022201538, "objective/rlhf_reward": 0.3157110810279846, "objective/scores": 0.5078125, "policy/approxkl_avg": 0.0003348543250467628, "policy/clipfrac_avg": 0.006614835001528263, "policy/entropy_avg": 0.1747385710477829, "step": 307, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 50, "val/ratio": 1.0000545978546143, "val/ratio_var": 4.5710487484029727e-07 }, { "episode": 19712, "epoch": 3.7199471598414795, "eps": 0, "loss/policy_avg": -0.01892966404557228, "loss/value_avg": 0.0036056172102689743, "lr": 1.0101010101010102e-08, "objective/entropy": -665.5927124023438, "objective/kl": 6.821819305419922, "objective/non_score_reward": -0.20465457439422607, "objective/rlhf_reward": 0.2562829256057739, "objective/scores": 0.4609375, "policy/approxkl_avg": 0.0003650089493021369, "policy/clipfrac_avg": 0.006426130421459675, "policy/entropy_avg": 0.19108709692955017, "step": 308, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 46, "val/ratio": 0.9999858736991882, "val/ratio_var": 4.877447850049066e-07 }, { "episode": 19776, "epoch": 3.7320249103604453, "eps": 0, "loss/policy_avg": -0.012006578966975212, "loss/value_avg": 0.003470144933089614, "lr": 8.417508417508418e-09, "objective/entropy": -619.7354736328125, "objective/kl": 8.15022087097168, "objective/non_score_reward": -0.2445066124200821, "objective/rlhf_reward": 0.11438010632991791, "objective/scores": 0.359375, "policy/approxkl_avg": 0.00041244737803936005, "policy/clipfrac_avg": 0.007321036886423826, "policy/entropy_avg": 0.19420623779296875, "step": 309, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 42, "val/ratio": 1.000077486038208, "val/ratio_var": 6.490017199212161e-07 }, { "episode": 19840, "epoch": 3.744102660879411, "eps": 0, "loss/policy_avg": -0.015695635229349136, "loss/value_avg": 0.00352904899045825, "lr": 6.734006734006733e-09, "objective/entropy": -641.0952758789062, "objective/kl": 7.442835330963135, "objective/non_score_reward": -0.22328504920005798, "objective/rlhf_reward": 0.20689073204994202, "objective/scores": 0.4296875, "policy/approxkl_avg": 0.00039796438068151474, "policy/clipfrac_avg": 0.007473438512533903, "policy/entropy_avg": 0.19350814819335938, "step": 310, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 43, "val/ratio": 0.9999562501907349, "val/ratio_var": 4.872820227319608e-07 }, { "episode": 19904, "epoch": 3.756180411398377, "eps": 0, "loss/policy_avg": 0.0016891969135031104, "loss/value_avg": 0.003535608295351267, "lr": 5.050505050505051e-09, "objective/entropy": -627.7642822265625, "objective/kl": 6.384559631347656, "objective/non_score_reward": -0.1915367841720581, "objective/rlhf_reward": 0.1114417240023613, "objective/scores": 0.302734375, "policy/approxkl_avg": 0.0004028166295029223, "policy/clipfrac_avg": 0.006718785967677832, "policy/entropy_avg": 0.2012532651424408, "step": 311, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 45, "val/ratio": 1.0000300407409668, "val/ratio_var": 6.633476914430503e-07 }, { "episode": 19968, "epoch": 3.768258161917343, "eps": 0, "loss/policy_avg": -0.012200575321912766, "loss/value_avg": 0.003601629287004471, "lr": 3.3670033670033666e-09, "objective/entropy": -709.2785034179688, "objective/kl": 6.045020580291748, "objective/non_score_reward": -0.18135061860084534, "objective/rlhf_reward": 0.14872750639915466, "objective/scores": 0.330078125, "policy/approxkl_avg": 0.00035006398684345186, "policy/clipfrac_avg": 0.006922123488038778, "policy/entropy_avg": 0.1940714567899704, "step": 312, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 52, "val/ratio": 0.9998501539230347, "val/ratio_var": 5.097400048725831e-07 }, { "episode": 20032, "epoch": 3.7803359124363087, "eps": 0, "loss/policy_avg": -0.005085974466055632, "loss/value_avg": 0.003475761041045189, "lr": 1.6835016835016833e-09, "objective/entropy": -692.533203125, "objective/kl": 6.632614612579346, "objective/non_score_reward": -0.19897842407226562, "objective/rlhf_reward": 0.22387313842773438, "objective/scores": 0.421875, "policy/approxkl_avg": 0.000355798052623868, "policy/clipfrac_avg": 0.006758556701242924, "policy/entropy_avg": 0.17910131812095642, "step": 313, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 48, "val/ratio": 1.0001018047332764, "val/ratio_var": 6.0731622397725e-07 } ], "logging_steps": 500, "max_steps": 313, "num_input_tokens_seen": 0, "num_train_epochs": 3.774297037176826, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": true, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }