{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "episode": 1712, "epoch": 3.0035087719298246, "eval_steps": 500, "global_step": 107, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "episode": 16, "epoch": 0.028070175438596492, "eps": 2, "loss/policy_avg": 0.040254347026348114, "loss/value_avg": 4.365694046020508, "lr": 1.41e-05, "objective/entropy": 20.665752410888672, "objective/kl": 38.31879425048828, "objective/non_score_reward": -1.9159398078918457, "objective/rlhf_reward": -0.5565648078918457, "objective/scores": 1.359375, "policy/approxkl_avg": 5.953035354614258, "policy/clipfrac_avg": 0.2399764209985733, "policy/entropy_avg": 0.6761884689331055, "step": 1, "val/clipfrac_avg": 0.26650944352149963, "val/num_eos_tokens": 0, "val/ratio": 0.8525064587593079, "val/ratio_var": 0.0006113838753663003 }, { "episode": 32, "epoch": 0.056140350877192984, "eps": 2, "loss/policy_avg": 0.0446447990834713, "loss/value_avg": 3.0273280143737793, "lr": 1.3968224299065421e-05, "objective/entropy": 21.37685203552246, "objective/kl": 73.40159606933594, "objective/non_score_reward": -3.6700797080993652, "objective/rlhf_reward": -2.5294547080993652, "objective/scores": 1.140625, "policy/approxkl_avg": 6.8038129806518555, "policy/clipfrac_avg": 0.22936320304870605, "policy/entropy_avg": 0.6141112446784973, "step": 2, "val/clipfrac_avg": 0.2458726465702057, "val/num_eos_tokens": 0, "val/ratio": 0.8650269508361816, "val/ratio_var": 0.0003330775070935488 }, { "episode": 48, "epoch": 0.08421052631578947, "eps": 3, "loss/policy_avg": 0.07702315598726273, "loss/value_avg": 2.1064209938049316, "lr": 1.3836448598130842e-05, "objective/entropy": 26.10182762145996, "objective/kl": 74.03025817871094, "objective/non_score_reward": -3.7015130519866943, "objective/rlhf_reward": -2.6546380519866943, "objective/scores": 1.046875, "policy/approxkl_avg": 5.6432695388793945, "policy/clipfrac_avg": 0.2087264060974121, "policy/entropy_avg": 0.7391780614852905, "step": 3, "val/clipfrac_avg": 0.21639150381088257, "val/num_eos_tokens": 0, "val/ratio": 0.8773487210273743, "val/ratio_var": 0.00039911610656417906 }, { "episode": 64, "epoch": 0.11228070175438597, "eps": 3, "loss/policy_avg": 0.03978118300437927, "loss/value_avg": 1.5315862894058228, "lr": 1.3704672897196262e-05, "objective/entropy": 20.781639099121094, "objective/kl": 68.94087219238281, "objective/non_score_reward": -3.4470434188842773, "objective/rlhf_reward": -2.4470434188842773, "objective/scores": 1.0, "policy/approxkl_avg": 4.726678848266602, "policy/clipfrac_avg": 0.20341980457305908, "policy/entropy_avg": 0.7119085788726807, "step": 4, "val/clipfrac_avg": 0.16096699237823486, "val/num_eos_tokens": 0, "val/ratio": 0.8485063314437866, "val/ratio_var": 0.0003752955235540867 }, { "episode": 80, "epoch": 0.14035087719298245, "eps": 3, "loss/policy_avg": 0.06184825301170349, "loss/value_avg": 0.9904976487159729, "lr": 1.3572897196261683e-05, "objective/entropy": 22.48508071899414, "objective/kl": 75.60542297363281, "objective/non_score_reward": -3.780271291732788, "objective/rlhf_reward": -3.272458791732788, "objective/scores": 0.5078125, "policy/approxkl_avg": 4.352260589599609, "policy/clipfrac_avg": 0.2146226465702057, "policy/entropy_avg": 0.7200251817703247, "step": 5, "val/clipfrac_avg": 0.09669811278581619, "val/num_eos_tokens": 0, "val/ratio": 0.8546276092529297, "val/ratio_var": 0.00010198648669756949 }, { "episode": 96, "epoch": 0.16842105263157894, "eps": 3, "loss/policy_avg": 0.046236515045166016, "loss/value_avg": 1.2219572067260742, "lr": 1.3441121495327103e-05, "objective/entropy": 22.197113037109375, "objective/kl": 82.56051635742188, "objective/non_score_reward": -4.128026008605957, "objective/rlhf_reward": -3.253026008605957, "objective/scores": 0.875, "policy/approxkl_avg": 3.8577029705047607, "policy/clipfrac_avg": 0.22641509771347046, "policy/entropy_avg": 0.7238099575042725, "step": 6, "val/clipfrac_avg": 0.0383254736661911, "val/num_eos_tokens": 0, "val/ratio": 0.8292837738990784, "val/ratio_var": 6.185401434777305e-05 }, { "episode": 112, "epoch": 0.19649122807017544, "eps": 3, "loss/policy_avg": 0.06606701016426086, "loss/value_avg": 1.325202226638794, "lr": 1.3309345794392524e-05, "objective/entropy": 27.022249221801758, "objective/kl": 101.50897216796875, "objective/non_score_reward": -5.075448513031006, "objective/rlhf_reward": -3.653573513031006, "objective/scores": 1.421875, "policy/approxkl_avg": 5.585095405578613, "policy/clipfrac_avg": 0.24174529314041138, "policy/entropy_avg": 0.8859995603561401, "step": 7, "val/clipfrac_avg": 0.028301887214183807, "val/num_eos_tokens": 0, "val/ratio": 0.8486474752426147, "val/ratio_var": 0.0005388148711062968 }, { "episode": 128, "epoch": 0.22456140350877193, "eps": 3, "loss/policy_avg": 0.06467999517917633, "loss/value_avg": 0.9481044411659241, "lr": 1.3177570093457945e-05, "objective/entropy": 26.585155487060547, "objective/kl": 116.76457214355469, "objective/non_score_reward": -5.838229179382324, "objective/rlhf_reward": -4.111666679382324, "objective/scores": 1.7265625, "policy/approxkl_avg": 5.351860046386719, "policy/clipfrac_avg": 0.24469339847564697, "policy/entropy_avg": 0.8991943597793579, "step": 8, "val/clipfrac_avg": 0.08785377442836761, "val/num_eos_tokens": 0, "val/ratio": 0.8524093627929688, "val/ratio_var": 3.493872281978838e-05 }, { "episode": 144, "epoch": 0.25263157894736843, "eps": 3, "loss/policy_avg": 0.03949737548828125, "loss/value_avg": 0.9386715888977051, "lr": 1.3045794392523365e-05, "objective/entropy": 29.664459228515625, "objective/kl": 132.19940185546875, "objective/non_score_reward": -6.6099700927734375, "objective/rlhf_reward": -5.6763763427734375, "objective/scores": 0.93359375, "policy/approxkl_avg": 5.967764377593994, "policy/clipfrac_avg": 0.2057783007621765, "policy/entropy_avg": 1.0122931003570557, "step": 9, "val/clipfrac_avg": 0.017099056392908096, "val/num_eos_tokens": 0, "val/ratio": 0.8205329179763794, "val/ratio_var": 7.763963367324322e-05 }, { "episode": 160, "epoch": 0.2807017543859649, "eps": 3, "loss/policy_avg": 0.0211679395288229, "loss/value_avg": 1.213599681854248, "lr": 1.2914018691588786e-05, "objective/entropy": 31.076860427856445, "objective/kl": 135.477294921875, "objective/non_score_reward": -6.77386474609375, "objective/rlhf_reward": -5.00042724609375, "objective/scores": 1.7734375, "policy/approxkl_avg": 3.5569186210632324, "policy/clipfrac_avg": 0.21304652094841003, "policy/entropy_avg": 1.1141610145568848, "step": 10, "val/clipfrac_avg": 0.06719152629375458, "val/num_eos_tokens": 0, "val/ratio": 0.8466310501098633, "val/ratio_var": 0.0001414915022905916 }, { "episode": 176, "epoch": 0.3087719298245614, "eps": 3, "loss/policy_avg": 0.0240048635751009, "loss/value_avg": 1.3511584997177124, "lr": 1.2782242990654206e-05, "objective/entropy": 32.90422058105469, "objective/kl": 146.03787231445312, "objective/non_score_reward": -7.301893711090088, "objective/rlhf_reward": -5.630018711090088, "objective/scores": 1.671875, "policy/approxkl_avg": 5.625584125518799, "policy/clipfrac_avg": 0.20400942862033844, "policy/entropy_avg": 1.1266499757766724, "step": 11, "val/clipfrac_avg": 0.05483490601181984, "val/num_eos_tokens": 0, "val/ratio": 0.8710312843322754, "val/ratio_var": 0.00014447471767198294 }, { "episode": 192, "epoch": 0.3368421052631579, "eps": 3, "loss/policy_avg": 0.05039631202816963, "loss/value_avg": 1.4507163763046265, "lr": 1.2650467289719627e-05, "objective/entropy": 34.778846740722656, "objective/kl": 152.17945861816406, "objective/non_score_reward": -7.608973503112793, "objective/rlhf_reward": -5.726161003112793, "objective/scores": 1.8828125, "policy/approxkl_avg": 4.985030174255371, "policy/clipfrac_avg": 0.1987028270959854, "policy/entropy_avg": 1.1480720043182373, "step": 12, "val/clipfrac_avg": 0.05188679322600365, "val/num_eos_tokens": 0, "val/ratio": 0.8588758111000061, "val/ratio_var": 3.4296579542569816e-05 }, { "episode": 208, "epoch": 0.3649122807017544, "eps": 3, "loss/policy_avg": 0.037938639521598816, "loss/value_avg": 1.5314528942108154, "lr": 1.2518691588785048e-05, "objective/entropy": 45.65214538574219, "objective/kl": 132.1256866455078, "objective/non_score_reward": -6.606284141540527, "objective/rlhf_reward": -6.803549766540527, "objective/scores": -0.197265625, "policy/approxkl_avg": 5.0863566398620605, "policy/clipfrac_avg": 0.18867924809455872, "policy/entropy_avg": 1.3655226230621338, "step": 13, "val/clipfrac_avg": 0.06367924809455872, "val/num_eos_tokens": 0, "val/ratio": 0.8282334804534912, "val/ratio_var": 0.0004519254434853792 }, { "episode": 224, "epoch": 0.3929824561403509, "eps": 3, "loss/policy_avg": 0.036732763051986694, "loss/value_avg": 2.189356803894043, "lr": 1.2386915887850468e-05, "objective/entropy": 38.35211944580078, "objective/kl": 113.13496398925781, "objective/non_score_reward": -5.656748294830322, "objective/rlhf_reward": -4.539560794830322, "objective/scores": 1.1171875, "policy/approxkl_avg": 4.408792972564697, "policy/clipfrac_avg": 0.21201542019844055, "policy/entropy_avg": 1.2410473823547363, "step": 14, "val/clipfrac_avg": 0.10222071409225464, "val/num_eos_tokens": 0, "val/ratio": 0.8754662275314331, "val/ratio_var": 0.0007480247295461595 }, { "episode": 240, "epoch": 0.42105263157894735, "eps": 3, "loss/policy_avg": 0.03690744563937187, "loss/value_avg": 1.4401739835739136, "lr": 1.2255140186915889e-05, "objective/entropy": 38.69694519042969, "objective/kl": 115.77085876464844, "objective/non_score_reward": -5.788543224334717, "objective/rlhf_reward": -5.627410411834717, "objective/scores": 0.1611328125, "policy/approxkl_avg": 4.673148155212402, "policy/clipfrac_avg": 0.1833726465702057, "policy/entropy_avg": 1.2108347415924072, "step": 15, "val/clipfrac_avg": 0.01179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.8487275838851929, "val/ratio_var": 0.0008522844291292131 }, { "episode": 256, "epoch": 0.44912280701754387, "eps": 3, "loss/policy_avg": 0.019956424832344055, "loss/value_avg": 1.5383408069610596, "lr": 1.212336448598131e-05, "objective/entropy": 32.02195739746094, "objective/kl": 122.87109375, "objective/non_score_reward": -6.1435546875, "objective/rlhf_reward": -5.7470703125, "objective/scores": 0.396484375, "policy/approxkl_avg": 5.454257965087891, "policy/clipfrac_avg": 0.22314535081386566, "policy/entropy_avg": 1.1098759174346924, "step": 16, "val/clipfrac_avg": 0.03384597226977348, "val/num_eos_tokens": 0, "val/ratio": 0.8839770555496216, "val/ratio_var": 0.0004538022622000426 }, { "episode": 272, "epoch": 0.47719298245614034, "eps": 3, "loss/policy_avg": 0.016002152115106583, "loss/value_avg": 1.5554126501083374, "lr": 1.199158878504673e-05, "objective/entropy": 28.698740005493164, "objective/kl": 129.80386352539062, "objective/non_score_reward": -6.4901933670043945, "objective/rlhf_reward": -6.1796464920043945, "objective/scores": 0.310546875, "policy/approxkl_avg": 3.0268969535827637, "policy/clipfrac_avg": 0.19969519972801208, "policy/entropy_avg": 0.9890843629837036, "step": 17, "val/clipfrac_avg": 0.07565668225288391, "val/num_eos_tokens": 0, "val/ratio": 0.8847370743751526, "val/ratio_var": 0.000195541579159908 }, { "episode": 288, "epoch": 0.5052631578947369, "eps": 3, "loss/policy_avg": 0.013964798301458359, "loss/value_avg": 1.590545892715454, "lr": 1.185981308411215e-05, "objective/entropy": 12.885665893554688, "objective/kl": 107.77202606201172, "objective/non_score_reward": -5.388601303100586, "objective/rlhf_reward": -4.169851303100586, "objective/scores": 1.21875, "policy/approxkl_avg": 5.9386420249938965, "policy/clipfrac_avg": 0.18009786307811737, "policy/entropy_avg": 0.6803157925605774, "step": 18, "val/clipfrac_avg": 0.12902730703353882, "val/num_eos_tokens": 0, "val/ratio": 0.9129149913787842, "val/ratio_var": 0.0004151359898969531 }, { "episode": 304, "epoch": 0.5333333333333333, "eps": 3, "loss/policy_avg": 0.02675745077431202, "loss/value_avg": 1.745023488998413, "lr": 1.1728037383177571e-05, "objective/entropy": 22.75409698486328, "objective/kl": 119.34423828125, "objective/non_score_reward": -5.967212200164795, "objective/rlhf_reward": -5.197680950164795, "objective/scores": 0.76953125, "policy/approxkl_avg": 6.532215118408203, "policy/clipfrac_avg": 0.1759602427482605, "policy/entropy_avg": 0.8050931692123413, "step": 19, "val/clipfrac_avg": 0.0052770450711250305, "val/num_eos_tokens": 0, "val/ratio": 0.9205665588378906, "val/ratio_var": 0.0005024418351240456 }, { "episode": 320, "epoch": 0.5614035087719298, "eps": 3, "loss/policy_avg": 0.014609305188059807, "loss/value_avg": 1.7434797286987305, "lr": 1.159626168224299e-05, "objective/entropy": 29.933460235595703, "objective/kl": 144.45672607421875, "objective/non_score_reward": -7.222836494445801, "objective/rlhf_reward": -6.675961494445801, "objective/scores": 0.546875, "policy/approxkl_avg": 7.733211517333984, "policy/clipfrac_avg": 0.1762971729040146, "policy/entropy_avg": 0.8804416656494141, "step": 20, "val/clipfrac_avg": 0.026533018797636032, "val/num_eos_tokens": 0, "val/ratio": 0.8693416714668274, "val/ratio_var": 0.000557584862690419 }, { "episode": 336, "epoch": 0.5894736842105263, "eps": 3, "loss/policy_avg": 0.019340746104717255, "loss/value_avg": 1.471176266670227, "lr": 1.146448598130841e-05, "objective/entropy": 23.77098846435547, "objective/kl": 149.46974182128906, "objective/non_score_reward": -7.473487377166748, "objective/rlhf_reward": -5.668799877166748, "objective/scores": 1.8046875, "policy/approxkl_avg": 5.010880470275879, "policy/clipfrac_avg": 0.18341976404190063, "policy/entropy_avg": 0.973499596118927, "step": 21, "val/clipfrac_avg": 0.016483133658766747, "val/num_eos_tokens": 0, "val/ratio": 0.8893705606460571, "val/ratio_var": 0.0012118957238271832 }, { "episode": 352, "epoch": 0.6175438596491228, "eps": 3, "loss/policy_avg": 0.009950436651706696, "loss/value_avg": 1.1944406032562256, "lr": 1.1332710280373831e-05, "objective/entropy": 23.91471290588379, "objective/kl": 155.48843383789062, "objective/non_score_reward": -7.774421691894531, "objective/rlhf_reward": -7.125984191894531, "objective/scores": 0.6484375, "policy/approxkl_avg": 6.8445940017700195, "policy/clipfrac_avg": 0.16214622557163239, "policy/entropy_avg": 0.837052047252655, "step": 22, "val/clipfrac_avg": 0.09257075190544128, "val/num_eos_tokens": 0, "val/ratio": 0.9027042984962463, "val/ratio_var": 0.00019580482330638915 }, { "episode": 368, "epoch": 0.6456140350877193, "eps": 3, "loss/policy_avg": 0.021877210587263107, "loss/value_avg": 1.0937385559082031, "lr": 1.1200934579439252e-05, "objective/entropy": 23.552492141723633, "objective/kl": 172.6247100830078, "objective/non_score_reward": -8.631235122680664, "objective/rlhf_reward": -6.787485122680664, "objective/scores": 1.84375, "policy/approxkl_avg": 6.227967262268066, "policy/clipfrac_avg": 0.16203010082244873, "policy/entropy_avg": 0.8671172857284546, "step": 23, "val/clipfrac_avg": 0.018410665914416313, "val/num_eos_tokens": 0, "val/ratio": 0.8919187784194946, "val/ratio_var": 0.001309347921051085 }, { "episode": 384, "epoch": 0.6736842105263158, "eps": 3, "loss/policy_avg": 0.036975178867578506, "loss/value_avg": 1.1198090314865112, "lr": 1.1069158878504672e-05, "objective/entropy": 20.281631469726562, "objective/kl": 170.099365234375, "objective/non_score_reward": -8.504968643188477, "objective/rlhf_reward": -7.262781143188477, "objective/scores": 1.2421875, "policy/approxkl_avg": 6.825319766998291, "policy/clipfrac_avg": 0.14049983024597168, "policy/entropy_avg": 0.802283525466919, "step": 24, "val/clipfrac_avg": 0.0785929411649704, "val/num_eos_tokens": 0, "val/ratio": 0.8865500092506409, "val/ratio_var": 0.0006475243135355413 }, { "episode": 400, "epoch": 0.7017543859649122, "eps": 3, "loss/policy_avg": 0.0237587820738554, "loss/value_avg": 1.3415908813476562, "lr": 1.0937383177570093e-05, "objective/entropy": 21.36954689025879, "objective/kl": 179.03819274902344, "objective/non_score_reward": -8.951910018920898, "objective/rlhf_reward": -7.334722518920898, "objective/scores": 1.6171875, "policy/approxkl_avg": 7.667660713195801, "policy/clipfrac_avg": 0.11468379199504852, "policy/entropy_avg": 0.7623839974403381, "step": 25, "val/clipfrac_avg": 0.27072370052337646, "val/num_eos_tokens": 0, "val/ratio": 0.8912988901138306, "val/ratio_var": 2.9881122827646323e-05 }, { "episode": 416, "epoch": 0.7298245614035088, "eps": 3, "loss/policy_avg": 0.018890127539634705, "loss/value_avg": 1.2341463565826416, "lr": 1.0805607476635514e-05, "objective/entropy": 16.852466583251953, "objective/kl": 179.8382568359375, "objective/non_score_reward": -8.991912841796875, "objective/rlhf_reward": -8.312225341796875, "objective/scores": 0.6796875, "policy/approxkl_avg": 7.357123374938965, "policy/clipfrac_avg": 0.11944779008626938, "policy/entropy_avg": 0.7555092573165894, "step": 26, "val/clipfrac_avg": 0.2508935034275055, "val/num_eos_tokens": 0, "val/ratio": 0.8802620768547058, "val/ratio_var": 0.00018670025747269392 }, { "episode": 432, "epoch": 0.7578947368421053, "eps": 3, "loss/policy_avg": 0.021100062876939774, "loss/value_avg": 0.9362931847572327, "lr": 1.0673831775700934e-05, "objective/entropy": 21.68787384033203, "objective/kl": 183.68655395507812, "objective/non_score_reward": -9.184328079223633, "objective/rlhf_reward": -7.199953079223633, "objective/scores": 1.984375, "policy/approxkl_avg": 5.103993892669678, "policy/clipfrac_avg": 0.12146226316690445, "policy/entropy_avg": 0.7987968921661377, "step": 27, "val/clipfrac_avg": 0.05424528568983078, "val/num_eos_tokens": 0, "val/ratio": 0.8928566575050354, "val/ratio_var": 0.00013602118997368962 }, { "episode": 448, "epoch": 0.7859649122807018, "eps": 3, "loss/policy_avg": 0.011967534199357033, "loss/value_avg": 1.0544021129608154, "lr": 1.0542056074766355e-05, "objective/entropy": 23.05614471435547, "objective/kl": 182.8275146484375, "objective/non_score_reward": -9.141375541687012, "objective/rlhf_reward": -7.789813041687012, "objective/scores": 1.3515625, "policy/approxkl_avg": 4.963105201721191, "policy/clipfrac_avg": 0.14268869161605835, "policy/entropy_avg": 0.8151739835739136, "step": 28, "val/clipfrac_avg": 0.2146226465702057, "val/num_eos_tokens": 0, "val/ratio": 0.8715541362762451, "val/ratio_var": 7.266430475283414e-05 }, { "episode": 464, "epoch": 0.8140350877192982, "eps": 3, "loss/policy_avg": 0.011134720407426357, "loss/value_avg": 0.775411069393158, "lr": 1.0410280373831775e-05, "objective/entropy": 25.063724517822266, "objective/kl": 188.51422119140625, "objective/non_score_reward": -9.425710678100586, "objective/rlhf_reward": -8.394460678100586, "objective/scores": 1.03125, "policy/approxkl_avg": 7.372692108154297, "policy/clipfrac_avg": 0.1320754736661911, "policy/entropy_avg": 0.8475193977355957, "step": 29, "val/clipfrac_avg": 0.04245283082127571, "val/num_eos_tokens": 0, "val/ratio": 0.891329824924469, "val/ratio_var": 3.0236831207730575e-06 }, { "episode": 480, "epoch": 0.8421052631578947, "eps": 3, "loss/policy_avg": 0.017270730808377266, "loss/value_avg": 0.7942019701004028, "lr": 1.0278504672897196e-05, "objective/entropy": 22.402624130249023, "objective/kl": 185.72421264648438, "objective/non_score_reward": -9.286211013793945, "objective/rlhf_reward": -7.606523513793945, "objective/scores": 1.6796875, "policy/approxkl_avg": 8.755260467529297, "policy/clipfrac_avg": 0.11261792480945587, "policy/entropy_avg": 0.799101710319519, "step": 30, "val/clipfrac_avg": 0.014740565791726112, "val/num_eos_tokens": 0, "val/ratio": 0.8892796635627747, "val/ratio_var": 6.262218084884807e-05 }, { "episode": 496, "epoch": 0.8701754385964913, "eps": 3, "loss/policy_avg": 0.019226763397455215, "loss/value_avg": 0.6826229095458984, "lr": 1.0146728971962616e-05, "objective/entropy": 23.43070411682129, "objective/kl": 201.67799377441406, "objective/non_score_reward": -10.083900451660156, "objective/rlhf_reward": -9.154212951660156, "objective/scores": 0.9296875, "policy/approxkl_avg": 8.144369125366211, "policy/clipfrac_avg": 0.12264151126146317, "policy/entropy_avg": 0.8400179147720337, "step": 31, "val/clipfrac_avg": 0.2228773534297943, "val/num_eos_tokens": 0, "val/ratio": 0.8812745213508606, "val/ratio_var": 0.00010276544344378635 }, { "episode": 512, "epoch": 0.8982456140350877, "eps": 3, "loss/policy_avg": 0.005793810822069645, "loss/value_avg": 0.8381754159927368, "lr": 1.0014953271028037e-05, "objective/entropy": 21.22824478149414, "objective/kl": 185.61614990234375, "objective/non_score_reward": -9.280807495117188, "objective/rlhf_reward": -7.3511199951171875, "objective/scores": 1.9296875, "policy/approxkl_avg": 7.533528804779053, "policy/clipfrac_avg": 0.12205187976360321, "policy/entropy_avg": 0.7801576256752014, "step": 32, "val/clipfrac_avg": 0.09787735342979431, "val/num_eos_tokens": 0, "val/ratio": 0.8987904787063599, "val/ratio_var": 0.00018136559810955077 }, { "episode": 528, "epoch": 0.9263157894736842, "eps": 3, "loss/policy_avg": 0.02254486456513405, "loss/value_avg": 0.8877236843109131, "lr": 9.883177570093458e-06, "objective/entropy": 23.442203521728516, "objective/kl": 188.65457153320312, "objective/non_score_reward": -9.432729721069336, "objective/rlhf_reward": -7.534292221069336, "objective/scores": 1.8984375, "policy/approxkl_avg": 5.329720497131348, "policy/clipfrac_avg": 0.12323113530874252, "policy/entropy_avg": 0.8007351160049438, "step": 33, "val/clipfrac_avg": 0.036556605249643326, "val/num_eos_tokens": 0, "val/ratio": 0.8945379257202148, "val/ratio_var": 0.0001225806336151436 }, { "episode": 544, "epoch": 0.9543859649122807, "eps": 3, "loss/policy_avg": 0.01662730611860752, "loss/value_avg": 0.637324869632721, "lr": 9.751401869158878e-06, "objective/entropy": 20.620216369628906, "objective/kl": 186.25180053710938, "objective/non_score_reward": -9.312589645385742, "objective/rlhf_reward": -7.851652145385742, "objective/scores": 1.4609375, "policy/approxkl_avg": 8.322406768798828, "policy/clipfrac_avg": 0.125, "policy/entropy_avg": 0.7402328848838806, "step": 34, "val/clipfrac_avg": 0.002358490601181984, "val/num_eos_tokens": 0, "val/ratio": 0.8899158239364624, "val/ratio_var": 0.0002540757122915238 }, { "episode": 560, "epoch": 0.9824561403508771, "eps": 3, "loss/policy_avg": 0.0003616265021264553, "loss/value_avg": 0.6015822291374207, "lr": 9.619626168224299e-06, "objective/entropy": 21.12371063232422, "objective/kl": 186.5103302001953, "objective/non_score_reward": -9.325516700744629, "objective/rlhf_reward": -7.817704200744629, "objective/scores": 1.5078125, "policy/approxkl_avg": 6.682015419006348, "policy/clipfrac_avg": 0.13443395495414734, "policy/entropy_avg": 0.7727504372596741, "step": 35, "val/clipfrac_avg": 0.037146229296922684, "val/num_eos_tokens": 0, "val/ratio": 0.8780511021614075, "val/ratio_var": 6.755981303285807e-05 }, { "episode": 576, "epoch": 1.0105263157894737, "eps": 3, "loss/policy_avg": 0.009338408708572388, "loss/value_avg": 0.6681860685348511, "lr": 9.48785046728972e-06, "objective/entropy": 21.184894561767578, "objective/kl": 192.3019256591797, "objective/non_score_reward": -9.615096092224121, "objective/rlhf_reward": -7.443221092224121, "objective/scores": 2.171875, "policy/approxkl_avg": 6.053627967834473, "policy/clipfrac_avg": 0.1179245263338089, "policy/entropy_avg": 0.7489176392555237, "step": 36, "val/clipfrac_avg": 0.15683962404727936, "val/num_eos_tokens": 0, "val/ratio": 0.8922820091247559, "val/ratio_var": 0.00026253468240611255 }, { "episode": 592, "epoch": 1.03859649122807, "eps": 3, "loss/policy_avg": 0.006294197402894497, "loss/value_avg": 0.8030184507369995, "lr": 9.35607476635514e-06, "objective/entropy": 20.368942260742188, "objective/kl": 190.313232421875, "objective/non_score_reward": -9.51566219329834, "objective/rlhf_reward": -8.02347469329834, "objective/scores": 1.4921875, "policy/approxkl_avg": 6.838142395019531, "policy/clipfrac_avg": 0.11615566164255142, "policy/entropy_avg": 0.7397478222846985, "step": 37, "val/clipfrac_avg": 0.003537735901772976, "val/num_eos_tokens": 0, "val/ratio": 0.8849948644638062, "val/ratio_var": 8.990589412860572e-05 }, { "episode": 608, "epoch": 1.0666666666666667, "eps": 3, "loss/policy_avg": 0.016940509900450706, "loss/value_avg": 0.5882998704910278, "lr": 9.22429906542056e-06, "objective/entropy": 23.183269500732422, "objective/kl": 186.7628631591797, "objective/non_score_reward": -9.338143348693848, "objective/rlhf_reward": -7.564705848693848, "objective/scores": 1.7734375, "policy/approxkl_avg": 3.9737157821655273, "policy/clipfrac_avg": 0.13089622557163239, "policy/entropy_avg": 0.7805944681167603, "step": 38, "val/clipfrac_avg": 0.004127358552068472, "val/num_eos_tokens": 0, "val/ratio": 0.8967232704162598, "val/ratio_var": 0.00010195688810199499 }, { "episode": 624, "epoch": 1.0947368421052632, "eps": 3, "loss/policy_avg": 0.011485239490866661, "loss/value_avg": 0.5787074565887451, "lr": 9.092523364485981e-06, "objective/entropy": 17.85469627380371, "objective/kl": 187.50631713867188, "objective/non_score_reward": -9.375316619873047, "objective/rlhf_reward": -7.203441619873047, "objective/scores": 2.171875, "policy/approxkl_avg": 5.274375915527344, "policy/clipfrac_avg": 0.11556603759527206, "policy/entropy_avg": 0.7150323390960693, "step": 39, "val/clipfrac_avg": 0.16509434580802917, "val/num_eos_tokens": 0, "val/ratio": 0.8858749866485596, "val/ratio_var": 0.0001809587120078504 }, { "episode": 640, "epoch": 1.1228070175438596, "eps": 3, "loss/policy_avg": 0.017689252272248268, "loss/value_avg": 0.7021454572677612, "lr": 8.960747663551402e-06, "objective/entropy": 21.994365692138672, "objective/kl": 182.05810546875, "objective/non_score_reward": -9.1029052734375, "objective/rlhf_reward": -6.8841552734375, "objective/scores": 2.21875, "policy/approxkl_avg": 4.578630447387695, "policy/clipfrac_avg": 0.12558962404727936, "policy/entropy_avg": 0.7208189964294434, "step": 40, "val/clipfrac_avg": 0.06426886469125748, "val/num_eos_tokens": 0, "val/ratio": 0.8957177400588989, "val/ratio_var": 7.126481068553403e-05 }, { "episode": 656, "epoch": 1.1508771929824562, "eps": 3, "loss/policy_avg": 0.005923585034906864, "loss/value_avg": 0.5532969236373901, "lr": 8.828971962616822e-06, "objective/entropy": 18.056703567504883, "objective/kl": 170.11863708496094, "objective/non_score_reward": -8.505931854248047, "objective/rlhf_reward": -6.380931854248047, "objective/scores": 2.125, "policy/approxkl_avg": 3.3685710430145264, "policy/clipfrac_avg": 0.12028301507234573, "policy/entropy_avg": 0.6859033107757568, "step": 41, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.8874868154525757, "val/ratio_var": 4.332972093834542e-05 }, { "episode": 672, "epoch": 1.1789473684210525, "eps": 3, "loss/policy_avg": 0.012987145222723484, "loss/value_avg": 0.632028341293335, "lr": 8.697196261682243e-06, "objective/entropy": 19.865787506103516, "objective/kl": 177.42971801757812, "objective/non_score_reward": -8.871485710144043, "objective/rlhf_reward": -6.527735710144043, "objective/scores": 2.34375, "policy/approxkl_avg": 6.186136245727539, "policy/clipfrac_avg": 0.13325470685958862, "policy/entropy_avg": 0.6923173069953918, "step": 42, "val/clipfrac_avg": 0.125, "val/num_eos_tokens": 0, "val/ratio": 0.8991193771362305, "val/ratio_var": 0.0009607934043742716 }, { "episode": 688, "epoch": 1.207017543859649, "eps": 3, "loss/policy_avg": 0.011711956933140755, "loss/value_avg": 0.4474850296974182, "lr": 8.565420560747664e-06, "objective/entropy": 19.79934310913086, "objective/kl": 176.83395385742188, "objective/non_score_reward": -8.84169864654541, "objective/rlhf_reward": -7.02138614654541, "objective/scores": 1.8203125, "policy/approxkl_avg": 5.904331207275391, "policy/clipfrac_avg": 0.13089622557163239, "policy/entropy_avg": 0.6940422654151917, "step": 43, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.8869062066078186, "val/ratio_var": 0.0008940807892940938 }, { "episode": 704, "epoch": 1.2350877192982457, "eps": 3, "loss/policy_avg": 0.014619714580476284, "loss/value_avg": 0.625725507736206, "lr": 8.433644859813084e-06, "objective/entropy": 22.385128021240234, "objective/kl": 181.2716064453125, "objective/non_score_reward": -9.063579559326172, "objective/rlhf_reward": -7.493267059326172, "objective/scores": 1.5703125, "policy/approxkl_avg": 4.966976165771484, "policy/clipfrac_avg": 0.14799527823925018, "policy/entropy_avg": 0.7825338840484619, "step": 44, "val/clipfrac_avg": 0.14681604504585266, "val/num_eos_tokens": 0, "val/ratio": 0.883050799369812, "val/ratio_var": 1.7675925846560858e-05 }, { "episode": 720, "epoch": 1.263157894736842, "eps": 3, "loss/policy_avg": 0.024550937116146088, "loss/value_avg": 0.7726404070854187, "lr": 8.301869158878505e-06, "objective/entropy": 19.87116050720215, "objective/kl": 172.38674926757812, "objective/non_score_reward": -8.619338035583496, "objective/rlhf_reward": -6.572463035583496, "objective/scores": 2.046875, "policy/approxkl_avg": 4.95554256439209, "policy/clipfrac_avg": 0.13089622557163239, "policy/entropy_avg": 0.6934086680412292, "step": 45, "val/clipfrac_avg": 0.009433962404727936, "val/num_eos_tokens": 0, "val/ratio": 0.8879209756851196, "val/ratio_var": 0.0003393842780496925 }, { "episode": 736, "epoch": 1.2912280701754386, "eps": 3, "loss/policy_avg": 0.019363895058631897, "loss/value_avg": 0.5529497861862183, "lr": 8.170093457943925e-06, "objective/entropy": 17.47795295715332, "objective/kl": 174.22576904296875, "objective/non_score_reward": -8.711288452148438, "objective/rlhf_reward": -6.3206634521484375, "objective/scores": 2.390625, "policy/approxkl_avg": 3.8177051544189453, "policy/clipfrac_avg": 0.11851415038108826, "policy/entropy_avg": 0.6579099297523499, "step": 46, "val/clipfrac_avg": 0.001179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9020024538040161, "val/ratio_var": 0.00011536524834809825 }, { "episode": 752, "epoch": 1.3192982456140352, "eps": 3, "loss/policy_avg": 0.012542858719825745, "loss/value_avg": 0.6548900604248047, "lr": 8.038317757009346e-06, "objective/entropy": 17.823394775390625, "objective/kl": 171.56602478027344, "objective/non_score_reward": -8.578301429748535, "objective/rlhf_reward": -6.828301429748535, "objective/scores": 1.75, "policy/approxkl_avg": 5.147519111633301, "policy/clipfrac_avg": 0.14563679695129395, "policy/entropy_avg": 0.6417368650436401, "step": 47, "val/clipfrac_avg": 0.004127358552068472, "val/num_eos_tokens": 0, "val/ratio": 0.8848338723182678, "val/ratio_var": 0.000510354817379266 }, { "episode": 768, "epoch": 1.3473684210526315, "eps": 3, "loss/policy_avg": 0.013358336873352528, "loss/value_avg": 0.3859623968601227, "lr": 7.906542056074766e-06, "objective/entropy": 16.128374099731445, "objective/kl": 168.9840087890625, "objective/non_score_reward": -8.449200630187988, "objective/rlhf_reward": -6.847638130187988, "objective/scores": 1.6015625, "policy/approxkl_avg": 4.781813144683838, "policy/clipfrac_avg": 0.12028302252292633, "policy/entropy_avg": 0.5887731313705444, "step": 48, "val/clipfrac_avg": 0.01179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9012677669525146, "val/ratio_var": 0.00041765146306715906 }, { "episode": 784, "epoch": 1.3754385964912281, "eps": 3, "loss/policy_avg": 0.017281489446759224, "loss/value_avg": 0.5614770650863647, "lr": 7.774766355140187e-06, "objective/entropy": 20.177621841430664, "objective/kl": 166.16497802734375, "objective/non_score_reward": -8.308249473571777, "objective/rlhf_reward": -7.081686973571777, "objective/scores": 1.2265625, "policy/approxkl_avg": 3.1813056468963623, "policy/clipfrac_avg": 0.1432783007621765, "policy/entropy_avg": 0.7520714998245239, "step": 49, "val/clipfrac_avg": 0.1845518946647644, "val/num_eos_tokens": 0, "val/ratio": 0.8848077654838562, "val/ratio_var": 0.0005292592104524374 }, { "episode": 800, "epoch": 1.4035087719298245, "eps": 3, "loss/policy_avg": 0.015007663518190384, "loss/value_avg": 0.918763279914856, "lr": 7.642990654205608e-06, "objective/entropy": 17.82724380493164, "objective/kl": 179.90628051757812, "objective/non_score_reward": -8.99531364440918, "objective/rlhf_reward": -7.37812614440918, "objective/scores": 1.6171875, "policy/approxkl_avg": 5.0764312744140625, "policy/clipfrac_avg": 0.1149764209985733, "policy/entropy_avg": 0.6353041529655457, "step": 50, "val/clipfrac_avg": 0.014740565791726112, "val/num_eos_tokens": 0, "val/ratio": 0.9066751599311829, "val/ratio_var": 0.00023197307018563151 }, { "episode": 816, "epoch": 1.431578947368421, "eps": 3, "loss/policy_avg": 0.013827439397573471, "loss/value_avg": 0.6559504270553589, "lr": 7.511214953271027e-06, "objective/entropy": 17.98678207397461, "objective/kl": 173.85345458984375, "objective/non_score_reward": -8.692672729492188, "objective/rlhf_reward": -7.0911102294921875, "objective/scores": 1.6015625, "policy/approxkl_avg": 4.863851547241211, "policy/clipfrac_avg": 0.12382075190544128, "policy/entropy_avg": 0.6412214040756226, "step": 51, "val/clipfrac_avg": 0.03419811278581619, "val/num_eos_tokens": 0, "val/ratio": 0.8822081089019775, "val/ratio_var": 5.379146841733018e-06 }, { "episode": 832, "epoch": 1.4596491228070176, "eps": 3, "loss/policy_avg": 0.009551126509904861, "loss/value_avg": 0.46615320444107056, "lr": 7.379439252336448e-06, "objective/entropy": 14.611004829406738, "objective/kl": 169.49464416503906, "objective/non_score_reward": -8.4747314453125, "objective/rlhf_reward": -7.0137939453125, "objective/scores": 1.4609375, "policy/approxkl_avg": 4.765644073486328, "policy/clipfrac_avg": 0.09964622557163239, "policy/entropy_avg": 0.5645979642868042, "step": 52, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9012112021446228, "val/ratio_var": 1.3516472790797707e-05 }, { "episode": 848, "epoch": 1.487719298245614, "eps": 3, "loss/policy_avg": 0.009903261438012123, "loss/value_avg": 1.024169683456421, "lr": 7.2476635514018685e-06, "objective/entropy": 16.012420654296875, "objective/kl": 173.99917602539062, "objective/non_score_reward": -8.699958801269531, "objective/rlhf_reward": -6.371833801269531, "objective/scores": 2.328125, "policy/approxkl_avg": 4.889924049377441, "policy/clipfrac_avg": 0.11261792480945587, "policy/entropy_avg": 0.5950597524642944, "step": 53, "val/clipfrac_avg": 0.20518869161605835, "val/num_eos_tokens": 0, "val/ratio": 0.8948594331741333, "val/ratio_var": 6.705896521452814e-05 }, { "episode": 864, "epoch": 1.5157894736842106, "eps": 3, "loss/policy_avg": -0.007397271227091551, "loss/value_avg": 0.4296436011791229, "lr": 7.115887850467289e-06, "objective/entropy": 12.315929412841797, "objective/kl": 175.46511840820312, "objective/non_score_reward": -8.773256301879883, "objective/rlhf_reward": -6.570131301879883, "objective/scores": 2.203125, "policy/approxkl_avg": 5.532078742980957, "policy/clipfrac_avg": 0.09375, "policy/entropy_avg": 0.5138251781463623, "step": 54, "val/clipfrac_avg": 0.003537735901772976, "val/num_eos_tokens": 0, "val/ratio": 0.9130541086196899, "val/ratio_var": 8.332962897839025e-05 }, { "episode": 880, "epoch": 1.543859649122807, "eps": 3, "loss/policy_avg": 0.0055680545046925545, "loss/value_avg": 0.44329357147216797, "lr": 6.9841121495327106e-06, "objective/entropy": 13.753085136413574, "objective/kl": 162.1046905517578, "objective/non_score_reward": -8.10523509979248, "objective/rlhf_reward": -6.1052350997924805, "objective/scores": 2.0, "policy/approxkl_avg": 4.2778730392456055, "policy/clipfrac_avg": 0.10200471431016922, "policy/entropy_avg": 0.538252592086792, "step": 55, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.89923495054245, "val/ratio_var": 0.00013667276652995497 }, { "episode": 896, "epoch": 1.5719298245614035, "eps": 3, "loss/policy_avg": 0.0029763877391815186, "loss/value_avg": 0.5969923734664917, "lr": 6.852336448598131e-06, "objective/entropy": 10.386423110961914, "objective/kl": 170.64817810058594, "objective/non_score_reward": -8.53240966796875, "objective/rlhf_reward": -5.84490966796875, "objective/scores": 2.6875, "policy/approxkl_avg": 5.515145301818848, "policy/clipfrac_avg": 0.0695754736661911, "policy/entropy_avg": 0.4759911596775055, "step": 56, "val/clipfrac_avg": 0.22051887214183807, "val/num_eos_tokens": 0, "val/ratio": 0.9136029481887817, "val/ratio_var": 0.00015939133299980313 }, { "episode": 912, "epoch": 1.6, "eps": 3, "loss/policy_avg": -0.0002519981935620308, "loss/value_avg": 0.6188120245933533, "lr": 6.720560747663552e-06, "objective/entropy": 9.047847747802734, "objective/kl": 162.95162963867188, "objective/non_score_reward": -8.147581100463867, "objective/rlhf_reward": -5.835081100463867, "objective/scores": 2.3125, "policy/approxkl_avg": 5.942928314208984, "policy/clipfrac_avg": 0.06721697747707367, "policy/entropy_avg": 0.43892478942871094, "step": 57, "val/clipfrac_avg": 0.03242924436926842, "val/num_eos_tokens": 0, "val/ratio": 0.9175019264221191, "val/ratio_var": 2.5528926926199347e-05 }, { "episode": 928, "epoch": 1.6280701754385964, "eps": 3, "loss/policy_avg": -0.004241641610860825, "loss/value_avg": 0.6380342245101929, "lr": 6.588785046728972e-06, "objective/entropy": 10.172576904296875, "objective/kl": 172.64210510253906, "objective/non_score_reward": -8.632105827331543, "objective/rlhf_reward": -6.085230827331543, "objective/scores": 2.546875, "policy/approxkl_avg": 5.1512861251831055, "policy/clipfrac_avg": 0.09669811278581619, "policy/entropy_avg": 0.44444799423217773, "step": 58, "val/clipfrac_avg": 0.00294811325147748, "val/num_eos_tokens": 0, "val/ratio": 0.9051207304000854, "val/ratio_var": 9.685073746368289e-05 }, { "episode": 944, "epoch": 1.656140350877193, "eps": 3, "loss/policy_avg": 0.005844447761774063, "loss/value_avg": 0.46530038118362427, "lr": 6.457009345794393e-06, "objective/entropy": 11.34018611907959, "objective/kl": 167.05087280273438, "objective/non_score_reward": -8.352543830871582, "objective/rlhf_reward": -5.368168830871582, "objective/scores": 2.984375, "policy/approxkl_avg": 4.73173713684082, "policy/clipfrac_avg": 0.06898584961891174, "policy/entropy_avg": 0.4987587630748749, "step": 59, "val/clipfrac_avg": 0.003537735901772976, "val/num_eos_tokens": 0, "val/ratio": 0.9096221327781677, "val/ratio_var": 0.0002903940330725163 }, { "episode": 960, "epoch": 1.6842105263157894, "eps": 3, "loss/policy_avg": 0.0015796682564541698, "loss/value_avg": 0.5465973615646362, "lr": 6.3252336448598135e-06, "objective/entropy": 10.832345962524414, "objective/kl": 166.35125732421875, "objective/non_score_reward": -8.317562103271484, "objective/rlhf_reward": -5.114437103271484, "objective/scores": 3.203125, "policy/approxkl_avg": 4.080867767333984, "policy/clipfrac_avg": 0.08726415038108826, "policy/entropy_avg": 0.46615108847618103, "step": 60, "val/clipfrac_avg": 0.018278302624821663, "val/num_eos_tokens": 0, "val/ratio": 0.9084208011627197, "val/ratio_var": 1.8292890672455542e-05 }, { "episode": 976, "epoch": 1.712280701754386, "eps": 3, "loss/policy_avg": -0.0016184533014893532, "loss/value_avg": 0.6316072344779968, "lr": 6.193457943925234e-06, "objective/entropy": 9.0885648727417, "objective/kl": 172.646240234375, "objective/non_score_reward": -8.632311820983887, "objective/rlhf_reward": -5.194811820983887, "objective/scores": 3.4375, "policy/approxkl_avg": 4.502593994140625, "policy/clipfrac_avg": 0.06603773683309555, "policy/entropy_avg": 0.41100969910621643, "step": 61, "val/clipfrac_avg": 0.044811319559812546, "val/num_eos_tokens": 0, "val/ratio": 0.9256702661514282, "val/ratio_var": 7.894221198512241e-05 }, { "episode": 992, "epoch": 1.7403508771929825, "eps": 3, "loss/policy_avg": -0.0019415542483329773, "loss/value_avg": 0.6046911478042603, "lr": 6.061682242990655e-06, "objective/entropy": 9.12926197052002, "objective/kl": 169.4315185546875, "objective/non_score_reward": -8.471575736999512, "objective/rlhf_reward": -5.424700736999512, "objective/scores": 3.046875, "policy/approxkl_avg": 5.609973907470703, "policy/clipfrac_avg": 0.09198112785816193, "policy/entropy_avg": 0.4236205816268921, "step": 62, "val/clipfrac_avg": 0.001768867950886488, "val/num_eos_tokens": 0, "val/ratio": 0.9198966026306152, "val/ratio_var": 6.228529673535377e-05 }, { "episode": 1008, "epoch": 1.768421052631579, "eps": 3, "loss/policy_avg": -0.007835395634174347, "loss/value_avg": 0.6853305697441101, "lr": 5.929906542056075e-06, "objective/entropy": 8.566083908081055, "objective/kl": 163.68191528320312, "objective/non_score_reward": -8.18409538269043, "objective/rlhf_reward": -4.09034538269043, "objective/scores": 4.09375, "policy/approxkl_avg": 3.7664973735809326, "policy/clipfrac_avg": 0.07429245114326477, "policy/entropy_avg": 0.41426771879196167, "step": 63, "val/clipfrac_avg": 0.007665094453841448, "val/num_eos_tokens": 0, "val/ratio": 0.9395467042922974, "val/ratio_var": 0.00018259203352499753 }, { "episode": 1024, "epoch": 1.7964912280701755, "eps": 3, "loss/policy_avg": 0.0056846365332603455, "loss/value_avg": 0.8050791621208191, "lr": 5.798130841121495e-06, "objective/entropy": 7.867904186248779, "objective/kl": 176.44961547851562, "objective/non_score_reward": -8.822481155395508, "objective/rlhf_reward": -4.931856155395508, "objective/scores": 3.890625, "policy/approxkl_avg": 4.615470886230469, "policy/clipfrac_avg": 0.07016509771347046, "policy/entropy_avg": 0.40076911449432373, "step": 64, "val/clipfrac_avg": 0.1179245263338089, "val/num_eos_tokens": 0, "val/ratio": 0.9168256521224976, "val/ratio_var": 1.1071170774812344e-05 }, { "episode": 1040, "epoch": 1.8245614035087718, "eps": 3, "loss/policy_avg": -0.004829235374927521, "loss/value_avg": 0.7683409452438354, "lr": 5.666355140186916e-06, "objective/entropy": 8.73065185546875, "objective/kl": 165.93441772460938, "objective/non_score_reward": -8.296720504760742, "objective/rlhf_reward": -4.531095504760742, "objective/scores": 3.765625, "policy/approxkl_avg": 4.037623882293701, "policy/clipfrac_avg": 0.0625, "policy/entropy_avg": 0.38483142852783203, "step": 65, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9364030361175537, "val/ratio_var": 0.0001283105229958892 }, { "episode": 1056, "epoch": 1.8526315789473684, "eps": 3, "loss/policy_avg": -0.002082128543406725, "loss/value_avg": 0.8781827688217163, "lr": 5.534579439252336e-06, "objective/entropy": 6.81689977645874, "objective/kl": 173.76760864257812, "objective/non_score_reward": -8.688380241394043, "objective/rlhf_reward": -5.454005241394043, "objective/scores": 3.234375, "policy/approxkl_avg": 5.1825032234191895, "policy/clipfrac_avg": 0.07488207519054413, "policy/entropy_avg": 0.3771995007991791, "step": 66, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9298049211502075, "val/ratio_var": 0.00015954735863488168 }, { "episode": 1072, "epoch": 1.880701754385965, "eps": 3, "loss/policy_avg": 0.005034355446696281, "loss/value_avg": 1.0226874351501465, "lr": 5.402803738317757e-06, "objective/entropy": 5.308557510375977, "objective/kl": 171.6015167236328, "objective/non_score_reward": -8.580076217651367, "objective/rlhf_reward": -4.236326217651367, "objective/scores": 4.34375, "policy/approxkl_avg": 5.336367607116699, "policy/clipfrac_avg": 0.04599056765437126, "policy/entropy_avg": 0.34400177001953125, "step": 67, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9246993064880371, "val/ratio_var": 9.672338592281449e-07 }, { "episode": 1088, "epoch": 1.9087719298245613, "eps": 3, "loss/policy_avg": 0.023275576531887054, "loss/value_avg": 0.6750494241714478, "lr": 5.271028037383177e-06, "objective/entropy": 7.23941707611084, "objective/kl": 166.45547485351562, "objective/non_score_reward": -8.322773933410645, "objective/rlhf_reward": -4.6352739334106445, "objective/scores": 3.6875, "policy/approxkl_avg": 3.1369752883911133, "policy/clipfrac_avg": 0.05837263911962509, "policy/entropy_avg": 0.3996211886405945, "step": 68, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9343756437301636, "val/ratio_var": 0.00011849942529806867 }, { "episode": 1104, "epoch": 1.936842105263158, "eps": 3, "loss/policy_avg": 0.001583978533744812, "loss/value_avg": 0.7364473342895508, "lr": 5.139252336448598e-06, "objective/entropy": 8.292254447937012, "objective/kl": 174.10446166992188, "objective/non_score_reward": -8.705223083496094, "objective/rlhf_reward": -4.517723083496094, "objective/scores": 4.1875, "policy/approxkl_avg": 5.407079696655273, "policy/clipfrac_avg": 0.06780660152435303, "policy/entropy_avg": 0.3910168409347534, "step": 69, "val/clipfrac_avg": 0.001179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9262620210647583, "val/ratio_var": 8.509035978931934e-06 }, { "episode": 1120, "epoch": 1.9649122807017543, "eps": 3, "loss/policy_avg": 0.014011572115123272, "loss/value_avg": 0.49188750982284546, "lr": 5.0074766355140185e-06, "objective/entropy": 4.73923397064209, "objective/kl": 170.3909912109375, "objective/non_score_reward": -8.519549369812012, "objective/rlhf_reward": -4.535174369812012, "objective/scores": 3.984375, "policy/approxkl_avg": 4.553505897521973, "policy/clipfrac_avg": 0.04658018797636032, "policy/entropy_avg": 0.314146488904953, "step": 70, "val/clipfrac_avg": 0.001768867950886488, "val/num_eos_tokens": 0, "val/ratio": 0.9345220327377319, "val/ratio_var": 0.00011590120993787423 }, { "episode": 1136, "epoch": 1.9929824561403509, "eps": 3, "loss/policy_avg": 0.014443885535001755, "loss/value_avg": 0.8583539724349976, "lr": 4.875700934579439e-06, "objective/entropy": 6.110556602478027, "objective/kl": 168.1246337890625, "objective/non_score_reward": -8.406231880187988, "objective/rlhf_reward": -4.781231880187988, "objective/scores": 3.625, "policy/approxkl_avg": 3.3112387657165527, "policy/clipfrac_avg": 0.04716981202363968, "policy/entropy_avg": 0.3741912841796875, "step": 71, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9311293363571167, "val/ratio_var": 1.7129657862824388e-05 }, { "episode": 1152, "epoch": 2.0210526315789474, "eps": 3, "loss/policy_avg": 0.0069357771426439285, "loss/value_avg": 0.6024092435836792, "lr": 4.74392523364486e-06, "objective/entropy": 1.9080017805099487, "objective/kl": 175.54367065429688, "objective/non_score_reward": -8.777183532714844, "objective/rlhf_reward": -3.7771835327148438, "objective/scores": 5.0, "policy/approxkl_avg": 6.433887004852295, "policy/clipfrac_avg": 0.03655660152435303, "policy/entropy_avg": 0.2685927748680115, "step": 72, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9363144040107727, "val/ratio_var": 2.7767631763708778e-05 }, { "episode": 1168, "epoch": 2.049122807017544, "eps": 3, "loss/policy_avg": 0.004744451027363539, "loss/value_avg": 0.6521505117416382, "lr": 4.61214953271028e-06, "objective/entropy": 2.584568500518799, "objective/kl": 171.61709594726562, "objective/non_score_reward": -8.580854415893555, "objective/rlhf_reward": -3.1121044158935547, "objective/scores": 5.46875, "policy/approxkl_avg": 4.509120941162109, "policy/clipfrac_avg": 0.03478773683309555, "policy/entropy_avg": 0.2757822573184967, "step": 73, "val/clipfrac_avg": 0.2900943160057068, "val/num_eos_tokens": 0, "val/ratio": 0.9448769092559814, "val/ratio_var": 1.143478584708646e-05 }, { "episode": 1184, "epoch": 2.07719298245614, "eps": 3, "loss/policy_avg": 0.004101068712770939, "loss/value_avg": 0.44417738914489746, "lr": 4.480373831775701e-06, "objective/entropy": 3.265643835067749, "objective/kl": 179.89352416992188, "objective/non_score_reward": -8.99467658996582, "objective/rlhf_reward": -4.11967658996582, "objective/scores": 4.875, "policy/approxkl_avg": 5.798920154571533, "policy/clipfrac_avg": 0.028891509398818016, "policy/entropy_avg": 0.29206639528274536, "step": 74, "val/clipfrac_avg": 0.05188679322600365, "val/num_eos_tokens": 0, "val/ratio": 0.9267533421516418, "val/ratio_var": 0.0001095838742912747 }, { "episode": 1200, "epoch": 2.1052631578947367, "eps": 3, "loss/policy_avg": 0.004760343115776777, "loss/value_avg": 0.3549901843070984, "lr": 4.3485981308411215e-06, "objective/entropy": 2.9447989463806152, "objective/kl": 175.41961669921875, "objective/non_score_reward": -8.770980834960938, "objective/rlhf_reward": -3.2084808349609375, "objective/scores": 5.5625, "policy/approxkl_avg": 4.80606746673584, "policy/clipfrac_avg": 0.03419811278581619, "policy/entropy_avg": 0.30916815996170044, "step": 75, "val/clipfrac_avg": 0.004127358552068472, "val/num_eos_tokens": 0, "val/ratio": 0.9180092215538025, "val/ratio_var": 2.3069829694577493e-05 }, { "episode": 1216, "epoch": 2.1333333333333333, "eps": 3, "loss/policy_avg": 0.010298425331711769, "loss/value_avg": 0.15927723050117493, "lr": 4.216822429906542e-06, "objective/entropy": 1.4227180480957031, "objective/kl": 176.0067138671875, "objective/non_score_reward": -8.800336837768555, "objective/rlhf_reward": -2.6753368377685547, "objective/scores": 6.125, "policy/approxkl_avg": 4.99057149887085, "policy/clipfrac_avg": 0.028301887214183807, "policy/entropy_avg": 0.2751670479774475, "step": 76, "val/clipfrac_avg": 0.10495282709598541, "val/num_eos_tokens": 0, "val/ratio": 0.9201045036315918, "val/ratio_var": 5.366753157431958e-06 }, { "episode": 1232, "epoch": 2.16140350877193, "eps": 3, "loss/policy_avg": -0.005462624132633209, "loss/value_avg": 0.28704196214675903, "lr": 4.085046728971963e-06, "objective/entropy": 1.6171071529388428, "objective/kl": 176.83685302734375, "objective/non_score_reward": -8.841842651367188, "objective/rlhf_reward": -3.1543426513671875, "objective/scores": 5.6875, "policy/approxkl_avg": 5.847208023071289, "policy/clipfrac_avg": 0.028891509398818016, "policy/entropy_avg": 0.286138117313385, "step": 77, "val/clipfrac_avg": 0.07075471431016922, "val/num_eos_tokens": 0, "val/ratio": 0.9195102453231812, "val/ratio_var": 9.577343917044345e-06 }, { "episode": 1248, "epoch": 2.1894736842105265, "eps": 3, "loss/policy_avg": 0.0010141655802726746, "loss/value_avg": 0.8408201932907104, "lr": 3.953271028037383e-06, "objective/entropy": 7.40260124206543, "objective/kl": 177.4427490234375, "objective/non_score_reward": -8.872137069702148, "objective/rlhf_reward": -4.903387069702148, "objective/scores": 3.96875, "policy/approxkl_avg": 5.285105228424072, "policy/clipfrac_avg": 0.04658018797636032, "policy/entropy_avg": 0.4253733158111572, "step": 78, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9203585982322693, "val/ratio_var": 1.65566543728346e-05 }, { "episode": 1264, "epoch": 2.2175438596491226, "eps": 3, "loss/policy_avg": -0.004624534398317337, "loss/value_avg": 0.7719740271568298, "lr": 3.821495327102804e-06, "objective/entropy": 4.648886203765869, "objective/kl": 181.51986694335938, "objective/non_score_reward": -9.075994491577148, "objective/rlhf_reward": -4.700994491577148, "objective/scores": 4.375, "policy/approxkl_avg": 4.547338485717773, "policy/clipfrac_avg": 0.0383254736661911, "policy/entropy_avg": 0.3513880968093872, "step": 79, "val/clipfrac_avg": 0.1291273534297943, "val/num_eos_tokens": 0, "val/ratio": 0.9286638498306274, "val/ratio_var": 6.422147998819128e-05 }, { "episode": 1280, "epoch": 2.245614035087719, "eps": 3, "loss/policy_avg": 0.012380128726363182, "loss/value_avg": 0.40563684701919556, "lr": 3.689719626168224e-06, "objective/entropy": 7.685408115386963, "objective/kl": 168.90484619140625, "objective/non_score_reward": -8.445242881774902, "objective/rlhf_reward": -3.4452428817749023, "objective/scores": 5.0, "policy/approxkl_avg": 3.4143970012664795, "policy/clipfrac_avg": 0.041273586452007294, "policy/entropy_avg": 0.4100227355957031, "step": 80, "val/clipfrac_avg": 0.015330187976360321, "val/num_eos_tokens": 0, "val/ratio": 0.9217012524604797, "val/ratio_var": 7.061174983391538e-05 }, { "episode": 1296, "epoch": 2.2736842105263158, "eps": 3, "loss/policy_avg": 0.011339722201228142, "loss/value_avg": 0.3490160405635834, "lr": 3.5579439252336446e-06, "objective/entropy": 4.046834945678711, "objective/kl": 177.92718505859375, "objective/non_score_reward": -8.89635944366455, "objective/rlhf_reward": -3.958859443664551, "objective/scores": 4.9375, "policy/approxkl_avg": 5.583766460418701, "policy/clipfrac_avg": 0.03478773683309555, "policy/entropy_avg": 0.3193933963775635, "step": 81, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9363265037536621, "val/ratio_var": 5.8069101214641705e-05 }, { "episode": 1312, "epoch": 2.3017543859649123, "eps": 3, "loss/policy_avg": 0.007465606089681387, "loss/value_avg": 0.3137081563472748, "lr": 3.4261682242990656e-06, "objective/entropy": 3.293423652648926, "objective/kl": 173.18377685546875, "objective/non_score_reward": -8.659189224243164, "objective/rlhf_reward": -3.065439224243164, "objective/scores": 5.59375, "policy/approxkl_avg": 4.794089317321777, "policy/clipfrac_avg": 0.0383254736661911, "policy/entropy_avg": 0.29685914516448975, "step": 82, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9458816647529602, "val/ratio_var": 0.00023432180751115084 }, { "episode": 1328, "epoch": 2.329824561403509, "eps": 3, "loss/policy_avg": -0.00093865767121315, "loss/value_avg": 0.9402576684951782, "lr": 3.294392523364486e-06, "objective/entropy": 5.09280252456665, "objective/kl": 173.88351440429688, "objective/non_score_reward": -8.694175720214844, "objective/rlhf_reward": -4.866050720214844, "objective/scores": 3.828125, "policy/approxkl_avg": 3.8168904781341553, "policy/clipfrac_avg": 0.03891509398818016, "policy/entropy_avg": 0.35750845074653625, "step": 83, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9232673645019531, "val/ratio_var": 4.1539384255884215e-05 }, { "episode": 1344, "epoch": 2.357894736842105, "eps": 3, "loss/policy_avg": -0.007357731461524963, "loss/value_avg": 0.36178284883499146, "lr": 3.1626168224299067e-06, "objective/entropy": 5.281716346740723, "objective/kl": 179.2125701904297, "objective/non_score_reward": -8.960628509521484, "objective/rlhf_reward": -3.9918785095214844, "objective/scores": 4.96875, "policy/approxkl_avg": 4.461269378662109, "policy/clipfrac_avg": 0.05365566164255142, "policy/entropy_avg": 0.35694169998168945, "step": 84, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9178085923194885, "val/ratio_var": 4.949720823788084e-05 }, { "episode": 1360, "epoch": 2.3859649122807016, "eps": 3, "loss/policy_avg": 0.0004696398973464966, "loss/value_avg": 0.30143094062805176, "lr": 3.0308411214953273e-06, "objective/entropy": 2.755769729614258, "objective/kl": 173.08140563964844, "objective/non_score_reward": -8.654069900512695, "objective/rlhf_reward": -2.9665699005126953, "objective/scores": 5.6875, "policy/approxkl_avg": 5.356992721557617, "policy/clipfrac_avg": 0.03242924436926842, "policy/entropy_avg": 0.282896488904953, "step": 85, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9419326782226562, "val/ratio_var": 3.9359238144243136e-05 }, { "episode": 1376, "epoch": 2.414035087719298, "eps": 3, "loss/policy_avg": 0.0008706599473953247, "loss/value_avg": 0.5158276557922363, "lr": 2.8990654205607475e-06, "objective/entropy": 4.149250507354736, "objective/kl": 173.71505737304688, "objective/non_score_reward": -8.685752868652344, "objective/rlhf_reward": -3.6857528686523438, "objective/scores": 5.0, "policy/approxkl_avg": 5.095344066619873, "policy/clipfrac_avg": 0.030070755630731583, "policy/entropy_avg": 0.3076534867286682, "step": 86, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.93389892578125, "val/ratio_var": 9.108168342208955e-06 }, { "episode": 1392, "epoch": 2.442105263157895, "eps": 3, "loss/policy_avg": -0.0013641200494021177, "loss/value_avg": 0.46665364503860474, "lr": 2.767289719626168e-06, "objective/entropy": 3.9847404956817627, "objective/kl": 171.97903442382812, "objective/non_score_reward": -8.59895133972168, "objective/rlhf_reward": -3.4114513397216797, "objective/scores": 5.1875, "policy/approxkl_avg": 4.758839130401611, "policy/clipfrac_avg": 0.02771226316690445, "policy/entropy_avg": 0.3068329691886902, "step": 87, "val/clipfrac_avg": 0.002358490601181984, "val/num_eos_tokens": 0, "val/ratio": 0.9303795099258423, "val/ratio_var": 2.7705931643140502e-05 }, { "episode": 1408, "epoch": 2.4701754385964914, "eps": 3, "loss/policy_avg": -0.009293105453252792, "loss/value_avg": 0.1374308168888092, "lr": 2.6355140186915887e-06, "objective/entropy": 2.8504319190979004, "objective/kl": 178.8887176513672, "objective/non_score_reward": -8.944437026977539, "objective/rlhf_reward": -2.975687026977539, "objective/scores": 5.96875, "policy/approxkl_avg": 5.254701614379883, "policy/clipfrac_avg": 0.026533018797636032, "policy/entropy_avg": 0.2935040593147278, "step": 88, "val/clipfrac_avg": 0.002358490601181984, "val/num_eos_tokens": 0, "val/ratio": 0.9373711943626404, "val/ratio_var": 1.766591776686255e-05 }, { "episode": 1424, "epoch": 2.498245614035088, "eps": 3, "loss/policy_avg": 0.00495288148522377, "loss/value_avg": 0.20061969757080078, "lr": 2.5037383177570093e-06, "objective/entropy": 4.51104211807251, "objective/kl": 169.7410125732422, "objective/non_score_reward": -8.487051010131836, "objective/rlhf_reward": -2.768301010131836, "objective/scores": 5.71875, "policy/approxkl_avg": 4.481791019439697, "policy/clipfrac_avg": 0.03478773683309555, "policy/entropy_avg": 0.3265501856803894, "step": 89, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9414163827896118, "val/ratio_var": 0.00011632378300419077 }, { "episode": 1440, "epoch": 2.526315789473684, "eps": 3, "loss/policy_avg": 0.00646105594933033, "loss/value_avg": 0.42740941047668457, "lr": 2.37196261682243e-06, "objective/entropy": 3.2403650283813477, "objective/kl": 176.238037109375, "objective/non_score_reward": -8.811902046203613, "objective/rlhf_reward": -3.9994020462036133, "objective/scores": 4.8125, "policy/approxkl_avg": 5.44842529296875, "policy/clipfrac_avg": 0.01591981202363968, "policy/entropy_avg": 0.2933845520019531, "step": 90, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9381667971611023, "val/ratio_var": 9.369335020892322e-05 }, { "episode": 1456, "epoch": 2.5543859649122806, "eps": 3, "loss/policy_avg": -0.005389541387557983, "loss/value_avg": 0.4948211908340454, "lr": 2.2401869158878504e-06, "objective/entropy": 2.898387908935547, "objective/kl": 173.48486328125, "objective/non_score_reward": -8.674242973327637, "objective/rlhf_reward": -3.5179929733276367, "objective/scores": 5.15625, "policy/approxkl_avg": 4.66801643371582, "policy/clipfrac_avg": 0.020636793226003647, "policy/entropy_avg": 0.2913670837879181, "step": 91, "val/clipfrac_avg": 0.001179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9378049373626709, "val/ratio_var": 1.2327662261668593e-05 }, { "episode": 1472, "epoch": 2.5824561403508772, "eps": 3, "loss/policy_avg": -0.010267895646393299, "loss/value_avg": 0.26834648847579956, "lr": 2.108411214953271e-06, "objective/entropy": 4.616816997528076, "objective/kl": 171.12762451171875, "objective/non_score_reward": -8.556382179260254, "objective/rlhf_reward": -3.587632179260254, "objective/scores": 4.96875, "policy/approxkl_avg": 4.146580219268799, "policy/clipfrac_avg": 0.041273586452007294, "policy/entropy_avg": 0.34417253732681274, "step": 92, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9241670370101929, "val/ratio_var": 4.9057460273616016e-05 }, { "episode": 1488, "epoch": 2.610526315789474, "eps": 3, "loss/policy_avg": 0.0006395354866981506, "loss/value_avg": 0.7872554063796997, "lr": 1.9766355140186916e-06, "objective/entropy": 5.483046531677246, "objective/kl": 169.1695098876953, "objective/non_score_reward": -8.458476066589355, "objective/rlhf_reward": -4.4741010665893555, "objective/scores": 3.984375, "policy/approxkl_avg": 2.8852078914642334, "policy/clipfrac_avg": 0.032429248094558716, "policy/entropy_avg": 0.35312554240226746, "step": 93, "val/clipfrac_avg": 0.001768867950886488, "val/num_eos_tokens": 0, "val/ratio": 0.9286659955978394, "val/ratio_var": 2.8370095606078394e-05 }, { "episode": 1504, "epoch": 2.6385964912280704, "eps": 3, "loss/policy_avg": -0.00652042031288147, "loss/value_avg": 0.17014235258102417, "lr": 1.844859813084112e-06, "objective/entropy": 2.737617015838623, "objective/kl": 178.22747802734375, "objective/non_score_reward": -8.911375045776367, "objective/rlhf_reward": -3.286375045776367, "objective/scores": 5.625, "policy/approxkl_avg": 5.498225688934326, "policy/clipfrac_avg": 0.028891511261463165, "policy/entropy_avg": 0.28995558619499207, "step": 94, "val/clipfrac_avg": 0.001768867950886488, "val/num_eos_tokens": 0, "val/ratio": 0.9345540404319763, "val/ratio_var": 1.2709216434814152e-06 }, { "episode": 1520, "epoch": 2.6666666666666665, "eps": 3, "loss/policy_avg": 0.007195580750703812, "loss/value_avg": 0.40153437852859497, "lr": 1.7130841121495328e-06, "objective/entropy": 4.30942440032959, "objective/kl": 176.95938110351562, "objective/non_score_reward": -8.847970008850098, "objective/rlhf_reward": -3.8479700088500977, "objective/scores": 5.0, "policy/approxkl_avg": 5.267421245574951, "policy/clipfrac_avg": 0.03125, "policy/entropy_avg": 0.31008654832839966, "step": 95, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9336869716644287, "val/ratio_var": 4.966981941834092e-05 }, { "episode": 1536, "epoch": 2.694736842105263, "eps": 3, "loss/policy_avg": 0.012591801583766937, "loss/value_avg": 0.3597390055656433, "lr": 1.5813084112149534e-06, "objective/entropy": 5.459916591644287, "objective/kl": 172.44110107421875, "objective/non_score_reward": -8.622055053710938, "objective/rlhf_reward": -3.6220550537109375, "objective/scores": 5.0, "policy/approxkl_avg": 4.5339765548706055, "policy/clipfrac_avg": 0.03537736088037491, "policy/entropy_avg": 0.3411254286766052, "step": 96, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9462149739265442, "val/ratio_var": 3.8459929783130065e-05 }, { "episode": 1552, "epoch": 2.7228070175438597, "eps": 3, "loss/policy_avg": -0.003356472123414278, "loss/value_avg": 0.6434417963027954, "lr": 1.4495327102803737e-06, "objective/entropy": 5.633913516998291, "objective/kl": 172.26502990722656, "objective/non_score_reward": -8.613250732421875, "objective/rlhf_reward": -4.363250732421875, "objective/scores": 4.25, "policy/approxkl_avg": 3.585165500640869, "policy/clipfrac_avg": 0.03537735715508461, "policy/entropy_avg": 0.34199586510658264, "step": 97, "val/clipfrac_avg": 0.001179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9311625957489014, "val/ratio_var": 6.935850979061797e-05 }, { "episode": 1568, "epoch": 2.7508771929824563, "eps": 3, "loss/policy_avg": -0.003898909315466881, "loss/value_avg": 0.36550819873809814, "lr": 1.3177570093457943e-06, "objective/entropy": 4.281040191650391, "objective/kl": 174.15972900390625, "objective/non_score_reward": -8.707986831665039, "objective/rlhf_reward": -3.614236831665039, "objective/scores": 5.09375, "policy/approxkl_avg": 5.1715850830078125, "policy/clipfrac_avg": 0.02712264284491539, "policy/entropy_avg": 0.3137935698032379, "step": 98, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9350335597991943, "val/ratio_var": 1.248767239303561e-05 }, { "episode": 1584, "epoch": 2.778947368421053, "eps": 3, "loss/policy_avg": 0.0017306804656982422, "loss/value_avg": 0.2737918496131897, "lr": 1.185981308411215e-06, "objective/entropy": 5.210065841674805, "objective/kl": 173.97068786621094, "objective/non_score_reward": -8.69853401184082, "objective/rlhf_reward": -3.8860340118408203, "objective/scores": 4.8125, "policy/approxkl_avg": 5.011469841003418, "policy/clipfrac_avg": 0.04304245114326477, "policy/entropy_avg": 0.3419041931629181, "step": 99, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9419320225715637, "val/ratio_var": 8.726042324269656e-06 }, { "episode": 1600, "epoch": 2.807017543859649, "eps": 3, "loss/policy_avg": -0.006221463903784752, "loss/value_avg": 0.3625496029853821, "lr": 1.0542056074766355e-06, "objective/entropy": 3.721562623977661, "objective/kl": 175.773193359375, "objective/non_score_reward": -8.78865909576416, "objective/rlhf_reward": -3.47615909576416, "objective/scores": 5.3125, "policy/approxkl_avg": 5.388751029968262, "policy/clipfrac_avg": 0.03419811278581619, "policy/entropy_avg": 0.29315799474716187, "step": 100, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.949840784072876, "val/ratio_var": 3.881535303662531e-05 }, { "episode": 1616, "epoch": 2.8350877192982455, "eps": 3, "loss/policy_avg": -0.005170758813619614, "loss/value_avg": 0.4136154055595398, "lr": 9.22429906542056e-07, "objective/entropy": 5.907715320587158, "objective/kl": 171.47390747070312, "objective/non_score_reward": -8.573695182800293, "objective/rlhf_reward": -4.167445182800293, "objective/scores": 4.40625, "policy/approxkl_avg": 5.403087615966797, "policy/clipfrac_avg": 0.03478773683309555, "policy/entropy_avg": 0.33564049005508423, "step": 101, "val/clipfrac_avg": 0.000589622650295496, "val/num_eos_tokens": 0, "val/ratio": 0.9340347051620483, "val/ratio_var": 3.0960076401242986e-05 }, { "episode": 1632, "epoch": 2.863157894736842, "eps": 3, "loss/policy_avg": 0.002457182854413986, "loss/value_avg": 0.27742013335227966, "lr": 7.906542056074767e-07, "objective/entropy": 5.222499370574951, "objective/kl": 176.05380249023438, "objective/non_score_reward": -8.802690505981445, "objective/rlhf_reward": -3.6151905059814453, "objective/scores": 5.1875, "policy/approxkl_avg": 4.675132751464844, "policy/clipfrac_avg": 0.04304245114326477, "policy/entropy_avg": 0.3403066396713257, "step": 102, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9404515027999878, "val/ratio_var": 7.289678615052253e-05 }, { "episode": 1648, "epoch": 2.8912280701754387, "eps": 3, "loss/policy_avg": -0.006544323638081551, "loss/value_avg": 0.29731422662734985, "lr": 6.588785046728972e-07, "objective/entropy": 4.219725608825684, "objective/kl": 178.1557159423828, "objective/non_score_reward": -8.90778636932373, "objective/rlhf_reward": -3.8452863693237305, "objective/scores": 5.0625, "policy/approxkl_avg": 5.953890800476074, "policy/clipfrac_avg": 0.0383254699409008, "policy/entropy_avg": 0.31199511885643005, "step": 103, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9407525658607483, "val/ratio_var": 5.8047560742124915e-05 }, { "episode": 1664, "epoch": 2.9192982456140353, "eps": 3, "loss/policy_avg": -0.0011880630627274513, "loss/value_avg": 0.21903052926063538, "lr": 5.271028037383178e-07, "objective/entropy": 5.557653427124023, "objective/kl": 170.518798828125, "objective/non_score_reward": -8.52593994140625, "objective/rlhf_reward": -3.05718994140625, "objective/scores": 5.46875, "policy/approxkl_avg": 4.447786331176758, "policy/clipfrac_avg": 0.0383254699409008, "policy/entropy_avg": 0.33431151509284973, "step": 104, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9298925399780273, "val/ratio_var": 9.144405339611694e-06 }, { "episode": 1680, "epoch": 2.9473684210526314, "eps": 3, "loss/policy_avg": -0.0007513905875384808, "loss/value_avg": 0.21037007868289948, "lr": 3.9532710280373834e-07, "objective/entropy": 3.900575876235962, "objective/kl": 174.99456787109375, "objective/non_score_reward": -8.74972915649414, "objective/rlhf_reward": -3.5309791564941406, "objective/scores": 5.21875, "policy/approxkl_avg": 5.165627479553223, "policy/clipfrac_avg": 0.028301887214183807, "policy/entropy_avg": 0.2935909032821655, "step": 105, "val/clipfrac_avg": 0.001179245300590992, "val/num_eos_tokens": 0, "val/ratio": 0.9422957897186279, "val/ratio_var": 2.6614558009896427e-05 }, { "episode": 1696, "epoch": 2.975438596491228, "eps": 3, "loss/policy_avg": -0.005426734685897827, "loss/value_avg": 0.21496959030628204, "lr": 2.635514018691589e-07, "objective/entropy": 4.556634902954102, "objective/kl": 173.05136108398438, "objective/non_score_reward": -8.652568817138672, "objective/rlhf_reward": -2.933818817138672, "objective/scores": 5.71875, "policy/approxkl_avg": 4.820314884185791, "policy/clipfrac_avg": 0.04304245486855507, "policy/entropy_avg": 0.31966692209243774, "step": 106, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9327390193939209, "val/ratio_var": 8.508096652803943e-05 }, { "episode": 1712, "epoch": 3.0035087719298246, "eps": 3, "loss/policy_avg": 0.0001406269147992134, "loss/value_avg": 0.186610609292984, "lr": 1.3177570093457944e-07, "objective/entropy": 4.999897003173828, "objective/kl": 173.25045776367188, "objective/non_score_reward": -8.66252326965332, "objective/rlhf_reward": -2.9437732696533203, "objective/scores": 5.71875, "policy/approxkl_avg": 4.337066173553467, "policy/clipfrac_avg": 0.04716981202363968, "policy/entropy_avg": 0.3558363914489746, "step": 107, "val/clipfrac_avg": 0.0, "val/num_eos_tokens": 0, "val/ratio": 0.9243938326835632, "val/ratio_var": 0.000128799001686275 } ], "logging_steps": 10, "max_steps": 107, "num_input_tokens_seen": 0, "num_train_epochs": 3.0, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0, "train_batch_size": null, "trial_name": null, "trial_params": null }