| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "episode": 1712, | |
| "epoch": 3.0035087719298246, | |
| "eval_steps": 500, | |
| "global_step": 107, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "episode": 16, | |
| "epoch": 0.028070175438596492, | |
| "eps": 2, | |
| "loss/policy_avg": 0.040254347026348114, | |
| "loss/value_avg": 4.365694046020508, | |
| "lr": 1.41e-05, | |
| "objective/entropy": 20.665752410888672, | |
| "objective/kl": 38.31879425048828, | |
| "objective/non_score_reward": -1.9159398078918457, | |
| "objective/rlhf_reward": -0.5565648078918457, | |
| "objective/scores": 1.359375, | |
| "policy/approxkl_avg": 5.953035354614258, | |
| "policy/clipfrac_avg": 0.2399764209985733, | |
| "policy/entropy_avg": 0.6761884689331055, | |
| "step": 1, | |
| "val/clipfrac_avg": 0.26650944352149963, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8525064587593079, | |
| "val/ratio_var": 0.0006113838753663003 | |
| }, | |
| { | |
| "episode": 32, | |
| "epoch": 0.056140350877192984, | |
| "eps": 2, | |
| "loss/policy_avg": 0.0446447990834713, | |
| "loss/value_avg": 3.0273280143737793, | |
| "lr": 1.3968224299065421e-05, | |
| "objective/entropy": 21.37685203552246, | |
| "objective/kl": 73.40159606933594, | |
| "objective/non_score_reward": -3.6700797080993652, | |
| "objective/rlhf_reward": -2.5294547080993652, | |
| "objective/scores": 1.140625, | |
| "policy/approxkl_avg": 6.8038129806518555, | |
| "policy/clipfrac_avg": 0.22936320304870605, | |
| "policy/entropy_avg": 0.6141112446784973, | |
| "step": 2, | |
| "val/clipfrac_avg": 0.2458726465702057, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8650269508361816, | |
| "val/ratio_var": 0.0003330775070935488 | |
| }, | |
| { | |
| "episode": 48, | |
| "epoch": 0.08421052631578947, | |
| "eps": 3, | |
| "loss/policy_avg": 0.07702315598726273, | |
| "loss/value_avg": 2.1064209938049316, | |
| "lr": 1.3836448598130842e-05, | |
| "objective/entropy": 26.10182762145996, | |
| "objective/kl": 74.03025817871094, | |
| "objective/non_score_reward": -3.7015130519866943, | |
| "objective/rlhf_reward": -2.6546380519866943, | |
| "objective/scores": 1.046875, | |
| "policy/approxkl_avg": 5.6432695388793945, | |
| "policy/clipfrac_avg": 0.2087264060974121, | |
| "policy/entropy_avg": 0.7391780614852905, | |
| "step": 3, | |
| "val/clipfrac_avg": 0.21639150381088257, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8773487210273743, | |
| "val/ratio_var": 0.00039911610656417906 | |
| }, | |
| { | |
| "episode": 64, | |
| "epoch": 0.11228070175438597, | |
| "eps": 3, | |
| "loss/policy_avg": 0.03978118300437927, | |
| "loss/value_avg": 1.5315862894058228, | |
| "lr": 1.3704672897196262e-05, | |
| "objective/entropy": 20.781639099121094, | |
| "objective/kl": 68.94087219238281, | |
| "objective/non_score_reward": -3.4470434188842773, | |
| "objective/rlhf_reward": -2.4470434188842773, | |
| "objective/scores": 1.0, | |
| "policy/approxkl_avg": 4.726678848266602, | |
| "policy/clipfrac_avg": 0.20341980457305908, | |
| "policy/entropy_avg": 0.7119085788726807, | |
| "step": 4, | |
| "val/clipfrac_avg": 0.16096699237823486, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8485063314437866, | |
| "val/ratio_var": 0.0003752955235540867 | |
| }, | |
| { | |
| "episode": 80, | |
| "epoch": 0.14035087719298245, | |
| "eps": 3, | |
| "loss/policy_avg": 0.06184825301170349, | |
| "loss/value_avg": 0.9904976487159729, | |
| "lr": 1.3572897196261683e-05, | |
| "objective/entropy": 22.48508071899414, | |
| "objective/kl": 75.60542297363281, | |
| "objective/non_score_reward": -3.780271291732788, | |
| "objective/rlhf_reward": -3.272458791732788, | |
| "objective/scores": 0.5078125, | |
| "policy/approxkl_avg": 4.352260589599609, | |
| "policy/clipfrac_avg": 0.2146226465702057, | |
| "policy/entropy_avg": 0.7200251817703247, | |
| "step": 5, | |
| "val/clipfrac_avg": 0.09669811278581619, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8546276092529297, | |
| "val/ratio_var": 0.00010198648669756949 | |
| }, | |
| { | |
| "episode": 96, | |
| "epoch": 0.16842105263157894, | |
| "eps": 3, | |
| "loss/policy_avg": 0.046236515045166016, | |
| "loss/value_avg": 1.2219572067260742, | |
| "lr": 1.3441121495327103e-05, | |
| "objective/entropy": 22.197113037109375, | |
| "objective/kl": 82.56051635742188, | |
| "objective/non_score_reward": -4.128026008605957, | |
| "objective/rlhf_reward": -3.253026008605957, | |
| "objective/scores": 0.875, | |
| "policy/approxkl_avg": 3.8577029705047607, | |
| "policy/clipfrac_avg": 0.22641509771347046, | |
| "policy/entropy_avg": 0.7238099575042725, | |
| "step": 6, | |
| "val/clipfrac_avg": 0.0383254736661911, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8292837738990784, | |
| "val/ratio_var": 6.185401434777305e-05 | |
| }, | |
| { | |
| "episode": 112, | |
| "epoch": 0.19649122807017544, | |
| "eps": 3, | |
| "loss/policy_avg": 0.06606701016426086, | |
| "loss/value_avg": 1.325202226638794, | |
| "lr": 1.3309345794392524e-05, | |
| "objective/entropy": 27.022249221801758, | |
| "objective/kl": 101.50897216796875, | |
| "objective/non_score_reward": -5.075448513031006, | |
| "objective/rlhf_reward": -3.653573513031006, | |
| "objective/scores": 1.421875, | |
| "policy/approxkl_avg": 5.585095405578613, | |
| "policy/clipfrac_avg": 0.24174529314041138, | |
| "policy/entropy_avg": 0.8859995603561401, | |
| "step": 7, | |
| "val/clipfrac_avg": 0.028301887214183807, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8486474752426147, | |
| "val/ratio_var": 0.0005388148711062968 | |
| }, | |
| { | |
| "episode": 128, | |
| "epoch": 0.22456140350877193, | |
| "eps": 3, | |
| "loss/policy_avg": 0.06467999517917633, | |
| "loss/value_avg": 0.9481044411659241, | |
| "lr": 1.3177570093457945e-05, | |
| "objective/entropy": 26.585155487060547, | |
| "objective/kl": 116.76457214355469, | |
| "objective/non_score_reward": -5.838229179382324, | |
| "objective/rlhf_reward": -4.111666679382324, | |
| "objective/scores": 1.7265625, | |
| "policy/approxkl_avg": 5.351860046386719, | |
| "policy/clipfrac_avg": 0.24469339847564697, | |
| "policy/entropy_avg": 0.8991943597793579, | |
| "step": 8, | |
| "val/clipfrac_avg": 0.08785377442836761, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8524093627929688, | |
| "val/ratio_var": 3.493872281978838e-05 | |
| }, | |
| { | |
| "episode": 144, | |
| "epoch": 0.25263157894736843, | |
| "eps": 3, | |
| "loss/policy_avg": 0.03949737548828125, | |
| "loss/value_avg": 0.9386715888977051, | |
| "lr": 1.3045794392523365e-05, | |
| "objective/entropy": 29.664459228515625, | |
| "objective/kl": 132.19940185546875, | |
| "objective/non_score_reward": -6.6099700927734375, | |
| "objective/rlhf_reward": -5.6763763427734375, | |
| "objective/scores": 0.93359375, | |
| "policy/approxkl_avg": 5.967764377593994, | |
| "policy/clipfrac_avg": 0.2057783007621765, | |
| "policy/entropy_avg": 1.0122931003570557, | |
| "step": 9, | |
| "val/clipfrac_avg": 0.017099056392908096, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8205329179763794, | |
| "val/ratio_var": 7.763963367324322e-05 | |
| }, | |
| { | |
| "episode": 160, | |
| "epoch": 0.2807017543859649, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0211679395288229, | |
| "loss/value_avg": 1.213599681854248, | |
| "lr": 1.2914018691588786e-05, | |
| "objective/entropy": 31.076860427856445, | |
| "objective/kl": 135.477294921875, | |
| "objective/non_score_reward": -6.77386474609375, | |
| "objective/rlhf_reward": -5.00042724609375, | |
| "objective/scores": 1.7734375, | |
| "policy/approxkl_avg": 3.5569186210632324, | |
| "policy/clipfrac_avg": 0.21304652094841003, | |
| "policy/entropy_avg": 1.1141610145568848, | |
| "step": 10, | |
| "val/clipfrac_avg": 0.06719152629375458, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8466310501098633, | |
| "val/ratio_var": 0.0001414915022905916 | |
| }, | |
| { | |
| "episode": 176, | |
| "epoch": 0.3087719298245614, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0240048635751009, | |
| "loss/value_avg": 1.3511584997177124, | |
| "lr": 1.2782242990654206e-05, | |
| "objective/entropy": 32.90422058105469, | |
| "objective/kl": 146.03787231445312, | |
| "objective/non_score_reward": -7.301893711090088, | |
| "objective/rlhf_reward": -5.630018711090088, | |
| "objective/scores": 1.671875, | |
| "policy/approxkl_avg": 5.625584125518799, | |
| "policy/clipfrac_avg": 0.20400942862033844, | |
| "policy/entropy_avg": 1.1266499757766724, | |
| "step": 11, | |
| "val/clipfrac_avg": 0.05483490601181984, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8710312843322754, | |
| "val/ratio_var": 0.00014447471767198294 | |
| }, | |
| { | |
| "episode": 192, | |
| "epoch": 0.3368421052631579, | |
| "eps": 3, | |
| "loss/policy_avg": 0.05039631202816963, | |
| "loss/value_avg": 1.4507163763046265, | |
| "lr": 1.2650467289719627e-05, | |
| "objective/entropy": 34.778846740722656, | |
| "objective/kl": 152.17945861816406, | |
| "objective/non_score_reward": -7.608973503112793, | |
| "objective/rlhf_reward": -5.726161003112793, | |
| "objective/scores": 1.8828125, | |
| "policy/approxkl_avg": 4.985030174255371, | |
| "policy/clipfrac_avg": 0.1987028270959854, | |
| "policy/entropy_avg": 1.1480720043182373, | |
| "step": 12, | |
| "val/clipfrac_avg": 0.05188679322600365, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8588758111000061, | |
| "val/ratio_var": 3.4296579542569816e-05 | |
| }, | |
| { | |
| "episode": 208, | |
| "epoch": 0.3649122807017544, | |
| "eps": 3, | |
| "loss/policy_avg": 0.037938639521598816, | |
| "loss/value_avg": 1.5314528942108154, | |
| "lr": 1.2518691588785048e-05, | |
| "objective/entropy": 45.65214538574219, | |
| "objective/kl": 132.1256866455078, | |
| "objective/non_score_reward": -6.606284141540527, | |
| "objective/rlhf_reward": -6.803549766540527, | |
| "objective/scores": -0.197265625, | |
| "policy/approxkl_avg": 5.0863566398620605, | |
| "policy/clipfrac_avg": 0.18867924809455872, | |
| "policy/entropy_avg": 1.3655226230621338, | |
| "step": 13, | |
| "val/clipfrac_avg": 0.06367924809455872, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8282334804534912, | |
| "val/ratio_var": 0.0004519254434853792 | |
| }, | |
| { | |
| "episode": 224, | |
| "epoch": 0.3929824561403509, | |
| "eps": 3, | |
| "loss/policy_avg": 0.036732763051986694, | |
| "loss/value_avg": 2.189356803894043, | |
| "lr": 1.2386915887850468e-05, | |
| "objective/entropy": 38.35211944580078, | |
| "objective/kl": 113.13496398925781, | |
| "objective/non_score_reward": -5.656748294830322, | |
| "objective/rlhf_reward": -4.539560794830322, | |
| "objective/scores": 1.1171875, | |
| "policy/approxkl_avg": 4.408792972564697, | |
| "policy/clipfrac_avg": 0.21201542019844055, | |
| "policy/entropy_avg": 1.2410473823547363, | |
| "step": 14, | |
| "val/clipfrac_avg": 0.10222071409225464, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8754662275314331, | |
| "val/ratio_var": 0.0007480247295461595 | |
| }, | |
| { | |
| "episode": 240, | |
| "epoch": 0.42105263157894735, | |
| "eps": 3, | |
| "loss/policy_avg": 0.03690744563937187, | |
| "loss/value_avg": 1.4401739835739136, | |
| "lr": 1.2255140186915889e-05, | |
| "objective/entropy": 38.69694519042969, | |
| "objective/kl": 115.77085876464844, | |
| "objective/non_score_reward": -5.788543224334717, | |
| "objective/rlhf_reward": -5.627410411834717, | |
| "objective/scores": 0.1611328125, | |
| "policy/approxkl_avg": 4.673148155212402, | |
| "policy/clipfrac_avg": 0.1833726465702057, | |
| "policy/entropy_avg": 1.2108347415924072, | |
| "step": 15, | |
| "val/clipfrac_avg": 0.01179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8487275838851929, | |
| "val/ratio_var": 0.0008522844291292131 | |
| }, | |
| { | |
| "episode": 256, | |
| "epoch": 0.44912280701754387, | |
| "eps": 3, | |
| "loss/policy_avg": 0.019956424832344055, | |
| "loss/value_avg": 1.5383408069610596, | |
| "lr": 1.212336448598131e-05, | |
| "objective/entropy": 32.02195739746094, | |
| "objective/kl": 122.87109375, | |
| "objective/non_score_reward": -6.1435546875, | |
| "objective/rlhf_reward": -5.7470703125, | |
| "objective/scores": 0.396484375, | |
| "policy/approxkl_avg": 5.454257965087891, | |
| "policy/clipfrac_avg": 0.22314535081386566, | |
| "policy/entropy_avg": 1.1098759174346924, | |
| "step": 16, | |
| "val/clipfrac_avg": 0.03384597226977348, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8839770555496216, | |
| "val/ratio_var": 0.0004538022622000426 | |
| }, | |
| { | |
| "episode": 272, | |
| "epoch": 0.47719298245614034, | |
| "eps": 3, | |
| "loss/policy_avg": 0.016002152115106583, | |
| "loss/value_avg": 1.5554126501083374, | |
| "lr": 1.199158878504673e-05, | |
| "objective/entropy": 28.698740005493164, | |
| "objective/kl": 129.80386352539062, | |
| "objective/non_score_reward": -6.4901933670043945, | |
| "objective/rlhf_reward": -6.1796464920043945, | |
| "objective/scores": 0.310546875, | |
| "policy/approxkl_avg": 3.0268969535827637, | |
| "policy/clipfrac_avg": 0.19969519972801208, | |
| "policy/entropy_avg": 0.9890843629837036, | |
| "step": 17, | |
| "val/clipfrac_avg": 0.07565668225288391, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8847370743751526, | |
| "val/ratio_var": 0.000195541579159908 | |
| }, | |
| { | |
| "episode": 288, | |
| "epoch": 0.5052631578947369, | |
| "eps": 3, | |
| "loss/policy_avg": 0.013964798301458359, | |
| "loss/value_avg": 1.590545892715454, | |
| "lr": 1.185981308411215e-05, | |
| "objective/entropy": 12.885665893554688, | |
| "objective/kl": 107.77202606201172, | |
| "objective/non_score_reward": -5.388601303100586, | |
| "objective/rlhf_reward": -4.169851303100586, | |
| "objective/scores": 1.21875, | |
| "policy/approxkl_avg": 5.9386420249938965, | |
| "policy/clipfrac_avg": 0.18009786307811737, | |
| "policy/entropy_avg": 0.6803157925605774, | |
| "step": 18, | |
| "val/clipfrac_avg": 0.12902730703353882, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9129149913787842, | |
| "val/ratio_var": 0.0004151359898969531 | |
| }, | |
| { | |
| "episode": 304, | |
| "epoch": 0.5333333333333333, | |
| "eps": 3, | |
| "loss/policy_avg": 0.02675745077431202, | |
| "loss/value_avg": 1.745023488998413, | |
| "lr": 1.1728037383177571e-05, | |
| "objective/entropy": 22.75409698486328, | |
| "objective/kl": 119.34423828125, | |
| "objective/non_score_reward": -5.967212200164795, | |
| "objective/rlhf_reward": -5.197680950164795, | |
| "objective/scores": 0.76953125, | |
| "policy/approxkl_avg": 6.532215118408203, | |
| "policy/clipfrac_avg": 0.1759602427482605, | |
| "policy/entropy_avg": 0.8050931692123413, | |
| "step": 19, | |
| "val/clipfrac_avg": 0.0052770450711250305, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9205665588378906, | |
| "val/ratio_var": 0.0005024418351240456 | |
| }, | |
| { | |
| "episode": 320, | |
| "epoch": 0.5614035087719298, | |
| "eps": 3, | |
| "loss/policy_avg": 0.014609305188059807, | |
| "loss/value_avg": 1.7434797286987305, | |
| "lr": 1.159626168224299e-05, | |
| "objective/entropy": 29.933460235595703, | |
| "objective/kl": 144.45672607421875, | |
| "objective/non_score_reward": -7.222836494445801, | |
| "objective/rlhf_reward": -6.675961494445801, | |
| "objective/scores": 0.546875, | |
| "policy/approxkl_avg": 7.733211517333984, | |
| "policy/clipfrac_avg": 0.1762971729040146, | |
| "policy/entropy_avg": 0.8804416656494141, | |
| "step": 20, | |
| "val/clipfrac_avg": 0.026533018797636032, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8693416714668274, | |
| "val/ratio_var": 0.000557584862690419 | |
| }, | |
| { | |
| "episode": 336, | |
| "epoch": 0.5894736842105263, | |
| "eps": 3, | |
| "loss/policy_avg": 0.019340746104717255, | |
| "loss/value_avg": 1.471176266670227, | |
| "lr": 1.146448598130841e-05, | |
| "objective/entropy": 23.77098846435547, | |
| "objective/kl": 149.46974182128906, | |
| "objective/non_score_reward": -7.473487377166748, | |
| "objective/rlhf_reward": -5.668799877166748, | |
| "objective/scores": 1.8046875, | |
| "policy/approxkl_avg": 5.010880470275879, | |
| "policy/clipfrac_avg": 0.18341976404190063, | |
| "policy/entropy_avg": 0.973499596118927, | |
| "step": 21, | |
| "val/clipfrac_avg": 0.016483133658766747, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8893705606460571, | |
| "val/ratio_var": 0.0012118957238271832 | |
| }, | |
| { | |
| "episode": 352, | |
| "epoch": 0.6175438596491228, | |
| "eps": 3, | |
| "loss/policy_avg": 0.009950436651706696, | |
| "loss/value_avg": 1.1944406032562256, | |
| "lr": 1.1332710280373831e-05, | |
| "objective/entropy": 23.91471290588379, | |
| "objective/kl": 155.48843383789062, | |
| "objective/non_score_reward": -7.774421691894531, | |
| "objective/rlhf_reward": -7.125984191894531, | |
| "objective/scores": 0.6484375, | |
| "policy/approxkl_avg": 6.8445940017700195, | |
| "policy/clipfrac_avg": 0.16214622557163239, | |
| "policy/entropy_avg": 0.837052047252655, | |
| "step": 22, | |
| "val/clipfrac_avg": 0.09257075190544128, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9027042984962463, | |
| "val/ratio_var": 0.00019580482330638915 | |
| }, | |
| { | |
| "episode": 368, | |
| "epoch": 0.6456140350877193, | |
| "eps": 3, | |
| "loss/policy_avg": 0.021877210587263107, | |
| "loss/value_avg": 1.0937385559082031, | |
| "lr": 1.1200934579439252e-05, | |
| "objective/entropy": 23.552492141723633, | |
| "objective/kl": 172.6247100830078, | |
| "objective/non_score_reward": -8.631235122680664, | |
| "objective/rlhf_reward": -6.787485122680664, | |
| "objective/scores": 1.84375, | |
| "policy/approxkl_avg": 6.227967262268066, | |
| "policy/clipfrac_avg": 0.16203010082244873, | |
| "policy/entropy_avg": 0.8671172857284546, | |
| "step": 23, | |
| "val/clipfrac_avg": 0.018410665914416313, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8919187784194946, | |
| "val/ratio_var": 0.001309347921051085 | |
| }, | |
| { | |
| "episode": 384, | |
| "epoch": 0.6736842105263158, | |
| "eps": 3, | |
| "loss/policy_avg": 0.036975178867578506, | |
| "loss/value_avg": 1.1198090314865112, | |
| "lr": 1.1069158878504672e-05, | |
| "objective/entropy": 20.281631469726562, | |
| "objective/kl": 170.099365234375, | |
| "objective/non_score_reward": -8.504968643188477, | |
| "objective/rlhf_reward": -7.262781143188477, | |
| "objective/scores": 1.2421875, | |
| "policy/approxkl_avg": 6.825319766998291, | |
| "policy/clipfrac_avg": 0.14049983024597168, | |
| "policy/entropy_avg": 0.802283525466919, | |
| "step": 24, | |
| "val/clipfrac_avg": 0.0785929411649704, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8865500092506409, | |
| "val/ratio_var": 0.0006475243135355413 | |
| }, | |
| { | |
| "episode": 400, | |
| "epoch": 0.7017543859649122, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0237587820738554, | |
| "loss/value_avg": 1.3415908813476562, | |
| "lr": 1.0937383177570093e-05, | |
| "objective/entropy": 21.36954689025879, | |
| "objective/kl": 179.03819274902344, | |
| "objective/non_score_reward": -8.951910018920898, | |
| "objective/rlhf_reward": -7.334722518920898, | |
| "objective/scores": 1.6171875, | |
| "policy/approxkl_avg": 7.667660713195801, | |
| "policy/clipfrac_avg": 0.11468379199504852, | |
| "policy/entropy_avg": 0.7623839974403381, | |
| "step": 25, | |
| "val/clipfrac_avg": 0.27072370052337646, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8912988901138306, | |
| "val/ratio_var": 2.9881122827646323e-05 | |
| }, | |
| { | |
| "episode": 416, | |
| "epoch": 0.7298245614035088, | |
| "eps": 3, | |
| "loss/policy_avg": 0.018890127539634705, | |
| "loss/value_avg": 1.2341463565826416, | |
| "lr": 1.0805607476635514e-05, | |
| "objective/entropy": 16.852466583251953, | |
| "objective/kl": 179.8382568359375, | |
| "objective/non_score_reward": -8.991912841796875, | |
| "objective/rlhf_reward": -8.312225341796875, | |
| "objective/scores": 0.6796875, | |
| "policy/approxkl_avg": 7.357123374938965, | |
| "policy/clipfrac_avg": 0.11944779008626938, | |
| "policy/entropy_avg": 0.7555092573165894, | |
| "step": 26, | |
| "val/clipfrac_avg": 0.2508935034275055, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8802620768547058, | |
| "val/ratio_var": 0.00018670025747269392 | |
| }, | |
| { | |
| "episode": 432, | |
| "epoch": 0.7578947368421053, | |
| "eps": 3, | |
| "loss/policy_avg": 0.021100062876939774, | |
| "loss/value_avg": 0.9362931847572327, | |
| "lr": 1.0673831775700934e-05, | |
| "objective/entropy": 21.68787384033203, | |
| "objective/kl": 183.68655395507812, | |
| "objective/non_score_reward": -9.184328079223633, | |
| "objective/rlhf_reward": -7.199953079223633, | |
| "objective/scores": 1.984375, | |
| "policy/approxkl_avg": 5.103993892669678, | |
| "policy/clipfrac_avg": 0.12146226316690445, | |
| "policy/entropy_avg": 0.7987968921661377, | |
| "step": 27, | |
| "val/clipfrac_avg": 0.05424528568983078, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8928566575050354, | |
| "val/ratio_var": 0.00013602118997368962 | |
| }, | |
| { | |
| "episode": 448, | |
| "epoch": 0.7859649122807018, | |
| "eps": 3, | |
| "loss/policy_avg": 0.011967534199357033, | |
| "loss/value_avg": 1.0544021129608154, | |
| "lr": 1.0542056074766355e-05, | |
| "objective/entropy": 23.05614471435547, | |
| "objective/kl": 182.8275146484375, | |
| "objective/non_score_reward": -9.141375541687012, | |
| "objective/rlhf_reward": -7.789813041687012, | |
| "objective/scores": 1.3515625, | |
| "policy/approxkl_avg": 4.963105201721191, | |
| "policy/clipfrac_avg": 0.14268869161605835, | |
| "policy/entropy_avg": 0.8151739835739136, | |
| "step": 28, | |
| "val/clipfrac_avg": 0.2146226465702057, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8715541362762451, | |
| "val/ratio_var": 7.266430475283414e-05 | |
| }, | |
| { | |
| "episode": 464, | |
| "epoch": 0.8140350877192982, | |
| "eps": 3, | |
| "loss/policy_avg": 0.011134720407426357, | |
| "loss/value_avg": 0.775411069393158, | |
| "lr": 1.0410280373831775e-05, | |
| "objective/entropy": 25.063724517822266, | |
| "objective/kl": 188.51422119140625, | |
| "objective/non_score_reward": -9.425710678100586, | |
| "objective/rlhf_reward": -8.394460678100586, | |
| "objective/scores": 1.03125, | |
| "policy/approxkl_avg": 7.372692108154297, | |
| "policy/clipfrac_avg": 0.1320754736661911, | |
| "policy/entropy_avg": 0.8475193977355957, | |
| "step": 29, | |
| "val/clipfrac_avg": 0.04245283082127571, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.891329824924469, | |
| "val/ratio_var": 3.0236831207730575e-06 | |
| }, | |
| { | |
| "episode": 480, | |
| "epoch": 0.8421052631578947, | |
| "eps": 3, | |
| "loss/policy_avg": 0.017270730808377266, | |
| "loss/value_avg": 0.7942019701004028, | |
| "lr": 1.0278504672897196e-05, | |
| "objective/entropy": 22.402624130249023, | |
| "objective/kl": 185.72421264648438, | |
| "objective/non_score_reward": -9.286211013793945, | |
| "objective/rlhf_reward": -7.606523513793945, | |
| "objective/scores": 1.6796875, | |
| "policy/approxkl_avg": 8.755260467529297, | |
| "policy/clipfrac_avg": 0.11261792480945587, | |
| "policy/entropy_avg": 0.799101710319519, | |
| "step": 30, | |
| "val/clipfrac_avg": 0.014740565791726112, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8892796635627747, | |
| "val/ratio_var": 6.262218084884807e-05 | |
| }, | |
| { | |
| "episode": 496, | |
| "epoch": 0.8701754385964913, | |
| "eps": 3, | |
| "loss/policy_avg": 0.019226763397455215, | |
| "loss/value_avg": 0.6826229095458984, | |
| "lr": 1.0146728971962616e-05, | |
| "objective/entropy": 23.43070411682129, | |
| "objective/kl": 201.67799377441406, | |
| "objective/non_score_reward": -10.083900451660156, | |
| "objective/rlhf_reward": -9.154212951660156, | |
| "objective/scores": 0.9296875, | |
| "policy/approxkl_avg": 8.144369125366211, | |
| "policy/clipfrac_avg": 0.12264151126146317, | |
| "policy/entropy_avg": 0.8400179147720337, | |
| "step": 31, | |
| "val/clipfrac_avg": 0.2228773534297943, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8812745213508606, | |
| "val/ratio_var": 0.00010276544344378635 | |
| }, | |
| { | |
| "episode": 512, | |
| "epoch": 0.8982456140350877, | |
| "eps": 3, | |
| "loss/policy_avg": 0.005793810822069645, | |
| "loss/value_avg": 0.8381754159927368, | |
| "lr": 1.0014953271028037e-05, | |
| "objective/entropy": 21.22824478149414, | |
| "objective/kl": 185.61614990234375, | |
| "objective/non_score_reward": -9.280807495117188, | |
| "objective/rlhf_reward": -7.3511199951171875, | |
| "objective/scores": 1.9296875, | |
| "policy/approxkl_avg": 7.533528804779053, | |
| "policy/clipfrac_avg": 0.12205187976360321, | |
| "policy/entropy_avg": 0.7801576256752014, | |
| "step": 32, | |
| "val/clipfrac_avg": 0.09787735342979431, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8987904787063599, | |
| "val/ratio_var": 0.00018136559810955077 | |
| }, | |
| { | |
| "episode": 528, | |
| "epoch": 0.9263157894736842, | |
| "eps": 3, | |
| "loss/policy_avg": 0.02254486456513405, | |
| "loss/value_avg": 0.8877236843109131, | |
| "lr": 9.883177570093458e-06, | |
| "objective/entropy": 23.442203521728516, | |
| "objective/kl": 188.65457153320312, | |
| "objective/non_score_reward": -9.432729721069336, | |
| "objective/rlhf_reward": -7.534292221069336, | |
| "objective/scores": 1.8984375, | |
| "policy/approxkl_avg": 5.329720497131348, | |
| "policy/clipfrac_avg": 0.12323113530874252, | |
| "policy/entropy_avg": 0.8007351160049438, | |
| "step": 33, | |
| "val/clipfrac_avg": 0.036556605249643326, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8945379257202148, | |
| "val/ratio_var": 0.0001225806336151436 | |
| }, | |
| { | |
| "episode": 544, | |
| "epoch": 0.9543859649122807, | |
| "eps": 3, | |
| "loss/policy_avg": 0.01662730611860752, | |
| "loss/value_avg": 0.637324869632721, | |
| "lr": 9.751401869158878e-06, | |
| "objective/entropy": 20.620216369628906, | |
| "objective/kl": 186.25180053710938, | |
| "objective/non_score_reward": -9.312589645385742, | |
| "objective/rlhf_reward": -7.851652145385742, | |
| "objective/scores": 1.4609375, | |
| "policy/approxkl_avg": 8.322406768798828, | |
| "policy/clipfrac_avg": 0.125, | |
| "policy/entropy_avg": 0.7402328848838806, | |
| "step": 34, | |
| "val/clipfrac_avg": 0.002358490601181984, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8899158239364624, | |
| "val/ratio_var": 0.0002540757122915238 | |
| }, | |
| { | |
| "episode": 560, | |
| "epoch": 0.9824561403508771, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0003616265021264553, | |
| "loss/value_avg": 0.6015822291374207, | |
| "lr": 9.619626168224299e-06, | |
| "objective/entropy": 21.12371063232422, | |
| "objective/kl": 186.5103302001953, | |
| "objective/non_score_reward": -9.325516700744629, | |
| "objective/rlhf_reward": -7.817704200744629, | |
| "objective/scores": 1.5078125, | |
| "policy/approxkl_avg": 6.682015419006348, | |
| "policy/clipfrac_avg": 0.13443395495414734, | |
| "policy/entropy_avg": 0.7727504372596741, | |
| "step": 35, | |
| "val/clipfrac_avg": 0.037146229296922684, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8780511021614075, | |
| "val/ratio_var": 6.755981303285807e-05 | |
| }, | |
| { | |
| "episode": 576, | |
| "epoch": 1.0105263157894737, | |
| "eps": 3, | |
| "loss/policy_avg": 0.009338408708572388, | |
| "loss/value_avg": 0.6681860685348511, | |
| "lr": 9.48785046728972e-06, | |
| "objective/entropy": 21.184894561767578, | |
| "objective/kl": 192.3019256591797, | |
| "objective/non_score_reward": -9.615096092224121, | |
| "objective/rlhf_reward": -7.443221092224121, | |
| "objective/scores": 2.171875, | |
| "policy/approxkl_avg": 6.053627967834473, | |
| "policy/clipfrac_avg": 0.1179245263338089, | |
| "policy/entropy_avg": 0.7489176392555237, | |
| "step": 36, | |
| "val/clipfrac_avg": 0.15683962404727936, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8922820091247559, | |
| "val/ratio_var": 0.00026253468240611255 | |
| }, | |
| { | |
| "episode": 592, | |
| "epoch": 1.03859649122807, | |
| "eps": 3, | |
| "loss/policy_avg": 0.006294197402894497, | |
| "loss/value_avg": 0.8030184507369995, | |
| "lr": 9.35607476635514e-06, | |
| "objective/entropy": 20.368942260742188, | |
| "objective/kl": 190.313232421875, | |
| "objective/non_score_reward": -9.51566219329834, | |
| "objective/rlhf_reward": -8.02347469329834, | |
| "objective/scores": 1.4921875, | |
| "policy/approxkl_avg": 6.838142395019531, | |
| "policy/clipfrac_avg": 0.11615566164255142, | |
| "policy/entropy_avg": 0.7397478222846985, | |
| "step": 37, | |
| "val/clipfrac_avg": 0.003537735901772976, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8849948644638062, | |
| "val/ratio_var": 8.990589412860572e-05 | |
| }, | |
| { | |
| "episode": 608, | |
| "epoch": 1.0666666666666667, | |
| "eps": 3, | |
| "loss/policy_avg": 0.016940509900450706, | |
| "loss/value_avg": 0.5882998704910278, | |
| "lr": 9.22429906542056e-06, | |
| "objective/entropy": 23.183269500732422, | |
| "objective/kl": 186.7628631591797, | |
| "objective/non_score_reward": -9.338143348693848, | |
| "objective/rlhf_reward": -7.564705848693848, | |
| "objective/scores": 1.7734375, | |
| "policy/approxkl_avg": 3.9737157821655273, | |
| "policy/clipfrac_avg": 0.13089622557163239, | |
| "policy/entropy_avg": 0.7805944681167603, | |
| "step": 38, | |
| "val/clipfrac_avg": 0.004127358552068472, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8967232704162598, | |
| "val/ratio_var": 0.00010195688810199499 | |
| }, | |
| { | |
| "episode": 624, | |
| "epoch": 1.0947368421052632, | |
| "eps": 3, | |
| "loss/policy_avg": 0.011485239490866661, | |
| "loss/value_avg": 0.5787074565887451, | |
| "lr": 9.092523364485981e-06, | |
| "objective/entropy": 17.85469627380371, | |
| "objective/kl": 187.50631713867188, | |
| "objective/non_score_reward": -9.375316619873047, | |
| "objective/rlhf_reward": -7.203441619873047, | |
| "objective/scores": 2.171875, | |
| "policy/approxkl_avg": 5.274375915527344, | |
| "policy/clipfrac_avg": 0.11556603759527206, | |
| "policy/entropy_avg": 0.7150323390960693, | |
| "step": 39, | |
| "val/clipfrac_avg": 0.16509434580802917, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8858749866485596, | |
| "val/ratio_var": 0.0001809587120078504 | |
| }, | |
| { | |
| "episode": 640, | |
| "epoch": 1.1228070175438596, | |
| "eps": 3, | |
| "loss/policy_avg": 0.017689252272248268, | |
| "loss/value_avg": 0.7021454572677612, | |
| "lr": 8.960747663551402e-06, | |
| "objective/entropy": 21.994365692138672, | |
| "objective/kl": 182.05810546875, | |
| "objective/non_score_reward": -9.1029052734375, | |
| "objective/rlhf_reward": -6.8841552734375, | |
| "objective/scores": 2.21875, | |
| "policy/approxkl_avg": 4.578630447387695, | |
| "policy/clipfrac_avg": 0.12558962404727936, | |
| "policy/entropy_avg": 0.7208189964294434, | |
| "step": 40, | |
| "val/clipfrac_avg": 0.06426886469125748, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8957177400588989, | |
| "val/ratio_var": 7.126481068553403e-05 | |
| }, | |
| { | |
| "episode": 656, | |
| "epoch": 1.1508771929824562, | |
| "eps": 3, | |
| "loss/policy_avg": 0.005923585034906864, | |
| "loss/value_avg": 0.5532969236373901, | |
| "lr": 8.828971962616822e-06, | |
| "objective/entropy": 18.056703567504883, | |
| "objective/kl": 170.11863708496094, | |
| "objective/non_score_reward": -8.505931854248047, | |
| "objective/rlhf_reward": -6.380931854248047, | |
| "objective/scores": 2.125, | |
| "policy/approxkl_avg": 3.3685710430145264, | |
| "policy/clipfrac_avg": 0.12028301507234573, | |
| "policy/entropy_avg": 0.6859033107757568, | |
| "step": 41, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8874868154525757, | |
| "val/ratio_var": 4.332972093834542e-05 | |
| }, | |
| { | |
| "episode": 672, | |
| "epoch": 1.1789473684210525, | |
| "eps": 3, | |
| "loss/policy_avg": 0.012987145222723484, | |
| "loss/value_avg": 0.632028341293335, | |
| "lr": 8.697196261682243e-06, | |
| "objective/entropy": 19.865787506103516, | |
| "objective/kl": 177.42971801757812, | |
| "objective/non_score_reward": -8.871485710144043, | |
| "objective/rlhf_reward": -6.527735710144043, | |
| "objective/scores": 2.34375, | |
| "policy/approxkl_avg": 6.186136245727539, | |
| "policy/clipfrac_avg": 0.13325470685958862, | |
| "policy/entropy_avg": 0.6923173069953918, | |
| "step": 42, | |
| "val/clipfrac_avg": 0.125, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8991193771362305, | |
| "val/ratio_var": 0.0009607934043742716 | |
| }, | |
| { | |
| "episode": 688, | |
| "epoch": 1.207017543859649, | |
| "eps": 3, | |
| "loss/policy_avg": 0.011711956933140755, | |
| "loss/value_avg": 0.4474850296974182, | |
| "lr": 8.565420560747664e-06, | |
| "objective/entropy": 19.79934310913086, | |
| "objective/kl": 176.83395385742188, | |
| "objective/non_score_reward": -8.84169864654541, | |
| "objective/rlhf_reward": -7.02138614654541, | |
| "objective/scores": 1.8203125, | |
| "policy/approxkl_avg": 5.904331207275391, | |
| "policy/clipfrac_avg": 0.13089622557163239, | |
| "policy/entropy_avg": 0.6940422654151917, | |
| "step": 43, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8869062066078186, | |
| "val/ratio_var": 0.0008940807892940938 | |
| }, | |
| { | |
| "episode": 704, | |
| "epoch": 1.2350877192982457, | |
| "eps": 3, | |
| "loss/policy_avg": 0.014619714580476284, | |
| "loss/value_avg": 0.625725507736206, | |
| "lr": 8.433644859813084e-06, | |
| "objective/entropy": 22.385128021240234, | |
| "objective/kl": 181.2716064453125, | |
| "objective/non_score_reward": -9.063579559326172, | |
| "objective/rlhf_reward": -7.493267059326172, | |
| "objective/scores": 1.5703125, | |
| "policy/approxkl_avg": 4.966976165771484, | |
| "policy/clipfrac_avg": 0.14799527823925018, | |
| "policy/entropy_avg": 0.7825338840484619, | |
| "step": 44, | |
| "val/clipfrac_avg": 0.14681604504585266, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.883050799369812, | |
| "val/ratio_var": 1.7675925846560858e-05 | |
| }, | |
| { | |
| "episode": 720, | |
| "epoch": 1.263157894736842, | |
| "eps": 3, | |
| "loss/policy_avg": 0.024550937116146088, | |
| "loss/value_avg": 0.7726404070854187, | |
| "lr": 8.301869158878505e-06, | |
| "objective/entropy": 19.87116050720215, | |
| "objective/kl": 172.38674926757812, | |
| "objective/non_score_reward": -8.619338035583496, | |
| "objective/rlhf_reward": -6.572463035583496, | |
| "objective/scores": 2.046875, | |
| "policy/approxkl_avg": 4.95554256439209, | |
| "policy/clipfrac_avg": 0.13089622557163239, | |
| "policy/entropy_avg": 0.6934086680412292, | |
| "step": 45, | |
| "val/clipfrac_avg": 0.009433962404727936, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8879209756851196, | |
| "val/ratio_var": 0.0003393842780496925 | |
| }, | |
| { | |
| "episode": 736, | |
| "epoch": 1.2912280701754386, | |
| "eps": 3, | |
| "loss/policy_avg": 0.019363895058631897, | |
| "loss/value_avg": 0.5529497861862183, | |
| "lr": 8.170093457943925e-06, | |
| "objective/entropy": 17.47795295715332, | |
| "objective/kl": 174.22576904296875, | |
| "objective/non_score_reward": -8.711288452148438, | |
| "objective/rlhf_reward": -6.3206634521484375, | |
| "objective/scores": 2.390625, | |
| "policy/approxkl_avg": 3.8177051544189453, | |
| "policy/clipfrac_avg": 0.11851415038108826, | |
| "policy/entropy_avg": 0.6579099297523499, | |
| "step": 46, | |
| "val/clipfrac_avg": 0.001179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9020024538040161, | |
| "val/ratio_var": 0.00011536524834809825 | |
| }, | |
| { | |
| "episode": 752, | |
| "epoch": 1.3192982456140352, | |
| "eps": 3, | |
| "loss/policy_avg": 0.012542858719825745, | |
| "loss/value_avg": 0.6548900604248047, | |
| "lr": 8.038317757009346e-06, | |
| "objective/entropy": 17.823394775390625, | |
| "objective/kl": 171.56602478027344, | |
| "objective/non_score_reward": -8.578301429748535, | |
| "objective/rlhf_reward": -6.828301429748535, | |
| "objective/scores": 1.75, | |
| "policy/approxkl_avg": 5.147519111633301, | |
| "policy/clipfrac_avg": 0.14563679695129395, | |
| "policy/entropy_avg": 0.6417368650436401, | |
| "step": 47, | |
| "val/clipfrac_avg": 0.004127358552068472, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8848338723182678, | |
| "val/ratio_var": 0.000510354817379266 | |
| }, | |
| { | |
| "episode": 768, | |
| "epoch": 1.3473684210526315, | |
| "eps": 3, | |
| "loss/policy_avg": 0.013358336873352528, | |
| "loss/value_avg": 0.3859623968601227, | |
| "lr": 7.906542056074766e-06, | |
| "objective/entropy": 16.128374099731445, | |
| "objective/kl": 168.9840087890625, | |
| "objective/non_score_reward": -8.449200630187988, | |
| "objective/rlhf_reward": -6.847638130187988, | |
| "objective/scores": 1.6015625, | |
| "policy/approxkl_avg": 4.781813144683838, | |
| "policy/clipfrac_avg": 0.12028302252292633, | |
| "policy/entropy_avg": 0.5887731313705444, | |
| "step": 48, | |
| "val/clipfrac_avg": 0.01179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9012677669525146, | |
| "val/ratio_var": 0.00041765146306715906 | |
| }, | |
| { | |
| "episode": 784, | |
| "epoch": 1.3754385964912281, | |
| "eps": 3, | |
| "loss/policy_avg": 0.017281489446759224, | |
| "loss/value_avg": 0.5614770650863647, | |
| "lr": 7.774766355140187e-06, | |
| "objective/entropy": 20.177621841430664, | |
| "objective/kl": 166.16497802734375, | |
| "objective/non_score_reward": -8.308249473571777, | |
| "objective/rlhf_reward": -7.081686973571777, | |
| "objective/scores": 1.2265625, | |
| "policy/approxkl_avg": 3.1813056468963623, | |
| "policy/clipfrac_avg": 0.1432783007621765, | |
| "policy/entropy_avg": 0.7520714998245239, | |
| "step": 49, | |
| "val/clipfrac_avg": 0.1845518946647644, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8848077654838562, | |
| "val/ratio_var": 0.0005292592104524374 | |
| }, | |
| { | |
| "episode": 800, | |
| "epoch": 1.4035087719298245, | |
| "eps": 3, | |
| "loss/policy_avg": 0.015007663518190384, | |
| "loss/value_avg": 0.918763279914856, | |
| "lr": 7.642990654205608e-06, | |
| "objective/entropy": 17.82724380493164, | |
| "objective/kl": 179.90628051757812, | |
| "objective/non_score_reward": -8.99531364440918, | |
| "objective/rlhf_reward": -7.37812614440918, | |
| "objective/scores": 1.6171875, | |
| "policy/approxkl_avg": 5.0764312744140625, | |
| "policy/clipfrac_avg": 0.1149764209985733, | |
| "policy/entropy_avg": 0.6353041529655457, | |
| "step": 50, | |
| "val/clipfrac_avg": 0.014740565791726112, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9066751599311829, | |
| "val/ratio_var": 0.00023197307018563151 | |
| }, | |
| { | |
| "episode": 816, | |
| "epoch": 1.431578947368421, | |
| "eps": 3, | |
| "loss/policy_avg": 0.013827439397573471, | |
| "loss/value_avg": 0.6559504270553589, | |
| "lr": 7.511214953271027e-06, | |
| "objective/entropy": 17.98678207397461, | |
| "objective/kl": 173.85345458984375, | |
| "objective/non_score_reward": -8.692672729492188, | |
| "objective/rlhf_reward": -7.0911102294921875, | |
| "objective/scores": 1.6015625, | |
| "policy/approxkl_avg": 4.863851547241211, | |
| "policy/clipfrac_avg": 0.12382075190544128, | |
| "policy/entropy_avg": 0.6412214040756226, | |
| "step": 51, | |
| "val/clipfrac_avg": 0.03419811278581619, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8822081089019775, | |
| "val/ratio_var": 5.379146841733018e-06 | |
| }, | |
| { | |
| "episode": 832, | |
| "epoch": 1.4596491228070176, | |
| "eps": 3, | |
| "loss/policy_avg": 0.009551126509904861, | |
| "loss/value_avg": 0.46615320444107056, | |
| "lr": 7.379439252336448e-06, | |
| "objective/entropy": 14.611004829406738, | |
| "objective/kl": 169.49464416503906, | |
| "objective/non_score_reward": -8.4747314453125, | |
| "objective/rlhf_reward": -7.0137939453125, | |
| "objective/scores": 1.4609375, | |
| "policy/approxkl_avg": 4.765644073486328, | |
| "policy/clipfrac_avg": 0.09964622557163239, | |
| "policy/entropy_avg": 0.5645979642868042, | |
| "step": 52, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9012112021446228, | |
| "val/ratio_var": 1.3516472790797707e-05 | |
| }, | |
| { | |
| "episode": 848, | |
| "epoch": 1.487719298245614, | |
| "eps": 3, | |
| "loss/policy_avg": 0.009903261438012123, | |
| "loss/value_avg": 1.024169683456421, | |
| "lr": 7.2476635514018685e-06, | |
| "objective/entropy": 16.012420654296875, | |
| "objective/kl": 173.99917602539062, | |
| "objective/non_score_reward": -8.699958801269531, | |
| "objective/rlhf_reward": -6.371833801269531, | |
| "objective/scores": 2.328125, | |
| "policy/approxkl_avg": 4.889924049377441, | |
| "policy/clipfrac_avg": 0.11261792480945587, | |
| "policy/entropy_avg": 0.5950597524642944, | |
| "step": 53, | |
| "val/clipfrac_avg": 0.20518869161605835, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.8948594331741333, | |
| "val/ratio_var": 6.705896521452814e-05 | |
| }, | |
| { | |
| "episode": 864, | |
| "epoch": 1.5157894736842106, | |
| "eps": 3, | |
| "loss/policy_avg": -0.007397271227091551, | |
| "loss/value_avg": 0.4296436011791229, | |
| "lr": 7.115887850467289e-06, | |
| "objective/entropy": 12.315929412841797, | |
| "objective/kl": 175.46511840820312, | |
| "objective/non_score_reward": -8.773256301879883, | |
| "objective/rlhf_reward": -6.570131301879883, | |
| "objective/scores": 2.203125, | |
| "policy/approxkl_avg": 5.532078742980957, | |
| "policy/clipfrac_avg": 0.09375, | |
| "policy/entropy_avg": 0.5138251781463623, | |
| "step": 54, | |
| "val/clipfrac_avg": 0.003537735901772976, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9130541086196899, | |
| "val/ratio_var": 8.332962897839025e-05 | |
| }, | |
| { | |
| "episode": 880, | |
| "epoch": 1.543859649122807, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0055680545046925545, | |
| "loss/value_avg": 0.44329357147216797, | |
| "lr": 6.9841121495327106e-06, | |
| "objective/entropy": 13.753085136413574, | |
| "objective/kl": 162.1046905517578, | |
| "objective/non_score_reward": -8.10523509979248, | |
| "objective/rlhf_reward": -6.1052350997924805, | |
| "objective/scores": 2.0, | |
| "policy/approxkl_avg": 4.2778730392456055, | |
| "policy/clipfrac_avg": 0.10200471431016922, | |
| "policy/entropy_avg": 0.538252592086792, | |
| "step": 55, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.89923495054245, | |
| "val/ratio_var": 0.00013667276652995497 | |
| }, | |
| { | |
| "episode": 896, | |
| "epoch": 1.5719298245614035, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0029763877391815186, | |
| "loss/value_avg": 0.5969923734664917, | |
| "lr": 6.852336448598131e-06, | |
| "objective/entropy": 10.386423110961914, | |
| "objective/kl": 170.64817810058594, | |
| "objective/non_score_reward": -8.53240966796875, | |
| "objective/rlhf_reward": -5.84490966796875, | |
| "objective/scores": 2.6875, | |
| "policy/approxkl_avg": 5.515145301818848, | |
| "policy/clipfrac_avg": 0.0695754736661911, | |
| "policy/entropy_avg": 0.4759911596775055, | |
| "step": 56, | |
| "val/clipfrac_avg": 0.22051887214183807, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9136029481887817, | |
| "val/ratio_var": 0.00015939133299980313 | |
| }, | |
| { | |
| "episode": 912, | |
| "epoch": 1.6, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0002519981935620308, | |
| "loss/value_avg": 0.6188120245933533, | |
| "lr": 6.720560747663552e-06, | |
| "objective/entropy": 9.047847747802734, | |
| "objective/kl": 162.95162963867188, | |
| "objective/non_score_reward": -8.147581100463867, | |
| "objective/rlhf_reward": -5.835081100463867, | |
| "objective/scores": 2.3125, | |
| "policy/approxkl_avg": 5.942928314208984, | |
| "policy/clipfrac_avg": 0.06721697747707367, | |
| "policy/entropy_avg": 0.43892478942871094, | |
| "step": 57, | |
| "val/clipfrac_avg": 0.03242924436926842, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9175019264221191, | |
| "val/ratio_var": 2.5528926926199347e-05 | |
| }, | |
| { | |
| "episode": 928, | |
| "epoch": 1.6280701754385964, | |
| "eps": 3, | |
| "loss/policy_avg": -0.004241641610860825, | |
| "loss/value_avg": 0.6380342245101929, | |
| "lr": 6.588785046728972e-06, | |
| "objective/entropy": 10.172576904296875, | |
| "objective/kl": 172.64210510253906, | |
| "objective/non_score_reward": -8.632105827331543, | |
| "objective/rlhf_reward": -6.085230827331543, | |
| "objective/scores": 2.546875, | |
| "policy/approxkl_avg": 5.1512861251831055, | |
| "policy/clipfrac_avg": 0.09669811278581619, | |
| "policy/entropy_avg": 0.44444799423217773, | |
| "step": 58, | |
| "val/clipfrac_avg": 0.00294811325147748, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9051207304000854, | |
| "val/ratio_var": 9.685073746368289e-05 | |
| }, | |
| { | |
| "episode": 944, | |
| "epoch": 1.656140350877193, | |
| "eps": 3, | |
| "loss/policy_avg": 0.005844447761774063, | |
| "loss/value_avg": 0.46530038118362427, | |
| "lr": 6.457009345794393e-06, | |
| "objective/entropy": 11.34018611907959, | |
| "objective/kl": 167.05087280273438, | |
| "objective/non_score_reward": -8.352543830871582, | |
| "objective/rlhf_reward": -5.368168830871582, | |
| "objective/scores": 2.984375, | |
| "policy/approxkl_avg": 4.73173713684082, | |
| "policy/clipfrac_avg": 0.06898584961891174, | |
| "policy/entropy_avg": 0.4987587630748749, | |
| "step": 59, | |
| "val/clipfrac_avg": 0.003537735901772976, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9096221327781677, | |
| "val/ratio_var": 0.0002903940330725163 | |
| }, | |
| { | |
| "episode": 960, | |
| "epoch": 1.6842105263157894, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0015796682564541698, | |
| "loss/value_avg": 0.5465973615646362, | |
| "lr": 6.3252336448598135e-06, | |
| "objective/entropy": 10.832345962524414, | |
| "objective/kl": 166.35125732421875, | |
| "objective/non_score_reward": -8.317562103271484, | |
| "objective/rlhf_reward": -5.114437103271484, | |
| "objective/scores": 3.203125, | |
| "policy/approxkl_avg": 4.080867767333984, | |
| "policy/clipfrac_avg": 0.08726415038108826, | |
| "policy/entropy_avg": 0.46615108847618103, | |
| "step": 60, | |
| "val/clipfrac_avg": 0.018278302624821663, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9084208011627197, | |
| "val/ratio_var": 1.8292890672455542e-05 | |
| }, | |
| { | |
| "episode": 976, | |
| "epoch": 1.712280701754386, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0016184533014893532, | |
| "loss/value_avg": 0.6316072344779968, | |
| "lr": 6.193457943925234e-06, | |
| "objective/entropy": 9.0885648727417, | |
| "objective/kl": 172.646240234375, | |
| "objective/non_score_reward": -8.632311820983887, | |
| "objective/rlhf_reward": -5.194811820983887, | |
| "objective/scores": 3.4375, | |
| "policy/approxkl_avg": 4.502593994140625, | |
| "policy/clipfrac_avg": 0.06603773683309555, | |
| "policy/entropy_avg": 0.41100969910621643, | |
| "step": 61, | |
| "val/clipfrac_avg": 0.044811319559812546, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9256702661514282, | |
| "val/ratio_var": 7.894221198512241e-05 | |
| }, | |
| { | |
| "episode": 992, | |
| "epoch": 1.7403508771929825, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0019415542483329773, | |
| "loss/value_avg": 0.6046911478042603, | |
| "lr": 6.061682242990655e-06, | |
| "objective/entropy": 9.12926197052002, | |
| "objective/kl": 169.4315185546875, | |
| "objective/non_score_reward": -8.471575736999512, | |
| "objective/rlhf_reward": -5.424700736999512, | |
| "objective/scores": 3.046875, | |
| "policy/approxkl_avg": 5.609973907470703, | |
| "policy/clipfrac_avg": 0.09198112785816193, | |
| "policy/entropy_avg": 0.4236205816268921, | |
| "step": 62, | |
| "val/clipfrac_avg": 0.001768867950886488, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9198966026306152, | |
| "val/ratio_var": 6.228529673535377e-05 | |
| }, | |
| { | |
| "episode": 1008, | |
| "epoch": 1.768421052631579, | |
| "eps": 3, | |
| "loss/policy_avg": -0.007835395634174347, | |
| "loss/value_avg": 0.6853305697441101, | |
| "lr": 5.929906542056075e-06, | |
| "objective/entropy": 8.566083908081055, | |
| "objective/kl": 163.68191528320312, | |
| "objective/non_score_reward": -8.18409538269043, | |
| "objective/rlhf_reward": -4.09034538269043, | |
| "objective/scores": 4.09375, | |
| "policy/approxkl_avg": 3.7664973735809326, | |
| "policy/clipfrac_avg": 0.07429245114326477, | |
| "policy/entropy_avg": 0.41426771879196167, | |
| "step": 63, | |
| "val/clipfrac_avg": 0.007665094453841448, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9395467042922974, | |
| "val/ratio_var": 0.00018259203352499753 | |
| }, | |
| { | |
| "episode": 1024, | |
| "epoch": 1.7964912280701755, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0056846365332603455, | |
| "loss/value_avg": 0.8050791621208191, | |
| "lr": 5.798130841121495e-06, | |
| "objective/entropy": 7.867904186248779, | |
| "objective/kl": 176.44961547851562, | |
| "objective/non_score_reward": -8.822481155395508, | |
| "objective/rlhf_reward": -4.931856155395508, | |
| "objective/scores": 3.890625, | |
| "policy/approxkl_avg": 4.615470886230469, | |
| "policy/clipfrac_avg": 0.07016509771347046, | |
| "policy/entropy_avg": 0.40076911449432373, | |
| "step": 64, | |
| "val/clipfrac_avg": 0.1179245263338089, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9168256521224976, | |
| "val/ratio_var": 1.1071170774812344e-05 | |
| }, | |
| { | |
| "episode": 1040, | |
| "epoch": 1.8245614035087718, | |
| "eps": 3, | |
| "loss/policy_avg": -0.004829235374927521, | |
| "loss/value_avg": 0.7683409452438354, | |
| "lr": 5.666355140186916e-06, | |
| "objective/entropy": 8.73065185546875, | |
| "objective/kl": 165.93441772460938, | |
| "objective/non_score_reward": -8.296720504760742, | |
| "objective/rlhf_reward": -4.531095504760742, | |
| "objective/scores": 3.765625, | |
| "policy/approxkl_avg": 4.037623882293701, | |
| "policy/clipfrac_avg": 0.0625, | |
| "policy/entropy_avg": 0.38483142852783203, | |
| "step": 65, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9364030361175537, | |
| "val/ratio_var": 0.0001283105229958892 | |
| }, | |
| { | |
| "episode": 1056, | |
| "epoch": 1.8526315789473684, | |
| "eps": 3, | |
| "loss/policy_avg": -0.002082128543406725, | |
| "loss/value_avg": 0.8781827688217163, | |
| "lr": 5.534579439252336e-06, | |
| "objective/entropy": 6.81689977645874, | |
| "objective/kl": 173.76760864257812, | |
| "objective/non_score_reward": -8.688380241394043, | |
| "objective/rlhf_reward": -5.454005241394043, | |
| "objective/scores": 3.234375, | |
| "policy/approxkl_avg": 5.1825032234191895, | |
| "policy/clipfrac_avg": 0.07488207519054413, | |
| "policy/entropy_avg": 0.3771995007991791, | |
| "step": 66, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9298049211502075, | |
| "val/ratio_var": 0.00015954735863488168 | |
| }, | |
| { | |
| "episode": 1072, | |
| "epoch": 1.880701754385965, | |
| "eps": 3, | |
| "loss/policy_avg": 0.005034355446696281, | |
| "loss/value_avg": 1.0226874351501465, | |
| "lr": 5.402803738317757e-06, | |
| "objective/entropy": 5.308557510375977, | |
| "objective/kl": 171.6015167236328, | |
| "objective/non_score_reward": -8.580076217651367, | |
| "objective/rlhf_reward": -4.236326217651367, | |
| "objective/scores": 4.34375, | |
| "policy/approxkl_avg": 5.336367607116699, | |
| "policy/clipfrac_avg": 0.04599056765437126, | |
| "policy/entropy_avg": 0.34400177001953125, | |
| "step": 67, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9246993064880371, | |
| "val/ratio_var": 9.672338592281449e-07 | |
| }, | |
| { | |
| "episode": 1088, | |
| "epoch": 1.9087719298245613, | |
| "eps": 3, | |
| "loss/policy_avg": 0.023275576531887054, | |
| "loss/value_avg": 0.6750494241714478, | |
| "lr": 5.271028037383177e-06, | |
| "objective/entropy": 7.23941707611084, | |
| "objective/kl": 166.45547485351562, | |
| "objective/non_score_reward": -8.322773933410645, | |
| "objective/rlhf_reward": -4.6352739334106445, | |
| "objective/scores": 3.6875, | |
| "policy/approxkl_avg": 3.1369752883911133, | |
| "policy/clipfrac_avg": 0.05837263911962509, | |
| "policy/entropy_avg": 0.3996211886405945, | |
| "step": 68, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9343756437301636, | |
| "val/ratio_var": 0.00011849942529806867 | |
| }, | |
| { | |
| "episode": 1104, | |
| "epoch": 1.936842105263158, | |
| "eps": 3, | |
| "loss/policy_avg": 0.001583978533744812, | |
| "loss/value_avg": 0.7364473342895508, | |
| "lr": 5.139252336448598e-06, | |
| "objective/entropy": 8.292254447937012, | |
| "objective/kl": 174.10446166992188, | |
| "objective/non_score_reward": -8.705223083496094, | |
| "objective/rlhf_reward": -4.517723083496094, | |
| "objective/scores": 4.1875, | |
| "policy/approxkl_avg": 5.407079696655273, | |
| "policy/clipfrac_avg": 0.06780660152435303, | |
| "policy/entropy_avg": 0.3910168409347534, | |
| "step": 69, | |
| "val/clipfrac_avg": 0.001179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9262620210647583, | |
| "val/ratio_var": 8.509035978931934e-06 | |
| }, | |
| { | |
| "episode": 1120, | |
| "epoch": 1.9649122807017543, | |
| "eps": 3, | |
| "loss/policy_avg": 0.014011572115123272, | |
| "loss/value_avg": 0.49188750982284546, | |
| "lr": 5.0074766355140185e-06, | |
| "objective/entropy": 4.73923397064209, | |
| "objective/kl": 170.3909912109375, | |
| "objective/non_score_reward": -8.519549369812012, | |
| "objective/rlhf_reward": -4.535174369812012, | |
| "objective/scores": 3.984375, | |
| "policy/approxkl_avg": 4.553505897521973, | |
| "policy/clipfrac_avg": 0.04658018797636032, | |
| "policy/entropy_avg": 0.314146488904953, | |
| "step": 70, | |
| "val/clipfrac_avg": 0.001768867950886488, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9345220327377319, | |
| "val/ratio_var": 0.00011590120993787423 | |
| }, | |
| { | |
| "episode": 1136, | |
| "epoch": 1.9929824561403509, | |
| "eps": 3, | |
| "loss/policy_avg": 0.014443885535001755, | |
| "loss/value_avg": 0.8583539724349976, | |
| "lr": 4.875700934579439e-06, | |
| "objective/entropy": 6.110556602478027, | |
| "objective/kl": 168.1246337890625, | |
| "objective/non_score_reward": -8.406231880187988, | |
| "objective/rlhf_reward": -4.781231880187988, | |
| "objective/scores": 3.625, | |
| "policy/approxkl_avg": 3.3112387657165527, | |
| "policy/clipfrac_avg": 0.04716981202363968, | |
| "policy/entropy_avg": 0.3741912841796875, | |
| "step": 71, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9311293363571167, | |
| "val/ratio_var": 1.7129657862824388e-05 | |
| }, | |
| { | |
| "episode": 1152, | |
| "epoch": 2.0210526315789474, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0069357771426439285, | |
| "loss/value_avg": 0.6024092435836792, | |
| "lr": 4.74392523364486e-06, | |
| "objective/entropy": 1.9080017805099487, | |
| "objective/kl": 175.54367065429688, | |
| "objective/non_score_reward": -8.777183532714844, | |
| "objective/rlhf_reward": -3.7771835327148438, | |
| "objective/scores": 5.0, | |
| "policy/approxkl_avg": 6.433887004852295, | |
| "policy/clipfrac_avg": 0.03655660152435303, | |
| "policy/entropy_avg": 0.2685927748680115, | |
| "step": 72, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9363144040107727, | |
| "val/ratio_var": 2.7767631763708778e-05 | |
| }, | |
| { | |
| "episode": 1168, | |
| "epoch": 2.049122807017544, | |
| "eps": 3, | |
| "loss/policy_avg": 0.004744451027363539, | |
| "loss/value_avg": 0.6521505117416382, | |
| "lr": 4.61214953271028e-06, | |
| "objective/entropy": 2.584568500518799, | |
| "objective/kl": 171.61709594726562, | |
| "objective/non_score_reward": -8.580854415893555, | |
| "objective/rlhf_reward": -3.1121044158935547, | |
| "objective/scores": 5.46875, | |
| "policy/approxkl_avg": 4.509120941162109, | |
| "policy/clipfrac_avg": 0.03478773683309555, | |
| "policy/entropy_avg": 0.2757822573184967, | |
| "step": 73, | |
| "val/clipfrac_avg": 0.2900943160057068, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9448769092559814, | |
| "val/ratio_var": 1.143478584708646e-05 | |
| }, | |
| { | |
| "episode": 1184, | |
| "epoch": 2.07719298245614, | |
| "eps": 3, | |
| "loss/policy_avg": 0.004101068712770939, | |
| "loss/value_avg": 0.44417738914489746, | |
| "lr": 4.480373831775701e-06, | |
| "objective/entropy": 3.265643835067749, | |
| "objective/kl": 179.89352416992188, | |
| "objective/non_score_reward": -8.99467658996582, | |
| "objective/rlhf_reward": -4.11967658996582, | |
| "objective/scores": 4.875, | |
| "policy/approxkl_avg": 5.798920154571533, | |
| "policy/clipfrac_avg": 0.028891509398818016, | |
| "policy/entropy_avg": 0.29206639528274536, | |
| "step": 74, | |
| "val/clipfrac_avg": 0.05188679322600365, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9267533421516418, | |
| "val/ratio_var": 0.0001095838742912747 | |
| }, | |
| { | |
| "episode": 1200, | |
| "epoch": 2.1052631578947367, | |
| "eps": 3, | |
| "loss/policy_avg": 0.004760343115776777, | |
| "loss/value_avg": 0.3549901843070984, | |
| "lr": 4.3485981308411215e-06, | |
| "objective/entropy": 2.9447989463806152, | |
| "objective/kl": 175.41961669921875, | |
| "objective/non_score_reward": -8.770980834960938, | |
| "objective/rlhf_reward": -3.2084808349609375, | |
| "objective/scores": 5.5625, | |
| "policy/approxkl_avg": 4.80606746673584, | |
| "policy/clipfrac_avg": 0.03419811278581619, | |
| "policy/entropy_avg": 0.30916815996170044, | |
| "step": 75, | |
| "val/clipfrac_avg": 0.004127358552068472, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9180092215538025, | |
| "val/ratio_var": 2.3069829694577493e-05 | |
| }, | |
| { | |
| "episode": 1216, | |
| "epoch": 2.1333333333333333, | |
| "eps": 3, | |
| "loss/policy_avg": 0.010298425331711769, | |
| "loss/value_avg": 0.15927723050117493, | |
| "lr": 4.216822429906542e-06, | |
| "objective/entropy": 1.4227180480957031, | |
| "objective/kl": 176.0067138671875, | |
| "objective/non_score_reward": -8.800336837768555, | |
| "objective/rlhf_reward": -2.6753368377685547, | |
| "objective/scores": 6.125, | |
| "policy/approxkl_avg": 4.99057149887085, | |
| "policy/clipfrac_avg": 0.028301887214183807, | |
| "policy/entropy_avg": 0.2751670479774475, | |
| "step": 76, | |
| "val/clipfrac_avg": 0.10495282709598541, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9201045036315918, | |
| "val/ratio_var": 5.366753157431958e-06 | |
| }, | |
| { | |
| "episode": 1232, | |
| "epoch": 2.16140350877193, | |
| "eps": 3, | |
| "loss/policy_avg": -0.005462624132633209, | |
| "loss/value_avg": 0.28704196214675903, | |
| "lr": 4.085046728971963e-06, | |
| "objective/entropy": 1.6171071529388428, | |
| "objective/kl": 176.83685302734375, | |
| "objective/non_score_reward": -8.841842651367188, | |
| "objective/rlhf_reward": -3.1543426513671875, | |
| "objective/scores": 5.6875, | |
| "policy/approxkl_avg": 5.847208023071289, | |
| "policy/clipfrac_avg": 0.028891509398818016, | |
| "policy/entropy_avg": 0.286138117313385, | |
| "step": 77, | |
| "val/clipfrac_avg": 0.07075471431016922, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9195102453231812, | |
| "val/ratio_var": 9.577343917044345e-06 | |
| }, | |
| { | |
| "episode": 1248, | |
| "epoch": 2.1894736842105265, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0010141655802726746, | |
| "loss/value_avg": 0.8408201932907104, | |
| "lr": 3.953271028037383e-06, | |
| "objective/entropy": 7.40260124206543, | |
| "objective/kl": 177.4427490234375, | |
| "objective/non_score_reward": -8.872137069702148, | |
| "objective/rlhf_reward": -4.903387069702148, | |
| "objective/scores": 3.96875, | |
| "policy/approxkl_avg": 5.285105228424072, | |
| "policy/clipfrac_avg": 0.04658018797636032, | |
| "policy/entropy_avg": 0.4253733158111572, | |
| "step": 78, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9203585982322693, | |
| "val/ratio_var": 1.65566543728346e-05 | |
| }, | |
| { | |
| "episode": 1264, | |
| "epoch": 2.2175438596491226, | |
| "eps": 3, | |
| "loss/policy_avg": -0.004624534398317337, | |
| "loss/value_avg": 0.7719740271568298, | |
| "lr": 3.821495327102804e-06, | |
| "objective/entropy": 4.648886203765869, | |
| "objective/kl": 181.51986694335938, | |
| "objective/non_score_reward": -9.075994491577148, | |
| "objective/rlhf_reward": -4.700994491577148, | |
| "objective/scores": 4.375, | |
| "policy/approxkl_avg": 4.547338485717773, | |
| "policy/clipfrac_avg": 0.0383254736661911, | |
| "policy/entropy_avg": 0.3513880968093872, | |
| "step": 79, | |
| "val/clipfrac_avg": 0.1291273534297943, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9286638498306274, | |
| "val/ratio_var": 6.422147998819128e-05 | |
| }, | |
| { | |
| "episode": 1280, | |
| "epoch": 2.245614035087719, | |
| "eps": 3, | |
| "loss/policy_avg": 0.012380128726363182, | |
| "loss/value_avg": 0.40563684701919556, | |
| "lr": 3.689719626168224e-06, | |
| "objective/entropy": 7.685408115386963, | |
| "objective/kl": 168.90484619140625, | |
| "objective/non_score_reward": -8.445242881774902, | |
| "objective/rlhf_reward": -3.4452428817749023, | |
| "objective/scores": 5.0, | |
| "policy/approxkl_avg": 3.4143970012664795, | |
| "policy/clipfrac_avg": 0.041273586452007294, | |
| "policy/entropy_avg": 0.4100227355957031, | |
| "step": 80, | |
| "val/clipfrac_avg": 0.015330187976360321, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9217012524604797, | |
| "val/ratio_var": 7.061174983391538e-05 | |
| }, | |
| { | |
| "episode": 1296, | |
| "epoch": 2.2736842105263158, | |
| "eps": 3, | |
| "loss/policy_avg": 0.011339722201228142, | |
| "loss/value_avg": 0.3490160405635834, | |
| "lr": 3.5579439252336446e-06, | |
| "objective/entropy": 4.046834945678711, | |
| "objective/kl": 177.92718505859375, | |
| "objective/non_score_reward": -8.89635944366455, | |
| "objective/rlhf_reward": -3.958859443664551, | |
| "objective/scores": 4.9375, | |
| "policy/approxkl_avg": 5.583766460418701, | |
| "policy/clipfrac_avg": 0.03478773683309555, | |
| "policy/entropy_avg": 0.3193933963775635, | |
| "step": 81, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9363265037536621, | |
| "val/ratio_var": 5.8069101214641705e-05 | |
| }, | |
| { | |
| "episode": 1312, | |
| "epoch": 2.3017543859649123, | |
| "eps": 3, | |
| "loss/policy_avg": 0.007465606089681387, | |
| "loss/value_avg": 0.3137081563472748, | |
| "lr": 3.4261682242990656e-06, | |
| "objective/entropy": 3.293423652648926, | |
| "objective/kl": 173.18377685546875, | |
| "objective/non_score_reward": -8.659189224243164, | |
| "objective/rlhf_reward": -3.065439224243164, | |
| "objective/scores": 5.59375, | |
| "policy/approxkl_avg": 4.794089317321777, | |
| "policy/clipfrac_avg": 0.0383254736661911, | |
| "policy/entropy_avg": 0.29685914516448975, | |
| "step": 82, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9458816647529602, | |
| "val/ratio_var": 0.00023432180751115084 | |
| }, | |
| { | |
| "episode": 1328, | |
| "epoch": 2.329824561403509, | |
| "eps": 3, | |
| "loss/policy_avg": -0.00093865767121315, | |
| "loss/value_avg": 0.9402576684951782, | |
| "lr": 3.294392523364486e-06, | |
| "objective/entropy": 5.09280252456665, | |
| "objective/kl": 173.88351440429688, | |
| "objective/non_score_reward": -8.694175720214844, | |
| "objective/rlhf_reward": -4.866050720214844, | |
| "objective/scores": 3.828125, | |
| "policy/approxkl_avg": 3.8168904781341553, | |
| "policy/clipfrac_avg": 0.03891509398818016, | |
| "policy/entropy_avg": 0.35750845074653625, | |
| "step": 83, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9232673645019531, | |
| "val/ratio_var": 4.1539384255884215e-05 | |
| }, | |
| { | |
| "episode": 1344, | |
| "epoch": 2.357894736842105, | |
| "eps": 3, | |
| "loss/policy_avg": -0.007357731461524963, | |
| "loss/value_avg": 0.36178284883499146, | |
| "lr": 3.1626168224299067e-06, | |
| "objective/entropy": 5.281716346740723, | |
| "objective/kl": 179.2125701904297, | |
| "objective/non_score_reward": -8.960628509521484, | |
| "objective/rlhf_reward": -3.9918785095214844, | |
| "objective/scores": 4.96875, | |
| "policy/approxkl_avg": 4.461269378662109, | |
| "policy/clipfrac_avg": 0.05365566164255142, | |
| "policy/entropy_avg": 0.35694169998168945, | |
| "step": 84, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9178085923194885, | |
| "val/ratio_var": 4.949720823788084e-05 | |
| }, | |
| { | |
| "episode": 1360, | |
| "epoch": 2.3859649122807016, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0004696398973464966, | |
| "loss/value_avg": 0.30143094062805176, | |
| "lr": 3.0308411214953273e-06, | |
| "objective/entropy": 2.755769729614258, | |
| "objective/kl": 173.08140563964844, | |
| "objective/non_score_reward": -8.654069900512695, | |
| "objective/rlhf_reward": -2.9665699005126953, | |
| "objective/scores": 5.6875, | |
| "policy/approxkl_avg": 5.356992721557617, | |
| "policy/clipfrac_avg": 0.03242924436926842, | |
| "policy/entropy_avg": 0.282896488904953, | |
| "step": 85, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9419326782226562, | |
| "val/ratio_var": 3.9359238144243136e-05 | |
| }, | |
| { | |
| "episode": 1376, | |
| "epoch": 2.414035087719298, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0008706599473953247, | |
| "loss/value_avg": 0.5158276557922363, | |
| "lr": 2.8990654205607475e-06, | |
| "objective/entropy": 4.149250507354736, | |
| "objective/kl": 173.71505737304688, | |
| "objective/non_score_reward": -8.685752868652344, | |
| "objective/rlhf_reward": -3.6857528686523438, | |
| "objective/scores": 5.0, | |
| "policy/approxkl_avg": 5.095344066619873, | |
| "policy/clipfrac_avg": 0.030070755630731583, | |
| "policy/entropy_avg": 0.3076534867286682, | |
| "step": 86, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.93389892578125, | |
| "val/ratio_var": 9.108168342208955e-06 | |
| }, | |
| { | |
| "episode": 1392, | |
| "epoch": 2.442105263157895, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0013641200494021177, | |
| "loss/value_avg": 0.46665364503860474, | |
| "lr": 2.767289719626168e-06, | |
| "objective/entropy": 3.9847404956817627, | |
| "objective/kl": 171.97903442382812, | |
| "objective/non_score_reward": -8.59895133972168, | |
| "objective/rlhf_reward": -3.4114513397216797, | |
| "objective/scores": 5.1875, | |
| "policy/approxkl_avg": 4.758839130401611, | |
| "policy/clipfrac_avg": 0.02771226316690445, | |
| "policy/entropy_avg": 0.3068329691886902, | |
| "step": 87, | |
| "val/clipfrac_avg": 0.002358490601181984, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9303795099258423, | |
| "val/ratio_var": 2.7705931643140502e-05 | |
| }, | |
| { | |
| "episode": 1408, | |
| "epoch": 2.4701754385964914, | |
| "eps": 3, | |
| "loss/policy_avg": -0.009293105453252792, | |
| "loss/value_avg": 0.1374308168888092, | |
| "lr": 2.6355140186915887e-06, | |
| "objective/entropy": 2.8504319190979004, | |
| "objective/kl": 178.8887176513672, | |
| "objective/non_score_reward": -8.944437026977539, | |
| "objective/rlhf_reward": -2.975687026977539, | |
| "objective/scores": 5.96875, | |
| "policy/approxkl_avg": 5.254701614379883, | |
| "policy/clipfrac_avg": 0.026533018797636032, | |
| "policy/entropy_avg": 0.2935040593147278, | |
| "step": 88, | |
| "val/clipfrac_avg": 0.002358490601181984, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9373711943626404, | |
| "val/ratio_var": 1.766591776686255e-05 | |
| }, | |
| { | |
| "episode": 1424, | |
| "epoch": 2.498245614035088, | |
| "eps": 3, | |
| "loss/policy_avg": 0.00495288148522377, | |
| "loss/value_avg": 0.20061969757080078, | |
| "lr": 2.5037383177570093e-06, | |
| "objective/entropy": 4.51104211807251, | |
| "objective/kl": 169.7410125732422, | |
| "objective/non_score_reward": -8.487051010131836, | |
| "objective/rlhf_reward": -2.768301010131836, | |
| "objective/scores": 5.71875, | |
| "policy/approxkl_avg": 4.481791019439697, | |
| "policy/clipfrac_avg": 0.03478773683309555, | |
| "policy/entropy_avg": 0.3265501856803894, | |
| "step": 89, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9414163827896118, | |
| "val/ratio_var": 0.00011632378300419077 | |
| }, | |
| { | |
| "episode": 1440, | |
| "epoch": 2.526315789473684, | |
| "eps": 3, | |
| "loss/policy_avg": 0.00646105594933033, | |
| "loss/value_avg": 0.42740941047668457, | |
| "lr": 2.37196261682243e-06, | |
| "objective/entropy": 3.2403650283813477, | |
| "objective/kl": 176.238037109375, | |
| "objective/non_score_reward": -8.811902046203613, | |
| "objective/rlhf_reward": -3.9994020462036133, | |
| "objective/scores": 4.8125, | |
| "policy/approxkl_avg": 5.44842529296875, | |
| "policy/clipfrac_avg": 0.01591981202363968, | |
| "policy/entropy_avg": 0.2933845520019531, | |
| "step": 90, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9381667971611023, | |
| "val/ratio_var": 9.369335020892322e-05 | |
| }, | |
| { | |
| "episode": 1456, | |
| "epoch": 2.5543859649122806, | |
| "eps": 3, | |
| "loss/policy_avg": -0.005389541387557983, | |
| "loss/value_avg": 0.4948211908340454, | |
| "lr": 2.2401869158878504e-06, | |
| "objective/entropy": 2.898387908935547, | |
| "objective/kl": 173.48486328125, | |
| "objective/non_score_reward": -8.674242973327637, | |
| "objective/rlhf_reward": -3.5179929733276367, | |
| "objective/scores": 5.15625, | |
| "policy/approxkl_avg": 4.66801643371582, | |
| "policy/clipfrac_avg": 0.020636793226003647, | |
| "policy/entropy_avg": 0.2913670837879181, | |
| "step": 91, | |
| "val/clipfrac_avg": 0.001179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9378049373626709, | |
| "val/ratio_var": 1.2327662261668593e-05 | |
| }, | |
| { | |
| "episode": 1472, | |
| "epoch": 2.5824561403508772, | |
| "eps": 3, | |
| "loss/policy_avg": -0.010267895646393299, | |
| "loss/value_avg": 0.26834648847579956, | |
| "lr": 2.108411214953271e-06, | |
| "objective/entropy": 4.616816997528076, | |
| "objective/kl": 171.12762451171875, | |
| "objective/non_score_reward": -8.556382179260254, | |
| "objective/rlhf_reward": -3.587632179260254, | |
| "objective/scores": 4.96875, | |
| "policy/approxkl_avg": 4.146580219268799, | |
| "policy/clipfrac_avg": 0.041273586452007294, | |
| "policy/entropy_avg": 0.34417253732681274, | |
| "step": 92, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9241670370101929, | |
| "val/ratio_var": 4.9057460273616016e-05 | |
| }, | |
| { | |
| "episode": 1488, | |
| "epoch": 2.610526315789474, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0006395354866981506, | |
| "loss/value_avg": 0.7872554063796997, | |
| "lr": 1.9766355140186916e-06, | |
| "objective/entropy": 5.483046531677246, | |
| "objective/kl": 169.1695098876953, | |
| "objective/non_score_reward": -8.458476066589355, | |
| "objective/rlhf_reward": -4.4741010665893555, | |
| "objective/scores": 3.984375, | |
| "policy/approxkl_avg": 2.8852078914642334, | |
| "policy/clipfrac_avg": 0.032429248094558716, | |
| "policy/entropy_avg": 0.35312554240226746, | |
| "step": 93, | |
| "val/clipfrac_avg": 0.001768867950886488, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9286659955978394, | |
| "val/ratio_var": 2.8370095606078394e-05 | |
| }, | |
| { | |
| "episode": 1504, | |
| "epoch": 2.6385964912280704, | |
| "eps": 3, | |
| "loss/policy_avg": -0.00652042031288147, | |
| "loss/value_avg": 0.17014235258102417, | |
| "lr": 1.844859813084112e-06, | |
| "objective/entropy": 2.737617015838623, | |
| "objective/kl": 178.22747802734375, | |
| "objective/non_score_reward": -8.911375045776367, | |
| "objective/rlhf_reward": -3.286375045776367, | |
| "objective/scores": 5.625, | |
| "policy/approxkl_avg": 5.498225688934326, | |
| "policy/clipfrac_avg": 0.028891511261463165, | |
| "policy/entropy_avg": 0.28995558619499207, | |
| "step": 94, | |
| "val/clipfrac_avg": 0.001768867950886488, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9345540404319763, | |
| "val/ratio_var": 1.2709216434814152e-06 | |
| }, | |
| { | |
| "episode": 1520, | |
| "epoch": 2.6666666666666665, | |
| "eps": 3, | |
| "loss/policy_avg": 0.007195580750703812, | |
| "loss/value_avg": 0.40153437852859497, | |
| "lr": 1.7130841121495328e-06, | |
| "objective/entropy": 4.30942440032959, | |
| "objective/kl": 176.95938110351562, | |
| "objective/non_score_reward": -8.847970008850098, | |
| "objective/rlhf_reward": -3.8479700088500977, | |
| "objective/scores": 5.0, | |
| "policy/approxkl_avg": 5.267421245574951, | |
| "policy/clipfrac_avg": 0.03125, | |
| "policy/entropy_avg": 0.31008654832839966, | |
| "step": 95, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9336869716644287, | |
| "val/ratio_var": 4.966981941834092e-05 | |
| }, | |
| { | |
| "episode": 1536, | |
| "epoch": 2.694736842105263, | |
| "eps": 3, | |
| "loss/policy_avg": 0.012591801583766937, | |
| "loss/value_avg": 0.3597390055656433, | |
| "lr": 1.5813084112149534e-06, | |
| "objective/entropy": 5.459916591644287, | |
| "objective/kl": 172.44110107421875, | |
| "objective/non_score_reward": -8.622055053710938, | |
| "objective/rlhf_reward": -3.6220550537109375, | |
| "objective/scores": 5.0, | |
| "policy/approxkl_avg": 4.5339765548706055, | |
| "policy/clipfrac_avg": 0.03537736088037491, | |
| "policy/entropy_avg": 0.3411254286766052, | |
| "step": 96, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9462149739265442, | |
| "val/ratio_var": 3.8459929783130065e-05 | |
| }, | |
| { | |
| "episode": 1552, | |
| "epoch": 2.7228070175438597, | |
| "eps": 3, | |
| "loss/policy_avg": -0.003356472123414278, | |
| "loss/value_avg": 0.6434417963027954, | |
| "lr": 1.4495327102803737e-06, | |
| "objective/entropy": 5.633913516998291, | |
| "objective/kl": 172.26502990722656, | |
| "objective/non_score_reward": -8.613250732421875, | |
| "objective/rlhf_reward": -4.363250732421875, | |
| "objective/scores": 4.25, | |
| "policy/approxkl_avg": 3.585165500640869, | |
| "policy/clipfrac_avg": 0.03537735715508461, | |
| "policy/entropy_avg": 0.34199586510658264, | |
| "step": 97, | |
| "val/clipfrac_avg": 0.001179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9311625957489014, | |
| "val/ratio_var": 6.935850979061797e-05 | |
| }, | |
| { | |
| "episode": 1568, | |
| "epoch": 2.7508771929824563, | |
| "eps": 3, | |
| "loss/policy_avg": -0.003898909315466881, | |
| "loss/value_avg": 0.36550819873809814, | |
| "lr": 1.3177570093457943e-06, | |
| "objective/entropy": 4.281040191650391, | |
| "objective/kl": 174.15972900390625, | |
| "objective/non_score_reward": -8.707986831665039, | |
| "objective/rlhf_reward": -3.614236831665039, | |
| "objective/scores": 5.09375, | |
| "policy/approxkl_avg": 5.1715850830078125, | |
| "policy/clipfrac_avg": 0.02712264284491539, | |
| "policy/entropy_avg": 0.3137935698032379, | |
| "step": 98, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9350335597991943, | |
| "val/ratio_var": 1.248767239303561e-05 | |
| }, | |
| { | |
| "episode": 1584, | |
| "epoch": 2.778947368421053, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0017306804656982422, | |
| "loss/value_avg": 0.2737918496131897, | |
| "lr": 1.185981308411215e-06, | |
| "objective/entropy": 5.210065841674805, | |
| "objective/kl": 173.97068786621094, | |
| "objective/non_score_reward": -8.69853401184082, | |
| "objective/rlhf_reward": -3.8860340118408203, | |
| "objective/scores": 4.8125, | |
| "policy/approxkl_avg": 5.011469841003418, | |
| "policy/clipfrac_avg": 0.04304245114326477, | |
| "policy/entropy_avg": 0.3419041931629181, | |
| "step": 99, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9419320225715637, | |
| "val/ratio_var": 8.726042324269656e-06 | |
| }, | |
| { | |
| "episode": 1600, | |
| "epoch": 2.807017543859649, | |
| "eps": 3, | |
| "loss/policy_avg": -0.006221463903784752, | |
| "loss/value_avg": 0.3625496029853821, | |
| "lr": 1.0542056074766355e-06, | |
| "objective/entropy": 3.721562623977661, | |
| "objective/kl": 175.773193359375, | |
| "objective/non_score_reward": -8.78865909576416, | |
| "objective/rlhf_reward": -3.47615909576416, | |
| "objective/scores": 5.3125, | |
| "policy/approxkl_avg": 5.388751029968262, | |
| "policy/clipfrac_avg": 0.03419811278581619, | |
| "policy/entropy_avg": 0.29315799474716187, | |
| "step": 100, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.949840784072876, | |
| "val/ratio_var": 3.881535303662531e-05 | |
| }, | |
| { | |
| "episode": 1616, | |
| "epoch": 2.8350877192982455, | |
| "eps": 3, | |
| "loss/policy_avg": -0.005170758813619614, | |
| "loss/value_avg": 0.4136154055595398, | |
| "lr": 9.22429906542056e-07, | |
| "objective/entropy": 5.907715320587158, | |
| "objective/kl": 171.47390747070312, | |
| "objective/non_score_reward": -8.573695182800293, | |
| "objective/rlhf_reward": -4.167445182800293, | |
| "objective/scores": 4.40625, | |
| "policy/approxkl_avg": 5.403087615966797, | |
| "policy/clipfrac_avg": 0.03478773683309555, | |
| "policy/entropy_avg": 0.33564049005508423, | |
| "step": 101, | |
| "val/clipfrac_avg": 0.000589622650295496, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9340347051620483, | |
| "val/ratio_var": 3.0960076401242986e-05 | |
| }, | |
| { | |
| "episode": 1632, | |
| "epoch": 2.863157894736842, | |
| "eps": 3, | |
| "loss/policy_avg": 0.002457182854413986, | |
| "loss/value_avg": 0.27742013335227966, | |
| "lr": 7.906542056074767e-07, | |
| "objective/entropy": 5.222499370574951, | |
| "objective/kl": 176.05380249023438, | |
| "objective/non_score_reward": -8.802690505981445, | |
| "objective/rlhf_reward": -3.6151905059814453, | |
| "objective/scores": 5.1875, | |
| "policy/approxkl_avg": 4.675132751464844, | |
| "policy/clipfrac_avg": 0.04304245114326477, | |
| "policy/entropy_avg": 0.3403066396713257, | |
| "step": 102, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9404515027999878, | |
| "val/ratio_var": 7.289678615052253e-05 | |
| }, | |
| { | |
| "episode": 1648, | |
| "epoch": 2.8912280701754387, | |
| "eps": 3, | |
| "loss/policy_avg": -0.006544323638081551, | |
| "loss/value_avg": 0.29731422662734985, | |
| "lr": 6.588785046728972e-07, | |
| "objective/entropy": 4.219725608825684, | |
| "objective/kl": 178.1557159423828, | |
| "objective/non_score_reward": -8.90778636932373, | |
| "objective/rlhf_reward": -3.8452863693237305, | |
| "objective/scores": 5.0625, | |
| "policy/approxkl_avg": 5.953890800476074, | |
| "policy/clipfrac_avg": 0.0383254699409008, | |
| "policy/entropy_avg": 0.31199511885643005, | |
| "step": 103, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9407525658607483, | |
| "val/ratio_var": 5.8047560742124915e-05 | |
| }, | |
| { | |
| "episode": 1664, | |
| "epoch": 2.9192982456140353, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0011880630627274513, | |
| "loss/value_avg": 0.21903052926063538, | |
| "lr": 5.271028037383178e-07, | |
| "objective/entropy": 5.557653427124023, | |
| "objective/kl": 170.518798828125, | |
| "objective/non_score_reward": -8.52593994140625, | |
| "objective/rlhf_reward": -3.05718994140625, | |
| "objective/scores": 5.46875, | |
| "policy/approxkl_avg": 4.447786331176758, | |
| "policy/clipfrac_avg": 0.0383254699409008, | |
| "policy/entropy_avg": 0.33431151509284973, | |
| "step": 104, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9298925399780273, | |
| "val/ratio_var": 9.144405339611694e-06 | |
| }, | |
| { | |
| "episode": 1680, | |
| "epoch": 2.9473684210526314, | |
| "eps": 3, | |
| "loss/policy_avg": -0.0007513905875384808, | |
| "loss/value_avg": 0.21037007868289948, | |
| "lr": 3.9532710280373834e-07, | |
| "objective/entropy": 3.900575876235962, | |
| "objective/kl": 174.99456787109375, | |
| "objective/non_score_reward": -8.74972915649414, | |
| "objective/rlhf_reward": -3.5309791564941406, | |
| "objective/scores": 5.21875, | |
| "policy/approxkl_avg": 5.165627479553223, | |
| "policy/clipfrac_avg": 0.028301887214183807, | |
| "policy/entropy_avg": 0.2935909032821655, | |
| "step": 105, | |
| "val/clipfrac_avg": 0.001179245300590992, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9422957897186279, | |
| "val/ratio_var": 2.6614558009896427e-05 | |
| }, | |
| { | |
| "episode": 1696, | |
| "epoch": 2.975438596491228, | |
| "eps": 3, | |
| "loss/policy_avg": -0.005426734685897827, | |
| "loss/value_avg": 0.21496959030628204, | |
| "lr": 2.635514018691589e-07, | |
| "objective/entropy": 4.556634902954102, | |
| "objective/kl": 173.05136108398438, | |
| "objective/non_score_reward": -8.652568817138672, | |
| "objective/rlhf_reward": -2.933818817138672, | |
| "objective/scores": 5.71875, | |
| "policy/approxkl_avg": 4.820314884185791, | |
| "policy/clipfrac_avg": 0.04304245486855507, | |
| "policy/entropy_avg": 0.31966692209243774, | |
| "step": 106, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9327390193939209, | |
| "val/ratio_var": 8.508096652803943e-05 | |
| }, | |
| { | |
| "episode": 1712, | |
| "epoch": 3.0035087719298246, | |
| "eps": 3, | |
| "loss/policy_avg": 0.0001406269147992134, | |
| "loss/value_avg": 0.186610609292984, | |
| "lr": 1.3177570093457944e-07, | |
| "objective/entropy": 4.999897003173828, | |
| "objective/kl": 173.25045776367188, | |
| "objective/non_score_reward": -8.66252326965332, | |
| "objective/rlhf_reward": -2.9437732696533203, | |
| "objective/scores": 5.71875, | |
| "policy/approxkl_avg": 4.337066173553467, | |
| "policy/clipfrac_avg": 0.04716981202363968, | |
| "policy/entropy_avg": 0.3558363914489746, | |
| "step": 107, | |
| "val/clipfrac_avg": 0.0, | |
| "val/num_eos_tokens": 0, | |
| "val/ratio": 0.9243938326835632, | |
| "val/ratio_var": 0.000128799001686275 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 107, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3.0, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0, | |
| "train_batch_size": null, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |