| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9976019184652278, | |
| "eval_steps": 500, | |
| "global_step": 208, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 912.9860992431641, | |
| "epoch": 0.004796163069544364, | |
| "grad_norm": 0.12673589773378027, | |
| "kl": 0.0, | |
| "learning_rate": 4.7619047619047613e-08, | |
| "loss": 0.0232, | |
| "reward": 0.6874999850988388, | |
| "reward_std": 0.3423890396952629, | |
| "rewards/accuracy_reward": 0.2500000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 883.1944580078125, | |
| "epoch": 0.009592326139088728, | |
| "grad_norm": 0.116076838638148, | |
| "kl": 0.0, | |
| "learning_rate": 9.523809523809523e-08, | |
| "loss": 0.0136, | |
| "reward": 0.6423611119389534, | |
| "reward_std": 0.3120992071926594, | |
| "rewards/accuracy_reward": 0.243055559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3993055522441864, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 851.4652862548828, | |
| "epoch": 0.014388489208633094, | |
| "grad_norm": 0.12982224030488893, | |
| "kl": 2.41696834564209e-05, | |
| "learning_rate": 1.4285714285714285e-07, | |
| "loss": 0.0211, | |
| "reward": 0.7118055671453476, | |
| "reward_std": 0.3277251161634922, | |
| "rewards/accuracy_reward": 0.2708333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4409722238779068, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 877.0138854980469, | |
| "epoch": 0.019184652278177457, | |
| "grad_norm": 0.12194604446708072, | |
| "kl": 2.086162567138672e-05, | |
| "learning_rate": 1.9047619047619045e-07, | |
| "loss": 0.0285, | |
| "reward": 0.6909722238779068, | |
| "reward_std": 0.31570543721318245, | |
| "rewards/accuracy_reward": 0.2847222248092294, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.40625, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 884.4236297607422, | |
| "epoch": 0.023980815347721823, | |
| "grad_norm": 0.13261607584021887, | |
| "kl": 3.56137752532959e-05, | |
| "learning_rate": 2.3809523809523806e-07, | |
| "loss": 0.0312, | |
| "reward": 0.626736119389534, | |
| "reward_std": 0.2724486030638218, | |
| "rewards/accuracy_reward": 0.23611111007630825, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250074505806, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 825.4166870117188, | |
| "epoch": 0.02877697841726619, | |
| "grad_norm": 0.12101637274777749, | |
| "kl": 2.5272369384765625e-05, | |
| "learning_rate": 2.857142857142857e-07, | |
| "loss": 0.022, | |
| "reward": 0.817708358168602, | |
| "reward_std": 0.3249164782464504, | |
| "rewards/accuracy_reward": 0.361111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4565972313284874, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 898.826416015625, | |
| "epoch": 0.03357314148681055, | |
| "grad_norm": 0.13410726618357155, | |
| "kl": 3.2961368560791016e-05, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 0.017, | |
| "reward": 0.6961805522441864, | |
| "reward_std": 0.281472560018301, | |
| "rewards/accuracy_reward": 0.2916666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4045138880610466, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 868.3472290039062, | |
| "epoch": 0.03836930455635491, | |
| "grad_norm": 0.14428783414562169, | |
| "kl": 3.2767653465270996e-05, | |
| "learning_rate": 3.809523809523809e-07, | |
| "loss": 0.0364, | |
| "reward": 0.7777777761220932, | |
| "reward_std": 0.38908588513731956, | |
| "rewards/accuracy_reward": 0.34722222574055195, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.430555559694767, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 922.3125152587891, | |
| "epoch": 0.04316546762589928, | |
| "grad_norm": 0.14202941889032566, | |
| "kl": 3.263354301452637e-05, | |
| "learning_rate": 4.285714285714285e-07, | |
| "loss": 0.0361, | |
| "reward": 0.524305559694767, | |
| "reward_std": 0.23212899640202522, | |
| "rewards/accuracy_reward": 0.15972222574055195, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3645833432674408, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 914.1597290039062, | |
| "epoch": 0.047961630695443645, | |
| "grad_norm": 0.12987837517343923, | |
| "kl": 3.269314765930176e-05, | |
| "learning_rate": 4.761904761904761e-07, | |
| "loss": 0.0316, | |
| "reward": 0.5885416567325592, | |
| "reward_std": 0.2799038216471672, | |
| "rewards/accuracy_reward": 0.19444444822147489, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3940972238779068, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 935.5347137451172, | |
| "epoch": 0.05275779376498801, | |
| "grad_norm": 0.12123557233064179, | |
| "kl": 2.6702880859375e-05, | |
| "learning_rate": 5.238095238095238e-07, | |
| "loss": 0.0268, | |
| "reward": 0.579861119389534, | |
| "reward_std": 0.2995072081685066, | |
| "rewards/accuracy_reward": 0.20833333395421505, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3715277761220932, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 902.8472290039062, | |
| "epoch": 0.05755395683453238, | |
| "grad_norm": 0.1263644550467175, | |
| "kl": 1.638941466808319e-05, | |
| "learning_rate": 5.714285714285714e-07, | |
| "loss": 0.0182, | |
| "reward": 0.5815972238779068, | |
| "reward_std": 0.26795749366283417, | |
| "rewards/accuracy_reward": 0.18055555410683155, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.401041679084301, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 910.9583435058594, | |
| "epoch": 0.06235011990407674, | |
| "grad_norm": 0.13348819214381502, | |
| "kl": 3.2275915145874023e-05, | |
| "learning_rate": 6.19047619047619e-07, | |
| "loss": 0.033, | |
| "reward": 0.5833333507180214, | |
| "reward_std": 0.27443326637148857, | |
| "rewards/accuracy_reward": 0.18750000186264515, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333432674408, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 837.5764007568359, | |
| "epoch": 0.0671462829736211, | |
| "grad_norm": 0.12534297846344097, | |
| "kl": 2.5704503059387207e-05, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0398, | |
| "reward": 0.6614583432674408, | |
| "reward_std": 0.25699039548635483, | |
| "rewards/accuracy_reward": 0.2291666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4322916716337204, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 870.8958435058594, | |
| "epoch": 0.07194244604316546, | |
| "grad_norm": 0.1178768584334791, | |
| "kl": 1.9222497940063477e-05, | |
| "learning_rate": 7.142857142857143e-07, | |
| "loss": 0.0223, | |
| "reward": 0.6701388955116272, | |
| "reward_std": 0.25698356330394745, | |
| "rewards/accuracy_reward": 0.25694444589316845, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4131944477558136, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 855.7222137451172, | |
| "epoch": 0.07673860911270983, | |
| "grad_norm": 0.15574941119385063, | |
| "kl": 2.9087066650390625e-05, | |
| "learning_rate": 7.619047619047618e-07, | |
| "loss": 0.0337, | |
| "reward": 0.7760416865348816, | |
| "reward_std": 0.4277946427464485, | |
| "rewards/accuracy_reward": 0.3194444440305233, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4565972313284874, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 869.6944427490234, | |
| "epoch": 0.0815347721822542, | |
| "grad_norm": 0.1258833956177519, | |
| "kl": 3.331899642944336e-05, | |
| "learning_rate": 8.095238095238095e-07, | |
| "loss": 0.0129, | |
| "reward": 0.6701388955116272, | |
| "reward_std": 0.30363673344254494, | |
| "rewards/accuracy_reward": 0.2847222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3854166716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 874.4097137451172, | |
| "epoch": 0.08633093525179857, | |
| "grad_norm": 0.1315962441453753, | |
| "kl": 2.0717590814456344e-05, | |
| "learning_rate": 8.57142857142857e-07, | |
| "loss": 0.0301, | |
| "reward": 0.6163194552063942, | |
| "reward_std": 0.2553598415106535, | |
| "rewards/accuracy_reward": 0.21527777705341578, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4010416716337204, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 815.0555572509766, | |
| "epoch": 0.09112709832134293, | |
| "grad_norm": 0.14298620074456062, | |
| "kl": 3.075599670410156e-05, | |
| "learning_rate": 9.047619047619047e-07, | |
| "loss": 0.0405, | |
| "reward": 0.763888880610466, | |
| "reward_std": 0.2983681969344616, | |
| "rewards/accuracy_reward": 0.3263888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375000074505806, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 862.4861145019531, | |
| "epoch": 0.09592326139088729, | |
| "grad_norm": 0.13958983684341053, | |
| "kl": 3.007054328918457e-05, | |
| "learning_rate": 9.523809523809522e-07, | |
| "loss": 0.0179, | |
| "reward": 0.6788194626569748, | |
| "reward_std": 0.2541828490793705, | |
| "rewards/accuracy_reward": 0.24305555783212185, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4357638880610466, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 891.1597290039062, | |
| "epoch": 0.10071942446043165, | |
| "grad_norm": 0.14460083396808562, | |
| "kl": 5.367398262023926e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0206, | |
| "reward": 0.6215277761220932, | |
| "reward_std": 0.2927175499498844, | |
| "rewards/accuracy_reward": 0.22222222574055195, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.399305559694767, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 859.0, | |
| "epoch": 0.10551558752997602, | |
| "grad_norm": 0.13266204171177207, | |
| "kl": 7.867813110351562e-05, | |
| "learning_rate": 9.999364977905849e-07, | |
| "loss": 0.0212, | |
| "reward": 0.7864583432674408, | |
| "reward_std": 0.329488068819046, | |
| "rewards/accuracy_reward": 0.3472222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4392361119389534, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 841.5069427490234, | |
| "epoch": 0.11031175059952038, | |
| "grad_norm": 0.1368070463424762, | |
| "kl": 0.00011658668518066406, | |
| "learning_rate": 9.99746009084698e-07, | |
| "loss": 0.0389, | |
| "reward": 0.7725694477558136, | |
| "reward_std": 0.3147674612700939, | |
| "rewards/accuracy_reward": 0.3402777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.432291679084301, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 887.5139007568359, | |
| "epoch": 0.11510791366906475, | |
| "grad_norm": 0.1458860079179632, | |
| "kl": 0.00013267993927001953, | |
| "learning_rate": 9.994285876443557e-07, | |
| "loss": 0.0341, | |
| "reward": 0.626736119389534, | |
| "reward_std": 0.2819124907255173, | |
| "rewards/accuracy_reward": 0.2361111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3906250074505806, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 873.8611145019531, | |
| "epoch": 0.11990407673860912, | |
| "grad_norm": 0.130318186151234, | |
| "kl": 0.00016427040100097656, | |
| "learning_rate": 9.989843230560593e-07, | |
| "loss": 0.0384, | |
| "reward": 0.6493055671453476, | |
| "reward_std": 0.28573132678866386, | |
| "rewards/accuracy_reward": 0.22222222574055195, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4270833358168602, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 820.6111145019531, | |
| "epoch": 0.12470023980815348, | |
| "grad_norm": 0.1353493867926934, | |
| "kl": 0.00028514862060546875, | |
| "learning_rate": 9.984133407055104e-07, | |
| "loss": 0.0057, | |
| "reward": 0.7204861342906952, | |
| "reward_std": 0.27508755773305893, | |
| "rewards/accuracy_reward": 0.2777777798473835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4427083358168602, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 903.1041717529297, | |
| "epoch": 0.12949640287769784, | |
| "grad_norm": 0.13147174706101974, | |
| "kl": 0.0002446174621582031, | |
| "learning_rate": 9.97715801742224e-07, | |
| "loss": 0.043, | |
| "reward": 0.6232638880610466, | |
| "reward_std": 0.2895628921687603, | |
| "rewards/accuracy_reward": 0.2361111156642437, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3871527835726738, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 870.1736145019531, | |
| "epoch": 0.1342925659472422, | |
| "grad_norm": 0.12627875749194337, | |
| "kl": 0.0002665519714355469, | |
| "learning_rate": 9.968919030340457e-07, | |
| "loss": 0.0277, | |
| "reward": 0.756944477558136, | |
| "reward_std": 0.3117631673812866, | |
| "rewards/accuracy_reward": 0.3194444477558136, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 896.8680725097656, | |
| "epoch": 0.13908872901678657, | |
| "grad_norm": 0.13847019160477245, | |
| "kl": 0.00043702125549316406, | |
| "learning_rate": 9.959418771115903e-07, | |
| "loss": 0.0286, | |
| "reward": 0.5954861268401146, | |
| "reward_std": 0.268420971930027, | |
| "rewards/accuracy_reward": 0.180555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.414930559694767, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 802.3194580078125, | |
| "epoch": 0.14388489208633093, | |
| "grad_norm": 0.14967482022344253, | |
| "kl": 0.0006508827209472656, | |
| "learning_rate": 9.948659921026139e-07, | |
| "loss": 0.0318, | |
| "reward": 0.8125000149011612, | |
| "reward_std": 0.4278785213828087, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4583333283662796, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 794.6875, | |
| "epoch": 0.1486810551558753, | |
| "grad_norm": 0.17944099601126384, | |
| "kl": 0.0007243156433105469, | |
| "learning_rate": 9.936645516563387e-07, | |
| "loss": 0.0583, | |
| "reward": 0.8663194477558136, | |
| "reward_std": 0.35557055473327637, | |
| "rewards/accuracy_reward": 0.4027777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4635416716337204, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 880.7361145019531, | |
| "epoch": 0.15347721822541965, | |
| "grad_norm": 0.12117489678150584, | |
| "kl": 0.0006771087646484375, | |
| "learning_rate": 9.923378948577558e-07, | |
| "loss": 0.0401, | |
| "reward": 0.6406250074505806, | |
| "reward_std": 0.26150013506412506, | |
| "rewards/accuracy_reward": 0.23611111473292112, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4045138880610466, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 834.5555725097656, | |
| "epoch": 0.15827338129496402, | |
| "grad_norm": 0.1537599572741959, | |
| "kl": 0.00096893310546875, | |
| "learning_rate": 9.908863961319219e-07, | |
| "loss": 0.0342, | |
| "reward": 0.861111119389534, | |
| "reward_std": 0.37204235792160034, | |
| "rewards/accuracy_reward": 0.3958333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4652777835726738, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 850.9791870117188, | |
| "epoch": 0.1630695443645084, | |
| "grad_norm": 0.1605541412094071, | |
| "kl": 0.0012502670288085938, | |
| "learning_rate": 9.893104651382861e-07, | |
| "loss": 0.055, | |
| "reward": 0.8003472238779068, | |
| "reward_std": 0.3042585700750351, | |
| "rewards/accuracy_reward": 0.3819444514811039, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4184027835726738, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 851.0069580078125, | |
| "epoch": 0.16786570743405277, | |
| "grad_norm": 0.13513924603619376, | |
| "kl": 0.001659393310546875, | |
| "learning_rate": 9.876105466550707e-07, | |
| "loss": 0.0509, | |
| "reward": 0.7881944477558136, | |
| "reward_std": 0.3145363964140415, | |
| "rewards/accuracy_reward": 0.3680555671453476, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4201388880610466, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 878.7361297607422, | |
| "epoch": 0.17266187050359713, | |
| "grad_norm": 0.1542251680807794, | |
| "kl": 0.0016040802001953125, | |
| "learning_rate": 9.857871204537401e-07, | |
| "loss": 0.0544, | |
| "reward": 0.6944444477558136, | |
| "reward_std": 0.2541184388101101, | |
| "rewards/accuracy_reward": 0.305555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3888888955116272, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 830.6805572509766, | |
| "epoch": 0.1774580335731415, | |
| "grad_norm": 0.13514283713713865, | |
| "kl": 0.0020580291748046875, | |
| "learning_rate": 9.838407011635942e-07, | |
| "loss": 0.0246, | |
| "reward": 0.8350694626569748, | |
| "reward_std": 0.29664015769958496, | |
| "rewards/accuracy_reward": 0.3958333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4392361119389534, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 896.7152862548828, | |
| "epoch": 0.18225419664268586, | |
| "grad_norm": 0.12350709906557038, | |
| "kl": 0.0016126632690429688, | |
| "learning_rate": 9.817718381265238e-07, | |
| "loss": 0.0437, | |
| "reward": 0.6128472238779068, | |
| "reward_std": 0.31635782122612, | |
| "rewards/accuracy_reward": 0.2291666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3836805522441864, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 864.9444580078125, | |
| "epoch": 0.18705035971223022, | |
| "grad_norm": 0.12426066518400752, | |
| "kl": 0.0020294189453125, | |
| "learning_rate": 9.795811152419678e-07, | |
| "loss": 0.0301, | |
| "reward": 0.7291666716337204, | |
| "reward_std": 0.24946986511349678, | |
| "rewards/accuracy_reward": 0.3333333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.3958333358168602, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 800.3333282470703, | |
| "epoch": 0.19184652278177458, | |
| "grad_norm": 0.15357763591255594, | |
| "kl": 0.0022525787353515625, | |
| "learning_rate": 9.772691508021193e-07, | |
| "loss": 0.042, | |
| "reward": 0.8281250149011612, | |
| "reward_std": 0.310004822909832, | |
| "rewards/accuracy_reward": 0.3888888992369175, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4392361044883728, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 818.4653015136719, | |
| "epoch": 0.19664268585131894, | |
| "grad_norm": 0.14942346890583016, | |
| "kl": 0.0027675628662109375, | |
| "learning_rate": 9.748365973174227e-07, | |
| "loss": 0.0492, | |
| "reward": 0.8402778059244156, | |
| "reward_std": 0.292511161416769, | |
| "rewards/accuracy_reward": 0.375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4652777835726738, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 809.9514007568359, | |
| "epoch": 0.2014388489208633, | |
| "grad_norm": 0.15897417290331764, | |
| "kl": 0.003749847412109375, | |
| "learning_rate": 9.722841413324149e-07, | |
| "loss": 0.0459, | |
| "reward": 0.8593750149011612, | |
| "reward_std": 0.3366158865392208, | |
| "rewards/accuracy_reward": 0.4027777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4565972238779068, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 729.5972290039062, | |
| "epoch": 0.20623501199040767, | |
| "grad_norm": 0.176025608996368, | |
| "kl": 0.002838134765625, | |
| "learning_rate": 9.6961250323196e-07, | |
| "loss": 0.0243, | |
| "reward": 1.1562499850988388, | |
| "reward_std": 0.33967938274145126, | |
| "rewards/accuracy_reward": 0.6319444552063942, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5243055522441864, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 850.7222290039062, | |
| "epoch": 0.21103117505995203, | |
| "grad_norm": 0.11868611450690505, | |
| "kl": 0.003307342529296875, | |
| "learning_rate": 9.668224370379346e-07, | |
| "loss": 0.0277, | |
| "reward": 0.8246527910232544, | |
| "reward_std": 0.2909863740205765, | |
| "rewards/accuracy_reward": 0.4027777910232544, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4218750074505806, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.6111297607422, | |
| "epoch": 0.2158273381294964, | |
| "grad_norm": 0.15818031203002028, | |
| "kl": 0.003475189208984375, | |
| "learning_rate": 9.639147301964175e-07, | |
| "loss": 0.0287, | |
| "reward": 0.9652777910232544, | |
| "reward_std": 0.301775723695755, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4861111119389534, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 838.8055725097656, | |
| "epoch": 0.22062350119904076, | |
| "grad_norm": 0.16002912577528097, | |
| "kl": 0.004032135009765625, | |
| "learning_rate": 9.608902033554475e-07, | |
| "loss": 0.0429, | |
| "reward": 0.7517361342906952, | |
| "reward_std": 0.34651144593954086, | |
| "rewards/accuracy_reward": 0.305555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.446180559694767, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 843.9861297607422, | |
| "epoch": 0.22541966426858512, | |
| "grad_norm": 0.1540903493857499, | |
| "kl": 0.003780364990234375, | |
| "learning_rate": 9.577497101334103e-07, | |
| "loss": 0.0317, | |
| "reward": 0.8159722238779068, | |
| "reward_std": 0.3663570396602154, | |
| "rewards/accuracy_reward": 0.3611111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.454861119389534, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 865.0486145019531, | |
| "epoch": 0.2302158273381295, | |
| "grad_norm": 0.14872925893794725, | |
| "kl": 0.004302978515625, | |
| "learning_rate": 9.544941368781208e-07, | |
| "loss": 0.0514, | |
| "reward": 0.7083333432674408, | |
| "reward_std": 0.39198317378759384, | |
| "rewards/accuracy_reward": 0.2777777798473835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.430555559694767, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 808.4028015136719, | |
| "epoch": 0.23501199040767387, | |
| "grad_norm": 0.13499682062737456, | |
| "kl": 0.004261016845703125, | |
| "learning_rate": 9.51124402416666e-07, | |
| "loss": 0.0296, | |
| "reward": 0.8680555671453476, | |
| "reward_std": 0.265322033315897, | |
| "rewards/accuracy_reward": 0.4305555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 853.8611145019531, | |
| "epoch": 0.23980815347721823, | |
| "grad_norm": 0.16386389384814085, | |
| "kl": 0.00469207763671875, | |
| "learning_rate": 9.476414577960834e-07, | |
| "loss": 0.0508, | |
| "reward": 0.7951389104127884, | |
| "reward_std": 0.33591291680932045, | |
| "rewards/accuracy_reward": 0.3611111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4340277835726738, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 770.3402862548828, | |
| "epoch": 0.2446043165467626, | |
| "grad_norm": 0.17896254581630255, | |
| "kl": 0.0062255859375, | |
| "learning_rate": 9.440462860149451e-07, | |
| "loss": 0.0483, | |
| "reward": 0.8229166716337204, | |
| "reward_std": 0.36454326659440994, | |
| "rewards/accuracy_reward": 0.3749999962747097, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4479166716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 839.0208435058594, | |
| "epoch": 0.24940047961630696, | |
| "grad_norm": 0.13648789584357146, | |
| "kl": 0.00505828857421875, | |
| "learning_rate": 9.403399017459234e-07, | |
| "loss": 0.0323, | |
| "reward": 0.8489583283662796, | |
| "reward_std": 0.2952596992254257, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.453125, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 804.8263854980469, | |
| "epoch": 0.2541966426858513, | |
| "grad_norm": 0.1523526638090565, | |
| "kl": 0.0061492919921875, | |
| "learning_rate": 9.365233510494185e-07, | |
| "loss": 0.0435, | |
| "reward": 0.892361119389534, | |
| "reward_std": 0.32454150170087814, | |
| "rewards/accuracy_reward": 0.444444440305233, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4479166716337204, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 798.3125, | |
| "epoch": 0.2589928057553957, | |
| "grad_norm": 0.18724044885869282, | |
| "kl": 0.0067901611328125, | |
| "learning_rate": 9.325977110783263e-07, | |
| "loss": 0.0222, | |
| "reward": 0.9270833432674408, | |
| "reward_std": 0.335986964404583, | |
| "rewards/accuracy_reward": 0.4444444552063942, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.482638880610466, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 796.0347290039062, | |
| "epoch": 0.2637889688249401, | |
| "grad_norm": 0.15753263206218102, | |
| "kl": 0.00702667236328125, | |
| "learning_rate": 9.285640897740315e-07, | |
| "loss": 0.0554, | |
| "reward": 0.878472238779068, | |
| "reward_std": 0.3221370540559292, | |
| "rewards/accuracy_reward": 0.4097222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.46875, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.6458435058594, | |
| "epoch": 0.2685851318944844, | |
| "grad_norm": 0.1525585503261879, | |
| "kl": 0.0070648193359375, | |
| "learning_rate": 9.244236255537107e-07, | |
| "loss": 0.0488, | |
| "reward": 0.8350694477558136, | |
| "reward_std": 0.25923068448901176, | |
| "rewards/accuracy_reward": 0.3680555522441864, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4670139029622078, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 814.4583435058594, | |
| "epoch": 0.2733812949640288, | |
| "grad_norm": 0.15792440768758637, | |
| "kl": 0.0050201416015625, | |
| "learning_rate": 9.20177486989035e-07, | |
| "loss": 0.0434, | |
| "reward": 0.878472238779068, | |
| "reward_std": 0.31662074103951454, | |
| "rewards/accuracy_reward": 0.4166666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.461805559694767, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.2708435058594, | |
| "epoch": 0.27817745803357313, | |
| "grad_norm": 0.16159815697797508, | |
| "kl": 0.0063934326171875, | |
| "learning_rate": 9.158268724763614e-07, | |
| "loss": 0.0424, | |
| "reward": 0.894097238779068, | |
| "reward_std": 0.3126923553645611, | |
| "rewards/accuracy_reward": 0.4236111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.470486119389534, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 876.4583435058594, | |
| "epoch": 0.2829736211031175, | |
| "grad_norm": 0.1270858164567956, | |
| "kl": 0.0079803466796875, | |
| "learning_rate": 9.113730098985075e-07, | |
| "loss": 0.0267, | |
| "reward": 0.78125, | |
| "reward_std": 0.2495138719677925, | |
| "rewards/accuracy_reward": 0.3611111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4201388880610466, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 842.1111297607422, | |
| "epoch": 0.28776978417266186, | |
| "grad_norm": 0.14390666559593665, | |
| "kl": 0.0057373046875, | |
| "learning_rate": 9.068171562782021e-07, | |
| "loss": 0.0467, | |
| "reward": 0.8940972536802292, | |
| "reward_std": 0.3231881149113178, | |
| "rewards/accuracy_reward": 0.4305555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.463541679084301, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.5902862548828, | |
| "epoch": 0.29256594724220625, | |
| "grad_norm": 0.16728826541039396, | |
| "kl": 0.00667572021484375, | |
| "learning_rate": 9.021605974233152e-07, | |
| "loss": 0.0724, | |
| "reward": 0.989583358168602, | |
| "reward_std": 0.36507341638207436, | |
| "rewards/accuracy_reward": 0.4861111044883728, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5034722313284874, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 828.9861145019531, | |
| "epoch": 0.2973621103117506, | |
| "grad_norm": 0.15432237348385633, | |
| "kl": 0.00737762451171875, | |
| "learning_rate": 8.974046475639604e-07, | |
| "loss": 0.0447, | |
| "reward": 0.925347238779068, | |
| "reward_std": 0.3722820319235325, | |
| "rewards/accuracy_reward": 0.4513888880610466, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4739583432674408, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 810.2569427490234, | |
| "epoch": 0.302158273381295, | |
| "grad_norm": 0.1856747664961947, | |
| "kl": 0.00745391845703125, | |
| "learning_rate": 8.925506489815772e-07, | |
| "loss": 0.0687, | |
| "reward": 0.895833358168602, | |
| "reward_std": 0.29615509510040283, | |
| "rewards/accuracy_reward": 0.430555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4652777835726738, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 861.1111297607422, | |
| "epoch": 0.3069544364508393, | |
| "grad_norm": 0.13202082976554858, | |
| "kl": 0.00617218017578125, | |
| "learning_rate": 8.875999716300968e-07, | |
| "loss": 0.0299, | |
| "reward": 0.8020833432674408, | |
| "reward_std": 0.3038054183125496, | |
| "rewards/accuracy_reward": 0.3819444440305233, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4201388955116272, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 858.9097290039062, | |
| "epoch": 0.3117505995203837, | |
| "grad_norm": 0.152190266317737, | |
| "kl": 0.00725555419921875, | |
| "learning_rate": 8.825540127492965e-07, | |
| "loss": 0.0571, | |
| "reward": 0.7847222238779068, | |
| "reward_std": 0.3564433120191097, | |
| "rewards/accuracy_reward": 0.3472222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4375, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 798.2152862548828, | |
| "epoch": 0.31654676258992803, | |
| "grad_norm": 0.16383126534952586, | |
| "kl": 0.00787353515625, | |
| "learning_rate": 8.774141964704546e-07, | |
| "loss": 0.0431, | |
| "reward": 0.8836805671453476, | |
| "reward_std": 0.29356446862220764, | |
| "rewards/accuracy_reward": 0.4027777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4809027761220932, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 809.1597290039062, | |
| "epoch": 0.3213429256594724, | |
| "grad_norm": 0.17427922859293266, | |
| "kl": 0.00984954833984375, | |
| "learning_rate": 8.721819734144135e-07, | |
| "loss": 0.0541, | |
| "reward": 0.9930555671453476, | |
| "reward_std": 0.36635252088308334, | |
| "rewards/accuracy_reward": 0.4583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5347222313284874, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 738.5833435058594, | |
| "epoch": 0.3261390887290168, | |
| "grad_norm": 0.16012047020291365, | |
| "kl": 0.009185791015625, | |
| "learning_rate": 8.668588202821706e-07, | |
| "loss": 0.039, | |
| "reward": 1.0850694477558136, | |
| "reward_std": 0.23961883038282394, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5434027910232544, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 832.7847442626953, | |
| "epoch": 0.33093525179856115, | |
| "grad_norm": 0.15979089643431796, | |
| "kl": 0.0091400146484375, | |
| "learning_rate": 8.614462394381026e-07, | |
| "loss": 0.0613, | |
| "reward": 0.9340277761220932, | |
| "reward_std": 0.3319687321782112, | |
| "rewards/accuracy_reward": 0.4513888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4826388880610466, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.4444427490234, | |
| "epoch": 0.33573141486810554, | |
| "grad_norm": 0.2080530430054881, | |
| "kl": 0.01006317138671875, | |
| "learning_rate": 8.559457584859535e-07, | |
| "loss": 0.0441, | |
| "reward": 1.0954861342906952, | |
| "reward_std": 0.42393119633197784, | |
| "rewards/accuracy_reward": 0.5138888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5815972238779068, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 804.1528015136719, | |
| "epoch": 0.3405275779376499, | |
| "grad_norm": 0.16129162539469918, | |
| "kl": 0.008331298828125, | |
| "learning_rate": 8.503589298376931e-07, | |
| "loss": 0.0347, | |
| "reward": 0.9513888955116272, | |
| "reward_std": 0.37669622898101807, | |
| "rewards/accuracy_reward": 0.4583333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4930555671453476, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 851.0902862548828, | |
| "epoch": 0.34532374100719426, | |
| "grad_norm": 0.1421650821283864, | |
| "kl": 0.00882720947265625, | |
| "learning_rate": 8.446873302753783e-07, | |
| "loss": 0.0403, | |
| "reward": 0.892361119389534, | |
| "reward_std": 0.2742934599518776, | |
| "rewards/accuracy_reward": 0.423611119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4687500074505806, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 834.8611145019531, | |
| "epoch": 0.3501199040767386, | |
| "grad_norm": 0.14158911723554238, | |
| "kl": 0.00821685791015625, | |
| "learning_rate": 8.389325605061341e-07, | |
| "loss": 0.0319, | |
| "reward": 0.9305555820465088, | |
| "reward_std": 0.2332368977367878, | |
| "rewards/accuracy_reward": 0.4583333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4722222238779068, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 872.9097442626953, | |
| "epoch": 0.354916067146283, | |
| "grad_norm": 0.1521349105449586, | |
| "kl": 0.0106048583984375, | |
| "learning_rate": 8.330962447103829e-07, | |
| "loss": 0.0301, | |
| "reward": 0.8125000149011612, | |
| "reward_std": 0.35327186062932014, | |
| "rewards/accuracy_reward": 0.3402777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4722222164273262, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.3472290039062, | |
| "epoch": 0.3597122302158273, | |
| "grad_norm": 0.1648475576874547, | |
| "kl": 0.0107269287109375, | |
| "learning_rate": 8.271800300834486e-07, | |
| "loss": 0.0719, | |
| "reward": 1.1545138657093048, | |
| "reward_std": 0.3374630883336067, | |
| "rewards/accuracy_reward": 0.5763888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.578125, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 754.5485992431641, | |
| "epoch": 0.3645083932853717, | |
| "grad_norm": 0.15678609737006508, | |
| "kl": 0.0106353759765625, | |
| "learning_rate": 8.211855863706654e-07, | |
| "loss": 0.0206, | |
| "reward": 1.1302083283662796, | |
| "reward_std": 0.3273175358772278, | |
| "rewards/accuracy_reward": 0.5625000149011612, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5677083358168602, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.7639007568359, | |
| "epoch": 0.36930455635491605, | |
| "grad_norm": 0.14059085263342075, | |
| "kl": 0.011688232421875, | |
| "learning_rate": 8.151146053961217e-07, | |
| "loss": 0.0247, | |
| "reward": 1.038194477558136, | |
| "reward_std": 0.24932898953557014, | |
| "rewards/accuracy_reward": 0.5138888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5243055447936058, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.6527709960938, | |
| "epoch": 0.37410071942446044, | |
| "grad_norm": 0.16861004490817355, | |
| "kl": 0.011260986328125, | |
| "learning_rate": 8.089688005851745e-07, | |
| "loss": 0.0374, | |
| "reward": 1.09375, | |
| "reward_std": 0.362262312322855, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5520833432674408, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 836.3055725097656, | |
| "epoch": 0.37889688249400477, | |
| "grad_norm": 0.13259854891508993, | |
| "kl": 0.009613037109375, | |
| "learning_rate": 8.02749906480864e-07, | |
| "loss": 0.0224, | |
| "reward": 0.9062500149011612, | |
| "reward_std": 0.2787036634981632, | |
| "rewards/accuracy_reward": 0.4097222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4965277835726738, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 779.7361297607422, | |
| "epoch": 0.38369304556354916, | |
| "grad_norm": 0.16767324138234957, | |
| "kl": 0.0133819580078125, | |
| "learning_rate": 7.964596782543716e-07, | |
| "loss": 0.0572, | |
| "reward": 1.0520833432674408, | |
| "reward_std": 0.28914331272244453, | |
| "rewards/accuracy_reward": 0.5277777910232544, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.524305559694767, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.7708435058594, | |
| "epoch": 0.38848920863309355, | |
| "grad_norm": 0.16351300117480583, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 7.900998912096527e-07, | |
| "loss": 0.0307, | |
| "reward": 0.9444444477558136, | |
| "reward_std": 0.3029831796884537, | |
| "rewards/accuracy_reward": 0.4166666567325592, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5277777835726738, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 780.3541717529297, | |
| "epoch": 0.3932853717026379, | |
| "grad_norm": 0.1546912166828873, | |
| "kl": 0.013702392578125, | |
| "learning_rate": 7.836723402823913e-07, | |
| "loss": 0.0369, | |
| "reward": 1.0538194626569748, | |
| "reward_std": 0.3243863359093666, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5538194477558136, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 808.9375, | |
| "epoch": 0.3980815347721823, | |
| "grad_norm": 0.14595312337300426, | |
| "kl": 0.012359619140625, | |
| "learning_rate": 7.771788395334094e-07, | |
| "loss": 0.0399, | |
| "reward": 0.918402761220932, | |
| "reward_std": 0.2620566040277481, | |
| "rewards/accuracy_reward": 0.416666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5017361119389534, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.1805419921875, | |
| "epoch": 0.4028776978417266, | |
| "grad_norm": 0.1382082777095037, | |
| "kl": 0.0124969482421875, | |
| "learning_rate": 7.706212216366819e-07, | |
| "loss": 0.0237, | |
| "reward": 0.9461805671453476, | |
| "reward_std": 0.2770383469760418, | |
| "rewards/accuracy_reward": 0.4444444514811039, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.501736119389534, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 773.5, | |
| "epoch": 0.407673860911271, | |
| "grad_norm": 0.17673710255712385, | |
| "kl": 0.0130767822265625, | |
| "learning_rate": 7.640013373620979e-07, | |
| "loss": 0.0526, | |
| "reward": 1.0694444626569748, | |
| "reward_std": 0.36386215686798096, | |
| "rewards/accuracy_reward": 0.548611119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5208333432674408, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 709.4583435058594, | |
| "epoch": 0.41247002398081534, | |
| "grad_norm": 0.2107698032340365, | |
| "kl": 0.0170440673828125, | |
| "learning_rate": 7.573210550531125e-07, | |
| "loss": 0.0816, | |
| "reward": 1.222222238779068, | |
| "reward_std": 0.4683116003870964, | |
| "rewards/accuracy_reward": 0.6041666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6180555671453476, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 834.0694427490234, | |
| "epoch": 0.4172661870503597, | |
| "grad_norm": 0.13070064074785792, | |
| "kl": 0.012725830078125, | |
| "learning_rate": 7.505822600994423e-07, | |
| "loss": 0.0331, | |
| "reward": 0.8090277761220932, | |
| "reward_std": 0.2008717618882656, | |
| "rewards/accuracy_reward": 0.3541666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.454861119389534, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 741.8402862548828, | |
| "epoch": 0.42206235011990406, | |
| "grad_norm": 0.20183514921907542, | |
| "kl": 0.01544189453125, | |
| "learning_rate": 7.437868544049463e-07, | |
| "loss": 0.0421, | |
| "reward": 0.8489583432674408, | |
| "reward_std": 0.2780023626983166, | |
| "rewards/accuracy_reward": 0.3472222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.501736119389534, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 826.2777862548828, | |
| "epoch": 0.42685851318944845, | |
| "grad_norm": 0.15298818073279, | |
| "kl": 0.013885498046875, | |
| "learning_rate": 7.36936755850849e-07, | |
| "loss": 0.054, | |
| "reward": 0.8472222238779068, | |
| "reward_std": 0.22455434128642082, | |
| "rewards/accuracy_reward": 0.3750000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4722222313284874, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 796.0555419921875, | |
| "epoch": 0.4316546762589928, | |
| "grad_norm": 0.18224911397155316, | |
| "kl": 0.0146942138671875, | |
| "learning_rate": 7.300338977544519e-07, | |
| "loss": 0.0238, | |
| "reward": 0.9600694477558136, | |
| "reward_std": 0.36052028089761734, | |
| "rewards/accuracy_reward": 0.4375000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5225694477558136, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 816.0208587646484, | |
| "epoch": 0.4364508393285372, | |
| "grad_norm": 0.19516033120759366, | |
| "kl": 0.0160675048828125, | |
| "learning_rate": 7.230802283234904e-07, | |
| "loss": 0.0525, | |
| "reward": 0.989583358168602, | |
| "reward_std": 0.3490638807415962, | |
| "rewards/accuracy_reward": 0.472222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5173611044883728, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 801.7291870117188, | |
| "epoch": 0.4412470023980815, | |
| "grad_norm": 0.14966073132580007, | |
| "kl": 0.0144195556640625, | |
| "learning_rate": 7.160777101062865e-07, | |
| "loss": 0.0341, | |
| "reward": 1.0225694626569748, | |
| "reward_std": 0.337300319224596, | |
| "rewards/accuracy_reward": 0.4791666567325592, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5434027910232544, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 834.2222290039062, | |
| "epoch": 0.4460431654676259, | |
| "grad_norm": 0.13423539328804068, | |
| "kl": 0.0140380859375, | |
| "learning_rate": 7.090283194378542e-07, | |
| "loss": 0.0035, | |
| "reward": 0.921875, | |
| "reward_std": 0.259520523250103, | |
| "rewards/accuracy_reward": 0.423611119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.4982638880610466, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 832.9444427490234, | |
| "epoch": 0.45083932853717024, | |
| "grad_norm": 0.16645556279200993, | |
| "kl": 0.013580322265625, | |
| "learning_rate": 7.019340458821159e-07, | |
| "loss": 0.0388, | |
| "reward": 0.9652777910232544, | |
| "reward_std": 0.29097262397408485, | |
| "rewards/accuracy_reward": 0.4652777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5000000074505806, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.0764007568359, | |
| "epoch": 0.4556354916067146, | |
| "grad_norm": 0.15120637517973337, | |
| "kl": 0.01580810546875, | |
| "learning_rate": 6.947968916703826e-07, | |
| "loss": 0.0373, | |
| "reward": 1.015625, | |
| "reward_std": 0.2590954527258873, | |
| "rewards/accuracy_reward": 0.486111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5295138880610466, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 765.4791564941406, | |
| "epoch": 0.460431654676259, | |
| "grad_norm": 0.16253895506587696, | |
| "kl": 0.0163421630859375, | |
| "learning_rate": 6.876188711362603e-07, | |
| "loss": 0.0583, | |
| "reward": 0.9513889104127884, | |
| "reward_std": 0.3330418989062309, | |
| "rewards/accuracy_reward": 0.4236111044883728, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5277777910232544, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 775.6736297607422, | |
| "epoch": 0.46522781774580335, | |
| "grad_norm": 0.20268144913502836, | |
| "kl": 0.0174560546875, | |
| "learning_rate": 6.80402010147141e-07, | |
| "loss": 0.0393, | |
| "reward": 1.017361119389534, | |
| "reward_std": 0.3649497255682945, | |
| "rewards/accuracy_reward": 0.4652777798473835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5520833358168602, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 766.9652709960938, | |
| "epoch": 0.47002398081534774, | |
| "grad_norm": 0.1728775404147991, | |
| "kl": 0.015045166015625, | |
| "learning_rate": 6.731483455324374e-07, | |
| "loss": 0.0282, | |
| "reward": 1.1076389104127884, | |
| "reward_std": 0.3713233917951584, | |
| "rewards/accuracy_reward": 0.5347222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166716337204, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 781.7222442626953, | |
| "epoch": 0.4748201438848921, | |
| "grad_norm": 0.1996452779202398, | |
| "kl": 0.01593017578125, | |
| "learning_rate": 6.658599245087241e-07, | |
| "loss": 0.0927, | |
| "reward": 1.1597222536802292, | |
| "reward_std": 0.3544151149690151, | |
| "rewards/accuracy_reward": 0.5833333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5763888955116272, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 823.1527862548828, | |
| "epoch": 0.47961630695443647, | |
| "grad_norm": 0.14748523873535388, | |
| "kl": 0.0178680419921875, | |
| "learning_rate": 6.585388041019487e-07, | |
| "loss": 0.0367, | |
| "reward": 1.0312500149011612, | |
| "reward_std": 0.27015675604343414, | |
| "rewards/accuracy_reward": 0.4861111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5451388880610466, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.1944427490234, | |
| "epoch": 0.4844124700239808, | |
| "grad_norm": 0.5160967541046672, | |
| "kl": 0.0177764892578125, | |
| "learning_rate": 6.511870505668725e-07, | |
| "loss": 0.0517, | |
| "reward": 1.0746527761220932, | |
| "reward_std": 0.32491182163357735, | |
| "rewards/accuracy_reward": 0.5138888880610466, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5607638955116272, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 813.8680572509766, | |
| "epoch": 0.4892086330935252, | |
| "grad_norm": 0.13567321274406932, | |
| "kl": 0.017913818359375, | |
| "learning_rate": 6.438067388039064e-07, | |
| "loss": 0.039, | |
| "reward": 1.0138888955116272, | |
| "reward_std": 0.2365904077887535, | |
| "rewards/accuracy_reward": 0.4722222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5416666641831398, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.1250152587891, | |
| "epoch": 0.4940047961630695, | |
| "grad_norm": 0.16686045898429722, | |
| "kl": 0.017578125, | |
| "learning_rate": 6.36399951773509e-07, | |
| "loss": 0.0349, | |
| "reward": 1.052083358168602, | |
| "reward_std": 0.3173811621963978, | |
| "rewards/accuracy_reward": 0.5138888880610466, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5381944552063942, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.9583435058594, | |
| "epoch": 0.4988009592326139, | |
| "grad_norm": 0.16673634023147957, | |
| "kl": 0.019927978515625, | |
| "learning_rate": 6.289687799083072e-07, | |
| "loss": 0.0385, | |
| "reward": 0.954861119389534, | |
| "reward_std": 0.34330564737319946, | |
| "rewards/accuracy_reward": 0.4444444477558136, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.510416679084301, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 838.2152862548828, | |
| "epoch": 0.5035971223021583, | |
| "grad_norm": 0.19765705391364718, | |
| "kl": 0.018402099609375, | |
| "learning_rate": 6.2151532052311e-07, | |
| "loss": 0.0526, | |
| "reward": 0.9461805671453476, | |
| "reward_std": 0.380074605345726, | |
| "rewards/accuracy_reward": 0.4305555522441864, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5156250074505806, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 848.3958282470703, | |
| "epoch": 0.5083932853717026, | |
| "grad_norm": 0.16771910068496884, | |
| "kl": 0.018890380859375, | |
| "learning_rate": 6.140416772229784e-07, | |
| "loss": 0.0449, | |
| "reward": 0.8819444477558136, | |
| "reward_std": 0.36511222273111343, | |
| "rewards/accuracy_reward": 0.3819444552063942, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 763.1527862548828, | |
| "epoch": 0.513189448441247, | |
| "grad_norm": 0.1914924709345533, | |
| "kl": 0.022308349609375, | |
| "learning_rate": 6.065499593095208e-07, | |
| "loss": 0.0358, | |
| "reward": 1.0451388955116272, | |
| "reward_std": 0.3345286548137665, | |
| "rewards/accuracy_reward": 0.4722222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166716337204, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 795.0000152587891, | |
| "epoch": 0.5179856115107914, | |
| "grad_norm": 0.23442272993782337, | |
| "kl": 0.02197265625, | |
| "learning_rate": 5.990422811855812e-07, | |
| "loss": 0.0786, | |
| "reward": 0.9982638955116272, | |
| "reward_std": 0.38987091183662415, | |
| "rewards/accuracy_reward": 0.4375000037252903, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5607639029622078, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 801.8402862548828, | |
| "epoch": 0.5227817745803357, | |
| "grad_norm": 0.21980101064995708, | |
| "kl": 0.02423095703125, | |
| "learning_rate": 5.915207617584858e-07, | |
| "loss": 0.0335, | |
| "reward": 0.9427083432674408, | |
| "reward_std": 0.36763929575681686, | |
| "rewards/accuracy_reward": 0.4305555522441864, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5121527835726738, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 743.0069427490234, | |
| "epoch": 0.5275779376498801, | |
| "grad_norm": 0.20707736821413145, | |
| "kl": 0.025299072265625, | |
| "learning_rate": 5.839875238420205e-07, | |
| "loss": 0.0706, | |
| "reward": 1.1406249850988388, | |
| "reward_std": 0.29043491929769516, | |
| "rewards/accuracy_reward": 0.555555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.585069440305233, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 804.6111145019531, | |
| "epoch": 0.5323741007194245, | |
| "grad_norm": 0.17161222800354356, | |
| "kl": 0.021148681640625, | |
| "learning_rate": 5.764446935573041e-07, | |
| "loss": 0.0426, | |
| "reward": 0.9861110895872116, | |
| "reward_std": 0.33307311683893204, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.548611119389534, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 778.125, | |
| "epoch": 0.5371702637889688, | |
| "grad_norm": 0.21031410356763766, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 5.688943997327288e-07, | |
| "loss": 0.05, | |
| "reward": 0.9496527910232544, | |
| "reward_std": 0.2749031111598015, | |
| "rewards/accuracy_reward": 0.4027777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5468750074505806, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.6458435058594, | |
| "epoch": 0.5419664268585132, | |
| "grad_norm": 0.282246872790042, | |
| "kl": 0.02569580078125, | |
| "learning_rate": 5.613387733031375e-07, | |
| "loss": 0.0988, | |
| "reward": 1.1336805522441864, | |
| "reward_std": 0.35546836256980896, | |
| "rewards/accuracy_reward": 0.5486111268401146, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5850694477558136, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.4097137451172, | |
| "epoch": 0.5467625899280576, | |
| "grad_norm": 0.2177323521611779, | |
| "kl": 0.0255126953125, | |
| "learning_rate": 5.53779946708405e-07, | |
| "loss": 0.047, | |
| "reward": 1.1145833283662796, | |
| "reward_std": 0.28548414260149, | |
| "rewards/accuracy_reward": 0.5277777798473835, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.586805559694767, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.1875152587891, | |
| "epoch": 0.5515587529976019, | |
| "grad_norm": 0.2228573500540112, | |
| "kl": 0.023681640625, | |
| "learning_rate": 5.462200532915951e-07, | |
| "loss": 0.0526, | |
| "reward": 1.0694444626569748, | |
| "reward_std": 0.3976950142532587, | |
| "rewards/accuracy_reward": 0.4722222350537777, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5972222313284874, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 821.8263854980469, | |
| "epoch": 0.5563549160671463, | |
| "grad_norm": 0.22544332116422824, | |
| "kl": 0.02313232421875, | |
| "learning_rate": 5.386612266968625e-07, | |
| "loss": 0.0808, | |
| "reward": 0.9774305522441864, | |
| "reward_std": 0.3071533590555191, | |
| "rewards/accuracy_reward": 0.3958333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5815972238779068, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 732.5486145019531, | |
| "epoch": 0.5611510791366906, | |
| "grad_norm": 0.24884339296220886, | |
| "kl": 0.026092529296875, | |
| "learning_rate": 5.311056002672712e-07, | |
| "loss": 0.0805, | |
| "reward": 1.2187500298023224, | |
| "reward_std": 0.33359793573617935, | |
| "rewards/accuracy_reward": 0.5902777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6284722238779068, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 759.3264007568359, | |
| "epoch": 0.565947242206235, | |
| "grad_norm": 0.20316735987226078, | |
| "kl": 0.029693603515625, | |
| "learning_rate": 5.235553064426961e-07, | |
| "loss": 0.0398, | |
| "reward": 1.1354166865348816, | |
| "reward_std": 0.29686928167939186, | |
| "rewards/accuracy_reward": 0.506944440305233, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.628472238779068, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.5, | |
| "epoch": 0.5707434052757794, | |
| "grad_norm": 0.29440880209457704, | |
| "kl": 0.0303955078125, | |
| "learning_rate": 5.160124761579795e-07, | |
| "loss": 0.0572, | |
| "reward": 1.2673611044883728, | |
| "reward_std": 0.3973044380545616, | |
| "rewards/accuracy_reward": 0.6180555671453476, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6493055820465088, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 746.875, | |
| "epoch": 0.5755395683453237, | |
| "grad_norm": 0.2292130236322372, | |
| "kl": 0.033233642578125, | |
| "learning_rate": 5.084792382415141e-07, | |
| "loss": 0.0549, | |
| "reward": 1.0781250149011612, | |
| "reward_std": 0.3586086630821228, | |
| "rewards/accuracy_reward": 0.4791666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5989583432674408, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 846.0, | |
| "epoch": 0.580335731414868, | |
| "grad_norm": 0.2898186862467139, | |
| "kl": 0.031280517578125, | |
| "learning_rate": 5.009577188144188e-07, | |
| "loss": 0.092, | |
| "reward": 0.9079861044883728, | |
| "reward_std": 0.3724118545651436, | |
| "rewards/accuracy_reward": 0.3541666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5538194552063942, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 858.1944580078125, | |
| "epoch": 0.5851318944844125, | |
| "grad_norm": 0.2766016571596048, | |
| "kl": 0.03631591796875, | |
| "learning_rate": 4.93450040690479e-07, | |
| "loss": 0.0593, | |
| "reward": 0.9548611044883728, | |
| "reward_std": 0.43608929216861725, | |
| "rewards/accuracy_reward": 0.4027777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5520833283662796, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 788.2152862548828, | |
| "epoch": 0.5899280575539568, | |
| "grad_norm": 0.26705241094954724, | |
| "kl": 0.0374755859375, | |
| "learning_rate": 4.859583227770217e-07, | |
| "loss": 0.0683, | |
| "reward": 1.0520833879709244, | |
| "reward_std": 0.2720135301351547, | |
| "rewards/accuracy_reward": 0.4791666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5729166567325592, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.5069427490234, | |
| "epoch": 0.5947242206235012, | |
| "grad_norm": 0.2671582354543242, | |
| "kl": 0.035980224609375, | |
| "learning_rate": 4.784846794768901e-07, | |
| "loss": 0.0564, | |
| "reward": 1.1267361044883728, | |
| "reward_std": 0.3838435262441635, | |
| "rewards/accuracy_reward": 0.5277777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5989583283662796, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.1597442626953, | |
| "epoch": 0.5995203836930456, | |
| "grad_norm": 0.28942117017345487, | |
| "kl": 0.03851318359375, | |
| "learning_rate": 4.7103122009169283e-07, | |
| "loss": 0.0337, | |
| "reward": 1.0086805671453476, | |
| "reward_std": 0.31841161847114563, | |
| "rewards/accuracy_reward": 0.4513888880610466, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5572916716337204, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 794.0694580078125, | |
| "epoch": 0.60431654676259, | |
| "grad_norm": 0.29337843160217614, | |
| "kl": 0.034027099609375, | |
| "learning_rate": 4.63600048226491e-07, | |
| "loss": 0.0638, | |
| "reward": 1.0694444626569748, | |
| "reward_std": 0.37363580614328384, | |
| "rewards/accuracy_reward": 0.479166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5902777910232544, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 682.5347290039062, | |
| "epoch": 0.6091127098321343, | |
| "grad_norm": 0.2980289320595657, | |
| "kl": 0.046630859375, | |
| "learning_rate": 4.5619326119609346e-07, | |
| "loss": 0.0582, | |
| "reward": 1.0850694626569748, | |
| "reward_std": 0.3542333133518696, | |
| "rewards/accuracy_reward": 0.4583333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.626736119389534, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 810.3055572509766, | |
| "epoch": 0.6139088729016786, | |
| "grad_norm": 0.4115921669385449, | |
| "kl": 0.039794921875, | |
| "learning_rate": 4.4881294943312756e-07, | |
| "loss": 0.1129, | |
| "reward": 0.9635416716337204, | |
| "reward_std": 0.4346095398068428, | |
| "rewards/accuracy_reward": 0.3958333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5677083283662796, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 796.3958282470703, | |
| "epoch": 0.6187050359712231, | |
| "grad_norm": 0.28690441517101206, | |
| "kl": 0.03887939453125, | |
| "learning_rate": 4.414611958980512e-07, | |
| "loss": 0.0596, | |
| "reward": 1.104166641831398, | |
| "reward_std": 0.32108214125037193, | |
| "rewards/accuracy_reward": 0.5138888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5902777910232544, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 704.2986145019531, | |
| "epoch": 0.6235011990407674, | |
| "grad_norm": 0.39366843219844283, | |
| "kl": 0.045166015625, | |
| "learning_rate": 4.3414007549127594e-07, | |
| "loss": 0.0469, | |
| "reward": 1.1545138955116272, | |
| "reward_std": 0.362628273665905, | |
| "rewards/accuracy_reward": 0.5208333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.633680559694767, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.5555572509766, | |
| "epoch": 0.6282973621103117, | |
| "grad_norm": 0.3049060746508426, | |
| "kl": 0.044677734375, | |
| "learning_rate": 4.268516544675628e-07, | |
| "loss": 0.0332, | |
| "reward": 1.0902777910232544, | |
| "reward_std": 0.381888784468174, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5902777761220932, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 760.2291717529297, | |
| "epoch": 0.6330935251798561, | |
| "grad_norm": 0.5199022261670415, | |
| "kl": 0.05535888671875, | |
| "learning_rate": 4.195979898528589e-07, | |
| "loss": 0.0576, | |
| "reward": 1.131944477558136, | |
| "reward_std": 0.45905186980962753, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6319444477558136, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 736.3541717529297, | |
| "epoch": 0.6378896882494005, | |
| "grad_norm": 0.4002608571694072, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 4.1238112886373967e-07, | |
| "loss": 0.0692, | |
| "reward": 1.2430555820465088, | |
| "reward_std": 0.43104151636362076, | |
| "rewards/accuracy_reward": 0.5902777910232544, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6527777910232544, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 750.2986145019531, | |
| "epoch": 0.6426858513189448, | |
| "grad_norm": 0.47064244094639573, | |
| "kl": 0.05303955078125, | |
| "learning_rate": 4.0520310832961747e-07, | |
| "loss": 0.0578, | |
| "reward": 1.2552083730697632, | |
| "reward_std": 0.3141016773879528, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416716337204, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 700.3611297607422, | |
| "epoch": 0.6474820143884892, | |
| "grad_norm": 0.5155783092634934, | |
| "kl": 0.04730224609375, | |
| "learning_rate": 3.980659541178841e-07, | |
| "loss": 0.0851, | |
| "reward": 1.1597222089767456, | |
| "reward_std": 0.33885327726602554, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5972222238779068, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.5833435058594, | |
| "epoch": 0.6522781774580336, | |
| "grad_norm": 0.3556569979579297, | |
| "kl": 0.0576171875, | |
| "learning_rate": 3.909716805621458e-07, | |
| "loss": 0.0298, | |
| "reward": 1.1493055522441864, | |
| "reward_std": 0.3029083050787449, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.628472238779068, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 707.9791717529297, | |
| "epoch": 0.657074340527578, | |
| "grad_norm": 0.529609284338842, | |
| "kl": 0.06280517578125, | |
| "learning_rate": 3.8392228989371357e-07, | |
| "loss": 0.1004, | |
| "reward": 1.0902777910232544, | |
| "reward_std": 0.3477436378598213, | |
| "rewards/accuracy_reward": 0.4722222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6180555522441864, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 735.5902938842773, | |
| "epoch": 0.6618705035971223, | |
| "grad_norm": 0.5906533852176611, | |
| "kl": 0.06390380859375, | |
| "learning_rate": 3.7691977167650947e-07, | |
| "loss": 0.0947, | |
| "reward": 1.2638888955116272, | |
| "reward_std": 0.37084779888391495, | |
| "rewards/accuracy_reward": 0.5972222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6666666865348816, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 787.9236145019531, | |
| "epoch": 0.6666666666666666, | |
| "grad_norm": 0.3518724040292374, | |
| "kl": 0.05731201171875, | |
| "learning_rate": 3.6996610224554815e-07, | |
| "loss": 0.035, | |
| "reward": 1.1510416865348816, | |
| "reward_std": 0.39138108491897583, | |
| "rewards/accuracy_reward": 0.5208333358168602, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6302083432674408, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.4236145019531, | |
| "epoch": 0.6714628297362111, | |
| "grad_norm": 0.5518379170803371, | |
| "kl": 0.0836181640625, | |
| "learning_rate": 3.630632441491511e-07, | |
| "loss": 0.0206, | |
| "reward": 1.1197917014360428, | |
| "reward_std": 0.33788175135850906, | |
| "rewards/accuracy_reward": 0.4930555745959282, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6267361044883728, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 847.3541717529297, | |
| "epoch": 0.6762589928057554, | |
| "grad_norm": 0.7174313824432543, | |
| "kl": 0.08111572265625, | |
| "learning_rate": 3.562131455950538e-07, | |
| "loss": 0.075, | |
| "reward": 0.940972238779068, | |
| "reward_std": 0.39178355410695076, | |
| "rewards/accuracy_reward": 0.3819444477558136, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5590277835726738, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 823.6180572509766, | |
| "epoch": 0.6810551558752997, | |
| "grad_norm": 0.8192916163585466, | |
| "kl": 0.09405517578125, | |
| "learning_rate": 3.4941773990055777e-07, | |
| "loss": 0.0704, | |
| "reward": 0.8750000149011612, | |
| "reward_std": 0.40220723301172256, | |
| "rewards/accuracy_reward": 0.3402777761220932, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.534722238779068, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 741.7569580078125, | |
| "epoch": 0.6858513189448441, | |
| "grad_norm": 0.6199201515779919, | |
| "kl": 0.0787353515625, | |
| "learning_rate": 3.426789449468873e-07, | |
| "loss": 0.0473, | |
| "reward": 1.1718749850988388, | |
| "reward_std": 0.3498489521443844, | |
| "rewards/accuracy_reward": 0.5486111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6232638955116272, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 832.3055572509766, | |
| "epoch": 0.6906474820143885, | |
| "grad_norm": 0.9765675758790994, | |
| "kl": 0.08935546875, | |
| "learning_rate": 3.359986626379022e-07, | |
| "loss": 0.0842, | |
| "reward": 0.984375, | |
| "reward_std": 0.48340315371751785, | |
| "rewards/accuracy_reward": 0.423611119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5607639029622078, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 741.2639007568359, | |
| "epoch": 0.6954436450839329, | |
| "grad_norm": 0.6256951971346841, | |
| "kl": 0.0902099609375, | |
| "learning_rate": 3.293787783633182e-07, | |
| "loss": 0.0524, | |
| "reward": 1.092013880610466, | |
| "reward_std": 0.35471441224217415, | |
| "rewards/accuracy_reward": 0.4861111268401146, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6059027761220932, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 749.1250152587891, | |
| "epoch": 0.7002398081534772, | |
| "grad_norm": 0.4746047065439084, | |
| "kl": 0.0960693359375, | |
| "learning_rate": 3.2282116046659064e-07, | |
| "loss": 0.0216, | |
| "reward": 1.1197916567325592, | |
| "reward_std": 0.3484783172607422, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6197916716337204, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.75, | |
| "epoch": 0.7050359712230215, | |
| "grad_norm": 0.4212278612821504, | |
| "kl": 0.1041259765625, | |
| "learning_rate": 3.163276597176087e-07, | |
| "loss": 0.0352, | |
| "reward": 1.3003472089767456, | |
| "reward_std": 0.366548266261816, | |
| "rewards/accuracy_reward": 0.6388888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.661458358168602, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 729.4166717529297, | |
| "epoch": 0.709832134292566, | |
| "grad_norm": 0.5703953599283403, | |
| "kl": 0.1136474609375, | |
| "learning_rate": 3.099001087903473e-07, | |
| "loss": 0.0144, | |
| "reward": 1.1718750298023224, | |
| "reward_std": 0.44783008843660355, | |
| "rewards/accuracy_reward": 0.5625000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.609375, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 807.3610992431641, | |
| "epoch": 0.7146282973621103, | |
| "grad_norm": 0.6651812784116454, | |
| "kl": 0.12939453125, | |
| "learning_rate": 3.0354032174562863e-07, | |
| "loss": 0.0654, | |
| "reward": 1.0920139104127884, | |
| "reward_std": 0.3492956757545471, | |
| "rewards/accuracy_reward": 0.4791666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6128472313284874, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 786.3194732666016, | |
| "epoch": 0.7194244604316546, | |
| "grad_norm": 0.689721941763239, | |
| "kl": 0.1400146484375, | |
| "learning_rate": 2.97250093519136e-07, | |
| "loss": 0.0635, | |
| "reward": 1.1111111342906952, | |
| "reward_std": 0.3203607201576233, | |
| "rewards/accuracy_reward": 0.486111119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6250000149011612, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.2152709960938, | |
| "epoch": 0.7242206235011991, | |
| "grad_norm": 1.2387475983297178, | |
| "kl": 0.1351318359375, | |
| "learning_rate": 2.910311994148255e-07, | |
| "loss": 0.071, | |
| "reward": 1.09375, | |
| "reward_std": 0.36406850814819336, | |
| "rewards/accuracy_reward": 0.4930555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6006944477558136, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.8333435058594, | |
| "epoch": 0.7290167865707434, | |
| "grad_norm": 0.7047137897175717, | |
| "kl": 0.1422119140625, | |
| "learning_rate": 2.848853946038782e-07, | |
| "loss": 0.0384, | |
| "reward": 1.0711805522441864, | |
| "reward_std": 0.2421913631260395, | |
| "rewards/accuracy_reward": 0.4791666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5920139104127884, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 749.2708435058594, | |
| "epoch": 0.7338129496402878, | |
| "grad_norm": 1.144909232594251, | |
| "kl": 0.1258544921875, | |
| "learning_rate": 2.7881441362933464e-07, | |
| "loss": 0.066, | |
| "reward": 1.045138880610466, | |
| "reward_std": 0.3445451110601425, | |
| "rewards/accuracy_reward": 0.430555559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6145833432674408, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 723.8055572509766, | |
| "epoch": 0.7386091127098321, | |
| "grad_norm": 0.9998994389867698, | |
| "kl": 0.1502685546875, | |
| "learning_rate": 2.7281996991655145e-07, | |
| "loss": 0.0722, | |
| "reward": 1.1649305671453476, | |
| "reward_std": 0.4142308458685875, | |
| "rewards/accuracy_reward": 0.5555555671453476, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.609375, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 712.1736297607422, | |
| "epoch": 0.7434052757793765, | |
| "grad_norm": 0.8719467247990521, | |
| "kl": 0.1572265625, | |
| "learning_rate": 2.669037552896172e-07, | |
| "loss": 0.0362, | |
| "reward": 1.1753472536802292, | |
| "reward_std": 0.4305378869175911, | |
| "rewards/accuracy_reward": 0.5277777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6475694477558136, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 792.1458282470703, | |
| "epoch": 0.7482014388489209, | |
| "grad_norm": 1.5075628106035353, | |
| "kl": 0.19970703125, | |
| "learning_rate": 2.6106743949386586e-07, | |
| "loss": 0.0657, | |
| "reward": 1.0902777910232544, | |
| "reward_std": 0.39609089493751526, | |
| "rewards/accuracy_reward": 0.4652777910232544, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6250000149011612, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 720.9791717529297, | |
| "epoch": 0.7529976019184652, | |
| "grad_norm": 1.1801204435963506, | |
| "kl": 0.14697265625, | |
| "learning_rate": 2.553126697246217e-07, | |
| "loss": 0.0499, | |
| "reward": 1.1493055820465088, | |
| "reward_std": 0.40563249588012695, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6076388955116272, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 708.5486297607422, | |
| "epoch": 0.7577937649880095, | |
| "grad_norm": 1.3449351919779642, | |
| "kl": 0.1728515625, | |
| "learning_rate": 2.49641070162307e-07, | |
| "loss": 0.0543, | |
| "reward": 1.2118056118488312, | |
| "reward_std": 0.3700602427124977, | |
| "rewards/accuracy_reward": 0.5833333507180214, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6284722238779068, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 757.7916870117188, | |
| "epoch": 0.762589928057554, | |
| "grad_norm": 1.141025370807687, | |
| "kl": 0.19677734375, | |
| "learning_rate": 2.440542415140466e-07, | |
| "loss": 0.0881, | |
| "reward": 1.1232638955116272, | |
| "reward_std": 0.4029542878270149, | |
| "rewards/accuracy_reward": 0.493055559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6302083432674408, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 677.4652862548828, | |
| "epoch": 0.7673860911270983, | |
| "grad_norm": 1.6300168405182138, | |
| "kl": 0.2421875, | |
| "learning_rate": 2.3855376056189737e-07, | |
| "loss": 0.058, | |
| "reward": 1.3194444477558136, | |
| "reward_std": 0.44138168543577194, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7152777761220932, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 664.5694580078125, | |
| "epoch": 0.7721822541966427, | |
| "grad_norm": 1.5733647270439453, | |
| "kl": 0.20703125, | |
| "learning_rate": 2.3314117971782945e-07, | |
| "loss": 0.0652, | |
| "reward": 1.1788194477558136, | |
| "reward_std": 0.3714512586593628, | |
| "rewards/accuracy_reward": 0.541666679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6371527910232544, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 769.9722290039062, | |
| "epoch": 0.7769784172661871, | |
| "grad_norm": 1.533175643219152, | |
| "kl": 0.23095703125, | |
| "learning_rate": 2.2781802658558635e-07, | |
| "loss": 0.0533, | |
| "reward": 0.984375, | |
| "reward_std": 0.39164651185274124, | |
| "rewards/accuracy_reward": 0.4027777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.581597238779068, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.8125, | |
| "epoch": 0.7817745803357314, | |
| "grad_norm": 0.9142708944559201, | |
| "kl": 0.22607421875, | |
| "learning_rate": 2.2258580352954552e-07, | |
| "loss": 0.0356, | |
| "reward": 1.1076388955116272, | |
| "reward_std": 0.32901762425899506, | |
| "rewards/accuracy_reward": 0.472222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6354166716337204, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 743.4930572509766, | |
| "epoch": 0.7865707434052758, | |
| "grad_norm": 1.564329334465899, | |
| "kl": 0.3662109375, | |
| "learning_rate": 2.1744598725070347e-07, | |
| "loss": 0.0512, | |
| "reward": 1.0538194477558136, | |
| "reward_std": 0.28839075565338135, | |
| "rewards/accuracy_reward": 0.4652777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.5885416716337204, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 739.3611145019531, | |
| "epoch": 0.7913669064748201, | |
| "grad_norm": 1.1882320571551739, | |
| "kl": 0.28759765625, | |
| "learning_rate": 2.1240002836990328e-07, | |
| "loss": 0.0243, | |
| "reward": 1.1527777910232544, | |
| "reward_std": 0.3735358491539955, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6736111044883728, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 706.4166717529297, | |
| "epoch": 0.7961630695443646, | |
| "grad_norm": 1.201792642717349, | |
| "kl": 0.32080078125, | |
| "learning_rate": 2.0744935101842275e-07, | |
| "loss": 0.0349, | |
| "reward": 1.1701389253139496, | |
| "reward_std": 0.3558007851243019, | |
| "rewards/accuracy_reward": 0.4930555671453476, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6770833283662796, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 669.9444580078125, | |
| "epoch": 0.8009592326139089, | |
| "grad_norm": 1.415131555543672, | |
| "kl": 0.339111328125, | |
| "learning_rate": 2.025953524360396e-07, | |
| "loss": 0.0588, | |
| "reward": 1.2465277910232544, | |
| "reward_std": 0.3056763559579849, | |
| "rewards/accuracy_reward": 0.6250000074505806, | |
| "rewards/format_reward": 0.0069444444961845875, | |
| "rewards/tag_count_reward": 0.6145833358168602, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 690.5277862548828, | |
| "epoch": 0.8057553956834532, | |
| "grad_norm": 1.8031709952858856, | |
| "kl": 0.342041015625, | |
| "learning_rate": 1.9783940257668473e-07, | |
| "loss": 0.1108, | |
| "reward": 1.1909722089767456, | |
| "reward_std": 0.42986829578876495, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6493055671453476, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 731.9027862548828, | |
| "epoch": 0.8105515587529976, | |
| "grad_norm": 1.54016513318198, | |
| "kl": 0.34375, | |
| "learning_rate": 1.9318284372179783e-07, | |
| "loss": 0.0829, | |
| "reward": 1.0902778059244156, | |
| "reward_std": 0.3709410950541496, | |
| "rewards/accuracy_reward": 0.4791666641831398, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.611111119389534, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 678.4305572509766, | |
| "epoch": 0.815347721822542, | |
| "grad_norm": 1.2889679479137333, | |
| "kl": 0.310791015625, | |
| "learning_rate": 1.8862699010149265e-07, | |
| "loss": 0.0637, | |
| "reward": 1.1493055820465088, | |
| "reward_std": 0.4024455025792122, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6493055671453476, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 666.5764007568359, | |
| "epoch": 0.8201438848920863, | |
| "grad_norm": 1.2493403412894837, | |
| "kl": 0.37060546875, | |
| "learning_rate": 1.8417312752363842e-07, | |
| "loss": 0.0292, | |
| "reward": 1.2986111342906952, | |
| "reward_std": 0.39357686042785645, | |
| "rewards/accuracy_reward": 0.5972222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7013888955116272, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 680.5, | |
| "epoch": 0.8249400479616307, | |
| "grad_norm": 1.3917142124264024, | |
| "kl": 0.289794921875, | |
| "learning_rate": 1.7982251301096496e-07, | |
| "loss": 0.0587, | |
| "reward": 1.2343749701976776, | |
| "reward_std": 0.3718552738428116, | |
| "rewards/accuracy_reward": 0.569444440305233, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6649305671453476, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 762.5139007568359, | |
| "epoch": 0.829736211031175, | |
| "grad_norm": 2.082423982787284, | |
| "kl": 0.37939453125, | |
| "learning_rate": 1.7557637444628934e-07, | |
| "loss": 0.0734, | |
| "reward": 1.0295139104127884, | |
| "reward_std": 0.42010512948036194, | |
| "rewards/accuracy_reward": 0.4166666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.612847238779068, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.2777862548828, | |
| "epoch": 0.8345323741007195, | |
| "grad_norm": 1.4700469055914496, | |
| "kl": 0.31884765625, | |
| "learning_rate": 1.7143591022596842e-07, | |
| "loss": 0.0462, | |
| "reward": 1.0850694626569748, | |
| "reward_std": 0.3826001510024071, | |
| "rewards/accuracy_reward": 0.4513888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6336805373430252, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 694.0694427490234, | |
| "epoch": 0.8393285371702638, | |
| "grad_norm": 1.3297784898512763, | |
| "kl": 0.38623046875, | |
| "learning_rate": 1.674022889216737e-07, | |
| "loss": 0.0566, | |
| "reward": 1.2083333134651184, | |
| "reward_std": 0.36128322780132294, | |
| "rewards/accuracy_reward": 0.5486111044883728, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.659722238779068, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 678.7152862548828, | |
| "epoch": 0.8441247002398081, | |
| "grad_norm": 1.4970986391973622, | |
| "kl": 0.312744140625, | |
| "learning_rate": 1.634766489505815e-07, | |
| "loss": 0.0584, | |
| "reward": 1.2951389253139496, | |
| "reward_std": 0.39571166411042213, | |
| "rewards/accuracy_reward": 0.6180555522441864, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6770833283662796, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 650.5069580078125, | |
| "epoch": 0.8489208633093526, | |
| "grad_norm": 1.3198452732539876, | |
| "kl": 0.282958984375, | |
| "learning_rate": 1.5966009825407664e-07, | |
| "loss": 0.0487, | |
| "reward": 1.2291666865348816, | |
| "reward_std": 0.41252629458904266, | |
| "rewards/accuracy_reward": 0.5763888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6527777910232544, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 709.4861145019531, | |
| "epoch": 0.8537170263788969, | |
| "grad_norm": 1.571938422464948, | |
| "kl": 0.275146484375, | |
| "learning_rate": 1.5595371398505497e-07, | |
| "loss": 0.0601, | |
| "reward": 1.1354167014360428, | |
| "reward_std": 0.3936513438820839, | |
| "rewards/accuracy_reward": 0.4444444552063942, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6909722238779068, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 644.2847290039062, | |
| "epoch": 0.8585131894484412, | |
| "grad_norm": 1.2655481838867129, | |
| "kl": 0.313232421875, | |
| "learning_rate": 1.523585422039165e-07, | |
| "loss": 0.0395, | |
| "reward": 1.2447916567325592, | |
| "reward_std": 0.3132231794297695, | |
| "rewards/accuracy_reward": 0.5555555671453476, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.689236119389534, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 727.8055725097656, | |
| "epoch": 0.8633093525179856, | |
| "grad_norm": 2.287249210625076, | |
| "kl": 0.4541015625, | |
| "learning_rate": 1.4887559758333408e-07, | |
| "loss": 0.0809, | |
| "reward": 1.1718749850988388, | |
| "reward_std": 0.4368325099349022, | |
| "rewards/accuracy_reward": 0.5069444477558136, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6649305671453476, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 677.8541641235352, | |
| "epoch": 0.86810551558753, | |
| "grad_norm": 2.029066779031394, | |
| "kl": 0.44677734375, | |
| "learning_rate": 1.4550586312187919e-07, | |
| "loss": 0.0318, | |
| "reward": 1.2395833432674408, | |
| "reward_std": 0.34451349824666977, | |
| "rewards/accuracy_reward": 0.597222238779068, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.642361119389534, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 668.8194427490234, | |
| "epoch": 0.8729016786570744, | |
| "grad_norm": 1.6348530863651882, | |
| "kl": 0.3857421875, | |
| "learning_rate": 1.4225028986658965e-07, | |
| "loss": 0.0692, | |
| "reward": 1.2500000298023224, | |
| "reward_std": 0.4199504852294922, | |
| "rewards/accuracy_reward": 0.5763888880610466, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.673611119389534, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 739.4930725097656, | |
| "epoch": 0.8776978417266187, | |
| "grad_norm": 2.171960155817612, | |
| "kl": 0.31787109375, | |
| "learning_rate": 1.391097966445526e-07, | |
| "loss": 0.0609, | |
| "reward": 1.1805555820465088, | |
| "reward_std": 0.3399686738848686, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.659722238779068, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 710.2638854980469, | |
| "epoch": 0.882494004796163, | |
| "grad_norm": 2.354582366359387, | |
| "kl": 0.439208984375, | |
| "learning_rate": 1.3608526980358242e-07, | |
| "loss": 0.1236, | |
| "reward": 1.1701389104127884, | |
| "reward_std": 0.3848187327384949, | |
| "rewards/accuracy_reward": 0.5, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.670138880610466, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 740.0833435058594, | |
| "epoch": 0.8872901678657075, | |
| "grad_norm": 1.0825401183719674, | |
| "kl": 0.34619140625, | |
| "learning_rate": 1.331775629620653e-07, | |
| "loss": 0.0486, | |
| "reward": 1.1493055522441864, | |
| "reward_std": 0.36640702188014984, | |
| "rewards/accuracy_reward": 0.4861111268401146, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6631944477558136, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 628.0208282470703, | |
| "epoch": 0.8920863309352518, | |
| "grad_norm": 1.7907760466473297, | |
| "kl": 0.5361328125, | |
| "learning_rate": 1.303874967680399e-07, | |
| "loss": 0.0542, | |
| "reward": 1.2604166567325592, | |
| "reward_std": 0.4223191514611244, | |
| "rewards/accuracy_reward": 0.5694444552063942, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.690972238779068, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.2708358764648, | |
| "epoch": 0.8968824940047961, | |
| "grad_norm": 1.7941484108862593, | |
| "kl": 0.375, | |
| "learning_rate": 1.277158586675852e-07, | |
| "loss": 0.0782, | |
| "reward": 1.1996527910232544, | |
| "reward_std": 0.33358532190322876, | |
| "rewards/accuracy_reward": 0.493055559694767, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.706597238779068, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 774.6180725097656, | |
| "epoch": 0.9016786570743405, | |
| "grad_norm": 1.3540196149379808, | |
| "kl": 0.42333984375, | |
| "learning_rate": 1.2516340268257737e-07, | |
| "loss": 0.0613, | |
| "reward": 1.065972238779068, | |
| "reward_std": 0.3640429899096489, | |
| "rewards/accuracy_reward": 0.3819444477558136, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6840277910232544, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 673.5208511352539, | |
| "epoch": 0.9064748201438849, | |
| "grad_norm": 2.3552439559585534, | |
| "kl": 0.56640625, | |
| "learning_rate": 1.2273084919788063e-07, | |
| "loss": 0.0419, | |
| "reward": 1.2378471940755844, | |
| "reward_std": 0.40937893092632294, | |
| "rewards/accuracy_reward": 0.5833333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.654513880610466, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 680.7222137451172, | |
| "epoch": 0.9112709832134293, | |
| "grad_norm": 2.0506106800441746, | |
| "kl": 0.62939453125, | |
| "learning_rate": 1.2041888475803217e-07, | |
| "loss": 0.0987, | |
| "reward": 1.1649305671453476, | |
| "reward_std": 0.4104561358690262, | |
| "rewards/accuracy_reward": 0.5208333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6440972238779068, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 652.5694580078125, | |
| "epoch": 0.9160671462829736, | |
| "grad_norm": 2.011461189500051, | |
| "kl": 0.650390625, | |
| "learning_rate": 1.1822816187347622e-07, | |
| "loss": 0.1134, | |
| "reward": 1.1857638955116272, | |
| "reward_std": 0.4204775467514992, | |
| "rewards/accuracy_reward": 0.5347222313284874, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6510416716337204, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 672.8889007568359, | |
| "epoch": 0.920863309352518, | |
| "grad_norm": 1.766556315204042, | |
| "kl": 0.52197265625, | |
| "learning_rate": 1.1615929883640567e-07, | |
| "loss": 0.0868, | |
| "reward": 1.2239583283662796, | |
| "reward_std": 0.37772539258003235, | |
| "rewards/accuracy_reward": 0.5486111268401146, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.675347238779068, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 705.6805572509766, | |
| "epoch": 0.9256594724220624, | |
| "grad_norm": 1.2464963403957747, | |
| "kl": 0.42431640625, | |
| "learning_rate": 1.1421287954625985e-07, | |
| "loss": 0.0538, | |
| "reward": 1.2118055522441864, | |
| "reward_std": 0.3169648088514805, | |
| "rewards/accuracy_reward": 0.5416666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6701388955116272, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 735.0416870117188, | |
| "epoch": 0.9304556354916067, | |
| "grad_norm": 1.4114342882843525, | |
| "kl": 0.4072265625, | |
| "learning_rate": 1.1238945334492928e-07, | |
| "loss": 0.038, | |
| "reward": 1.1388888955116272, | |
| "reward_std": 0.3398313596844673, | |
| "rewards/accuracy_reward": 0.4791666716337204, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.659722238779068, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.25, | |
| "epoch": 0.935251798561151, | |
| "grad_norm": 1.3534299207814684, | |
| "kl": 0.740234375, | |
| "learning_rate": 1.1068953486171385e-07, | |
| "loss": 0.0948, | |
| "reward": 1.14930559694767, | |
| "reward_std": 0.4659058451652527, | |
| "rewards/accuracy_reward": 0.5138888955116272, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6354166716337204, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 767.3125, | |
| "epoch": 0.9400479616306955, | |
| "grad_norm": 1.8926261098268726, | |
| "kl": 0.6171875, | |
| "learning_rate": 1.0911360386807814e-07, | |
| "loss": 0.0999, | |
| "reward": 1.034722238779068, | |
| "reward_std": 0.3850885070860386, | |
| "rewards/accuracy_reward": 0.423611119389534, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6111111342906952, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 671.8333282470703, | |
| "epoch": 0.9448441247002398, | |
| "grad_norm": 2.2335073755128354, | |
| "kl": 0.63671875, | |
| "learning_rate": 1.0766210514224419e-07, | |
| "loss": 0.0608, | |
| "reward": 1.206597238779068, | |
| "reward_std": 0.39280908554792404, | |
| "rewards/accuracy_reward": 0.5486111342906952, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.657986119389534, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 690.3472137451172, | |
| "epoch": 0.9496402877697842, | |
| "grad_norm": 2.0084883063292747, | |
| "kl": 0.505859375, | |
| "learning_rate": 1.0633544834366123e-07, | |
| "loss": 0.1037, | |
| "reward": 1.2291666865348816, | |
| "reward_std": 0.44404156506061554, | |
| "rewards/accuracy_reward": 0.5277777835726738, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7013888955116272, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 706.125, | |
| "epoch": 0.9544364508393285, | |
| "grad_norm": 2.8291221690849255, | |
| "kl": 0.67822265625, | |
| "learning_rate": 1.051340078973863e-07, | |
| "loss": 0.084, | |
| "reward": 1.0954861044883728, | |
| "reward_std": 0.43709662556648254, | |
| "rewards/accuracy_reward": 0.4375, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6579861044883728, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 670.6319427490234, | |
| "epoch": 0.9592326139088729, | |
| "grad_norm": 1.3994189437001958, | |
| "kl": 0.4013671875, | |
| "learning_rate": 1.0405812288840964e-07, | |
| "loss": 0.0765, | |
| "reward": 1.2777777761220932, | |
| "reward_std": 0.34174390137195587, | |
| "rewards/accuracy_reward": 0.5763889029622078, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7013888955116272, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 629.4861145019531, | |
| "epoch": 0.9640287769784173, | |
| "grad_norm": 1.434366201343452, | |
| "kl": 0.3046875, | |
| "learning_rate": 1.031080969659543e-07, | |
| "loss": 0.0855, | |
| "reward": 1.3107638955116272, | |
| "reward_std": 0.34832194447517395, | |
| "rewards/accuracy_reward": 0.604166679084301, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7065972238779068, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 665.0833587646484, | |
| "epoch": 0.9688249400479616, | |
| "grad_norm": 1.5034799191037997, | |
| "kl": 0.46337890625, | |
| "learning_rate": 1.0228419825777602e-07, | |
| "loss": 0.0582, | |
| "reward": 1.2343750298023224, | |
| "reward_std": 0.4124446362257004, | |
| "rewards/accuracy_reward": 0.5763889029622078, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6579861342906952, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 781.5555572509766, | |
| "epoch": 0.973621103117506, | |
| "grad_norm": 2.678396312950966, | |
| "kl": 0.5263671875, | |
| "learning_rate": 1.0158665929448951e-07, | |
| "loss": 0.0947, | |
| "reward": 1.0694444477558136, | |
| "reward_std": 0.42056479305028915, | |
| "rewards/accuracy_reward": 0.3958333283662796, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6736111044883728, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 618.8888854980469, | |
| "epoch": 0.9784172661870504, | |
| "grad_norm": 2.12368752310343, | |
| "kl": 0.5556640625, | |
| "learning_rate": 1.0101567694394071e-07, | |
| "loss": 0.1194, | |
| "reward": 1.3229166567325592, | |
| "reward_std": 0.41604190319776535, | |
| "rewards/accuracy_reward": 0.6111111268401146, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.7118055522441864, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 715.125, | |
| "epoch": 0.9832134292565947, | |
| "grad_norm": 1.6931563983674274, | |
| "kl": 0.5595703125, | |
| "learning_rate": 1.0057141235564423e-07, | |
| "loss": 0.0796, | |
| "reward": 1.1458333432674408, | |
| "reward_std": 0.39061587303876877, | |
| "rewards/accuracy_reward": 0.5000000074505806, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6458333432674408, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 721.2014007568359, | |
| "epoch": 0.988009592326139, | |
| "grad_norm": 1.6094244664912092, | |
| "kl": 0.51513671875, | |
| "learning_rate": 1.0025399091530193e-07, | |
| "loss": 0.0913, | |
| "reward": 1.2239583432674408, | |
| "reward_std": 0.34610963612794876, | |
| "rewards/accuracy_reward": 0.5347222089767456, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6892361044883728, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 612.5069351196289, | |
| "epoch": 0.9928057553956835, | |
| "grad_norm": 1.7144937573704717, | |
| "kl": 0.45556640625, | |
| "learning_rate": 1.0006350220941502e-07, | |
| "loss": 0.0338, | |
| "reward": 1.3229166865348816, | |
| "reward_std": 0.37486525624990463, | |
| "rewards/accuracy_reward": 0.5833333432674408, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.739583358168602, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.1250076293945, | |
| "epoch": 0.9976019184652278, | |
| "grad_norm": 1.1702049182470995, | |
| "kl": 0.43017578125, | |
| "learning_rate": 1e-07, | |
| "loss": 0.02, | |
| "reward": 1.237847238779068, | |
| "reward_std": 0.40600838512182236, | |
| "rewards/accuracy_reward": 0.5625, | |
| "rewards/format_reward": 0.0, | |
| "rewards/tag_count_reward": 0.6753472089767456, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.9976019184652278, | |
| "step": 208, | |
| "total_flos": 0.0, | |
| "train_loss": 0.05009339519780882, | |
| "train_runtime": 7148.6291, | |
| "train_samples_per_second": 1.049, | |
| "train_steps_per_second": 0.029 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 208, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 12, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |