| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 10000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 75.6, |
| "epoch": 0.002, |
| "grad_norm": 0.0, |
| "kl": 0.0, |
| "learning_rate": 5.0000000000000004e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 10 |
| }, |
| { |
| "completion_length": 69.0, |
| "epoch": 0.004, |
| "grad_norm": 8.249282836914062e-05, |
| "kl": 0.0007458075881004334, |
| "learning_rate": 1.0000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 20 |
| }, |
| { |
| "completion_length": 65.25, |
| "epoch": 0.006, |
| "grad_norm": 8.726119995117188e-05, |
| "kl": 0.0008603519352618604, |
| "learning_rate": 1.5000000000000002e-07, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 30 |
| }, |
| { |
| "completion_length": 51.1, |
| "epoch": 0.008, |
| "grad_norm": 0.0001392364501953125, |
| "kl": 0.0006604890164453536, |
| "learning_rate": 2.0000000000000002e-07, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 40 |
| }, |
| { |
| "completion_length": 81.475, |
| "epoch": 0.01, |
| "grad_norm": 10.3125, |
| "kl": 0.0008120269441860728, |
| "learning_rate": 2.5000000000000004e-07, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.3154700517654419, |
| "rewards/reward_func": -0.2, |
| "step": 50 |
| }, |
| { |
| "completion_length": 82.3, |
| "epoch": 0.012, |
| "grad_norm": 0.000148773193359375, |
| "kl": 0.0009021399382618256, |
| "learning_rate": 3.0000000000000004e-07, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.2, |
| "rewards/reward_func": -0.1, |
| "step": 60 |
| }, |
| { |
| "completion_length": 57.675, |
| "epoch": 0.014, |
| "grad_norm": 0.000453948974609375, |
| "kl": 0.0008941800828324631, |
| "learning_rate": 3.5000000000000004e-07, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 70 |
| }, |
| { |
| "completion_length": 70.475, |
| "epoch": 0.016, |
| "grad_norm": 7.90625, |
| "kl": 0.0010760451084934175, |
| "learning_rate": 4.0000000000000003e-07, |
| "loss": 0.0, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.2, |
| "rewards/reward_func": -0.25, |
| "step": 80 |
| }, |
| { |
| "completion_length": 56.2, |
| "epoch": 0.018, |
| "grad_norm": 8.535385131835938e-05, |
| "kl": 0.001845199626404792, |
| "learning_rate": 4.5000000000000003e-07, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 90 |
| }, |
| { |
| "completion_length": 63.5, |
| "epoch": 0.02, |
| "grad_norm": 0.00011777877807617188, |
| "kl": 0.0007161700828874018, |
| "learning_rate": 5.000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.725, |
| "reward": -0.275, |
| "reward_std": 0.20773502588272094, |
| "rewards/reward_func": -0.275, |
| "step": 100 |
| }, |
| { |
| "completion_length": 88.175, |
| "epoch": 0.022, |
| "grad_norm": 0.000125885009765625, |
| "kl": 0.001011227659182623, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.1, |
| "step": 110 |
| }, |
| { |
| "completion_length": 69.725, |
| "epoch": 0.024, |
| "grad_norm": 0.00010633468627929688, |
| "kl": 0.0008000041969353333, |
| "learning_rate": 6.000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.25, |
| "rewards/reward_func": -0.125, |
| "step": 120 |
| }, |
| { |
| "completion_length": 72.35, |
| "epoch": 0.026, |
| "grad_norm": 0.00012159347534179688, |
| "kl": 0.0010485154576599597, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 130 |
| }, |
| { |
| "completion_length": 75.575, |
| "epoch": 0.028, |
| "grad_norm": 0.00017547607421875, |
| "kl": 0.0007810671289917081, |
| "learning_rate": 7.000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 140 |
| }, |
| { |
| "completion_length": 79.65, |
| "epoch": 0.03, |
| "grad_norm": 8.0625, |
| "kl": 0.0008353532728506252, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 150 |
| }, |
| { |
| "completion_length": 52.075, |
| "epoch": 0.032, |
| "grad_norm": 33.25, |
| "kl": 0.0031860046496149153, |
| "learning_rate": 8.000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.125, |
| "step": 160 |
| }, |
| { |
| "completion_length": 58.825, |
| "epoch": 0.034, |
| "grad_norm": 15.6875, |
| "kl": 0.001403974276036024, |
| "learning_rate": 8.500000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 170 |
| }, |
| { |
| "completion_length": 70.95, |
| "epoch": 0.036, |
| "grad_norm": 0.0002956390380859375, |
| "kl": 0.0009778408275451511, |
| "learning_rate": 9.000000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 180 |
| }, |
| { |
| "completion_length": 62.25, |
| "epoch": 0.038, |
| "grad_norm": 0.00011587142944335938, |
| "kl": 0.001159800120512955, |
| "learning_rate": 9.500000000000001e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 190 |
| }, |
| { |
| "completion_length": 68.7, |
| "epoch": 0.04, |
| "grad_norm": 0.0001430511474609375, |
| "kl": 0.0036639797501266, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 200 |
| }, |
| { |
| "completion_length": 50.675, |
| "epoch": 0.042, |
| "grad_norm": 7.534027099609375e-05, |
| "kl": 0.002666096478151303, |
| "learning_rate": 1.0500000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 0.775, |
| "reward": -0.225, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.225, |
| "step": 210 |
| }, |
| { |
| "completion_length": 36.8, |
| "epoch": 0.044, |
| "grad_norm": 0.00018215179443359375, |
| "kl": 0.014809455376234838, |
| "learning_rate": 1.1e-06, |
| "loss": 0.0, |
| "match_ratio": 0.725, |
| "reward": -0.275, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.275, |
| "step": 220 |
| }, |
| { |
| "completion_length": 66.75, |
| "epoch": 0.046, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 0.007854271659743972, |
| "learning_rate": 1.1500000000000002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.20773502588272094, |
| "rewards/reward_func": -0.125, |
| "step": 230 |
| }, |
| { |
| "completion_length": 76.35, |
| "epoch": 0.048, |
| "grad_norm": 17.875, |
| "kl": 1.018031721841544, |
| "learning_rate": 1.2000000000000002e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.2, |
| "rewards/reward_func": -0.2, |
| "step": 240 |
| }, |
| { |
| "completion_length": 59.075, |
| "epoch": 0.05, |
| "grad_norm": 8.869171142578125e-05, |
| "kl": 0.13859437993960455, |
| "learning_rate": 1.25e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 250 |
| }, |
| { |
| "completion_length": 55.95, |
| "epoch": 0.052, |
| "grad_norm": 0.458984375, |
| "kl": 0.44334062208363323, |
| "learning_rate": 1.3e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 260 |
| }, |
| { |
| "completion_length": 62.5, |
| "epoch": 0.054, |
| "grad_norm": 0.000209808349609375, |
| "kl": 1.4304919777001488, |
| "learning_rate": 1.3500000000000002e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.125, |
| "step": 270 |
| }, |
| { |
| "completion_length": 59.125, |
| "epoch": 0.056, |
| "grad_norm": 7.963180541992188e-05, |
| "kl": 0.3974201448727399, |
| "learning_rate": 1.4000000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 0.575, |
| "reward": -0.425, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.425, |
| "step": 280 |
| }, |
| { |
| "completion_length": 61.75, |
| "epoch": 0.058, |
| "grad_norm": 0.00015926361083984375, |
| "kl": 0.6299153287603986, |
| "learning_rate": 1.45e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.775, |
| "reward": -0.225, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.225, |
| "step": 290 |
| }, |
| { |
| "completion_length": 94.275, |
| "epoch": 0.06, |
| "grad_norm": 0.00010347366333007812, |
| "kl": 0.007541297184070572, |
| "learning_rate": 1.5e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.075, |
| "step": 300 |
| }, |
| { |
| "completion_length": 55.45, |
| "epoch": 0.062, |
| "grad_norm": 0.000507354736328125, |
| "kl": 0.8210757704044227, |
| "learning_rate": 1.5500000000000002e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 310 |
| }, |
| { |
| "completion_length": 63.475, |
| "epoch": 0.064, |
| "grad_norm": 0.000396728515625, |
| "kl": 0.10055320091196336, |
| "learning_rate": 1.6000000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 320 |
| }, |
| { |
| "completion_length": 82.15, |
| "epoch": 0.066, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.37705419784761035, |
| "learning_rate": 1.6500000000000003e-06, |
| "loss": 0.0, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.175, |
| "step": 330 |
| }, |
| { |
| "completion_length": 51.475, |
| "epoch": 0.068, |
| "grad_norm": 0.00014400482177734375, |
| "kl": 41.57116786188563, |
| "learning_rate": 1.7000000000000002e-06, |
| "loss": 0.0042, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 340 |
| }, |
| { |
| "completion_length": 55.525, |
| "epoch": 0.07, |
| "grad_norm": 0.0002689361572265625, |
| "kl": 1.14909179067472, |
| "learning_rate": 1.75e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.725, |
| "reward": -0.275, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.275, |
| "step": 350 |
| }, |
| { |
| "completion_length": 47.725, |
| "epoch": 0.072, |
| "grad_norm": 4.28125, |
| "kl": 1.577250469638966, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.675, |
| "reward": -0.325, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.325, |
| "step": 360 |
| }, |
| { |
| "completion_length": 61.825, |
| "epoch": 0.074, |
| "grad_norm": 0.00017452239990234375, |
| "kl": 0.34925971169723197, |
| "learning_rate": 1.85e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 370 |
| }, |
| { |
| "completion_length": 55.825, |
| "epoch": 0.076, |
| "grad_norm": 0.00066375732421875, |
| "kl": 0.1755049143510405, |
| "learning_rate": 1.9000000000000002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.075, |
| "step": 380 |
| }, |
| { |
| "completion_length": 49.9, |
| "epoch": 0.078, |
| "grad_norm": 0.0164794921875, |
| "kl": 0.019992552557960154, |
| "learning_rate": 1.9500000000000004e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 390 |
| }, |
| { |
| "completion_length": 63.85, |
| "epoch": 0.08, |
| "grad_norm": 0.0002002716064453125, |
| "kl": 4.795912343251985, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.2, |
| "rewards/reward_func": -0.1, |
| "step": 400 |
| }, |
| { |
| "completion_length": 56.6, |
| "epoch": 0.082, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 1.1443448643549345, |
| "learning_rate": 2.05e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 410 |
| }, |
| { |
| "completion_length": 58.725, |
| "epoch": 0.084, |
| "grad_norm": 0.0002765655517578125, |
| "kl": 0.0028327183797955515, |
| "learning_rate": 2.1000000000000002e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 420 |
| }, |
| { |
| "completion_length": 47.1, |
| "epoch": 0.086, |
| "grad_norm": 0.0010986328125, |
| "kl": 0.008324940234888344, |
| "learning_rate": 2.15e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 430 |
| }, |
| { |
| "completion_length": 65.75, |
| "epoch": 0.088, |
| "grad_norm": 11.9375, |
| "kl": 0.006597463192883879, |
| "learning_rate": 2.2e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 440 |
| }, |
| { |
| "completion_length": 68.975, |
| "epoch": 0.09, |
| "grad_norm": 2064.0, |
| "kl": 405.5571417377796, |
| "learning_rate": 2.25e-06, |
| "loss": 0.0406, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 450 |
| }, |
| { |
| "completion_length": 56.975, |
| "epoch": 0.092, |
| "grad_norm": 0.00038909912109375, |
| "kl": 0.05630149020580575, |
| "learning_rate": 2.3000000000000004e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 460 |
| }, |
| { |
| "completion_length": 61.075, |
| "epoch": 0.094, |
| "grad_norm": 0.000446319580078125, |
| "kl": 0.005178993823938072, |
| "learning_rate": 2.35e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 73.325, |
| "epoch": 0.096, |
| "grad_norm": 0.0002613067626953125, |
| "kl": 4.652890888956608, |
| "learning_rate": 2.4000000000000003e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 480 |
| }, |
| { |
| "completion_length": 68.625, |
| "epoch": 0.098, |
| "grad_norm": 0.000164031982421875, |
| "kl": 15.9451186191116, |
| "learning_rate": 2.4500000000000003e-06, |
| "loss": 0.0016, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 490 |
| }, |
| { |
| "completion_length": 82.025, |
| "epoch": 0.1, |
| "grad_norm": 0.000453948974609375, |
| "kl": 0.006236090854508802, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 500 |
| }, |
| { |
| "completion_length": 73.05, |
| "epoch": 0.102, |
| "grad_norm": 0.00064849853515625, |
| "kl": 0.0709857388283126, |
| "learning_rate": 2.55e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.125, |
| "step": 510 |
| }, |
| { |
| "completion_length": 69.325, |
| "epoch": 0.104, |
| "grad_norm": 0.00048828125, |
| "kl": 517.9831573915319, |
| "learning_rate": 2.6e-06, |
| "loss": 0.0518, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 520 |
| }, |
| { |
| "completion_length": 57.95, |
| "epoch": 0.106, |
| "grad_norm": 0.00028228759765625, |
| "kl": 0.006235601624939591, |
| "learning_rate": 2.6500000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 530 |
| }, |
| { |
| "completion_length": 39.375, |
| "epoch": 0.108, |
| "grad_norm": 0.0023345947265625, |
| "kl": 592.0278737243498, |
| "learning_rate": 2.7000000000000004e-06, |
| "loss": 0.0592, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 540 |
| }, |
| { |
| "completion_length": 62.875, |
| "epoch": 0.11, |
| "grad_norm": 0.0003910064697265625, |
| "kl": 0.004806909896433354, |
| "learning_rate": 2.7500000000000004e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 550 |
| }, |
| { |
| "completion_length": 56.45, |
| "epoch": 0.112, |
| "grad_norm": 0.0002117156982421875, |
| "kl": 0.004208524071145803, |
| "learning_rate": 2.8000000000000003e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 560 |
| }, |
| { |
| "completion_length": 67.7, |
| "epoch": 0.114, |
| "grad_norm": 0.00016689300537109375, |
| "kl": 3.4651028811815197, |
| "learning_rate": 2.85e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 570 |
| }, |
| { |
| "completion_length": 51.525, |
| "epoch": 0.116, |
| "grad_norm": 0.040283203125, |
| "kl": 0.03394674692535773, |
| "learning_rate": 2.9e-06, |
| "loss": 0.0, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 580 |
| }, |
| { |
| "completion_length": 69.275, |
| "epoch": 0.118, |
| "grad_norm": 0.000823974609375, |
| "kl": 0.0029081626795232295, |
| "learning_rate": 2.95e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 590 |
| }, |
| { |
| "completion_length": 71.975, |
| "epoch": 0.12, |
| "grad_norm": 0.0004711151123046875, |
| "kl": 0.2187046888633631, |
| "learning_rate": 3e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.20773502588272094, |
| "rewards/reward_func": -0.125, |
| "step": 600 |
| }, |
| { |
| "completion_length": 61.85, |
| "epoch": 0.122, |
| "grad_norm": 24.25, |
| "kl": 13.38376448857598, |
| "learning_rate": 3.05e-06, |
| "loss": 0.0013, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 610 |
| }, |
| { |
| "completion_length": 53.5, |
| "epoch": 0.124, |
| "grad_norm": 0.0002498626708984375, |
| "kl": 0.006577977701090277, |
| "learning_rate": 3.1000000000000004e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 620 |
| }, |
| { |
| "completion_length": 59.925, |
| "epoch": 0.126, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.03976157886208966, |
| "learning_rate": 3.1500000000000003e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 630 |
| }, |
| { |
| "completion_length": 51.275, |
| "epoch": 0.128, |
| "grad_norm": 284.0, |
| "kl": 128.89907464290735, |
| "learning_rate": 3.2000000000000003e-06, |
| "loss": 0.0129, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 640 |
| }, |
| { |
| "completion_length": 57.525, |
| "epoch": 0.13, |
| "grad_norm": 0.000217437744140625, |
| "kl": 0.15632477974286302, |
| "learning_rate": 3.2500000000000002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.25, |
| "step": 650 |
| }, |
| { |
| "completion_length": 48.75, |
| "epoch": 0.132, |
| "grad_norm": 0.00035858154296875, |
| "kl": 0.012460133875720203, |
| "learning_rate": 3.3000000000000006e-06, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 660 |
| }, |
| { |
| "completion_length": 46.375, |
| "epoch": 0.134, |
| "grad_norm": 0.000560760498046875, |
| "kl": 0.005184722866397351, |
| "learning_rate": 3.3500000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 670 |
| }, |
| { |
| "completion_length": 65.7, |
| "epoch": 0.136, |
| "grad_norm": 21.625, |
| "kl": 0.03178581706015393, |
| "learning_rate": 3.4000000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 680 |
| }, |
| { |
| "completion_length": 70.9, |
| "epoch": 0.138, |
| "grad_norm": 0.00025177001953125, |
| "kl": 0.0047074495116248725, |
| "learning_rate": 3.45e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 690 |
| }, |
| { |
| "completion_length": 50.8, |
| "epoch": 0.14, |
| "grad_norm": 0.0001735687255859375, |
| "kl": 2.7392968325410036, |
| "learning_rate": 3.5e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.725, |
| "reward": -0.275, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.275, |
| "step": 700 |
| }, |
| { |
| "completion_length": 57.025, |
| "epoch": 0.142, |
| "grad_norm": 0.039794921875, |
| "kl": 179.23230375794228, |
| "learning_rate": 3.5500000000000003e-06, |
| "loss": 0.0179, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 710 |
| }, |
| { |
| "completion_length": 67.825, |
| "epoch": 0.144, |
| "grad_norm": 0.00042724609375, |
| "kl": 0.07836510783527047, |
| "learning_rate": 3.6000000000000003e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 720 |
| }, |
| { |
| "completion_length": 60.7, |
| "epoch": 0.146, |
| "grad_norm": 936.0, |
| "kl": 213.31693772624712, |
| "learning_rate": 3.65e-06, |
| "loss": 0.0213, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.125, |
| "step": 730 |
| }, |
| { |
| "completion_length": 56.275, |
| "epoch": 0.148, |
| "grad_norm": 0.00104522705078125, |
| "kl": 0.2562748788390309, |
| "learning_rate": 3.7e-06, |
| "loss": 0.0, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.25, |
| "step": 740 |
| }, |
| { |
| "completion_length": 63.85, |
| "epoch": 0.15, |
| "grad_norm": 0.000640869140625, |
| "kl": 0.869432492996566, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 750 |
| }, |
| { |
| "completion_length": 68.85, |
| "epoch": 0.152, |
| "grad_norm": 0.0002613067626953125, |
| "kl": 0.003684952133335173, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 760 |
| }, |
| { |
| "completion_length": 59.675, |
| "epoch": 0.154, |
| "grad_norm": 0.000286102294921875, |
| "kl": 0.011132878507487476, |
| "learning_rate": 3.85e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 770 |
| }, |
| { |
| "completion_length": 56.925, |
| "epoch": 0.156, |
| "grad_norm": 0.000244140625, |
| "kl": 431.3150466301013, |
| "learning_rate": 3.900000000000001e-06, |
| "loss": 0.0431, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 780 |
| }, |
| { |
| "completion_length": 59.075, |
| "epoch": 0.158, |
| "grad_norm": 0.00026702880859375, |
| "kl": 0.004525382234714925, |
| "learning_rate": 3.95e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 790 |
| }, |
| { |
| "completion_length": 41.475, |
| "epoch": 0.16, |
| "grad_norm": 0.000392913818359375, |
| "kl": 0.8990198554703965, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.25, |
| "step": 800 |
| }, |
| { |
| "completion_length": 49.35, |
| "epoch": 0.162, |
| "grad_norm": 0.0006103515625, |
| "kl": 10258.552996213792, |
| "learning_rate": 4.05e-06, |
| "loss": 1.0259, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 810 |
| }, |
| { |
| "completion_length": 47.675, |
| "epoch": 0.164, |
| "grad_norm": 0.00102996826171875, |
| "kl": 3237.7607058377935, |
| "learning_rate": 4.1e-06, |
| "loss": 0.3238, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 820 |
| }, |
| { |
| "completion_length": 63.375, |
| "epoch": 0.166, |
| "grad_norm": 0.0003414154052734375, |
| "kl": 4.472107960679568, |
| "learning_rate": 4.15e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 830 |
| }, |
| { |
| "completion_length": 70.375, |
| "epoch": 0.168, |
| "grad_norm": 0.00023174285888671875, |
| "kl": 0.005188186629675328, |
| "learning_rate": 4.2000000000000004e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 840 |
| }, |
| { |
| "completion_length": 64.975, |
| "epoch": 0.17, |
| "grad_norm": 0.00022029876708984375, |
| "kl": 0.019074565428309143, |
| "learning_rate": 4.25e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 850 |
| }, |
| { |
| "completion_length": 52.925, |
| "epoch": 0.172, |
| "grad_norm": 0.00029754638671875, |
| "kl": 4.278230914589949, |
| "learning_rate": 4.3e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.1, |
| "step": 860 |
| }, |
| { |
| "completion_length": 66.075, |
| "epoch": 0.174, |
| "grad_norm": 0.00021457672119140625, |
| "kl": 0.006756197474896908, |
| "learning_rate": 4.350000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 870 |
| }, |
| { |
| "completion_length": 62.075, |
| "epoch": 0.176, |
| "grad_norm": 7.9375, |
| "kl": 1679.0881600409746, |
| "learning_rate": 4.4e-06, |
| "loss": 0.1679, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 880 |
| }, |
| { |
| "completion_length": 57.475, |
| "epoch": 0.178, |
| "grad_norm": 0.000576019287109375, |
| "kl": 0.06974834711290896, |
| "learning_rate": 4.450000000000001e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 890 |
| }, |
| { |
| "completion_length": 59.25, |
| "epoch": 0.18, |
| "grad_norm": 0.000835418701171875, |
| "kl": 0.0047427960205823185, |
| "learning_rate": 4.5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 900 |
| }, |
| { |
| "completion_length": 49.925, |
| "epoch": 0.182, |
| "grad_norm": 0.000652313232421875, |
| "kl": 0.0047499775420874355, |
| "learning_rate": 4.5500000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 910 |
| }, |
| { |
| "completion_length": 63.825, |
| "epoch": 0.184, |
| "grad_norm": 72192.0, |
| "kl": 24955.292986106546, |
| "learning_rate": 4.600000000000001e-06, |
| "loss": 2.4955, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 920 |
| }, |
| { |
| "completion_length": 52.475, |
| "epoch": 0.186, |
| "grad_norm": 0.000362396240234375, |
| "kl": 0.6700008324347436, |
| "learning_rate": 4.65e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 930 |
| }, |
| { |
| "completion_length": 58.925, |
| "epoch": 0.188, |
| "grad_norm": 0.0002956390380859375, |
| "kl": 0.019399669324047863, |
| "learning_rate": 4.7e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 940 |
| }, |
| { |
| "completion_length": 62.1, |
| "epoch": 0.19, |
| "grad_norm": 0.0019378662109375, |
| "kl": 0.008174928580410778, |
| "learning_rate": 4.75e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 950 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 0.192, |
| "grad_norm": 0.0004138946533203125, |
| "kl": 1.113649177318439, |
| "learning_rate": 4.800000000000001e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 960 |
| }, |
| { |
| "completion_length": 56.225, |
| "epoch": 0.194, |
| "grad_norm": 0.00069427490234375, |
| "kl": 0.5510010560508818, |
| "learning_rate": 4.85e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 970 |
| }, |
| { |
| "completion_length": 47.175, |
| "epoch": 0.196, |
| "grad_norm": 0.0002155303955078125, |
| "kl": 1441.3666083157761, |
| "learning_rate": 4.9000000000000005e-06, |
| "loss": 0.1441, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 980 |
| }, |
| { |
| "completion_length": 46.85, |
| "epoch": 0.198, |
| "grad_norm": 0.000720977783203125, |
| "kl": 1.4619831766700373, |
| "learning_rate": 4.95e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 990 |
| }, |
| { |
| "completion_length": 65.55, |
| "epoch": 0.2, |
| "grad_norm": 0.0003871917724609375, |
| "kl": 7.501803689775988, |
| "learning_rate": 5e-06, |
| "loss": 0.0008, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 1000 |
| }, |
| { |
| "completion_length": 62.975, |
| "epoch": 0.202, |
| "grad_norm": 15.5625, |
| "kl": 0.10273585927207023, |
| "learning_rate": 4.999984769144476e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1010 |
| }, |
| { |
| "completion_length": 57.35, |
| "epoch": 0.204, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 3.634061038820073, |
| "learning_rate": 4.999939076763487e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 1020 |
| }, |
| { |
| "completion_length": 59.675, |
| "epoch": 0.206, |
| "grad_norm": 0.000507354736328125, |
| "kl": 0.45058006714098153, |
| "learning_rate": 4.999862923413781e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 1030 |
| }, |
| { |
| "completion_length": 44.05, |
| "epoch": 0.208, |
| "grad_norm": 0.0005645751953125, |
| "kl": 0.08316081156954169, |
| "learning_rate": 4.999756310023261e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1040 |
| }, |
| { |
| "completion_length": 79.225, |
| "epoch": 0.21, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.018303806148469447, |
| "learning_rate": 4.9996192378909785e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 1050 |
| }, |
| { |
| "completion_length": 57.4, |
| "epoch": 0.212, |
| "grad_norm": 0.00104522705078125, |
| "kl": 17796.077124893247, |
| "learning_rate": 4.999451708687114e-06, |
| "loss": 1.7796, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1060 |
| }, |
| { |
| "completion_length": 52.8, |
| "epoch": 0.214, |
| "grad_norm": 8.821487426757812e-05, |
| "kl": 6.741311503923498, |
| "learning_rate": 4.9992537244529585e-06, |
| "loss": 0.0007, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 1070 |
| }, |
| { |
| "completion_length": 56.675, |
| "epoch": 0.216, |
| "grad_norm": 0.00084686279296875, |
| "kl": 0.024167609214782716, |
| "learning_rate": 4.999025287600886e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1080 |
| }, |
| { |
| "completion_length": 61.325, |
| "epoch": 0.218, |
| "grad_norm": 0.0004634857177734375, |
| "kl": 0.03314157268032432, |
| "learning_rate": 4.998766400914329e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1090 |
| }, |
| { |
| "completion_length": 78.3, |
| "epoch": 0.22, |
| "grad_norm": 0.0003376007080078125, |
| "kl": 0.01341487793251872, |
| "learning_rate": 4.99847706754774e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1100 |
| }, |
| { |
| "completion_length": 54.725, |
| "epoch": 0.222, |
| "grad_norm": 0.000560760498046875, |
| "kl": 674.1007295364049, |
| "learning_rate": 4.998157291026553e-06, |
| "loss": 0.0674, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1110 |
| }, |
| { |
| "completion_length": 54.8, |
| "epoch": 0.224, |
| "grad_norm": 0.00119781494140625, |
| "kl": 10.044112924486399, |
| "learning_rate": 4.997807075247147e-06, |
| "loss": 0.001, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 1120 |
| }, |
| { |
| "completion_length": 49.275, |
| "epoch": 0.226, |
| "grad_norm": 0.0010833740234375, |
| "kl": 0.017626433167606592, |
| "learning_rate": 4.997426424476787e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1130 |
| }, |
| { |
| "completion_length": 57.625, |
| "epoch": 0.228, |
| "grad_norm": 0.0020599365234375, |
| "kl": 0.44624101794324816, |
| "learning_rate": 4.9970153433535855e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1140 |
| }, |
| { |
| "completion_length": 61.475, |
| "epoch": 0.23, |
| "grad_norm": 0.00159454345703125, |
| "kl": 0.052927281521260736, |
| "learning_rate": 4.9965738368864345e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1150 |
| }, |
| { |
| "completion_length": 71.8, |
| "epoch": 0.232, |
| "grad_norm": 0.00103759765625, |
| "kl": 0.011898941779509186, |
| "learning_rate": 4.996101910454953e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1160 |
| }, |
| { |
| "completion_length": 56.775, |
| "epoch": 0.234, |
| "grad_norm": 0.006256103515625, |
| "kl": 0.440633371565491, |
| "learning_rate": 4.995599569809414e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 1170 |
| }, |
| { |
| "completion_length": 61.0, |
| "epoch": 0.236, |
| "grad_norm": 0.00174713134765625, |
| "kl": 0.04742448972538114, |
| "learning_rate": 4.9950668210706795e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1180 |
| }, |
| { |
| "completion_length": 61.45, |
| "epoch": 0.238, |
| "grad_norm": 16.625, |
| "kl": 0.04350157366134226, |
| "learning_rate": 4.994503670730126e-06, |
| "loss": 0.0, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.25, |
| "step": 1190 |
| }, |
| { |
| "completion_length": 66.725, |
| "epoch": 0.24, |
| "grad_norm": 0.00063323974609375, |
| "kl": 0.013561246497556568, |
| "learning_rate": 4.993910125649561e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1200 |
| }, |
| { |
| "completion_length": 73.775, |
| "epoch": 0.242, |
| "grad_norm": 0.0018310546875, |
| "kl": 0.008337400993332267, |
| "learning_rate": 4.993286193061145e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1210 |
| }, |
| { |
| "completion_length": 46.075, |
| "epoch": 0.244, |
| "grad_norm": 38.75, |
| "kl": 0.10523022091947495, |
| "learning_rate": 4.992631880567301e-06, |
| "loss": 0.0, |
| "match_ratio": 0.775, |
| "reward": -0.225, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.225, |
| "step": 1220 |
| }, |
| { |
| "completion_length": 44.275, |
| "epoch": 0.246, |
| "grad_norm": 0.0004405975341796875, |
| "kl": 0.289978933124803, |
| "learning_rate": 4.991947196140619e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 1230 |
| }, |
| { |
| "completion_length": 57.0, |
| "epoch": 0.248, |
| "grad_norm": 326.0, |
| "kl": 595.9660109838471, |
| "learning_rate": 4.9912321481237616e-06, |
| "loss": 0.0596, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.175, |
| "step": 1240 |
| }, |
| { |
| "completion_length": 82.45, |
| "epoch": 0.25, |
| "grad_norm": 0.0002574920654296875, |
| "kl": 0.023628250462934375, |
| "learning_rate": 4.990486745229364e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1250 |
| }, |
| { |
| "completion_length": 78.125, |
| "epoch": 0.252, |
| "grad_norm": 0.0023651123046875, |
| "kl": 0.013969364436343312, |
| "learning_rate": 4.989710996539926e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1260 |
| }, |
| { |
| "completion_length": 55.5, |
| "epoch": 0.254, |
| "grad_norm": 0.0003910064697265625, |
| "kl": 2.1650808176025746, |
| "learning_rate": 4.9889049115077e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 1270 |
| }, |
| { |
| "completion_length": 71.45, |
| "epoch": 0.256, |
| "grad_norm": 0.00060272216796875, |
| "kl": 0.02477358910255134, |
| "learning_rate": 4.988068499954578e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1280 |
| }, |
| { |
| "completion_length": 57.5, |
| "epoch": 0.258, |
| "grad_norm": 0.0024261474609375, |
| "kl": 0.028649515146389602, |
| "learning_rate": 4.987201772071971e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1290 |
| }, |
| { |
| "completion_length": 59.425, |
| "epoch": 0.26, |
| "grad_norm": 0.0003948211669921875, |
| "kl": 0.02502227737568319, |
| "learning_rate": 4.986304738420684e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1300 |
| }, |
| { |
| "completion_length": 53.9, |
| "epoch": 0.262, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.028165359469130635, |
| "learning_rate": 4.985377409930789e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1310 |
| }, |
| { |
| "completion_length": 53.075, |
| "epoch": 0.264, |
| "grad_norm": 0.00091552734375, |
| "kl": 30.839417777769267, |
| "learning_rate": 4.984419797901491e-06, |
| "loss": 0.0031, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1320 |
| }, |
| { |
| "completion_length": 71.55, |
| "epoch": 0.266, |
| "grad_norm": 0.00034332275390625, |
| "kl": 0.011334103159606456, |
| "learning_rate": 4.983431914000991e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1330 |
| }, |
| { |
| "completion_length": 47.675, |
| "epoch": 0.268, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.21036937911994755, |
| "learning_rate": 4.9824137702663424e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1340 |
| }, |
| { |
| "completion_length": 59.8, |
| "epoch": 0.27, |
| "grad_norm": 0.000598907470703125, |
| "kl": 0.0196828240994364, |
| "learning_rate": 4.981365379103306e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1350 |
| }, |
| { |
| "completion_length": 57.1, |
| "epoch": 0.272, |
| "grad_norm": 0.000858306884765625, |
| "kl": 0.010027467273175716, |
| "learning_rate": 4.980286753286196e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1360 |
| }, |
| { |
| "completion_length": 64.575, |
| "epoch": 0.274, |
| "grad_norm": 0.00262451171875, |
| "kl": 0.03173879962414503, |
| "learning_rate": 4.979177905957726e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1370 |
| }, |
| { |
| "completion_length": 38.225, |
| "epoch": 0.276, |
| "grad_norm": 0.001953125, |
| "kl": 0.10523775820620358, |
| "learning_rate": 4.978038850628855e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1380 |
| }, |
| { |
| "completion_length": 64.175, |
| "epoch": 0.278, |
| "grad_norm": 0.00165557861328125, |
| "kl": 0.022827543993480505, |
| "learning_rate": 4.9768696011786095e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1390 |
| }, |
| { |
| "completion_length": 51.525, |
| "epoch": 0.28, |
| "grad_norm": 0.00408935546875, |
| "kl": 0.040360532607883214, |
| "learning_rate": 4.975670171853926e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1400 |
| }, |
| { |
| "completion_length": 65.125, |
| "epoch": 0.282, |
| "grad_norm": 0.001129150390625, |
| "kl": 0.010821055877022446, |
| "learning_rate": 4.974440577269473e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1410 |
| }, |
| { |
| "completion_length": 57.675, |
| "epoch": 0.284, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.028958286670967937, |
| "learning_rate": 4.973180832407471e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 1420 |
| }, |
| { |
| "completion_length": 60.3, |
| "epoch": 0.286, |
| "grad_norm": 0.00051116943359375, |
| "kl": 0.015607311762869358, |
| "learning_rate": 4.971890952617515e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1430 |
| }, |
| { |
| "completion_length": 90.0, |
| "epoch": 0.288, |
| "grad_norm": 0.000392913818359375, |
| "kl": 0.007699519535526634, |
| "learning_rate": 4.970570953616383e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1440 |
| }, |
| { |
| "completion_length": 51.625, |
| "epoch": 0.29, |
| "grad_norm": 0.00067138671875, |
| "kl": 0.05114179509691894, |
| "learning_rate": 4.9692208514878445e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 1450 |
| }, |
| { |
| "completion_length": 53.2, |
| "epoch": 0.292, |
| "grad_norm": 0.0004673004150390625, |
| "kl": 0.015444098180159927, |
| "learning_rate": 4.96784066268247e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1460 |
| }, |
| { |
| "completion_length": 51.15, |
| "epoch": 0.294, |
| "grad_norm": 0.0004558563232421875, |
| "kl": 0.028699404350481926, |
| "learning_rate": 4.966430404017424e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1470 |
| }, |
| { |
| "completion_length": 65.125, |
| "epoch": 0.296, |
| "grad_norm": 0.00144195556640625, |
| "kl": 481.04375956221486, |
| "learning_rate": 4.964990092676263e-06, |
| "loss": 0.0481, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1480 |
| }, |
| { |
| "completion_length": 54.275, |
| "epoch": 0.298, |
| "grad_norm": 0.000766754150390625, |
| "kl": 0.056382374046370386, |
| "learning_rate": 4.963519746208726e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 1490 |
| }, |
| { |
| "completion_length": 64.05, |
| "epoch": 0.3, |
| "grad_norm": 0.000438690185546875, |
| "kl": 0.3169886400224641, |
| "learning_rate": 4.962019382530521e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1500 |
| }, |
| { |
| "completion_length": 62.525, |
| "epoch": 0.302, |
| "grad_norm": 0.00079345703125, |
| "kl": 0.022879413142800332, |
| "learning_rate": 4.960489019923105e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1510 |
| }, |
| { |
| "completion_length": 46.15, |
| "epoch": 0.304, |
| "grad_norm": 29.25, |
| "kl": 0.05621479714754969, |
| "learning_rate": 4.958928677033465e-06, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 1520 |
| }, |
| { |
| "completion_length": 55.775, |
| "epoch": 0.306, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.23808469655923545, |
| "learning_rate": 4.957338372873886e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 1530 |
| }, |
| { |
| "completion_length": 54.0, |
| "epoch": 0.308, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.052838593162596224, |
| "learning_rate": 4.9557181268217225e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1540 |
| }, |
| { |
| "completion_length": 59.875, |
| "epoch": 0.31, |
| "grad_norm": 7.0, |
| "kl": 0.020038261311128736, |
| "learning_rate": 4.9540679586191605e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 1550 |
| }, |
| { |
| "completion_length": 48.025, |
| "epoch": 0.312, |
| "grad_norm": 0.0013885498046875, |
| "kl": 0.037048061547102405, |
| "learning_rate": 4.9523878883729794e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1560 |
| }, |
| { |
| "completion_length": 71.175, |
| "epoch": 0.314, |
| "grad_norm": 0.000621795654296875, |
| "kl": 0.011885163560509681, |
| "learning_rate": 4.9506779365543054e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1570 |
| }, |
| { |
| "completion_length": 55.575, |
| "epoch": 0.316, |
| "grad_norm": 0.000339508056640625, |
| "kl": 0.022388620488345622, |
| "learning_rate": 4.94893812399836e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 1580 |
| }, |
| { |
| "completion_length": 59.975, |
| "epoch": 0.318, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 0.009310156595893205, |
| "learning_rate": 4.947168471904213e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1590 |
| }, |
| { |
| "completion_length": 61.125, |
| "epoch": 0.32, |
| "grad_norm": 0.000823974609375, |
| "kl": 0.016036251094192266, |
| "learning_rate": 4.9453690018345144e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1600 |
| }, |
| { |
| "completion_length": 66.625, |
| "epoch": 0.322, |
| "grad_norm": 0.034912109375, |
| "kl": 0.03369634412229061, |
| "learning_rate": 4.9435397357152406e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1610 |
| }, |
| { |
| "completion_length": 58.8, |
| "epoch": 0.324, |
| "grad_norm": 0.0047607421875, |
| "kl": 0.0254040343221277, |
| "learning_rate": 4.9416806958354206e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1620 |
| }, |
| { |
| "completion_length": 46.0, |
| "epoch": 0.326, |
| "grad_norm": 0.0011138916015625, |
| "kl": 0.07715323262382298, |
| "learning_rate": 4.939791904846869e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1630 |
| }, |
| { |
| "completion_length": 46.125, |
| "epoch": 0.328, |
| "grad_norm": 0.00060272216796875, |
| "kl": 0.01129569010809064, |
| "learning_rate": 4.937873385763909e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1640 |
| }, |
| { |
| "completion_length": 66.1, |
| "epoch": 0.33, |
| "grad_norm": 0.0003204345703125, |
| "kl": 0.008372989785857498, |
| "learning_rate": 4.935925161963089e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1650 |
| }, |
| { |
| "completion_length": 56.675, |
| "epoch": 0.332, |
| "grad_norm": 0.0007171630859375, |
| "kl": 0.01157067040912807, |
| "learning_rate": 4.933947257182901e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1660 |
| }, |
| { |
| "completion_length": 44.15, |
| "epoch": 0.334, |
| "grad_norm": 0.00087738037109375, |
| "kl": 0.016125927586108445, |
| "learning_rate": 4.9319396955234925e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1670 |
| }, |
| { |
| "completion_length": 64.3, |
| "epoch": 0.336, |
| "grad_norm": 0.0004673004150390625, |
| "kl": 0.008596798940561711, |
| "learning_rate": 4.9299025014463665e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1680 |
| }, |
| { |
| "completion_length": 57.15, |
| "epoch": 0.338, |
| "grad_norm": 0.000705718994140625, |
| "kl": 0.08002093653194606, |
| "learning_rate": 4.92783569977409e-06, |
| "loss": 0.0, |
| "match_ratio": 0.65, |
| "reward": -0.35, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.35, |
| "step": 1690 |
| }, |
| { |
| "completion_length": 52.65, |
| "epoch": 0.34, |
| "grad_norm": 0.0004558563232421875, |
| "kl": 0.020256173936650156, |
| "learning_rate": 4.925739315689991e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1700 |
| }, |
| { |
| "completion_length": 58.75, |
| "epoch": 0.342, |
| "grad_norm": 0.00150299072265625, |
| "kl": 0.02677068072371185, |
| "learning_rate": 4.923613374737848e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1710 |
| }, |
| { |
| "completion_length": 55.0, |
| "epoch": 0.344, |
| "grad_norm": 0.002227783203125, |
| "kl": 0.03761114357039332, |
| "learning_rate": 4.921457902821578e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 1720 |
| }, |
| { |
| "completion_length": 54.575, |
| "epoch": 0.346, |
| "grad_norm": 23.125, |
| "kl": 0.01751216114498675, |
| "learning_rate": 4.9192729262049285e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1730 |
| }, |
| { |
| "completion_length": 60.45, |
| "epoch": 0.348, |
| "grad_norm": 0.00075531005859375, |
| "kl": 0.011810581240570172, |
| "learning_rate": 4.917058471511149e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1740 |
| }, |
| { |
| "completion_length": 45.65, |
| "epoch": 0.35, |
| "grad_norm": 0.004730224609375, |
| "kl": 0.040744514157995584, |
| "learning_rate": 4.914814565722671e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1750 |
| }, |
| { |
| "completion_length": 59.5, |
| "epoch": 0.352, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.01578701629769057, |
| "learning_rate": 4.912541236180779e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1760 |
| }, |
| { |
| "completion_length": 50.7, |
| "epoch": 0.354, |
| "grad_norm": 0.000629425048828125, |
| "kl": 0.11596511220559477, |
| "learning_rate": 4.910238510585275e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1770 |
| }, |
| { |
| "completion_length": 48.8, |
| "epoch": 0.356, |
| "grad_norm": 46.75, |
| "kl": 0.2281810746062547, |
| "learning_rate": 4.907906416994146e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1780 |
| }, |
| { |
| "completion_length": 57.4, |
| "epoch": 0.358, |
| "grad_norm": 0.00173187255859375, |
| "kl": 0.02588364710099995, |
| "learning_rate": 4.905544983823214e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1790 |
| }, |
| { |
| "completion_length": 39.475, |
| "epoch": 0.36, |
| "grad_norm": 0.00136566162109375, |
| "kl": 0.12392290020361543, |
| "learning_rate": 4.903154239845798e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1800 |
| }, |
| { |
| "completion_length": 57.725, |
| "epoch": 0.362, |
| "grad_norm": 0.0037689208984375, |
| "kl": 0.7656503105536103, |
| "learning_rate": 4.900734214192358e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.25, |
| "step": 1810 |
| }, |
| { |
| "completion_length": 51.025, |
| "epoch": 0.364, |
| "grad_norm": 0.00032806396484375, |
| "kl": 1.4439625646919012, |
| "learning_rate": 4.898284936350144e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1820 |
| }, |
| { |
| "completion_length": 83.85, |
| "epoch": 0.366, |
| "grad_norm": 0.00061798095703125, |
| "kl": 0.013502365676686168, |
| "learning_rate": 4.8958064361628334e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1830 |
| }, |
| { |
| "completion_length": 57.3, |
| "epoch": 0.368, |
| "grad_norm": 0.00040435791015625, |
| "kl": 0.02529239854775369, |
| "learning_rate": 4.893298743830168e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1840 |
| }, |
| { |
| "completion_length": 57.875, |
| "epoch": 0.37, |
| "grad_norm": 0.00067901611328125, |
| "kl": 0.013202862720936537, |
| "learning_rate": 4.890761889907589e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1850 |
| }, |
| { |
| "completion_length": 61.725, |
| "epoch": 0.372, |
| "grad_norm": 0.00250244140625, |
| "kl": 2.53927280055359, |
| "learning_rate": 4.888195905305859e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 1860 |
| }, |
| { |
| "completion_length": 57.6, |
| "epoch": 0.374, |
| "grad_norm": 0.0003833770751953125, |
| "kl": 0.02102891537360847, |
| "learning_rate": 4.885600821290692e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1870 |
| }, |
| { |
| "completion_length": 53.675, |
| "epoch": 0.376, |
| "grad_norm": 0.000583648681640625, |
| "kl": 0.0170896818395704, |
| "learning_rate": 4.882976669482368e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1880 |
| }, |
| { |
| "completion_length": 47.525, |
| "epoch": 0.378, |
| "grad_norm": 0.0037384033203125, |
| "kl": 0.01905378680676222, |
| "learning_rate": 4.880323481855347e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1890 |
| }, |
| { |
| "completion_length": 48.275, |
| "epoch": 0.38, |
| "grad_norm": 0.08203125, |
| "kl": 0.1403908584266901, |
| "learning_rate": 4.8776412907378845e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1900 |
| }, |
| { |
| "completion_length": 73.425, |
| "epoch": 0.382, |
| "grad_norm": 0.000560760498046875, |
| "kl": 0.007492217188701034, |
| "learning_rate": 4.874930128811631e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1910 |
| }, |
| { |
| "completion_length": 67.0, |
| "epoch": 0.384, |
| "grad_norm": 0.00122833251953125, |
| "kl": 0.014595681196078658, |
| "learning_rate": 4.8721900291112415e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1920 |
| }, |
| { |
| "completion_length": 65.125, |
| "epoch": 0.386, |
| "grad_norm": 0.000484466552734375, |
| "kl": 0.01310229734517634, |
| "learning_rate": 4.869421025023965e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 1930 |
| }, |
| { |
| "completion_length": 67.325, |
| "epoch": 0.388, |
| "grad_norm": 0.00048065185546875, |
| "kl": 0.012927077431231736, |
| "learning_rate": 4.866623150289241e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 1940 |
| }, |
| { |
| "completion_length": 53.425, |
| "epoch": 0.39, |
| "grad_norm": 0.001068115234375, |
| "kl": 0.009116059914231301, |
| "learning_rate": 4.863796438998293e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1950 |
| }, |
| { |
| "completion_length": 58.7, |
| "epoch": 0.392, |
| "grad_norm": 0.00176239013671875, |
| "kl": 0.22789982098620384, |
| "learning_rate": 4.860940925593703e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 1960 |
| }, |
| { |
| "completion_length": 76.2, |
| "epoch": 0.394, |
| "grad_norm": 0.00179290771484375, |
| "kl": 0.010708777070976793, |
| "learning_rate": 4.858056644869002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 1970 |
| }, |
| { |
| "completion_length": 53.675, |
| "epoch": 0.396, |
| "grad_norm": 0.0010528564453125, |
| "kl": 0.033514925348572436, |
| "learning_rate": 4.855143631968242e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 1980 |
| }, |
| { |
| "completion_length": 48.75, |
| "epoch": 0.398, |
| "grad_norm": 0.0004596710205078125, |
| "kl": 0.02880375348031521, |
| "learning_rate": 4.852201922385564e-06, |
| "loss": 0.0, |
| "match_ratio": 0.775, |
| "reward": -0.225, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.225, |
| "step": 1990 |
| }, |
| { |
| "completion_length": 56.05, |
| "epoch": 0.4, |
| "grad_norm": 7.82012939453125e-05, |
| "kl": 0.020885943528264762, |
| "learning_rate": 4.849231551964771e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2000 |
| }, |
| { |
| "completion_length": 57.825, |
| "epoch": 0.402, |
| "grad_norm": 0.0007781982421875, |
| "kl": 0.013335178885608912, |
| "learning_rate": 4.84623255689889e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2010 |
| }, |
| { |
| "completion_length": 76.475, |
| "epoch": 0.404, |
| "grad_norm": 0.000461578369140625, |
| "kl": 0.013225622242316604, |
| "learning_rate": 4.84320497372973e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2020 |
| }, |
| { |
| "completion_length": 65.45, |
| "epoch": 0.406, |
| "grad_norm": 0.0016632080078125, |
| "kl": 0.020334804011508823, |
| "learning_rate": 4.840148839347434e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 2030 |
| }, |
| { |
| "completion_length": 87.85, |
| "epoch": 0.408, |
| "grad_norm": 0.000423431396484375, |
| "kl": 0.020761342905461787, |
| "learning_rate": 4.837064190990036e-06, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 2040 |
| }, |
| { |
| "completion_length": 60.15, |
| "epoch": 0.41, |
| "grad_norm": 0.000713348388671875, |
| "kl": 0.01624767268076539, |
| "learning_rate": 4.833951066243004e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2050 |
| }, |
| { |
| "completion_length": 52.075, |
| "epoch": 0.412, |
| "grad_norm": 0.007171630859375, |
| "kl": 0.039952522004023196, |
| "learning_rate": 4.830809503038781e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2060 |
| }, |
| { |
| "completion_length": 64.925, |
| "epoch": 0.414, |
| "grad_norm": 0.00063323974609375, |
| "kl": 0.4387159863486886, |
| "learning_rate": 4.8276395396563215e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2070 |
| }, |
| { |
| "completion_length": 52.35, |
| "epoch": 0.416, |
| "grad_norm": 0.000301361083984375, |
| "kl": 0.018893744330853224, |
| "learning_rate": 4.824441214720629e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2080 |
| }, |
| { |
| "completion_length": 72.125, |
| "epoch": 0.418, |
| "grad_norm": 0.000850677490234375, |
| "kl": 0.014612970128655433, |
| "learning_rate": 4.821214567202284e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2090 |
| }, |
| { |
| "completion_length": 62.525, |
| "epoch": 0.42, |
| "grad_norm": 0.003662109375, |
| "kl": 0.11677258219569922, |
| "learning_rate": 4.817959636416969e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2100 |
| }, |
| { |
| "completion_length": 41.975, |
| "epoch": 0.422, |
| "grad_norm": 0.0006256103515625, |
| "kl": 0.014599576778709888, |
| "learning_rate": 4.814676462024988e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2110 |
| }, |
| { |
| "completion_length": 61.65, |
| "epoch": 0.424, |
| "grad_norm": 0.00052642822265625, |
| "kl": 0.010790122766047716, |
| "learning_rate": 4.811365084030784e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2120 |
| }, |
| { |
| "completion_length": 45.65, |
| "epoch": 0.426, |
| "grad_norm": 0.00095367431640625, |
| "kl": 0.011601420305669307, |
| "learning_rate": 4.808025542782453e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2130 |
| }, |
| { |
| "completion_length": 64.25, |
| "epoch": 0.428, |
| "grad_norm": 0.0003986358642578125, |
| "kl": 0.5641481504775584, |
| "learning_rate": 4.804657878971252e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2140 |
| }, |
| { |
| "completion_length": 56.675, |
| "epoch": 0.43, |
| "grad_norm": 0.000957489013671875, |
| "kl": 0.013445794116705656, |
| "learning_rate": 4.801262133631101e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2150 |
| }, |
| { |
| "completion_length": 75.425, |
| "epoch": 0.432, |
| "grad_norm": 0.00055694580078125, |
| "kl": 0.012692990363575518, |
| "learning_rate": 4.7978383481380865e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2160 |
| }, |
| { |
| "completion_length": 62.375, |
| "epoch": 0.434, |
| "grad_norm": 6.96875, |
| "kl": 0.023554344521835448, |
| "learning_rate": 4.794386564209953e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2170 |
| }, |
| { |
| "completion_length": 65.475, |
| "epoch": 0.436, |
| "grad_norm": 0.0003337860107421875, |
| "kl": 0.06298564318567515, |
| "learning_rate": 4.790906823905599e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 2180 |
| }, |
| { |
| "completion_length": 59.8, |
| "epoch": 0.438, |
| "grad_norm": 0.000537872314453125, |
| "kl": 0.013637619884684682, |
| "learning_rate": 4.787399169624562e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2190 |
| }, |
| { |
| "completion_length": 54.875, |
| "epoch": 0.44, |
| "grad_norm": 0.000759124755859375, |
| "kl": 0.01425664583221078, |
| "learning_rate": 4.783863644106502e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2200 |
| }, |
| { |
| "completion_length": 50.1, |
| "epoch": 0.442, |
| "grad_norm": 0.001800537109375, |
| "kl": 0.0958622452802956, |
| "learning_rate": 4.780300290430683e-06, |
| "loss": 0.0, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 2210 |
| }, |
| { |
| "completion_length": 66.8, |
| "epoch": 0.444, |
| "grad_norm": 0.00043487548828125, |
| "kl": 0.00994320074096322, |
| "learning_rate": 4.776709152015443e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2220 |
| }, |
| { |
| "completion_length": 73.9, |
| "epoch": 0.446, |
| "grad_norm": 0.000705718994140625, |
| "kl": 0.016998659167438746, |
| "learning_rate": 4.773090272617672e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2230 |
| }, |
| { |
| "completion_length": 65.45, |
| "epoch": 0.448, |
| "grad_norm": 0.00054931640625, |
| "kl": 0.015969987539574505, |
| "learning_rate": 4.769443696332272e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2240 |
| }, |
| { |
| "completion_length": 58.0, |
| "epoch": 0.45, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.05210723381023854, |
| "learning_rate": 4.765769467591626e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 2250 |
| }, |
| { |
| "completion_length": 55.35, |
| "epoch": 0.452, |
| "grad_norm": 0.0002841949462890625, |
| "kl": 0.2783783482853323, |
| "learning_rate": 4.762067631165049e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 2260 |
| }, |
| { |
| "completion_length": 60.1, |
| "epoch": 0.454, |
| "grad_norm": 0.00119781494140625, |
| "kl": 0.05332662384025753, |
| "learning_rate": 4.7583382321582525e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2270 |
| }, |
| { |
| "completion_length": 59.625, |
| "epoch": 0.456, |
| "grad_norm": 0.0014495849609375, |
| "kl": 0.015380131197161973, |
| "learning_rate": 4.754581316012785e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2280 |
| }, |
| { |
| "completion_length": 56.1, |
| "epoch": 0.458, |
| "grad_norm": 0.000885009765625, |
| "kl": 0.04605462467297912, |
| "learning_rate": 4.750796928505484e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2290 |
| }, |
| { |
| "completion_length": 76.35, |
| "epoch": 0.46, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.053115089796483515, |
| "learning_rate": 4.746985115747918e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2300 |
| }, |
| { |
| "completion_length": 50.8, |
| "epoch": 0.462, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.01561843790113926, |
| "learning_rate": 4.743145924185821e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2310 |
| }, |
| { |
| "completion_length": 65.575, |
| "epoch": 0.464, |
| "grad_norm": 0.00061798095703125, |
| "kl": 0.01594538043718785, |
| "learning_rate": 4.7392794005985324e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2320 |
| }, |
| { |
| "completion_length": 44.15, |
| "epoch": 0.466, |
| "grad_norm": 0.0005340576171875, |
| "kl": 12.877768159005791, |
| "learning_rate": 4.735385592098421e-06, |
| "loss": 0.0013, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 2330 |
| }, |
| { |
| "completion_length": 51.45, |
| "epoch": 0.468, |
| "grad_norm": 0.00055694580078125, |
| "kl": 0.019627093384042383, |
| "learning_rate": 4.731464546130315e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2340 |
| }, |
| { |
| "completion_length": 67.975, |
| "epoch": 0.47, |
| "grad_norm": 0.0024261474609375, |
| "kl": 0.018453579442575575, |
| "learning_rate": 4.72751631047092e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 2350 |
| }, |
| { |
| "completion_length": 73.7, |
| "epoch": 0.472, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.011441022157669067, |
| "learning_rate": 4.723540933228245e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2360 |
| }, |
| { |
| "completion_length": 68.525, |
| "epoch": 0.474, |
| "grad_norm": 0.000537872314453125, |
| "kl": 0.010118643706664442, |
| "learning_rate": 4.719538462841003e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2370 |
| }, |
| { |
| "completion_length": 42.325, |
| "epoch": 0.476, |
| "grad_norm": 0.00110626220703125, |
| "kl": 23.094405939802527, |
| "learning_rate": 4.715508948078037e-06, |
| "loss": 0.0023, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 2380 |
| }, |
| { |
| "completion_length": 51.85, |
| "epoch": 0.478, |
| "grad_norm": 0.0002574920654296875, |
| "kl": 0.01785165797919035, |
| "learning_rate": 4.71145243803771e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2390 |
| }, |
| { |
| "completion_length": 76.575, |
| "epoch": 0.48, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.02041715644299984, |
| "learning_rate": 4.707368982147318e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2400 |
| }, |
| { |
| "completion_length": 53.65, |
| "epoch": 0.482, |
| "grad_norm": 0.0007171630859375, |
| "kl": 0.011072598048485816, |
| "learning_rate": 4.703258630162481e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2410 |
| }, |
| { |
| "completion_length": 45.65, |
| "epoch": 0.484, |
| "grad_norm": 0.0010986328125, |
| "kl": 1306590.1205121286, |
| "learning_rate": 4.699121432166542e-06, |
| "loss": 130.659, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2420 |
| }, |
| { |
| "completion_length": 66.625, |
| "epoch": 0.486, |
| "grad_norm": 0.0004425048828125, |
| "kl": 0.018537986697629093, |
| "learning_rate": 4.6949574385699514e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2430 |
| }, |
| { |
| "completion_length": 50.825, |
| "epoch": 0.488, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.027628638222813605, |
| "learning_rate": 4.690766700109659e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2440 |
| }, |
| { |
| "completion_length": 72.275, |
| "epoch": 0.49, |
| "grad_norm": 0.0006103515625, |
| "kl": 0.012821279000490904, |
| "learning_rate": 4.68654926784849e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2450 |
| }, |
| { |
| "completion_length": 59.025, |
| "epoch": 0.492, |
| "grad_norm": 0.0006256103515625, |
| "kl": 1.4881786234676837, |
| "learning_rate": 4.682305193174524e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 2460 |
| }, |
| { |
| "completion_length": 70.65, |
| "epoch": 0.494, |
| "grad_norm": 32.25, |
| "kl": 0.13413287354633213, |
| "learning_rate": 4.6780345278004744e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2470 |
| }, |
| { |
| "completion_length": 60.875, |
| "epoch": 0.496, |
| "grad_norm": 0.00067138671875, |
| "kl": 0.022409677878022193, |
| "learning_rate": 4.673737323763048e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2480 |
| }, |
| { |
| "completion_length": 57.875, |
| "epoch": 0.498, |
| "grad_norm": 18.75, |
| "kl": 0.49180023511871696, |
| "learning_rate": 4.669413633422322e-06, |
| "loss": 0.0, |
| "match_ratio": 0.775, |
| "reward": -0.225, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.225, |
| "step": 2490 |
| }, |
| { |
| "completion_length": 63.95, |
| "epoch": 0.5, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.0207068151794374, |
| "learning_rate": 4.665063509461098e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2500 |
| }, |
| { |
| "completion_length": 70.75, |
| "epoch": 0.502, |
| "grad_norm": 0.0004634857177734375, |
| "kl": 0.01583680328913033, |
| "learning_rate": 4.6606870048842626e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2510 |
| }, |
| { |
| "completion_length": 41.6, |
| "epoch": 0.504, |
| "grad_norm": 0.001251220703125, |
| "kl": 0.024196008208673448, |
| "learning_rate": 4.656284173018144e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2520 |
| }, |
| { |
| "completion_length": 60.475, |
| "epoch": 0.506, |
| "grad_norm": 0.0008697509765625, |
| "kl": 0.02790404809638858, |
| "learning_rate": 4.65185506750986e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2530 |
| }, |
| { |
| "completion_length": 63.1, |
| "epoch": 0.508, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.037049750238656996, |
| "learning_rate": 4.6473997423266615e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2540 |
| }, |
| { |
| "completion_length": 64.025, |
| "epoch": 0.51, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.05311856884509325, |
| "learning_rate": 4.642918251755281e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2550 |
| }, |
| { |
| "completion_length": 48.925, |
| "epoch": 0.512, |
| "grad_norm": 0.00482177734375, |
| "kl": 0.07079303860664368, |
| "learning_rate": 4.638410650401267e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2560 |
| }, |
| { |
| "completion_length": 61.8, |
| "epoch": 0.514, |
| "grad_norm": 0.0010986328125, |
| "kl": 0.024926586542278528, |
| "learning_rate": 4.633876993188319e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2570 |
| }, |
| { |
| "completion_length": 49.6, |
| "epoch": 0.516, |
| "grad_norm": 0.000766754150390625, |
| "kl": 0.019125528051517904, |
| "learning_rate": 4.62931733535762e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2580 |
| }, |
| { |
| "completion_length": 66.35, |
| "epoch": 0.518, |
| "grad_norm": 0.00159454345703125, |
| "kl": 0.020169223845005035, |
| "learning_rate": 4.62473173246716e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2590 |
| }, |
| { |
| "completion_length": 55.7, |
| "epoch": 0.52, |
| "grad_norm": 0.00067138671875, |
| "kl": 0.24018120649270713, |
| "learning_rate": 4.620120240391065e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2600 |
| }, |
| { |
| "completion_length": 57.725, |
| "epoch": 0.522, |
| "grad_norm": 0.00087738037109375, |
| "kl": 2.7745140019804237, |
| "learning_rate": 4.6154829153189105e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 2610 |
| }, |
| { |
| "completion_length": 48.7, |
| "epoch": 0.524, |
| "grad_norm": 0.000659942626953125, |
| "kl": 0.023752238228917123, |
| "learning_rate": 4.610819813755038e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2620 |
| }, |
| { |
| "completion_length": 57.225, |
| "epoch": 0.526, |
| "grad_norm": 0.00150299072265625, |
| "kl": 0.02745365663431585, |
| "learning_rate": 4.60613099251787e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2630 |
| }, |
| { |
| "completion_length": 48.5, |
| "epoch": 0.528, |
| "grad_norm": 0.00133514404296875, |
| "kl": 0.021794071048498155, |
| "learning_rate": 4.601416508739211e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2640 |
| }, |
| { |
| "completion_length": 61.7, |
| "epoch": 0.53, |
| "grad_norm": 0.000823974609375, |
| "kl": 0.01592640457674861, |
| "learning_rate": 4.596676419863561e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2650 |
| }, |
| { |
| "completion_length": 55.825, |
| "epoch": 0.532, |
| "grad_norm": 0.000431060791015625, |
| "kl": 4393.635703391675, |
| "learning_rate": 4.591910783647405e-06, |
| "loss": 0.4394, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2660 |
| }, |
| { |
| "completion_length": 42.725, |
| "epoch": 0.534, |
| "grad_norm": 0.0005340576171875, |
| "kl": 3.308469070494175, |
| "learning_rate": 4.587119658158517e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2670 |
| }, |
| { |
| "completion_length": 53.675, |
| "epoch": 0.536, |
| "grad_norm": 0.000362396240234375, |
| "kl": 0.017304986575618388, |
| "learning_rate": 4.582303101775249e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2680 |
| }, |
| { |
| "completion_length": 46.825, |
| "epoch": 0.538, |
| "grad_norm": 0.000438690185546875, |
| "kl": 1.4977983684279024, |
| "learning_rate": 4.577461173185821e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2690 |
| }, |
| { |
| "completion_length": 49.425, |
| "epoch": 0.54, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.04018927337601781, |
| "learning_rate": 4.572593931387604e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2700 |
| }, |
| { |
| "completion_length": 61.475, |
| "epoch": 0.542, |
| "grad_norm": 0.00025177001953125, |
| "kl": 0.029250907758250833, |
| "learning_rate": 4.567701435686405e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2710 |
| }, |
| { |
| "completion_length": 61.3, |
| "epoch": 0.544, |
| "grad_norm": 0.00107574462890625, |
| "kl": 0.041718969354406, |
| "learning_rate": 4.562783745695738e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 2720 |
| }, |
| { |
| "completion_length": 71.225, |
| "epoch": 0.546, |
| "grad_norm": 0.00101470947265625, |
| "kl": 0.017608029022812843, |
| "learning_rate": 4.5578409213361055e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2730 |
| }, |
| { |
| "completion_length": 68.825, |
| "epoch": 0.548, |
| "grad_norm": 0.00084686279296875, |
| "kl": 0.01787120271474123, |
| "learning_rate": 4.55287302283426e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2740 |
| }, |
| { |
| "completion_length": 40.975, |
| "epoch": 0.55, |
| "grad_norm": 0.0004405975341796875, |
| "kl": 0.08943550041876733, |
| "learning_rate": 4.54788011072248e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2750 |
| }, |
| { |
| "completion_length": 60.275, |
| "epoch": 0.552, |
| "grad_norm": 0.000888824462890625, |
| "kl": 0.012318810448050499, |
| "learning_rate": 4.542862245837821e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2760 |
| }, |
| { |
| "completion_length": 50.225, |
| "epoch": 0.554, |
| "grad_norm": 0.000396728515625, |
| "kl": 0.1667893348261714, |
| "learning_rate": 4.537819489321385e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2770 |
| }, |
| { |
| "completion_length": 71.5, |
| "epoch": 0.556, |
| "grad_norm": 0.000873565673828125, |
| "kl": 0.009898501250427216, |
| "learning_rate": 4.5327519026175694e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2780 |
| }, |
| { |
| "completion_length": 66.525, |
| "epoch": 0.558, |
| "grad_norm": 0.0013275146484375, |
| "kl": 0.016875687218271197, |
| "learning_rate": 4.527659547473317e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2790 |
| }, |
| { |
| "completion_length": 54.8, |
| "epoch": 0.56, |
| "grad_norm": 0.000308990478515625, |
| "kl": 0.40478452597744763, |
| "learning_rate": 4.522542485937369e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 2800 |
| }, |
| { |
| "completion_length": 63.175, |
| "epoch": 0.562, |
| "grad_norm": 0.002197265625, |
| "kl": 0.01588670499622822, |
| "learning_rate": 4.517400780359505e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2810 |
| }, |
| { |
| "completion_length": 66.025, |
| "epoch": 0.564, |
| "grad_norm": 0.0015411376953125, |
| "kl": 0.9499945601448416, |
| "learning_rate": 4.512234493389785e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 2820 |
| }, |
| { |
| "completion_length": 52.925, |
| "epoch": 0.566, |
| "grad_norm": 0.000408172607421875, |
| "kl": 0.7477384469937534, |
| "learning_rate": 4.507043687977787e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 2830 |
| }, |
| { |
| "completion_length": 54.05, |
| "epoch": 0.568, |
| "grad_norm": 0.00145721435546875, |
| "kl": 0.02171561080031097, |
| "learning_rate": 4.501828427371834e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2840 |
| }, |
| { |
| "completion_length": 67.425, |
| "epoch": 0.57, |
| "grad_norm": 0.00038909912109375, |
| "kl": 46.97684473299887, |
| "learning_rate": 4.496588775118232e-06, |
| "loss": 0.0047, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 2850 |
| }, |
| { |
| "completion_length": 49.9, |
| "epoch": 0.572, |
| "grad_norm": 0.000713348388671875, |
| "kl": 0.04041039999574423, |
| "learning_rate": 4.491324795060491e-06, |
| "loss": 0.0, |
| "match_ratio": 0.7, |
| "reward": -0.3, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.3, |
| "step": 2860 |
| }, |
| { |
| "completion_length": 46.8, |
| "epoch": 0.574, |
| "grad_norm": 0.0003948211669921875, |
| "kl": 0.013312188815325499, |
| "learning_rate": 4.4860365513385456e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2870 |
| }, |
| { |
| "completion_length": 46.825, |
| "epoch": 0.576, |
| "grad_norm": 0.0003490447998046875, |
| "kl": 0.4708886262029409, |
| "learning_rate": 4.4807241083879774e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 2880 |
| }, |
| { |
| "completion_length": 65.625, |
| "epoch": 0.578, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.0194290304556489, |
| "learning_rate": 4.475387530939226e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2890 |
| }, |
| { |
| "completion_length": 56.075, |
| "epoch": 0.58, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.04038618067279458, |
| "learning_rate": 4.470026884016805e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 2900 |
| }, |
| { |
| "completion_length": 47.725, |
| "epoch": 0.582, |
| "grad_norm": 0.00043487548828125, |
| "kl": 16.41964945977088, |
| "learning_rate": 4.464642232938505e-06, |
| "loss": 0.0016, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 2910 |
| }, |
| { |
| "completion_length": 63.9, |
| "epoch": 0.584, |
| "grad_norm": 0.0013885498046875, |
| "kl": 0.01390684423968196, |
| "learning_rate": 4.4592336433146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2920 |
| }, |
| { |
| "completion_length": 44.3, |
| "epoch": 0.586, |
| "grad_norm": 0.00023651123046875, |
| "kl": 0.010492815752513707, |
| "learning_rate": 4.453801181047047e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2930 |
| }, |
| { |
| "completion_length": 72.45, |
| "epoch": 0.588, |
| "grad_norm": 0.00032806396484375, |
| "kl": 0.01113151153549552, |
| "learning_rate": 4.448344912328686e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2940 |
| }, |
| { |
| "completion_length": 61.05, |
| "epoch": 0.59, |
| "grad_norm": 0.0004787445068359375, |
| "kl": 0.008429582207463681, |
| "learning_rate": 4.442864903642428e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2950 |
| }, |
| { |
| "completion_length": 51.9, |
| "epoch": 0.592, |
| "grad_norm": 0.00080108642578125, |
| "kl": 0.0231597448233515, |
| "learning_rate": 4.437361221760449e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2960 |
| }, |
| { |
| "completion_length": 47.825, |
| "epoch": 0.594, |
| "grad_norm": 0.0002899169921875, |
| "kl": 0.03416364281438291, |
| "learning_rate": 4.431833933743378e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 2970 |
| }, |
| { |
| "completion_length": 54.475, |
| "epoch": 0.596, |
| "grad_norm": 29.75, |
| "kl": 25.77551784273237, |
| "learning_rate": 4.426283106939474e-06, |
| "loss": 0.0026, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 2980 |
| }, |
| { |
| "completion_length": 61.175, |
| "epoch": 0.598, |
| "grad_norm": 0.00060272216796875, |
| "kl": 0.010949767334386707, |
| "learning_rate": 4.420708808983809e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 2990 |
| }, |
| { |
| "completion_length": 58.05, |
| "epoch": 0.6, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.03518106024712324, |
| "learning_rate": 4.415111107797445e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3000 |
| }, |
| { |
| "completion_length": 48.7, |
| "epoch": 0.602, |
| "grad_norm": 0.0006103515625, |
| "kl": 0.015348105784505605, |
| "learning_rate": 4.409490071586606e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3010 |
| }, |
| { |
| "completion_length": 59.9, |
| "epoch": 0.604, |
| "grad_norm": 0.0004119873046875, |
| "kl": 0.011616118438541888, |
| "learning_rate": 4.403845768841842e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3020 |
| }, |
| { |
| "completion_length": 69.0, |
| "epoch": 0.606, |
| "grad_norm": 0.000640869140625, |
| "kl": 0.011708037834614516, |
| "learning_rate": 4.398178268337202e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3030 |
| }, |
| { |
| "completion_length": 52.125, |
| "epoch": 0.608, |
| "grad_norm": 0.000896453857421875, |
| "kl": 0.04663766893791035, |
| "learning_rate": 4.3924876391293915e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3040 |
| }, |
| { |
| "completion_length": 54.175, |
| "epoch": 0.61, |
| "grad_norm": 0.000701904296875, |
| "kl": 9.442724062688649, |
| "learning_rate": 4.386773950556931e-06, |
| "loss": 0.0009, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 3050 |
| }, |
| { |
| "completion_length": 51.075, |
| "epoch": 0.612, |
| "grad_norm": 25.75, |
| "kl": 18.313499209098516, |
| "learning_rate": 4.381037272239311e-06, |
| "loss": 0.0018, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 3060 |
| }, |
| { |
| "completion_length": 59.45, |
| "epoch": 0.614, |
| "grad_norm": 0.00145721435546875, |
| "kl": 0.05719580026343465, |
| "learning_rate": 4.3752776740761495e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 3070 |
| }, |
| { |
| "completion_length": 59.75, |
| "epoch": 0.616, |
| "grad_norm": 0.000865936279296875, |
| "kl": 955.0316817238461, |
| "learning_rate": 4.36949522624633e-06, |
| "loss": 0.0955, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.15, |
| "step": 3080 |
| }, |
| { |
| "completion_length": 49.25, |
| "epoch": 0.618, |
| "grad_norm": 0.0947265625, |
| "kl": 0.12424529809504747, |
| "learning_rate": 4.3636899992071555e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 3090 |
| }, |
| { |
| "completion_length": 63.45, |
| "epoch": 0.62, |
| "grad_norm": 0.000774383544921875, |
| "kl": 0.021155705489218236, |
| "learning_rate": 4.357862063693486e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3100 |
| }, |
| { |
| "completion_length": 58.675, |
| "epoch": 0.622, |
| "grad_norm": 0.00130462646484375, |
| "kl": 0.03269129507243633, |
| "learning_rate": 4.352011490716875e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3110 |
| }, |
| { |
| "completion_length": 51.25, |
| "epoch": 0.624, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.01852965746074915, |
| "learning_rate": 4.346138351564711e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3120 |
| }, |
| { |
| "completion_length": 42.025, |
| "epoch": 0.626, |
| "grad_norm": 0.0014495849609375, |
| "kl": 0.13662478388287128, |
| "learning_rate": 4.340242717799337e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 3130 |
| }, |
| { |
| "completion_length": 55.225, |
| "epoch": 0.628, |
| "grad_norm": 0.0002536773681640625, |
| "kl": 0.011004617274738848, |
| "learning_rate": 4.334324661257191e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3140 |
| }, |
| { |
| "completion_length": 36.075, |
| "epoch": 0.63, |
| "grad_norm": 0.000701904296875, |
| "kl": 0.03598860376514494, |
| "learning_rate": 4.328384254047927e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3150 |
| }, |
| { |
| "completion_length": 70.7, |
| "epoch": 0.632, |
| "grad_norm": 0.0004825592041015625, |
| "kl": 0.014078293647617101, |
| "learning_rate": 4.322421568553529e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3160 |
| }, |
| { |
| "completion_length": 41.4, |
| "epoch": 0.634, |
| "grad_norm": 0.00093841552734375, |
| "kl": 0.02731174589134753, |
| "learning_rate": 4.316436677427441e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3170 |
| }, |
| { |
| "completion_length": 62.625, |
| "epoch": 0.636, |
| "grad_norm": 0.00035858154296875, |
| "kl": 0.015678783506155015, |
| "learning_rate": 4.3104296535936695e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3180 |
| }, |
| { |
| "completion_length": 61.0, |
| "epoch": 0.638, |
| "grad_norm": 0.00091552734375, |
| "kl": 0.019854954723268748, |
| "learning_rate": 4.3044005702459055e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3190 |
| }, |
| { |
| "completion_length": 52.95, |
| "epoch": 0.64, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.13356436253525317, |
| "learning_rate": 4.2983495008466285e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 3200 |
| }, |
| { |
| "completion_length": 66.475, |
| "epoch": 0.642, |
| "grad_norm": 0.000316619873046875, |
| "kl": 0.013267815671861171, |
| "learning_rate": 4.2922765191262075e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3210 |
| }, |
| { |
| "completion_length": 66.475, |
| "epoch": 0.644, |
| "grad_norm": 0.000701904296875, |
| "kl": 0.07887385552749038, |
| "learning_rate": 4.286181699082008e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 3220 |
| }, |
| { |
| "completion_length": 61.55, |
| "epoch": 0.646, |
| "grad_norm": 0.000118255615234375, |
| "kl": 0.011934885568916798, |
| "learning_rate": 4.280065114977492e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3230 |
| }, |
| { |
| "completion_length": 61.625, |
| "epoch": 0.648, |
| "grad_norm": 0.0004558563232421875, |
| "kl": 0.015098626213148236, |
| "learning_rate": 4.273926841341303e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3240 |
| }, |
| { |
| "completion_length": 47.7, |
| "epoch": 0.65, |
| "grad_norm": 0.00048065185546875, |
| "kl": 0.017614057380706073, |
| "learning_rate": 4.267766952966369e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3250 |
| }, |
| { |
| "completion_length": 47.35, |
| "epoch": 0.652, |
| "grad_norm": 0.000911712646484375, |
| "kl": 19.53749562408775, |
| "learning_rate": 4.261585524908987e-06, |
| "loss": 0.002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 3260 |
| }, |
| { |
| "completion_length": 67.85, |
| "epoch": 0.654, |
| "grad_norm": 0.0004367828369140625, |
| "kl": 0.011493841698393226, |
| "learning_rate": 4.255382632487907e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3270 |
| }, |
| { |
| "completion_length": 43.525, |
| "epoch": 0.656, |
| "grad_norm": 0.0004062652587890625, |
| "kl": 0.06476088264025748, |
| "learning_rate": 4.249158351283414e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 3280 |
| }, |
| { |
| "completion_length": 50.575, |
| "epoch": 0.658, |
| "grad_norm": 0.000812530517578125, |
| "kl": 0.044670914835296574, |
| "learning_rate": 4.242912757136412e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3290 |
| }, |
| { |
| "completion_length": 48.95, |
| "epoch": 0.66, |
| "grad_norm": 0.054931640625, |
| "kl": 0.05593093540519476, |
| "learning_rate": 4.236645926147493e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3300 |
| }, |
| { |
| "completion_length": 55.425, |
| "epoch": 0.662, |
| "grad_norm": 0.0004024505615234375, |
| "kl": 0.02145648035220802, |
| "learning_rate": 4.230357934676017e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3310 |
| }, |
| { |
| "completion_length": 62.2, |
| "epoch": 0.664, |
| "grad_norm": 0.000499725341796875, |
| "kl": 0.012483126670122146, |
| "learning_rate": 4.224048859339175e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3320 |
| }, |
| { |
| "completion_length": 61.9, |
| "epoch": 0.666, |
| "grad_norm": 0.0002689361572265625, |
| "kl": 0.03277284097857773, |
| "learning_rate": 4.217718777011058e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3330 |
| }, |
| { |
| "completion_length": 70.675, |
| "epoch": 0.668, |
| "grad_norm": 0.00103759765625, |
| "kl": 0.32424843702465295, |
| "learning_rate": 4.211367764821722e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 3340 |
| }, |
| { |
| "completion_length": 41.375, |
| "epoch": 0.67, |
| "grad_norm": 0.0004367828369140625, |
| "kl": 0.05056889692787081, |
| "learning_rate": 4.204995900156247e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 3350 |
| }, |
| { |
| "completion_length": 59.675, |
| "epoch": 0.672, |
| "grad_norm": 0.00106048583984375, |
| "kl": 0.04967752741649747, |
| "learning_rate": 4.198603260653792e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3360 |
| }, |
| { |
| "completion_length": 53.425, |
| "epoch": 0.674, |
| "grad_norm": 0.0003681182861328125, |
| "kl": 0.01580333085730672, |
| "learning_rate": 4.192189924206652e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3370 |
| }, |
| { |
| "completion_length": 50.75, |
| "epoch": 0.676, |
| "grad_norm": 0.002899169921875, |
| "kl": 1.8218023491092026, |
| "learning_rate": 4.185755968959308e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 3380 |
| }, |
| { |
| "completion_length": 62.375, |
| "epoch": 0.678, |
| "grad_norm": 0.000156402587890625, |
| "kl": 0.047494524717330934, |
| "learning_rate": 4.179301473307476e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3390 |
| }, |
| { |
| "completion_length": 69.3, |
| "epoch": 0.68, |
| "grad_norm": 0.0002651214599609375, |
| "kl": 0.014598681312054395, |
| "learning_rate": 4.172826515897146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3400 |
| }, |
| { |
| "completion_length": 63.575, |
| "epoch": 0.682, |
| "grad_norm": 0.000423431396484375, |
| "kl": 0.04025569665245712, |
| "learning_rate": 4.166331175623631e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3410 |
| }, |
| { |
| "completion_length": 48.05, |
| "epoch": 0.684, |
| "grad_norm": 0.0006561279296875, |
| "kl": 913.4162682918599, |
| "learning_rate": 4.159815531630604e-06, |
| "loss": 0.0913, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 3420 |
| }, |
| { |
| "completion_length": 52.575, |
| "epoch": 0.686, |
| "grad_norm": 0.0006561279296875, |
| "kl": 0.0610341252759099, |
| "learning_rate": 4.15327966330913e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3430 |
| }, |
| { |
| "completion_length": 57.6, |
| "epoch": 0.688, |
| "grad_norm": 0.0024871826171875, |
| "kl": 0.9020142253488302, |
| "learning_rate": 4.146723650296701e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.85, |
| "reward": -0.15, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.15, |
| "step": 3440 |
| }, |
| { |
| "completion_length": 56.25, |
| "epoch": 0.69, |
| "grad_norm": 0.0045166015625, |
| "kl": 0.14730083039030434, |
| "learning_rate": 4.140147572476269e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3450 |
| }, |
| { |
| "completion_length": 72.55, |
| "epoch": 0.692, |
| "grad_norm": 0.0004024505615234375, |
| "kl": 0.05645229946821928, |
| "learning_rate": 4.133551509975264e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3460 |
| }, |
| { |
| "completion_length": 62.0, |
| "epoch": 0.694, |
| "grad_norm": 0.00063323974609375, |
| "kl": 0.018445250298827886, |
| "learning_rate": 4.126935543164628e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3470 |
| }, |
| { |
| "completion_length": 54.125, |
| "epoch": 0.696, |
| "grad_norm": 0.004608154296875, |
| "kl": 0.03874910874292255, |
| "learning_rate": 4.120299752657828e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 3480 |
| }, |
| { |
| "completion_length": 35.4, |
| "epoch": 0.698, |
| "grad_norm": 0.020751953125, |
| "kl": 0.07965331296436488, |
| "learning_rate": 4.113644219309877e-06, |
| "loss": 0.0, |
| "match_ratio": 0.675, |
| "reward": -0.325, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.325, |
| "step": 3490 |
| }, |
| { |
| "completion_length": 61.6, |
| "epoch": 0.7, |
| "grad_norm": 0.00079345703125, |
| "kl": 0.02872077892534435, |
| "learning_rate": 4.106969024216348e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3500 |
| }, |
| { |
| "completion_length": 57.175, |
| "epoch": 0.702, |
| "grad_norm": 0.00157928466796875, |
| "kl": 0.017641184292733668, |
| "learning_rate": 4.1002742487123896e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3510 |
| }, |
| { |
| "completion_length": 62.6, |
| "epoch": 0.704, |
| "grad_norm": 0.0002651214599609375, |
| "kl": 0.020063164038583638, |
| "learning_rate": 4.093559974371725e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3520 |
| }, |
| { |
| "completion_length": 51.125, |
| "epoch": 0.706, |
| "grad_norm": 0.000576019287109375, |
| "kl": 0.6725238669663668, |
| "learning_rate": 4.086826283005669e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 3530 |
| }, |
| { |
| "completion_length": 56.625, |
| "epoch": 0.708, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.01952581750229001, |
| "learning_rate": 4.080073256662128e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3540 |
| }, |
| { |
| "completion_length": 62.7, |
| "epoch": 0.71, |
| "grad_norm": 0.0035247802734375, |
| "kl": 0.07669782191514969, |
| "learning_rate": 4.073300977624594e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 3550 |
| }, |
| { |
| "completion_length": 68.325, |
| "epoch": 0.712, |
| "grad_norm": 0.0014801025390625, |
| "kl": 0.026417199242860078, |
| "learning_rate": 4.066509528411151e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3560 |
| }, |
| { |
| "completion_length": 68.925, |
| "epoch": 0.714, |
| "grad_norm": 0.0003643035888671875, |
| "kl": 0.01686573908664286, |
| "learning_rate": 4.059698991773466e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3570 |
| }, |
| { |
| "completion_length": 65.325, |
| "epoch": 0.716, |
| "grad_norm": 0.0003261566162109375, |
| "kl": 0.009878239961108193, |
| "learning_rate": 4.052869450695776e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3580 |
| }, |
| { |
| "completion_length": 54.125, |
| "epoch": 0.718, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.017347801569849254, |
| "learning_rate": 4.046020988393886e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3590 |
| }, |
| { |
| "completion_length": 62.45, |
| "epoch": 0.72, |
| "grad_norm": 0.000362396240234375, |
| "kl": 0.010593670699745417, |
| "learning_rate": 4.039153688314146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3600 |
| }, |
| { |
| "completion_length": 56.225, |
| "epoch": 0.722, |
| "grad_norm": 0.0007476806640625, |
| "kl": 20.069431526213883, |
| "learning_rate": 4.032267634132442e-06, |
| "loss": 0.002, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 3610 |
| }, |
| { |
| "completion_length": 61.925, |
| "epoch": 0.724, |
| "grad_norm": 0.0003643035888671875, |
| "kl": 0.023581979051232337, |
| "learning_rate": 4.02536290975317e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3620 |
| }, |
| { |
| "completion_length": 71.05, |
| "epoch": 0.726, |
| "grad_norm": 0.0006561279296875, |
| "kl": 0.019050255604088306, |
| "learning_rate": 4.018439599308217e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3630 |
| }, |
| { |
| "completion_length": 63.85, |
| "epoch": 0.728, |
| "grad_norm": 0.0002498626708984375, |
| "kl": 0.023293742351233958, |
| "learning_rate": 4.011497787155938e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3640 |
| }, |
| { |
| "completion_length": 72.65, |
| "epoch": 0.73, |
| "grad_norm": 0.000347137451171875, |
| "kl": 0.011120679695159197, |
| "learning_rate": 4.0045375578801216e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3650 |
| }, |
| { |
| "completion_length": 62.95, |
| "epoch": 0.732, |
| "grad_norm": 0.00042724609375, |
| "kl": 0.017660227511078118, |
| "learning_rate": 3.997558996288965e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3660 |
| }, |
| { |
| "completion_length": 56.05, |
| "epoch": 0.734, |
| "grad_norm": 0.000881195068359375, |
| "kl": 0.35850770082324745, |
| "learning_rate": 3.9905621874140396e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3670 |
| }, |
| { |
| "completion_length": 50.5, |
| "epoch": 0.736, |
| "grad_norm": 0.0004291534423828125, |
| "kl": 0.022859503608196975, |
| "learning_rate": 3.983547216509254e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3680 |
| }, |
| { |
| "completion_length": 65.675, |
| "epoch": 0.738, |
| "grad_norm": 0.0003795623779296875, |
| "kl": 0.01089323298074305, |
| "learning_rate": 3.976514169049814e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3690 |
| }, |
| { |
| "completion_length": 55.825, |
| "epoch": 0.74, |
| "grad_norm": 0.0002765655517578125, |
| "kl": 0.026947349560214207, |
| "learning_rate": 3.969463130731183e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3700 |
| }, |
| { |
| "completion_length": 51.9, |
| "epoch": 0.742, |
| "grad_norm": 0.000720977783203125, |
| "kl": 101.020502169244, |
| "learning_rate": 3.96239418746804e-06, |
| "loss": 0.0101, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3710 |
| }, |
| { |
| "completion_length": 41.925, |
| "epoch": 0.744, |
| "grad_norm": 0.0004367828369140625, |
| "kl": 1.0482193630887195, |
| "learning_rate": 3.955307425393224e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3720 |
| }, |
| { |
| "completion_length": 43.075, |
| "epoch": 0.746, |
| "grad_norm": 0.0010986328125, |
| "kl": 0.011403680918738246, |
| "learning_rate": 3.948202930856697e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3730 |
| }, |
| { |
| "completion_length": 64.75, |
| "epoch": 0.748, |
| "grad_norm": 0.0004062652587890625, |
| "kl": 0.1115244179032743, |
| "learning_rate": 3.941080790424483e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3740 |
| }, |
| { |
| "completion_length": 59.65, |
| "epoch": 0.75, |
| "grad_norm": 0.000553131103515625, |
| "kl": 0.011348503362387418, |
| "learning_rate": 3.933941090877615e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3750 |
| }, |
| { |
| "completion_length": 50.1, |
| "epoch": 0.752, |
| "grad_norm": 0.0004749298095703125, |
| "kl": 0.01582015200983733, |
| "learning_rate": 3.92678391921108e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3760 |
| }, |
| { |
| "completion_length": 58.2, |
| "epoch": 0.754, |
| "grad_norm": 0.0004024505615234375, |
| "kl": 0.02131882361136377, |
| "learning_rate": 3.9196093626327535e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3770 |
| }, |
| { |
| "completion_length": 56.45, |
| "epoch": 0.756, |
| "grad_norm": 0.000766754150390625, |
| "kl": 0.024744509416632355, |
| "learning_rate": 3.912417508562345e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3780 |
| }, |
| { |
| "completion_length": 48.5, |
| "epoch": 0.758, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.015975080896168947, |
| "learning_rate": 3.905208444630326e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3790 |
| }, |
| { |
| "completion_length": 69.525, |
| "epoch": 0.76, |
| "grad_norm": 0.000537872314453125, |
| "kl": 0.009961457317695021, |
| "learning_rate": 3.897982258676867e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 3800 |
| }, |
| { |
| "completion_length": 60.575, |
| "epoch": 0.762, |
| "grad_norm": 0.000507354736328125, |
| "kl": 0.011210405128076672, |
| "learning_rate": 3.890739038750763e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3810 |
| }, |
| { |
| "completion_length": 46.2, |
| "epoch": 0.764, |
| "grad_norm": 0.000255584716796875, |
| "kl": 1.3439840027829633, |
| "learning_rate": 3.88347887310836e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 3820 |
| }, |
| { |
| "completion_length": 54.325, |
| "epoch": 0.766, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.09195185881108045, |
| "learning_rate": 3.876201850212489e-06, |
| "loss": 0.0, |
| "match_ratio": 0.675, |
| "reward": -0.325, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.325, |
| "step": 3830 |
| }, |
| { |
| "completion_length": 74.35, |
| "epoch": 0.768, |
| "grad_norm": 0.000225067138671875, |
| "kl": 0.01327997730113566, |
| "learning_rate": 3.868908058731376e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3840 |
| }, |
| { |
| "completion_length": 50.125, |
| "epoch": 0.77, |
| "grad_norm": 0.000370025634765625, |
| "kl": 0.0457455332390964, |
| "learning_rate": 3.861597587537568e-06, |
| "loss": 0.0, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.125, |
| "step": 3850 |
| }, |
| { |
| "completion_length": 67.225, |
| "epoch": 0.772, |
| "grad_norm": 0.0009765625, |
| "kl": 0.019194579031318427, |
| "learning_rate": 3.85427052570685e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3860 |
| }, |
| { |
| "completion_length": 67.425, |
| "epoch": 0.774, |
| "grad_norm": 0.000522613525390625, |
| "kl": 0.16493179565295576, |
| "learning_rate": 3.846926962517158e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 3870 |
| }, |
| { |
| "completion_length": 58.725, |
| "epoch": 0.776, |
| "grad_norm": 0.0007171630859375, |
| "kl": 0.0165805596858263, |
| "learning_rate": 3.839566987447492e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3880 |
| }, |
| { |
| "completion_length": 64.575, |
| "epoch": 0.778, |
| "grad_norm": 0.00061798095703125, |
| "kl": 0.014522301172837615, |
| "learning_rate": 3.832190690176825e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3890 |
| }, |
| { |
| "completion_length": 49.6, |
| "epoch": 0.78, |
| "grad_norm": 5.435943603515625e-05, |
| "kl": 0.014127893140539527, |
| "learning_rate": 3.824798160583012e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3900 |
| }, |
| { |
| "completion_length": 49.8, |
| "epoch": 0.782, |
| "grad_norm": 0.000278472900390625, |
| "kl": 0.11558867986313999, |
| "learning_rate": 3.817389488741694e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.2, |
| "step": 3910 |
| }, |
| { |
| "completion_length": 65.35, |
| "epoch": 0.784, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.013915874017402529, |
| "learning_rate": 3.8099647649251984e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3920 |
| }, |
| { |
| "completion_length": 63.1, |
| "epoch": 0.786, |
| "grad_norm": 0.000270843505859375, |
| "kl": 0.26083877284545454, |
| "learning_rate": 3.802524079601442e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3930 |
| }, |
| { |
| "completion_length": 57.475, |
| "epoch": 0.788, |
| "grad_norm": 0.000476837158203125, |
| "kl": 0.44110607262700796, |
| "learning_rate": 3.795067523432826e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 3940 |
| }, |
| { |
| "completion_length": 50.125, |
| "epoch": 0.79, |
| "grad_norm": 0.0004062652587890625, |
| "kl": 0.03131135320290923, |
| "learning_rate": 3.787595187275136e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3950 |
| }, |
| { |
| "completion_length": 47.525, |
| "epoch": 0.792, |
| "grad_norm": 0.000949859619140625, |
| "kl": 0.056715600471943614, |
| "learning_rate": 3.780107162176429e-06, |
| "loss": 0.0, |
| "match_ratio": 0.8, |
| "reward": -0.2, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.2, |
| "step": 3960 |
| }, |
| { |
| "completion_length": 47.1, |
| "epoch": 0.794, |
| "grad_norm": 52.75, |
| "kl": 0.48424787069670855, |
| "learning_rate": 3.772603539375929e-06, |
| "loss": 0.0, |
| "match_ratio": 0.75, |
| "reward": -0.25, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.25, |
| "step": 3970 |
| }, |
| { |
| "completion_length": 64.875, |
| "epoch": 0.796, |
| "grad_norm": 0.000530242919921875, |
| "kl": 0.08614660077728331, |
| "learning_rate": 3.7650844103029093e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 3980 |
| }, |
| { |
| "completion_length": 64.725, |
| "epoch": 0.798, |
| "grad_norm": 0.000347137451171875, |
| "kl": 0.012265483383089304, |
| "learning_rate": 3.7575498665755884e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 3990 |
| }, |
| { |
| "completion_length": 61.2, |
| "epoch": 0.8, |
| "grad_norm": 0.00040435791015625, |
| "kl": 0.032297836942598225, |
| "learning_rate": 3.7500000000000005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4000 |
| }, |
| { |
| "completion_length": 65.8, |
| "epoch": 0.802, |
| "grad_norm": 0.00167083740234375, |
| "kl": 0.016016237577423452, |
| "learning_rate": 3.742434902568889e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4010 |
| }, |
| { |
| "completion_length": 56.125, |
| "epoch": 0.804, |
| "grad_norm": 0.0004100799560546875, |
| "kl": 0.06678674127906561, |
| "learning_rate": 3.7348546664605777e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4020 |
| }, |
| { |
| "completion_length": 52.825, |
| "epoch": 0.806, |
| "grad_norm": 13.9375, |
| "kl": 39.51057905447669, |
| "learning_rate": 3.7272593840378526e-06, |
| "loss": 0.004, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.175, |
| "step": 4030 |
| }, |
| { |
| "completion_length": 56.375, |
| "epoch": 0.808, |
| "grad_norm": 0.000736236572265625, |
| "kl": 76.7652599786874, |
| "learning_rate": 3.7196491478468322e-06, |
| "loss": 0.0077, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 4040 |
| }, |
| { |
| "completion_length": 59.775, |
| "epoch": 0.81, |
| "grad_norm": 23.625, |
| "kl": 2.1401951428037136, |
| "learning_rate": 3.7120240506158433e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4050 |
| }, |
| { |
| "completion_length": 78.925, |
| "epoch": 0.812, |
| "grad_norm": 0.000362396240234375, |
| "kl": 0.019226322788745163, |
| "learning_rate": 3.7043841852542884e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4060 |
| }, |
| { |
| "completion_length": 58.425, |
| "epoch": 0.814, |
| "grad_norm": 0.00078582763671875, |
| "kl": 0.04994579209014773, |
| "learning_rate": 3.6967296448515176e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4070 |
| }, |
| { |
| "completion_length": 57.025, |
| "epoch": 0.816, |
| "grad_norm": 0.000553131103515625, |
| "kl": 0.11473355963826179, |
| "learning_rate": 3.689060522675689e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4080 |
| }, |
| { |
| "completion_length": 59.875, |
| "epoch": 0.818, |
| "grad_norm": 0.0164794921875, |
| "kl": 0.05073905866593122, |
| "learning_rate": 3.6813769121726356e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4090 |
| }, |
| { |
| "completion_length": 44.25, |
| "epoch": 0.82, |
| "grad_norm": 0.0002727508544921875, |
| "kl": 0.7498203465249389, |
| "learning_rate": 3.6736789069647273e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 4100 |
| }, |
| { |
| "completion_length": 58.15, |
| "epoch": 0.822, |
| "grad_norm": 0.0007781982421875, |
| "kl": 0.022883613361045718, |
| "learning_rate": 3.6659666008497287e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4110 |
| }, |
| { |
| "completion_length": 64.35, |
| "epoch": 0.824, |
| "grad_norm": 0.0005035400390625, |
| "kl": 3.837466208729893, |
| "learning_rate": 3.658240087799655e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4120 |
| }, |
| { |
| "completion_length": 63.35, |
| "epoch": 0.826, |
| "grad_norm": 0.0002651214599609375, |
| "kl": 0.03254580916836858, |
| "learning_rate": 3.6504994619596295e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4130 |
| }, |
| { |
| "completion_length": 45.65, |
| "epoch": 0.828, |
| "grad_norm": 0.000396728515625, |
| "kl": 17.52819751542993, |
| "learning_rate": 3.642744817646736e-06, |
| "loss": 0.0018, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 4140 |
| }, |
| { |
| "completion_length": 62.8, |
| "epoch": 0.83, |
| "grad_norm": 0.05712890625, |
| "kl": 0.056678724475204945, |
| "learning_rate": 3.634976249348867e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4150 |
| }, |
| { |
| "completion_length": 49.45, |
| "epoch": 0.832, |
| "grad_norm": 0.0004863739013671875, |
| "kl": 0.02377572702243924, |
| "learning_rate": 3.627193851723577e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4160 |
| }, |
| { |
| "completion_length": 56.3, |
| "epoch": 0.834, |
| "grad_norm": 0.0010528564453125, |
| "kl": 0.23082902017049492, |
| "learning_rate": 3.6193977195969243e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4170 |
| }, |
| { |
| "completion_length": 46.35, |
| "epoch": 0.836, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.037431746069341895, |
| "learning_rate": 3.611587947962319e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4180 |
| }, |
| { |
| "completion_length": 66.15, |
| "epoch": 0.838, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.014803345128893853, |
| "learning_rate": 3.6037646319793635e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4190 |
| }, |
| { |
| "completion_length": 54.525, |
| "epoch": 0.84, |
| "grad_norm": 0.000743865966796875, |
| "kl": 0.03277415055781603, |
| "learning_rate": 3.595927866972694e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4200 |
| }, |
| { |
| "completion_length": 74.5, |
| "epoch": 0.842, |
| "grad_norm": 0.0004863739013671875, |
| "kl": 0.014684983342885972, |
| "learning_rate": 3.5880777484308193e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4210 |
| }, |
| { |
| "completion_length": 63.125, |
| "epoch": 0.844, |
| "grad_norm": 0.000640869140625, |
| "kl": 0.02462619331199676, |
| "learning_rate": 3.5802143720049565e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4220 |
| }, |
| { |
| "completion_length": 56.275, |
| "epoch": 0.846, |
| "grad_norm": 0.000415802001953125, |
| "kl": 4.124728001933545, |
| "learning_rate": 3.5723378335078653e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 4230 |
| }, |
| { |
| "completion_length": 53.25, |
| "epoch": 0.848, |
| "grad_norm": 0.000759124755859375, |
| "kl": 0.024984571058303116, |
| "learning_rate": 3.564448228912682e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4240 |
| }, |
| { |
| "completion_length": 56.15, |
| "epoch": 0.85, |
| "grad_norm": 0.00075531005859375, |
| "kl": 8.048016933631152, |
| "learning_rate": 3.556545654351749e-06, |
| "loss": 0.0008, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4250 |
| }, |
| { |
| "completion_length": 46.275, |
| "epoch": 0.852, |
| "grad_norm": 0.00115203857421875, |
| "kl": 0.02151933144778013, |
| "learning_rate": 3.5486302061154433e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4260 |
| }, |
| { |
| "completion_length": 56.525, |
| "epoch": 0.854, |
| "grad_norm": 0.000476837158203125, |
| "kl": 0.042470036540180445, |
| "learning_rate": 3.5407019806510035e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4270 |
| }, |
| { |
| "completion_length": 73.7, |
| "epoch": 0.856, |
| "grad_norm": 0.00035858154296875, |
| "kl": 0.01676445291377604, |
| "learning_rate": 3.532761074561355e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4280 |
| }, |
| { |
| "completion_length": 58.625, |
| "epoch": 0.858, |
| "grad_norm": 0.00136566162109375, |
| "kl": 13.48712082421407, |
| "learning_rate": 3.524807584603932e-06, |
| "loss": 0.0013, |
| "match_ratio": 0.825, |
| "reward": -0.175, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.175, |
| "step": 4290 |
| }, |
| { |
| "completion_length": 72.7, |
| "epoch": 0.86, |
| "grad_norm": 0.001922607421875, |
| "kl": 0.01346550565212965, |
| "learning_rate": 3.516841607689501e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4300 |
| }, |
| { |
| "completion_length": 48.225, |
| "epoch": 0.862, |
| "grad_norm": 0.000934600830078125, |
| "kl": 1.3110887278337031, |
| "learning_rate": 3.5088632408809757e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4310 |
| }, |
| { |
| "completion_length": 42.375, |
| "epoch": 0.864, |
| "grad_norm": 0.000583648681640625, |
| "kl": 32.864392778254114, |
| "learning_rate": 3.5008725813922383e-06, |
| "loss": 0.0033, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 4320 |
| }, |
| { |
| "completion_length": 54.675, |
| "epoch": 0.866, |
| "grad_norm": 0.00145721435546875, |
| "kl": 0.43685728376731275, |
| "learning_rate": 3.4928697265869516e-06, |
| "loss": 0.0, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 4330 |
| }, |
| { |
| "completion_length": 61.15, |
| "epoch": 0.868, |
| "grad_norm": 0.0004787445068359375, |
| "kl": 0.02581656016409397, |
| "learning_rate": 3.4848547739773782e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4340 |
| }, |
| { |
| "completion_length": 52.5, |
| "epoch": 0.87, |
| "grad_norm": 0.0003643035888671875, |
| "kl": 0.013556264666840434, |
| "learning_rate": 3.476827821223184e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4350 |
| }, |
| { |
| "completion_length": 56.025, |
| "epoch": 0.872, |
| "grad_norm": 0.00144195556640625, |
| "kl": 0.05843255072832108, |
| "learning_rate": 3.4687889661302577e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4360 |
| }, |
| { |
| "completion_length": 68.225, |
| "epoch": 0.874, |
| "grad_norm": 0.000438690185546875, |
| "kl": 0.025629992503672837, |
| "learning_rate": 3.460738306649509e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4370 |
| }, |
| { |
| "completion_length": 40.225, |
| "epoch": 0.876, |
| "grad_norm": 0.000537872314453125, |
| "kl": 20.029893927741796, |
| "learning_rate": 3.452675940875686e-06, |
| "loss": 0.002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4380 |
| }, |
| { |
| "completion_length": 53.5, |
| "epoch": 0.878, |
| "grad_norm": 0.000759124755859375, |
| "kl": 0.0226501208730042, |
| "learning_rate": 3.4446019670461684e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4390 |
| }, |
| { |
| "completion_length": 55.325, |
| "epoch": 0.88, |
| "grad_norm": 0.00035858154296875, |
| "kl": 1.9279290955979378, |
| "learning_rate": 3.436516483539781e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4400 |
| }, |
| { |
| "completion_length": 70.05, |
| "epoch": 0.882, |
| "grad_norm": 10.0, |
| "kl": 0.05578553443774581, |
| "learning_rate": 3.4284195888755877e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4410 |
| }, |
| { |
| "completion_length": 58.05, |
| "epoch": 0.884, |
| "grad_norm": 0.00086212158203125, |
| "kl": 0.019159636087715627, |
| "learning_rate": 3.4203113817116955e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4420 |
| }, |
| { |
| "completion_length": 62.425, |
| "epoch": 0.886, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.10707788309082389, |
| "learning_rate": 3.412191960844049e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4430 |
| }, |
| { |
| "completion_length": 56.225, |
| "epoch": 0.888, |
| "grad_norm": 0.00061798095703125, |
| "kl": 3.764384925994091, |
| "learning_rate": 3.4040614252052305e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4440 |
| }, |
| { |
| "completion_length": 57.075, |
| "epoch": 0.89, |
| "grad_norm": 0.002838134765625, |
| "kl": 0.04788713352754712, |
| "learning_rate": 3.39591987386325e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4450 |
| }, |
| { |
| "completion_length": 62.575, |
| "epoch": 0.892, |
| "grad_norm": 0.000339508056640625, |
| "kl": 0.024599794298410416, |
| "learning_rate": 3.387767406020343e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4460 |
| }, |
| { |
| "completion_length": 47.85, |
| "epoch": 0.894, |
| "grad_norm": 0.000522613525390625, |
| "kl": 419.8108845547773, |
| "learning_rate": 3.3796041210117545e-06, |
| "loss": 0.042, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4470 |
| }, |
| { |
| "completion_length": 56.55, |
| "epoch": 0.896, |
| "grad_norm": 0.0016326904296875, |
| "kl": 4.869295587006491, |
| "learning_rate": 3.3714301183045382e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 4480 |
| }, |
| { |
| "completion_length": 65.65, |
| "epoch": 0.898, |
| "grad_norm": 0.000926971435546875, |
| "kl": 0.021241254778578876, |
| "learning_rate": 3.3632454974963368e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4490 |
| }, |
| { |
| "completion_length": 51.7, |
| "epoch": 0.9, |
| "grad_norm": 0.00225830078125, |
| "kl": 0.06338205388747156, |
| "learning_rate": 3.3550503583141726e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4500 |
| }, |
| { |
| "completion_length": 52.65, |
| "epoch": 0.902, |
| "grad_norm": 0.0003948211669921875, |
| "kl": 0.3547412235289812, |
| "learning_rate": 3.346844800613229e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 4510 |
| }, |
| { |
| "completion_length": 66.475, |
| "epoch": 0.904, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.01707718223333359, |
| "learning_rate": 3.338628924375638e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4520 |
| }, |
| { |
| "completion_length": 69.775, |
| "epoch": 0.906, |
| "grad_norm": 0.00017642974853515625, |
| "kl": 4.814126300462521, |
| "learning_rate": 3.3304028297092583e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1154700517654419, |
| "rewards/reward_func": -0.1, |
| "step": 4530 |
| }, |
| { |
| "completion_length": 72.7, |
| "epoch": 0.908, |
| "grad_norm": 0.00022411346435546875, |
| "kl": 0.0319039260270074, |
| "learning_rate": 3.3221666168464584e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4540 |
| }, |
| { |
| "completion_length": 63.75, |
| "epoch": 0.91, |
| "grad_norm": 0.002227783203125, |
| "kl": 0.02470703413709998, |
| "learning_rate": 3.313920386142892e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4550 |
| }, |
| { |
| "completion_length": 49.6, |
| "epoch": 0.912, |
| "grad_norm": 0.0012969970703125, |
| "kl": 2.1085672612302004, |
| "learning_rate": 3.3056642380762783e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4560 |
| }, |
| { |
| "completion_length": 70.425, |
| "epoch": 0.914, |
| "grad_norm": 0.0013885498046875, |
| "kl": 0.016354763973504306, |
| "learning_rate": 3.2973982732451753e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4570 |
| }, |
| { |
| "completion_length": 59.775, |
| "epoch": 0.916, |
| "grad_norm": 0.000934600830078125, |
| "kl": 0.035655501671135424, |
| "learning_rate": 3.2891225923677565e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4580 |
| }, |
| { |
| "completion_length": 54.775, |
| "epoch": 0.918, |
| "grad_norm": 0.01336669921875, |
| "kl": 4.602624000795186, |
| "learning_rate": 3.280837296280582e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4590 |
| }, |
| { |
| "completion_length": 46.175, |
| "epoch": 0.92, |
| "grad_norm": 0.0003147125244140625, |
| "kl": 0.02292898967862129, |
| "learning_rate": 3.272542485937369e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4600 |
| }, |
| { |
| "completion_length": 48.85, |
| "epoch": 0.922, |
| "grad_norm": 0.000362396240234375, |
| "kl": 0.08314138883724809, |
| "learning_rate": 3.2642382624077647e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4610 |
| }, |
| { |
| "completion_length": 49.15, |
| "epoch": 0.924, |
| "grad_norm": 0.0006103515625, |
| "kl": 0.01725058164447546, |
| "learning_rate": 3.2559247268761117e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4620 |
| }, |
| { |
| "completion_length": 56.0, |
| "epoch": 0.926, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.02017789352685213, |
| "learning_rate": 3.247601980640217e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4630 |
| }, |
| { |
| "completion_length": 48.975, |
| "epoch": 0.928, |
| "grad_norm": 0.00102996826171875, |
| "kl": 0.015598981559742242, |
| "learning_rate": 3.2392701251101172e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4640 |
| }, |
| { |
| "completion_length": 54.35, |
| "epoch": 0.93, |
| "grad_norm": 0.00016307830810546875, |
| "kl": 0.017281436000484974, |
| "learning_rate": 3.230929261806842e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4650 |
| }, |
| { |
| "completion_length": 62.25, |
| "epoch": 0.932, |
| "grad_norm": 0.00055694580078125, |
| "kl": 27.82315392717719, |
| "learning_rate": 3.222579492361179e-06, |
| "loss": 0.0028, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4660 |
| }, |
| { |
| "completion_length": 51.0, |
| "epoch": 0.934, |
| "grad_norm": 0.00083160400390625, |
| "kl": 0.18145442437380552, |
| "learning_rate": 3.214220918512434e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4670 |
| }, |
| { |
| "completion_length": 53.025, |
| "epoch": 0.936, |
| "grad_norm": 0.000469207763671875, |
| "kl": 0.26395926494151356, |
| "learning_rate": 3.205853642107192e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4680 |
| }, |
| { |
| "completion_length": 46.575, |
| "epoch": 0.938, |
| "grad_norm": 32.75, |
| "kl": 50.37309080436826, |
| "learning_rate": 3.1974777650980737e-06, |
| "loss": 0.005, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 4690 |
| }, |
| { |
| "completion_length": 54.75, |
| "epoch": 0.94, |
| "grad_norm": 0.00131988525390625, |
| "kl": 1.135747592896223, |
| "learning_rate": 3.189093389542498e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 4700 |
| }, |
| { |
| "completion_length": 45.35, |
| "epoch": 0.942, |
| "grad_norm": 0.000698089599609375, |
| "kl": 8.475377059169114, |
| "learning_rate": 3.180700617601436e-06, |
| "loss": 0.0008, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4710 |
| }, |
| { |
| "completion_length": 62.7, |
| "epoch": 0.944, |
| "grad_norm": 0.000827789306640625, |
| "kl": 0.10563798192888499, |
| "learning_rate": 3.1722995515381644e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4720 |
| }, |
| { |
| "completion_length": 51.2, |
| "epoch": 0.946, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.041765560209751126, |
| "learning_rate": 3.1638902937170224e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4730 |
| }, |
| { |
| "completion_length": 53.425, |
| "epoch": 0.948, |
| "grad_norm": 0.037841796875, |
| "kl": 90.9074339528568, |
| "learning_rate": 3.155472946602162e-06, |
| "loss": 0.0091, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 4740 |
| }, |
| { |
| "completion_length": 56.025, |
| "epoch": 0.95, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.14690550537779928, |
| "learning_rate": 3.147047612756302e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4750 |
| }, |
| { |
| "completion_length": 59.45, |
| "epoch": 0.952, |
| "grad_norm": 0.00110626220703125, |
| "kl": 0.021767212729901075, |
| "learning_rate": 3.1386143948394764e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4760 |
| }, |
| { |
| "completion_length": 49.15, |
| "epoch": 0.954, |
| "grad_norm": 68.0, |
| "kl": 50.01492289174348, |
| "learning_rate": 3.130173395607785e-06, |
| "loss": 0.005, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 4770 |
| }, |
| { |
| "completion_length": 50.925, |
| "epoch": 0.956, |
| "grad_norm": 0.000579833984375, |
| "kl": 3.767396915424615, |
| "learning_rate": 3.121724717912138e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4780 |
| }, |
| { |
| "completion_length": 54.85, |
| "epoch": 0.958, |
| "grad_norm": 0.00109100341796875, |
| "kl": 0.024809733917936682, |
| "learning_rate": 3.1132684646970068e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4790 |
| }, |
| { |
| "completion_length": 60.0, |
| "epoch": 0.96, |
| "grad_norm": 0.000492095947265625, |
| "kl": 22.365475433226674, |
| "learning_rate": 3.1048047389991693e-06, |
| "loss": 0.0022, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.1, |
| "step": 4800 |
| }, |
| { |
| "completion_length": 48.5, |
| "epoch": 0.962, |
| "grad_norm": 0.00107574462890625, |
| "kl": 0.01949691798072308, |
| "learning_rate": 3.0963336439464527e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4810 |
| }, |
| { |
| "completion_length": 73.975, |
| "epoch": 0.964, |
| "grad_norm": 0.0017242431640625, |
| "kl": 0.0853988635353744, |
| "learning_rate": 3.087855282756475e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4820 |
| }, |
| { |
| "completion_length": 61.175, |
| "epoch": 0.966, |
| "grad_norm": 0.00046539306640625, |
| "kl": 0.05258291512727738, |
| "learning_rate": 3.079369758735393e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4830 |
| }, |
| { |
| "completion_length": 69.55, |
| "epoch": 0.968, |
| "grad_norm": 0.08447265625, |
| "kl": 0.19851951650343835, |
| "learning_rate": 3.0708771752766397e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4840 |
| }, |
| { |
| "completion_length": 52.35, |
| "epoch": 0.97, |
| "grad_norm": 0.00139617919921875, |
| "kl": 0.016163587383925915, |
| "learning_rate": 3.062377635859663e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4850 |
| }, |
| { |
| "completion_length": 59.4, |
| "epoch": 0.972, |
| "grad_norm": 0.000415802001953125, |
| "kl": 0.07268630117177963, |
| "learning_rate": 3.053871244048669e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4860 |
| }, |
| { |
| "completion_length": 52.725, |
| "epoch": 0.974, |
| "grad_norm": 0.000751495361328125, |
| "kl": 0.08838214613497257, |
| "learning_rate": 3.045358103491357e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4870 |
| }, |
| { |
| "completion_length": 47.25, |
| "epoch": 0.976, |
| "grad_norm": 0.0015411376953125, |
| "kl": 0.8490013023838401, |
| "learning_rate": 3.0368383179176584e-06, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4880 |
| }, |
| { |
| "completion_length": 51.9, |
| "epoch": 0.978, |
| "grad_norm": 0.000576019287109375, |
| "kl": 0.06923787947744131, |
| "learning_rate": 3.0283119911384724e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4890 |
| }, |
| { |
| "completion_length": 45.05, |
| "epoch": 0.98, |
| "grad_norm": 0.000823974609375, |
| "kl": 0.06741849109530448, |
| "learning_rate": 3.019779227044398e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4900 |
| }, |
| { |
| "completion_length": 59.025, |
| "epoch": 0.982, |
| "grad_norm": 0.000885009765625, |
| "kl": 0.20571241448633373, |
| "learning_rate": 3.0112401296044756e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4910 |
| }, |
| { |
| "completion_length": 58.85, |
| "epoch": 0.984, |
| "grad_norm": 0.0003299713134765625, |
| "kl": 0.0659916253760457, |
| "learning_rate": 3.002694802864912e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4920 |
| }, |
| { |
| "completion_length": 57.3, |
| "epoch": 0.986, |
| "grad_norm": 0.0010223388671875, |
| "kl": 0.9824612125754356, |
| "learning_rate": 2.9941433509478157e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 4930 |
| }, |
| { |
| "completion_length": 63.675, |
| "epoch": 0.988, |
| "grad_norm": 0.0008544921875, |
| "kl": 0.08037902340292931, |
| "learning_rate": 2.98558587804993e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4940 |
| }, |
| { |
| "completion_length": 61.775, |
| "epoch": 0.99, |
| "grad_norm": 0.0003757476806640625, |
| "kl": 0.05214073383249342, |
| "learning_rate": 2.9770224884413625e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4950 |
| }, |
| { |
| "completion_length": 58.125, |
| "epoch": 0.992, |
| "grad_norm": 0.003936767578125, |
| "kl": 0.307067746296525, |
| "learning_rate": 2.9684532864643123e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4960 |
| }, |
| { |
| "completion_length": 47.575, |
| "epoch": 0.994, |
| "grad_norm": 0.00148773193359375, |
| "kl": 0.4293937426991761, |
| "learning_rate": 2.9598783765318005e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4970 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 0.996, |
| "grad_norm": 0.000698089599609375, |
| "kl": 0.055739361047744754, |
| "learning_rate": 2.9512978631264006e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 4980 |
| }, |
| { |
| "completion_length": 61.25, |
| "epoch": 0.998, |
| "grad_norm": 0.0030517578125, |
| "kl": 0.31474687876179813, |
| "learning_rate": 2.942711850798959e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 4990 |
| }, |
| { |
| "completion_length": 55.975, |
| "epoch": 1.0, |
| "grad_norm": 0.000606536865234375, |
| "kl": 10.358118780329823, |
| "learning_rate": 2.9341204441673267e-06, |
| "loss": 0.001, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 5000 |
| }, |
| { |
| "completion_length": 67.125, |
| "epoch": 1.002, |
| "grad_norm": 0.0023193359375, |
| "kl": 0.02981336957309395, |
| "learning_rate": 2.9255237479150815e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5010 |
| }, |
| { |
| "completion_length": 46.975, |
| "epoch": 1.004, |
| "grad_norm": 0.0004596710205078125, |
| "kl": 0.18872954780235887, |
| "learning_rate": 2.9169218667902562e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5020 |
| }, |
| { |
| "completion_length": 81.2, |
| "epoch": 1.006, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.019520534854382276, |
| "learning_rate": 2.908314905604056e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5030 |
| }, |
| { |
| "completion_length": 84.75, |
| "epoch": 1.008, |
| "grad_norm": 11.1875, |
| "kl": 0.21086107967421414, |
| "learning_rate": 2.8997029692295875e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5040 |
| }, |
| { |
| "completion_length": 67.175, |
| "epoch": 1.01, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.026521979738026856, |
| "learning_rate": 2.8910861626005774e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5050 |
| }, |
| { |
| "completion_length": 49.025, |
| "epoch": 1.012, |
| "grad_norm": 0.00106048583984375, |
| "kl": 0.05570605006068945, |
| "learning_rate": 2.8824645907100957e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5060 |
| }, |
| { |
| "completion_length": 57.875, |
| "epoch": 1.014, |
| "grad_norm": 0.000522613525390625, |
| "kl": 0.02027883781120181, |
| "learning_rate": 2.8738383586092745e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5070 |
| }, |
| { |
| "completion_length": 64.325, |
| "epoch": 1.016, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 0.35661591766402123, |
| "learning_rate": 2.8652075714060296e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5080 |
| }, |
| { |
| "completion_length": 47.125, |
| "epoch": 1.018, |
| "grad_norm": 0.0033111572265625, |
| "kl": 0.16099169924855233, |
| "learning_rate": 2.8565723342637797e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5090 |
| }, |
| { |
| "completion_length": 58.325, |
| "epoch": 1.02, |
| "grad_norm": 0.00066375732421875, |
| "kl": 36.65545420385897, |
| "learning_rate": 2.847932752400164e-06, |
| "loss": 0.0037, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5100 |
| }, |
| { |
| "completion_length": 54.975, |
| "epoch": 1.022, |
| "grad_norm": 0.0054931640625, |
| "kl": 0.14035283839330076, |
| "learning_rate": 2.8392889310857615e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5110 |
| }, |
| { |
| "completion_length": 44.15, |
| "epoch": 1.024, |
| "grad_norm": 0.0006103515625, |
| "kl": 0.05474662664346397, |
| "learning_rate": 2.8306409756428067e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5120 |
| }, |
| { |
| "completion_length": 51.425, |
| "epoch": 1.026, |
| "grad_norm": 0.00077056884765625, |
| "kl": 0.9017472909763455, |
| "learning_rate": 2.8219889914439073e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5130 |
| }, |
| { |
| "completion_length": 56.425, |
| "epoch": 1.028, |
| "grad_norm": 0.0030059814453125, |
| "kl": 920.3957623304799, |
| "learning_rate": 2.813333083910761e-06, |
| "loss": 0.092, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 5140 |
| }, |
| { |
| "completion_length": 61.8, |
| "epoch": 1.03, |
| "grad_norm": 0.002166748046875, |
| "kl": 0.06634964090771973, |
| "learning_rate": 2.804673358512869e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5150 |
| }, |
| { |
| "completion_length": 60.6, |
| "epoch": 1.032, |
| "grad_norm": 0.0009613037109375, |
| "kl": 0.10043707201257349, |
| "learning_rate": 2.7960099207662535e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5160 |
| }, |
| { |
| "completion_length": 60.625, |
| "epoch": 1.034, |
| "grad_norm": 0.00049591064453125, |
| "kl": 2957.9408455969765, |
| "learning_rate": 2.7873428762321667e-06, |
| "loss": 0.2958, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5170 |
| }, |
| { |
| "completion_length": 48.175, |
| "epoch": 1.036, |
| "grad_norm": 0.000431060791015625, |
| "kl": 0.44579824099782855, |
| "learning_rate": 2.778672330515814e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 5180 |
| }, |
| { |
| "completion_length": 69.6, |
| "epoch": 1.038, |
| "grad_norm": 0.000530242919921875, |
| "kl": 0.1467185489833355, |
| "learning_rate": 2.769998389265057e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5190 |
| }, |
| { |
| "completion_length": 46.4, |
| "epoch": 1.04, |
| "grad_norm": 0.0022735595703125, |
| "kl": 0.1808565909974277, |
| "learning_rate": 2.761321158169134e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5200 |
| }, |
| { |
| "completion_length": 66.075, |
| "epoch": 1.042, |
| "grad_norm": 0.00151824951171875, |
| "kl": 0.10877533163875341, |
| "learning_rate": 2.752640742957366e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5210 |
| }, |
| { |
| "completion_length": 64.8, |
| "epoch": 1.044, |
| "grad_norm": 0.000827789306640625, |
| "kl": 36.803903768444435, |
| "learning_rate": 2.743957249397874e-06, |
| "loss": 0.0037, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5220 |
| }, |
| { |
| "completion_length": 61.525, |
| "epoch": 1.046, |
| "grad_norm": 0.0040283203125, |
| "kl": 0.021204144693911076, |
| "learning_rate": 2.7352707832962865e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5230 |
| }, |
| { |
| "completion_length": 45.65, |
| "epoch": 1.048, |
| "grad_norm": 0.003173828125, |
| "kl": 2.3442719845101236, |
| "learning_rate": 2.726581450494451e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.875, |
| "reward": -0.125, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.125, |
| "step": 5240 |
| }, |
| { |
| "completion_length": 54.725, |
| "epoch": 1.05, |
| "grad_norm": 0.0003566741943359375, |
| "kl": 0.08242949154227971, |
| "learning_rate": 2.717889356869146e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5250 |
| }, |
| { |
| "completion_length": 68.075, |
| "epoch": 1.052, |
| "grad_norm": 0.0003337860107421875, |
| "kl": 0.0425911046564579, |
| "learning_rate": 2.70919460833079e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5260 |
| }, |
| { |
| "completion_length": 58.25, |
| "epoch": 1.054, |
| "grad_norm": 0.00067901611328125, |
| "kl": 0.05532512974459678, |
| "learning_rate": 2.700497310822147e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5270 |
| }, |
| { |
| "completion_length": 67.3, |
| "epoch": 1.056, |
| "grad_norm": 1616.0, |
| "kl": 654.5019911365816, |
| "learning_rate": 2.6917975703170466e-06, |
| "loss": 0.0655, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5280 |
| }, |
| { |
| "completion_length": 50.725, |
| "epoch": 1.058, |
| "grad_norm": 0.0004863739013671875, |
| "kl": 314.1969824824482, |
| "learning_rate": 2.6830954928190795e-06, |
| "loss": 0.0314, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 5290 |
| }, |
| { |
| "completion_length": 52.65, |
| "epoch": 1.06, |
| "grad_norm": 0.000690460205078125, |
| "kl": 3050.6620800592004, |
| "learning_rate": 2.6743911843603134e-06, |
| "loss": 0.3051, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.1, |
| "step": 5300 |
| }, |
| { |
| "completion_length": 45.2, |
| "epoch": 1.062, |
| "grad_norm": 0.000606536865234375, |
| "kl": 0.6229335282929241, |
| "learning_rate": 2.6656847510000013e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5310 |
| }, |
| { |
| "completion_length": 59.825, |
| "epoch": 1.064, |
| "grad_norm": 0.00023555755615234375, |
| "kl": 0.0229927783831954, |
| "learning_rate": 2.6569762988232838e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5320 |
| }, |
| { |
| "completion_length": 45.525, |
| "epoch": 1.066, |
| "grad_norm": 0.470703125, |
| "kl": 12.478434246452526, |
| "learning_rate": 2.6482659339399047e-06, |
| "loss": 0.0012, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 5330 |
| }, |
| { |
| "completion_length": 48.5, |
| "epoch": 1.068, |
| "grad_norm": 0.000637054443359375, |
| "kl": 0.04660536227747798, |
| "learning_rate": 2.63955376248291e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5340 |
| }, |
| { |
| "completion_length": 57.5, |
| "epoch": 1.07, |
| "grad_norm": 0.00103759765625, |
| "kl": 0.46991982199251653, |
| "learning_rate": 2.6308398906073603e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5350 |
| }, |
| { |
| "completion_length": 59.9, |
| "epoch": 1.072, |
| "grad_norm": 0.00051116943359375, |
| "kl": 0.05683571686968207, |
| "learning_rate": 2.6221244244890336e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5360 |
| }, |
| { |
| "completion_length": 74.275, |
| "epoch": 1.074, |
| "grad_norm": 0.00079345703125, |
| "kl": 0.044226788356900214, |
| "learning_rate": 2.613407470323134e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5370 |
| }, |
| { |
| "completion_length": 71.075, |
| "epoch": 1.076, |
| "grad_norm": 0.000576019287109375, |
| "kl": 0.014436176512390375, |
| "learning_rate": 2.604689134322999e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5380 |
| }, |
| { |
| "completion_length": 54.925, |
| "epoch": 1.078, |
| "grad_norm": 0.00112152099609375, |
| "kl": 0.06992563903331757, |
| "learning_rate": 2.5959695227188e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5390 |
| }, |
| { |
| "completion_length": 58.65, |
| "epoch": 1.08, |
| "grad_norm": 0.00103759765625, |
| "kl": 0.026312044728547333, |
| "learning_rate": 2.587248741756253e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5400 |
| }, |
| { |
| "completion_length": 61.625, |
| "epoch": 1.082, |
| "grad_norm": 38.25, |
| "kl": 96.55521301142872, |
| "learning_rate": 2.578526897695321e-06, |
| "loss": 0.0097, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.075, |
| "step": 5410 |
| }, |
| { |
| "completion_length": 74.625, |
| "epoch": 1.084, |
| "grad_norm": 0.0004444122314453125, |
| "kl": 0.027571643888950347, |
| "learning_rate": 2.569804096808923e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5420 |
| }, |
| { |
| "completion_length": 63.75, |
| "epoch": 1.086, |
| "grad_norm": 0.000728607177734375, |
| "kl": 0.06449384274892508, |
| "learning_rate": 2.5610804453816333e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5430 |
| }, |
| { |
| "completion_length": 61.75, |
| "epoch": 1.088, |
| "grad_norm": 0.001007080078125, |
| "kl": 0.014204623247496783, |
| "learning_rate": 2.5523560497083927e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5440 |
| }, |
| { |
| "completion_length": 48.175, |
| "epoch": 1.09, |
| "grad_norm": 41.75, |
| "kl": 4.48764673435362, |
| "learning_rate": 2.543631016093209e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 5450 |
| }, |
| { |
| "completion_length": 53.15, |
| "epoch": 1.092, |
| "grad_norm": 0.00054931640625, |
| "kl": 0.04185728752054274, |
| "learning_rate": 2.5349054508478636e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5460 |
| }, |
| { |
| "completion_length": 58.475, |
| "epoch": 1.094, |
| "grad_norm": 0.000457763671875, |
| "kl": 0.058739370107650755, |
| "learning_rate": 2.526179460290615e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5470 |
| }, |
| { |
| "completion_length": 54.5, |
| "epoch": 1.096, |
| "grad_norm": 0.025634765625, |
| "kl": 0.8717142393812537, |
| "learning_rate": 2.517453150744904e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5480 |
| }, |
| { |
| "completion_length": 65.1, |
| "epoch": 1.098, |
| "grad_norm": 0.000553131103515625, |
| "kl": 0.0642022612504661, |
| "learning_rate": 2.5087266285380597e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5490 |
| }, |
| { |
| "completion_length": 55.0, |
| "epoch": 1.1, |
| "grad_norm": 0.000644683837890625, |
| "kl": 0.037831029202789065, |
| "learning_rate": 2.5e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5500 |
| }, |
| { |
| "completion_length": 56.85, |
| "epoch": 1.102, |
| "grad_norm": 0.001007080078125, |
| "kl": 0.1622185967862606, |
| "learning_rate": 2.4912733714619415e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5510 |
| }, |
| { |
| "completion_length": 72.775, |
| "epoch": 1.104, |
| "grad_norm": 0.00127410888671875, |
| "kl": 0.014623588742688298, |
| "learning_rate": 2.482546849255096e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5520 |
| }, |
| { |
| "completion_length": 53.525, |
| "epoch": 1.106, |
| "grad_norm": 0.000782012939453125, |
| "kl": 0.21322614937089385, |
| "learning_rate": 2.4738205397093863e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5530 |
| }, |
| { |
| "completion_length": 48.825, |
| "epoch": 1.108, |
| "grad_norm": 32.0, |
| "kl": 3.262874563597143, |
| "learning_rate": 2.4650945491521372e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5540 |
| }, |
| { |
| "completion_length": 58.7, |
| "epoch": 1.11, |
| "grad_norm": 0.0012664794921875, |
| "kl": 0.062091145850718024, |
| "learning_rate": 2.4563689839067913e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5550 |
| }, |
| { |
| "completion_length": 45.5, |
| "epoch": 1.112, |
| "grad_norm": 0.00070953369140625, |
| "kl": 58.79087800290436, |
| "learning_rate": 2.447643950291608e-06, |
| "loss": 0.0059, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5560 |
| }, |
| { |
| "completion_length": 60.625, |
| "epoch": 1.114, |
| "grad_norm": 0.0003070831298828125, |
| "kl": 11.288955740490929, |
| "learning_rate": 2.4389195546183676e-06, |
| "loss": 0.0011, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5570 |
| }, |
| { |
| "completion_length": 57.375, |
| "epoch": 1.116, |
| "grad_norm": 0.004669189453125, |
| "kl": 0.05885868603363633, |
| "learning_rate": 2.4301959031910785e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5580 |
| }, |
| { |
| "completion_length": 64.825, |
| "epoch": 1.1179999999999999, |
| "grad_norm": 0.00072479248046875, |
| "kl": 0.04936583343660459, |
| "learning_rate": 2.4214731023046795e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5590 |
| }, |
| { |
| "completion_length": 60.0, |
| "epoch": 1.12, |
| "grad_norm": 0.0009002685546875, |
| "kl": 16.106818246748297, |
| "learning_rate": 2.4127512582437486e-06, |
| "loss": 0.0016, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5600 |
| }, |
| { |
| "completion_length": 60.425, |
| "epoch": 1.1219999999999999, |
| "grad_norm": 0.0023193359375, |
| "kl": 0.05024177338927984, |
| "learning_rate": 2.4040304772812002e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5610 |
| }, |
| { |
| "completion_length": 54.6, |
| "epoch": 1.124, |
| "grad_norm": 0.0004558563232421875, |
| "kl": 0.3465561534278095, |
| "learning_rate": 2.3953108656770018e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5620 |
| }, |
| { |
| "completion_length": 61.025, |
| "epoch": 1.126, |
| "grad_norm": 0.00311279296875, |
| "kl": 0.019688890036195516, |
| "learning_rate": 2.3865925296768658e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5630 |
| }, |
| { |
| "completion_length": 50.0, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 0.00066375732421875, |
| "kl": 0.38241584403440354, |
| "learning_rate": 2.377875575510967e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5640 |
| }, |
| { |
| "completion_length": 58.7, |
| "epoch": 1.13, |
| "grad_norm": 0.00112152099609375, |
| "kl": 0.11847766758874059, |
| "learning_rate": 2.3691601093926406e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5650 |
| }, |
| { |
| "completion_length": 61.125, |
| "epoch": 1.1320000000000001, |
| "grad_norm": 0.0023956298828125, |
| "kl": 1.8248440870083869, |
| "learning_rate": 2.3604462375170905e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 5660 |
| }, |
| { |
| "completion_length": 57.175, |
| "epoch": 1.134, |
| "grad_norm": 22.75, |
| "kl": 16.894031352642923, |
| "learning_rate": 2.3517340660600965e-06, |
| "loss": 0.0017, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5670 |
| }, |
| { |
| "completion_length": 66.275, |
| "epoch": 1.1360000000000001, |
| "grad_norm": 0.00035858154296875, |
| "kl": 0.01967704053968191, |
| "learning_rate": 2.3430237011767166e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5680 |
| }, |
| { |
| "completion_length": 51.75, |
| "epoch": 1.138, |
| "grad_norm": 69.0, |
| "kl": 15.417314376076684, |
| "learning_rate": 2.3343152490000004e-06, |
| "loss": 0.0015, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5690 |
| }, |
| { |
| "completion_length": 43.4, |
| "epoch": 1.1400000000000001, |
| "grad_norm": 0.000396728515625, |
| "kl": 9.579606763273478, |
| "learning_rate": 2.325608815639687e-06, |
| "loss": 0.001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5700 |
| }, |
| { |
| "completion_length": 63.475, |
| "epoch": 1.142, |
| "grad_norm": 0.000457763671875, |
| "kl": 0.09258651239797473, |
| "learning_rate": 2.3169045071809217e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 5710 |
| }, |
| { |
| "completion_length": 67.575, |
| "epoch": 1.144, |
| "grad_norm": 0.0004215240478515625, |
| "kl": 36.50597060709261, |
| "learning_rate": 2.3082024296829538e-06, |
| "loss": 0.0037, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5720 |
| }, |
| { |
| "completion_length": 57.8, |
| "epoch": 1.146, |
| "grad_norm": 0.0002994537353515625, |
| "kl": 0.08120424915105104, |
| "learning_rate": 2.2995026891778533e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5730 |
| }, |
| { |
| "completion_length": 55.275, |
| "epoch": 1.148, |
| "grad_norm": 0.0027618408203125, |
| "kl": 0.9556269285269081, |
| "learning_rate": 2.290805391669212e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5740 |
| }, |
| { |
| "completion_length": 67.5, |
| "epoch": 1.15, |
| "grad_norm": 0.03662109375, |
| "kl": 4.989278326183557, |
| "learning_rate": 2.2821106431308546e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5750 |
| }, |
| { |
| "completion_length": 57.625, |
| "epoch": 1.152, |
| "grad_norm": 0.0004482269287109375, |
| "kl": 0.05195563132874668, |
| "learning_rate": 2.2734185495055503e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5760 |
| }, |
| { |
| "completion_length": 66.875, |
| "epoch": 1.154, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.04006156194955111, |
| "learning_rate": 2.2647292167037143e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5770 |
| }, |
| { |
| "completion_length": 67.85, |
| "epoch": 1.156, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.017254956741817297, |
| "learning_rate": 2.256042750602127e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5780 |
| }, |
| { |
| "completion_length": 57.9, |
| "epoch": 1.158, |
| "grad_norm": 0.016357421875, |
| "kl": 22.463907711207867, |
| "learning_rate": 2.2473592570426343e-06, |
| "loss": 0.0022, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5790 |
| }, |
| { |
| "completion_length": 53.4, |
| "epoch": 1.16, |
| "grad_norm": 14.9375, |
| "kl": 0.47836247340310367, |
| "learning_rate": 2.238678841830867e-06, |
| "loss": 0.0, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1154700517654419, |
| "rewards/reward_func": -0.1, |
| "step": 5800 |
| }, |
| { |
| "completion_length": 52.575, |
| "epoch": 1.162, |
| "grad_norm": 0.0005340576171875, |
| "kl": 1.7170299529330806, |
| "learning_rate": 2.230001610734943e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5810 |
| }, |
| { |
| "completion_length": 52.175, |
| "epoch": 1.164, |
| "grad_norm": 0.000820159912109375, |
| "kl": 0.023483294621109964, |
| "learning_rate": 2.2213276694841866e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5820 |
| }, |
| { |
| "completion_length": 66.725, |
| "epoch": 1.166, |
| "grad_norm": 0.000797271728515625, |
| "kl": 0.017644689697772265, |
| "learning_rate": 2.212657123767834e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5830 |
| }, |
| { |
| "completion_length": 46.525, |
| "epoch": 1.168, |
| "grad_norm": 0.000629425048828125, |
| "kl": 0.022305818554013968, |
| "learning_rate": 2.2039900792337477e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5840 |
| }, |
| { |
| "completion_length": 79.55, |
| "epoch": 1.17, |
| "grad_norm": 0.000514984130859375, |
| "kl": 0.045465368404984476, |
| "learning_rate": 2.195326641487132e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5850 |
| }, |
| { |
| "completion_length": 70.875, |
| "epoch": 1.172, |
| "grad_norm": 0.000728607177734375, |
| "kl": 0.026964151486754417, |
| "learning_rate": 2.186666916089239e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5860 |
| }, |
| { |
| "completion_length": 51.9, |
| "epoch": 1.174, |
| "grad_norm": 0.0006103515625, |
| "kl": 0.06193929803557694, |
| "learning_rate": 2.1780110085560935e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5870 |
| }, |
| { |
| "completion_length": 58.975, |
| "epoch": 1.176, |
| "grad_norm": 0.0003662109375, |
| "kl": 96.22263815930928, |
| "learning_rate": 2.1693590243571937e-06, |
| "loss": 0.0096, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5880 |
| }, |
| { |
| "completion_length": 50.05, |
| "epoch": 1.178, |
| "grad_norm": 0.0012054443359375, |
| "kl": 0.10491749201901257, |
| "learning_rate": 2.1607110689142393e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5890 |
| }, |
| { |
| "completion_length": 49.3, |
| "epoch": 1.18, |
| "grad_norm": 0.00064849853515625, |
| "kl": 0.09745957013219594, |
| "learning_rate": 2.1520672475998374e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5900 |
| }, |
| { |
| "completion_length": 63.625, |
| "epoch": 1.182, |
| "grad_norm": 0.00054931640625, |
| "kl": 0.0679678438231349, |
| "learning_rate": 2.143427665736221e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5910 |
| }, |
| { |
| "completion_length": 63.7, |
| "epoch": 1.184, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.03987161219120026, |
| "learning_rate": 2.134792428593971e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5920 |
| }, |
| { |
| "completion_length": 53.3, |
| "epoch": 1.186, |
| "grad_norm": 0.000591278076171875, |
| "kl": 3.143317204480991, |
| "learning_rate": 2.1261616413907267e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 5930 |
| }, |
| { |
| "completion_length": 44.425, |
| "epoch": 1.188, |
| "grad_norm": 0.01080322265625, |
| "kl": 0.0820039251120761, |
| "learning_rate": 2.117535409289905e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5940 |
| }, |
| { |
| "completion_length": 46.825, |
| "epoch": 1.19, |
| "grad_norm": 0.000396728515625, |
| "kl": 0.10649018711410463, |
| "learning_rate": 2.1089138373994226e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5950 |
| }, |
| { |
| "completion_length": 47.475, |
| "epoch": 1.192, |
| "grad_norm": 0.000499725341796875, |
| "kl": 0.01360652674920857, |
| "learning_rate": 2.1002970307704134e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5960 |
| }, |
| { |
| "completion_length": 63.525, |
| "epoch": 1.194, |
| "grad_norm": 0.00060272216796875, |
| "kl": 0.015495409537106753, |
| "learning_rate": 2.0916850943959453e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5970 |
| }, |
| { |
| "completion_length": 47.5, |
| "epoch": 1.196, |
| "grad_norm": 0.0026702880859375, |
| "kl": 0.058427824173122644, |
| "learning_rate": 2.0830781332097446e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5980 |
| }, |
| { |
| "completion_length": 59.525, |
| "epoch": 1.198, |
| "grad_norm": 0.00106048583984375, |
| "kl": 0.07809406300075353, |
| "learning_rate": 2.0744762520849193e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 5990 |
| }, |
| { |
| "completion_length": 68.25, |
| "epoch": 1.2, |
| "grad_norm": 0.000324249267578125, |
| "kl": 5.223696762509644, |
| "learning_rate": 2.0658795558326745e-06, |
| "loss": 0.0005, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6000 |
| }, |
| { |
| "completion_length": 52.325, |
| "epoch": 1.202, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.08351310016587377, |
| "learning_rate": 2.0572881492010423e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6010 |
| }, |
| { |
| "completion_length": 65.15, |
| "epoch": 1.204, |
| "grad_norm": 0.000499725341796875, |
| "kl": 668.4266535042319, |
| "learning_rate": 2.0487021368736002e-06, |
| "loss": 0.0668, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6020 |
| }, |
| { |
| "completion_length": 56.75, |
| "epoch": 1.206, |
| "grad_norm": 0.0004119873046875, |
| "kl": 10.605211506178602, |
| "learning_rate": 2.0401216234682e-06, |
| "loss": 0.0011, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6030 |
| }, |
| { |
| "completion_length": 69.575, |
| "epoch": 1.208, |
| "grad_norm": 0.00077056884765625, |
| "kl": 0.01607757806777954, |
| "learning_rate": 2.031546713535688e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6040 |
| }, |
| { |
| "completion_length": 66.85, |
| "epoch": 1.21, |
| "grad_norm": 0.00070953369140625, |
| "kl": 0.05791890555992722, |
| "learning_rate": 2.022977511558638e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6050 |
| }, |
| { |
| "completion_length": 55.3, |
| "epoch": 1.212, |
| "grad_norm": 0.000774383544921875, |
| "kl": 301.9731966109015, |
| "learning_rate": 2.0144141219500707e-06, |
| "loss": 0.0302, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6060 |
| }, |
| { |
| "completion_length": 53.25, |
| "epoch": 1.214, |
| "grad_norm": 0.0079345703125, |
| "kl": 0.03139863689430058, |
| "learning_rate": 2.0058566490521848e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6070 |
| }, |
| { |
| "completion_length": 61.25, |
| "epoch": 1.216, |
| "grad_norm": 0.0005340576171875, |
| "kl": 0.04796885896939784, |
| "learning_rate": 1.997305197135089e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6080 |
| }, |
| { |
| "completion_length": 47.325, |
| "epoch": 1.218, |
| "grad_norm": 0.002166748046875, |
| "kl": 0.2320690915454179, |
| "learning_rate": 1.9887598703955244e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6090 |
| }, |
| { |
| "completion_length": 69.575, |
| "epoch": 1.22, |
| "grad_norm": 0.000339508056640625, |
| "kl": 0.02641369737684727, |
| "learning_rate": 1.9802207729556023e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6100 |
| }, |
| { |
| "completion_length": 55.0, |
| "epoch": 1.222, |
| "grad_norm": 0.0003509521484375, |
| "kl": 0.07225975301116705, |
| "learning_rate": 1.971688008861529e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6110 |
| }, |
| { |
| "completion_length": 60.75, |
| "epoch": 1.224, |
| "grad_norm": 0.00022411346435546875, |
| "kl": 0.015332509903237224, |
| "learning_rate": 1.963161682082342e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6120 |
| }, |
| { |
| "completion_length": 48.175, |
| "epoch": 1.226, |
| "grad_norm": 0.002105712890625, |
| "kl": 0.05186178609728813, |
| "learning_rate": 1.9546418965086444e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6130 |
| }, |
| { |
| "completion_length": 55.825, |
| "epoch": 1.228, |
| "grad_norm": 0.00109100341796875, |
| "kl": 0.023852512496523558, |
| "learning_rate": 1.946128755951332e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6140 |
| }, |
| { |
| "completion_length": 60.925, |
| "epoch": 1.23, |
| "grad_norm": 0.0003604888916015625, |
| "kl": 0.1579098215326667, |
| "learning_rate": 1.937622364140338e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6150 |
| }, |
| { |
| "completion_length": 54.3, |
| "epoch": 1.232, |
| "grad_norm": 0.0018463134765625, |
| "kl": 0.07494590748101473, |
| "learning_rate": 1.9291228247233607e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6160 |
| }, |
| { |
| "completion_length": 49.825, |
| "epoch": 1.234, |
| "grad_norm": 0.0006866455078125, |
| "kl": 0.019502221944276244, |
| "learning_rate": 1.9206302412646074e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6170 |
| }, |
| { |
| "completion_length": 55.625, |
| "epoch": 1.236, |
| "grad_norm": 0.0003032684326171875, |
| "kl": 0.017997803702019154, |
| "learning_rate": 1.912144717243525e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6180 |
| }, |
| { |
| "completion_length": 70.4, |
| "epoch": 1.238, |
| "grad_norm": 0.0008697509765625, |
| "kl": 0.13488098671659826, |
| "learning_rate": 1.9036663560535484e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6190 |
| }, |
| { |
| "completion_length": 42.95, |
| "epoch": 1.24, |
| "grad_norm": 0.0008392333984375, |
| "kl": 0.05781206511892378, |
| "learning_rate": 1.895195261000831e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6200 |
| }, |
| { |
| "completion_length": 55.925, |
| "epoch": 1.242, |
| "grad_norm": 0.000698089599609375, |
| "kl": 0.018764377292245626, |
| "learning_rate": 1.8867315353029937e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6210 |
| }, |
| { |
| "completion_length": 60.3, |
| "epoch": 1.244, |
| "grad_norm": 0.000396728515625, |
| "kl": 0.061903743352741, |
| "learning_rate": 1.8782752820878636e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6220 |
| }, |
| { |
| "completion_length": 60.525, |
| "epoch": 1.246, |
| "grad_norm": 0.000698089599609375, |
| "kl": 0.06673049959354102, |
| "learning_rate": 1.8698266043922159e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6230 |
| }, |
| { |
| "completion_length": 53.775, |
| "epoch": 1.248, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 0.05146434986963868, |
| "learning_rate": 1.8613856051605242e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6240 |
| }, |
| { |
| "completion_length": 57.675, |
| "epoch": 1.25, |
| "grad_norm": 0.000431060791015625, |
| "kl": 0.05393651574850082, |
| "learning_rate": 1.852952387243698e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6250 |
| }, |
| { |
| "completion_length": 44.85, |
| "epoch": 1.252, |
| "grad_norm": 0.000461578369140625, |
| "kl": 0.13971609035506843, |
| "learning_rate": 1.8445270533978387e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6260 |
| }, |
| { |
| "completion_length": 50.125, |
| "epoch": 1.254, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.050323914270848036, |
| "learning_rate": 1.836109706282978e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6270 |
| }, |
| { |
| "completion_length": 50.5, |
| "epoch": 1.256, |
| "grad_norm": 0.00054931640625, |
| "kl": 0.13358841557055712, |
| "learning_rate": 1.827700448461836e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6280 |
| }, |
| { |
| "completion_length": 58.675, |
| "epoch": 1.258, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.4532184978015721, |
| "learning_rate": 1.8192993823985643e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6290 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 1.26, |
| "grad_norm": 0.000621795654296875, |
| "kl": 2.259404849074781, |
| "learning_rate": 1.8109066104575023e-06, |
| "loss": 0.0002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6300 |
| }, |
| { |
| "completion_length": 47.8, |
| "epoch": 1.262, |
| "grad_norm": 0.00080108642578125, |
| "kl": 0.110232665669173, |
| "learning_rate": 1.8025222349019273e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6310 |
| }, |
| { |
| "completion_length": 46.3, |
| "epoch": 1.264, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.0984095955034718, |
| "learning_rate": 1.7941463578928088e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6320 |
| }, |
| { |
| "completion_length": 57.525, |
| "epoch": 1.266, |
| "grad_norm": 17.375, |
| "kl": 53.167960462137124, |
| "learning_rate": 1.7857790814875665e-06, |
| "loss": 0.0053, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 6330 |
| }, |
| { |
| "completion_length": 56.15, |
| "epoch": 1.268, |
| "grad_norm": 0.002105712890625, |
| "kl": 0.09631253816187382, |
| "learning_rate": 1.7774205076388207e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6340 |
| }, |
| { |
| "completion_length": 50.2, |
| "epoch": 1.27, |
| "grad_norm": 0.014404296875, |
| "kl": 0.185893784603104, |
| "learning_rate": 1.7690707381931585e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6350 |
| }, |
| { |
| "completion_length": 57.775, |
| "epoch": 1.272, |
| "grad_norm": 0.0006256103515625, |
| "kl": 0.032014391385018826, |
| "learning_rate": 1.7607298748898844e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6360 |
| }, |
| { |
| "completion_length": 72.225, |
| "epoch": 1.274, |
| "grad_norm": 0.0087890625, |
| "kl": 0.053022891748696566, |
| "learning_rate": 1.7523980193597837e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6370 |
| }, |
| { |
| "completion_length": 61.275, |
| "epoch": 1.276, |
| "grad_norm": 0.0004520416259765625, |
| "kl": 0.017128141969442366, |
| "learning_rate": 1.744075273123889e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6380 |
| }, |
| { |
| "completion_length": 66.125, |
| "epoch": 1.278, |
| "grad_norm": 0.00061798095703125, |
| "kl": 0.07104477211833, |
| "learning_rate": 1.735761737592236e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6390 |
| }, |
| { |
| "completion_length": 61.725, |
| "epoch": 1.28, |
| "grad_norm": 0.00168609619140625, |
| "kl": 0.022364417230710386, |
| "learning_rate": 1.7274575140626318e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6400 |
| }, |
| { |
| "completion_length": 59.975, |
| "epoch": 1.282, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.04414304066449404, |
| "learning_rate": 1.7191627037194187e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6410 |
| }, |
| { |
| "completion_length": 58.35, |
| "epoch": 1.284, |
| "grad_norm": 0.0004100799560546875, |
| "kl": 71.30782471811399, |
| "learning_rate": 1.7108774076322443e-06, |
| "loss": 0.0071, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6420 |
| }, |
| { |
| "completion_length": 54.925, |
| "epoch": 1.286, |
| "grad_norm": 25.625, |
| "kl": 3.6482051144819705, |
| "learning_rate": 1.702601726754825e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6430 |
| }, |
| { |
| "completion_length": 51.125, |
| "epoch": 1.288, |
| "grad_norm": 0.00084686279296875, |
| "kl": 0.06689287801855244, |
| "learning_rate": 1.6943357619237227e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6440 |
| }, |
| { |
| "completion_length": 60.625, |
| "epoch": 1.29, |
| "grad_norm": 0.00052642822265625, |
| "kl": 0.03276003615465015, |
| "learning_rate": 1.686079613857109e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6450 |
| }, |
| { |
| "completion_length": 49.8, |
| "epoch": 1.292, |
| "grad_norm": 0.004669189453125, |
| "kl": 0.04734173566102982, |
| "learning_rate": 1.677833383153542e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6460 |
| }, |
| { |
| "completion_length": 50.925, |
| "epoch": 1.294, |
| "grad_norm": 0.00040435791015625, |
| "kl": 0.08111863350495696, |
| "learning_rate": 1.6695971702907425e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6470 |
| }, |
| { |
| "completion_length": 51.975, |
| "epoch": 1.296, |
| "grad_norm": 0.000335693359375, |
| "kl": 0.034373713890090585, |
| "learning_rate": 1.661371075624363e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6480 |
| }, |
| { |
| "completion_length": 56.55, |
| "epoch": 1.298, |
| "grad_norm": 0.0008544921875, |
| "kl": 0.05773493410088122, |
| "learning_rate": 1.6531551993867717e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6490 |
| }, |
| { |
| "completion_length": 50.95, |
| "epoch": 1.3, |
| "grad_norm": 282.0, |
| "kl": 178.04388241134583, |
| "learning_rate": 1.6449496416858285e-06, |
| "loss": 0.0178, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6500 |
| }, |
| { |
| "completion_length": 58.625, |
| "epoch": 1.302, |
| "grad_norm": 0.0023651123046875, |
| "kl": 5.505481028556824, |
| "learning_rate": 1.6367545025036634e-06, |
| "loss": 0.0006, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6510 |
| }, |
| { |
| "completion_length": 67.575, |
| "epoch": 1.304, |
| "grad_norm": 0.001068115234375, |
| "kl": 0.041122534591704604, |
| "learning_rate": 1.6285698816954626e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6520 |
| }, |
| { |
| "completion_length": 49.75, |
| "epoch": 1.306, |
| "grad_norm": 0.000415802001953125, |
| "kl": 0.165414993558079, |
| "learning_rate": 1.6203958789882457e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6530 |
| }, |
| { |
| "completion_length": 56.675, |
| "epoch": 1.308, |
| "grad_norm": 0.0007476806640625, |
| "kl": 0.2703967327717692, |
| "learning_rate": 1.612232593979658e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6540 |
| }, |
| { |
| "completion_length": 56.2, |
| "epoch": 1.31, |
| "grad_norm": 0.00133514404296875, |
| "kl": 0.02644283170811832, |
| "learning_rate": 1.6040801261367494e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6550 |
| }, |
| { |
| "completion_length": 58.05, |
| "epoch": 1.312, |
| "grad_norm": 0.00030517578125, |
| "kl": 0.015307861985638738, |
| "learning_rate": 1.5959385747947697e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6560 |
| }, |
| { |
| "completion_length": 73.35, |
| "epoch": 1.314, |
| "grad_norm": 0.0004405975341796875, |
| "kl": 0.013954693730920554, |
| "learning_rate": 1.5878080391559507e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6570 |
| }, |
| { |
| "completion_length": 51.05, |
| "epoch": 1.316, |
| "grad_norm": 0.0003566741943359375, |
| "kl": 684.7183584340382, |
| "learning_rate": 1.5796886182883053e-06, |
| "loss": 0.0685, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6580 |
| }, |
| { |
| "completion_length": 67.15, |
| "epoch": 1.318, |
| "grad_norm": 0.000598907470703125, |
| "kl": 0.024668072490021585, |
| "learning_rate": 1.5715804111244138e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6590 |
| }, |
| { |
| "completion_length": 66.8, |
| "epoch": 1.32, |
| "grad_norm": 0.00738525390625, |
| "kl": 0.047915787994861604, |
| "learning_rate": 1.56348351646022e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6600 |
| }, |
| { |
| "completion_length": 61.85, |
| "epoch": 1.322, |
| "grad_norm": 0.000537872314453125, |
| "kl": 0.05744472313672304, |
| "learning_rate": 1.5553980329538326e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6610 |
| }, |
| { |
| "completion_length": 57.5, |
| "epoch": 1.324, |
| "grad_norm": 0.0006561279296875, |
| "kl": 0.05604997184127569, |
| "learning_rate": 1.547324059124315e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6620 |
| }, |
| { |
| "completion_length": 59.425, |
| "epoch": 1.326, |
| "grad_norm": 0.00112152099609375, |
| "kl": 0.0188056749291718, |
| "learning_rate": 1.539261693350491e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6630 |
| }, |
| { |
| "completion_length": 40.9, |
| "epoch": 1.328, |
| "grad_norm": 0.00072479248046875, |
| "kl": 0.1357155740261078, |
| "learning_rate": 1.5312110338697427e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6640 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 1.33, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.018099735863506793, |
| "learning_rate": 1.5231721787768162e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6650 |
| }, |
| { |
| "completion_length": 54.125, |
| "epoch": 1.332, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.014814224326983094, |
| "learning_rate": 1.5151452260226224e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6660 |
| }, |
| { |
| "completion_length": 64.325, |
| "epoch": 1.334, |
| "grad_norm": 0.00016117095947265625, |
| "kl": 0.016494302544742823, |
| "learning_rate": 1.5071302734130488e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6670 |
| }, |
| { |
| "completion_length": 59.2, |
| "epoch": 1.336, |
| "grad_norm": 0.00194549560546875, |
| "kl": 0.06070426572114229, |
| "learning_rate": 1.4991274186077632e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6680 |
| }, |
| { |
| "completion_length": 59.15, |
| "epoch": 1.338, |
| "grad_norm": 0.0006256103515625, |
| "kl": 0.11375871314667166, |
| "learning_rate": 1.491136759119025e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6690 |
| }, |
| { |
| "completion_length": 52.65, |
| "epoch": 1.34, |
| "grad_norm": 0.000713348388671875, |
| "kl": 0.03938477258197963, |
| "learning_rate": 1.4831583923105e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6700 |
| }, |
| { |
| "completion_length": 63.525, |
| "epoch": 1.342, |
| "grad_norm": 0.00083160400390625, |
| "kl": 0.020069646975025536, |
| "learning_rate": 1.4751924153960681e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6710 |
| }, |
| { |
| "completion_length": 53.7, |
| "epoch": 1.3439999999999999, |
| "grad_norm": 0.06787109375, |
| "kl": 0.14404951045289635, |
| "learning_rate": 1.467238925438646e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6720 |
| }, |
| { |
| "completion_length": 55.475, |
| "epoch": 1.346, |
| "grad_norm": 0.0015411376953125, |
| "kl": 180.89379140562377, |
| "learning_rate": 1.4592980193489975e-06, |
| "loss": 0.0181, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6730 |
| }, |
| { |
| "completion_length": 53.55, |
| "epoch": 1.3479999999999999, |
| "grad_norm": 0.00067138671875, |
| "kl": 0.05597533159889281, |
| "learning_rate": 1.4513697938845571e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6740 |
| }, |
| { |
| "completion_length": 53.1, |
| "epoch": 1.35, |
| "grad_norm": 0.002777099609375, |
| "kl": 0.3969091270118952, |
| "learning_rate": 1.443454345648252e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 6750 |
| }, |
| { |
| "completion_length": 43.8, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 0.0003185272216796875, |
| "kl": 0.10832225987687708, |
| "learning_rate": 1.4355517710873184e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6760 |
| }, |
| { |
| "completion_length": 68.725, |
| "epoch": 1.354, |
| "grad_norm": 0.0004730224609375, |
| "kl": 0.03166971495375037, |
| "learning_rate": 1.4276621664921358e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6770 |
| }, |
| { |
| "completion_length": 63.625, |
| "epoch": 1.3559999999999999, |
| "grad_norm": 0.00077056884765625, |
| "kl": 0.018040235806256532, |
| "learning_rate": 1.419785627995044e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6780 |
| }, |
| { |
| "completion_length": 56.7, |
| "epoch": 1.358, |
| "grad_norm": 0.0103759765625, |
| "kl": 0.08500627786852419, |
| "learning_rate": 1.4119222515691817e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6790 |
| }, |
| { |
| "completion_length": 60.375, |
| "epoch": 1.3599999999999999, |
| "grad_norm": 0.000652313232421875, |
| "kl": 0.13012904403731226, |
| "learning_rate": 1.4040721330273063e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6800 |
| }, |
| { |
| "completion_length": 51.225, |
| "epoch": 1.362, |
| "grad_norm": 0.00110626220703125, |
| "kl": 0.019943116419017314, |
| "learning_rate": 1.3962353680206372e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6810 |
| }, |
| { |
| "completion_length": 62.1, |
| "epoch": 1.3639999999999999, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.08246160177513957, |
| "learning_rate": 1.388412052037682e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 6820 |
| }, |
| { |
| "completion_length": 49.75, |
| "epoch": 1.366, |
| "grad_norm": 0.0008697509765625, |
| "kl": 95.11083188317716, |
| "learning_rate": 1.380602280403076e-06, |
| "loss": 0.0095, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 6830 |
| }, |
| { |
| "completion_length": 63.875, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.027737328410148622, |
| "learning_rate": 1.3728061482764238e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6840 |
| }, |
| { |
| "completion_length": 56.325, |
| "epoch": 1.37, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.06817373894155025, |
| "learning_rate": 1.3650237506511333e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6850 |
| }, |
| { |
| "completion_length": 56.875, |
| "epoch": 1.3719999999999999, |
| "grad_norm": 0.000423431396484375, |
| "kl": 0.014378735097125173, |
| "learning_rate": 1.3572551823532654e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6860 |
| }, |
| { |
| "completion_length": 58.425, |
| "epoch": 1.374, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.43131620325148107, |
| "learning_rate": 1.349500538040371e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 6870 |
| }, |
| { |
| "completion_length": 55.575, |
| "epoch": 1.376, |
| "grad_norm": 0.00066375732421875, |
| "kl": 0.033100543078035116, |
| "learning_rate": 1.3417599122003464e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6880 |
| }, |
| { |
| "completion_length": 54.85, |
| "epoch": 1.3780000000000001, |
| "grad_norm": 0.002105712890625, |
| "kl": 0.04654085249640048, |
| "learning_rate": 1.3340333991502723e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6890 |
| }, |
| { |
| "completion_length": 68.475, |
| "epoch": 1.38, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.05651907054707408, |
| "learning_rate": 1.3263210930352737e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6900 |
| }, |
| { |
| "completion_length": 61.55, |
| "epoch": 1.3820000000000001, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.03381169466301799, |
| "learning_rate": 1.3186230878273654e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6910 |
| }, |
| { |
| "completion_length": 57.0, |
| "epoch": 1.384, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.6785514406859875, |
| "learning_rate": 1.3109394773243117e-06, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6920 |
| }, |
| { |
| "completion_length": 61.8, |
| "epoch": 1.3860000000000001, |
| "grad_norm": 0.020751953125, |
| "kl": 0.08321558614261448, |
| "learning_rate": 1.3032703551484832e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6930 |
| }, |
| { |
| "completion_length": 60.55, |
| "epoch": 1.388, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.09076761337928474, |
| "learning_rate": 1.2956158147457116e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6940 |
| }, |
| { |
| "completion_length": 49.25, |
| "epoch": 1.3900000000000001, |
| "grad_norm": 0.000667572021484375, |
| "kl": 0.04632122702896595, |
| "learning_rate": 1.2879759493841577e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6950 |
| }, |
| { |
| "completion_length": 66.75, |
| "epoch": 1.392, |
| "grad_norm": 0.000629425048828125, |
| "kl": 3.6676836960949, |
| "learning_rate": 1.280350852153168e-06, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 6960 |
| }, |
| { |
| "completion_length": 58.6, |
| "epoch": 1.3940000000000001, |
| "grad_norm": 0.0003719329833984375, |
| "kl": 0.07259991895407439, |
| "learning_rate": 1.272740615962148e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6970 |
| }, |
| { |
| "completion_length": 62.025, |
| "epoch": 1.396, |
| "grad_norm": 0.00069427490234375, |
| "kl": 0.01472460343502462, |
| "learning_rate": 1.2651453335394232e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6980 |
| }, |
| { |
| "completion_length": 64.175, |
| "epoch": 1.3980000000000001, |
| "grad_norm": 0.000606536865234375, |
| "kl": 0.18156763026490808, |
| "learning_rate": 1.2575650974311118e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 6990 |
| }, |
| { |
| "completion_length": 50.15, |
| "epoch": 1.4, |
| "grad_norm": 14.125, |
| "kl": 10.663465712498873, |
| "learning_rate": 1.2500000000000007e-06, |
| "loss": 0.0011, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.075, |
| "step": 7000 |
| }, |
| { |
| "completion_length": 62.375, |
| "epoch": 1.4020000000000001, |
| "grad_norm": 0.000507354736328125, |
| "kl": 0.14125907123088838, |
| "learning_rate": 1.2424501334244124e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7010 |
| }, |
| { |
| "completion_length": 52.6, |
| "epoch": 1.404, |
| "grad_norm": 0.0018310546875, |
| "kl": 0.13884197538718582, |
| "learning_rate": 1.234915589697091e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7020 |
| }, |
| { |
| "completion_length": 55.575, |
| "epoch": 1.4060000000000001, |
| "grad_norm": 0.0006103515625, |
| "kl": 31778.019179227947, |
| "learning_rate": 1.2273964606240718e-06, |
| "loss": 3.1778, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7030 |
| }, |
| { |
| "completion_length": 48.4, |
| "epoch": 1.408, |
| "grad_norm": 0.0009613037109375, |
| "kl": 0.17425558338873087, |
| "learning_rate": 1.2198928378235717e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7040 |
| }, |
| { |
| "completion_length": 47.075, |
| "epoch": 1.41, |
| "grad_norm": 0.000476837158203125, |
| "kl": 0.045156693411991, |
| "learning_rate": 1.2124048127248644e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7050 |
| }, |
| { |
| "completion_length": 62.375, |
| "epoch": 1.412, |
| "grad_norm": 0.000751495361328125, |
| "kl": 0.031479455251246694, |
| "learning_rate": 1.204932476567175e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7060 |
| }, |
| { |
| "completion_length": 55.525, |
| "epoch": 1.414, |
| "grad_norm": 0.00058746337890625, |
| "kl": 0.13692689267918468, |
| "learning_rate": 1.19747592039856e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7070 |
| }, |
| { |
| "completion_length": 47.2, |
| "epoch": 1.416, |
| "grad_norm": 0.000492095947265625, |
| "kl": 0.028804597025737167, |
| "learning_rate": 1.1900352350748026e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7080 |
| }, |
| { |
| "completion_length": 59.075, |
| "epoch": 1.418, |
| "grad_norm": 0.0003509521484375, |
| "kl": 0.04570387415587902, |
| "learning_rate": 1.1826105112583061e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7090 |
| }, |
| { |
| "completion_length": 56.05, |
| "epoch": 1.42, |
| "grad_norm": 0.0004558563232421875, |
| "kl": 0.018374279094859957, |
| "learning_rate": 1.1752018394169882e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7100 |
| }, |
| { |
| "completion_length": 61.95, |
| "epoch": 1.422, |
| "grad_norm": 0.000583648681640625, |
| "kl": 0.03715153355151415, |
| "learning_rate": 1.1678093098231748e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7110 |
| }, |
| { |
| "completion_length": 54.9, |
| "epoch": 1.424, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.016605707909911872, |
| "learning_rate": 1.160433012552508e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7120 |
| }, |
| { |
| "completion_length": 57.55, |
| "epoch": 1.426, |
| "grad_norm": 0.0004215240478515625, |
| "kl": 0.018531074468046426, |
| "learning_rate": 1.1530730374828422e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7130 |
| }, |
| { |
| "completion_length": 58.0, |
| "epoch": 1.428, |
| "grad_norm": 0.000522613525390625, |
| "kl": 0.06935790865682065, |
| "learning_rate": 1.1457294742931508e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7140 |
| }, |
| { |
| "completion_length": 52.5, |
| "epoch": 1.43, |
| "grad_norm": 0.0010833740234375, |
| "kl": 0.09027541326358915, |
| "learning_rate": 1.1384024124624324e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7150 |
| }, |
| { |
| "completion_length": 68.775, |
| "epoch": 1.432, |
| "grad_norm": 0.00066375732421875, |
| "kl": 0.017101448262110353, |
| "learning_rate": 1.1310919412686248e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7160 |
| }, |
| { |
| "completion_length": 66.075, |
| "epoch": 1.434, |
| "grad_norm": 0.0003528594970703125, |
| "kl": 0.10510317548178136, |
| "learning_rate": 1.1237981497875112e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 7170 |
| }, |
| { |
| "completion_length": 58.75, |
| "epoch": 1.436, |
| "grad_norm": 0.0003719329833984375, |
| "kl": 0.07255538417957723, |
| "learning_rate": 1.11652112689164e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7180 |
| }, |
| { |
| "completion_length": 55.575, |
| "epoch": 1.438, |
| "grad_norm": 0.000713348388671875, |
| "kl": 0.04587976224720478, |
| "learning_rate": 1.109260961249238e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7190 |
| }, |
| { |
| "completion_length": 44.3, |
| "epoch": 1.44, |
| "grad_norm": 0.0007171630859375, |
| "kl": 0.026211364893242717, |
| "learning_rate": 1.1020177413231334e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7200 |
| }, |
| { |
| "completion_length": 55.4, |
| "epoch": 1.442, |
| "grad_norm": 0.00011587142944335938, |
| "kl": 1273.178384515643, |
| "learning_rate": 1.0947915553696742e-06, |
| "loss": 0.1273, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.15773502588272095, |
| "rewards/reward_func": -0.1, |
| "step": 7210 |
| }, |
| { |
| "completion_length": 70.35, |
| "epoch": 1.444, |
| "grad_norm": 0.0002689361572265625, |
| "kl": 0.30778478598222136, |
| "learning_rate": 1.0875824914376555e-06, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 7220 |
| }, |
| { |
| "completion_length": 51.35, |
| "epoch": 1.446, |
| "grad_norm": 0.000743865966796875, |
| "kl": 0.11805587047711015, |
| "learning_rate": 1.0803906373672477e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7230 |
| }, |
| { |
| "completion_length": 54.1, |
| "epoch": 1.448, |
| "grad_norm": 0.00083160400390625, |
| "kl": 0.13561045327223836, |
| "learning_rate": 1.073216080788921e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7240 |
| }, |
| { |
| "completion_length": 58.075, |
| "epoch": 1.45, |
| "grad_norm": 0.0007781982421875, |
| "kl": 0.01598156727850437, |
| "learning_rate": 1.0660589091223854e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7250 |
| }, |
| { |
| "completion_length": 55.825, |
| "epoch": 1.452, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.08759649377316236, |
| "learning_rate": 1.0589192095755172e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7260 |
| }, |
| { |
| "completion_length": 54.95, |
| "epoch": 1.454, |
| "grad_norm": 0.000942230224609375, |
| "kl": 2.808669605664909, |
| "learning_rate": 1.0517970691433035e-06, |
| "loss": 0.0003, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7270 |
| }, |
| { |
| "completion_length": 44.8, |
| "epoch": 1.456, |
| "grad_norm": 0.0010833740234375, |
| "kl": 177.8802186036017, |
| "learning_rate": 1.0446925746067768e-06, |
| "loss": 0.0178, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 7280 |
| }, |
| { |
| "completion_length": 55.0, |
| "epoch": 1.458, |
| "grad_norm": 0.001861572265625, |
| "kl": 0.25618111025542023, |
| "learning_rate": 1.0376058125319614e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7290 |
| }, |
| { |
| "completion_length": 65.625, |
| "epoch": 1.46, |
| "grad_norm": 0.01287841796875, |
| "kl": 0.1665965816937387, |
| "learning_rate": 1.0305368692688175e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7300 |
| }, |
| { |
| "completion_length": 56.25, |
| "epoch": 1.462, |
| "grad_norm": 0.00029754638671875, |
| "kl": 0.4289894063025713, |
| "learning_rate": 1.0234858309501864e-06, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7310 |
| }, |
| { |
| "completion_length": 48.675, |
| "epoch": 1.464, |
| "grad_norm": 2.328125, |
| "kl": 44.01882844008505, |
| "learning_rate": 1.0164527834907468e-06, |
| "loss": 0.0044, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7320 |
| }, |
| { |
| "completion_length": 50.35, |
| "epoch": 1.466, |
| "grad_norm": 0.0024261474609375, |
| "kl": 0.13080412773415445, |
| "learning_rate": 1.0094378125859602e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7330 |
| }, |
| { |
| "completion_length": 55.85, |
| "epoch": 1.468, |
| "grad_norm": 0.000461578369140625, |
| "kl": 0.0169123521191068, |
| "learning_rate": 1.0024410037110358e-06, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7340 |
| }, |
| { |
| "completion_length": 49.05, |
| "epoch": 1.47, |
| "grad_norm": 0.00058746337890625, |
| "kl": 0.13749618739821018, |
| "learning_rate": 9.95462442119879e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7350 |
| }, |
| { |
| "completion_length": 64.4, |
| "epoch": 1.472, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.025387801649048924, |
| "learning_rate": 9.88502212844063e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7360 |
| }, |
| { |
| "completion_length": 66.95, |
| "epoch": 1.474, |
| "grad_norm": 0.00026702880859375, |
| "kl": 0.03902003513649106, |
| "learning_rate": 9.815604006917839e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7370 |
| }, |
| { |
| "completion_length": 54.0, |
| "epoch": 1.476, |
| "grad_norm": 0.0159912109375, |
| "kl": 0.08616708847694099, |
| "learning_rate": 9.746370902468311e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7380 |
| }, |
| { |
| "completion_length": 61.025, |
| "epoch": 1.478, |
| "grad_norm": 0.00043487548828125, |
| "kl": 0.027717783488333224, |
| "learning_rate": 9.677323658675594e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7390 |
| }, |
| { |
| "completion_length": 47.25, |
| "epoch": 1.48, |
| "grad_norm": 0.00274658203125, |
| "kl": 0.09193211463280022, |
| "learning_rate": 9.608463116858544e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7400 |
| }, |
| { |
| "completion_length": 61.15, |
| "epoch": 1.482, |
| "grad_norm": 0.00194549560546875, |
| "kl": 0.09381414433009923, |
| "learning_rate": 9.53979011606115e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7410 |
| }, |
| { |
| "completion_length": 55.875, |
| "epoch": 1.484, |
| "grad_norm": 0.000873565673828125, |
| "kl": 0.06577477985993027, |
| "learning_rate": 9.471305493042243e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7420 |
| }, |
| { |
| "completion_length": 55.325, |
| "epoch": 1.486, |
| "grad_norm": 0.0006866455078125, |
| "kl": 0.05357563262805343, |
| "learning_rate": 9.403010082265351e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7430 |
| }, |
| { |
| "completion_length": 55.225, |
| "epoch": 1.488, |
| "grad_norm": 0.000606536865234375, |
| "kl": 0.043947093037422745, |
| "learning_rate": 9.334904715888496e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7440 |
| }, |
| { |
| "completion_length": 47.125, |
| "epoch": 1.49, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.020906020514667036, |
| "learning_rate": 9.266990223754069e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7450 |
| }, |
| { |
| "completion_length": 75.375, |
| "epoch": 1.492, |
| "grad_norm": 0.0007476806640625, |
| "kl": 0.0432497413828969, |
| "learning_rate": 9.199267433378728e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7460 |
| }, |
| { |
| "completion_length": 66.75, |
| "epoch": 1.494, |
| "grad_norm": 0.000682830810546875, |
| "kl": 0.01475386363454163, |
| "learning_rate": 9.131737169943314e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7470 |
| }, |
| { |
| "completion_length": 53.25, |
| "epoch": 1.496, |
| "grad_norm": 0.000530242919921875, |
| "kl": 0.034680284932255744, |
| "learning_rate": 9.064400256282757e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7480 |
| }, |
| { |
| "completion_length": 66.5, |
| "epoch": 1.498, |
| "grad_norm": 0.0012664794921875, |
| "kl": 0.020779677666723728, |
| "learning_rate": 8.99725751287611e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7490 |
| }, |
| { |
| "completion_length": 69.5, |
| "epoch": 1.5, |
| "grad_norm": 35.5, |
| "kl": 10.81919735018164, |
| "learning_rate": 8.930309757836517e-07, |
| "loss": 0.0011, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 7500 |
| }, |
| { |
| "completion_length": 63.825, |
| "epoch": 1.502, |
| "grad_norm": 0.000682830810546875, |
| "kl": 0.018158415833022447, |
| "learning_rate": 8.863557806901233e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7510 |
| }, |
| { |
| "completion_length": 46.225, |
| "epoch": 1.504, |
| "grad_norm": 0.0003185272216796875, |
| "kl": 0.04219387628836557, |
| "learning_rate": 8.797002473421729e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7520 |
| }, |
| { |
| "completion_length": 70.0, |
| "epoch": 1.506, |
| "grad_norm": 0.00041961669921875, |
| "kl": 0.03907957626506686, |
| "learning_rate": 8.73064456835373e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7530 |
| }, |
| { |
| "completion_length": 60.8, |
| "epoch": 1.508, |
| "grad_norm": 0.000659942626953125, |
| "kl": 152.24121750062332, |
| "learning_rate": 8.664484900247363e-07, |
| "loss": 0.0152, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.0, |
| "rewards/reward_func": -0.1, |
| "step": 7540 |
| }, |
| { |
| "completion_length": 46.375, |
| "epoch": 1.51, |
| "grad_norm": 0.001556396484375, |
| "kl": 0.2765787610784173, |
| "learning_rate": 8.598524275237321e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7550 |
| }, |
| { |
| "completion_length": 68.675, |
| "epoch": 1.512, |
| "grad_norm": 0.0027008056640625, |
| "kl": 0.028881799709051848, |
| "learning_rate": 8.532763497032987e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7560 |
| }, |
| { |
| "completion_length": 63.8, |
| "epoch": 1.514, |
| "grad_norm": 0.000469207763671875, |
| "kl": 0.04086120091378689, |
| "learning_rate": 8.467203366908708e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7570 |
| }, |
| { |
| "completion_length": 49.95, |
| "epoch": 1.516, |
| "grad_norm": 0.0037689208984375, |
| "kl": 0.12793728783726693, |
| "learning_rate": 8.40184468369396e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7580 |
| }, |
| { |
| "completion_length": 53.5, |
| "epoch": 1.518, |
| "grad_norm": 0.00121307373046875, |
| "kl": 42.36696035126224, |
| "learning_rate": 8.336688243763691e-07, |
| "loss": 0.0042, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 7590 |
| }, |
| { |
| "completion_length": 47.325, |
| "epoch": 1.52, |
| "grad_norm": 0.0006103515625, |
| "kl": 14.073435558238998, |
| "learning_rate": 8.271734841028553e-07, |
| "loss": 0.0014, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 7600 |
| }, |
| { |
| "completion_length": 65.0, |
| "epoch": 1.522, |
| "grad_norm": 0.000720977783203125, |
| "kl": 0.04525826433673501, |
| "learning_rate": 8.206985266925249e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7610 |
| }, |
| { |
| "completion_length": 59.525, |
| "epoch": 1.524, |
| "grad_norm": 0.000843048095703125, |
| "kl": 0.04048813302069902, |
| "learning_rate": 8.142440310406923e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7620 |
| }, |
| { |
| "completion_length": 46.45, |
| "epoch": 1.526, |
| "grad_norm": 0.0004329681396484375, |
| "kl": 8.161725069396198, |
| "learning_rate": 8.078100757933486e-07, |
| "loss": 0.0008, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7630 |
| }, |
| { |
| "completion_length": 57.225, |
| "epoch": 1.528, |
| "grad_norm": 0.000698089599609375, |
| "kl": 0.1071649724675808, |
| "learning_rate": 8.013967393462094e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7640 |
| }, |
| { |
| "completion_length": 52.975, |
| "epoch": 1.53, |
| "grad_norm": 0.000431060791015625, |
| "kl": 0.12129491865634918, |
| "learning_rate": 7.950040998437541e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7650 |
| }, |
| { |
| "completion_length": 43.225, |
| "epoch": 1.532, |
| "grad_norm": 0.0026702880859375, |
| "kl": 0.1067592917010188, |
| "learning_rate": 7.886322351782782e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7660 |
| }, |
| { |
| "completion_length": 59.9, |
| "epoch": 1.534, |
| "grad_norm": 0.00093841552734375, |
| "kl": 0.05229797107167542, |
| "learning_rate": 7.822812229889429e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7670 |
| }, |
| { |
| "completion_length": 65.525, |
| "epoch": 1.536, |
| "grad_norm": 0.0004405975341796875, |
| "kl": 0.010464739426970482, |
| "learning_rate": 7.759511406608255e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7680 |
| }, |
| { |
| "completion_length": 66.625, |
| "epoch": 1.538, |
| "grad_norm": 0.0004825592041015625, |
| "kl": 0.045739847654476765, |
| "learning_rate": 7.696420653239834e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7690 |
| }, |
| { |
| "completion_length": 73.575, |
| "epoch": 1.54, |
| "grad_norm": 0.00058746337890625, |
| "kl": 0.031135138869285584, |
| "learning_rate": 7.633540738525066e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7700 |
| }, |
| { |
| "completion_length": 48.175, |
| "epoch": 1.542, |
| "grad_norm": 51.25, |
| "kl": 80.07849281346425, |
| "learning_rate": 7.57087242863589e-07, |
| "loss": 0.008, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7710 |
| }, |
| { |
| "completion_length": 46.375, |
| "epoch": 1.544, |
| "grad_norm": 0.00138092041015625, |
| "kl": 0.0285742097068578, |
| "learning_rate": 7.508416487165862e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7720 |
| }, |
| { |
| "completion_length": 57.2, |
| "epoch": 1.546, |
| "grad_norm": 0.000492095947265625, |
| "kl": 0.07636187486350536, |
| "learning_rate": 7.44617367512094e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7730 |
| }, |
| { |
| "completion_length": 56.15, |
| "epoch": 1.548, |
| "grad_norm": 0.00045013427734375, |
| "kl": 0.03193683328572661, |
| "learning_rate": 7.384144750910133e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7740 |
| }, |
| { |
| "completion_length": 59.65, |
| "epoch": 1.55, |
| "grad_norm": 0.000370025634765625, |
| "kl": 0.09212675780290738, |
| "learning_rate": 7.322330470336314e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7750 |
| }, |
| { |
| "completion_length": 53.225, |
| "epoch": 1.552, |
| "grad_norm": 0.000499725341796875, |
| "kl": 0.0655658102594316, |
| "learning_rate": 7.260731586586983e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7760 |
| }, |
| { |
| "completion_length": 50.3, |
| "epoch": 1.554, |
| "grad_norm": 0.00186920166015625, |
| "kl": 0.07870109416544438, |
| "learning_rate": 7.199348850225091e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7770 |
| }, |
| { |
| "completion_length": 53.8, |
| "epoch": 1.556, |
| "grad_norm": 0.003204345703125, |
| "kl": 146.1276578912046, |
| "learning_rate": 7.138183009179922e-07, |
| "loss": 0.0146, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7780 |
| }, |
| { |
| "completion_length": 69.675, |
| "epoch": 1.558, |
| "grad_norm": 0.0005645751953125, |
| "kl": 0.028883875254541634, |
| "learning_rate": 7.077234808737932e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7790 |
| }, |
| { |
| "completion_length": 67.7, |
| "epoch": 1.56, |
| "grad_norm": 0.00102996826171875, |
| "kl": 0.0383193613961339, |
| "learning_rate": 7.016504991533727e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7800 |
| }, |
| { |
| "completion_length": 59.3, |
| "epoch": 1.562, |
| "grad_norm": 0.001800537109375, |
| "kl": 0.07388523239642382, |
| "learning_rate": 6.955994297540947e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7810 |
| }, |
| { |
| "completion_length": 56.75, |
| "epoch": 1.564, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.08295210748910904, |
| "learning_rate": 6.895703464063319e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7820 |
| }, |
| { |
| "completion_length": 60.15, |
| "epoch": 1.5659999999999998, |
| "grad_norm": 0.00628662109375, |
| "kl": 0.03666973649524152, |
| "learning_rate": 6.835633225725604e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7830 |
| }, |
| { |
| "completion_length": 67.05, |
| "epoch": 1.568, |
| "grad_norm": 0.000774383544921875, |
| "kl": 0.017159267235547303, |
| "learning_rate": 6.775784314464717e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7840 |
| }, |
| { |
| "completion_length": 63.4, |
| "epoch": 1.5699999999999998, |
| "grad_norm": 0.00055694580078125, |
| "kl": 0.01824809005483985, |
| "learning_rate": 6.716157459520739e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7850 |
| }, |
| { |
| "completion_length": 54.525, |
| "epoch": 1.572, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.03218274647369981, |
| "learning_rate": 6.656753387428089e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7860 |
| }, |
| { |
| "completion_length": 70.525, |
| "epoch": 1.5739999999999998, |
| "grad_norm": 0.000705718994140625, |
| "kl": 0.0729364191647619, |
| "learning_rate": 6.597572822006643e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7870 |
| }, |
| { |
| "completion_length": 69.875, |
| "epoch": 1.576, |
| "grad_norm": 0.0003833770751953125, |
| "kl": 0.07040122235193849, |
| "learning_rate": 6.538616484352902e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7880 |
| }, |
| { |
| "completion_length": 59.375, |
| "epoch": 1.5779999999999998, |
| "grad_norm": 0.001129150390625, |
| "kl": 0.04789869613014162, |
| "learning_rate": 6.479885092831251e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7890 |
| }, |
| { |
| "completion_length": 56.575, |
| "epoch": 1.58, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.05556117547675967, |
| "learning_rate": 6.421379363065142e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7900 |
| }, |
| { |
| "completion_length": 50.85, |
| "epoch": 1.5819999999999999, |
| "grad_norm": 0.000957489013671875, |
| "kl": 0.07133134175091982, |
| "learning_rate": 6.363100007928447e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7910 |
| }, |
| { |
| "completion_length": 74.025, |
| "epoch": 1.584, |
| "grad_norm": 0.0003108978271484375, |
| "kl": 0.10760618806816638, |
| "learning_rate": 6.305047737536707e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7920 |
| }, |
| { |
| "completion_length": 56.6, |
| "epoch": 1.5859999999999999, |
| "grad_norm": 0.00092315673828125, |
| "kl": 2.8826652359217406, |
| "learning_rate": 6.247223259238511e-07, |
| "loss": 0.0003, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 7930 |
| }, |
| { |
| "completion_length": 55.475, |
| "epoch": 1.588, |
| "grad_norm": 0.0006103515625, |
| "kl": 2.284485016670078, |
| "learning_rate": 6.189627277606894e-07, |
| "loss": 0.0002, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7940 |
| }, |
| { |
| "completion_length": 46.125, |
| "epoch": 1.5899999999999999, |
| "grad_norm": 0.000972747802734375, |
| "kl": 0.12067738296464085, |
| "learning_rate": 6.1322604944307e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7950 |
| }, |
| { |
| "completion_length": 59.225, |
| "epoch": 1.592, |
| "grad_norm": 0.00066375732421875, |
| "kl": 0.14497559778392316, |
| "learning_rate": 6.075123608706093e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7960 |
| }, |
| { |
| "completion_length": 60.925, |
| "epoch": 1.5939999999999999, |
| "grad_norm": 0.0024566650390625, |
| "kl": 258.11860719914546, |
| "learning_rate": 6.01821731662798e-07, |
| "loss": 0.0258, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 7970 |
| }, |
| { |
| "completion_length": 62.225, |
| "epoch": 1.596, |
| "grad_norm": 0.00153350830078125, |
| "kl": 0.024051298201084138, |
| "learning_rate": 5.961542311581586e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7980 |
| }, |
| { |
| "completion_length": 50.25, |
| "epoch": 1.5979999999999999, |
| "grad_norm": 0.0004482269287109375, |
| "kl": 0.10747648775577545, |
| "learning_rate": 5.905099284133953e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 7990 |
| }, |
| { |
| "completion_length": 52.1, |
| "epoch": 1.6, |
| "grad_norm": 0.00038909912109375, |
| "kl": 4.900969664240256, |
| "learning_rate": 5.848888922025553e-07, |
| "loss": 0.0005, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 8000 |
| }, |
| { |
| "completion_length": 46.85, |
| "epoch": 1.6019999999999999, |
| "grad_norm": 0.000553131103515625, |
| "kl": 0.06989260124973953, |
| "learning_rate": 5.792911910161922e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8010 |
| }, |
| { |
| "completion_length": 64.45, |
| "epoch": 1.604, |
| "grad_norm": 0.0006561279296875, |
| "kl": 0.054861510870978236, |
| "learning_rate": 5.737168930605272e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8020 |
| }, |
| { |
| "completion_length": 69.5, |
| "epoch": 1.6059999999999999, |
| "grad_norm": 0.0003757476806640625, |
| "kl": 0.01320057879202068, |
| "learning_rate": 5.681660662566225e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8030 |
| }, |
| { |
| "completion_length": 48.75, |
| "epoch": 1.608, |
| "grad_norm": 0.000743865966796875, |
| "kl": 0.5451168741099537, |
| "learning_rate": 5.626387782395512e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.15, |
| "rewards/reward_func": -0.075, |
| "step": 8040 |
| }, |
| { |
| "completion_length": 57.45, |
| "epoch": 1.6099999999999999, |
| "grad_norm": 0.0004177093505859375, |
| "kl": 0.06206353167071939, |
| "learning_rate": 5.571350963575728e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8050 |
| }, |
| { |
| "completion_length": 54.175, |
| "epoch": 1.612, |
| "grad_norm": 0.000614166259765625, |
| "kl": 2.013091558404267, |
| "learning_rate": 5.516550876713142e-07, |
| "loss": 0.0002, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 8060 |
| }, |
| { |
| "completion_length": 61.025, |
| "epoch": 1.6139999999999999, |
| "grad_norm": 0.000652313232421875, |
| "kl": 0.06326800542883575, |
| "learning_rate": 5.461988189529529e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8070 |
| }, |
| { |
| "completion_length": 58.825, |
| "epoch": 1.616, |
| "grad_norm": 0.00104522705078125, |
| "kl": 0.014891783054918051, |
| "learning_rate": 5.407663566854008e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8080 |
| }, |
| { |
| "completion_length": 54.875, |
| "epoch": 1.6179999999999999, |
| "grad_norm": 0.0020904541015625, |
| "kl": 0.025240180967375635, |
| "learning_rate": 5.353577670614951e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8090 |
| }, |
| { |
| "completion_length": 61.5, |
| "epoch": 1.62, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.04227957231923938, |
| "learning_rate": 5.299731159831953e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8100 |
| }, |
| { |
| "completion_length": 65.5, |
| "epoch": 1.6219999999999999, |
| "grad_norm": 0.0004177093505859375, |
| "kl": 0.19428066378459335, |
| "learning_rate": 5.24612469060774e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8110 |
| }, |
| { |
| "completion_length": 53.05, |
| "epoch": 1.624, |
| "grad_norm": 0.0010986328125, |
| "kl": 0.042151403008028866, |
| "learning_rate": 5.192758916120236e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8120 |
| }, |
| { |
| "completion_length": 55.55, |
| "epoch": 1.626, |
| "grad_norm": 0.00029754638671875, |
| "kl": 0.02924617677927017, |
| "learning_rate": 5.139634486614544e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8130 |
| }, |
| { |
| "completion_length": 76.4, |
| "epoch": 1.6280000000000001, |
| "grad_norm": 0.01080322265625, |
| "kl": 0.042456808709539474, |
| "learning_rate": 5.086752049395094e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8140 |
| }, |
| { |
| "completion_length": 60.325, |
| "epoch": 1.63, |
| "grad_norm": 0.01495361328125, |
| "kl": 0.025815209513530134, |
| "learning_rate": 5.034112248817685e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8150 |
| }, |
| { |
| "completion_length": 71.75, |
| "epoch": 1.6320000000000001, |
| "grad_norm": 0.000865936279296875, |
| "kl": 0.0641026332974434, |
| "learning_rate": 4.981715726281666e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8160 |
| }, |
| { |
| "completion_length": 57.4, |
| "epoch": 1.634, |
| "grad_norm": 0.000530242919921875, |
| "kl": 0.045072671584784986, |
| "learning_rate": 4.929563120222142e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8170 |
| }, |
| { |
| "completion_length": 52.075, |
| "epoch": 1.6360000000000001, |
| "grad_norm": 0.00011157989501953125, |
| "kl": 0.1052944268565625, |
| "learning_rate": 4.87765506610215e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8180 |
| }, |
| { |
| "completion_length": 54.375, |
| "epoch": 1.638, |
| "grad_norm": 0.0003986358642578125, |
| "kl": 0.03237830828875303, |
| "learning_rate": 4.825992196404958e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8190 |
| }, |
| { |
| "completion_length": 56.3, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 0.000835418701171875, |
| "kl": 0.01498257415369153, |
| "learning_rate": 4.774575140626317e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8200 |
| }, |
| { |
| "completion_length": 66.55, |
| "epoch": 1.642, |
| "grad_norm": 0.0003414154052734375, |
| "kl": 0.011996694607660174, |
| "learning_rate": 4.7234045252668393e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8210 |
| }, |
| { |
| "completion_length": 53.05, |
| "epoch": 1.6440000000000001, |
| "grad_norm": 0.0003986358642578125, |
| "kl": 0.1085278536658734, |
| "learning_rate": 4.672480973824312e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8220 |
| }, |
| { |
| "completion_length": 58.675, |
| "epoch": 1.646, |
| "grad_norm": 0.00080108642578125, |
| "kl": 0.04904728039400652, |
| "learning_rate": 4.6218051067861423e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8230 |
| }, |
| { |
| "completion_length": 50.825, |
| "epoch": 1.6480000000000001, |
| "grad_norm": 0.000759124755859375, |
| "kl": 0.014587640948593616, |
| "learning_rate": 4.5713775416217884e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8240 |
| }, |
| { |
| "completion_length": 45.15, |
| "epoch": 1.65, |
| "grad_norm": 0.0003795623779296875, |
| "kl": 0.09401618214324117, |
| "learning_rate": 4.5211988927752026e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8250 |
| }, |
| { |
| "completion_length": 64.925, |
| "epoch": 1.6520000000000001, |
| "grad_norm": 0.0026702880859375, |
| "kl": 0.026571149285882712, |
| "learning_rate": 4.4712697716573994e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8260 |
| }, |
| { |
| "completion_length": 68.8, |
| "epoch": 1.654, |
| "grad_norm": 0.000606536865234375, |
| "kl": 0.09327265082392841, |
| "learning_rate": 4.421590786638952e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8270 |
| }, |
| { |
| "completion_length": 58.775, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 0.000858306884765625, |
| "kl": 0.06238628029823303, |
| "learning_rate": 4.372162543042624e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8280 |
| }, |
| { |
| "completion_length": 56.575, |
| "epoch": 1.658, |
| "grad_norm": 0.00030517578125, |
| "kl": 0.06320808534510433, |
| "learning_rate": 4.3229856431359516e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8290 |
| }, |
| { |
| "completion_length": 59.625, |
| "epoch": 1.6600000000000001, |
| "grad_norm": 0.000576019287109375, |
| "kl": 0.06072661457583308, |
| "learning_rate": 4.27406068612396e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8300 |
| }, |
| { |
| "completion_length": 54.5, |
| "epoch": 1.662, |
| "grad_norm": 0.000698089599609375, |
| "kl": 0.0206127600511536, |
| "learning_rate": 4.225388268141797e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8310 |
| }, |
| { |
| "completion_length": 64.5, |
| "epoch": 1.6640000000000001, |
| "grad_norm": 0.000766754150390625, |
| "kl": 1.2397997039370239, |
| "learning_rate": 4.1769689822475147e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8320 |
| }, |
| { |
| "completion_length": 63.85, |
| "epoch": 1.666, |
| "grad_norm": 0.00113677978515625, |
| "kl": 0.04746299120597541, |
| "learning_rate": 4.12880341841484e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8330 |
| }, |
| { |
| "completion_length": 65.3, |
| "epoch": 1.6680000000000001, |
| "grad_norm": 0.000629425048828125, |
| "kl": 0.07974740182980895, |
| "learning_rate": 4.0808921635259595e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8340 |
| }, |
| { |
| "completion_length": 54.1, |
| "epoch": 1.67, |
| "grad_norm": 0.000759124755859375, |
| "kl": 350.0721945284866, |
| "learning_rate": 4.033235801364402e-07, |
| "loss": 0.035, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 8350 |
| }, |
| { |
| "completion_length": 50.45, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 0.00054168701171875, |
| "kl": 7.292029631882906, |
| "learning_rate": 3.9858349126078945e-07, |
| "loss": 0.0007, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 8360 |
| }, |
| { |
| "completion_length": 58.775, |
| "epoch": 1.674, |
| "grad_norm": 0.012939453125, |
| "kl": 0.07852717223577202, |
| "learning_rate": 3.938690074821314e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8370 |
| }, |
| { |
| "completion_length": 54.425, |
| "epoch": 1.6760000000000002, |
| "grad_norm": 0.00070953369140625, |
| "kl": 0.01418459378182888, |
| "learning_rate": 3.891801862449629e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8380 |
| }, |
| { |
| "completion_length": 50.175, |
| "epoch": 1.678, |
| "grad_norm": 0.000560760498046875, |
| "kl": 0.03785524540580809, |
| "learning_rate": 3.8451708468109026e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8390 |
| }, |
| { |
| "completion_length": 43.075, |
| "epoch": 1.6800000000000002, |
| "grad_norm": 0.00136566162109375, |
| "kl": 0.08654712834395469, |
| "learning_rate": 3.798797596089351e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8400 |
| }, |
| { |
| "completion_length": 59.775, |
| "epoch": 1.682, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.2558928931131959, |
| "learning_rate": 3.7526826753284065e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8410 |
| }, |
| { |
| "completion_length": 58.375, |
| "epoch": 1.6840000000000002, |
| "grad_norm": 0.00067901611328125, |
| "kl": 1.3383296761894599, |
| "learning_rate": 3.7068266464238085e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8420 |
| }, |
| { |
| "completion_length": 64.6, |
| "epoch": 1.686, |
| "grad_norm": 0.0003833770751953125, |
| "kl": 0.0377775629516691, |
| "learning_rate": 3.661230068116811e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8430 |
| }, |
| { |
| "completion_length": 58.925, |
| "epoch": 1.688, |
| "grad_norm": 0.00092315673828125, |
| "kl": 0.05282154800370335, |
| "learning_rate": 3.615893495987335e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8440 |
| }, |
| { |
| "completion_length": 61.375, |
| "epoch": 1.69, |
| "grad_norm": 0.00079345703125, |
| "kl": 104.9448153554462, |
| "learning_rate": 3.5708174824471947e-07, |
| "loss": 0.0105, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 8450 |
| }, |
| { |
| "completion_length": 66.675, |
| "epoch": 1.692, |
| "grad_norm": 0.0004119873046875, |
| "kl": 0.040050674229860306, |
| "learning_rate": 3.5260025767333894e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8460 |
| }, |
| { |
| "completion_length": 69.475, |
| "epoch": 1.694, |
| "grad_norm": 0.000682830810546875, |
| "kl": 0.020584713015705348, |
| "learning_rate": 3.481449324901412e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8470 |
| }, |
| { |
| "completion_length": 66.525, |
| "epoch": 1.696, |
| "grad_norm": 0.0002193450927734375, |
| "kl": 0.05711883215699345, |
| "learning_rate": 3.4371582698185636e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8480 |
| }, |
| { |
| "completion_length": 68.075, |
| "epoch": 1.698, |
| "grad_norm": 0.0002460479736328125, |
| "kl": 0.05435404470190406, |
| "learning_rate": 3.393129951157384e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8490 |
| }, |
| { |
| "completion_length": 63.9, |
| "epoch": 1.7, |
| "grad_norm": 0.00174713134765625, |
| "kl": 0.06182208526879549, |
| "learning_rate": 3.3493649053890325e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8500 |
| }, |
| { |
| "completion_length": 68.7, |
| "epoch": 1.702, |
| "grad_norm": 167.0, |
| "kl": 88.8611083610449, |
| "learning_rate": 3.3058636657767927e-07, |
| "loss": 0.0089, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8510 |
| }, |
| { |
| "completion_length": 76.675, |
| "epoch": 1.704, |
| "grad_norm": 0.0038909912109375, |
| "kl": 0.05937002245336771, |
| "learning_rate": 3.262626762369525e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8520 |
| }, |
| { |
| "completion_length": 57.825, |
| "epoch": 1.706, |
| "grad_norm": 0.000690460205078125, |
| "kl": 0.05280606346204877, |
| "learning_rate": 3.219654721995266e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8530 |
| }, |
| { |
| "completion_length": 48.85, |
| "epoch": 1.708, |
| "grad_norm": 0.000560760498046875, |
| "kl": 188.36169426795095, |
| "learning_rate": 3.176948068254762e-07, |
| "loss": 0.0188, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8540 |
| }, |
| { |
| "completion_length": 64.1, |
| "epoch": 1.71, |
| "grad_norm": 0.0003757476806640625, |
| "kl": 0.01651700264774263, |
| "learning_rate": 3.134507321515107e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8550 |
| }, |
| { |
| "completion_length": 43.95, |
| "epoch": 1.712, |
| "grad_norm": 0.0003681182861328125, |
| "kl": 5.833402361674234, |
| "learning_rate": 3.092332998903416e-07, |
| "loss": 0.0006, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 8560 |
| }, |
| { |
| "completion_length": 60.85, |
| "epoch": 1.714, |
| "grad_norm": 0.005889892578125, |
| "kl": 0.057405439857393506, |
| "learning_rate": 3.050425614300487e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8570 |
| }, |
| { |
| "completion_length": 70.15, |
| "epoch": 1.716, |
| "grad_norm": 0.000476837158203125, |
| "kl": 0.01461967695504427, |
| "learning_rate": 3.0087856783345916e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8580 |
| }, |
| { |
| "completion_length": 48.075, |
| "epoch": 1.718, |
| "grad_norm": 0.000438690185546875, |
| "kl": 0.0398553837556392, |
| "learning_rate": 2.967413698375196e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8590 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 1.72, |
| "grad_norm": 0.000518798828125, |
| "kl": 0.02238648202328477, |
| "learning_rate": 2.9263101785268253e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8600 |
| }, |
| { |
| "completion_length": 66.25, |
| "epoch": 1.722, |
| "grad_norm": 0.0015716552734375, |
| "kl": 0.04364732797257602, |
| "learning_rate": 2.8854756196229017e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8610 |
| }, |
| { |
| "completion_length": 43.7, |
| "epoch": 1.724, |
| "grad_norm": 0.0003299713134765625, |
| "kl": 0.11783089116215706, |
| "learning_rate": 2.844910519219632e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8620 |
| }, |
| { |
| "completion_length": 49.45, |
| "epoch": 1.726, |
| "grad_norm": 0.000640869140625, |
| "kl": 0.13984102117829025, |
| "learning_rate": 2.8046153715899695e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8630 |
| }, |
| { |
| "completion_length": 48.175, |
| "epoch": 1.728, |
| "grad_norm": 0.00054931640625, |
| "kl": 0.40176068069413307, |
| "learning_rate": 2.764590667717562e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8640 |
| }, |
| { |
| "completion_length": 63.95, |
| "epoch": 1.73, |
| "grad_norm": 0.0004711151123046875, |
| "kl": 0.8224213434383273, |
| "learning_rate": 2.7248368952908055e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8650 |
| }, |
| { |
| "completion_length": 44.175, |
| "epoch": 1.732, |
| "grad_norm": 0.000644683837890625, |
| "kl": 0.04362150589004159, |
| "learning_rate": 2.6853545386968607e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8660 |
| }, |
| { |
| "completion_length": 67.65, |
| "epoch": 1.734, |
| "grad_norm": 0.000659942626953125, |
| "kl": 0.048378444463014605, |
| "learning_rate": 2.6461440790157974e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8670 |
| }, |
| { |
| "completion_length": 61.075, |
| "epoch": 1.736, |
| "grad_norm": 0.0022430419921875, |
| "kl": 0.019372216332703827, |
| "learning_rate": 2.6072059940146775e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8680 |
| }, |
| { |
| "completion_length": 44.075, |
| "epoch": 1.738, |
| "grad_norm": 0.0004119873046875, |
| "kl": 8.146503202756866, |
| "learning_rate": 2.568540758141791e-07, |
| "loss": 0.0008, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8690 |
| }, |
| { |
| "completion_length": 53.575, |
| "epoch": 1.74, |
| "grad_norm": 0.0004138946533203125, |
| "kl": 49.64364205431193, |
| "learning_rate": 2.53014884252083e-07, |
| "loss": 0.005, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 8700 |
| }, |
| { |
| "completion_length": 65.325, |
| "epoch": 1.742, |
| "grad_norm": 0.000579833984375, |
| "kl": 0.4298483125632629, |
| "learning_rate": 2.492030714945162e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8710 |
| }, |
| { |
| "completion_length": 52.625, |
| "epoch": 1.744, |
| "grad_norm": 0.0016326904296875, |
| "kl": 0.07851723725907504, |
| "learning_rate": 2.454186839872158e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8720 |
| }, |
| { |
| "completion_length": 62.65, |
| "epoch": 1.746, |
| "grad_norm": 0.001373291015625, |
| "kl": 23.3441758136265, |
| "learning_rate": 2.4166176784174795e-07, |
| "loss": 0.0023, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8730 |
| }, |
| { |
| "completion_length": 74.575, |
| "epoch": 1.748, |
| "grad_norm": 0.000637054443359375, |
| "kl": 11.16068452913314, |
| "learning_rate": 2.3793236883495164e-07, |
| "loss": 0.0011, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8740 |
| }, |
| { |
| "completion_length": 49.475, |
| "epoch": 1.75, |
| "grad_norm": 0.000820159912109375, |
| "kl": 0.075881730299443, |
| "learning_rate": 2.3423053240837518e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8750 |
| }, |
| { |
| "completion_length": 56.025, |
| "epoch": 1.752, |
| "grad_norm": 0.000476837158203125, |
| "kl": 0.01870635347440839, |
| "learning_rate": 2.3055630366772857e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8760 |
| }, |
| { |
| "completion_length": 47.775, |
| "epoch": 1.754, |
| "grad_norm": 0.000675201416015625, |
| "kl": 0.07872601179406047, |
| "learning_rate": 2.269097273823287e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8770 |
| }, |
| { |
| "completion_length": 55.075, |
| "epoch": 1.756, |
| "grad_norm": 0.002685546875, |
| "kl": 2.795431226864457, |
| "learning_rate": 2.2329084798455747e-07, |
| "loss": 0.0003, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8780 |
| }, |
| { |
| "completion_length": 57.725, |
| "epoch": 1.758, |
| "grad_norm": 0.0003795623779296875, |
| "kl": 0.03155038901604712, |
| "learning_rate": 2.1969970956931762e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8790 |
| }, |
| { |
| "completion_length": 56.35, |
| "epoch": 1.76, |
| "grad_norm": 0.000591278076171875, |
| "kl": 48.86689073387534, |
| "learning_rate": 2.1613635589349756e-07, |
| "loss": 0.0049, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8800 |
| }, |
| { |
| "completion_length": 52.05, |
| "epoch": 1.762, |
| "grad_norm": 0.00151824951171875, |
| "kl": 0.05452495804056525, |
| "learning_rate": 2.1260083037543817e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8810 |
| }, |
| { |
| "completion_length": 47.3, |
| "epoch": 1.764, |
| "grad_norm": 0.0004444122314453125, |
| "kl": 312.406746559497, |
| "learning_rate": 2.0909317609440093e-07, |
| "loss": 0.0312, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8820 |
| }, |
| { |
| "completion_length": 52.475, |
| "epoch": 1.766, |
| "grad_norm": 0.0005950927734375, |
| "kl": 0.19929129825904965, |
| "learning_rate": 2.0561343579004716e-07, |
| "loss": 0.0, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 8830 |
| }, |
| { |
| "completion_length": 74.525, |
| "epoch": 1.768, |
| "grad_norm": 0.0003871917724609375, |
| "kl": 0.022377661243081094, |
| "learning_rate": 2.0216165186191406e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8840 |
| }, |
| { |
| "completion_length": 47.8, |
| "epoch": 1.77, |
| "grad_norm": 0.00055694580078125, |
| "kl": 0.018075392534956335, |
| "learning_rate": 1.9873786636889908e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8850 |
| }, |
| { |
| "completion_length": 66.875, |
| "epoch": 1.772, |
| "grad_norm": 0.00037384033203125, |
| "kl": 0.0537069259211421, |
| "learning_rate": 1.95342121028749e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8860 |
| }, |
| { |
| "completion_length": 50.35, |
| "epoch": 1.774, |
| "grad_norm": 0.000408172607421875, |
| "kl": 0.16273712795227765, |
| "learning_rate": 1.9197445721754777e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8870 |
| }, |
| { |
| "completion_length": 41.1, |
| "epoch": 1.776, |
| "grad_norm": 0.00162506103515625, |
| "kl": 0.0835498913191259, |
| "learning_rate": 1.8863491596921745e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8880 |
| }, |
| { |
| "completion_length": 56.375, |
| "epoch": 1.778, |
| "grad_norm": 0.0021514892578125, |
| "kl": 3.5430075244046746, |
| "learning_rate": 1.8532353797501318e-07, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8890 |
| }, |
| { |
| "completion_length": 49.0, |
| "epoch": 1.78, |
| "grad_norm": 0.00153350830078125, |
| "kl": 0.09430858921259641, |
| "learning_rate": 1.8204036358303173e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8900 |
| }, |
| { |
| "completion_length": 56.175, |
| "epoch": 1.782, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.04068310302682221, |
| "learning_rate": 1.787854327977162e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8910 |
| }, |
| { |
| "completion_length": 59.075, |
| "epoch": 1.784, |
| "grad_norm": 0.0003814697265625, |
| "kl": 0.04385726461187005, |
| "learning_rate": 1.7555878527937164e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8920 |
| }, |
| { |
| "completion_length": 55.05, |
| "epoch": 1.786, |
| "grad_norm": 0.0030517578125, |
| "kl": 0.018657160410657524, |
| "learning_rate": 1.7236046034367959e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8930 |
| }, |
| { |
| "completion_length": 47.975, |
| "epoch": 1.788, |
| "grad_norm": 0.0004711151123046875, |
| "kl": 1.90866837259382, |
| "learning_rate": 1.6919049696121957e-07, |
| "loss": 0.0002, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8940 |
| }, |
| { |
| "completion_length": 61.925, |
| "epoch": 1.79, |
| "grad_norm": 0.0030517578125, |
| "kl": 0.7081083978526295, |
| "learning_rate": 1.6604893375699594e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8950 |
| }, |
| { |
| "completion_length": 59.725, |
| "epoch": 1.792, |
| "grad_norm": 0.0004444122314453125, |
| "kl": 4.325691572204232, |
| "learning_rate": 1.629358090099639e-07, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8960 |
| }, |
| { |
| "completion_length": 54.6, |
| "epoch": 1.794, |
| "grad_norm": 0.0009918212890625, |
| "kl": 0.11546620442532003, |
| "learning_rate": 1.5985116065256683e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8970 |
| }, |
| { |
| "completion_length": 58.7, |
| "epoch": 1.796, |
| "grad_norm": 0.000591278076171875, |
| "kl": 1.2548286508535966, |
| "learning_rate": 1.567950262702714e-07, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 8980 |
| }, |
| { |
| "completion_length": 56.925, |
| "epoch": 1.798, |
| "grad_norm": 0.0003376007080078125, |
| "kl": 0.07534236300271005, |
| "learning_rate": 1.5376744310111019e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 8990 |
| }, |
| { |
| "completion_length": 59.7, |
| "epoch": 1.8, |
| "grad_norm": 0.000576019287109375, |
| "kl": 79.93720495556481, |
| "learning_rate": 1.507684480352292e-07, |
| "loss": 0.008, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 9000 |
| }, |
| { |
| "completion_length": 47.55, |
| "epoch": 1.802, |
| "grad_norm": 0.000606536865234375, |
| "kl": 0.07122775209136308, |
| "learning_rate": 1.4779807761443638e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9010 |
| }, |
| { |
| "completion_length": 56.475, |
| "epoch": 1.804, |
| "grad_norm": 0.0019378662109375, |
| "kl": 31.246724256686868, |
| "learning_rate": 1.4485636803175828e-07, |
| "loss": 0.0031, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9020 |
| }, |
| { |
| "completion_length": 56.8, |
| "epoch": 1.806, |
| "grad_norm": 0.0007476806640625, |
| "kl": 0.01624395214021206, |
| "learning_rate": 1.419433551309976e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9030 |
| }, |
| { |
| "completion_length": 52.925, |
| "epoch": 1.808, |
| "grad_norm": 0.00052642822265625, |
| "kl": 0.03512433131691069, |
| "learning_rate": 1.3905907440629752e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9040 |
| }, |
| { |
| "completion_length": 63.475, |
| "epoch": 1.81, |
| "grad_norm": 0.0008087158203125, |
| "kl": 0.05812466649804264, |
| "learning_rate": 1.362035610017079e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9050 |
| }, |
| { |
| "completion_length": 67.2, |
| "epoch": 1.812, |
| "grad_norm": 0.0002841949462890625, |
| "kl": 0.053207884868606926, |
| "learning_rate": 1.3337684971075932e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9060 |
| }, |
| { |
| "completion_length": 63.025, |
| "epoch": 1.814, |
| "grad_norm": 0.00083160400390625, |
| "kl": 0.017276625451631843, |
| "learning_rate": 1.305789749760361e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9070 |
| }, |
| { |
| "completion_length": 53.85, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 0.0014495849609375, |
| "kl": 0.1759139670059085, |
| "learning_rate": 1.278099708887587e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9080 |
| }, |
| { |
| "completion_length": 49.575, |
| "epoch": 1.818, |
| "grad_norm": 0.00054168701171875, |
| "kl": 0.06552611859515309, |
| "learning_rate": 1.2506987118836912e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9090 |
| }, |
| { |
| "completion_length": 57.775, |
| "epoch": 1.8199999999999998, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.14279152313247323, |
| "learning_rate": 1.223587092621162e-07, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9100 |
| }, |
| { |
| "completion_length": 66.725, |
| "epoch": 1.822, |
| "grad_norm": 0.000827789306640625, |
| "kl": 26.51692173536867, |
| "learning_rate": 1.1967651814465353e-07, |
| "loss": 0.0027, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9110 |
| }, |
| { |
| "completion_length": 45.675, |
| "epoch": 1.8239999999999998, |
| "grad_norm": 24.0, |
| "kl": 21.41680323826149, |
| "learning_rate": 1.1702333051763271e-07, |
| "loss": 0.0021, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 9120 |
| }, |
| { |
| "completion_length": 50.95, |
| "epoch": 1.826, |
| "grad_norm": 0.0005035400390625, |
| "kl": 0.031037054676562547, |
| "learning_rate": 1.1439917870930795e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9130 |
| }, |
| { |
| "completion_length": 49.925, |
| "epoch": 1.8279999999999998, |
| "grad_norm": 0.000782012939453125, |
| "kl": 0.06788429841399193, |
| "learning_rate": 1.1180409469414094e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9140 |
| }, |
| { |
| "completion_length": 54.575, |
| "epoch": 1.83, |
| "grad_norm": 0.00067901611328125, |
| "kl": 0.02977508623152971, |
| "learning_rate": 1.0923811009241142e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9150 |
| }, |
| { |
| "completion_length": 62.55, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 0.005157470703125, |
| "kl": 0.03778183825779706, |
| "learning_rate": 1.067012561698319e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9160 |
| }, |
| { |
| "completion_length": 55.6, |
| "epoch": 1.834, |
| "grad_norm": 0.0003528594970703125, |
| "kl": 0.033238646434620024, |
| "learning_rate": 1.041935638371669e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9170 |
| }, |
| { |
| "completion_length": 60.35, |
| "epoch": 1.8359999999999999, |
| "grad_norm": 0.00052642822265625, |
| "kl": 0.02701822677627206, |
| "learning_rate": 1.0171506364985622e-07, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9180 |
| }, |
| { |
| "completion_length": 53.775, |
| "epoch": 1.838, |
| "grad_norm": 0.0002994537353515625, |
| "kl": 1243.2045701113996, |
| "learning_rate": 9.926578580764234e-08, |
| "loss": 0.1243, |
| "match_ratio": 0.9, |
| "reward": -0.1, |
| "reward_std": 0.1154700517654419, |
| "rewards/reward_func": -0.1, |
| "step": 9190 |
| }, |
| { |
| "completion_length": 76.5, |
| "epoch": 1.8399999999999999, |
| "grad_norm": 0.000469207763671875, |
| "kl": 0.017305072862654924, |
| "learning_rate": 9.684576015420277e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9200 |
| }, |
| { |
| "completion_length": 50.85, |
| "epoch": 1.842, |
| "grad_norm": 0.00045013427734375, |
| "kl": 13.52835137634538, |
| "learning_rate": 9.445501617678654e-08, |
| "loss": 0.0014, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9210 |
| }, |
| { |
| "completion_length": 55.425, |
| "epoch": 1.8439999999999999, |
| "grad_norm": 0.00072479248046875, |
| "kl": 0.03520208708941937, |
| "learning_rate": 9.209358300585474e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9220 |
| }, |
| { |
| "completion_length": 66.8, |
| "epoch": 1.846, |
| "grad_norm": 0.000614166259765625, |
| "kl": 0.02983384854160249, |
| "learning_rate": 8.9761489414725e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9230 |
| }, |
| { |
| "completion_length": 49.075, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 0.000537872314453125, |
| "kl": 0.032278594188392164, |
| "learning_rate": 8.745876381922147e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9240 |
| }, |
| { |
| "completion_length": 60.0, |
| "epoch": 1.85, |
| "grad_norm": 0.000659942626953125, |
| "kl": 0.02954811817035079, |
| "learning_rate": 8.518543427732951e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9250 |
| }, |
| { |
| "completion_length": 58.05, |
| "epoch": 1.8519999999999999, |
| "grad_norm": 0.000522613525390625, |
| "kl": 0.020372640853747726, |
| "learning_rate": 8.294152848885156e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9260 |
| }, |
| { |
| "completion_length": 57.65, |
| "epoch": 1.854, |
| "grad_norm": 0.000514984130859375, |
| "kl": 48.13295641997829, |
| "learning_rate": 8.072707379507217e-08, |
| "loss": 0.0048, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 9270 |
| }, |
| { |
| "completion_length": 73.275, |
| "epoch": 1.8559999999999999, |
| "grad_norm": 0.000583648681640625, |
| "kl": 259.27141086012125, |
| "learning_rate": 7.854209717842231e-08, |
| "loss": 0.0259, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 9280 |
| }, |
| { |
| "completion_length": 62.75, |
| "epoch": 1.858, |
| "grad_norm": 0.0002765655517578125, |
| "kl": 0.0620627264957875, |
| "learning_rate": 7.638662526215284e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9290 |
| }, |
| { |
| "completion_length": 60.6, |
| "epoch": 1.8599999999999999, |
| "grad_norm": 0.000881195068359375, |
| "kl": 0.0414402786642313, |
| "learning_rate": 7.426068431000883e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9300 |
| }, |
| { |
| "completion_length": 65.575, |
| "epoch": 1.862, |
| "grad_norm": 0.00058746337890625, |
| "kl": 0.08443178189918399, |
| "learning_rate": 7.216430022591009e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9310 |
| }, |
| { |
| "completion_length": 64.35, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 0.00634765625, |
| "kl": 0.12132438533008098, |
| "learning_rate": 7.009749855363457e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9320 |
| }, |
| { |
| "completion_length": 59.775, |
| "epoch": 1.866, |
| "grad_norm": 0.000392913818359375, |
| "kl": 7.24802761040628, |
| "learning_rate": 6.806030447650879e-08, |
| "loss": 0.0007, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.075, |
| "step": 9330 |
| }, |
| { |
| "completion_length": 51.325, |
| "epoch": 1.8679999999999999, |
| "grad_norm": 0.000522613525390625, |
| "kl": 12.393874236382544, |
| "learning_rate": 6.605274281709929e-08, |
| "loss": 0.0012, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9340 |
| }, |
| { |
| "completion_length": 54.575, |
| "epoch": 1.87, |
| "grad_norm": 0.000598907470703125, |
| "kl": 0.20043480526655913, |
| "learning_rate": 6.407483803691216e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9350 |
| }, |
| { |
| "completion_length": 65.05, |
| "epoch": 1.8719999999999999, |
| "grad_norm": 0.000621795654296875, |
| "kl": 0.038857326842844486, |
| "learning_rate": 6.212661423609184e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9360 |
| }, |
| { |
| "completion_length": 59.3, |
| "epoch": 1.874, |
| "grad_norm": 0.000453948974609375, |
| "kl": 0.1275158784352243, |
| "learning_rate": 6.020809515313141e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9370 |
| }, |
| { |
| "completion_length": 55.175, |
| "epoch": 1.876, |
| "grad_norm": 0.001220703125, |
| "kl": 0.6000383426435292, |
| "learning_rate": 5.83193041645802e-08, |
| "loss": 0.0001, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9380 |
| }, |
| { |
| "completion_length": 57.375, |
| "epoch": 1.8780000000000001, |
| "grad_norm": 0.00177001953125, |
| "kl": 0.06478001358918846, |
| "learning_rate": 5.6460264284760316e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9390 |
| }, |
| { |
| "completion_length": 51.175, |
| "epoch": 1.88, |
| "grad_norm": 0.006195068359375, |
| "kl": 0.08602785079274326, |
| "learning_rate": 5.463099816548578e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9400 |
| }, |
| { |
| "completion_length": 57.75, |
| "epoch": 1.8820000000000001, |
| "grad_norm": 0.00092315673828125, |
| "kl": 0.028340872889384628, |
| "learning_rate": 5.283152809578751e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9410 |
| }, |
| { |
| "completion_length": 61.275, |
| "epoch": 1.884, |
| "grad_norm": 0.0004024505615234375, |
| "kl": 0.089741973252967, |
| "learning_rate": 5.106187600163987e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9420 |
| }, |
| { |
| "completion_length": 65.625, |
| "epoch": 1.8860000000000001, |
| "grad_norm": 0.0025634765625, |
| "kl": 0.060642439499497415, |
| "learning_rate": 4.932206344569562e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9430 |
| }, |
| { |
| "completion_length": 58.875, |
| "epoch": 1.888, |
| "grad_norm": 0.00067901611328125, |
| "kl": 0.06356988861225546, |
| "learning_rate": 4.761211162702117e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9440 |
| }, |
| { |
| "completion_length": 57.25, |
| "epoch": 1.8900000000000001, |
| "grad_norm": 0.00072479248046875, |
| "kl": 16.864195838803425, |
| "learning_rate": 4.593204138084006e-08, |
| "loss": 0.0017, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.1, |
| "rewards/reward_func": -0.05, |
| "step": 9450 |
| }, |
| { |
| "completion_length": 51.025, |
| "epoch": 1.892, |
| "grad_norm": 0.00037384033203125, |
| "kl": 0.047673306241631505, |
| "learning_rate": 4.428187317827848e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9460 |
| }, |
| { |
| "completion_length": 66.45, |
| "epoch": 1.8940000000000001, |
| "grad_norm": 0.000469207763671875, |
| "kl": 0.035626521334052086, |
| "learning_rate": 4.26616271261146e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9470 |
| }, |
| { |
| "completion_length": 47.25, |
| "epoch": 1.896, |
| "grad_norm": 0.00045013427734375, |
| "kl": 0.09364478723146022, |
| "learning_rate": 4.1071322966535487e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9480 |
| }, |
| { |
| "completion_length": 61.45, |
| "epoch": 1.8980000000000001, |
| "grad_norm": 0.0004100799560546875, |
| "kl": 0.02402509720996022, |
| "learning_rate": 3.95109800768953e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9490 |
| }, |
| { |
| "completion_length": 52.2, |
| "epoch": 1.9, |
| "grad_norm": 0.00049591064453125, |
| "kl": 0.08021967611275613, |
| "learning_rate": 3.798061746947995e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9500 |
| }, |
| { |
| "completion_length": 54.175, |
| "epoch": 1.9020000000000001, |
| "grad_norm": 0.0003814697265625, |
| "kl": 0.08938063569366932, |
| "learning_rate": 3.648025379127479e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9510 |
| }, |
| { |
| "completion_length": 48.6, |
| "epoch": 1.904, |
| "grad_norm": 0.000553131103515625, |
| "kl": 0.03359618247486651, |
| "learning_rate": 3.5009907323737826e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9520 |
| }, |
| { |
| "completion_length": 49.65, |
| "epoch": 1.9060000000000001, |
| "grad_norm": 0.00131988525390625, |
| "kl": 0.10454095806926489, |
| "learning_rate": 3.3569595982576584e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9530 |
| }, |
| { |
| "completion_length": 42.4, |
| "epoch": 1.908, |
| "grad_norm": 0.000667572021484375, |
| "kl": 0.18224592534825207, |
| "learning_rate": 3.2159337317530234e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9540 |
| }, |
| { |
| "completion_length": 51.125, |
| "epoch": 1.9100000000000001, |
| "grad_norm": 0.0009613037109375, |
| "kl": 0.2186179363168776, |
| "learning_rate": 3.077914851215585e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9550 |
| }, |
| { |
| "completion_length": 60.75, |
| "epoch": 1.912, |
| "grad_norm": 0.000701904296875, |
| "kl": 0.090417854860425, |
| "learning_rate": 2.9429046383618042e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9560 |
| }, |
| { |
| "completion_length": 41.175, |
| "epoch": 1.9140000000000001, |
| "grad_norm": 0.00130462646484375, |
| "kl": 0.07170910434797406, |
| "learning_rate": 2.810904738248549e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9570 |
| }, |
| { |
| "completion_length": 43.875, |
| "epoch": 1.916, |
| "grad_norm": 0.0018310546875, |
| "kl": 0.12651289403438568, |
| "learning_rate": 2.681916759252917e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9580 |
| }, |
| { |
| "completion_length": 67.95, |
| "epoch": 1.9180000000000001, |
| "grad_norm": 0.0009613037109375, |
| "kl": 0.046817721845582125, |
| "learning_rate": 2.555942273052753e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9590 |
| }, |
| { |
| "completion_length": 44.825, |
| "epoch": 1.92, |
| "grad_norm": 0.002197265625, |
| "kl": 1.0959480846766383, |
| "learning_rate": 2.4329828146074096e-08, |
| "loss": 0.0001, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9600 |
| }, |
| { |
| "completion_length": 40.15, |
| "epoch": 1.9220000000000002, |
| "grad_norm": 0.0015869140625, |
| "kl": 0.05991814769804478, |
| "learning_rate": 2.313039882139101e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9610 |
| }, |
| { |
| "completion_length": 56.9, |
| "epoch": 1.924, |
| "grad_norm": 0.000377655029296875, |
| "kl": 0.019165601092390717, |
| "learning_rate": 2.1961149371145795e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9620 |
| }, |
| { |
| "completion_length": 61.1, |
| "epoch": 1.9260000000000002, |
| "grad_norm": 0.000263214111328125, |
| "kl": 0.05205519350711256, |
| "learning_rate": 2.082209404227403e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9630 |
| }, |
| { |
| "completion_length": 46.925, |
| "epoch": 1.928, |
| "grad_norm": 0.0004367828369140625, |
| "kl": 0.08729816749691963, |
| "learning_rate": 1.9713246713805588e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9640 |
| }, |
| { |
| "completion_length": 60.35, |
| "epoch": 1.9300000000000002, |
| "grad_norm": 0.0017852783203125, |
| "kl": 0.09572115261107683, |
| "learning_rate": 1.8634620896695044e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9650 |
| }, |
| { |
| "completion_length": 62.975, |
| "epoch": 1.932, |
| "grad_norm": 10.0625, |
| "kl": 0.08949833824299276, |
| "learning_rate": 1.7586229733657646e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9660 |
| }, |
| { |
| "completion_length": 60.05, |
| "epoch": 1.9340000000000002, |
| "grad_norm": 0.0003566741943359375, |
| "kl": 0.051867073588073256, |
| "learning_rate": 1.6568085999008886e-08, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9670 |
| }, |
| { |
| "completion_length": 56.425, |
| "epoch": 1.936, |
| "grad_norm": 0.0006866455078125, |
| "kl": 0.05798132345080376, |
| "learning_rate": 1.5580202098509078e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9680 |
| }, |
| { |
| "completion_length": 60.575, |
| "epoch": 1.938, |
| "grad_norm": 29.75, |
| "kl": 22.995475397538392, |
| "learning_rate": 1.4622590069211517e-08, |
| "loss": 0.0023, |
| "match_ratio": 0.95, |
| "reward": -0.05, |
| "reward_std": 0.05773502588272095, |
| "rewards/reward_func": -0.05, |
| "step": 9690 |
| }, |
| { |
| "completion_length": 52.575, |
| "epoch": 1.94, |
| "grad_norm": 0.00121307373046875, |
| "kl": 0.05500190043821931, |
| "learning_rate": 1.3695261579316776e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9700 |
| }, |
| { |
| "completion_length": 51.8, |
| "epoch": 1.942, |
| "grad_norm": 0.000308990478515625, |
| "kl": 0.07781615569256246, |
| "learning_rate": 1.2798227928029483e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9710 |
| }, |
| { |
| "completion_length": 56.15, |
| "epoch": 1.944, |
| "grad_norm": 0.0003795623779296875, |
| "kl": 0.08795451316982508, |
| "learning_rate": 1.193150004542204e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9720 |
| }, |
| { |
| "completion_length": 52.4, |
| "epoch": 1.946, |
| "grad_norm": 0.002899169921875, |
| "kl": 0.047139992006123066, |
| "learning_rate": 1.109508849230001e-08, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9730 |
| }, |
| { |
| "completion_length": 45.45, |
| "epoch": 1.948, |
| "grad_norm": 0.0003662109375, |
| "kl": 4.14183980775997, |
| "learning_rate": 1.0289003460074165e-08, |
| "loss": 0.0004, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9740 |
| }, |
| { |
| "completion_length": 56.375, |
| "epoch": 1.95, |
| "grad_norm": 0.0031890869140625, |
| "kl": 0.060475172754377124, |
| "learning_rate": 9.513254770636138e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9750 |
| }, |
| { |
| "completion_length": 48.85, |
| "epoch": 1.952, |
| "grad_norm": 0.0008087158203125, |
| "kl": 0.01811651182360947, |
| "learning_rate": 8.767851876239075e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9760 |
| }, |
| { |
| "completion_length": 63.625, |
| "epoch": 1.954, |
| "grad_norm": 0.000522613525390625, |
| "kl": 0.04865064946934581, |
| "learning_rate": 8.052803859382174e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9770 |
| }, |
| { |
| "completion_length": 66.825, |
| "epoch": 1.956, |
| "grad_norm": 0.0003662109375, |
| "kl": 0.017140331957489253, |
| "learning_rate": 7.368119432699383e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9780 |
| }, |
| { |
| "completion_length": 49.275, |
| "epoch": 1.958, |
| "grad_norm": 0.000598907470703125, |
| "kl": 0.05813699197024107, |
| "learning_rate": 6.7138069388547614e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9790 |
| }, |
| { |
| "completion_length": 53.425, |
| "epoch": 1.96, |
| "grad_norm": 0.00069427490234375, |
| "kl": 1490.5032024047337, |
| "learning_rate": 6.089874350439507e-09, |
| "loss": 0.1491, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 9800 |
| }, |
| { |
| "completion_length": 43.9, |
| "epoch": 1.962, |
| "grad_norm": 0.0005645751953125, |
| "kl": 31.488269805023446, |
| "learning_rate": 5.4963292698750896e-09, |
| "loss": 0.0031, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9810 |
| }, |
| { |
| "completion_length": 54.95, |
| "epoch": 1.964, |
| "grad_norm": 0.0184326171875, |
| "kl": 0.13211959092877806, |
| "learning_rate": 4.933178929321103e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9820 |
| }, |
| { |
| "completion_length": 48.75, |
| "epoch": 1.966, |
| "grad_norm": 0.000850677490234375, |
| "kl": 0.10980427814647556, |
| "learning_rate": 4.400430190586724e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9830 |
| }, |
| { |
| "completion_length": 57.525, |
| "epoch": 1.968, |
| "grad_norm": 14.625, |
| "kl": 0.031209711637347936, |
| "learning_rate": 3.8980895450474455e-09, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9840 |
| }, |
| { |
| "completion_length": 49.9, |
| "epoch": 1.97, |
| "grad_norm": 0.000507354736328125, |
| "kl": 16.15725321341306, |
| "learning_rate": 3.4261631135654174e-09, |
| "loss": 0.0016, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9850 |
| }, |
| { |
| "completion_length": 54.075, |
| "epoch": 1.972, |
| "grad_norm": 0.001739501953125, |
| "kl": 0.04826322416774929, |
| "learning_rate": 2.984656646415063e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9860 |
| }, |
| { |
| "completion_length": 66.375, |
| "epoch": 1.974, |
| "grad_norm": 0.000919342041015625, |
| "kl": 0.035056399274617435, |
| "learning_rate": 2.573575523213412e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9870 |
| }, |
| { |
| "completion_length": 72.6, |
| "epoch": 1.976, |
| "grad_norm": 0.0007781982421875, |
| "kl": 0.05609772065654397, |
| "learning_rate": 2.192924752854042e-09, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9880 |
| }, |
| { |
| "completion_length": 72.45, |
| "epoch": 1.978, |
| "grad_norm": 0.0004405975341796875, |
| "kl": 0.26721446458250286, |
| "learning_rate": 1.842708973447127e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9890 |
| }, |
| { |
| "completion_length": 62.25, |
| "epoch": 1.98, |
| "grad_norm": 0.0003643035888671875, |
| "kl": 0.08341183541342616, |
| "learning_rate": 1.5229324522605949e-09, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9900 |
| }, |
| { |
| "completion_length": 58.65, |
| "epoch": 1.982, |
| "grad_norm": 0.00063323974609375, |
| "kl": 2.203670488623902, |
| "learning_rate": 1.2335990856710001e-09, |
| "loss": 0.0002, |
| "match_ratio": 0.925, |
| "reward": -0.075, |
| "reward_std": 0.10773502588272095, |
| "rewards/reward_func": -0.075, |
| "step": 9910 |
| }, |
| { |
| "completion_length": 45.05, |
| "epoch": 1.984, |
| "grad_norm": 0.00061798095703125, |
| "kl": 0.06615068479441107, |
| "learning_rate": 9.747123991141193e-10, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9920 |
| }, |
| { |
| "completion_length": 57.625, |
| "epoch": 1.986, |
| "grad_norm": 0.000545501708984375, |
| "kl": 0.0431473188335076, |
| "learning_rate": 7.462755470422078e-10, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9930 |
| }, |
| { |
| "completion_length": 65.05, |
| "epoch": 1.988, |
| "grad_norm": 0.0004634857177734375, |
| "kl": 0.049975822074338795, |
| "learning_rate": 5.48291312886251e-10, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9940 |
| }, |
| { |
| "completion_length": 67.95, |
| "epoch": 1.99, |
| "grad_norm": 0.00127410888671875, |
| "kl": 0.033399745682254435, |
| "learning_rate": 3.8076210902182607e-10, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9950 |
| }, |
| { |
| "completion_length": 56.175, |
| "epoch": 1.992, |
| "grad_norm": 0.0004825592041015625, |
| "kl": 0.11492122933268548, |
| "learning_rate": 2.43689976739403e-10, |
| "loss": 0.0, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9960 |
| }, |
| { |
| "completion_length": 58.85, |
| "epoch": 1.994, |
| "grad_norm": 0.0040283203125, |
| "kl": 0.04082223805598915, |
| "learning_rate": 1.3707658621964216e-10, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9970 |
| }, |
| { |
| "completion_length": 64.95, |
| "epoch": 1.996, |
| "grad_norm": 0.00299072265625, |
| "kl": 0.055863088183104995, |
| "learning_rate": 6.092323651313293e-11, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 9980 |
| }, |
| { |
| "completion_length": 64.9, |
| "epoch": 1.998, |
| "grad_norm": 0.00091552734375, |
| "kl": 40.98459475683048, |
| "learning_rate": 1.5230855524017708e-11, |
| "loss": 0.0041, |
| "match_ratio": 0.975, |
| "reward": -0.025, |
| "reward_std": 0.05, |
| "rewards/reward_func": -0.025, |
| "step": 9990 |
| }, |
| { |
| "completion_length": 65.95, |
| "epoch": 2.0, |
| "grad_norm": 0.0003757476806640625, |
| "kl": 0.02109892386943102, |
| "learning_rate": 0.0, |
| "loss": 0.0, |
| "match_ratio": 1.0, |
| "reward": 0.0, |
| "reward_std": 0.0, |
| "rewards/reward_func": 0.0, |
| "step": 10000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 10000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|