{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 75.6, "epoch": 0.002, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 10 }, { "completion_length": 69.0, "epoch": 0.004, "grad_norm": 8.249282836914062e-05, "kl": 0.0007458075881004334, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 20 }, { "completion_length": 65.25, "epoch": 0.006, "grad_norm": 8.726119995117188e-05, "kl": 0.0008603519352618604, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 30 }, { "completion_length": 51.1, "epoch": 0.008, "grad_norm": 0.0001392364501953125, "kl": 0.0006604890164453536, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 40 }, { "completion_length": 81.475, "epoch": 0.01, "grad_norm": 10.3125, "kl": 0.0008120269441860728, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.3154700517654419, "rewards/reward_func": -0.2, "step": 50 }, { "completion_length": 82.3, "epoch": 0.012, "grad_norm": 0.000148773193359375, "kl": 0.0009021399382618256, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.2, "rewards/reward_func": -0.1, "step": 60 }, { "completion_length": 57.675, "epoch": 0.014, "grad_norm": 0.000453948974609375, "kl": 0.0008941800828324631, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 70 }, { "completion_length": 70.475, "epoch": 0.016, "grad_norm": 7.90625, "kl": 0.0010760451084934175, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.2, "rewards/reward_func": -0.25, "step": 80 }, { "completion_length": 56.2, "epoch": 0.018, "grad_norm": 8.535385131835938e-05, "kl": 0.001845199626404792, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 90 }, { "completion_length": 63.5, "epoch": 0.02, "grad_norm": 0.00011777877807617188, "kl": 0.0007161700828874018, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "match_ratio": 0.725, "reward": -0.275, "reward_std": 0.20773502588272094, "rewards/reward_func": -0.275, "step": 100 }, { "completion_length": 88.175, "epoch": 0.022, "grad_norm": 0.000125885009765625, "kl": 0.001011227659182623, "learning_rate": 5.5e-07, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.1, "step": 110 }, { "completion_length": 69.725, "epoch": 0.024, "grad_norm": 0.00010633468627929688, "kl": 0.0008000041969353333, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.25, "rewards/reward_func": -0.125, "step": 120 }, { "completion_length": 72.35, "epoch": 0.026, "grad_norm": 0.00012159347534179688, "kl": 0.0010485154576599597, "learning_rate": 6.5e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 130 }, { "completion_length": 75.575, "epoch": 0.028, "grad_norm": 0.00017547607421875, "kl": 0.0007810671289917081, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 140 }, { "completion_length": 79.65, "epoch": 0.03, "grad_norm": 8.0625, "kl": 0.0008353532728506252, "learning_rate": 7.5e-07, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 150 }, { "completion_length": 52.075, "epoch": 0.032, "grad_norm": 33.25, "kl": 0.0031860046496149153, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.15, "rewards/reward_func": -0.125, "step": 160 }, { "completion_length": 58.825, "epoch": 0.034, "grad_norm": 15.6875, "kl": 0.001403974276036024, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 170 }, { "completion_length": 70.95, "epoch": 0.036, "grad_norm": 0.0002956390380859375, "kl": 0.0009778408275451511, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 180 }, { "completion_length": 62.25, "epoch": 0.038, "grad_norm": 0.00011587142944335938, "kl": 0.001159800120512955, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 190 }, { "completion_length": 68.7, "epoch": 0.04, "grad_norm": 0.0001430511474609375, "kl": 0.0036639797501266, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 200 }, { "completion_length": 50.675, "epoch": 0.042, "grad_norm": 7.534027099609375e-05, "kl": 0.002666096478151303, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "match_ratio": 0.775, "reward": -0.225, "reward_std": 0.05, "rewards/reward_func": -0.225, "step": 210 }, { "completion_length": 36.8, "epoch": 0.044, "grad_norm": 0.00018215179443359375, "kl": 0.014809455376234838, "learning_rate": 1.1e-06, "loss": 0.0, "match_ratio": 0.725, "reward": -0.275, "reward_std": 0.15, "rewards/reward_func": -0.275, "step": 220 }, { "completion_length": 66.75, "epoch": 0.046, "grad_norm": 0.0004520416259765625, "kl": 0.007854271659743972, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.20773502588272094, "rewards/reward_func": -0.125, "step": 230 }, { "completion_length": 76.35, "epoch": 0.048, "grad_norm": 17.875, "kl": 1.018031721841544, "learning_rate": 1.2000000000000002e-06, "loss": 0.0001, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.2, "rewards/reward_func": -0.2, "step": 240 }, { "completion_length": 59.075, "epoch": 0.05, "grad_norm": 8.869171142578125e-05, "kl": 0.13859437993960455, "learning_rate": 1.25e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 250 }, { "completion_length": 55.95, "epoch": 0.052, "grad_norm": 0.458984375, "kl": 0.44334062208363323, "learning_rate": 1.3e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 260 }, { "completion_length": 62.5, "epoch": 0.054, "grad_norm": 0.000209808349609375, "kl": 1.4304919777001488, "learning_rate": 1.3500000000000002e-06, "loss": 0.0001, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.15, "rewards/reward_func": -0.125, "step": 270 }, { "completion_length": 59.125, "epoch": 0.056, "grad_norm": 7.963180541992188e-05, "kl": 0.3974201448727399, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "match_ratio": 0.575, "reward": -0.425, "reward_std": 0.15, "rewards/reward_func": -0.425, "step": 280 }, { "completion_length": 61.75, "epoch": 0.058, "grad_norm": 0.00015926361083984375, "kl": 0.6299153287603986, "learning_rate": 1.45e-06, "loss": 0.0001, "match_ratio": 0.775, "reward": -0.225, "reward_std": 0.05, "rewards/reward_func": -0.225, "step": 290 }, { "completion_length": 94.275, "epoch": 0.06, "grad_norm": 0.00010347366333007812, "kl": 0.007541297184070572, "learning_rate": 1.5e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.15, "rewards/reward_func": -0.075, "step": 300 }, { "completion_length": 55.45, "epoch": 0.062, "grad_norm": 0.000507354736328125, "kl": 0.8210757704044227, "learning_rate": 1.5500000000000002e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 310 }, { "completion_length": 63.475, "epoch": 0.064, "grad_norm": 0.000396728515625, "kl": 0.10055320091196336, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 320 }, { "completion_length": 82.15, "epoch": 0.066, "grad_norm": 0.000514984130859375, "kl": 0.37705419784761035, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.175, "step": 330 }, { "completion_length": 51.475, "epoch": 0.068, "grad_norm": 0.00014400482177734375, "kl": 41.57116786188563, "learning_rate": 1.7000000000000002e-06, "loss": 0.0042, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 340 }, { "completion_length": 55.525, "epoch": 0.07, "grad_norm": 0.0002689361572265625, "kl": 1.14909179067472, "learning_rate": 1.75e-06, "loss": 0.0001, "match_ratio": 0.725, "reward": -0.275, "reward_std": 0.05, "rewards/reward_func": -0.275, "step": 350 }, { "completion_length": 47.725, "epoch": 0.072, "grad_norm": 4.28125, "kl": 1.577250469638966, "learning_rate": 1.8000000000000001e-06, "loss": 0.0002, "match_ratio": 0.675, "reward": -0.325, "reward_std": 0.05, "rewards/reward_func": -0.325, "step": 360 }, { "completion_length": 61.825, "epoch": 0.074, "grad_norm": 0.00017452239990234375, "kl": 0.34925971169723197, "learning_rate": 1.85e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 370 }, { "completion_length": 55.825, "epoch": 0.076, "grad_norm": 0.00066375732421875, "kl": 0.1755049143510405, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.15, "rewards/reward_func": -0.075, "step": 380 }, { "completion_length": 49.9, "epoch": 0.078, "grad_norm": 0.0164794921875, "kl": 0.019992552557960154, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 390 }, { "completion_length": 63.85, "epoch": 0.08, "grad_norm": 0.0002002716064453125, "kl": 4.795912343251985, "learning_rate": 2.0000000000000003e-06, "loss": 0.0005, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.2, "rewards/reward_func": -0.1, "step": 400 }, { "completion_length": 56.6, "epoch": 0.082, "grad_norm": 0.0004520416259765625, "kl": 1.1443448643549345, "learning_rate": 2.05e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 410 }, { "completion_length": 58.725, "epoch": 0.084, "grad_norm": 0.0002765655517578125, "kl": 0.0028327183797955515, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 420 }, { "completion_length": 47.1, "epoch": 0.086, "grad_norm": 0.0010986328125, "kl": 0.008324940234888344, "learning_rate": 2.15e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 430 }, { "completion_length": 65.75, "epoch": 0.088, "grad_norm": 11.9375, "kl": 0.006597463192883879, "learning_rate": 2.2e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 440 }, { "completion_length": 68.975, "epoch": 0.09, "grad_norm": 2064.0, "kl": 405.5571417377796, "learning_rate": 2.25e-06, "loss": 0.0406, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 450 }, { "completion_length": 56.975, "epoch": 0.092, "grad_norm": 0.00038909912109375, "kl": 0.05630149020580575, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 460 }, { "completion_length": 61.075, "epoch": 0.094, "grad_norm": 0.000446319580078125, "kl": 0.005178993823938072, "learning_rate": 2.35e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 470 }, { "completion_length": 73.325, "epoch": 0.096, "grad_norm": 0.0002613067626953125, "kl": 4.652890888956608, "learning_rate": 2.4000000000000003e-06, "loss": 0.0005, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 480 }, { "completion_length": 68.625, "epoch": 0.098, "grad_norm": 0.000164031982421875, "kl": 15.9451186191116, "learning_rate": 2.4500000000000003e-06, "loss": 0.0016, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 490 }, { "completion_length": 82.025, "epoch": 0.1, "grad_norm": 0.000453948974609375, "kl": 0.006236090854508802, "learning_rate": 2.5e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 500 }, { "completion_length": 73.05, "epoch": 0.102, "grad_norm": 0.00064849853515625, "kl": 0.0709857388283126, "learning_rate": 2.55e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.15, "rewards/reward_func": -0.125, "step": 510 }, { "completion_length": 69.325, "epoch": 0.104, "grad_norm": 0.00048828125, "kl": 517.9831573915319, "learning_rate": 2.6e-06, "loss": 0.0518, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 520 }, { "completion_length": 57.95, "epoch": 0.106, "grad_norm": 0.00028228759765625, "kl": 0.006235601624939591, "learning_rate": 2.6500000000000005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 530 }, { "completion_length": 39.375, "epoch": 0.108, "grad_norm": 0.0023345947265625, "kl": 592.0278737243498, "learning_rate": 2.7000000000000004e-06, "loss": 0.0592, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 540 }, { "completion_length": 62.875, "epoch": 0.11, "grad_norm": 0.0003910064697265625, "kl": 0.004806909896433354, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 550 }, { "completion_length": 56.45, "epoch": 0.112, "grad_norm": 0.0002117156982421875, "kl": 0.004208524071145803, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 560 }, { "completion_length": 67.7, "epoch": 0.114, "grad_norm": 0.00016689300537109375, "kl": 3.4651028811815197, "learning_rate": 2.85e-06, "loss": 0.0003, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 570 }, { "completion_length": 51.525, "epoch": 0.116, "grad_norm": 0.040283203125, "kl": 0.03394674692535773, "learning_rate": 2.9e-06, "loss": 0.0, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 580 }, { "completion_length": 69.275, "epoch": 0.118, "grad_norm": 0.000823974609375, "kl": 0.0029081626795232295, "learning_rate": 2.95e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 590 }, { "completion_length": 71.975, "epoch": 0.12, "grad_norm": 0.0004711151123046875, "kl": 0.2187046888633631, "learning_rate": 3e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.20773502588272094, "rewards/reward_func": -0.125, "step": 600 }, { "completion_length": 61.85, "epoch": 0.122, "grad_norm": 24.25, "kl": 13.38376448857598, "learning_rate": 3.05e-06, "loss": 0.0013, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 610 }, { "completion_length": 53.5, "epoch": 0.124, "grad_norm": 0.0002498626708984375, "kl": 0.006577977701090277, "learning_rate": 3.1000000000000004e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 620 }, { "completion_length": 59.925, "epoch": 0.126, "grad_norm": 0.000690460205078125, "kl": 0.03976157886208966, "learning_rate": 3.1500000000000003e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 630 }, { "completion_length": 51.275, "epoch": 0.128, "grad_norm": 284.0, "kl": 128.89907464290735, "learning_rate": 3.2000000000000003e-06, "loss": 0.0129, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 640 }, { "completion_length": 57.525, "epoch": 0.13, "grad_norm": 0.000217437744140625, "kl": 0.15632477974286302, "learning_rate": 3.2500000000000002e-06, "loss": 0.0, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.1, "rewards/reward_func": -0.25, "step": 650 }, { "completion_length": 48.75, "epoch": 0.132, "grad_norm": 0.00035858154296875, "kl": 0.012460133875720203, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 660 }, { "completion_length": 46.375, "epoch": 0.134, "grad_norm": 0.000560760498046875, "kl": 0.005184722866397351, "learning_rate": 3.3500000000000005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 670 }, { "completion_length": 65.7, "epoch": 0.136, "grad_norm": 21.625, "kl": 0.03178581706015393, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 680 }, { "completion_length": 70.9, "epoch": 0.138, "grad_norm": 0.00025177001953125, "kl": 0.0047074495116248725, "learning_rate": 3.45e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 690 }, { "completion_length": 50.8, "epoch": 0.14, "grad_norm": 0.0001735687255859375, "kl": 2.7392968325410036, "learning_rate": 3.5e-06, "loss": 0.0003, "match_ratio": 0.725, "reward": -0.275, "reward_std": 0.05, "rewards/reward_func": -0.275, "step": 700 }, { "completion_length": 57.025, "epoch": 0.142, "grad_norm": 0.039794921875, "kl": 179.23230375794228, "learning_rate": 3.5500000000000003e-06, "loss": 0.0179, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 710 }, { "completion_length": 67.825, "epoch": 0.144, "grad_norm": 0.00042724609375, "kl": 0.07836510783527047, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 720 }, { "completion_length": 60.7, "epoch": 0.146, "grad_norm": 936.0, "kl": 213.31693772624712, "learning_rate": 3.65e-06, "loss": 0.0213, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.15, "rewards/reward_func": -0.125, "step": 730 }, { "completion_length": 56.275, "epoch": 0.148, "grad_norm": 0.00104522705078125, "kl": 0.2562748788390309, "learning_rate": 3.7e-06, "loss": 0.0, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.25, "step": 740 }, { "completion_length": 63.85, "epoch": 0.15, "grad_norm": 0.000640869140625, "kl": 0.869432492996566, "learning_rate": 3.7500000000000005e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 750 }, { "completion_length": 68.85, "epoch": 0.152, "grad_norm": 0.0002613067626953125, "kl": 0.003684952133335173, "learning_rate": 3.8000000000000005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 760 }, { "completion_length": 59.675, "epoch": 0.154, "grad_norm": 0.000286102294921875, "kl": 0.011132878507487476, "learning_rate": 3.85e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 770 }, { "completion_length": 56.925, "epoch": 0.156, "grad_norm": 0.000244140625, "kl": 431.3150466301013, "learning_rate": 3.900000000000001e-06, "loss": 0.0431, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 780 }, { "completion_length": 59.075, "epoch": 0.158, "grad_norm": 0.00026702880859375, "kl": 0.004525382234714925, "learning_rate": 3.95e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 790 }, { "completion_length": 41.475, "epoch": 0.16, "grad_norm": 0.000392913818359375, "kl": 0.8990198554703965, "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.25, "step": 800 }, { "completion_length": 49.35, "epoch": 0.162, "grad_norm": 0.0006103515625, "kl": 10258.552996213792, "learning_rate": 4.05e-06, "loss": 1.0259, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 810 }, { "completion_length": 47.675, "epoch": 0.164, "grad_norm": 0.00102996826171875, "kl": 3237.7607058377935, "learning_rate": 4.1e-06, "loss": 0.3238, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 820 }, { "completion_length": 63.375, "epoch": 0.166, "grad_norm": 0.0003414154052734375, "kl": 4.472107960679568, "learning_rate": 4.15e-06, "loss": 0.0004, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 830 }, { "completion_length": 70.375, "epoch": 0.168, "grad_norm": 0.00023174285888671875, "kl": 0.005188186629675328, "learning_rate": 4.2000000000000004e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 840 }, { "completion_length": 64.975, "epoch": 0.17, "grad_norm": 0.00022029876708984375, "kl": 0.019074565428309143, "learning_rate": 4.25e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 850 }, { "completion_length": 52.925, "epoch": 0.172, "grad_norm": 0.00029754638671875, "kl": 4.278230914589949, "learning_rate": 4.3e-06, "loss": 0.0004, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.1, "step": 860 }, { "completion_length": 66.075, "epoch": 0.174, "grad_norm": 0.00021457672119140625, "kl": 0.006756197474896908, "learning_rate": 4.350000000000001e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 870 }, { "completion_length": 62.075, "epoch": 0.176, "grad_norm": 7.9375, "kl": 1679.0881600409746, "learning_rate": 4.4e-06, "loss": 0.1679, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 880 }, { "completion_length": 57.475, "epoch": 0.178, "grad_norm": 0.000576019287109375, "kl": 0.06974834711290896, "learning_rate": 4.450000000000001e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 890 }, { "completion_length": 59.25, "epoch": 0.18, "grad_norm": 0.000835418701171875, "kl": 0.0047427960205823185, "learning_rate": 4.5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 900 }, { "completion_length": 49.925, "epoch": 0.182, "grad_norm": 0.000652313232421875, "kl": 0.0047499775420874355, "learning_rate": 4.5500000000000005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 910 }, { "completion_length": 63.825, "epoch": 0.184, "grad_norm": 72192.0, "kl": 24955.292986106546, "learning_rate": 4.600000000000001e-06, "loss": 2.4955, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 920 }, { "completion_length": 52.475, "epoch": 0.186, "grad_norm": 0.000362396240234375, "kl": 0.6700008324347436, "learning_rate": 4.65e-06, "loss": 0.0001, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 930 }, { "completion_length": 58.925, "epoch": 0.188, "grad_norm": 0.0002956390380859375, "kl": 0.019399669324047863, "learning_rate": 4.7e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 940 }, { "completion_length": 62.1, "epoch": 0.19, "grad_norm": 0.0019378662109375, "kl": 0.008174928580410778, "learning_rate": 4.75e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 950 }, { "completion_length": 57.75, "epoch": 0.192, "grad_norm": 0.0004138946533203125, "kl": 1.113649177318439, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 960 }, { "completion_length": 56.225, "epoch": 0.194, "grad_norm": 0.00069427490234375, "kl": 0.5510010560508818, "learning_rate": 4.85e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 970 }, { "completion_length": 47.175, "epoch": 0.196, "grad_norm": 0.0002155303955078125, "kl": 1441.3666083157761, "learning_rate": 4.9000000000000005e-06, "loss": 0.1441, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 980 }, { "completion_length": 46.85, "epoch": 0.198, "grad_norm": 0.000720977783203125, "kl": 1.4619831766700373, "learning_rate": 4.95e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 990 }, { "completion_length": 65.55, "epoch": 0.2, "grad_norm": 0.0003871917724609375, "kl": 7.501803689775988, "learning_rate": 5e-06, "loss": 0.0008, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 1000 }, { "completion_length": 62.975, "epoch": 0.202, "grad_norm": 15.5625, "kl": 0.10273585927207023, "learning_rate": 4.999984769144476e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1010 }, { "completion_length": 57.35, "epoch": 0.204, "grad_norm": 0.0004520416259765625, "kl": 3.634061038820073, "learning_rate": 4.999939076763487e-06, "loss": 0.0004, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 1020 }, { "completion_length": 59.675, "epoch": 0.206, "grad_norm": 0.000507354736328125, "kl": 0.45058006714098153, "learning_rate": 4.999862923413781e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 1030 }, { "completion_length": 44.05, "epoch": 0.208, "grad_norm": 0.0005645751953125, "kl": 0.08316081156954169, "learning_rate": 4.999756310023261e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1040 }, { "completion_length": 79.225, "epoch": 0.21, "grad_norm": 0.00121307373046875, "kl": 0.018303806148469447, "learning_rate": 4.9996192378909785e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 1050 }, { "completion_length": 57.4, "epoch": 0.212, "grad_norm": 0.00104522705078125, "kl": 17796.077124893247, "learning_rate": 4.999451708687114e-06, "loss": 1.7796, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1060 }, { "completion_length": 52.8, "epoch": 0.214, "grad_norm": 8.821487426757812e-05, "kl": 6.741311503923498, "learning_rate": 4.9992537244529585e-06, "loss": 0.0007, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 1070 }, { "completion_length": 56.675, "epoch": 0.216, "grad_norm": 0.00084686279296875, "kl": 0.024167609214782716, "learning_rate": 4.999025287600886e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1080 }, { "completion_length": 61.325, "epoch": 0.218, "grad_norm": 0.0004634857177734375, "kl": 0.03314157268032432, "learning_rate": 4.998766400914329e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1090 }, { "completion_length": 78.3, "epoch": 0.22, "grad_norm": 0.0003376007080078125, "kl": 0.01341487793251872, "learning_rate": 4.99847706754774e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1100 }, { "completion_length": 54.725, "epoch": 0.222, "grad_norm": 0.000560760498046875, "kl": 674.1007295364049, "learning_rate": 4.998157291026553e-06, "loss": 0.0674, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1110 }, { "completion_length": 54.8, "epoch": 0.224, "grad_norm": 0.00119781494140625, "kl": 10.044112924486399, "learning_rate": 4.997807075247147e-06, "loss": 0.001, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 1120 }, { "completion_length": 49.275, "epoch": 0.226, "grad_norm": 0.0010833740234375, "kl": 0.017626433167606592, "learning_rate": 4.997426424476787e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1130 }, { "completion_length": 57.625, "epoch": 0.228, "grad_norm": 0.0020599365234375, "kl": 0.44624101794324816, "learning_rate": 4.9970153433535855e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1140 }, { "completion_length": 61.475, "epoch": 0.23, "grad_norm": 0.00159454345703125, "kl": 0.052927281521260736, "learning_rate": 4.9965738368864345e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1150 }, { "completion_length": 71.8, "epoch": 0.232, "grad_norm": 0.00103759765625, "kl": 0.011898941779509186, "learning_rate": 4.996101910454953e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1160 }, { "completion_length": 56.775, "epoch": 0.234, "grad_norm": 0.006256103515625, "kl": 0.440633371565491, "learning_rate": 4.995599569809414e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 1170 }, { "completion_length": 61.0, "epoch": 0.236, "grad_norm": 0.00174713134765625, "kl": 0.04742448972538114, "learning_rate": 4.9950668210706795e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1180 }, { "completion_length": 61.45, "epoch": 0.238, "grad_norm": 16.625, "kl": 0.04350157366134226, "learning_rate": 4.994503670730126e-06, "loss": 0.0, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.1, "rewards/reward_func": -0.25, "step": 1190 }, { "completion_length": 66.725, "epoch": 0.24, "grad_norm": 0.00063323974609375, "kl": 0.013561246497556568, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1200 }, { "completion_length": 73.775, "epoch": 0.242, "grad_norm": 0.0018310546875, "kl": 0.008337400993332267, "learning_rate": 4.993286193061145e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1210 }, { "completion_length": 46.075, "epoch": 0.244, "grad_norm": 38.75, "kl": 0.10523022091947495, "learning_rate": 4.992631880567301e-06, "loss": 0.0, "match_ratio": 0.775, "reward": -0.225, "reward_std": 0.05, "rewards/reward_func": -0.225, "step": 1220 }, { "completion_length": 44.275, "epoch": 0.246, "grad_norm": 0.0004405975341796875, "kl": 0.289978933124803, "learning_rate": 4.991947196140619e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 1230 }, { "completion_length": 57.0, "epoch": 0.248, "grad_norm": 326.0, "kl": 595.9660109838471, "learning_rate": 4.9912321481237616e-06, "loss": 0.0596, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.15, "rewards/reward_func": -0.175, "step": 1240 }, { "completion_length": 82.45, "epoch": 0.25, "grad_norm": 0.0002574920654296875, "kl": 0.023628250462934375, "learning_rate": 4.990486745229364e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1250 }, { "completion_length": 78.125, "epoch": 0.252, "grad_norm": 0.0023651123046875, "kl": 0.013969364436343312, "learning_rate": 4.989710996539926e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1260 }, { "completion_length": 55.5, "epoch": 0.254, "grad_norm": 0.0003910064697265625, "kl": 2.1650808176025746, "learning_rate": 4.9889049115077e-06, "loss": 0.0002, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 1270 }, { "completion_length": 71.45, "epoch": 0.256, "grad_norm": 0.00060272216796875, "kl": 0.02477358910255134, "learning_rate": 4.988068499954578e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1280 }, { "completion_length": 57.5, "epoch": 0.258, "grad_norm": 0.0024261474609375, "kl": 0.028649515146389602, "learning_rate": 4.987201772071971e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1290 }, { "completion_length": 59.425, "epoch": 0.26, "grad_norm": 0.0003948211669921875, "kl": 0.02502227737568319, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1300 }, { "completion_length": 53.9, "epoch": 0.262, "grad_norm": 0.000690460205078125, "kl": 0.028165359469130635, "learning_rate": 4.985377409930789e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1310 }, { "completion_length": 53.075, "epoch": 0.264, "grad_norm": 0.00091552734375, "kl": 30.839417777769267, "learning_rate": 4.984419797901491e-06, "loss": 0.0031, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1320 }, { "completion_length": 71.55, "epoch": 0.266, "grad_norm": 0.00034332275390625, "kl": 0.011334103159606456, "learning_rate": 4.983431914000991e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1330 }, { "completion_length": 47.675, "epoch": 0.268, "grad_norm": 0.00049591064453125, "kl": 0.21036937911994755, "learning_rate": 4.9824137702663424e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1340 }, { "completion_length": 59.8, "epoch": 0.27, "grad_norm": 0.000598907470703125, "kl": 0.0196828240994364, "learning_rate": 4.981365379103306e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1350 }, { "completion_length": 57.1, "epoch": 0.272, "grad_norm": 0.000858306884765625, "kl": 0.010027467273175716, "learning_rate": 4.980286753286196e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1360 }, { "completion_length": 64.575, "epoch": 0.274, "grad_norm": 0.00262451171875, "kl": 0.03173879962414503, "learning_rate": 4.979177905957726e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1370 }, { "completion_length": 38.225, "epoch": 0.276, "grad_norm": 0.001953125, "kl": 0.10523775820620358, "learning_rate": 4.978038850628855e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1380 }, { "completion_length": 64.175, "epoch": 0.278, "grad_norm": 0.00165557861328125, "kl": 0.022827543993480505, "learning_rate": 4.9768696011786095e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1390 }, { "completion_length": 51.525, "epoch": 0.28, "grad_norm": 0.00408935546875, "kl": 0.040360532607883214, "learning_rate": 4.975670171853926e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1400 }, { "completion_length": 65.125, "epoch": 0.282, "grad_norm": 0.001129150390625, "kl": 0.010821055877022446, "learning_rate": 4.974440577269473e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1410 }, { "completion_length": 57.675, "epoch": 0.284, "grad_norm": 0.00049591064453125, "kl": 0.028958286670967937, "learning_rate": 4.973180832407471e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 1420 }, { "completion_length": 60.3, "epoch": 0.286, "grad_norm": 0.00051116943359375, "kl": 0.015607311762869358, "learning_rate": 4.971890952617515e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1430 }, { "completion_length": 90.0, "epoch": 0.288, "grad_norm": 0.000392913818359375, "kl": 0.007699519535526634, "learning_rate": 4.970570953616383e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1440 }, { "completion_length": 51.625, "epoch": 0.29, "grad_norm": 0.00067138671875, "kl": 0.05114179509691894, "learning_rate": 4.9692208514878445e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 1450 }, { "completion_length": 53.2, "epoch": 0.292, "grad_norm": 0.0004673004150390625, "kl": 0.015444098180159927, "learning_rate": 4.96784066268247e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1460 }, { "completion_length": 51.15, "epoch": 0.294, "grad_norm": 0.0004558563232421875, "kl": 0.028699404350481926, "learning_rate": 4.966430404017424e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1470 }, { "completion_length": 65.125, "epoch": 0.296, "grad_norm": 0.00144195556640625, "kl": 481.04375956221486, "learning_rate": 4.964990092676263e-06, "loss": 0.0481, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1480 }, { "completion_length": 54.275, "epoch": 0.298, "grad_norm": 0.000766754150390625, "kl": 0.056382374046370386, "learning_rate": 4.963519746208726e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 1490 }, { "completion_length": 64.05, "epoch": 0.3, "grad_norm": 0.000438690185546875, "kl": 0.3169886400224641, "learning_rate": 4.962019382530521e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1500 }, { "completion_length": 62.525, "epoch": 0.302, "grad_norm": 0.00079345703125, "kl": 0.022879413142800332, "learning_rate": 4.960489019923105e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1510 }, { "completion_length": 46.15, "epoch": 0.304, "grad_norm": 29.25, "kl": 0.05621479714754969, "learning_rate": 4.958928677033465e-06, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 1520 }, { "completion_length": 55.775, "epoch": 0.306, "grad_norm": 0.0005035400390625, "kl": 0.23808469655923545, "learning_rate": 4.957338372873886e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 1530 }, { "completion_length": 54.0, "epoch": 0.308, "grad_norm": 0.000518798828125, "kl": 0.052838593162596224, "learning_rate": 4.9557181268217225e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1540 }, { "completion_length": 59.875, "epoch": 0.31, "grad_norm": 7.0, "kl": 0.020038261311128736, "learning_rate": 4.9540679586191605e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 1550 }, { "completion_length": 48.025, "epoch": 0.312, "grad_norm": 0.0013885498046875, "kl": 0.037048061547102405, "learning_rate": 4.9523878883729794e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1560 }, { "completion_length": 71.175, "epoch": 0.314, "grad_norm": 0.000621795654296875, "kl": 0.011885163560509681, "learning_rate": 4.9506779365543054e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1570 }, { "completion_length": 55.575, "epoch": 0.316, "grad_norm": 0.000339508056640625, "kl": 0.022388620488345622, "learning_rate": 4.94893812399836e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 1580 }, { "completion_length": 59.975, "epoch": 0.318, "grad_norm": 0.0004520416259765625, "kl": 0.009310156595893205, "learning_rate": 4.947168471904213e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1590 }, { "completion_length": 61.125, "epoch": 0.32, "grad_norm": 0.000823974609375, "kl": 0.016036251094192266, "learning_rate": 4.9453690018345144e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1600 }, { "completion_length": 66.625, "epoch": 0.322, "grad_norm": 0.034912109375, "kl": 0.03369634412229061, "learning_rate": 4.9435397357152406e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1610 }, { "completion_length": 58.8, "epoch": 0.324, "grad_norm": 0.0047607421875, "kl": 0.0254040343221277, "learning_rate": 4.9416806958354206e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1620 }, { "completion_length": 46.0, "epoch": 0.326, "grad_norm": 0.0011138916015625, "kl": 0.07715323262382298, "learning_rate": 4.939791904846869e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1630 }, { "completion_length": 46.125, "epoch": 0.328, "grad_norm": 0.00060272216796875, "kl": 0.01129569010809064, "learning_rate": 4.937873385763909e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1640 }, { "completion_length": 66.1, "epoch": 0.33, "grad_norm": 0.0003204345703125, "kl": 0.008372989785857498, "learning_rate": 4.935925161963089e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1650 }, { "completion_length": 56.675, "epoch": 0.332, "grad_norm": 0.0007171630859375, "kl": 0.01157067040912807, "learning_rate": 4.933947257182901e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1660 }, { "completion_length": 44.15, "epoch": 0.334, "grad_norm": 0.00087738037109375, "kl": 0.016125927586108445, "learning_rate": 4.9319396955234925e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1670 }, { "completion_length": 64.3, "epoch": 0.336, "grad_norm": 0.0004673004150390625, "kl": 0.008596798940561711, "learning_rate": 4.9299025014463665e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1680 }, { "completion_length": 57.15, "epoch": 0.338, "grad_norm": 0.000705718994140625, "kl": 0.08002093653194606, "learning_rate": 4.92783569977409e-06, "loss": 0.0, "match_ratio": 0.65, "reward": -0.35, "reward_std": 0.1, "rewards/reward_func": -0.35, "step": 1690 }, { "completion_length": 52.65, "epoch": 0.34, "grad_norm": 0.0004558563232421875, "kl": 0.020256173936650156, "learning_rate": 4.925739315689991e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1700 }, { "completion_length": 58.75, "epoch": 0.342, "grad_norm": 0.00150299072265625, "kl": 0.02677068072371185, "learning_rate": 4.923613374737848e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1710 }, { "completion_length": 55.0, "epoch": 0.344, "grad_norm": 0.002227783203125, "kl": 0.03761114357039332, "learning_rate": 4.921457902821578e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 1720 }, { "completion_length": 54.575, "epoch": 0.346, "grad_norm": 23.125, "kl": 0.01751216114498675, "learning_rate": 4.9192729262049285e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1730 }, { "completion_length": 60.45, "epoch": 0.348, "grad_norm": 0.00075531005859375, "kl": 0.011810581240570172, "learning_rate": 4.917058471511149e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1740 }, { "completion_length": 45.65, "epoch": 0.35, "grad_norm": 0.004730224609375, "kl": 0.040744514157995584, "learning_rate": 4.914814565722671e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1750 }, { "completion_length": 59.5, "epoch": 0.352, "grad_norm": 0.000579833984375, "kl": 0.01578701629769057, "learning_rate": 4.912541236180779e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1760 }, { "completion_length": 50.7, "epoch": 0.354, "grad_norm": 0.000629425048828125, "kl": 0.11596511220559477, "learning_rate": 4.910238510585275e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1770 }, { "completion_length": 48.8, "epoch": 0.356, "grad_norm": 46.75, "kl": 0.2281810746062547, "learning_rate": 4.907906416994146e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1780 }, { "completion_length": 57.4, "epoch": 0.358, "grad_norm": 0.00173187255859375, "kl": 0.02588364710099995, "learning_rate": 4.905544983823214e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1790 }, { "completion_length": 39.475, "epoch": 0.36, "grad_norm": 0.00136566162109375, "kl": 0.12392290020361543, "learning_rate": 4.903154239845798e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1800 }, { "completion_length": 57.725, "epoch": 0.362, "grad_norm": 0.0037689208984375, "kl": 0.7656503105536103, "learning_rate": 4.900734214192358e-06, "loss": 0.0001, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.1, "rewards/reward_func": -0.25, "step": 1810 }, { "completion_length": 51.025, "epoch": 0.364, "grad_norm": 0.00032806396484375, "kl": 1.4439625646919012, "learning_rate": 4.898284936350144e-06, "loss": 0.0001, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1820 }, { "completion_length": 83.85, "epoch": 0.366, "grad_norm": 0.00061798095703125, "kl": 0.013502365676686168, "learning_rate": 4.8958064361628334e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1830 }, { "completion_length": 57.3, "epoch": 0.368, "grad_norm": 0.00040435791015625, "kl": 0.02529239854775369, "learning_rate": 4.893298743830168e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1840 }, { "completion_length": 57.875, "epoch": 0.37, "grad_norm": 0.00067901611328125, "kl": 0.013202862720936537, "learning_rate": 4.890761889907589e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1850 }, { "completion_length": 61.725, "epoch": 0.372, "grad_norm": 0.00250244140625, "kl": 2.53927280055359, "learning_rate": 4.888195905305859e-06, "loss": 0.0003, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 1860 }, { "completion_length": 57.6, "epoch": 0.374, "grad_norm": 0.0003833770751953125, "kl": 0.02102891537360847, "learning_rate": 4.885600821290692e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1870 }, { "completion_length": 53.675, "epoch": 0.376, "grad_norm": 0.000583648681640625, "kl": 0.0170896818395704, "learning_rate": 4.882976669482368e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1880 }, { "completion_length": 47.525, "epoch": 0.378, "grad_norm": 0.0037384033203125, "kl": 0.01905378680676222, "learning_rate": 4.880323481855347e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1890 }, { "completion_length": 48.275, "epoch": 0.38, "grad_norm": 0.08203125, "kl": 0.1403908584266901, "learning_rate": 4.8776412907378845e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1900 }, { "completion_length": 73.425, "epoch": 0.382, "grad_norm": 0.000560760498046875, "kl": 0.007492217188701034, "learning_rate": 4.874930128811631e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1910 }, { "completion_length": 67.0, "epoch": 0.384, "grad_norm": 0.00122833251953125, "kl": 0.014595681196078658, "learning_rate": 4.8721900291112415e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1920 }, { "completion_length": 65.125, "epoch": 0.386, "grad_norm": 0.000484466552734375, "kl": 0.01310229734517634, "learning_rate": 4.869421025023965e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 1930 }, { "completion_length": 67.325, "epoch": 0.388, "grad_norm": 0.00048065185546875, "kl": 0.012927077431231736, "learning_rate": 4.866623150289241e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 1940 }, { "completion_length": 53.425, "epoch": 0.39, "grad_norm": 0.001068115234375, "kl": 0.009116059914231301, "learning_rate": 4.863796438998293e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1950 }, { "completion_length": 58.7, "epoch": 0.392, "grad_norm": 0.00176239013671875, "kl": 0.22789982098620384, "learning_rate": 4.860940925593703e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 1960 }, { "completion_length": 76.2, "epoch": 0.394, "grad_norm": 0.00179290771484375, "kl": 0.010708777070976793, "learning_rate": 4.858056644869002e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 1970 }, { "completion_length": 53.675, "epoch": 0.396, "grad_norm": 0.0010528564453125, "kl": 0.033514925348572436, "learning_rate": 4.855143631968242e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 1980 }, { "completion_length": 48.75, "epoch": 0.398, "grad_norm": 0.0004596710205078125, "kl": 0.02880375348031521, "learning_rate": 4.852201922385564e-06, "loss": 0.0, "match_ratio": 0.775, "reward": -0.225, "reward_std": 0.05, "rewards/reward_func": -0.225, "step": 1990 }, { "completion_length": 56.05, "epoch": 0.4, "grad_norm": 7.82012939453125e-05, "kl": 0.020885943528264762, "learning_rate": 4.849231551964771e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2000 }, { "completion_length": 57.825, "epoch": 0.402, "grad_norm": 0.0007781982421875, "kl": 0.013335178885608912, "learning_rate": 4.84623255689889e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2010 }, { "completion_length": 76.475, "epoch": 0.404, "grad_norm": 0.000461578369140625, "kl": 0.013225622242316604, "learning_rate": 4.84320497372973e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2020 }, { "completion_length": 65.45, "epoch": 0.406, "grad_norm": 0.0016632080078125, "kl": 0.020334804011508823, "learning_rate": 4.840148839347434e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 2030 }, { "completion_length": 87.85, "epoch": 0.408, "grad_norm": 0.000423431396484375, "kl": 0.020761342905461787, "learning_rate": 4.837064190990036e-06, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 2040 }, { "completion_length": 60.15, "epoch": 0.41, "grad_norm": 0.000713348388671875, "kl": 0.01624767268076539, "learning_rate": 4.833951066243004e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2050 }, { "completion_length": 52.075, "epoch": 0.412, "grad_norm": 0.007171630859375, "kl": 0.039952522004023196, "learning_rate": 4.830809503038781e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2060 }, { "completion_length": 64.925, "epoch": 0.414, "grad_norm": 0.00063323974609375, "kl": 0.4387159863486886, "learning_rate": 4.8276395396563215e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2070 }, { "completion_length": 52.35, "epoch": 0.416, "grad_norm": 0.000301361083984375, "kl": 0.018893744330853224, "learning_rate": 4.824441214720629e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2080 }, { "completion_length": 72.125, "epoch": 0.418, "grad_norm": 0.000850677490234375, "kl": 0.014612970128655433, "learning_rate": 4.821214567202284e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2090 }, { "completion_length": 62.525, "epoch": 0.42, "grad_norm": 0.003662109375, "kl": 0.11677258219569922, "learning_rate": 4.817959636416969e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2100 }, { "completion_length": 41.975, "epoch": 0.422, "grad_norm": 0.0006256103515625, "kl": 0.014599576778709888, "learning_rate": 4.814676462024988e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2110 }, { "completion_length": 61.65, "epoch": 0.424, "grad_norm": 0.00052642822265625, "kl": 0.010790122766047716, "learning_rate": 4.811365084030784e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2120 }, { "completion_length": 45.65, "epoch": 0.426, "grad_norm": 0.00095367431640625, "kl": 0.011601420305669307, "learning_rate": 4.808025542782453e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2130 }, { "completion_length": 64.25, "epoch": 0.428, "grad_norm": 0.0003986358642578125, "kl": 0.5641481504775584, "learning_rate": 4.804657878971252e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2140 }, { "completion_length": 56.675, "epoch": 0.43, "grad_norm": 0.000957489013671875, "kl": 0.013445794116705656, "learning_rate": 4.801262133631101e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2150 }, { "completion_length": 75.425, "epoch": 0.432, "grad_norm": 0.00055694580078125, "kl": 0.012692990363575518, "learning_rate": 4.7978383481380865e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2160 }, { "completion_length": 62.375, "epoch": 0.434, "grad_norm": 6.96875, "kl": 0.023554344521835448, "learning_rate": 4.794386564209953e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2170 }, { "completion_length": 65.475, "epoch": 0.436, "grad_norm": 0.0003337860107421875, "kl": 0.06298564318567515, "learning_rate": 4.790906823905599e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 2180 }, { "completion_length": 59.8, "epoch": 0.438, "grad_norm": 0.000537872314453125, "kl": 0.013637619884684682, "learning_rate": 4.787399169624562e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2190 }, { "completion_length": 54.875, "epoch": 0.44, "grad_norm": 0.000759124755859375, "kl": 0.01425664583221078, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2200 }, { "completion_length": 50.1, "epoch": 0.442, "grad_norm": 0.001800537109375, "kl": 0.0958622452802956, "learning_rate": 4.780300290430683e-06, "loss": 0.0, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 2210 }, { "completion_length": 66.8, "epoch": 0.444, "grad_norm": 0.00043487548828125, "kl": 0.00994320074096322, "learning_rate": 4.776709152015443e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2220 }, { "completion_length": 73.9, "epoch": 0.446, "grad_norm": 0.000705718994140625, "kl": 0.016998659167438746, "learning_rate": 4.773090272617672e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2230 }, { "completion_length": 65.45, "epoch": 0.448, "grad_norm": 0.00054931640625, "kl": 0.015969987539574505, "learning_rate": 4.769443696332272e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2240 }, { "completion_length": 58.0, "epoch": 0.45, "grad_norm": 0.000514984130859375, "kl": 0.05210723381023854, "learning_rate": 4.765769467591626e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 2250 }, { "completion_length": 55.35, "epoch": 0.452, "grad_norm": 0.0002841949462890625, "kl": 0.2783783482853323, "learning_rate": 4.762067631165049e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 2260 }, { "completion_length": 60.1, "epoch": 0.454, "grad_norm": 0.00119781494140625, "kl": 0.05332662384025753, "learning_rate": 4.7583382321582525e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2270 }, { "completion_length": 59.625, "epoch": 0.456, "grad_norm": 0.0014495849609375, "kl": 0.015380131197161973, "learning_rate": 4.754581316012785e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2280 }, { "completion_length": 56.1, "epoch": 0.458, "grad_norm": 0.000885009765625, "kl": 0.04605462467297912, "learning_rate": 4.750796928505484e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2290 }, { "completion_length": 76.35, "epoch": 0.46, "grad_norm": 0.000579833984375, "kl": 0.053115089796483515, "learning_rate": 4.746985115747918e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2300 }, { "completion_length": 50.8, "epoch": 0.462, "grad_norm": 0.000637054443359375, "kl": 0.01561843790113926, "learning_rate": 4.743145924185821e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2310 }, { "completion_length": 65.575, "epoch": 0.464, "grad_norm": 0.00061798095703125, "kl": 0.01594538043718785, "learning_rate": 4.7392794005985324e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2320 }, { "completion_length": 44.15, "epoch": 0.466, "grad_norm": 0.0005340576171875, "kl": 12.877768159005791, "learning_rate": 4.735385592098421e-06, "loss": 0.0013, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 2330 }, { "completion_length": 51.45, "epoch": 0.468, "grad_norm": 0.00055694580078125, "kl": 0.019627093384042383, "learning_rate": 4.731464546130315e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2340 }, { "completion_length": 67.975, "epoch": 0.47, "grad_norm": 0.0024261474609375, "kl": 0.018453579442575575, "learning_rate": 4.72751631047092e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 2350 }, { "completion_length": 73.7, "epoch": 0.472, "grad_norm": 0.00049591064453125, "kl": 0.011441022157669067, "learning_rate": 4.723540933228245e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2360 }, { "completion_length": 68.525, "epoch": 0.474, "grad_norm": 0.000537872314453125, "kl": 0.010118643706664442, "learning_rate": 4.719538462841003e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2370 }, { "completion_length": 42.325, "epoch": 0.476, "grad_norm": 0.00110626220703125, "kl": 23.094405939802527, "learning_rate": 4.715508948078037e-06, "loss": 0.0023, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 2380 }, { "completion_length": 51.85, "epoch": 0.478, "grad_norm": 0.0002574920654296875, "kl": 0.01785165797919035, "learning_rate": 4.71145243803771e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2390 }, { "completion_length": 76.575, "epoch": 0.48, "grad_norm": 0.00049591064453125, "kl": 0.02041715644299984, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2400 }, { "completion_length": 53.65, "epoch": 0.482, "grad_norm": 0.0007171630859375, "kl": 0.011072598048485816, "learning_rate": 4.703258630162481e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2410 }, { "completion_length": 45.65, "epoch": 0.484, "grad_norm": 0.0010986328125, "kl": 1306590.1205121286, "learning_rate": 4.699121432166542e-06, "loss": 130.659, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2420 }, { "completion_length": 66.625, "epoch": 0.486, "grad_norm": 0.0004425048828125, "kl": 0.018537986697629093, "learning_rate": 4.6949574385699514e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2430 }, { "completion_length": 50.825, "epoch": 0.488, "grad_norm": 0.00121307373046875, "kl": 0.027628638222813605, "learning_rate": 4.690766700109659e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2440 }, { "completion_length": 72.275, "epoch": 0.49, "grad_norm": 0.0006103515625, "kl": 0.012821279000490904, "learning_rate": 4.68654926784849e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2450 }, { "completion_length": 59.025, "epoch": 0.492, "grad_norm": 0.0006256103515625, "kl": 1.4881786234676837, "learning_rate": 4.682305193174524e-06, "loss": 0.0001, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 2460 }, { "completion_length": 70.65, "epoch": 0.494, "grad_norm": 32.25, "kl": 0.13413287354633213, "learning_rate": 4.6780345278004744e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2470 }, { "completion_length": 60.875, "epoch": 0.496, "grad_norm": 0.00067138671875, "kl": 0.022409677878022193, "learning_rate": 4.673737323763048e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2480 }, { "completion_length": 57.875, "epoch": 0.498, "grad_norm": 18.75, "kl": 0.49180023511871696, "learning_rate": 4.669413633422322e-06, "loss": 0.0, "match_ratio": 0.775, "reward": -0.225, "reward_std": 0.05, "rewards/reward_func": -0.225, "step": 2490 }, { "completion_length": 63.95, "epoch": 0.5, "grad_norm": 0.000720977783203125, "kl": 0.0207068151794374, "learning_rate": 4.665063509461098e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2500 }, { "completion_length": 70.75, "epoch": 0.502, "grad_norm": 0.0004634857177734375, "kl": 0.01583680328913033, "learning_rate": 4.6606870048842626e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2510 }, { "completion_length": 41.6, "epoch": 0.504, "grad_norm": 0.001251220703125, "kl": 0.024196008208673448, "learning_rate": 4.656284173018144e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2520 }, { "completion_length": 60.475, "epoch": 0.506, "grad_norm": 0.0008697509765625, "kl": 0.02790404809638858, "learning_rate": 4.65185506750986e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2530 }, { "completion_length": 63.1, "epoch": 0.508, "grad_norm": 0.0005950927734375, "kl": 0.037049750238656996, "learning_rate": 4.6473997423266615e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2540 }, { "completion_length": 64.025, "epoch": 0.51, "grad_norm": 0.00121307373046875, "kl": 0.05311856884509325, "learning_rate": 4.642918251755281e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2550 }, { "completion_length": 48.925, "epoch": 0.512, "grad_norm": 0.00482177734375, "kl": 0.07079303860664368, "learning_rate": 4.638410650401267e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2560 }, { "completion_length": 61.8, "epoch": 0.514, "grad_norm": 0.0010986328125, "kl": 0.024926586542278528, "learning_rate": 4.633876993188319e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2570 }, { "completion_length": 49.6, "epoch": 0.516, "grad_norm": 0.000766754150390625, "kl": 0.019125528051517904, "learning_rate": 4.62931733535762e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2580 }, { "completion_length": 66.35, "epoch": 0.518, "grad_norm": 0.00159454345703125, "kl": 0.020169223845005035, "learning_rate": 4.62473173246716e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2590 }, { "completion_length": 55.7, "epoch": 0.52, "grad_norm": 0.00067138671875, "kl": 0.24018120649270713, "learning_rate": 4.620120240391065e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2600 }, { "completion_length": 57.725, "epoch": 0.522, "grad_norm": 0.00087738037109375, "kl": 2.7745140019804237, "learning_rate": 4.6154829153189105e-06, "loss": 0.0003, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 2610 }, { "completion_length": 48.7, "epoch": 0.524, "grad_norm": 0.000659942626953125, "kl": 0.023752238228917123, "learning_rate": 4.610819813755038e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2620 }, { "completion_length": 57.225, "epoch": 0.526, "grad_norm": 0.00150299072265625, "kl": 0.02745365663431585, "learning_rate": 4.60613099251787e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2630 }, { "completion_length": 48.5, "epoch": 0.528, "grad_norm": 0.00133514404296875, "kl": 0.021794071048498155, "learning_rate": 4.601416508739211e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2640 }, { "completion_length": 61.7, "epoch": 0.53, "grad_norm": 0.000823974609375, "kl": 0.01592640457674861, "learning_rate": 4.596676419863561e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2650 }, { "completion_length": 55.825, "epoch": 0.532, "grad_norm": 0.000431060791015625, "kl": 4393.635703391675, "learning_rate": 4.591910783647405e-06, "loss": 0.4394, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2660 }, { "completion_length": 42.725, "epoch": 0.534, "grad_norm": 0.0005340576171875, "kl": 3.308469070494175, "learning_rate": 4.587119658158517e-06, "loss": 0.0003, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2670 }, { "completion_length": 53.675, "epoch": 0.536, "grad_norm": 0.000362396240234375, "kl": 0.017304986575618388, "learning_rate": 4.582303101775249e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2680 }, { "completion_length": 46.825, "epoch": 0.538, "grad_norm": 0.000438690185546875, "kl": 1.4977983684279024, "learning_rate": 4.577461173185821e-06, "loss": 0.0001, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2690 }, { "completion_length": 49.425, "epoch": 0.54, "grad_norm": 0.000637054443359375, "kl": 0.04018927337601781, "learning_rate": 4.572593931387604e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2700 }, { "completion_length": 61.475, "epoch": 0.542, "grad_norm": 0.00025177001953125, "kl": 0.029250907758250833, "learning_rate": 4.567701435686405e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2710 }, { "completion_length": 61.3, "epoch": 0.544, "grad_norm": 0.00107574462890625, "kl": 0.041718969354406, "learning_rate": 4.562783745695738e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 2720 }, { "completion_length": 71.225, "epoch": 0.546, "grad_norm": 0.00101470947265625, "kl": 0.017608029022812843, "learning_rate": 4.5578409213361055e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2730 }, { "completion_length": 68.825, "epoch": 0.548, "grad_norm": 0.00084686279296875, "kl": 0.01787120271474123, "learning_rate": 4.55287302283426e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2740 }, { "completion_length": 40.975, "epoch": 0.55, "grad_norm": 0.0004405975341796875, "kl": 0.08943550041876733, "learning_rate": 4.54788011072248e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2750 }, { "completion_length": 60.275, "epoch": 0.552, "grad_norm": 0.000888824462890625, "kl": 0.012318810448050499, "learning_rate": 4.542862245837821e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2760 }, { "completion_length": 50.225, "epoch": 0.554, "grad_norm": 0.000396728515625, "kl": 0.1667893348261714, "learning_rate": 4.537819489321385e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2770 }, { "completion_length": 71.5, "epoch": 0.556, "grad_norm": 0.000873565673828125, "kl": 0.009898501250427216, "learning_rate": 4.5327519026175694e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2780 }, { "completion_length": 66.525, "epoch": 0.558, "grad_norm": 0.0013275146484375, "kl": 0.016875687218271197, "learning_rate": 4.527659547473317e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2790 }, { "completion_length": 54.8, "epoch": 0.56, "grad_norm": 0.000308990478515625, "kl": 0.40478452597744763, "learning_rate": 4.522542485937369e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 2800 }, { "completion_length": 63.175, "epoch": 0.562, "grad_norm": 0.002197265625, "kl": 0.01588670499622822, "learning_rate": 4.517400780359505e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2810 }, { "completion_length": 66.025, "epoch": 0.564, "grad_norm": 0.0015411376953125, "kl": 0.9499945601448416, "learning_rate": 4.512234493389785e-06, "loss": 0.0001, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 2820 }, { "completion_length": 52.925, "epoch": 0.566, "grad_norm": 0.000408172607421875, "kl": 0.7477384469937534, "learning_rate": 4.507043687977787e-06, "loss": 0.0001, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 2830 }, { "completion_length": 54.05, "epoch": 0.568, "grad_norm": 0.00145721435546875, "kl": 0.02171561080031097, "learning_rate": 4.501828427371834e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2840 }, { "completion_length": 67.425, "epoch": 0.57, "grad_norm": 0.00038909912109375, "kl": 46.97684473299887, "learning_rate": 4.496588775118232e-06, "loss": 0.0047, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 2850 }, { "completion_length": 49.9, "epoch": 0.572, "grad_norm": 0.000713348388671875, "kl": 0.04041039999574423, "learning_rate": 4.491324795060491e-06, "loss": 0.0, "match_ratio": 0.7, "reward": -0.3, "reward_std": 0.0, "rewards/reward_func": -0.3, "step": 2860 }, { "completion_length": 46.8, "epoch": 0.574, "grad_norm": 0.0003948211669921875, "kl": 0.013312188815325499, "learning_rate": 4.4860365513385456e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2870 }, { "completion_length": 46.825, "epoch": 0.576, "grad_norm": 0.0003490447998046875, "kl": 0.4708886262029409, "learning_rate": 4.4807241083879774e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 2880 }, { "completion_length": 65.625, "epoch": 0.578, "grad_norm": 0.00121307373046875, "kl": 0.0194290304556489, "learning_rate": 4.475387530939226e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2890 }, { "completion_length": 56.075, "epoch": 0.58, "grad_norm": 0.000579833984375, "kl": 0.04038618067279458, "learning_rate": 4.470026884016805e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 2900 }, { "completion_length": 47.725, "epoch": 0.582, "grad_norm": 0.00043487548828125, "kl": 16.41964945977088, "learning_rate": 4.464642232938505e-06, "loss": 0.0016, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 2910 }, { "completion_length": 63.9, "epoch": 0.584, "grad_norm": 0.0013885498046875, "kl": 0.01390684423968196, "learning_rate": 4.4592336433146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2920 }, { "completion_length": 44.3, "epoch": 0.586, "grad_norm": 0.00023651123046875, "kl": 0.010492815752513707, "learning_rate": 4.453801181047047e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2930 }, { "completion_length": 72.45, "epoch": 0.588, "grad_norm": 0.00032806396484375, "kl": 0.01113151153549552, "learning_rate": 4.448344912328686e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2940 }, { "completion_length": 61.05, "epoch": 0.59, "grad_norm": 0.0004787445068359375, "kl": 0.008429582207463681, "learning_rate": 4.442864903642428e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2950 }, { "completion_length": 51.9, "epoch": 0.592, "grad_norm": 0.00080108642578125, "kl": 0.0231597448233515, "learning_rate": 4.437361221760449e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2960 }, { "completion_length": 47.825, "epoch": 0.594, "grad_norm": 0.0002899169921875, "kl": 0.03416364281438291, "learning_rate": 4.431833933743378e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 2970 }, { "completion_length": 54.475, "epoch": 0.596, "grad_norm": 29.75, "kl": 25.77551784273237, "learning_rate": 4.426283106939474e-06, "loss": 0.0026, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 2980 }, { "completion_length": 61.175, "epoch": 0.598, "grad_norm": 0.00060272216796875, "kl": 0.010949767334386707, "learning_rate": 4.420708808983809e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 2990 }, { "completion_length": 58.05, "epoch": 0.6, "grad_norm": 0.00049591064453125, "kl": 0.03518106024712324, "learning_rate": 4.415111107797445e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3000 }, { "completion_length": 48.7, "epoch": 0.602, "grad_norm": 0.0006103515625, "kl": 0.015348105784505605, "learning_rate": 4.409490071586606e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3010 }, { "completion_length": 59.9, "epoch": 0.604, "grad_norm": 0.0004119873046875, "kl": 0.011616118438541888, "learning_rate": 4.403845768841842e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3020 }, { "completion_length": 69.0, "epoch": 0.606, "grad_norm": 0.000640869140625, "kl": 0.011708037834614516, "learning_rate": 4.398178268337202e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3030 }, { "completion_length": 52.125, "epoch": 0.608, "grad_norm": 0.000896453857421875, "kl": 0.04663766893791035, "learning_rate": 4.3924876391293915e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3040 }, { "completion_length": 54.175, "epoch": 0.61, "grad_norm": 0.000701904296875, "kl": 9.442724062688649, "learning_rate": 4.386773950556931e-06, "loss": 0.0009, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 3050 }, { "completion_length": 51.075, "epoch": 0.612, "grad_norm": 25.75, "kl": 18.313499209098516, "learning_rate": 4.381037272239311e-06, "loss": 0.0018, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 3060 }, { "completion_length": 59.45, "epoch": 0.614, "grad_norm": 0.00145721435546875, "kl": 0.05719580026343465, "learning_rate": 4.3752776740761495e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 3070 }, { "completion_length": 59.75, "epoch": 0.616, "grad_norm": 0.000865936279296875, "kl": 955.0316817238461, "learning_rate": 4.36949522624633e-06, "loss": 0.0955, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.1, "rewards/reward_func": -0.15, "step": 3080 }, { "completion_length": 49.25, "epoch": 0.618, "grad_norm": 0.0947265625, "kl": 0.12424529809504747, "learning_rate": 4.3636899992071555e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 3090 }, { "completion_length": 63.45, "epoch": 0.62, "grad_norm": 0.000774383544921875, "kl": 0.021155705489218236, "learning_rate": 4.357862063693486e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3100 }, { "completion_length": 58.675, "epoch": 0.622, "grad_norm": 0.00130462646484375, "kl": 0.03269129507243633, "learning_rate": 4.352011490716875e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3110 }, { "completion_length": 51.25, "epoch": 0.624, "grad_norm": 0.000720977783203125, "kl": 0.01852965746074915, "learning_rate": 4.346138351564711e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3120 }, { "completion_length": 42.025, "epoch": 0.626, "grad_norm": 0.0014495849609375, "kl": 0.13662478388287128, "learning_rate": 4.340242717799337e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 3130 }, { "completion_length": 55.225, "epoch": 0.628, "grad_norm": 0.0002536773681640625, "kl": 0.011004617274738848, "learning_rate": 4.334324661257191e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3140 }, { "completion_length": 36.075, "epoch": 0.63, "grad_norm": 0.000701904296875, "kl": 0.03598860376514494, "learning_rate": 4.328384254047927e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3150 }, { "completion_length": 70.7, "epoch": 0.632, "grad_norm": 0.0004825592041015625, "kl": 0.014078293647617101, "learning_rate": 4.322421568553529e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3160 }, { "completion_length": 41.4, "epoch": 0.634, "grad_norm": 0.00093841552734375, "kl": 0.02731174589134753, "learning_rate": 4.316436677427441e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3170 }, { "completion_length": 62.625, "epoch": 0.636, "grad_norm": 0.00035858154296875, "kl": 0.015678783506155015, "learning_rate": 4.3104296535936695e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3180 }, { "completion_length": 61.0, "epoch": 0.638, "grad_norm": 0.00091552734375, "kl": 0.019854954723268748, "learning_rate": 4.3044005702459055e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3190 }, { "completion_length": 52.95, "epoch": 0.64, "grad_norm": 0.000514984130859375, "kl": 0.13356436253525317, "learning_rate": 4.2983495008466285e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 3200 }, { "completion_length": 66.475, "epoch": 0.642, "grad_norm": 0.000316619873046875, "kl": 0.013267815671861171, "learning_rate": 4.2922765191262075e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3210 }, { "completion_length": 66.475, "epoch": 0.644, "grad_norm": 0.000701904296875, "kl": 0.07887385552749038, "learning_rate": 4.286181699082008e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 3220 }, { "completion_length": 61.55, "epoch": 0.646, "grad_norm": 0.000118255615234375, "kl": 0.011934885568916798, "learning_rate": 4.280065114977492e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3230 }, { "completion_length": 61.625, "epoch": 0.648, "grad_norm": 0.0004558563232421875, "kl": 0.015098626213148236, "learning_rate": 4.273926841341303e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3240 }, { "completion_length": 47.7, "epoch": 0.65, "grad_norm": 0.00048065185546875, "kl": 0.017614057380706073, "learning_rate": 4.267766952966369e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3250 }, { "completion_length": 47.35, "epoch": 0.652, "grad_norm": 0.000911712646484375, "kl": 19.53749562408775, "learning_rate": 4.261585524908987e-06, "loss": 0.002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 3260 }, { "completion_length": 67.85, "epoch": 0.654, "grad_norm": 0.0004367828369140625, "kl": 0.011493841698393226, "learning_rate": 4.255382632487907e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3270 }, { "completion_length": 43.525, "epoch": 0.656, "grad_norm": 0.0004062652587890625, "kl": 0.06476088264025748, "learning_rate": 4.249158351283414e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 3280 }, { "completion_length": 50.575, "epoch": 0.658, "grad_norm": 0.000812530517578125, "kl": 0.044670914835296574, "learning_rate": 4.242912757136412e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3290 }, { "completion_length": 48.95, "epoch": 0.66, "grad_norm": 0.054931640625, "kl": 0.05593093540519476, "learning_rate": 4.236645926147493e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3300 }, { "completion_length": 55.425, "epoch": 0.662, "grad_norm": 0.0004024505615234375, "kl": 0.02145648035220802, "learning_rate": 4.230357934676017e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3310 }, { "completion_length": 62.2, "epoch": 0.664, "grad_norm": 0.000499725341796875, "kl": 0.012483126670122146, "learning_rate": 4.224048859339175e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3320 }, { "completion_length": 61.9, "epoch": 0.666, "grad_norm": 0.0002689361572265625, "kl": 0.03277284097857773, "learning_rate": 4.217718777011058e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3330 }, { "completion_length": 70.675, "epoch": 0.668, "grad_norm": 0.00103759765625, "kl": 0.32424843702465295, "learning_rate": 4.211367764821722e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 3340 }, { "completion_length": 41.375, "epoch": 0.67, "grad_norm": 0.0004367828369140625, "kl": 0.05056889692787081, "learning_rate": 4.204995900156247e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 3350 }, { "completion_length": 59.675, "epoch": 0.672, "grad_norm": 0.00106048583984375, "kl": 0.04967752741649747, "learning_rate": 4.198603260653792e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3360 }, { "completion_length": 53.425, "epoch": 0.674, "grad_norm": 0.0003681182861328125, "kl": 0.01580333085730672, "learning_rate": 4.192189924206652e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3370 }, { "completion_length": 50.75, "epoch": 0.676, "grad_norm": 0.002899169921875, "kl": 1.8218023491092026, "learning_rate": 4.185755968959308e-06, "loss": 0.0002, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 3380 }, { "completion_length": 62.375, "epoch": 0.678, "grad_norm": 0.000156402587890625, "kl": 0.047494524717330934, "learning_rate": 4.179301473307476e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3390 }, { "completion_length": 69.3, "epoch": 0.68, "grad_norm": 0.0002651214599609375, "kl": 0.014598681312054395, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3400 }, { "completion_length": 63.575, "epoch": 0.682, "grad_norm": 0.000423431396484375, "kl": 0.04025569665245712, "learning_rate": 4.166331175623631e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3410 }, { "completion_length": 48.05, "epoch": 0.684, "grad_norm": 0.0006561279296875, "kl": 913.4162682918599, "learning_rate": 4.159815531630604e-06, "loss": 0.0913, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 3420 }, { "completion_length": 52.575, "epoch": 0.686, "grad_norm": 0.0006561279296875, "kl": 0.0610341252759099, "learning_rate": 4.15327966330913e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3430 }, { "completion_length": 57.6, "epoch": 0.688, "grad_norm": 0.0024871826171875, "kl": 0.9020142253488302, "learning_rate": 4.146723650296701e-06, "loss": 0.0001, "match_ratio": 0.85, "reward": -0.15, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.15, "step": 3440 }, { "completion_length": 56.25, "epoch": 0.69, "grad_norm": 0.0045166015625, "kl": 0.14730083039030434, "learning_rate": 4.140147572476269e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3450 }, { "completion_length": 72.55, "epoch": 0.692, "grad_norm": 0.0004024505615234375, "kl": 0.05645229946821928, "learning_rate": 4.133551509975264e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3460 }, { "completion_length": 62.0, "epoch": 0.694, "grad_norm": 0.00063323974609375, "kl": 0.018445250298827886, "learning_rate": 4.126935543164628e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3470 }, { "completion_length": 54.125, "epoch": 0.696, "grad_norm": 0.004608154296875, "kl": 0.03874910874292255, "learning_rate": 4.120299752657828e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 3480 }, { "completion_length": 35.4, "epoch": 0.698, "grad_norm": 0.020751953125, "kl": 0.07965331296436488, "learning_rate": 4.113644219309877e-06, "loss": 0.0, "match_ratio": 0.675, "reward": -0.325, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.325, "step": 3490 }, { "completion_length": 61.6, "epoch": 0.7, "grad_norm": 0.00079345703125, "kl": 0.02872077892534435, "learning_rate": 4.106969024216348e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3500 }, { "completion_length": 57.175, "epoch": 0.702, "grad_norm": 0.00157928466796875, "kl": 0.017641184292733668, "learning_rate": 4.1002742487123896e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3510 }, { "completion_length": 62.6, "epoch": 0.704, "grad_norm": 0.0002651214599609375, "kl": 0.020063164038583638, "learning_rate": 4.093559974371725e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3520 }, { "completion_length": 51.125, "epoch": 0.706, "grad_norm": 0.000576019287109375, "kl": 0.6725238669663668, "learning_rate": 4.086826283005669e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 3530 }, { "completion_length": 56.625, "epoch": 0.708, "grad_norm": 0.000579833984375, "kl": 0.01952581750229001, "learning_rate": 4.080073256662128e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3540 }, { "completion_length": 62.7, "epoch": 0.71, "grad_norm": 0.0035247802734375, "kl": 0.07669782191514969, "learning_rate": 4.073300977624594e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 3550 }, { "completion_length": 68.325, "epoch": 0.712, "grad_norm": 0.0014801025390625, "kl": 0.026417199242860078, "learning_rate": 4.066509528411151e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3560 }, { "completion_length": 68.925, "epoch": 0.714, "grad_norm": 0.0003643035888671875, "kl": 0.01686573908664286, "learning_rate": 4.059698991773466e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3570 }, { "completion_length": 65.325, "epoch": 0.716, "grad_norm": 0.0003261566162109375, "kl": 0.009878239961108193, "learning_rate": 4.052869450695776e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3580 }, { "completion_length": 54.125, "epoch": 0.718, "grad_norm": 0.00054168701171875, "kl": 0.017347801569849254, "learning_rate": 4.046020988393886e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3590 }, { "completion_length": 62.45, "epoch": 0.72, "grad_norm": 0.000362396240234375, "kl": 0.010593670699745417, "learning_rate": 4.039153688314146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3600 }, { "completion_length": 56.225, "epoch": 0.722, "grad_norm": 0.0007476806640625, "kl": 20.069431526213883, "learning_rate": 4.032267634132442e-06, "loss": 0.002, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 3610 }, { "completion_length": 61.925, "epoch": 0.724, "grad_norm": 0.0003643035888671875, "kl": 0.023581979051232337, "learning_rate": 4.02536290975317e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3620 }, { "completion_length": 71.05, "epoch": 0.726, "grad_norm": 0.0006561279296875, "kl": 0.019050255604088306, "learning_rate": 4.018439599308217e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3630 }, { "completion_length": 63.85, "epoch": 0.728, "grad_norm": 0.0002498626708984375, "kl": 0.023293742351233958, "learning_rate": 4.011497787155938e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3640 }, { "completion_length": 72.65, "epoch": 0.73, "grad_norm": 0.000347137451171875, "kl": 0.011120679695159197, "learning_rate": 4.0045375578801216e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3650 }, { "completion_length": 62.95, "epoch": 0.732, "grad_norm": 0.00042724609375, "kl": 0.017660227511078118, "learning_rate": 3.997558996288965e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3660 }, { "completion_length": 56.05, "epoch": 0.734, "grad_norm": 0.000881195068359375, "kl": 0.35850770082324745, "learning_rate": 3.9905621874140396e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3670 }, { "completion_length": 50.5, "epoch": 0.736, "grad_norm": 0.0004291534423828125, "kl": 0.022859503608196975, "learning_rate": 3.983547216509254e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3680 }, { "completion_length": 65.675, "epoch": 0.738, "grad_norm": 0.0003795623779296875, "kl": 0.01089323298074305, "learning_rate": 3.976514169049814e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3690 }, { "completion_length": 55.825, "epoch": 0.74, "grad_norm": 0.0002765655517578125, "kl": 0.026947349560214207, "learning_rate": 3.969463130731183e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3700 }, { "completion_length": 51.9, "epoch": 0.742, "grad_norm": 0.000720977783203125, "kl": 101.020502169244, "learning_rate": 3.96239418746804e-06, "loss": 0.0101, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3710 }, { "completion_length": 41.925, "epoch": 0.744, "grad_norm": 0.0004367828369140625, "kl": 1.0482193630887195, "learning_rate": 3.955307425393224e-06, "loss": 0.0001, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3720 }, { "completion_length": 43.075, "epoch": 0.746, "grad_norm": 0.0010986328125, "kl": 0.011403680918738246, "learning_rate": 3.948202930856697e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3730 }, { "completion_length": 64.75, "epoch": 0.748, "grad_norm": 0.0004062652587890625, "kl": 0.1115244179032743, "learning_rate": 3.941080790424483e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3740 }, { "completion_length": 59.65, "epoch": 0.75, "grad_norm": 0.000553131103515625, "kl": 0.011348503362387418, "learning_rate": 3.933941090877615e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3750 }, { "completion_length": 50.1, "epoch": 0.752, "grad_norm": 0.0004749298095703125, "kl": 0.01582015200983733, "learning_rate": 3.92678391921108e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3760 }, { "completion_length": 58.2, "epoch": 0.754, "grad_norm": 0.0004024505615234375, "kl": 0.02131882361136377, "learning_rate": 3.9196093626327535e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3770 }, { "completion_length": 56.45, "epoch": 0.756, "grad_norm": 0.000766754150390625, "kl": 0.024744509416632355, "learning_rate": 3.912417508562345e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3780 }, { "completion_length": 48.5, "epoch": 0.758, "grad_norm": 0.0005950927734375, "kl": 0.015975080896168947, "learning_rate": 3.905208444630326e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3790 }, { "completion_length": 69.525, "epoch": 0.76, "grad_norm": 0.000537872314453125, "kl": 0.009961457317695021, "learning_rate": 3.897982258676867e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 3800 }, { "completion_length": 60.575, "epoch": 0.762, "grad_norm": 0.000507354736328125, "kl": 0.011210405128076672, "learning_rate": 3.890739038750763e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3810 }, { "completion_length": 46.2, "epoch": 0.764, "grad_norm": 0.000255584716796875, "kl": 1.3439840027829633, "learning_rate": 3.88347887310836e-06, "loss": 0.0001, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 3820 }, { "completion_length": 54.325, "epoch": 0.766, "grad_norm": 0.000637054443359375, "kl": 0.09195185881108045, "learning_rate": 3.876201850212489e-06, "loss": 0.0, "match_ratio": 0.675, "reward": -0.325, "reward_std": 0.05, "rewards/reward_func": -0.325, "step": 3830 }, { "completion_length": 74.35, "epoch": 0.768, "grad_norm": 0.000225067138671875, "kl": 0.01327997730113566, "learning_rate": 3.868908058731376e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3840 }, { "completion_length": 50.125, "epoch": 0.77, "grad_norm": 0.000370025634765625, "kl": 0.0457455332390964, "learning_rate": 3.861597587537568e-06, "loss": 0.0, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.05, "rewards/reward_func": -0.125, "step": 3850 }, { "completion_length": 67.225, "epoch": 0.772, "grad_norm": 0.0009765625, "kl": 0.019194579031318427, "learning_rate": 3.85427052570685e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3860 }, { "completion_length": 67.425, "epoch": 0.774, "grad_norm": 0.000522613525390625, "kl": 0.16493179565295576, "learning_rate": 3.846926962517158e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 3870 }, { "completion_length": 58.725, "epoch": 0.776, "grad_norm": 0.0007171630859375, "kl": 0.0165805596858263, "learning_rate": 3.839566987447492e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3880 }, { "completion_length": 64.575, "epoch": 0.778, "grad_norm": 0.00061798095703125, "kl": 0.014522301172837615, "learning_rate": 3.832190690176825e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3890 }, { "completion_length": 49.6, "epoch": 0.78, "grad_norm": 5.435943603515625e-05, "kl": 0.014127893140539527, "learning_rate": 3.824798160583012e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3900 }, { "completion_length": 49.8, "epoch": 0.782, "grad_norm": 0.000278472900390625, "kl": 0.11558867986313999, "learning_rate": 3.817389488741694e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.0, "rewards/reward_func": -0.2, "step": 3910 }, { "completion_length": 65.35, "epoch": 0.784, "grad_norm": 0.000690460205078125, "kl": 0.013915874017402529, "learning_rate": 3.8099647649251984e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3920 }, { "completion_length": 63.1, "epoch": 0.786, "grad_norm": 0.000270843505859375, "kl": 0.26083877284545454, "learning_rate": 3.802524079601442e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3930 }, { "completion_length": 57.475, "epoch": 0.788, "grad_norm": 0.000476837158203125, "kl": 0.44110607262700796, "learning_rate": 3.795067523432826e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 3940 }, { "completion_length": 50.125, "epoch": 0.79, "grad_norm": 0.0004062652587890625, "kl": 0.03131135320290923, "learning_rate": 3.787595187275136e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3950 }, { "completion_length": 47.525, "epoch": 0.792, "grad_norm": 0.000949859619140625, "kl": 0.056715600471943614, "learning_rate": 3.780107162176429e-06, "loss": 0.0, "match_ratio": 0.8, "reward": -0.2, "reward_std": 0.1, "rewards/reward_func": -0.2, "step": 3960 }, { "completion_length": 47.1, "epoch": 0.794, "grad_norm": 52.75, "kl": 0.48424787069670855, "learning_rate": 3.772603539375929e-06, "loss": 0.0, "match_ratio": 0.75, "reward": -0.25, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.25, "step": 3970 }, { "completion_length": 64.875, "epoch": 0.796, "grad_norm": 0.000530242919921875, "kl": 0.08614660077728331, "learning_rate": 3.7650844103029093e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 3980 }, { "completion_length": 64.725, "epoch": 0.798, "grad_norm": 0.000347137451171875, "kl": 0.012265483383089304, "learning_rate": 3.7575498665755884e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 3990 }, { "completion_length": 61.2, "epoch": 0.8, "grad_norm": 0.00040435791015625, "kl": 0.032297836942598225, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4000 }, { "completion_length": 65.8, "epoch": 0.802, "grad_norm": 0.00167083740234375, "kl": 0.016016237577423452, "learning_rate": 3.742434902568889e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4010 }, { "completion_length": 56.125, "epoch": 0.804, "grad_norm": 0.0004100799560546875, "kl": 0.06678674127906561, "learning_rate": 3.7348546664605777e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4020 }, { "completion_length": 52.825, "epoch": 0.806, "grad_norm": 13.9375, "kl": 39.51057905447669, "learning_rate": 3.7272593840378526e-06, "loss": 0.004, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.175, "step": 4030 }, { "completion_length": 56.375, "epoch": 0.808, "grad_norm": 0.000736236572265625, "kl": 76.7652599786874, "learning_rate": 3.7196491478468322e-06, "loss": 0.0077, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 4040 }, { "completion_length": 59.775, "epoch": 0.81, "grad_norm": 23.625, "kl": 2.1401951428037136, "learning_rate": 3.7120240506158433e-06, "loss": 0.0002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4050 }, { "completion_length": 78.925, "epoch": 0.812, "grad_norm": 0.000362396240234375, "kl": 0.019226322788745163, "learning_rate": 3.7043841852542884e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4060 }, { "completion_length": 58.425, "epoch": 0.814, "grad_norm": 0.00078582763671875, "kl": 0.04994579209014773, "learning_rate": 3.6967296448515176e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4070 }, { "completion_length": 57.025, "epoch": 0.816, "grad_norm": 0.000553131103515625, "kl": 0.11473355963826179, "learning_rate": 3.689060522675689e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4080 }, { "completion_length": 59.875, "epoch": 0.818, "grad_norm": 0.0164794921875, "kl": 0.05073905866593122, "learning_rate": 3.6813769121726356e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4090 }, { "completion_length": 44.25, "epoch": 0.82, "grad_norm": 0.0002727508544921875, "kl": 0.7498203465249389, "learning_rate": 3.6736789069647273e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 4100 }, { "completion_length": 58.15, "epoch": 0.822, "grad_norm": 0.0007781982421875, "kl": 0.022883613361045718, "learning_rate": 3.6659666008497287e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4110 }, { "completion_length": 64.35, "epoch": 0.824, "grad_norm": 0.0005035400390625, "kl": 3.837466208729893, "learning_rate": 3.658240087799655e-06, "loss": 0.0004, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4120 }, { "completion_length": 63.35, "epoch": 0.826, "grad_norm": 0.0002651214599609375, "kl": 0.03254580916836858, "learning_rate": 3.6504994619596295e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4130 }, { "completion_length": 45.65, "epoch": 0.828, "grad_norm": 0.000396728515625, "kl": 17.52819751542993, "learning_rate": 3.642744817646736e-06, "loss": 0.0018, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 4140 }, { "completion_length": 62.8, "epoch": 0.83, "grad_norm": 0.05712890625, "kl": 0.056678724475204945, "learning_rate": 3.634976249348867e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4150 }, { "completion_length": 49.45, "epoch": 0.832, "grad_norm": 0.0004863739013671875, "kl": 0.02377572702243924, "learning_rate": 3.627193851723577e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4160 }, { "completion_length": 56.3, "epoch": 0.834, "grad_norm": 0.0010528564453125, "kl": 0.23082902017049492, "learning_rate": 3.6193977195969243e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4170 }, { "completion_length": 46.35, "epoch": 0.836, "grad_norm": 0.000518798828125, "kl": 0.037431746069341895, "learning_rate": 3.611587947962319e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4180 }, { "completion_length": 66.15, "epoch": 0.838, "grad_norm": 0.00054168701171875, "kl": 0.014803345128893853, "learning_rate": 3.6037646319793635e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4190 }, { "completion_length": 54.525, "epoch": 0.84, "grad_norm": 0.000743865966796875, "kl": 0.03277415055781603, "learning_rate": 3.595927866972694e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4200 }, { "completion_length": 74.5, "epoch": 0.842, "grad_norm": 0.0004863739013671875, "kl": 0.014684983342885972, "learning_rate": 3.5880777484308193e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4210 }, { "completion_length": 63.125, "epoch": 0.844, "grad_norm": 0.000640869140625, "kl": 0.02462619331199676, "learning_rate": 3.5802143720049565e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4220 }, { "completion_length": 56.275, "epoch": 0.846, "grad_norm": 0.000415802001953125, "kl": 4.124728001933545, "learning_rate": 3.5723378335078653e-06, "loss": 0.0004, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 4230 }, { "completion_length": 53.25, "epoch": 0.848, "grad_norm": 0.000759124755859375, "kl": 0.024984571058303116, "learning_rate": 3.564448228912682e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4240 }, { "completion_length": 56.15, "epoch": 0.85, "grad_norm": 0.00075531005859375, "kl": 8.048016933631152, "learning_rate": 3.556545654351749e-06, "loss": 0.0008, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4250 }, { "completion_length": 46.275, "epoch": 0.852, "grad_norm": 0.00115203857421875, "kl": 0.02151933144778013, "learning_rate": 3.5486302061154433e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4260 }, { "completion_length": 56.525, "epoch": 0.854, "grad_norm": 0.000476837158203125, "kl": 0.042470036540180445, "learning_rate": 3.5407019806510035e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4270 }, { "completion_length": 73.7, "epoch": 0.856, "grad_norm": 0.00035858154296875, "kl": 0.01676445291377604, "learning_rate": 3.532761074561355e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4280 }, { "completion_length": 58.625, "epoch": 0.858, "grad_norm": 0.00136566162109375, "kl": 13.48712082421407, "learning_rate": 3.524807584603932e-06, "loss": 0.0013, "match_ratio": 0.825, "reward": -0.175, "reward_std": 0.05, "rewards/reward_func": -0.175, "step": 4290 }, { "completion_length": 72.7, "epoch": 0.86, "grad_norm": 0.001922607421875, "kl": 0.01346550565212965, "learning_rate": 3.516841607689501e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4300 }, { "completion_length": 48.225, "epoch": 0.862, "grad_norm": 0.000934600830078125, "kl": 1.3110887278337031, "learning_rate": 3.5088632408809757e-06, "loss": 0.0001, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4310 }, { "completion_length": 42.375, "epoch": 0.864, "grad_norm": 0.000583648681640625, "kl": 32.864392778254114, "learning_rate": 3.5008725813922383e-06, "loss": 0.0033, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 4320 }, { "completion_length": 54.675, "epoch": 0.866, "grad_norm": 0.00145721435546875, "kl": 0.43685728376731275, "learning_rate": 3.4928697265869516e-06, "loss": 0.0, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 4330 }, { "completion_length": 61.15, "epoch": 0.868, "grad_norm": 0.0004787445068359375, "kl": 0.02581656016409397, "learning_rate": 3.4848547739773782e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4340 }, { "completion_length": 52.5, "epoch": 0.87, "grad_norm": 0.0003643035888671875, "kl": 0.013556264666840434, "learning_rate": 3.476827821223184e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4350 }, { "completion_length": 56.025, "epoch": 0.872, "grad_norm": 0.00144195556640625, "kl": 0.05843255072832108, "learning_rate": 3.4687889661302577e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4360 }, { "completion_length": 68.225, "epoch": 0.874, "grad_norm": 0.000438690185546875, "kl": 0.025629992503672837, "learning_rate": 3.460738306649509e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4370 }, { "completion_length": 40.225, "epoch": 0.876, "grad_norm": 0.000537872314453125, "kl": 20.029893927741796, "learning_rate": 3.452675940875686e-06, "loss": 0.002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4380 }, { "completion_length": 53.5, "epoch": 0.878, "grad_norm": 0.000759124755859375, "kl": 0.0226501208730042, "learning_rate": 3.4446019670461684e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4390 }, { "completion_length": 55.325, "epoch": 0.88, "grad_norm": 0.00035858154296875, "kl": 1.9279290955979378, "learning_rate": 3.436516483539781e-06, "loss": 0.0002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4400 }, { "completion_length": 70.05, "epoch": 0.882, "grad_norm": 10.0, "kl": 0.05578553443774581, "learning_rate": 3.4284195888755877e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4410 }, { "completion_length": 58.05, "epoch": 0.884, "grad_norm": 0.00086212158203125, "kl": 0.019159636087715627, "learning_rate": 3.4203113817116955e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4420 }, { "completion_length": 62.425, "epoch": 0.886, "grad_norm": 0.000614166259765625, "kl": 0.10707788309082389, "learning_rate": 3.412191960844049e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4430 }, { "completion_length": 56.225, "epoch": 0.888, "grad_norm": 0.00061798095703125, "kl": 3.764384925994091, "learning_rate": 3.4040614252052305e-06, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4440 }, { "completion_length": 57.075, "epoch": 0.89, "grad_norm": 0.002838134765625, "kl": 0.04788713352754712, "learning_rate": 3.39591987386325e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4450 }, { "completion_length": 62.575, "epoch": 0.892, "grad_norm": 0.000339508056640625, "kl": 0.024599794298410416, "learning_rate": 3.387767406020343e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4460 }, { "completion_length": 47.85, "epoch": 0.894, "grad_norm": 0.000522613525390625, "kl": 419.8108845547773, "learning_rate": 3.3796041210117545e-06, "loss": 0.042, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4470 }, { "completion_length": 56.55, "epoch": 0.896, "grad_norm": 0.0016326904296875, "kl": 4.869295587006491, "learning_rate": 3.3714301183045382e-06, "loss": 0.0005, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 4480 }, { "completion_length": 65.65, "epoch": 0.898, "grad_norm": 0.000926971435546875, "kl": 0.021241254778578876, "learning_rate": 3.3632454974963368e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4490 }, { "completion_length": 51.7, "epoch": 0.9, "grad_norm": 0.00225830078125, "kl": 0.06338205388747156, "learning_rate": 3.3550503583141726e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4500 }, { "completion_length": 52.65, "epoch": 0.902, "grad_norm": 0.0003948211669921875, "kl": 0.3547412235289812, "learning_rate": 3.346844800613229e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 4510 }, { "completion_length": 66.475, "epoch": 0.904, "grad_norm": 0.000637054443359375, "kl": 0.01707718223333359, "learning_rate": 3.338628924375638e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4520 }, { "completion_length": 69.775, "epoch": 0.906, "grad_norm": 0.00017642974853515625, "kl": 4.814126300462521, "learning_rate": 3.3304028297092583e-06, "loss": 0.0005, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1154700517654419, "rewards/reward_func": -0.1, "step": 4530 }, { "completion_length": 72.7, "epoch": 0.908, "grad_norm": 0.00022411346435546875, "kl": 0.0319039260270074, "learning_rate": 3.3221666168464584e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4540 }, { "completion_length": 63.75, "epoch": 0.91, "grad_norm": 0.002227783203125, "kl": 0.02470703413709998, "learning_rate": 3.313920386142892e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4550 }, { "completion_length": 49.6, "epoch": 0.912, "grad_norm": 0.0012969970703125, "kl": 2.1085672612302004, "learning_rate": 3.3056642380762783e-06, "loss": 0.0002, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4560 }, { "completion_length": 70.425, "epoch": 0.914, "grad_norm": 0.0013885498046875, "kl": 0.016354763973504306, "learning_rate": 3.2973982732451753e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4570 }, { "completion_length": 59.775, "epoch": 0.916, "grad_norm": 0.000934600830078125, "kl": 0.035655501671135424, "learning_rate": 3.2891225923677565e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4580 }, { "completion_length": 54.775, "epoch": 0.918, "grad_norm": 0.01336669921875, "kl": 4.602624000795186, "learning_rate": 3.280837296280582e-06, "loss": 0.0005, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4590 }, { "completion_length": 46.175, "epoch": 0.92, "grad_norm": 0.0003147125244140625, "kl": 0.02292898967862129, "learning_rate": 3.272542485937369e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4600 }, { "completion_length": 48.85, "epoch": 0.922, "grad_norm": 0.000362396240234375, "kl": 0.08314138883724809, "learning_rate": 3.2642382624077647e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4610 }, { "completion_length": 49.15, "epoch": 0.924, "grad_norm": 0.0006103515625, "kl": 0.01725058164447546, "learning_rate": 3.2559247268761117e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4620 }, { "completion_length": 56.0, "epoch": 0.926, "grad_norm": 0.000514984130859375, "kl": 0.02017789352685213, "learning_rate": 3.247601980640217e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4630 }, { "completion_length": 48.975, "epoch": 0.928, "grad_norm": 0.00102996826171875, "kl": 0.015598981559742242, "learning_rate": 3.2392701251101172e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4640 }, { "completion_length": 54.35, "epoch": 0.93, "grad_norm": 0.00016307830810546875, "kl": 0.017281436000484974, "learning_rate": 3.230929261806842e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4650 }, { "completion_length": 62.25, "epoch": 0.932, "grad_norm": 0.00055694580078125, "kl": 27.82315392717719, "learning_rate": 3.222579492361179e-06, "loss": 0.0028, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4660 }, { "completion_length": 51.0, "epoch": 0.934, "grad_norm": 0.00083160400390625, "kl": 0.18145442437380552, "learning_rate": 3.214220918512434e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4670 }, { "completion_length": 53.025, "epoch": 0.936, "grad_norm": 0.000469207763671875, "kl": 0.26395926494151356, "learning_rate": 3.205853642107192e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4680 }, { "completion_length": 46.575, "epoch": 0.938, "grad_norm": 32.75, "kl": 50.37309080436826, "learning_rate": 3.1974777650980737e-06, "loss": 0.005, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 4690 }, { "completion_length": 54.75, "epoch": 0.94, "grad_norm": 0.00131988525390625, "kl": 1.135747592896223, "learning_rate": 3.189093389542498e-06, "loss": 0.0001, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 4700 }, { "completion_length": 45.35, "epoch": 0.942, "grad_norm": 0.000698089599609375, "kl": 8.475377059169114, "learning_rate": 3.180700617601436e-06, "loss": 0.0008, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4710 }, { "completion_length": 62.7, "epoch": 0.944, "grad_norm": 0.000827789306640625, "kl": 0.10563798192888499, "learning_rate": 3.1722995515381644e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4720 }, { "completion_length": 51.2, "epoch": 0.946, "grad_norm": 0.000637054443359375, "kl": 0.041765560209751126, "learning_rate": 3.1638902937170224e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4730 }, { "completion_length": 53.425, "epoch": 0.948, "grad_norm": 0.037841796875, "kl": 90.9074339528568, "learning_rate": 3.155472946602162e-06, "loss": 0.0091, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 4740 }, { "completion_length": 56.025, "epoch": 0.95, "grad_norm": 0.000637054443359375, "kl": 0.14690550537779928, "learning_rate": 3.147047612756302e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4750 }, { "completion_length": 59.45, "epoch": 0.952, "grad_norm": 0.00110626220703125, "kl": 0.021767212729901075, "learning_rate": 3.1386143948394764e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4760 }, { "completion_length": 49.15, "epoch": 0.954, "grad_norm": 68.0, "kl": 50.01492289174348, "learning_rate": 3.130173395607785e-06, "loss": 0.005, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 4770 }, { "completion_length": 50.925, "epoch": 0.956, "grad_norm": 0.000579833984375, "kl": 3.767396915424615, "learning_rate": 3.121724717912138e-06, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4780 }, { "completion_length": 54.85, "epoch": 0.958, "grad_norm": 0.00109100341796875, "kl": 0.024809733917936682, "learning_rate": 3.1132684646970068e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4790 }, { "completion_length": 60.0, "epoch": 0.96, "grad_norm": 0.000492095947265625, "kl": 22.365475433226674, "learning_rate": 3.1048047389991693e-06, "loss": 0.0022, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.1, "step": 4800 }, { "completion_length": 48.5, "epoch": 0.962, "grad_norm": 0.00107574462890625, "kl": 0.01949691798072308, "learning_rate": 3.0963336439464527e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4810 }, { "completion_length": 73.975, "epoch": 0.964, "grad_norm": 0.0017242431640625, "kl": 0.0853988635353744, "learning_rate": 3.087855282756475e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4820 }, { "completion_length": 61.175, "epoch": 0.966, "grad_norm": 0.00046539306640625, "kl": 0.05258291512727738, "learning_rate": 3.079369758735393e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4830 }, { "completion_length": 69.55, "epoch": 0.968, "grad_norm": 0.08447265625, "kl": 0.19851951650343835, "learning_rate": 3.0708771752766397e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4840 }, { "completion_length": 52.35, "epoch": 0.97, "grad_norm": 0.00139617919921875, "kl": 0.016163587383925915, "learning_rate": 3.062377635859663e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4850 }, { "completion_length": 59.4, "epoch": 0.972, "grad_norm": 0.000415802001953125, "kl": 0.07268630117177963, "learning_rate": 3.053871244048669e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4860 }, { "completion_length": 52.725, "epoch": 0.974, "grad_norm": 0.000751495361328125, "kl": 0.08838214613497257, "learning_rate": 3.045358103491357e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4870 }, { "completion_length": 47.25, "epoch": 0.976, "grad_norm": 0.0015411376953125, "kl": 0.8490013023838401, "learning_rate": 3.0368383179176584e-06, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4880 }, { "completion_length": 51.9, "epoch": 0.978, "grad_norm": 0.000576019287109375, "kl": 0.06923787947744131, "learning_rate": 3.0283119911384724e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4890 }, { "completion_length": 45.05, "epoch": 0.98, "grad_norm": 0.000823974609375, "kl": 0.06741849109530448, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4900 }, { "completion_length": 59.025, "epoch": 0.982, "grad_norm": 0.000885009765625, "kl": 0.20571241448633373, "learning_rate": 3.0112401296044756e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4910 }, { "completion_length": 58.85, "epoch": 0.984, "grad_norm": 0.0003299713134765625, "kl": 0.0659916253760457, "learning_rate": 3.002694802864912e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4920 }, { "completion_length": 57.3, "epoch": 0.986, "grad_norm": 0.0010223388671875, "kl": 0.9824612125754356, "learning_rate": 2.9941433509478157e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 4930 }, { "completion_length": 63.675, "epoch": 0.988, "grad_norm": 0.0008544921875, "kl": 0.08037902340292931, "learning_rate": 2.98558587804993e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4940 }, { "completion_length": 61.775, "epoch": 0.99, "grad_norm": 0.0003757476806640625, "kl": 0.05214073383249342, "learning_rate": 2.9770224884413625e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4950 }, { "completion_length": 58.125, "epoch": 0.992, "grad_norm": 0.003936767578125, "kl": 0.307067746296525, "learning_rate": 2.9684532864643123e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4960 }, { "completion_length": 47.575, "epoch": 0.994, "grad_norm": 0.00148773193359375, "kl": 0.4293937426991761, "learning_rate": 2.9598783765318005e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4970 }, { "completion_length": 61.5, "epoch": 0.996, "grad_norm": 0.000698089599609375, "kl": 0.055739361047744754, "learning_rate": 2.9512978631264006e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 4980 }, { "completion_length": 61.25, "epoch": 0.998, "grad_norm": 0.0030517578125, "kl": 0.31474687876179813, "learning_rate": 2.942711850798959e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 4990 }, { "completion_length": 55.975, "epoch": 1.0, "grad_norm": 0.000606536865234375, "kl": 10.358118780329823, "learning_rate": 2.9341204441673267e-06, "loss": 0.001, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 5000 }, { "completion_length": 67.125, "epoch": 1.002, "grad_norm": 0.0023193359375, "kl": 0.02981336957309395, "learning_rate": 2.9255237479150815e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5010 }, { "completion_length": 46.975, "epoch": 1.004, "grad_norm": 0.0004596710205078125, "kl": 0.18872954780235887, "learning_rate": 2.9169218667902562e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5020 }, { "completion_length": 81.2, "epoch": 1.006, "grad_norm": 0.0005950927734375, "kl": 0.019520534854382276, "learning_rate": 2.908314905604056e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5030 }, { "completion_length": 84.75, "epoch": 1.008, "grad_norm": 11.1875, "kl": 0.21086107967421414, "learning_rate": 2.8997029692295875e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5040 }, { "completion_length": 67.175, "epoch": 1.01, "grad_norm": 0.000514984130859375, "kl": 0.026521979738026856, "learning_rate": 2.8910861626005774e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5050 }, { "completion_length": 49.025, "epoch": 1.012, "grad_norm": 0.00106048583984375, "kl": 0.05570605006068945, "learning_rate": 2.8824645907100957e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5060 }, { "completion_length": 57.875, "epoch": 1.014, "grad_norm": 0.000522613525390625, "kl": 0.02027883781120181, "learning_rate": 2.8738383586092745e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5070 }, { "completion_length": 64.325, "epoch": 1.016, "grad_norm": 0.0004520416259765625, "kl": 0.35661591766402123, "learning_rate": 2.8652075714060296e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5080 }, { "completion_length": 47.125, "epoch": 1.018, "grad_norm": 0.0033111572265625, "kl": 0.16099169924855233, "learning_rate": 2.8565723342637797e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5090 }, { "completion_length": 58.325, "epoch": 1.02, "grad_norm": 0.00066375732421875, "kl": 36.65545420385897, "learning_rate": 2.847932752400164e-06, "loss": 0.0037, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5100 }, { "completion_length": 54.975, "epoch": 1.022, "grad_norm": 0.0054931640625, "kl": 0.14035283839330076, "learning_rate": 2.8392889310857615e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5110 }, { "completion_length": 44.15, "epoch": 1.024, "grad_norm": 0.0006103515625, "kl": 0.05474662664346397, "learning_rate": 2.8306409756428067e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5120 }, { "completion_length": 51.425, "epoch": 1.026, "grad_norm": 0.00077056884765625, "kl": 0.9017472909763455, "learning_rate": 2.8219889914439073e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5130 }, { "completion_length": 56.425, "epoch": 1.028, "grad_norm": 0.0030059814453125, "kl": 920.3957623304799, "learning_rate": 2.813333083910761e-06, "loss": 0.092, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 5140 }, { "completion_length": 61.8, "epoch": 1.03, "grad_norm": 0.002166748046875, "kl": 0.06634964090771973, "learning_rate": 2.804673358512869e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5150 }, { "completion_length": 60.6, "epoch": 1.032, "grad_norm": 0.0009613037109375, "kl": 0.10043707201257349, "learning_rate": 2.7960099207662535e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5160 }, { "completion_length": 60.625, "epoch": 1.034, "grad_norm": 0.00049591064453125, "kl": 2957.9408455969765, "learning_rate": 2.7873428762321667e-06, "loss": 0.2958, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5170 }, { "completion_length": 48.175, "epoch": 1.036, "grad_norm": 0.000431060791015625, "kl": 0.44579824099782855, "learning_rate": 2.778672330515814e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 5180 }, { "completion_length": 69.6, "epoch": 1.038, "grad_norm": 0.000530242919921875, "kl": 0.1467185489833355, "learning_rate": 2.769998389265057e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5190 }, { "completion_length": 46.4, "epoch": 1.04, "grad_norm": 0.0022735595703125, "kl": 0.1808565909974277, "learning_rate": 2.761321158169134e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5200 }, { "completion_length": 66.075, "epoch": 1.042, "grad_norm": 0.00151824951171875, "kl": 0.10877533163875341, "learning_rate": 2.752640742957366e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5210 }, { "completion_length": 64.8, "epoch": 1.044, "grad_norm": 0.000827789306640625, "kl": 36.803903768444435, "learning_rate": 2.743957249397874e-06, "loss": 0.0037, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5220 }, { "completion_length": 61.525, "epoch": 1.046, "grad_norm": 0.0040283203125, "kl": 0.021204144693911076, "learning_rate": 2.7352707832962865e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5230 }, { "completion_length": 45.65, "epoch": 1.048, "grad_norm": 0.003173828125, "kl": 2.3442719845101236, "learning_rate": 2.726581450494451e-06, "loss": 0.0002, "match_ratio": 0.875, "reward": -0.125, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.125, "step": 5240 }, { "completion_length": 54.725, "epoch": 1.05, "grad_norm": 0.0003566741943359375, "kl": 0.08242949154227971, "learning_rate": 2.717889356869146e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5250 }, { "completion_length": 68.075, "epoch": 1.052, "grad_norm": 0.0003337860107421875, "kl": 0.0425911046564579, "learning_rate": 2.70919460833079e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5260 }, { "completion_length": 58.25, "epoch": 1.054, "grad_norm": 0.00067901611328125, "kl": 0.05532512974459678, "learning_rate": 2.700497310822147e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5270 }, { "completion_length": 67.3, "epoch": 1.056, "grad_norm": 1616.0, "kl": 654.5019911365816, "learning_rate": 2.6917975703170466e-06, "loss": 0.0655, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5280 }, { "completion_length": 50.725, "epoch": 1.058, "grad_norm": 0.0004863739013671875, "kl": 314.1969824824482, "learning_rate": 2.6830954928190795e-06, "loss": 0.0314, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 5290 }, { "completion_length": 52.65, "epoch": 1.06, "grad_norm": 0.000690460205078125, "kl": 3050.6620800592004, "learning_rate": 2.6743911843603134e-06, "loss": 0.3051, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1, "rewards/reward_func": -0.1, "step": 5300 }, { "completion_length": 45.2, "epoch": 1.062, "grad_norm": 0.000606536865234375, "kl": 0.6229335282929241, "learning_rate": 2.6656847510000013e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5310 }, { "completion_length": 59.825, "epoch": 1.064, "grad_norm": 0.00023555755615234375, "kl": 0.0229927783831954, "learning_rate": 2.6569762988232838e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5320 }, { "completion_length": 45.525, "epoch": 1.066, "grad_norm": 0.470703125, "kl": 12.478434246452526, "learning_rate": 2.6482659339399047e-06, "loss": 0.0012, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 5330 }, { "completion_length": 48.5, "epoch": 1.068, "grad_norm": 0.000637054443359375, "kl": 0.04660536227747798, "learning_rate": 2.63955376248291e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5340 }, { "completion_length": 57.5, "epoch": 1.07, "grad_norm": 0.00103759765625, "kl": 0.46991982199251653, "learning_rate": 2.6308398906073603e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5350 }, { "completion_length": 59.9, "epoch": 1.072, "grad_norm": 0.00051116943359375, "kl": 0.05683571686968207, "learning_rate": 2.6221244244890336e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5360 }, { "completion_length": 74.275, "epoch": 1.074, "grad_norm": 0.00079345703125, "kl": 0.044226788356900214, "learning_rate": 2.613407470323134e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5370 }, { "completion_length": 71.075, "epoch": 1.076, "grad_norm": 0.000576019287109375, "kl": 0.014436176512390375, "learning_rate": 2.604689134322999e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5380 }, { "completion_length": 54.925, "epoch": 1.078, "grad_norm": 0.00112152099609375, "kl": 0.06992563903331757, "learning_rate": 2.5959695227188e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5390 }, { "completion_length": 58.65, "epoch": 1.08, "grad_norm": 0.00103759765625, "kl": 0.026312044728547333, "learning_rate": 2.587248741756253e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5400 }, { "completion_length": 61.625, "epoch": 1.082, "grad_norm": 38.25, "kl": 96.55521301142872, "learning_rate": 2.578526897695321e-06, "loss": 0.0097, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.15, "rewards/reward_func": -0.075, "step": 5410 }, { "completion_length": 74.625, "epoch": 1.084, "grad_norm": 0.0004444122314453125, "kl": 0.027571643888950347, "learning_rate": 2.569804096808923e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5420 }, { "completion_length": 63.75, "epoch": 1.086, "grad_norm": 0.000728607177734375, "kl": 0.06449384274892508, "learning_rate": 2.5610804453816333e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5430 }, { "completion_length": 61.75, "epoch": 1.088, "grad_norm": 0.001007080078125, "kl": 0.014204623247496783, "learning_rate": 2.5523560497083927e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5440 }, { "completion_length": 48.175, "epoch": 1.09, "grad_norm": 41.75, "kl": 4.48764673435362, "learning_rate": 2.543631016093209e-06, "loss": 0.0004, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 5450 }, { "completion_length": 53.15, "epoch": 1.092, "grad_norm": 0.00054931640625, "kl": 0.04185728752054274, "learning_rate": 2.5349054508478636e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5460 }, { "completion_length": 58.475, "epoch": 1.094, "grad_norm": 0.000457763671875, "kl": 0.058739370107650755, "learning_rate": 2.526179460290615e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5470 }, { "completion_length": 54.5, "epoch": 1.096, "grad_norm": 0.025634765625, "kl": 0.8717142393812537, "learning_rate": 2.517453150744904e-06, "loss": 0.0001, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5480 }, { "completion_length": 65.1, "epoch": 1.098, "grad_norm": 0.000553131103515625, "kl": 0.0642022612504661, "learning_rate": 2.5087266285380597e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5490 }, { "completion_length": 55.0, "epoch": 1.1, "grad_norm": 0.000644683837890625, "kl": 0.037831029202789065, "learning_rate": 2.5e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5500 }, { "completion_length": 56.85, "epoch": 1.102, "grad_norm": 0.001007080078125, "kl": 0.1622185967862606, "learning_rate": 2.4912733714619415e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5510 }, { "completion_length": 72.775, "epoch": 1.104, "grad_norm": 0.00127410888671875, "kl": 0.014623588742688298, "learning_rate": 2.482546849255096e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5520 }, { "completion_length": 53.525, "epoch": 1.106, "grad_norm": 0.000782012939453125, "kl": 0.21322614937089385, "learning_rate": 2.4738205397093863e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5530 }, { "completion_length": 48.825, "epoch": 1.108, "grad_norm": 32.0, "kl": 3.262874563597143, "learning_rate": 2.4650945491521372e-06, "loss": 0.0003, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5540 }, { "completion_length": 58.7, "epoch": 1.11, "grad_norm": 0.0012664794921875, "kl": 0.062091145850718024, "learning_rate": 2.4563689839067913e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5550 }, { "completion_length": 45.5, "epoch": 1.112, "grad_norm": 0.00070953369140625, "kl": 58.79087800290436, "learning_rate": 2.447643950291608e-06, "loss": 0.0059, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5560 }, { "completion_length": 60.625, "epoch": 1.114, "grad_norm": 0.0003070831298828125, "kl": 11.288955740490929, "learning_rate": 2.4389195546183676e-06, "loss": 0.0011, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5570 }, { "completion_length": 57.375, "epoch": 1.116, "grad_norm": 0.004669189453125, "kl": 0.05885868603363633, "learning_rate": 2.4301959031910785e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5580 }, { "completion_length": 64.825, "epoch": 1.1179999999999999, "grad_norm": 0.00072479248046875, "kl": 0.04936583343660459, "learning_rate": 2.4214731023046795e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5590 }, { "completion_length": 60.0, "epoch": 1.12, "grad_norm": 0.0009002685546875, "kl": 16.106818246748297, "learning_rate": 2.4127512582437486e-06, "loss": 0.0016, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5600 }, { "completion_length": 60.425, "epoch": 1.1219999999999999, "grad_norm": 0.0023193359375, "kl": 0.05024177338927984, "learning_rate": 2.4040304772812002e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5610 }, { "completion_length": 54.6, "epoch": 1.124, "grad_norm": 0.0004558563232421875, "kl": 0.3465561534278095, "learning_rate": 2.3953108656770018e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5620 }, { "completion_length": 61.025, "epoch": 1.126, "grad_norm": 0.00311279296875, "kl": 0.019688890036195516, "learning_rate": 2.3865925296768658e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5630 }, { "completion_length": 50.0, "epoch": 1.1280000000000001, "grad_norm": 0.00066375732421875, "kl": 0.38241584403440354, "learning_rate": 2.377875575510967e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5640 }, { "completion_length": 58.7, "epoch": 1.13, "grad_norm": 0.00112152099609375, "kl": 0.11847766758874059, "learning_rate": 2.3691601093926406e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5650 }, { "completion_length": 61.125, "epoch": 1.1320000000000001, "grad_norm": 0.0023956298828125, "kl": 1.8248440870083869, "learning_rate": 2.3604462375170905e-06, "loss": 0.0002, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 5660 }, { "completion_length": 57.175, "epoch": 1.134, "grad_norm": 22.75, "kl": 16.894031352642923, "learning_rate": 2.3517340660600965e-06, "loss": 0.0017, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5670 }, { "completion_length": 66.275, "epoch": 1.1360000000000001, "grad_norm": 0.00035858154296875, "kl": 0.01967704053968191, "learning_rate": 2.3430237011767166e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5680 }, { "completion_length": 51.75, "epoch": 1.138, "grad_norm": 69.0, "kl": 15.417314376076684, "learning_rate": 2.3343152490000004e-06, "loss": 0.0015, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5690 }, { "completion_length": 43.4, "epoch": 1.1400000000000001, "grad_norm": 0.000396728515625, "kl": 9.579606763273478, "learning_rate": 2.325608815639687e-06, "loss": 0.001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5700 }, { "completion_length": 63.475, "epoch": 1.142, "grad_norm": 0.000457763671875, "kl": 0.09258651239797473, "learning_rate": 2.3169045071809217e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 5710 }, { "completion_length": 67.575, "epoch": 1.144, "grad_norm": 0.0004215240478515625, "kl": 36.50597060709261, "learning_rate": 2.3082024296829538e-06, "loss": 0.0037, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5720 }, { "completion_length": 57.8, "epoch": 1.146, "grad_norm": 0.0002994537353515625, "kl": 0.08120424915105104, "learning_rate": 2.2995026891778533e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5730 }, { "completion_length": 55.275, "epoch": 1.148, "grad_norm": 0.0027618408203125, "kl": 0.9556269285269081, "learning_rate": 2.290805391669212e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5740 }, { "completion_length": 67.5, "epoch": 1.15, "grad_norm": 0.03662109375, "kl": 4.989278326183557, "learning_rate": 2.2821106431308546e-06, "loss": 0.0005, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5750 }, { "completion_length": 57.625, "epoch": 1.152, "grad_norm": 0.0004482269287109375, "kl": 0.05195563132874668, "learning_rate": 2.2734185495055503e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5760 }, { "completion_length": 66.875, "epoch": 1.154, "grad_norm": 0.000614166259765625, "kl": 0.04006156194955111, "learning_rate": 2.2647292167037143e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5770 }, { "completion_length": 67.85, "epoch": 1.156, "grad_norm": 0.000518798828125, "kl": 0.017254956741817297, "learning_rate": 2.256042750602127e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5780 }, { "completion_length": 57.9, "epoch": 1.158, "grad_norm": 0.016357421875, "kl": 22.463907711207867, "learning_rate": 2.2473592570426343e-06, "loss": 0.0022, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5790 }, { "completion_length": 53.4, "epoch": 1.16, "grad_norm": 14.9375, "kl": 0.47836247340310367, "learning_rate": 2.238678841830867e-06, "loss": 0.0, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1154700517654419, "rewards/reward_func": -0.1, "step": 5800 }, { "completion_length": 52.575, "epoch": 1.162, "grad_norm": 0.0005340576171875, "kl": 1.7170299529330806, "learning_rate": 2.230001610734943e-06, "loss": 0.0002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5810 }, { "completion_length": 52.175, "epoch": 1.164, "grad_norm": 0.000820159912109375, "kl": 0.023483294621109964, "learning_rate": 2.2213276694841866e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5820 }, { "completion_length": 66.725, "epoch": 1.166, "grad_norm": 0.000797271728515625, "kl": 0.017644689697772265, "learning_rate": 2.212657123767834e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5830 }, { "completion_length": 46.525, "epoch": 1.168, "grad_norm": 0.000629425048828125, "kl": 0.022305818554013968, "learning_rate": 2.2039900792337477e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5840 }, { "completion_length": 79.55, "epoch": 1.17, "grad_norm": 0.000514984130859375, "kl": 0.045465368404984476, "learning_rate": 2.195326641487132e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5850 }, { "completion_length": 70.875, "epoch": 1.172, "grad_norm": 0.000728607177734375, "kl": 0.026964151486754417, "learning_rate": 2.186666916089239e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5860 }, { "completion_length": 51.9, "epoch": 1.174, "grad_norm": 0.0006103515625, "kl": 0.06193929803557694, "learning_rate": 2.1780110085560935e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5870 }, { "completion_length": 58.975, "epoch": 1.176, "grad_norm": 0.0003662109375, "kl": 96.22263815930928, "learning_rate": 2.1693590243571937e-06, "loss": 0.0096, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5880 }, { "completion_length": 50.05, "epoch": 1.178, "grad_norm": 0.0012054443359375, "kl": 0.10491749201901257, "learning_rate": 2.1607110689142393e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5890 }, { "completion_length": 49.3, "epoch": 1.18, "grad_norm": 0.00064849853515625, "kl": 0.09745957013219594, "learning_rate": 2.1520672475998374e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5900 }, { "completion_length": 63.625, "epoch": 1.182, "grad_norm": 0.00054931640625, "kl": 0.0679678438231349, "learning_rate": 2.143427665736221e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5910 }, { "completion_length": 63.7, "epoch": 1.184, "grad_norm": 0.0005950927734375, "kl": 0.03987161219120026, "learning_rate": 2.134792428593971e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5920 }, { "completion_length": 53.3, "epoch": 1.186, "grad_norm": 0.000591278076171875, "kl": 3.143317204480991, "learning_rate": 2.1261616413907267e-06, "loss": 0.0003, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 5930 }, { "completion_length": 44.425, "epoch": 1.188, "grad_norm": 0.01080322265625, "kl": 0.0820039251120761, "learning_rate": 2.117535409289905e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5940 }, { "completion_length": 46.825, "epoch": 1.19, "grad_norm": 0.000396728515625, "kl": 0.10649018711410463, "learning_rate": 2.1089138373994226e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5950 }, { "completion_length": 47.475, "epoch": 1.192, "grad_norm": 0.000499725341796875, "kl": 0.01360652674920857, "learning_rate": 2.1002970307704134e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5960 }, { "completion_length": 63.525, "epoch": 1.194, "grad_norm": 0.00060272216796875, "kl": 0.015495409537106753, "learning_rate": 2.0916850943959453e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5970 }, { "completion_length": 47.5, "epoch": 1.196, "grad_norm": 0.0026702880859375, "kl": 0.058427824173122644, "learning_rate": 2.0830781332097446e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5980 }, { "completion_length": 59.525, "epoch": 1.198, "grad_norm": 0.00106048583984375, "kl": 0.07809406300075353, "learning_rate": 2.0744762520849193e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 5990 }, { "completion_length": 68.25, "epoch": 1.2, "grad_norm": 0.000324249267578125, "kl": 5.223696762509644, "learning_rate": 2.0658795558326745e-06, "loss": 0.0005, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6000 }, { "completion_length": 52.325, "epoch": 1.202, "grad_norm": 0.000720977783203125, "kl": 0.08351310016587377, "learning_rate": 2.0572881492010423e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6010 }, { "completion_length": 65.15, "epoch": 1.204, "grad_norm": 0.000499725341796875, "kl": 668.4266535042319, "learning_rate": 2.0487021368736002e-06, "loss": 0.0668, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6020 }, { "completion_length": 56.75, "epoch": 1.206, "grad_norm": 0.0004119873046875, "kl": 10.605211506178602, "learning_rate": 2.0401216234682e-06, "loss": 0.0011, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6030 }, { "completion_length": 69.575, "epoch": 1.208, "grad_norm": 0.00077056884765625, "kl": 0.01607757806777954, "learning_rate": 2.031546713535688e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6040 }, { "completion_length": 66.85, "epoch": 1.21, "grad_norm": 0.00070953369140625, "kl": 0.05791890555992722, "learning_rate": 2.022977511558638e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6050 }, { "completion_length": 55.3, "epoch": 1.212, "grad_norm": 0.000774383544921875, "kl": 301.9731966109015, "learning_rate": 2.0144141219500707e-06, "loss": 0.0302, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6060 }, { "completion_length": 53.25, "epoch": 1.214, "grad_norm": 0.0079345703125, "kl": 0.03139863689430058, "learning_rate": 2.0058566490521848e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6070 }, { "completion_length": 61.25, "epoch": 1.216, "grad_norm": 0.0005340576171875, "kl": 0.04796885896939784, "learning_rate": 1.997305197135089e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6080 }, { "completion_length": 47.325, "epoch": 1.218, "grad_norm": 0.002166748046875, "kl": 0.2320690915454179, "learning_rate": 1.9887598703955244e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6090 }, { "completion_length": 69.575, "epoch": 1.22, "grad_norm": 0.000339508056640625, "kl": 0.02641369737684727, "learning_rate": 1.9802207729556023e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6100 }, { "completion_length": 55.0, "epoch": 1.222, "grad_norm": 0.0003509521484375, "kl": 0.07225975301116705, "learning_rate": 1.971688008861529e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6110 }, { "completion_length": 60.75, "epoch": 1.224, "grad_norm": 0.00022411346435546875, "kl": 0.015332509903237224, "learning_rate": 1.963161682082342e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6120 }, { "completion_length": 48.175, "epoch": 1.226, "grad_norm": 0.002105712890625, "kl": 0.05186178609728813, "learning_rate": 1.9546418965086444e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6130 }, { "completion_length": 55.825, "epoch": 1.228, "grad_norm": 0.00109100341796875, "kl": 0.023852512496523558, "learning_rate": 1.946128755951332e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6140 }, { "completion_length": 60.925, "epoch": 1.23, "grad_norm": 0.0003604888916015625, "kl": 0.1579098215326667, "learning_rate": 1.937622364140338e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6150 }, { "completion_length": 54.3, "epoch": 1.232, "grad_norm": 0.0018463134765625, "kl": 0.07494590748101473, "learning_rate": 1.9291228247233607e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6160 }, { "completion_length": 49.825, "epoch": 1.234, "grad_norm": 0.0006866455078125, "kl": 0.019502221944276244, "learning_rate": 1.9206302412646074e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6170 }, { "completion_length": 55.625, "epoch": 1.236, "grad_norm": 0.0003032684326171875, "kl": 0.017997803702019154, "learning_rate": 1.912144717243525e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6180 }, { "completion_length": 70.4, "epoch": 1.238, "grad_norm": 0.0008697509765625, "kl": 0.13488098671659826, "learning_rate": 1.9036663560535484e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6190 }, { "completion_length": 42.95, "epoch": 1.24, "grad_norm": 0.0008392333984375, "kl": 0.05781206511892378, "learning_rate": 1.895195261000831e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6200 }, { "completion_length": 55.925, "epoch": 1.242, "grad_norm": 0.000698089599609375, "kl": 0.018764377292245626, "learning_rate": 1.8867315353029937e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6210 }, { "completion_length": 60.3, "epoch": 1.244, "grad_norm": 0.000396728515625, "kl": 0.061903743352741, "learning_rate": 1.8782752820878636e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6220 }, { "completion_length": 60.525, "epoch": 1.246, "grad_norm": 0.000698089599609375, "kl": 0.06673049959354102, "learning_rate": 1.8698266043922159e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6230 }, { "completion_length": 53.775, "epoch": 1.248, "grad_norm": 0.0004520416259765625, "kl": 0.05146434986963868, "learning_rate": 1.8613856051605242e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6240 }, { "completion_length": 57.675, "epoch": 1.25, "grad_norm": 0.000431060791015625, "kl": 0.05393651574850082, "learning_rate": 1.852952387243698e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6250 }, { "completion_length": 44.85, "epoch": 1.252, "grad_norm": 0.000461578369140625, "kl": 0.13971609035506843, "learning_rate": 1.8445270533978387e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6260 }, { "completion_length": 50.125, "epoch": 1.254, "grad_norm": 0.000614166259765625, "kl": 0.050323914270848036, "learning_rate": 1.836109706282978e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6270 }, { "completion_length": 50.5, "epoch": 1.256, "grad_norm": 0.00054931640625, "kl": 0.13358841557055712, "learning_rate": 1.827700448461836e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6280 }, { "completion_length": 58.675, "epoch": 1.258, "grad_norm": 0.000518798828125, "kl": 0.4532184978015721, "learning_rate": 1.8192993823985643e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6290 }, { "completion_length": 61.5, "epoch": 1.26, "grad_norm": 0.000621795654296875, "kl": 2.259404849074781, "learning_rate": 1.8109066104575023e-06, "loss": 0.0002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6300 }, { "completion_length": 47.8, "epoch": 1.262, "grad_norm": 0.00080108642578125, "kl": 0.110232665669173, "learning_rate": 1.8025222349019273e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6310 }, { "completion_length": 46.3, "epoch": 1.264, "grad_norm": 0.000720977783203125, "kl": 0.0984095955034718, "learning_rate": 1.7941463578928088e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6320 }, { "completion_length": 57.525, "epoch": 1.266, "grad_norm": 17.375, "kl": 53.167960462137124, "learning_rate": 1.7857790814875665e-06, "loss": 0.0053, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 6330 }, { "completion_length": 56.15, "epoch": 1.268, "grad_norm": 0.002105712890625, "kl": 0.09631253816187382, "learning_rate": 1.7774205076388207e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6340 }, { "completion_length": 50.2, "epoch": 1.27, "grad_norm": 0.014404296875, "kl": 0.185893784603104, "learning_rate": 1.7690707381931585e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6350 }, { "completion_length": 57.775, "epoch": 1.272, "grad_norm": 0.0006256103515625, "kl": 0.032014391385018826, "learning_rate": 1.7607298748898844e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6360 }, { "completion_length": 72.225, "epoch": 1.274, "grad_norm": 0.0087890625, "kl": 0.053022891748696566, "learning_rate": 1.7523980193597837e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6370 }, { "completion_length": 61.275, "epoch": 1.276, "grad_norm": 0.0004520416259765625, "kl": 0.017128141969442366, "learning_rate": 1.744075273123889e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6380 }, { "completion_length": 66.125, "epoch": 1.278, "grad_norm": 0.00061798095703125, "kl": 0.07104477211833, "learning_rate": 1.735761737592236e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6390 }, { "completion_length": 61.725, "epoch": 1.28, "grad_norm": 0.00168609619140625, "kl": 0.022364417230710386, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6400 }, { "completion_length": 59.975, "epoch": 1.282, "grad_norm": 0.0005950927734375, "kl": 0.04414304066449404, "learning_rate": 1.7191627037194187e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6410 }, { "completion_length": 58.35, "epoch": 1.284, "grad_norm": 0.0004100799560546875, "kl": 71.30782471811399, "learning_rate": 1.7108774076322443e-06, "loss": 0.0071, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6420 }, { "completion_length": 54.925, "epoch": 1.286, "grad_norm": 25.625, "kl": 3.6482051144819705, "learning_rate": 1.702601726754825e-06, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6430 }, { "completion_length": 51.125, "epoch": 1.288, "grad_norm": 0.00084686279296875, "kl": 0.06689287801855244, "learning_rate": 1.6943357619237227e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6440 }, { "completion_length": 60.625, "epoch": 1.29, "grad_norm": 0.00052642822265625, "kl": 0.03276003615465015, "learning_rate": 1.686079613857109e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6450 }, { "completion_length": 49.8, "epoch": 1.292, "grad_norm": 0.004669189453125, "kl": 0.04734173566102982, "learning_rate": 1.677833383153542e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6460 }, { "completion_length": 50.925, "epoch": 1.294, "grad_norm": 0.00040435791015625, "kl": 0.08111863350495696, "learning_rate": 1.6695971702907425e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6470 }, { "completion_length": 51.975, "epoch": 1.296, "grad_norm": 0.000335693359375, "kl": 0.034373713890090585, "learning_rate": 1.661371075624363e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6480 }, { "completion_length": 56.55, "epoch": 1.298, "grad_norm": 0.0008544921875, "kl": 0.05773493410088122, "learning_rate": 1.6531551993867717e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6490 }, { "completion_length": 50.95, "epoch": 1.3, "grad_norm": 282.0, "kl": 178.04388241134583, "learning_rate": 1.6449496416858285e-06, "loss": 0.0178, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6500 }, { "completion_length": 58.625, "epoch": 1.302, "grad_norm": 0.0023651123046875, "kl": 5.505481028556824, "learning_rate": 1.6367545025036634e-06, "loss": 0.0006, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6510 }, { "completion_length": 67.575, "epoch": 1.304, "grad_norm": 0.001068115234375, "kl": 0.041122534591704604, "learning_rate": 1.6285698816954626e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6520 }, { "completion_length": 49.75, "epoch": 1.306, "grad_norm": 0.000415802001953125, "kl": 0.165414993558079, "learning_rate": 1.6203958789882457e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6530 }, { "completion_length": 56.675, "epoch": 1.308, "grad_norm": 0.0007476806640625, "kl": 0.2703967327717692, "learning_rate": 1.612232593979658e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6540 }, { "completion_length": 56.2, "epoch": 1.31, "grad_norm": 0.00133514404296875, "kl": 0.02644283170811832, "learning_rate": 1.6040801261367494e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6550 }, { "completion_length": 58.05, "epoch": 1.312, "grad_norm": 0.00030517578125, "kl": 0.015307861985638738, "learning_rate": 1.5959385747947697e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6560 }, { "completion_length": 73.35, "epoch": 1.314, "grad_norm": 0.0004405975341796875, "kl": 0.013954693730920554, "learning_rate": 1.5878080391559507e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6570 }, { "completion_length": 51.05, "epoch": 1.316, "grad_norm": 0.0003566741943359375, "kl": 684.7183584340382, "learning_rate": 1.5796886182883053e-06, "loss": 0.0685, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6580 }, { "completion_length": 67.15, "epoch": 1.318, "grad_norm": 0.000598907470703125, "kl": 0.024668072490021585, "learning_rate": 1.5715804111244138e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6590 }, { "completion_length": 66.8, "epoch": 1.32, "grad_norm": 0.00738525390625, "kl": 0.047915787994861604, "learning_rate": 1.56348351646022e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6600 }, { "completion_length": 61.85, "epoch": 1.322, "grad_norm": 0.000537872314453125, "kl": 0.05744472313672304, "learning_rate": 1.5553980329538326e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6610 }, { "completion_length": 57.5, "epoch": 1.324, "grad_norm": 0.0006561279296875, "kl": 0.05604997184127569, "learning_rate": 1.547324059124315e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6620 }, { "completion_length": 59.425, "epoch": 1.326, "grad_norm": 0.00112152099609375, "kl": 0.0188056749291718, "learning_rate": 1.539261693350491e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6630 }, { "completion_length": 40.9, "epoch": 1.328, "grad_norm": 0.00072479248046875, "kl": 0.1357155740261078, "learning_rate": 1.5312110338697427e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6640 }, { "completion_length": 61.5, "epoch": 1.33, "grad_norm": 0.000690460205078125, "kl": 0.018099735863506793, "learning_rate": 1.5231721787768162e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6650 }, { "completion_length": 54.125, "epoch": 1.332, "grad_norm": 0.000720977783203125, "kl": 0.014814224326983094, "learning_rate": 1.5151452260226224e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6660 }, { "completion_length": 64.325, "epoch": 1.334, "grad_norm": 0.00016117095947265625, "kl": 0.016494302544742823, "learning_rate": 1.5071302734130488e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6670 }, { "completion_length": 59.2, "epoch": 1.336, "grad_norm": 0.00194549560546875, "kl": 0.06070426572114229, "learning_rate": 1.4991274186077632e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6680 }, { "completion_length": 59.15, "epoch": 1.338, "grad_norm": 0.0006256103515625, "kl": 0.11375871314667166, "learning_rate": 1.491136759119025e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6690 }, { "completion_length": 52.65, "epoch": 1.34, "grad_norm": 0.000713348388671875, "kl": 0.03938477258197963, "learning_rate": 1.4831583923105e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6700 }, { "completion_length": 63.525, "epoch": 1.342, "grad_norm": 0.00083160400390625, "kl": 0.020069646975025536, "learning_rate": 1.4751924153960681e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6710 }, { "completion_length": 53.7, "epoch": 1.3439999999999999, "grad_norm": 0.06787109375, "kl": 0.14404951045289635, "learning_rate": 1.467238925438646e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6720 }, { "completion_length": 55.475, "epoch": 1.346, "grad_norm": 0.0015411376953125, "kl": 180.89379140562377, "learning_rate": 1.4592980193489975e-06, "loss": 0.0181, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6730 }, { "completion_length": 53.55, "epoch": 1.3479999999999999, "grad_norm": 0.00067138671875, "kl": 0.05597533159889281, "learning_rate": 1.4513697938845571e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6740 }, { "completion_length": 53.1, "epoch": 1.35, "grad_norm": 0.002777099609375, "kl": 0.3969091270118952, "learning_rate": 1.443454345648252e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 6750 }, { "completion_length": 43.8, "epoch": 1.3519999999999999, "grad_norm": 0.0003185272216796875, "kl": 0.10832225987687708, "learning_rate": 1.4355517710873184e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6760 }, { "completion_length": 68.725, "epoch": 1.354, "grad_norm": 0.0004730224609375, "kl": 0.03166971495375037, "learning_rate": 1.4276621664921358e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6770 }, { "completion_length": 63.625, "epoch": 1.3559999999999999, "grad_norm": 0.00077056884765625, "kl": 0.018040235806256532, "learning_rate": 1.419785627995044e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6780 }, { "completion_length": 56.7, "epoch": 1.358, "grad_norm": 0.0103759765625, "kl": 0.08500627786852419, "learning_rate": 1.4119222515691817e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6790 }, { "completion_length": 60.375, "epoch": 1.3599999999999999, "grad_norm": 0.000652313232421875, "kl": 0.13012904403731226, "learning_rate": 1.4040721330273063e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6800 }, { "completion_length": 51.225, "epoch": 1.362, "grad_norm": 0.00110626220703125, "kl": 0.019943116419017314, "learning_rate": 1.3962353680206372e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6810 }, { "completion_length": 62.1, "epoch": 1.3639999999999999, "grad_norm": 0.000614166259765625, "kl": 0.08246160177513957, "learning_rate": 1.388412052037682e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 6820 }, { "completion_length": 49.75, "epoch": 1.366, "grad_norm": 0.0008697509765625, "kl": 95.11083188317716, "learning_rate": 1.380602280403076e-06, "loss": 0.0095, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 6830 }, { "completion_length": 63.875, "epoch": 1.3679999999999999, "grad_norm": 0.000579833984375, "kl": 0.027737328410148622, "learning_rate": 1.3728061482764238e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6840 }, { "completion_length": 56.325, "epoch": 1.37, "grad_norm": 0.00054168701171875, "kl": 0.06817373894155025, "learning_rate": 1.3650237506511333e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6850 }, { "completion_length": 56.875, "epoch": 1.3719999999999999, "grad_norm": 0.000423431396484375, "kl": 0.014378735097125173, "learning_rate": 1.3572551823532654e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6860 }, { "completion_length": 58.425, "epoch": 1.374, "grad_norm": 0.00054168701171875, "kl": 0.43131620325148107, "learning_rate": 1.349500538040371e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 6870 }, { "completion_length": 55.575, "epoch": 1.376, "grad_norm": 0.00066375732421875, "kl": 0.033100543078035116, "learning_rate": 1.3417599122003464e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6880 }, { "completion_length": 54.85, "epoch": 1.3780000000000001, "grad_norm": 0.002105712890625, "kl": 0.04654085249640048, "learning_rate": 1.3340333991502723e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6890 }, { "completion_length": 68.475, "epoch": 1.38, "grad_norm": 0.0005035400390625, "kl": 0.05651907054707408, "learning_rate": 1.3263210930352737e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6900 }, { "completion_length": 61.55, "epoch": 1.3820000000000001, "grad_norm": 0.00054168701171875, "kl": 0.03381169466301799, "learning_rate": 1.3186230878273654e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6910 }, { "completion_length": 57.0, "epoch": 1.384, "grad_norm": 0.00049591064453125, "kl": 0.6785514406859875, "learning_rate": 1.3109394773243117e-06, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6920 }, { "completion_length": 61.8, "epoch": 1.3860000000000001, "grad_norm": 0.020751953125, "kl": 0.08321558614261448, "learning_rate": 1.3032703551484832e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6930 }, { "completion_length": 60.55, "epoch": 1.388, "grad_norm": 0.000690460205078125, "kl": 0.09076761337928474, "learning_rate": 1.2956158147457116e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6940 }, { "completion_length": 49.25, "epoch": 1.3900000000000001, "grad_norm": 0.000667572021484375, "kl": 0.04632122702896595, "learning_rate": 1.2879759493841577e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6950 }, { "completion_length": 66.75, "epoch": 1.392, "grad_norm": 0.000629425048828125, "kl": 3.6676836960949, "learning_rate": 1.280350852153168e-06, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 6960 }, { "completion_length": 58.6, "epoch": 1.3940000000000001, "grad_norm": 0.0003719329833984375, "kl": 0.07259991895407439, "learning_rate": 1.272740615962148e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6970 }, { "completion_length": 62.025, "epoch": 1.396, "grad_norm": 0.00069427490234375, "kl": 0.01472460343502462, "learning_rate": 1.2651453335394232e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6980 }, { "completion_length": 64.175, "epoch": 1.3980000000000001, "grad_norm": 0.000606536865234375, "kl": 0.18156763026490808, "learning_rate": 1.2575650974311118e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 6990 }, { "completion_length": 50.15, "epoch": 1.4, "grad_norm": 14.125, "kl": 10.663465712498873, "learning_rate": 1.2500000000000007e-06, "loss": 0.0011, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.15, "rewards/reward_func": -0.075, "step": 7000 }, { "completion_length": 62.375, "epoch": 1.4020000000000001, "grad_norm": 0.000507354736328125, "kl": 0.14125907123088838, "learning_rate": 1.2424501334244124e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7010 }, { "completion_length": 52.6, "epoch": 1.404, "grad_norm": 0.0018310546875, "kl": 0.13884197538718582, "learning_rate": 1.234915589697091e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7020 }, { "completion_length": 55.575, "epoch": 1.4060000000000001, "grad_norm": 0.0006103515625, "kl": 31778.019179227947, "learning_rate": 1.2273964606240718e-06, "loss": 3.1778, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7030 }, { "completion_length": 48.4, "epoch": 1.408, "grad_norm": 0.0009613037109375, "kl": 0.17425558338873087, "learning_rate": 1.2198928378235717e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7040 }, { "completion_length": 47.075, "epoch": 1.41, "grad_norm": 0.000476837158203125, "kl": 0.045156693411991, "learning_rate": 1.2124048127248644e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7050 }, { "completion_length": 62.375, "epoch": 1.412, "grad_norm": 0.000751495361328125, "kl": 0.031479455251246694, "learning_rate": 1.204932476567175e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7060 }, { "completion_length": 55.525, "epoch": 1.414, "grad_norm": 0.00058746337890625, "kl": 0.13692689267918468, "learning_rate": 1.19747592039856e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7070 }, { "completion_length": 47.2, "epoch": 1.416, "grad_norm": 0.000492095947265625, "kl": 0.028804597025737167, "learning_rate": 1.1900352350748026e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7080 }, { "completion_length": 59.075, "epoch": 1.418, "grad_norm": 0.0003509521484375, "kl": 0.04570387415587902, "learning_rate": 1.1826105112583061e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7090 }, { "completion_length": 56.05, "epoch": 1.42, "grad_norm": 0.0004558563232421875, "kl": 0.018374279094859957, "learning_rate": 1.1752018394169882e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7100 }, { "completion_length": 61.95, "epoch": 1.422, "grad_norm": 0.000583648681640625, "kl": 0.03715153355151415, "learning_rate": 1.1678093098231748e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7110 }, { "completion_length": 54.9, "epoch": 1.424, "grad_norm": 0.000518798828125, "kl": 0.016605707909911872, "learning_rate": 1.160433012552508e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7120 }, { "completion_length": 57.55, "epoch": 1.426, "grad_norm": 0.0004215240478515625, "kl": 0.018531074468046426, "learning_rate": 1.1530730374828422e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7130 }, { "completion_length": 58.0, "epoch": 1.428, "grad_norm": 0.000522613525390625, "kl": 0.06935790865682065, "learning_rate": 1.1457294742931508e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7140 }, { "completion_length": 52.5, "epoch": 1.43, "grad_norm": 0.0010833740234375, "kl": 0.09027541326358915, "learning_rate": 1.1384024124624324e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7150 }, { "completion_length": 68.775, "epoch": 1.432, "grad_norm": 0.00066375732421875, "kl": 0.017101448262110353, "learning_rate": 1.1310919412686248e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7160 }, { "completion_length": 66.075, "epoch": 1.434, "grad_norm": 0.0003528594970703125, "kl": 0.10510317548178136, "learning_rate": 1.1237981497875112e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 7170 }, { "completion_length": 58.75, "epoch": 1.436, "grad_norm": 0.0003719329833984375, "kl": 0.07255538417957723, "learning_rate": 1.11652112689164e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7180 }, { "completion_length": 55.575, "epoch": 1.438, "grad_norm": 0.000713348388671875, "kl": 0.04587976224720478, "learning_rate": 1.109260961249238e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7190 }, { "completion_length": 44.3, "epoch": 1.44, "grad_norm": 0.0007171630859375, "kl": 0.026211364893242717, "learning_rate": 1.1020177413231334e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7200 }, { "completion_length": 55.4, "epoch": 1.442, "grad_norm": 0.00011587142944335938, "kl": 1273.178384515643, "learning_rate": 1.0947915553696742e-06, "loss": 0.1273, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.15773502588272095, "rewards/reward_func": -0.1, "step": 7210 }, { "completion_length": 70.35, "epoch": 1.444, "grad_norm": 0.0002689361572265625, "kl": 0.30778478598222136, "learning_rate": 1.0875824914376555e-06, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 7220 }, { "completion_length": 51.35, "epoch": 1.446, "grad_norm": 0.000743865966796875, "kl": 0.11805587047711015, "learning_rate": 1.0803906373672477e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7230 }, { "completion_length": 54.1, "epoch": 1.448, "grad_norm": 0.00083160400390625, "kl": 0.13561045327223836, "learning_rate": 1.073216080788921e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7240 }, { "completion_length": 58.075, "epoch": 1.45, "grad_norm": 0.0007781982421875, "kl": 0.01598156727850437, "learning_rate": 1.0660589091223854e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7250 }, { "completion_length": 55.825, "epoch": 1.452, "grad_norm": 0.000614166259765625, "kl": 0.08759649377316236, "learning_rate": 1.0589192095755172e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7260 }, { "completion_length": 54.95, "epoch": 1.454, "grad_norm": 0.000942230224609375, "kl": 2.808669605664909, "learning_rate": 1.0517970691433035e-06, "loss": 0.0003, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7270 }, { "completion_length": 44.8, "epoch": 1.456, "grad_norm": 0.0010833740234375, "kl": 177.8802186036017, "learning_rate": 1.0446925746067768e-06, "loss": 0.0178, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 7280 }, { "completion_length": 55.0, "epoch": 1.458, "grad_norm": 0.001861572265625, "kl": 0.25618111025542023, "learning_rate": 1.0376058125319614e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7290 }, { "completion_length": 65.625, "epoch": 1.46, "grad_norm": 0.01287841796875, "kl": 0.1665965816937387, "learning_rate": 1.0305368692688175e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7300 }, { "completion_length": 56.25, "epoch": 1.462, "grad_norm": 0.00029754638671875, "kl": 0.4289894063025713, "learning_rate": 1.0234858309501864e-06, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7310 }, { "completion_length": 48.675, "epoch": 1.464, "grad_norm": 2.328125, "kl": 44.01882844008505, "learning_rate": 1.0164527834907468e-06, "loss": 0.0044, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7320 }, { "completion_length": 50.35, "epoch": 1.466, "grad_norm": 0.0024261474609375, "kl": 0.13080412773415445, "learning_rate": 1.0094378125859602e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7330 }, { "completion_length": 55.85, "epoch": 1.468, "grad_norm": 0.000461578369140625, "kl": 0.0169123521191068, "learning_rate": 1.0024410037110358e-06, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7340 }, { "completion_length": 49.05, "epoch": 1.47, "grad_norm": 0.00058746337890625, "kl": 0.13749618739821018, "learning_rate": 9.95462442119879e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7350 }, { "completion_length": 64.4, "epoch": 1.472, "grad_norm": 0.000614166259765625, "kl": 0.025387801649048924, "learning_rate": 9.88502212844063e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7360 }, { "completion_length": 66.95, "epoch": 1.474, "grad_norm": 0.00026702880859375, "kl": 0.03902003513649106, "learning_rate": 9.815604006917839e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7370 }, { "completion_length": 54.0, "epoch": 1.476, "grad_norm": 0.0159912109375, "kl": 0.08616708847694099, "learning_rate": 9.746370902468311e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7380 }, { "completion_length": 61.025, "epoch": 1.478, "grad_norm": 0.00043487548828125, "kl": 0.027717783488333224, "learning_rate": 9.677323658675594e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7390 }, { "completion_length": 47.25, "epoch": 1.48, "grad_norm": 0.00274658203125, "kl": 0.09193211463280022, "learning_rate": 9.608463116858544e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7400 }, { "completion_length": 61.15, "epoch": 1.482, "grad_norm": 0.00194549560546875, "kl": 0.09381414433009923, "learning_rate": 9.53979011606115e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7410 }, { "completion_length": 55.875, "epoch": 1.484, "grad_norm": 0.000873565673828125, "kl": 0.06577477985993027, "learning_rate": 9.471305493042243e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7420 }, { "completion_length": 55.325, "epoch": 1.486, "grad_norm": 0.0006866455078125, "kl": 0.05357563262805343, "learning_rate": 9.403010082265351e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7430 }, { "completion_length": 55.225, "epoch": 1.488, "grad_norm": 0.000606536865234375, "kl": 0.043947093037422745, "learning_rate": 9.334904715888496e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7440 }, { "completion_length": 47.125, "epoch": 1.49, "grad_norm": 0.0005035400390625, "kl": 0.020906020514667036, "learning_rate": 9.266990223754069e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7450 }, { "completion_length": 75.375, "epoch": 1.492, "grad_norm": 0.0007476806640625, "kl": 0.0432497413828969, "learning_rate": 9.199267433378728e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7460 }, { "completion_length": 66.75, "epoch": 1.494, "grad_norm": 0.000682830810546875, "kl": 0.01475386363454163, "learning_rate": 9.131737169943314e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7470 }, { "completion_length": 53.25, "epoch": 1.496, "grad_norm": 0.000530242919921875, "kl": 0.034680284932255744, "learning_rate": 9.064400256282757e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7480 }, { "completion_length": 66.5, "epoch": 1.498, "grad_norm": 0.0012664794921875, "kl": 0.020779677666723728, "learning_rate": 8.99725751287611e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7490 }, { "completion_length": 69.5, "epoch": 1.5, "grad_norm": 35.5, "kl": 10.81919735018164, "learning_rate": 8.930309757836517e-07, "loss": 0.0011, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 7500 }, { "completion_length": 63.825, "epoch": 1.502, "grad_norm": 0.000682830810546875, "kl": 0.018158415833022447, "learning_rate": 8.863557806901233e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7510 }, { "completion_length": 46.225, "epoch": 1.504, "grad_norm": 0.0003185272216796875, "kl": 0.04219387628836557, "learning_rate": 8.797002473421729e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7520 }, { "completion_length": 70.0, "epoch": 1.506, "grad_norm": 0.00041961669921875, "kl": 0.03907957626506686, "learning_rate": 8.73064456835373e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7530 }, { "completion_length": 60.8, "epoch": 1.508, "grad_norm": 0.000659942626953125, "kl": 152.24121750062332, "learning_rate": 8.664484900247363e-07, "loss": 0.0152, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.0, "rewards/reward_func": -0.1, "step": 7540 }, { "completion_length": 46.375, "epoch": 1.51, "grad_norm": 0.001556396484375, "kl": 0.2765787610784173, "learning_rate": 8.598524275237321e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7550 }, { "completion_length": 68.675, "epoch": 1.512, "grad_norm": 0.0027008056640625, "kl": 0.028881799709051848, "learning_rate": 8.532763497032987e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7560 }, { "completion_length": 63.8, "epoch": 1.514, "grad_norm": 0.000469207763671875, "kl": 0.04086120091378689, "learning_rate": 8.467203366908708e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7570 }, { "completion_length": 49.95, "epoch": 1.516, "grad_norm": 0.0037689208984375, "kl": 0.12793728783726693, "learning_rate": 8.40184468369396e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7580 }, { "completion_length": 53.5, "epoch": 1.518, "grad_norm": 0.00121307373046875, "kl": 42.36696035126224, "learning_rate": 8.336688243763691e-07, "loss": 0.0042, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 7590 }, { "completion_length": 47.325, "epoch": 1.52, "grad_norm": 0.0006103515625, "kl": 14.073435558238998, "learning_rate": 8.271734841028553e-07, "loss": 0.0014, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 7600 }, { "completion_length": 65.0, "epoch": 1.522, "grad_norm": 0.000720977783203125, "kl": 0.04525826433673501, "learning_rate": 8.206985266925249e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7610 }, { "completion_length": 59.525, "epoch": 1.524, "grad_norm": 0.000843048095703125, "kl": 0.04048813302069902, "learning_rate": 8.142440310406923e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7620 }, { "completion_length": 46.45, "epoch": 1.526, "grad_norm": 0.0004329681396484375, "kl": 8.161725069396198, "learning_rate": 8.078100757933486e-07, "loss": 0.0008, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7630 }, { "completion_length": 57.225, "epoch": 1.528, "grad_norm": 0.000698089599609375, "kl": 0.1071649724675808, "learning_rate": 8.013967393462094e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7640 }, { "completion_length": 52.975, "epoch": 1.53, "grad_norm": 0.000431060791015625, "kl": 0.12129491865634918, "learning_rate": 7.950040998437541e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7650 }, { "completion_length": 43.225, "epoch": 1.532, "grad_norm": 0.0026702880859375, "kl": 0.1067592917010188, "learning_rate": 7.886322351782782e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7660 }, { "completion_length": 59.9, "epoch": 1.534, "grad_norm": 0.00093841552734375, "kl": 0.05229797107167542, "learning_rate": 7.822812229889429e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7670 }, { "completion_length": 65.525, "epoch": 1.536, "grad_norm": 0.0004405975341796875, "kl": 0.010464739426970482, "learning_rate": 7.759511406608255e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7680 }, { "completion_length": 66.625, "epoch": 1.538, "grad_norm": 0.0004825592041015625, "kl": 0.045739847654476765, "learning_rate": 7.696420653239834e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7690 }, { "completion_length": 73.575, "epoch": 1.54, "grad_norm": 0.00058746337890625, "kl": 0.031135138869285584, "learning_rate": 7.633540738525066e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7700 }, { "completion_length": 48.175, "epoch": 1.542, "grad_norm": 51.25, "kl": 80.07849281346425, "learning_rate": 7.57087242863589e-07, "loss": 0.008, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7710 }, { "completion_length": 46.375, "epoch": 1.544, "grad_norm": 0.00138092041015625, "kl": 0.0285742097068578, "learning_rate": 7.508416487165862e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7720 }, { "completion_length": 57.2, "epoch": 1.546, "grad_norm": 0.000492095947265625, "kl": 0.07636187486350536, "learning_rate": 7.44617367512094e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7730 }, { "completion_length": 56.15, "epoch": 1.548, "grad_norm": 0.00045013427734375, "kl": 0.03193683328572661, "learning_rate": 7.384144750910133e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7740 }, { "completion_length": 59.65, "epoch": 1.55, "grad_norm": 0.000370025634765625, "kl": 0.09212675780290738, "learning_rate": 7.322330470336314e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7750 }, { "completion_length": 53.225, "epoch": 1.552, "grad_norm": 0.000499725341796875, "kl": 0.0655658102594316, "learning_rate": 7.260731586586983e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7760 }, { "completion_length": 50.3, "epoch": 1.554, "grad_norm": 0.00186920166015625, "kl": 0.07870109416544438, "learning_rate": 7.199348850225091e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7770 }, { "completion_length": 53.8, "epoch": 1.556, "grad_norm": 0.003204345703125, "kl": 146.1276578912046, "learning_rate": 7.138183009179922e-07, "loss": 0.0146, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7780 }, { "completion_length": 69.675, "epoch": 1.558, "grad_norm": 0.0005645751953125, "kl": 0.028883875254541634, "learning_rate": 7.077234808737932e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7790 }, { "completion_length": 67.7, "epoch": 1.56, "grad_norm": 0.00102996826171875, "kl": 0.0383193613961339, "learning_rate": 7.016504991533727e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7800 }, { "completion_length": 59.3, "epoch": 1.562, "grad_norm": 0.001800537109375, "kl": 0.07388523239642382, "learning_rate": 6.955994297540947e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7810 }, { "completion_length": 56.75, "epoch": 1.564, "grad_norm": 0.000614166259765625, "kl": 0.08295210748910904, "learning_rate": 6.895703464063319e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7820 }, { "completion_length": 60.15, "epoch": 1.5659999999999998, "grad_norm": 0.00628662109375, "kl": 0.03666973649524152, "learning_rate": 6.835633225725604e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7830 }, { "completion_length": 67.05, "epoch": 1.568, "grad_norm": 0.000774383544921875, "kl": 0.017159267235547303, "learning_rate": 6.775784314464717e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7840 }, { "completion_length": 63.4, "epoch": 1.5699999999999998, "grad_norm": 0.00055694580078125, "kl": 0.01824809005483985, "learning_rate": 6.716157459520739e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7850 }, { "completion_length": 54.525, "epoch": 1.572, "grad_norm": 0.0005035400390625, "kl": 0.03218274647369981, "learning_rate": 6.656753387428089e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7860 }, { "completion_length": 70.525, "epoch": 1.5739999999999998, "grad_norm": 0.000705718994140625, "kl": 0.0729364191647619, "learning_rate": 6.597572822006643e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7870 }, { "completion_length": 69.875, "epoch": 1.576, "grad_norm": 0.0003833770751953125, "kl": 0.07040122235193849, "learning_rate": 6.538616484352902e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7880 }, { "completion_length": 59.375, "epoch": 1.5779999999999998, "grad_norm": 0.001129150390625, "kl": 0.04789869613014162, "learning_rate": 6.479885092831251e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7890 }, { "completion_length": 56.575, "epoch": 1.58, "grad_norm": 0.00049591064453125, "kl": 0.05556117547675967, "learning_rate": 6.421379363065142e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7900 }, { "completion_length": 50.85, "epoch": 1.5819999999999999, "grad_norm": 0.000957489013671875, "kl": 0.07133134175091982, "learning_rate": 6.363100007928447e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7910 }, { "completion_length": 74.025, "epoch": 1.584, "grad_norm": 0.0003108978271484375, "kl": 0.10760618806816638, "learning_rate": 6.305047737536707e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7920 }, { "completion_length": 56.6, "epoch": 1.5859999999999999, "grad_norm": 0.00092315673828125, "kl": 2.8826652359217406, "learning_rate": 6.247223259238511e-07, "loss": 0.0003, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 7930 }, { "completion_length": 55.475, "epoch": 1.588, "grad_norm": 0.0006103515625, "kl": 2.284485016670078, "learning_rate": 6.189627277606894e-07, "loss": 0.0002, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7940 }, { "completion_length": 46.125, "epoch": 1.5899999999999999, "grad_norm": 0.000972747802734375, "kl": 0.12067738296464085, "learning_rate": 6.1322604944307e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7950 }, { "completion_length": 59.225, "epoch": 1.592, "grad_norm": 0.00066375732421875, "kl": 0.14497559778392316, "learning_rate": 6.075123608706093e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7960 }, { "completion_length": 60.925, "epoch": 1.5939999999999999, "grad_norm": 0.0024566650390625, "kl": 258.11860719914546, "learning_rate": 6.01821731662798e-07, "loss": 0.0258, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 7970 }, { "completion_length": 62.225, "epoch": 1.596, "grad_norm": 0.00153350830078125, "kl": 0.024051298201084138, "learning_rate": 5.961542311581586e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7980 }, { "completion_length": 50.25, "epoch": 1.5979999999999999, "grad_norm": 0.0004482269287109375, "kl": 0.10747648775577545, "learning_rate": 5.905099284133953e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 7990 }, { "completion_length": 52.1, "epoch": 1.6, "grad_norm": 0.00038909912109375, "kl": 4.900969664240256, "learning_rate": 5.848888922025553e-07, "loss": 0.0005, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 8000 }, { "completion_length": 46.85, "epoch": 1.6019999999999999, "grad_norm": 0.000553131103515625, "kl": 0.06989260124973953, "learning_rate": 5.792911910161922e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8010 }, { "completion_length": 64.45, "epoch": 1.604, "grad_norm": 0.0006561279296875, "kl": 0.054861510870978236, "learning_rate": 5.737168930605272e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8020 }, { "completion_length": 69.5, "epoch": 1.6059999999999999, "grad_norm": 0.0003757476806640625, "kl": 0.01320057879202068, "learning_rate": 5.681660662566225e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8030 }, { "completion_length": 48.75, "epoch": 1.608, "grad_norm": 0.000743865966796875, "kl": 0.5451168741099537, "learning_rate": 5.626387782395512e-07, "loss": 0.0001, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.15, "rewards/reward_func": -0.075, "step": 8040 }, { "completion_length": 57.45, "epoch": 1.6099999999999999, "grad_norm": 0.0004177093505859375, "kl": 0.06206353167071939, "learning_rate": 5.571350963575728e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8050 }, { "completion_length": 54.175, "epoch": 1.612, "grad_norm": 0.000614166259765625, "kl": 2.013091558404267, "learning_rate": 5.516550876713142e-07, "loss": 0.0002, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 8060 }, { "completion_length": 61.025, "epoch": 1.6139999999999999, "grad_norm": 0.000652313232421875, "kl": 0.06326800542883575, "learning_rate": 5.461988189529529e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8070 }, { "completion_length": 58.825, "epoch": 1.616, "grad_norm": 0.00104522705078125, "kl": 0.014891783054918051, "learning_rate": 5.407663566854008e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8080 }, { "completion_length": 54.875, "epoch": 1.6179999999999999, "grad_norm": 0.0020904541015625, "kl": 0.025240180967375635, "learning_rate": 5.353577670614951e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8090 }, { "completion_length": 61.5, "epoch": 1.62, "grad_norm": 0.000614166259765625, "kl": 0.04227957231923938, "learning_rate": 5.299731159831953e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8100 }, { "completion_length": 65.5, "epoch": 1.6219999999999999, "grad_norm": 0.0004177093505859375, "kl": 0.19428066378459335, "learning_rate": 5.24612469060774e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8110 }, { "completion_length": 53.05, "epoch": 1.624, "grad_norm": 0.0010986328125, "kl": 0.042151403008028866, "learning_rate": 5.192758916120236e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8120 }, { "completion_length": 55.55, "epoch": 1.626, "grad_norm": 0.00029754638671875, "kl": 0.02924617677927017, "learning_rate": 5.139634486614544e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8130 }, { "completion_length": 76.4, "epoch": 1.6280000000000001, "grad_norm": 0.01080322265625, "kl": 0.042456808709539474, "learning_rate": 5.086752049395094e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8140 }, { "completion_length": 60.325, "epoch": 1.63, "grad_norm": 0.01495361328125, "kl": 0.025815209513530134, "learning_rate": 5.034112248817685e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8150 }, { "completion_length": 71.75, "epoch": 1.6320000000000001, "grad_norm": 0.000865936279296875, "kl": 0.0641026332974434, "learning_rate": 4.981715726281666e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8160 }, { "completion_length": 57.4, "epoch": 1.634, "grad_norm": 0.000530242919921875, "kl": 0.045072671584784986, "learning_rate": 4.929563120222142e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8170 }, { "completion_length": 52.075, "epoch": 1.6360000000000001, "grad_norm": 0.00011157989501953125, "kl": 0.1052944268565625, "learning_rate": 4.87765506610215e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8180 }, { "completion_length": 54.375, "epoch": 1.638, "grad_norm": 0.0003986358642578125, "kl": 0.03237830828875303, "learning_rate": 4.825992196404958e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8190 }, { "completion_length": 56.3, "epoch": 1.6400000000000001, "grad_norm": 0.000835418701171875, "kl": 0.01498257415369153, "learning_rate": 4.774575140626317e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8200 }, { "completion_length": 66.55, "epoch": 1.642, "grad_norm": 0.0003414154052734375, "kl": 0.011996694607660174, "learning_rate": 4.7234045252668393e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8210 }, { "completion_length": 53.05, "epoch": 1.6440000000000001, "grad_norm": 0.0003986358642578125, "kl": 0.1085278536658734, "learning_rate": 4.672480973824312e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8220 }, { "completion_length": 58.675, "epoch": 1.646, "grad_norm": 0.00080108642578125, "kl": 0.04904728039400652, "learning_rate": 4.6218051067861423e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8230 }, { "completion_length": 50.825, "epoch": 1.6480000000000001, "grad_norm": 0.000759124755859375, "kl": 0.014587640948593616, "learning_rate": 4.5713775416217884e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8240 }, { "completion_length": 45.15, "epoch": 1.65, "grad_norm": 0.0003795623779296875, "kl": 0.09401618214324117, "learning_rate": 4.5211988927752026e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8250 }, { "completion_length": 64.925, "epoch": 1.6520000000000001, "grad_norm": 0.0026702880859375, "kl": 0.026571149285882712, "learning_rate": 4.4712697716573994e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8260 }, { "completion_length": 68.8, "epoch": 1.654, "grad_norm": 0.000606536865234375, "kl": 0.09327265082392841, "learning_rate": 4.421590786638952e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8270 }, { "completion_length": 58.775, "epoch": 1.6560000000000001, "grad_norm": 0.000858306884765625, "kl": 0.06238628029823303, "learning_rate": 4.372162543042624e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8280 }, { "completion_length": 56.575, "epoch": 1.658, "grad_norm": 0.00030517578125, "kl": 0.06320808534510433, "learning_rate": 4.3229856431359516e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8290 }, { "completion_length": 59.625, "epoch": 1.6600000000000001, "grad_norm": 0.000576019287109375, "kl": 0.06072661457583308, "learning_rate": 4.27406068612396e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8300 }, { "completion_length": 54.5, "epoch": 1.662, "grad_norm": 0.000698089599609375, "kl": 0.0206127600511536, "learning_rate": 4.225388268141797e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8310 }, { "completion_length": 64.5, "epoch": 1.6640000000000001, "grad_norm": 0.000766754150390625, "kl": 1.2397997039370239, "learning_rate": 4.1769689822475147e-07, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8320 }, { "completion_length": 63.85, "epoch": 1.666, "grad_norm": 0.00113677978515625, "kl": 0.04746299120597541, "learning_rate": 4.12880341841484e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8330 }, { "completion_length": 65.3, "epoch": 1.6680000000000001, "grad_norm": 0.000629425048828125, "kl": 0.07974740182980895, "learning_rate": 4.0808921635259595e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8340 }, { "completion_length": 54.1, "epoch": 1.67, "grad_norm": 0.000759124755859375, "kl": 350.0721945284866, "learning_rate": 4.033235801364402e-07, "loss": 0.035, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 8350 }, { "completion_length": 50.45, "epoch": 1.6720000000000002, "grad_norm": 0.00054168701171875, "kl": 7.292029631882906, "learning_rate": 3.9858349126078945e-07, "loss": 0.0007, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 8360 }, { "completion_length": 58.775, "epoch": 1.674, "grad_norm": 0.012939453125, "kl": 0.07852717223577202, "learning_rate": 3.938690074821314e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8370 }, { "completion_length": 54.425, "epoch": 1.6760000000000002, "grad_norm": 0.00070953369140625, "kl": 0.01418459378182888, "learning_rate": 3.891801862449629e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8380 }, { "completion_length": 50.175, "epoch": 1.678, "grad_norm": 0.000560760498046875, "kl": 0.03785524540580809, "learning_rate": 3.8451708468109026e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8390 }, { "completion_length": 43.075, "epoch": 1.6800000000000002, "grad_norm": 0.00136566162109375, "kl": 0.08654712834395469, "learning_rate": 3.798797596089351e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8400 }, { "completion_length": 59.775, "epoch": 1.682, "grad_norm": 0.0005950927734375, "kl": 0.2558928931131959, "learning_rate": 3.7526826753284065e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8410 }, { "completion_length": 58.375, "epoch": 1.6840000000000002, "grad_norm": 0.00067901611328125, "kl": 1.3383296761894599, "learning_rate": 3.7068266464238085e-07, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8420 }, { "completion_length": 64.6, "epoch": 1.686, "grad_norm": 0.0003833770751953125, "kl": 0.0377775629516691, "learning_rate": 3.661230068116811e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8430 }, { "completion_length": 58.925, "epoch": 1.688, "grad_norm": 0.00092315673828125, "kl": 0.05282154800370335, "learning_rate": 3.615893495987335e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8440 }, { "completion_length": 61.375, "epoch": 1.69, "grad_norm": 0.00079345703125, "kl": 104.9448153554462, "learning_rate": 3.5708174824471947e-07, "loss": 0.0105, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 8450 }, { "completion_length": 66.675, "epoch": 1.692, "grad_norm": 0.0004119873046875, "kl": 0.040050674229860306, "learning_rate": 3.5260025767333894e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8460 }, { "completion_length": 69.475, "epoch": 1.694, "grad_norm": 0.000682830810546875, "kl": 0.020584713015705348, "learning_rate": 3.481449324901412e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8470 }, { "completion_length": 66.525, "epoch": 1.696, "grad_norm": 0.0002193450927734375, "kl": 0.05711883215699345, "learning_rate": 3.4371582698185636e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8480 }, { "completion_length": 68.075, "epoch": 1.698, "grad_norm": 0.0002460479736328125, "kl": 0.05435404470190406, "learning_rate": 3.393129951157384e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8490 }, { "completion_length": 63.9, "epoch": 1.7, "grad_norm": 0.00174713134765625, "kl": 0.06182208526879549, "learning_rate": 3.3493649053890325e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8500 }, { "completion_length": 68.7, "epoch": 1.702, "grad_norm": 167.0, "kl": 88.8611083610449, "learning_rate": 3.3058636657767927e-07, "loss": 0.0089, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8510 }, { "completion_length": 76.675, "epoch": 1.704, "grad_norm": 0.0038909912109375, "kl": 0.05937002245336771, "learning_rate": 3.262626762369525e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8520 }, { "completion_length": 57.825, "epoch": 1.706, "grad_norm": 0.000690460205078125, "kl": 0.05280606346204877, "learning_rate": 3.219654721995266e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8530 }, { "completion_length": 48.85, "epoch": 1.708, "grad_norm": 0.000560760498046875, "kl": 188.36169426795095, "learning_rate": 3.176948068254762e-07, "loss": 0.0188, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8540 }, { "completion_length": 64.1, "epoch": 1.71, "grad_norm": 0.0003757476806640625, "kl": 0.01651700264774263, "learning_rate": 3.134507321515107e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8550 }, { "completion_length": 43.95, "epoch": 1.712, "grad_norm": 0.0003681182861328125, "kl": 5.833402361674234, "learning_rate": 3.092332998903416e-07, "loss": 0.0006, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 8560 }, { "completion_length": 60.85, "epoch": 1.714, "grad_norm": 0.005889892578125, "kl": 0.057405439857393506, "learning_rate": 3.050425614300487e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8570 }, { "completion_length": 70.15, "epoch": 1.716, "grad_norm": 0.000476837158203125, "kl": 0.01461967695504427, "learning_rate": 3.0087856783345916e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8580 }, { "completion_length": 48.075, "epoch": 1.718, "grad_norm": 0.000438690185546875, "kl": 0.0398553837556392, "learning_rate": 2.967413698375196e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8590 }, { "completion_length": 57.75, "epoch": 1.72, "grad_norm": 0.000518798828125, "kl": 0.02238648202328477, "learning_rate": 2.9263101785268253e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8600 }, { "completion_length": 66.25, "epoch": 1.722, "grad_norm": 0.0015716552734375, "kl": 0.04364732797257602, "learning_rate": 2.8854756196229017e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8610 }, { "completion_length": 43.7, "epoch": 1.724, "grad_norm": 0.0003299713134765625, "kl": 0.11783089116215706, "learning_rate": 2.844910519219632e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8620 }, { "completion_length": 49.45, "epoch": 1.726, "grad_norm": 0.000640869140625, "kl": 0.13984102117829025, "learning_rate": 2.8046153715899695e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8630 }, { "completion_length": 48.175, "epoch": 1.728, "grad_norm": 0.00054931640625, "kl": 0.40176068069413307, "learning_rate": 2.764590667717562e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8640 }, { "completion_length": 63.95, "epoch": 1.73, "grad_norm": 0.0004711151123046875, "kl": 0.8224213434383273, "learning_rate": 2.7248368952908055e-07, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8650 }, { "completion_length": 44.175, "epoch": 1.732, "grad_norm": 0.000644683837890625, "kl": 0.04362150589004159, "learning_rate": 2.6853545386968607e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8660 }, { "completion_length": 67.65, "epoch": 1.734, "grad_norm": 0.000659942626953125, "kl": 0.048378444463014605, "learning_rate": 2.6461440790157974e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8670 }, { "completion_length": 61.075, "epoch": 1.736, "grad_norm": 0.0022430419921875, "kl": 0.019372216332703827, "learning_rate": 2.6072059940146775e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8680 }, { "completion_length": 44.075, "epoch": 1.738, "grad_norm": 0.0004119873046875, "kl": 8.146503202756866, "learning_rate": 2.568540758141791e-07, "loss": 0.0008, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8690 }, { "completion_length": 53.575, "epoch": 1.74, "grad_norm": 0.0004138946533203125, "kl": 49.64364205431193, "learning_rate": 2.53014884252083e-07, "loss": 0.005, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 8700 }, { "completion_length": 65.325, "epoch": 1.742, "grad_norm": 0.000579833984375, "kl": 0.4298483125632629, "learning_rate": 2.492030714945162e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8710 }, { "completion_length": 52.625, "epoch": 1.744, "grad_norm": 0.0016326904296875, "kl": 0.07851723725907504, "learning_rate": 2.454186839872158e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8720 }, { "completion_length": 62.65, "epoch": 1.746, "grad_norm": 0.001373291015625, "kl": 23.3441758136265, "learning_rate": 2.4166176784174795e-07, "loss": 0.0023, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8730 }, { "completion_length": 74.575, "epoch": 1.748, "grad_norm": 0.000637054443359375, "kl": 11.16068452913314, "learning_rate": 2.3793236883495164e-07, "loss": 0.0011, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8740 }, { "completion_length": 49.475, "epoch": 1.75, "grad_norm": 0.000820159912109375, "kl": 0.075881730299443, "learning_rate": 2.3423053240837518e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8750 }, { "completion_length": 56.025, "epoch": 1.752, "grad_norm": 0.000476837158203125, "kl": 0.01870635347440839, "learning_rate": 2.3055630366772857e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8760 }, { "completion_length": 47.775, "epoch": 1.754, "grad_norm": 0.000675201416015625, "kl": 0.07872601179406047, "learning_rate": 2.269097273823287e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8770 }, { "completion_length": 55.075, "epoch": 1.756, "grad_norm": 0.002685546875, "kl": 2.795431226864457, "learning_rate": 2.2329084798455747e-07, "loss": 0.0003, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8780 }, { "completion_length": 57.725, "epoch": 1.758, "grad_norm": 0.0003795623779296875, "kl": 0.03155038901604712, "learning_rate": 2.1969970956931762e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8790 }, { "completion_length": 56.35, "epoch": 1.76, "grad_norm": 0.000591278076171875, "kl": 48.86689073387534, "learning_rate": 2.1613635589349756e-07, "loss": 0.0049, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8800 }, { "completion_length": 52.05, "epoch": 1.762, "grad_norm": 0.00151824951171875, "kl": 0.05452495804056525, "learning_rate": 2.1260083037543817e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8810 }, { "completion_length": 47.3, "epoch": 1.764, "grad_norm": 0.0004444122314453125, "kl": 312.406746559497, "learning_rate": 2.0909317609440093e-07, "loss": 0.0312, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8820 }, { "completion_length": 52.475, "epoch": 1.766, "grad_norm": 0.0005950927734375, "kl": 0.19929129825904965, "learning_rate": 2.0561343579004716e-07, "loss": 0.0, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 8830 }, { "completion_length": 74.525, "epoch": 1.768, "grad_norm": 0.0003871917724609375, "kl": 0.022377661243081094, "learning_rate": 2.0216165186191406e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8840 }, { "completion_length": 47.8, "epoch": 1.77, "grad_norm": 0.00055694580078125, "kl": 0.018075392534956335, "learning_rate": 1.9873786636889908e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8850 }, { "completion_length": 66.875, "epoch": 1.772, "grad_norm": 0.00037384033203125, "kl": 0.0537069259211421, "learning_rate": 1.95342121028749e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8860 }, { "completion_length": 50.35, "epoch": 1.774, "grad_norm": 0.000408172607421875, "kl": 0.16273712795227765, "learning_rate": 1.9197445721754777e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8870 }, { "completion_length": 41.1, "epoch": 1.776, "grad_norm": 0.00162506103515625, "kl": 0.0835498913191259, "learning_rate": 1.8863491596921745e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8880 }, { "completion_length": 56.375, "epoch": 1.778, "grad_norm": 0.0021514892578125, "kl": 3.5430075244046746, "learning_rate": 1.8532353797501318e-07, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8890 }, { "completion_length": 49.0, "epoch": 1.78, "grad_norm": 0.00153350830078125, "kl": 0.09430858921259641, "learning_rate": 1.8204036358303173e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8900 }, { "completion_length": 56.175, "epoch": 1.782, "grad_norm": 0.00121307373046875, "kl": 0.04068310302682221, "learning_rate": 1.787854327977162e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8910 }, { "completion_length": 59.075, "epoch": 1.784, "grad_norm": 0.0003814697265625, "kl": 0.04385726461187005, "learning_rate": 1.7555878527937164e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8920 }, { "completion_length": 55.05, "epoch": 1.786, "grad_norm": 0.0030517578125, "kl": 0.018657160410657524, "learning_rate": 1.7236046034367959e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8930 }, { "completion_length": 47.975, "epoch": 1.788, "grad_norm": 0.0004711151123046875, "kl": 1.90866837259382, "learning_rate": 1.6919049696121957e-07, "loss": 0.0002, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8940 }, { "completion_length": 61.925, "epoch": 1.79, "grad_norm": 0.0030517578125, "kl": 0.7081083978526295, "learning_rate": 1.6604893375699594e-07, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8950 }, { "completion_length": 59.725, "epoch": 1.792, "grad_norm": 0.0004444122314453125, "kl": 4.325691572204232, "learning_rate": 1.629358090099639e-07, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8960 }, { "completion_length": 54.6, "epoch": 1.794, "grad_norm": 0.0009918212890625, "kl": 0.11546620442532003, "learning_rate": 1.5985116065256683e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8970 }, { "completion_length": 58.7, "epoch": 1.796, "grad_norm": 0.000591278076171875, "kl": 1.2548286508535966, "learning_rate": 1.567950262702714e-07, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 8980 }, { "completion_length": 56.925, "epoch": 1.798, "grad_norm": 0.0003376007080078125, "kl": 0.07534236300271005, "learning_rate": 1.5376744310111019e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 8990 }, { "completion_length": 59.7, "epoch": 1.8, "grad_norm": 0.000576019287109375, "kl": 79.93720495556481, "learning_rate": 1.507684480352292e-07, "loss": 0.008, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 9000 }, { "completion_length": 47.55, "epoch": 1.802, "grad_norm": 0.000606536865234375, "kl": 0.07122775209136308, "learning_rate": 1.4779807761443638e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9010 }, { "completion_length": 56.475, "epoch": 1.804, "grad_norm": 0.0019378662109375, "kl": 31.246724256686868, "learning_rate": 1.4485636803175828e-07, "loss": 0.0031, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9020 }, { "completion_length": 56.8, "epoch": 1.806, "grad_norm": 0.0007476806640625, "kl": 0.01624395214021206, "learning_rate": 1.419433551309976e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9030 }, { "completion_length": 52.925, "epoch": 1.808, "grad_norm": 0.00052642822265625, "kl": 0.03512433131691069, "learning_rate": 1.3905907440629752e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9040 }, { "completion_length": 63.475, "epoch": 1.81, "grad_norm": 0.0008087158203125, "kl": 0.05812466649804264, "learning_rate": 1.362035610017079e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9050 }, { "completion_length": 67.2, "epoch": 1.812, "grad_norm": 0.0002841949462890625, "kl": 0.053207884868606926, "learning_rate": 1.3337684971075932e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9060 }, { "completion_length": 63.025, "epoch": 1.814, "grad_norm": 0.00083160400390625, "kl": 0.017276625451631843, "learning_rate": 1.305789749760361e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9070 }, { "completion_length": 53.85, "epoch": 1.8159999999999998, "grad_norm": 0.0014495849609375, "kl": 0.1759139670059085, "learning_rate": 1.278099708887587e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9080 }, { "completion_length": 49.575, "epoch": 1.818, "grad_norm": 0.00054168701171875, "kl": 0.06552611859515309, "learning_rate": 1.2506987118836912e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9090 }, { "completion_length": 57.775, "epoch": 1.8199999999999998, "grad_norm": 0.0005035400390625, "kl": 0.14279152313247323, "learning_rate": 1.223587092621162e-07, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9100 }, { "completion_length": 66.725, "epoch": 1.822, "grad_norm": 0.000827789306640625, "kl": 26.51692173536867, "learning_rate": 1.1967651814465353e-07, "loss": 0.0027, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9110 }, { "completion_length": 45.675, "epoch": 1.8239999999999998, "grad_norm": 24.0, "kl": 21.41680323826149, "learning_rate": 1.1702333051763271e-07, "loss": 0.0021, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 9120 }, { "completion_length": 50.95, "epoch": 1.826, "grad_norm": 0.0005035400390625, "kl": 0.031037054676562547, "learning_rate": 1.1439917870930795e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9130 }, { "completion_length": 49.925, "epoch": 1.8279999999999998, "grad_norm": 0.000782012939453125, "kl": 0.06788429841399193, "learning_rate": 1.1180409469414094e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9140 }, { "completion_length": 54.575, "epoch": 1.83, "grad_norm": 0.00067901611328125, "kl": 0.02977508623152971, "learning_rate": 1.0923811009241142e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9150 }, { "completion_length": 62.55, "epoch": 1.8319999999999999, "grad_norm": 0.005157470703125, "kl": 0.03778183825779706, "learning_rate": 1.067012561698319e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9160 }, { "completion_length": 55.6, "epoch": 1.834, "grad_norm": 0.0003528594970703125, "kl": 0.033238646434620024, "learning_rate": 1.041935638371669e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9170 }, { "completion_length": 60.35, "epoch": 1.8359999999999999, "grad_norm": 0.00052642822265625, "kl": 0.02701822677627206, "learning_rate": 1.0171506364985622e-07, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9180 }, { "completion_length": 53.775, "epoch": 1.838, "grad_norm": 0.0002994537353515625, "kl": 1243.2045701113996, "learning_rate": 9.926578580764234e-08, "loss": 0.1243, "match_ratio": 0.9, "reward": -0.1, "reward_std": 0.1154700517654419, "rewards/reward_func": -0.1, "step": 9190 }, { "completion_length": 76.5, "epoch": 1.8399999999999999, "grad_norm": 0.000469207763671875, "kl": 0.017305072862654924, "learning_rate": 9.684576015420277e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9200 }, { "completion_length": 50.85, "epoch": 1.842, "grad_norm": 0.00045013427734375, "kl": 13.52835137634538, "learning_rate": 9.445501617678654e-08, "loss": 0.0014, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9210 }, { "completion_length": 55.425, "epoch": 1.8439999999999999, "grad_norm": 0.00072479248046875, "kl": 0.03520208708941937, "learning_rate": 9.209358300585474e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9220 }, { "completion_length": 66.8, "epoch": 1.846, "grad_norm": 0.000614166259765625, "kl": 0.02983384854160249, "learning_rate": 8.9761489414725e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9230 }, { "completion_length": 49.075, "epoch": 1.8479999999999999, "grad_norm": 0.000537872314453125, "kl": 0.032278594188392164, "learning_rate": 8.745876381922147e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9240 }, { "completion_length": 60.0, "epoch": 1.85, "grad_norm": 0.000659942626953125, "kl": 0.02954811817035079, "learning_rate": 8.518543427732951e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9250 }, { "completion_length": 58.05, "epoch": 1.8519999999999999, "grad_norm": 0.000522613525390625, "kl": 0.020372640853747726, "learning_rate": 8.294152848885156e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9260 }, { "completion_length": 57.65, "epoch": 1.854, "grad_norm": 0.000514984130859375, "kl": 48.13295641997829, "learning_rate": 8.072707379507217e-08, "loss": 0.0048, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 9270 }, { "completion_length": 73.275, "epoch": 1.8559999999999999, "grad_norm": 0.000583648681640625, "kl": 259.27141086012125, "learning_rate": 7.854209717842231e-08, "loss": 0.0259, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 9280 }, { "completion_length": 62.75, "epoch": 1.858, "grad_norm": 0.0002765655517578125, "kl": 0.0620627264957875, "learning_rate": 7.638662526215284e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9290 }, { "completion_length": 60.6, "epoch": 1.8599999999999999, "grad_norm": 0.000881195068359375, "kl": 0.0414402786642313, "learning_rate": 7.426068431000883e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9300 }, { "completion_length": 65.575, "epoch": 1.862, "grad_norm": 0.00058746337890625, "kl": 0.08443178189918399, "learning_rate": 7.216430022591009e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9310 }, { "completion_length": 64.35, "epoch": 1.8639999999999999, "grad_norm": 0.00634765625, "kl": 0.12132438533008098, "learning_rate": 7.009749855363457e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9320 }, { "completion_length": 59.775, "epoch": 1.866, "grad_norm": 0.000392913818359375, "kl": 7.24802761040628, "learning_rate": 6.806030447650879e-08, "loss": 0.0007, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.05, "rewards/reward_func": -0.075, "step": 9330 }, { "completion_length": 51.325, "epoch": 1.8679999999999999, "grad_norm": 0.000522613525390625, "kl": 12.393874236382544, "learning_rate": 6.605274281709929e-08, "loss": 0.0012, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9340 }, { "completion_length": 54.575, "epoch": 1.87, "grad_norm": 0.000598907470703125, "kl": 0.20043480526655913, "learning_rate": 6.407483803691216e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9350 }, { "completion_length": 65.05, "epoch": 1.8719999999999999, "grad_norm": 0.000621795654296875, "kl": 0.038857326842844486, "learning_rate": 6.212661423609184e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9360 }, { "completion_length": 59.3, "epoch": 1.874, "grad_norm": 0.000453948974609375, "kl": 0.1275158784352243, "learning_rate": 6.020809515313141e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9370 }, { "completion_length": 55.175, "epoch": 1.876, "grad_norm": 0.001220703125, "kl": 0.6000383426435292, "learning_rate": 5.83193041645802e-08, "loss": 0.0001, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9380 }, { "completion_length": 57.375, "epoch": 1.8780000000000001, "grad_norm": 0.00177001953125, "kl": 0.06478001358918846, "learning_rate": 5.6460264284760316e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9390 }, { "completion_length": 51.175, "epoch": 1.88, "grad_norm": 0.006195068359375, "kl": 0.08602785079274326, "learning_rate": 5.463099816548578e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9400 }, { "completion_length": 57.75, "epoch": 1.8820000000000001, "grad_norm": 0.00092315673828125, "kl": 0.028340872889384628, "learning_rate": 5.283152809578751e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9410 }, { "completion_length": 61.275, "epoch": 1.884, "grad_norm": 0.0004024505615234375, "kl": 0.089741973252967, "learning_rate": 5.106187600163987e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9420 }, { "completion_length": 65.625, "epoch": 1.8860000000000001, "grad_norm": 0.0025634765625, "kl": 0.060642439499497415, "learning_rate": 4.932206344569562e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9430 }, { "completion_length": 58.875, "epoch": 1.888, "grad_norm": 0.00067901611328125, "kl": 0.06356988861225546, "learning_rate": 4.761211162702117e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9440 }, { "completion_length": 57.25, "epoch": 1.8900000000000001, "grad_norm": 0.00072479248046875, "kl": 16.864195838803425, "learning_rate": 4.593204138084006e-08, "loss": 0.0017, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.1, "rewards/reward_func": -0.05, "step": 9450 }, { "completion_length": 51.025, "epoch": 1.892, "grad_norm": 0.00037384033203125, "kl": 0.047673306241631505, "learning_rate": 4.428187317827848e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9460 }, { "completion_length": 66.45, "epoch": 1.8940000000000001, "grad_norm": 0.000469207763671875, "kl": 0.035626521334052086, "learning_rate": 4.26616271261146e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9470 }, { "completion_length": 47.25, "epoch": 1.896, "grad_norm": 0.00045013427734375, "kl": 0.09364478723146022, "learning_rate": 4.1071322966535487e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9480 }, { "completion_length": 61.45, "epoch": 1.8980000000000001, "grad_norm": 0.0004100799560546875, "kl": 0.02402509720996022, "learning_rate": 3.95109800768953e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9490 }, { "completion_length": 52.2, "epoch": 1.9, "grad_norm": 0.00049591064453125, "kl": 0.08021967611275613, "learning_rate": 3.798061746947995e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9500 }, { "completion_length": 54.175, "epoch": 1.9020000000000001, "grad_norm": 0.0003814697265625, "kl": 0.08938063569366932, "learning_rate": 3.648025379127479e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9510 }, { "completion_length": 48.6, "epoch": 1.904, "grad_norm": 0.000553131103515625, "kl": 0.03359618247486651, "learning_rate": 3.5009907323737826e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9520 }, { "completion_length": 49.65, "epoch": 1.9060000000000001, "grad_norm": 0.00131988525390625, "kl": 0.10454095806926489, "learning_rate": 3.3569595982576584e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9530 }, { "completion_length": 42.4, "epoch": 1.908, "grad_norm": 0.000667572021484375, "kl": 0.18224592534825207, "learning_rate": 3.2159337317530234e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9540 }, { "completion_length": 51.125, "epoch": 1.9100000000000001, "grad_norm": 0.0009613037109375, "kl": 0.2186179363168776, "learning_rate": 3.077914851215585e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9550 }, { "completion_length": 60.75, "epoch": 1.912, "grad_norm": 0.000701904296875, "kl": 0.090417854860425, "learning_rate": 2.9429046383618042e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9560 }, { "completion_length": 41.175, "epoch": 1.9140000000000001, "grad_norm": 0.00130462646484375, "kl": 0.07170910434797406, "learning_rate": 2.810904738248549e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9570 }, { "completion_length": 43.875, "epoch": 1.916, "grad_norm": 0.0018310546875, "kl": 0.12651289403438568, "learning_rate": 2.681916759252917e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9580 }, { "completion_length": 67.95, "epoch": 1.9180000000000001, "grad_norm": 0.0009613037109375, "kl": 0.046817721845582125, "learning_rate": 2.555942273052753e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9590 }, { "completion_length": 44.825, "epoch": 1.92, "grad_norm": 0.002197265625, "kl": 1.0959480846766383, "learning_rate": 2.4329828146074096e-08, "loss": 0.0001, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9600 }, { "completion_length": 40.15, "epoch": 1.9220000000000002, "grad_norm": 0.0015869140625, "kl": 0.05991814769804478, "learning_rate": 2.313039882139101e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9610 }, { "completion_length": 56.9, "epoch": 1.924, "grad_norm": 0.000377655029296875, "kl": 0.019165601092390717, "learning_rate": 2.1961149371145795e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9620 }, { "completion_length": 61.1, "epoch": 1.9260000000000002, "grad_norm": 0.000263214111328125, "kl": 0.05205519350711256, "learning_rate": 2.082209404227403e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9630 }, { "completion_length": 46.925, "epoch": 1.928, "grad_norm": 0.0004367828369140625, "kl": 0.08729816749691963, "learning_rate": 1.9713246713805588e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9640 }, { "completion_length": 60.35, "epoch": 1.9300000000000002, "grad_norm": 0.0017852783203125, "kl": 0.09572115261107683, "learning_rate": 1.8634620896695044e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9650 }, { "completion_length": 62.975, "epoch": 1.932, "grad_norm": 10.0625, "kl": 0.08949833824299276, "learning_rate": 1.7586229733657646e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9660 }, { "completion_length": 60.05, "epoch": 1.9340000000000002, "grad_norm": 0.0003566741943359375, "kl": 0.051867073588073256, "learning_rate": 1.6568085999008886e-08, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9670 }, { "completion_length": 56.425, "epoch": 1.936, "grad_norm": 0.0006866455078125, "kl": 0.05798132345080376, "learning_rate": 1.5580202098509078e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9680 }, { "completion_length": 60.575, "epoch": 1.938, "grad_norm": 29.75, "kl": 22.995475397538392, "learning_rate": 1.4622590069211517e-08, "loss": 0.0023, "match_ratio": 0.95, "reward": -0.05, "reward_std": 0.05773502588272095, "rewards/reward_func": -0.05, "step": 9690 }, { "completion_length": 52.575, "epoch": 1.94, "grad_norm": 0.00121307373046875, "kl": 0.05500190043821931, "learning_rate": 1.3695261579316776e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9700 }, { "completion_length": 51.8, "epoch": 1.942, "grad_norm": 0.000308990478515625, "kl": 0.07781615569256246, "learning_rate": 1.2798227928029483e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9710 }, { "completion_length": 56.15, "epoch": 1.944, "grad_norm": 0.0003795623779296875, "kl": 0.08795451316982508, "learning_rate": 1.193150004542204e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9720 }, { "completion_length": 52.4, "epoch": 1.946, "grad_norm": 0.002899169921875, "kl": 0.047139992006123066, "learning_rate": 1.109508849230001e-08, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9730 }, { "completion_length": 45.45, "epoch": 1.948, "grad_norm": 0.0003662109375, "kl": 4.14183980775997, "learning_rate": 1.0289003460074165e-08, "loss": 0.0004, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9740 }, { "completion_length": 56.375, "epoch": 1.95, "grad_norm": 0.0031890869140625, "kl": 0.060475172754377124, "learning_rate": 9.513254770636138e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9750 }, { "completion_length": 48.85, "epoch": 1.952, "grad_norm": 0.0008087158203125, "kl": 0.01811651182360947, "learning_rate": 8.767851876239075e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9760 }, { "completion_length": 63.625, "epoch": 1.954, "grad_norm": 0.000522613525390625, "kl": 0.04865064946934581, "learning_rate": 8.052803859382174e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9770 }, { "completion_length": 66.825, "epoch": 1.956, "grad_norm": 0.0003662109375, "kl": 0.017140331957489253, "learning_rate": 7.368119432699383e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9780 }, { "completion_length": 49.275, "epoch": 1.958, "grad_norm": 0.000598907470703125, "kl": 0.05813699197024107, "learning_rate": 6.7138069388547614e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9790 }, { "completion_length": 53.425, "epoch": 1.96, "grad_norm": 0.00069427490234375, "kl": 1490.5032024047337, "learning_rate": 6.089874350439507e-09, "loss": 0.1491, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 9800 }, { "completion_length": 43.9, "epoch": 1.962, "grad_norm": 0.0005645751953125, "kl": 31.488269805023446, "learning_rate": 5.4963292698750896e-09, "loss": 0.0031, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9810 }, { "completion_length": 54.95, "epoch": 1.964, "grad_norm": 0.0184326171875, "kl": 0.13211959092877806, "learning_rate": 4.933178929321103e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9820 }, { "completion_length": 48.75, "epoch": 1.966, "grad_norm": 0.000850677490234375, "kl": 0.10980427814647556, "learning_rate": 4.400430190586724e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9830 }, { "completion_length": 57.525, "epoch": 1.968, "grad_norm": 14.625, "kl": 0.031209711637347936, "learning_rate": 3.8980895450474455e-09, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9840 }, { "completion_length": 49.9, "epoch": 1.97, "grad_norm": 0.000507354736328125, "kl": 16.15725321341306, "learning_rate": 3.4261631135654174e-09, "loss": 0.0016, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9850 }, { "completion_length": 54.075, "epoch": 1.972, "grad_norm": 0.001739501953125, "kl": 0.04826322416774929, "learning_rate": 2.984656646415063e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9860 }, { "completion_length": 66.375, "epoch": 1.974, "grad_norm": 0.000919342041015625, "kl": 0.035056399274617435, "learning_rate": 2.573575523213412e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9870 }, { "completion_length": 72.6, "epoch": 1.976, "grad_norm": 0.0007781982421875, "kl": 0.05609772065654397, "learning_rate": 2.192924752854042e-09, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9880 }, { "completion_length": 72.45, "epoch": 1.978, "grad_norm": 0.0004405975341796875, "kl": 0.26721446458250286, "learning_rate": 1.842708973447127e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9890 }, { "completion_length": 62.25, "epoch": 1.98, "grad_norm": 0.0003643035888671875, "kl": 0.08341183541342616, "learning_rate": 1.5229324522605949e-09, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9900 }, { "completion_length": 58.65, "epoch": 1.982, "grad_norm": 0.00063323974609375, "kl": 2.203670488623902, "learning_rate": 1.2335990856710001e-09, "loss": 0.0002, "match_ratio": 0.925, "reward": -0.075, "reward_std": 0.10773502588272095, "rewards/reward_func": -0.075, "step": 9910 }, { "completion_length": 45.05, "epoch": 1.984, "grad_norm": 0.00061798095703125, "kl": 0.06615068479441107, "learning_rate": 9.747123991141193e-10, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9920 }, { "completion_length": 57.625, "epoch": 1.986, "grad_norm": 0.000545501708984375, "kl": 0.0431473188335076, "learning_rate": 7.462755470422078e-10, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9930 }, { "completion_length": 65.05, "epoch": 1.988, "grad_norm": 0.0004634857177734375, "kl": 0.049975822074338795, "learning_rate": 5.48291312886251e-10, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9940 }, { "completion_length": 67.95, "epoch": 1.99, "grad_norm": 0.00127410888671875, "kl": 0.033399745682254435, "learning_rate": 3.8076210902182607e-10, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9950 }, { "completion_length": 56.175, "epoch": 1.992, "grad_norm": 0.0004825592041015625, "kl": 0.11492122933268548, "learning_rate": 2.43689976739403e-10, "loss": 0.0, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9960 }, { "completion_length": 58.85, "epoch": 1.994, "grad_norm": 0.0040283203125, "kl": 0.04082223805598915, "learning_rate": 1.3707658621964216e-10, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9970 }, { "completion_length": 64.95, "epoch": 1.996, "grad_norm": 0.00299072265625, "kl": 0.055863088183104995, "learning_rate": 6.092323651313293e-11, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 9980 }, { "completion_length": 64.9, "epoch": 1.998, "grad_norm": 0.00091552734375, "kl": 40.98459475683048, "learning_rate": 1.5230855524017708e-11, "loss": 0.0041, "match_ratio": 0.975, "reward": -0.025, "reward_std": 0.05, "rewards/reward_func": -0.025, "step": 9990 }, { "completion_length": 65.95, "epoch": 2.0, "grad_norm": 0.0003757476806640625, "kl": 0.02109892386943102, "learning_rate": 0.0, "loss": 0.0, "match_ratio": 1.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func": 0.0, "step": 10000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }