| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2822.7857666015625, | |
| "epoch": 0.004, | |
| "grad_norm": 0.12564538419246674, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0645, | |
| "reward": 0.09580668434500694, | |
| "reward_std": 0.5702872574329376, | |
| "rewards/cosine_scaled_reward": -0.14554904401302338, | |
| "rewards/format_reward": 0.3869047649204731, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2575.571533203125, | |
| "epoch": 0.008, | |
| "grad_norm": 0.15411853790283203, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0717, | |
| "reward": 0.5743008255958557, | |
| "reward_std": 0.7826777100563049, | |
| "rewards/cosine_scaled_reward": 0.03119804011657834, | |
| "rewards/format_reward": 0.5119047686457634, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2762.1190490722656, | |
| "epoch": 0.012, | |
| "grad_norm": 0.13477382063865662, | |
| "kl": 3.463029861450195e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0865, | |
| "reward": 0.21700193732976913, | |
| "reward_std": 0.6844624578952789, | |
| "rewards/cosine_scaled_reward": -0.10578475520014763, | |
| "rewards/format_reward": 0.4285714402794838, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2686.3214721679688, | |
| "epoch": 0.016, | |
| "grad_norm": 0.1282820850610733, | |
| "kl": 2.434849739074707e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0525, | |
| "reward": 0.4696298725903034, | |
| "reward_std": 0.7235232815146446, | |
| "rewards/cosine_scaled_reward": -0.0062565067782998085, | |
| "rewards/format_reward": 0.4821428582072258, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2917.5535888671875, | |
| "epoch": 0.02, | |
| "grad_norm": 0.14993517100811005, | |
| "kl": 3.725290298461914e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0762, | |
| "reward": 0.15318153076805174, | |
| "reward_std": 0.7213103845715523, | |
| "rewards/cosine_scaled_reward": -0.08412353717721999, | |
| "rewards/format_reward": 0.3214285857975483, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2816.2559814453125, | |
| "epoch": 0.024, | |
| "grad_norm": 0.14960958063602448, | |
| "kl": 3.1054019927978516e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0537, | |
| "reward": 0.2950221598148346, | |
| "reward_std": 0.738863505423069, | |
| "rewards/cosine_scaled_reward": -0.057846077223075554, | |
| "rewards/format_reward": 0.410714291036129, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2870.3988647460938, | |
| "epoch": 0.028, | |
| "grad_norm": 0.10985030233860016, | |
| "kl": 2.8133392333984375e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0068, | |
| "reward": 0.27893248095642775, | |
| "reward_std": 0.7550084367394447, | |
| "rewards/cosine_scaled_reward": -0.05993851972743869, | |
| "rewards/format_reward": 0.3988095410168171, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3160.452392578125, | |
| "epoch": 0.032, | |
| "grad_norm": 0.10308283567428589, | |
| "kl": 3.8176774978637695e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0223, | |
| "reward": 0.07877065148204565, | |
| "reward_std": 0.6431715190410614, | |
| "rewards/cosine_scaled_reward": -0.08263849129434675, | |
| "rewards/format_reward": 0.2440476268529892, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3020.607177734375, | |
| "epoch": 0.036, | |
| "grad_norm": 0.15057384967803955, | |
| "kl": 3.37064266204834e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0733, | |
| "reward": 0.06793000735342503, | |
| "reward_std": 0.6978132948279381, | |
| "rewards/cosine_scaled_reward": -0.12079690210521221, | |
| "rewards/format_reward": 0.3095238171517849, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3089.84521484375, | |
| "epoch": 0.04, | |
| "grad_norm": 0.11256518214941025, | |
| "kl": 3.2395124435424805e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0413, | |
| "reward": 0.032662300392985344, | |
| "reward_std": 0.6881319805979729, | |
| "rewards/cosine_scaled_reward": -0.13545456249266863, | |
| "rewards/format_reward": 0.3035714365541935, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2851.946533203125, | |
| "epoch": 0.044, | |
| "grad_norm": 0.17106953263282776, | |
| "kl": 3.784894943237305e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0636, | |
| "reward": 0.3718952457420528, | |
| "reward_std": 0.6902545392513275, | |
| "rewards/cosine_scaled_reward": -0.03131428617052734, | |
| "rewards/format_reward": 0.4345238134264946, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2798.5178833007812, | |
| "epoch": 0.048, | |
| "grad_norm": 0.1335103064775467, | |
| "kl": 2.9146671295166016e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0543, | |
| "reward": 0.40071453526616096, | |
| "reward_std": 0.7024472132325172, | |
| "rewards/cosine_scaled_reward": -0.02285701408982277, | |
| "rewards/format_reward": 0.4464285746216774, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2948.9464721679688, | |
| "epoch": 0.052, | |
| "grad_norm": 0.1271769255399704, | |
| "kl": 3.698468208312988e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0693, | |
| "reward": 0.47545497864484787, | |
| "reward_std": 0.7740402817726135, | |
| "rewards/cosine_scaled_reward": 0.002608438953757286, | |
| "rewards/format_reward": 0.470238097012043, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2679.3928833007812, | |
| "epoch": 0.056, | |
| "grad_norm": 0.12242422997951508, | |
| "kl": 2.8014183044433594e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0489, | |
| "reward": 0.40165250562131405, | |
| "reward_std": 0.7790777683258057, | |
| "rewards/cosine_scaled_reward": -0.028340420685708523, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2889.136962890625, | |
| "epoch": 0.06, | |
| "grad_norm": 0.19158992171287537, | |
| "kl": 3.224611282348633e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0704, | |
| "reward": 0.15117042418569326, | |
| "reward_std": 0.6893174201250076, | |
| "rewards/cosine_scaled_reward": -0.10893859504722059, | |
| "rewards/format_reward": 0.3690476231276989, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2892.6488647460938, | |
| "epoch": 0.064, | |
| "grad_norm": 0.15633279085159302, | |
| "kl": 3.668665885925293e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0733, | |
| "reward": -0.09426919370889664, | |
| "reward_std": 0.5802397355437279, | |
| "rewards/cosine_scaled_reward": -0.18403935432434082, | |
| "rewards/format_reward": 0.2738095298409462, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2920.3511962890625, | |
| "epoch": 0.068, | |
| "grad_norm": 0.12191536277532578, | |
| "kl": 3.221631050109863e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0214, | |
| "reward": 0.13339833123609424, | |
| "reward_std": 0.6428257077932358, | |
| "rewards/cosine_scaled_reward": -0.11187227349728346, | |
| "rewards/format_reward": 0.3571428619325161, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2704.672607421875, | |
| "epoch": 0.072, | |
| "grad_norm": 0.2207571119070053, | |
| "kl": 2.4259090423583984e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0858, | |
| "reward": 0.4250662699341774, | |
| "reward_std": 0.7673918604850769, | |
| "rewards/cosine_scaled_reward": -0.019609727547504008, | |
| "rewards/format_reward": 0.4642857350409031, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2800.6429443359375, | |
| "epoch": 0.076, | |
| "grad_norm": 0.12734168767929077, | |
| "kl": 2.4378299713134766e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0471, | |
| "reward": 0.4042445756494999, | |
| "reward_std": 0.6929292231798172, | |
| "rewards/cosine_scaled_reward": -0.02704438249929808, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2697.8274536132812, | |
| "epoch": 0.08, | |
| "grad_norm": 0.15472018718719482, | |
| "kl": 2.35140323638916e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0265, | |
| "reward": 0.3560524769127369, | |
| "reward_std": 0.6769110411405563, | |
| "rewards/cosine_scaled_reward": -0.05114044318906963, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2339.4405517578125, | |
| "epoch": 0.084, | |
| "grad_norm": 0.21466447412967682, | |
| "kl": 2.086162567138672e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0806, | |
| "reward": 0.7416469305753708, | |
| "reward_std": 0.841043546795845, | |
| "rewards/cosine_scaled_reward": 0.06725202780216932, | |
| "rewards/format_reward": 0.6071428656578064, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2781.5238037109375, | |
| "epoch": 0.088, | |
| "grad_norm": 0.19139103591442108, | |
| "kl": 3.0338764190673828e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0773, | |
| "reward": 0.20257593411952257, | |
| "reward_std": 0.7891978472471237, | |
| "rewards/cosine_scaled_reward": -0.1129977386444807, | |
| "rewards/format_reward": 0.4285714365541935, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3036.761962890625, | |
| "epoch": 0.092, | |
| "grad_norm": 0.1108132153749466, | |
| "kl": 2.6345252990722656e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0238, | |
| "reward": 0.20629926398396492, | |
| "reward_std": 0.7457813173532486, | |
| "rewards/cosine_scaled_reward": -0.07542179408483207, | |
| "rewards/format_reward": 0.3571428619325161, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3143.1131591796875, | |
| "epoch": 0.096, | |
| "grad_norm": 0.10591176152229309, | |
| "kl": 2.703070640563965e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0637, | |
| "reward": 0.0749267227947712, | |
| "reward_std": 0.6808565855026245, | |
| "rewards/cosine_scaled_reward": -0.1292033027857542, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2934.4227294921875, | |
| "epoch": 0.1, | |
| "grad_norm": 0.12180113047361374, | |
| "kl": 1.4990568161010742e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0525, | |
| "reward": 0.3605663161724806, | |
| "reward_std": 0.7757866084575653, | |
| "rewards/cosine_scaled_reward": -0.028050171211361885, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3100.5357666015625, | |
| "epoch": 0.104, | |
| "grad_norm": 0.14736856520175934, | |
| "kl": 2.230703830718994e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.087, | |
| "reward": 0.1489051878452301, | |
| "reward_std": 0.7608643025159836, | |
| "rewards/cosine_scaled_reward": -0.08923787740059197, | |
| "rewards/format_reward": 0.32738095708191395, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2978.452392578125, | |
| "epoch": 0.108, | |
| "grad_norm": 0.1490376740694046, | |
| "kl": 2.7447938919067383e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0633, | |
| "reward": 0.16602796246297657, | |
| "reward_std": 0.762113556265831, | |
| "rewards/cosine_scaled_reward": -0.09555743727833033, | |
| "rewards/format_reward": 0.3571428656578064, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2923.1012573242188, | |
| "epoch": 0.112, | |
| "grad_norm": 0.11410558968782425, | |
| "kl": 3.108382225036621e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0618, | |
| "reward": 0.058234728407114744, | |
| "reward_std": 0.5919530540704727, | |
| "rewards/cosine_scaled_reward": -0.14052549470216036, | |
| "rewards/format_reward": 0.3392857201397419, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2925.1011962890625, | |
| "epoch": 0.116, | |
| "grad_norm": 0.17734545469284058, | |
| "kl": 5.251169204711914e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0463, | |
| "reward": 0.24072746047750115, | |
| "reward_std": 0.7061209976673126, | |
| "rewards/cosine_scaled_reward": -0.061183891259133816, | |
| "rewards/format_reward": 0.3630952425301075, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2882.39892578125, | |
| "epoch": 0.12, | |
| "grad_norm": 0.15557299554347992, | |
| "kl": 2.1502375602722168e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0703, | |
| "reward": 0.22035705484449863, | |
| "reward_std": 0.5751676708459854, | |
| "rewards/cosine_scaled_reward": -0.07136909663677216, | |
| "rewards/format_reward": 0.3630952462553978, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2717.607177734375, | |
| "epoch": 0.124, | |
| "grad_norm": 0.16903533041477203, | |
| "kl": 6.181001663208008e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.07, | |
| "reward": 0.3481953740119934, | |
| "reward_std": 0.7361179888248444, | |
| "rewards/cosine_scaled_reward": -0.025307081639766693, | |
| "rewards/format_reward": 0.3988095298409462, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2585.244140625, | |
| "epoch": 0.128, | |
| "grad_norm": 0.1481872797012329, | |
| "kl": 0.00023996829986572266, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0664, | |
| "reward": 0.5805501043796539, | |
| "reward_std": 0.805858314037323, | |
| "rewards/cosine_scaled_reward": 0.019441714510321617, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2608.7083740234375, | |
| "epoch": 0.132, | |
| "grad_norm": 0.10800693184137344, | |
| "kl": 0.0002808570861816406, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0255, | |
| "reward": 0.5432634204626083, | |
| "reward_std": 0.7363616675138474, | |
| "rewards/cosine_scaled_reward": 0.02460789866745472, | |
| "rewards/format_reward": 0.4940476268529892, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2769.1964721679688, | |
| "epoch": 0.136, | |
| "grad_norm": 0.12105516344308853, | |
| "kl": 0.00020498037338256836, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0184, | |
| "reward": 0.18091929703950882, | |
| "reward_std": 0.6703035831451416, | |
| "rewards/cosine_scaled_reward": -0.10596893168985844, | |
| "rewards/format_reward": 0.3928571604192257, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3090.1607666015625, | |
| "epoch": 0.14, | |
| "grad_norm": 0.11914981156587601, | |
| "kl": 0.0002568960189819336, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0617, | |
| "reward": 0.08196480484912172, | |
| "reward_std": 0.7742973417043686, | |
| "rewards/cosine_scaled_reward": -0.10782713070511818, | |
| "rewards/format_reward": 0.2976190559566021, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2790.0596313476562, | |
| "epoch": 0.144, | |
| "grad_norm": 0.10883598774671555, | |
| "kl": 0.00027942657470703125, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0163, | |
| "reward": 0.31424143677577376, | |
| "reward_std": 0.669949933886528, | |
| "rewards/cosine_scaled_reward": -0.06311738677322865, | |
| "rewards/format_reward": 0.4404762014746666, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2916.3452758789062, | |
| "epoch": 0.148, | |
| "grad_norm": 0.1492447555065155, | |
| "kl": 0.00025272369384765625, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0592, | |
| "reward": 0.1357547640800476, | |
| "reward_std": 0.7365808188915253, | |
| "rewards/cosine_scaled_reward": -0.10771786456461996, | |
| "rewards/format_reward": 0.3511904813349247, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3342.6786499023438, | |
| "epoch": 0.152, | |
| "grad_norm": 0.08414288610219955, | |
| "kl": 0.00013870000839233398, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0239, | |
| "reward": -0.1723631415516138, | |
| "reward_std": 0.6109825298190117, | |
| "rewards/cosine_scaled_reward": -0.16951490193605423, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2762.5774536132812, | |
| "epoch": 0.156, | |
| "grad_norm": 0.14042888581752777, | |
| "kl": 0.0005602836608886719, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0588, | |
| "reward": 0.3224933594465256, | |
| "reward_std": 0.6976565718650818, | |
| "rewards/cosine_scaled_reward": -0.04708665423095226, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2729.9644165039062, | |
| "epoch": 0.16, | |
| "grad_norm": 0.110390305519104, | |
| "kl": 0.00016605854034423828, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0549, | |
| "reward": 0.4423699714243412, | |
| "reward_std": 0.6286562532186508, | |
| "rewards/cosine_scaled_reward": -0.016910257749259472, | |
| "rewards/format_reward": 0.476190485060215, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2955.7084350585938, | |
| "epoch": 0.164, | |
| "grad_norm": 0.15253609418869019, | |
| "kl": 0.00040471553802490234, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0718, | |
| "reward": 0.44552009692415595, | |
| "reward_std": 0.7759689763188362, | |
| "rewards/cosine_scaled_reward": 0.0144266925053671, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2961.0596313476562, | |
| "epoch": 0.168, | |
| "grad_norm": 0.22110103070735931, | |
| "kl": 0.0009613037109375, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.1186, | |
| "reward": 0.2217194978147745, | |
| "reward_std": 0.6207270994782448, | |
| "rewards/cosine_scaled_reward": -0.07366406172513962, | |
| "rewards/format_reward": 0.3690476268529892, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3015.7738647460938, | |
| "epoch": 0.172, | |
| "grad_norm": 0.23720885813236237, | |
| "kl": 0.0005915164947509766, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.1245, | |
| "reward": -0.04521503113210201, | |
| "reward_std": 0.62105892598629, | |
| "rewards/cosine_scaled_reward": -0.16844085440970957, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2937.2381591796875, | |
| "epoch": 0.176, | |
| "grad_norm": 0.09096106886863708, | |
| "kl": 0.00051116943359375, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0249, | |
| "reward": 0.22813843563199043, | |
| "reward_std": 0.6727291792631149, | |
| "rewards/cosine_scaled_reward": -0.058549837151076645, | |
| "rewards/format_reward": 0.3452381007373333, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3149.511962890625, | |
| "epoch": 0.18, | |
| "grad_norm": 0.11164555698633194, | |
| "kl": 0.0004715919494628906, | |
| "learning_rate": 9e-07, | |
| "loss": 0.017, | |
| "reward": 0.05970348231494427, | |
| "reward_std": 0.7763290405273438, | |
| "rewards/cosine_scaled_reward": -0.11300540715456009, | |
| "rewards/format_reward": 0.285714291036129, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3184.6845703125, | |
| "epoch": 0.184, | |
| "grad_norm": 0.1711866706609726, | |
| "kl": 0.0007777214050292969, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0731, | |
| "reward": 0.1386737246066332, | |
| "reward_std": 0.7276585251092911, | |
| "rewards/cosine_scaled_reward": -0.06459171324968338, | |
| "rewards/format_reward": 0.26785714738070965, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3015.386962890625, | |
| "epoch": 0.188, | |
| "grad_norm": 0.12470238655805588, | |
| "kl": 0.0014100074768066406, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0653, | |
| "reward": 0.16049158992245793, | |
| "reward_std": 0.7017006278038025, | |
| "rewards/cosine_scaled_reward": -0.08642087457701564, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2909.96435546875, | |
| "epoch": 0.192, | |
| "grad_norm": 0.28355535864830017, | |
| "kl": 0.009288787841796875, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0581, | |
| "reward": 0.0768200121819973, | |
| "reward_std": 0.7135801166296005, | |
| "rewards/cosine_scaled_reward": -0.14016142301261425, | |
| "rewards/format_reward": 0.3571428693830967, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2753.6845703125, | |
| "epoch": 0.196, | |
| "grad_norm": 0.6567728519439697, | |
| "kl": 0.023477554321289062, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0866, | |
| "reward": 0.4337980281561613, | |
| "reward_std": 0.8068065047264099, | |
| "rewards/cosine_scaled_reward": -0.006315283477306366, | |
| "rewards/format_reward": 0.4464285746216774, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2934.8274536132812, | |
| "epoch": 0.2, | |
| "grad_norm": 0.11382321268320084, | |
| "kl": 0.0025758743286132812, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0714, | |
| "reward": 0.2880665063858032, | |
| "reward_std": 0.6403830945491791, | |
| "rewards/cosine_scaled_reward": -0.049419129034504294, | |
| "rewards/format_reward": 0.3869047649204731, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2840.7678833007812, | |
| "epoch": 0.204, | |
| "grad_norm": 0.11641126126050949, | |
| "kl": 0.0050792694091796875, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.023, | |
| "reward": 0.2840890493243933, | |
| "reward_std": 0.692708894610405, | |
| "rewards/cosine_scaled_reward": -0.06628882512450218, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3119.9881591796875, | |
| "epoch": 0.208, | |
| "grad_norm": 0.14652805030345917, | |
| "kl": 0.0032052993774414062, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0641, | |
| "reward": 0.18620363296940923, | |
| "reward_std": 0.8490904271602631, | |
| "rewards/cosine_scaled_reward": -0.058683907613158226, | |
| "rewards/format_reward": 0.3035714365541935, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2926.9940795898438, | |
| "epoch": 0.212, | |
| "grad_norm": 0.19241634011268616, | |
| "kl": 0.00447845458984375, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0747, | |
| "reward": 0.36803684243932366, | |
| "reward_std": 0.823193870484829, | |
| "rewards/cosine_scaled_reward": -0.006457769020926207, | |
| "rewards/format_reward": 0.380952388048172, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3031.7203369140625, | |
| "epoch": 0.216, | |
| "grad_norm": 0.12843742966651917, | |
| "kl": 0.0029668807983398438, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0153, | |
| "reward": 0.2685772944241762, | |
| "reward_std": 0.6489126533269882, | |
| "rewards/cosine_scaled_reward": -0.04428278561681509, | |
| "rewards/format_reward": 0.3571428693830967, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3123.136962890625, | |
| "epoch": 0.22, | |
| "grad_norm": 0.1504901498556137, | |
| "kl": 0.006084442138671875, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0691, | |
| "reward": 0.03501664288341999, | |
| "reward_std": 0.6481388062238693, | |
| "rewards/cosine_scaled_reward": -0.11642025248147547, | |
| "rewards/format_reward": 0.2678571529686451, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2673.434539794922, | |
| "epoch": 0.224, | |
| "grad_norm": 0.1224113255739212, | |
| "kl": 0.0033931732177734375, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0269, | |
| "reward": 0.6296312126796693, | |
| "reward_std": 0.6751764714717865, | |
| "rewards/cosine_scaled_reward": 0.07374419644474983, | |
| "rewards/format_reward": 0.482142873108387, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3170.0535888671875, | |
| "epoch": 0.228, | |
| "grad_norm": 0.1044960618019104, | |
| "kl": 0.0024929046630859375, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0262, | |
| "reward": 0.21022793278098106, | |
| "reward_std": 0.7194458544254303, | |
| "rewards/cosine_scaled_reward": -0.05560031719505787, | |
| "rewards/format_reward": 0.3214285857975483, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2590.0179138183594, | |
| "epoch": 0.232, | |
| "grad_norm": 0.13768674433231354, | |
| "kl": 0.0045948028564453125, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0592, | |
| "reward": 0.4493846707046032, | |
| "reward_std": 0.7118680775165558, | |
| "rewards/cosine_scaled_reward": -0.02233148762024939, | |
| "rewards/format_reward": 0.4940476231276989, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3087.4702758789062, | |
| "epoch": 0.236, | |
| "grad_norm": 0.13870052993297577, | |
| "kl": 0.001789093017578125, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0691, | |
| "reward": 0.3166997814550996, | |
| "reward_std": 0.7532177269458771, | |
| "rewards/cosine_scaled_reward": -0.011292967945337296, | |
| "rewards/format_reward": 0.3392857201397419, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2717.3036499023438, | |
| "epoch": 0.24, | |
| "grad_norm": 0.14514827728271484, | |
| "kl": 0.0060253143310546875, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0622, | |
| "reward": 0.16810212982818484, | |
| "reward_std": 0.5123014599084854, | |
| "rewards/cosine_scaled_reward": -0.10642512841150165, | |
| "rewards/format_reward": 0.3809523843228817, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3105.7977294921875, | |
| "epoch": 0.244, | |
| "grad_norm": 0.10494968295097351, | |
| "kl": 0.0026693344116210938, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0639, | |
| "reward": 0.08244643732905388, | |
| "reward_std": 0.7039294093847275, | |
| "rewards/cosine_scaled_reward": -0.10461012227460742, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3213.3214721679688, | |
| "epoch": 0.248, | |
| "grad_norm": 0.0999075248837471, | |
| "kl": 0.0023097991943359375, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.053, | |
| "reward": 0.09679291397333145, | |
| "reward_std": 0.7138089835643768, | |
| "rewards/cosine_scaled_reward": -0.0914845080114901, | |
| "rewards/format_reward": 0.2797619104385376, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2783.9584350585938, | |
| "epoch": 0.252, | |
| "grad_norm": 0.19832438230514526, | |
| "kl": 0.0027294158935546875, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0773, | |
| "reward": 0.2238014191389084, | |
| "reward_std": 0.6036202609539032, | |
| "rewards/cosine_scaled_reward": -0.06369452457875013, | |
| "rewards/format_reward": 0.351190485060215, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2976.6607666015625, | |
| "epoch": 0.256, | |
| "grad_norm": 0.12335456907749176, | |
| "kl": 0.0020885467529296875, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0857, | |
| "reward": 0.27347568422555923, | |
| "reward_std": 0.7463532835245132, | |
| "rewards/cosine_scaled_reward": -0.03885740428813733, | |
| "rewards/format_reward": 0.3511904887855053, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2825.6130981445312, | |
| "epoch": 0.26, | |
| "grad_norm": 0.13863790035247803, | |
| "kl": 0.002285003662109375, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0349, | |
| "reward": 0.3712610546499491, | |
| "reward_std": 0.7277249395847321, | |
| "rewards/cosine_scaled_reward": -0.037583764642477036, | |
| "rewards/format_reward": 0.4464285746216774, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2974.232177734375, | |
| "epoch": 0.264, | |
| "grad_norm": 0.11186616122722626, | |
| "kl": 0.002407073974609375, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0025, | |
| "reward": 0.07817286718636751, | |
| "reward_std": 0.640307292342186, | |
| "rewards/cosine_scaled_reward": -0.12460404448211193, | |
| "rewards/format_reward": 0.3273809589445591, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3152.702392578125, | |
| "epoch": 0.268, | |
| "grad_norm": 0.12694397568702698, | |
| "kl": 0.00250244140625, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.07, | |
| "reward": 0.1549822874367237, | |
| "reward_std": 0.6663320288062096, | |
| "rewards/cosine_scaled_reward": -0.08322314161341637, | |
| "rewards/format_reward": 0.3214285783469677, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2938.0179443359375, | |
| "epoch": 0.272, | |
| "grad_norm": 0.12655070424079895, | |
| "kl": 0.00341033935546875, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0545, | |
| "reward": 0.2076467089354992, | |
| "reward_std": 0.7705407291650772, | |
| "rewards/cosine_scaled_reward": -0.08070044964551926, | |
| "rewards/format_reward": 0.3690476268529892, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3025.8095703125, | |
| "epoch": 0.276, | |
| "grad_norm": 0.12574610114097595, | |
| "kl": 0.003986358642578125, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0227, | |
| "reward": 0.03160261735320091, | |
| "reward_std": 0.621779277920723, | |
| "rewards/cosine_scaled_reward": -0.13003203552216291, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2877.5536499023438, | |
| "epoch": 0.28, | |
| "grad_norm": 0.12181198596954346, | |
| "kl": 0.0038127899169921875, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0248, | |
| "reward": 0.2757916431874037, | |
| "reward_std": 0.7794490903615952, | |
| "rewards/cosine_scaled_reward": -0.06448512757197022, | |
| "rewards/format_reward": 0.4047619178891182, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2776.6131591796875, | |
| "epoch": 0.284, | |
| "grad_norm": 0.17934156954288483, | |
| "kl": 0.0039825439453125, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.1072, | |
| "reward": 0.2793612889945507, | |
| "reward_std": 0.7607921361923218, | |
| "rewards/cosine_scaled_reward": -0.06865269318223, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3037.5178833007812, | |
| "epoch": 0.288, | |
| "grad_norm": 0.1311851143836975, | |
| "kl": 0.0038604736328125, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0745, | |
| "reward": 0.34610075503587723, | |
| "reward_std": 0.8604296147823334, | |
| "rewards/cosine_scaled_reward": -0.01444962713867426, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2880.3036499023438, | |
| "epoch": 0.292, | |
| "grad_norm": 0.10903972387313843, | |
| "kl": 0.005123138427734375, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0247, | |
| "reward": 0.41264417115598917, | |
| "reward_std": 0.6988394409418106, | |
| "rewards/cosine_scaled_reward": -0.00201124744489789, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2689.5774536132812, | |
| "epoch": 0.296, | |
| "grad_norm": 0.1187940314412117, | |
| "kl": 0.00371551513671875, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0343, | |
| "reward": 0.49459290131926537, | |
| "reward_std": 0.6582471132278442, | |
| "rewards/cosine_scaled_reward": -0.005679763096850365, | |
| "rewards/format_reward": 0.505952388048172, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3011.2440795898438, | |
| "epoch": 0.3, | |
| "grad_norm": 0.15027488768100739, | |
| "kl": 0.00612640380859375, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0922, | |
| "reward": -0.12369688227772713, | |
| "reward_std": 0.5938592255115509, | |
| "rewards/cosine_scaled_reward": -0.1957770138978958, | |
| "rewards/format_reward": 0.26785715110599995, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3087.9762573242188, | |
| "epoch": 0.304, | |
| "grad_norm": 0.1342087835073471, | |
| "kl": 0.004589080810546875, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.051, | |
| "reward": 0.029461721424013376, | |
| "reward_std": 0.5008194297552109, | |
| "rewards/cosine_scaled_reward": -0.12812629727704916, | |
| "rewards/format_reward": 0.2857142915017903, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2708.6845703125, | |
| "epoch": 0.308, | |
| "grad_norm": 0.11936229467391968, | |
| "kl": 0.00518035888671875, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0693, | |
| "reward": 0.38730931747704744, | |
| "reward_std": 0.6931557953357697, | |
| "rewards/cosine_scaled_reward": -0.0503929746337235, | |
| "rewards/format_reward": 0.4880952388048172, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3123.4345703125, | |
| "epoch": 0.312, | |
| "grad_norm": 0.10926749557256699, | |
| "kl": 0.006168365478515625, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0281, | |
| "reward": 0.14759791223332286, | |
| "reward_std": 0.6552696228027344, | |
| "rewards/cosine_scaled_reward": -0.08989150635898113, | |
| "rewards/format_reward": 0.3273809519596398, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3104.886962890625, | |
| "epoch": 0.316, | |
| "grad_norm": 0.17055809497833252, | |
| "kl": 0.005580902099609375, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.122, | |
| "reward": 0.1187831275165081, | |
| "reward_std": 0.7589289993047714, | |
| "rewards/cosine_scaled_reward": -0.11917985696345568, | |
| "rewards/format_reward": 0.3571428656578064, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2892.2261962890625, | |
| "epoch": 0.32, | |
| "grad_norm": 0.12601953744888306, | |
| "kl": 0.004444122314453125, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0261, | |
| "reward": 0.2744547198526561, | |
| "reward_std": 0.686069905757904, | |
| "rewards/cosine_scaled_reward": -0.05920121353119612, | |
| "rewards/format_reward": 0.3928571492433548, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3041.1488647460938, | |
| "epoch": 0.324, | |
| "grad_norm": 0.11552488803863525, | |
| "kl": 0.00655364990234375, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0229, | |
| "reward": 0.12106413394212723, | |
| "reward_std": 0.675617903470993, | |
| "rewards/cosine_scaled_reward": -0.1061346041969955, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3129.6429443359375, | |
| "epoch": 0.328, | |
| "grad_norm": 0.09706410765647888, | |
| "kl": 0.0055084228515625, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0452, | |
| "reward": 0.12287123966962099, | |
| "reward_std": 0.7173575460910797, | |
| "rewards/cosine_scaled_reward": -0.09035009983927011, | |
| "rewards/format_reward": 0.3035714365541935, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2602.047637939453, | |
| "epoch": 0.332, | |
| "grad_norm": 0.11233574151992798, | |
| "kl": 0.01010894775390625, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0258, | |
| "reward": 0.35400932375341654, | |
| "reward_std": 0.6462785750627518, | |
| "rewards/cosine_scaled_reward": -0.06109058950096369, | |
| "rewards/format_reward": 0.476190485060215, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2894.011962890625, | |
| "epoch": 0.336, | |
| "grad_norm": 0.12059750407934189, | |
| "kl": 0.00872802734375, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0806, | |
| "reward": 0.2941260803490877, | |
| "reward_std": 0.78522889316082, | |
| "rewards/cosine_scaled_reward": -0.06722268275916576, | |
| "rewards/format_reward": 0.4285714328289032, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2979.226318359375, | |
| "epoch": 0.34, | |
| "grad_norm": 0.15206296741962433, | |
| "kl": 0.00783538818359375, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0755, | |
| "reward": 0.21889091655611992, | |
| "reward_std": 0.6674999743700027, | |
| "rewards/cosine_scaled_reward": -0.06614979542791843, | |
| "rewards/format_reward": 0.3511904887855053, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2819.5892944335938, | |
| "epoch": 0.344, | |
| "grad_norm": 0.1093529462814331, | |
| "kl": 0.00661468505859375, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0353, | |
| "reward": 0.20389786185114644, | |
| "reward_std": 0.6942542195320129, | |
| "rewards/cosine_scaled_reward": -0.07959869271144271, | |
| "rewards/format_reward": 0.3630952425301075, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2616.452392578125, | |
| "epoch": 0.348, | |
| "grad_norm": 0.15942011773586273, | |
| "kl": 0.00847625732421875, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0773, | |
| "reward": 0.3760679364204407, | |
| "reward_std": 0.6467384025454521, | |
| "rewards/cosine_scaled_reward": -0.05898985452950001, | |
| "rewards/format_reward": 0.4940476194024086, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2923.7500610351562, | |
| "epoch": 0.352, | |
| "grad_norm": 0.10565865784883499, | |
| "kl": 0.0073089599609375, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0227, | |
| "reward": 0.3185804970562458, | |
| "reward_std": 0.6098055616021156, | |
| "rewards/cosine_scaled_reward": -0.010352615499868989, | |
| "rewards/format_reward": 0.3392857201397419, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2752.6309814453125, | |
| "epoch": 0.356, | |
| "grad_norm": 0.1690302938222885, | |
| "kl": 0.0132293701171875, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0312, | |
| "reward": 0.46914676763117313, | |
| "reward_std": 0.7854363918304443, | |
| "rewards/cosine_scaled_reward": 0.005406718701124191, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2769.1488647460938, | |
| "epoch": 0.36, | |
| "grad_norm": 0.17653611302375793, | |
| "kl": 0.011993408203125, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0815, | |
| "reward": 0.3604448903352022, | |
| "reward_std": 0.798264317214489, | |
| "rewards/cosine_scaled_reward": -0.04894421715289354, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3187.3631591796875, | |
| "epoch": 0.364, | |
| "grad_norm": 0.1068400964140892, | |
| "kl": 0.0078277587890625, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0211, | |
| "reward": -0.03608314320445061, | |
| "reward_std": 0.5826699808239937, | |
| "rewards/cosine_scaled_reward": -0.14006539154797792, | |
| "rewards/format_reward": 0.2440476305782795, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2924.916748046875, | |
| "epoch": 0.368, | |
| "grad_norm": 0.17665976285934448, | |
| "kl": 0.0091400146484375, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0852, | |
| "reward": 0.2780441716313362, | |
| "reward_std": 0.7524297386407852, | |
| "rewards/cosine_scaled_reward": -0.07526363711804152, | |
| "rewards/format_reward": 0.4285714291036129, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2939.3512573242188, | |
| "epoch": 0.372, | |
| "grad_norm": 0.13597099483013153, | |
| "kl": 0.0076446533203125, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0324, | |
| "reward": 0.33797190338373184, | |
| "reward_std": 0.5880008786916733, | |
| "rewards/cosine_scaled_reward": -0.01851405529305339, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2922.875, | |
| "epoch": 0.376, | |
| "grad_norm": 0.12246444076299667, | |
| "kl": 0.00815582275390625, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0801, | |
| "reward": 0.14625070057809353, | |
| "reward_std": 0.690229170024395, | |
| "rewards/cosine_scaled_reward": -0.12330322340130806, | |
| "rewards/format_reward": 0.3928571492433548, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3222.3632202148438, | |
| "epoch": 0.38, | |
| "grad_norm": 0.08992121368646622, | |
| "kl": 0.00704193115234375, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": -0.001, | |
| "reward": 0.26800261437892914, | |
| "reward_std": 0.7103277295827866, | |
| "rewards/cosine_scaled_reward": -0.011832039803266525, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3215.2202758789062, | |
| "epoch": 0.384, | |
| "grad_norm": 0.12997964024543762, | |
| "kl": 0.00983428955078125, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0591, | |
| "reward": -0.03136127255856991, | |
| "reward_std": 0.6326467096805573, | |
| "rewards/cosine_scaled_reward": -0.18234730698168278, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2857.0416870117188, | |
| "epoch": 0.388, | |
| "grad_norm": 0.16558504104614258, | |
| "kl": 0.007293701171875, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0821, | |
| "reward": 0.19707820191979408, | |
| "reward_std": 0.6188783794641495, | |
| "rewards/cosine_scaled_reward": -0.10681804455816746, | |
| "rewards/format_reward": 0.410714291036129, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2983.9940795898438, | |
| "epoch": 0.392, | |
| "grad_norm": 0.1882523149251938, | |
| "kl": 0.00878143310546875, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.1027, | |
| "reward": 0.3959239423274994, | |
| "reward_std": 0.875861182808876, | |
| "rewards/cosine_scaled_reward": 0.004509590216912329, | |
| "rewards/format_reward": 0.3869047686457634, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3077.0952758789062, | |
| "epoch": 0.396, | |
| "grad_norm": 0.1565464437007904, | |
| "kl": 0.011505126953125, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0968, | |
| "reward": 0.027048692107200623, | |
| "reward_std": 0.7064545601606369, | |
| "rewards/cosine_scaled_reward": -0.1323089925572276, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2358.2381591796875, | |
| "epoch": 0.4, | |
| "grad_norm": 0.14640696346759796, | |
| "kl": 0.00804901123046875, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0774, | |
| "reward": 0.8142919540405273, | |
| "reward_std": 0.7067123055458069, | |
| "rewards/cosine_scaled_reward": 0.10357456840574741, | |
| "rewards/format_reward": 0.6071428507566452, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3195.6428833007812, | |
| "epoch": 0.404, | |
| "grad_norm": 0.14821472764015198, | |
| "kl": 0.0126495361328125, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0897, | |
| "reward": 0.2797414679080248, | |
| "reward_std": 0.8859200328588486, | |
| "rewards/cosine_scaled_reward": -0.0535816540941596, | |
| "rewards/format_reward": 0.3869047649204731, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3046.7798461914062, | |
| "epoch": 0.408, | |
| "grad_norm": 0.13244982063770294, | |
| "kl": 0.0107879638671875, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0638, | |
| "reward": 0.16126136109232903, | |
| "reward_std": 0.6933339387178421, | |
| "rewards/cosine_scaled_reward": -0.08305979892611504, | |
| "rewards/format_reward": 0.3273809552192688, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2718.8275146484375, | |
| "epoch": 0.412, | |
| "grad_norm": 0.20267100632190704, | |
| "kl": 0.01003265380859375, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.1246, | |
| "reward": 0.4206714928150177, | |
| "reward_std": 0.6706456393003464, | |
| "rewards/cosine_scaled_reward": -0.0069261584430933, | |
| "rewards/format_reward": 0.43452382180839777, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2973.7559814453125, | |
| "epoch": 0.416, | |
| "grad_norm": 0.12351427227258682, | |
| "kl": 0.01038360595703125, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0716, | |
| "reward": 0.20578511937389976, | |
| "reward_std": 0.7138822227716446, | |
| "rewards/cosine_scaled_reward": -0.0846074327128008, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2596.1250915527344, | |
| "epoch": 0.42, | |
| "grad_norm": 0.13755492866039276, | |
| "kl": 0.0099945068359375, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0544, | |
| "reward": 0.6021162122488022, | |
| "reward_std": 0.8435305505990982, | |
| "rewards/cosine_scaled_reward": 0.012367631308734417, | |
| "rewards/format_reward": 0.5773809663951397, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2986.226318359375, | |
| "epoch": 0.424, | |
| "grad_norm": 0.10337146371603012, | |
| "kl": 0.014007568359375, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0441, | |
| "reward": 0.24333537928760052, | |
| "reward_std": 0.7411631494760513, | |
| "rewards/cosine_scaled_reward": -0.08071326930075884, | |
| "rewards/format_reward": 0.4047619141638279, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2804.7500610351562, | |
| "epoch": 0.428, | |
| "grad_norm": 0.1334601491689682, | |
| "kl": 0.010711669921875, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0741, | |
| "reward": 0.21579574886709452, | |
| "reward_std": 0.559941440820694, | |
| "rewards/cosine_scaled_reward": -0.08257831074297428, | |
| "rewards/format_reward": 0.380952388048172, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2947.9583740234375, | |
| "epoch": 0.432, | |
| "grad_norm": 0.15339502692222595, | |
| "kl": 0.0123443603515625, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0797, | |
| "reward": 0.2714387159794569, | |
| "reward_std": 0.5535019040107727, | |
| "rewards/cosine_scaled_reward": -0.060709220357239246, | |
| "rewards/format_reward": 0.3928571492433548, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3165.3631591796875, | |
| "epoch": 0.436, | |
| "grad_norm": 0.14555484056472778, | |
| "kl": 0.0123748779296875, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0689, | |
| "reward": 0.3741426505148411, | |
| "reward_std": 0.7712415158748627, | |
| "rewards/cosine_scaled_reward": 0.011476085986942053, | |
| "rewards/format_reward": 0.351190485060215, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2682.607177734375, | |
| "epoch": 0.44, | |
| "grad_norm": 3.0458719730377197, | |
| "kl": 0.1771697998046875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0576, | |
| "reward": 0.43371669203042984, | |
| "reward_std": 0.6959643810987473, | |
| "rewards/cosine_scaled_reward": -0.02718928176909685, | |
| "rewards/format_reward": 0.4880952462553978, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2601.636993408203, | |
| "epoch": 0.444, | |
| "grad_norm": 0.16071970760822296, | |
| "kl": 0.0132904052734375, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0475, | |
| "reward": 0.3634342849254608, | |
| "reward_std": 0.5500286221504211, | |
| "rewards/cosine_scaled_reward": -0.06233047042042017, | |
| "rewards/format_reward": 0.4880952462553978, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2848.0, | |
| "epoch": 0.448, | |
| "grad_norm": 0.11652833968400955, | |
| "kl": 0.01318359375, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0216, | |
| "reward": 0.4104595482349396, | |
| "reward_std": 0.7775004655122757, | |
| "rewards/cosine_scaled_reward": -0.023936893790960312, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3060.696533203125, | |
| "epoch": 0.452, | |
| "grad_norm": 0.11911512911319733, | |
| "kl": 0.019012451171875, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0351, | |
| "reward": 0.14249714091420174, | |
| "reward_std": 0.5928184911608696, | |
| "rewards/cosine_scaled_reward": -0.08946572133572772, | |
| "rewards/format_reward": 0.32142857648432255, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 3025.3392944335938, | |
| "epoch": 0.456, | |
| "grad_norm": 0.11119002103805542, | |
| "kl": 0.013275146484375, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0157, | |
| "reward": 0.2583576124161482, | |
| "reward_std": 0.5952321216464043, | |
| "rewards/cosine_scaled_reward": -0.05236881226301193, | |
| "rewards/format_reward": 0.3630952462553978, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2642.4524536132812, | |
| "epoch": 0.46, | |
| "grad_norm": 0.13256801664829254, | |
| "kl": 0.0142669677734375, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0464, | |
| "reward": 0.26410975866019726, | |
| "reward_std": 0.7131557315587997, | |
| "rewards/cosine_scaled_reward": -0.10604035668075085, | |
| "rewards/format_reward": 0.476190485060215, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2842.172607421875, | |
| "epoch": 0.464, | |
| "grad_norm": 0.14555224776268005, | |
| "kl": 0.0131683349609375, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.06, | |
| "reward": 0.7474905252456665, | |
| "reward_std": 0.9560296833515167, | |
| "rewards/cosine_scaled_reward": 0.10291192133445293, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2644.851318359375, | |
| "epoch": 0.468, | |
| "grad_norm": 0.1801517903804779, | |
| "kl": 0.0159149169921875, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0868, | |
| "reward": 0.3559920974075794, | |
| "reward_std": 0.6948887631297112, | |
| "rewards/cosine_scaled_reward": -0.05712300445884466, | |
| "rewards/format_reward": 0.4702381044626236, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2898.8692016601562, | |
| "epoch": 0.472, | |
| "grad_norm": 0.11308304965496063, | |
| "kl": 0.01580810546875, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0347, | |
| "reward": -0.02927885064855218, | |
| "reward_std": 0.5874328389763832, | |
| "rewards/cosine_scaled_reward": -0.18725845962762833, | |
| "rewards/format_reward": 0.3452381044626236, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2942.3869018554688, | |
| "epoch": 0.476, | |
| "grad_norm": 0.11883487552404404, | |
| "kl": 0.0146484375, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0533, | |
| "reward": 0.2897670716047287, | |
| "reward_std": 0.624944195151329, | |
| "rewards/cosine_scaled_reward": -0.05452123726718128, | |
| "rewards/format_reward": 0.3988095298409462, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2888.5537109375, | |
| "epoch": 0.48, | |
| "grad_norm": 0.18806912004947662, | |
| "kl": 0.0160980224609375, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0988, | |
| "reward": 0.42369477450847626, | |
| "reward_std": 0.8195747882127762, | |
| "rewards/cosine_scaled_reward": -0.002438324736431241, | |
| "rewards/format_reward": 0.4285714365541935, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2582.732177734375, | |
| "epoch": 0.484, | |
| "grad_norm": 0.24437126517295837, | |
| "kl": 0.0138702392578125, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.1221, | |
| "reward": 0.5238880245015025, | |
| "reward_std": 0.7648549973964691, | |
| "rewards/cosine_scaled_reward": -0.005913139786571264, | |
| "rewards/format_reward": 0.535714291036129, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2693.202392578125, | |
| "epoch": 0.488, | |
| "grad_norm": 0.1520787924528122, | |
| "kl": 0.016265869140625, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.075, | |
| "reward": 0.5096995830535889, | |
| "reward_std": 0.8008040487766266, | |
| "rewards/cosine_scaled_reward": 0.01675456203520298, | |
| "rewards/format_reward": 0.4761904776096344, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2794.4822387695312, | |
| "epoch": 0.492, | |
| "grad_norm": 0.276404470205307, | |
| "kl": 0.016845703125, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.1229, | |
| "reward": 0.1483494946733117, | |
| "reward_std": 0.7319334298372269, | |
| "rewards/cosine_scaled_reward": -0.11034906562417746, | |
| "rewards/format_reward": 0.3690476268529892, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2586.9285583496094, | |
| "epoch": 0.496, | |
| "grad_norm": 0.19590145349502563, | |
| "kl": 0.0192413330078125, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0668, | |
| "reward": 0.42041725292801857, | |
| "reward_std": 0.7440174072980881, | |
| "rewards/cosine_scaled_reward": -0.05169615335762501, | |
| "rewards/format_reward": 0.5238095372915268, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2647.1309814453125, | |
| "epoch": 0.5, | |
| "grad_norm": 0.1762569099664688, | |
| "kl": 0.018951416015625, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.058, | |
| "reward": 0.26979109086096287, | |
| "reward_std": 0.7384046316146851, | |
| "rewards/cosine_scaled_reward": -0.10022350586950779, | |
| "rewards/format_reward": 0.4702380932867527, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2329.4642639160156, | |
| "epoch": 0.504, | |
| "grad_norm": 0.2497519552707672, | |
| "kl": 0.016754150390625, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.1113, | |
| "reward": 0.5470606535673141, | |
| "reward_std": 0.7599766105413437, | |
| "rewards/cosine_scaled_reward": -0.04194586584344506, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2630.851318359375, | |
| "epoch": 0.508, | |
| "grad_norm": 0.24775269627571106, | |
| "kl": 0.018798828125, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0886, | |
| "reward": 0.5138388648629189, | |
| "reward_std": 0.8158636838197708, | |
| "rewards/cosine_scaled_reward": 0.00989562287577428, | |
| "rewards/format_reward": 0.4940476194024086, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2193.1250610351562, | |
| "epoch": 0.512, | |
| "grad_norm": 0.3206213712692261, | |
| "kl": 0.0168914794921875, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.1596, | |
| "reward": 0.7910499274730682, | |
| "reward_std": 0.7149495184421539, | |
| "rewards/cosine_scaled_reward": 0.0502868490293622, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2532.4642944335938, | |
| "epoch": 0.516, | |
| "grad_norm": 0.28250807523727417, | |
| "kl": 0.021270751953125, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0845, | |
| "reward": 0.36558002047240734, | |
| "reward_std": 0.637114867568016, | |
| "rewards/cosine_scaled_reward": -0.05232903314754367, | |
| "rewards/format_reward": 0.470238097012043, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2768.4702758789062, | |
| "epoch": 0.52, | |
| "grad_norm": 0.26120948791503906, | |
| "kl": 0.026092529296875, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.1052, | |
| "reward": 0.41776999086141586, | |
| "reward_std": 0.9506262838840485, | |
| "rewards/cosine_scaled_reward": -0.029210255946964025, | |
| "rewards/format_reward": 0.4761904776096344, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2400.5000915527344, | |
| "epoch": 0.524, | |
| "grad_norm": 0.33211514353752136, | |
| "kl": 0.0215606689453125, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.1452, | |
| "reward": 0.42596414871513844, | |
| "reward_std": 0.6515605002641678, | |
| "rewards/cosine_scaled_reward": -0.04892268590629101, | |
| "rewards/format_reward": 0.5238095298409462, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2368.9524536132812, | |
| "epoch": 0.528, | |
| "grad_norm": 0.41833382844924927, | |
| "kl": 0.020782470703125, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.1107, | |
| "reward": 0.5763177648186684, | |
| "reward_std": 0.7463207244873047, | |
| "rewards/cosine_scaled_reward": 0.005420786794275045, | |
| "rewards/format_reward": 0.5654762089252472, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2800.4940795898438, | |
| "epoch": 0.532, | |
| "grad_norm": 0.21257169544696808, | |
| "kl": 0.031768798828125, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0654, | |
| "reward": 0.27612858824431896, | |
| "reward_std": 0.6431511640548706, | |
| "rewards/cosine_scaled_reward": -0.07026905845850706, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2558.386962890625, | |
| "epoch": 0.536, | |
| "grad_norm": 0.42514950037002563, | |
| "kl": 0.02978515625, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.1228, | |
| "reward": 0.4270520806312561, | |
| "reward_std": 0.7729989290237427, | |
| "rewards/cosine_scaled_reward": -0.02159299748018384, | |
| "rewards/format_reward": 0.4702381044626236, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2229.363067626953, | |
| "epoch": 0.54, | |
| "grad_norm": 0.33076903223991394, | |
| "kl": 0.03668212890625, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.1019, | |
| "reward": 0.28207028564065695, | |
| "reward_std": 0.6516975909471512, | |
| "rewards/cosine_scaled_reward": -0.10301248356699944, | |
| "rewards/format_reward": 0.4880952388048172, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2353.029815673828, | |
| "epoch": 0.544, | |
| "grad_norm": 0.3519177734851837, | |
| "kl": 0.035308837890625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0997, | |
| "reward": 0.37077474407851696, | |
| "reward_std": 0.668467104434967, | |
| "rewards/cosine_scaled_reward": -0.0675888154655695, | |
| "rewards/format_reward": 0.505952388048172, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2052.1964721679688, | |
| "epoch": 0.548, | |
| "grad_norm": 0.23379026353359222, | |
| "kl": 0.035400390625, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.058, | |
| "reward": 0.6550269052386284, | |
| "reward_std": 0.6502309143543243, | |
| "rewards/cosine_scaled_reward": 0.009061065968126059, | |
| "rewards/format_reward": 0.6369047611951828, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2325.732177734375, | |
| "epoch": 0.552, | |
| "grad_norm": 0.19041714072227478, | |
| "kl": 0.0423583984375, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0232, | |
| "reward": 0.561458358541131, | |
| "reward_std": 0.9615298509597778, | |
| "rewards/cosine_scaled_reward": 0.012872030027210712, | |
| "rewards/format_reward": 0.5357142947614193, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2440.327423095703, | |
| "epoch": 0.556, | |
| "grad_norm": 0.2846536934375763, | |
| "kl": 0.0457763671875, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0439, | |
| "reward": 0.44825945422053337, | |
| "reward_std": 0.7441610246896744, | |
| "rewards/cosine_scaled_reward": -0.022894082590937614, | |
| "rewards/format_reward": 0.4940476417541504, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2455.071533203125, | |
| "epoch": 0.56, | |
| "grad_norm": 0.5667356252670288, | |
| "kl": 0.05535888671875, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.1203, | |
| "reward": 0.42634591602836736, | |
| "reward_std": 0.6539553329348564, | |
| "rewards/cosine_scaled_reward": 0.007815815508365631, | |
| "rewards/format_reward": 0.4107142984867096, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.0416870117188, | |
| "epoch": 0.564, | |
| "grad_norm": 0.2632148861885071, | |
| "kl": 0.06378173828125, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0588, | |
| "reward": 0.25764250196516514, | |
| "reward_std": 0.5646726861596107, | |
| "rewards/cosine_scaled_reward": -0.07355970796197653, | |
| "rewards/format_reward": 0.4047619141638279, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2396.1011962890625, | |
| "epoch": 0.568, | |
| "grad_norm": 0.48258456587791443, | |
| "kl": 0.070556640625, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.1188, | |
| "reward": 0.3830295614898205, | |
| "reward_std": 0.7874267548322678, | |
| "rewards/cosine_scaled_reward": 0.0069909729063510895, | |
| "rewards/format_reward": 0.3690476268529892, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2465.184539794922, | |
| "epoch": 0.572, | |
| "grad_norm": 0.3696215748786926, | |
| "kl": 0.083984375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0378, | |
| "reward": 0.17246808065101504, | |
| "reward_std": 0.7914570420980453, | |
| "rewards/cosine_scaled_reward": -0.11614691279828548, | |
| "rewards/format_reward": 0.4047619104385376, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2460.184600830078, | |
| "epoch": 0.576, | |
| "grad_norm": 0.30795326828956604, | |
| "kl": 0.08349609375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0254, | |
| "reward": 0.23997123539447784, | |
| "reward_std": 0.7133302837610245, | |
| "rewards/cosine_scaled_reward": -0.09430009685456753, | |
| "rewards/format_reward": 0.4285714365541935, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2376.089324951172, | |
| "epoch": 0.58, | |
| "grad_norm": 0.5491130352020264, | |
| "kl": 0.0899658203125, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0823, | |
| "reward": 0.4339366629719734, | |
| "reward_std": 0.7774848788976669, | |
| "rewards/cosine_scaled_reward": 0.005658812588080764, | |
| "rewards/format_reward": 0.4226190559566021, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2340.136962890625, | |
| "epoch": 0.584, | |
| "grad_norm": 0.3470991551876068, | |
| "kl": 0.1185302734375, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0431, | |
| "reward": 0.31324461475014687, | |
| "reward_std": 0.7800580561161041, | |
| "rewards/cosine_scaled_reward": -0.10528245754539967, | |
| "rewards/format_reward": 0.5238095298409462, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2512.7262573242188, | |
| "epoch": 0.588, | |
| "grad_norm": 0.31661325693130493, | |
| "kl": 0.1114501953125, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0189, | |
| "reward": 0.2997382581233978, | |
| "reward_std": 0.8120257556438446, | |
| "rewards/cosine_scaled_reward": -0.06144038587808609, | |
| "rewards/format_reward": 0.4226190485060215, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2311.8035888671875, | |
| "epoch": 0.592, | |
| "grad_norm": 0.9110273718833923, | |
| "kl": 0.1287841796875, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0875, | |
| "reward": 0.29848775546997786, | |
| "reward_std": 0.7376701682806015, | |
| "rewards/cosine_scaled_reward": -0.06206565350294113, | |
| "rewards/format_reward": 0.4226190596818924, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2368.2500915527344, | |
| "epoch": 0.596, | |
| "grad_norm": 0.5145498514175415, | |
| "kl": 0.142333984375, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": -0.0091, | |
| "reward": 0.1381237395107746, | |
| "reward_std": 0.7537627294659615, | |
| "rewards/cosine_scaled_reward": -0.12736669927835464, | |
| "rewards/format_reward": 0.3928571492433548, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2541.4226684570312, | |
| "epoch": 0.6, | |
| "grad_norm": 0.4558282792568207, | |
| "kl": 0.1424560546875, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0545, | |
| "reward": 0.517300067236647, | |
| "reward_std": 0.8328704386949539, | |
| "rewards/cosine_scaled_reward": 0.014602408395148814, | |
| "rewards/format_reward": 0.4880952388048172, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2635.4226989746094, | |
| "epoch": 0.604, | |
| "grad_norm": 0.3748377859592438, | |
| "kl": 0.18310546875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0372, | |
| "reward": 0.09937155619263649, | |
| "reward_std": 0.7007840871810913, | |
| "rewards/cosine_scaled_reward": -0.1259094497654587, | |
| "rewards/format_reward": 0.3511904776096344, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2366.8036499023438, | |
| "epoch": 0.608, | |
| "grad_norm": 0.7343178391456604, | |
| "kl": 0.196533203125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.1069, | |
| "reward": 0.6113147716969252, | |
| "reward_std": 0.8982192724943161, | |
| "rewards/cosine_scaled_reward": 0.028871658723801374, | |
| "rewards/format_reward": 0.5535714402794838, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2539.6726684570312, | |
| "epoch": 0.612, | |
| "grad_norm": 0.3661244213581085, | |
| "kl": 0.23095703125, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0493, | |
| "reward": 0.3096798346377909, | |
| "reward_std": 0.6143878847360611, | |
| "rewards/cosine_scaled_reward": -0.07432675641030073, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2693.077392578125, | |
| "epoch": 0.616, | |
| "grad_norm": 0.39779341220855713, | |
| "kl": 0.26123046875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0484, | |
| "reward": 0.20376494899392128, | |
| "reward_std": 0.7454717755317688, | |
| "rewards/cosine_scaled_reward": -0.0856175352819264, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2522.8333740234375, | |
| "epoch": 0.62, | |
| "grad_norm": 0.7077339291572571, | |
| "kl": 0.275634765625, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.1048, | |
| "reward": 0.28493453562259674, | |
| "reward_std": 0.7751601040363312, | |
| "rewards/cosine_scaled_reward": -0.050985115580260754, | |
| "rewards/format_reward": 0.3869047649204731, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2791.2083740234375, | |
| "epoch": 0.624, | |
| "grad_norm": 0.6277625560760498, | |
| "kl": 0.3359375, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.063, | |
| "reward": 0.15741928666830063, | |
| "reward_std": 0.7891719415783882, | |
| "rewards/cosine_scaled_reward": -0.1058141621761024, | |
| "rewards/format_reward": 0.3690476268529892, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2756.0596313476562, | |
| "epoch": 0.628, | |
| "grad_norm": 0.9464259147644043, | |
| "kl": 0.35107421875, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.1337, | |
| "reward": 0.20047340355813503, | |
| "reward_std": 0.7717511355876923, | |
| "rewards/cosine_scaled_reward": -0.10214426182210445, | |
| "rewards/format_reward": 0.4047619178891182, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2546.386993408203, | |
| "epoch": 0.632, | |
| "grad_norm": 0.9672547578811646, | |
| "kl": 0.3583984375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.1309, | |
| "reward": 0.3896455895155668, | |
| "reward_std": 0.8362017869949341, | |
| "rewards/cosine_scaled_reward": -0.025415319949388504, | |
| "rewards/format_reward": 0.4404761902987957, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2490.3869018554688, | |
| "epoch": 0.636, | |
| "grad_norm": 0.5016924738883972, | |
| "kl": 0.37744140625, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0548, | |
| "reward": 0.3971053212881088, | |
| "reward_std": 0.6817308068275452, | |
| "rewards/cosine_scaled_reward": -0.07823306252248585, | |
| "rewards/format_reward": 0.5535714328289032, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2715.90478515625, | |
| "epoch": 0.64, | |
| "grad_norm": 0.776878833770752, | |
| "kl": 0.4169921875, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.1414, | |
| "reward": 0.37737663462758064, | |
| "reward_std": 0.8348212540149689, | |
| "rewards/cosine_scaled_reward": -0.03452597954310477, | |
| "rewards/format_reward": 0.4464285746216774, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2636.0357666015625, | |
| "epoch": 0.644, | |
| "grad_norm": 1.2749825716018677, | |
| "kl": 0.48486328125, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.1445, | |
| "reward": 0.24750607460737228, | |
| "reward_std": 0.7917188853025436, | |
| "rewards/cosine_scaled_reward": -0.10541364271193743, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2622.9405517578125, | |
| "epoch": 0.648, | |
| "grad_norm": 1.3737562894821167, | |
| "kl": 0.5888671875, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.1549, | |
| "reward": 0.10282446062774397, | |
| "reward_std": 0.6833581179380417, | |
| "rewards/cosine_scaled_reward": -0.18073063343763351, | |
| "rewards/format_reward": 0.4642857275903225, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2187.9286193847656, | |
| "epoch": 0.652, | |
| "grad_norm": 1.2476062774658203, | |
| "kl": 0.64453125, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.1442, | |
| "reward": 0.5821249708533287, | |
| "reward_std": 0.8525291532278061, | |
| "rewards/cosine_scaled_reward": -0.0303660926874727, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2645.2202758789062, | |
| "epoch": 0.656, | |
| "grad_norm": 0.8759295344352722, | |
| "kl": 0.83203125, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.1369, | |
| "reward": 0.354750145226717, | |
| "reward_std": 0.6708278656005859, | |
| "rewards/cosine_scaled_reward": -0.09941063448786736, | |
| "rewards/format_reward": 0.5535714328289032, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2744.5952758789062, | |
| "epoch": 0.66, | |
| "grad_norm": 1.443908452987671, | |
| "kl": 0.9365234375, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.1514, | |
| "reward": 0.07671361323446035, | |
| "reward_std": 0.7401341199874878, | |
| "rewards/cosine_scaled_reward": -0.16997654270380735, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2762.2381591796875, | |
| "epoch": 0.664, | |
| "grad_norm": 2.171701192855835, | |
| "kl": 1.064453125, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0493, | |
| "reward": 0.3810354620218277, | |
| "reward_std": 0.6359066590666771, | |
| "rewards/cosine_scaled_reward": -0.05650608614087105, | |
| "rewards/format_reward": 0.4940476231276989, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2410.6726684570312, | |
| "epoch": 0.668, | |
| "grad_norm": 1.1915135383605957, | |
| "kl": 0.9716796875, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.1178, | |
| "reward": 0.5956609398126602, | |
| "reward_std": 0.7429262697696686, | |
| "rewards/cosine_scaled_reward": -0.011693337932229042, | |
| "rewards/format_reward": 0.6190476268529892, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2624.7083740234375, | |
| "epoch": 0.672, | |
| "grad_norm": 1.2750567197799683, | |
| "kl": 1.111328125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.1676, | |
| "reward": 0.35937594436109066, | |
| "reward_std": 0.7485721707344055, | |
| "rewards/cosine_scaled_reward": -0.0613834522664547, | |
| "rewards/format_reward": 0.482142873108387, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2566.7857666015625, | |
| "epoch": 0.676, | |
| "grad_norm": 0.8985515832901001, | |
| "kl": 1.0439453125, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.1232, | |
| "reward": 0.23157138470560312, | |
| "reward_std": 0.6288014650344849, | |
| "rewards/cosine_scaled_reward": -0.14314288273453712, | |
| "rewards/format_reward": 0.5178571492433548, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2820.5059814453125, | |
| "epoch": 0.68, | |
| "grad_norm": 1.1454522609710693, | |
| "kl": 0.9677734375, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.1412, | |
| "reward": 0.08721911488100886, | |
| "reward_std": 0.6948041319847107, | |
| "rewards/cosine_scaled_reward": -0.16769996285438538, | |
| "rewards/format_reward": 0.4226190522313118, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2376.166748046875, | |
| "epoch": 0.684, | |
| "grad_norm": 0.9355194568634033, | |
| "kl": 0.9521484375, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.128, | |
| "reward": 0.41750996466726065, | |
| "reward_std": 0.7085302621126175, | |
| "rewards/cosine_scaled_reward": -0.05910217575728893, | |
| "rewards/format_reward": 0.535714291036129, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2571.5000610351562, | |
| "epoch": 0.688, | |
| "grad_norm": 0.9496890902519226, | |
| "kl": 1.0341796875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.147, | |
| "reward": 0.21527537889778614, | |
| "reward_std": 0.6487467139959335, | |
| "rewards/cosine_scaled_reward": -0.2048623152077198, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2577.839324951172, | |
| "epoch": 0.692, | |
| "grad_norm": 1.125106692314148, | |
| "kl": 1.005859375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.1126, | |
| "reward": 0.29065654147416353, | |
| "reward_std": 0.5777322202920914, | |
| "rewards/cosine_scaled_reward": -0.11955267190933228, | |
| "rewards/format_reward": 0.5297619178891182, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2297.0416870117188, | |
| "epoch": 0.696, | |
| "grad_norm": 1.5477794408798218, | |
| "kl": 0.9482421875, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0866, | |
| "reward": 0.35003964975476265, | |
| "reward_std": 0.7120198756456375, | |
| "rewards/cosine_scaled_reward": -0.13152779638767242, | |
| "rewards/format_reward": 0.6130952388048172, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2239.952423095703, | |
| "epoch": 0.7, | |
| "grad_norm": 1.1404165029525757, | |
| "kl": 0.7734375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.1326, | |
| "reward": 0.7693988904356956, | |
| "reward_std": 0.8029063045978546, | |
| "rewards/cosine_scaled_reward": 0.0543422931805253, | |
| "rewards/format_reward": 0.6607142984867096, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2214.148895263672, | |
| "epoch": 0.704, | |
| "grad_norm": 0.976016104221344, | |
| "kl": 0.8193359375, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.1005, | |
| "reward": 0.5222894381731749, | |
| "reward_std": 0.6858630776405334, | |
| "rewards/cosine_scaled_reward": -0.07814099243842065, | |
| "rewards/format_reward": 0.6785714477300644, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2167.4345092773438, | |
| "epoch": 0.708, | |
| "grad_norm": 1.6724809408187866, | |
| "kl": 0.740234375, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.1424, | |
| "reward": 0.3468378521502018, | |
| "reward_std": 0.6407709717750549, | |
| "rewards/cosine_scaled_reward": -0.16289059445261955, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.5654907226562, | |
| "epoch": 0.712, | |
| "grad_norm": 1.3712421655654907, | |
| "kl": 0.9814453125, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.1121, | |
| "reward": 0.27433447539806366, | |
| "reward_std": 0.6857093423604965, | |
| "rewards/cosine_scaled_reward": -0.16342800296843052, | |
| "rewards/format_reward": 0.6011904925107956, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2240.9583129882812, | |
| "epoch": 0.716, | |
| "grad_norm": 2.2109479904174805, | |
| "kl": 0.7880859375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0613, | |
| "reward": 0.3249462991952896, | |
| "reward_std": 0.7396285533905029, | |
| "rewards/cosine_scaled_reward": -0.12919352855533361, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2391.261962890625, | |
| "epoch": 0.72, | |
| "grad_norm": 0.9252892136573792, | |
| "kl": 0.8369140625, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0766, | |
| "reward": 0.37066294252872467, | |
| "reward_std": 0.5772489011287689, | |
| "rewards/cosine_scaled_reward": -0.15395426377654076, | |
| "rewards/format_reward": 0.6785714477300644, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2188.21435546875, | |
| "epoch": 0.724, | |
| "grad_norm": 1.4679890871047974, | |
| "kl": 0.7177734375, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.1271, | |
| "reward": 0.47163213789463043, | |
| "reward_std": 0.7110278159379959, | |
| "rewards/cosine_scaled_reward": -0.10942202992737293, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2330.3572387695312, | |
| "epoch": 0.728, | |
| "grad_norm": 0.8398174047470093, | |
| "kl": 0.71875, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0837, | |
| "reward": 0.5167603380978107, | |
| "reward_std": 0.704664558172226, | |
| "rewards/cosine_scaled_reward": -0.08090554922819138, | |
| "rewards/format_reward": 0.6785714402794838, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2319.154815673828, | |
| "epoch": 0.732, | |
| "grad_norm": 1.0028657913208008, | |
| "kl": 0.7421875, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.1069, | |
| "reward": 0.6520561873912811, | |
| "reward_std": 0.8034340292215347, | |
| "rewards/cosine_scaled_reward": -0.04301954247057438, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2419.6131591796875, | |
| "epoch": 0.736, | |
| "grad_norm": 0.9799902439117432, | |
| "kl": 0.794921875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.1, | |
| "reward": 0.5134465768933296, | |
| "reward_std": 0.7416307479143143, | |
| "rewards/cosine_scaled_reward": -0.10637196339666843, | |
| "rewards/format_reward": 0.7261904776096344, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2279.3630981445312, | |
| "epoch": 0.74, | |
| "grad_norm": 1.1403197050094604, | |
| "kl": 0.75390625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.08, | |
| "reward": 0.5693989507853985, | |
| "reward_std": 0.6981105357408524, | |
| "rewards/cosine_scaled_reward": -0.06946719996631145, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2087.2679443359375, | |
| "epoch": 0.744, | |
| "grad_norm": 0.8785580992698669, | |
| "kl": 0.6123046875, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0849, | |
| "reward": 0.4244233965873718, | |
| "reward_std": 0.718925341963768, | |
| "rewards/cosine_scaled_reward": -0.1449311599135399, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2415.8036499023438, | |
| "epoch": 0.748, | |
| "grad_norm": 1.325434684753418, | |
| "kl": 0.6298828125, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0477, | |
| "reward": 0.594460990279913, | |
| "reward_std": 0.7041322290897369, | |
| "rewards/cosine_scaled_reward": -0.021221883594989777, | |
| "rewards/format_reward": 0.636904776096344, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2371.0714721679688, | |
| "epoch": 0.752, | |
| "grad_norm": 1.3853912353515625, | |
| "kl": 0.638671875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.1131, | |
| "reward": 0.5956445932388306, | |
| "reward_std": 0.7780069708824158, | |
| "rewards/cosine_scaled_reward": -0.062296761316247284, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2243.3333740234375, | |
| "epoch": 0.756, | |
| "grad_norm": 0.7066504955291748, | |
| "kl": 0.564453125, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.043, | |
| "reward": 0.7391829118132591, | |
| "reward_std": 0.6626263409852982, | |
| "rewards/cosine_scaled_reward": -0.014337139204144478, | |
| "rewards/format_reward": 0.767857164144516, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2111.875030517578, | |
| "epoch": 0.76, | |
| "grad_norm": 1.1808303594589233, | |
| "kl": 0.5361328125, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0212, | |
| "reward": 0.6303885579109192, | |
| "reward_std": 0.7266089022159576, | |
| "rewards/cosine_scaled_reward": -0.04790095146745443, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2512.1607666015625, | |
| "epoch": 0.764, | |
| "grad_norm": 1.169936180114746, | |
| "kl": 0.54736328125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0239, | |
| "reward": 0.4208872392773628, | |
| "reward_std": 0.6789906620979309, | |
| "rewards/cosine_scaled_reward": -0.12288972595706582, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2421.7381591796875, | |
| "epoch": 0.768, | |
| "grad_norm": 1.9125944375991821, | |
| "kl": 0.44970703125, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.162, | |
| "reward": 0.6703099310398102, | |
| "reward_std": 0.7079124301671982, | |
| "rewards/cosine_scaled_reward": 0.022654948756098747, | |
| "rewards/format_reward": 0.625, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2342.3333435058594, | |
| "epoch": 0.772, | |
| "grad_norm": 1.1848394870758057, | |
| "kl": 0.4287109375, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.1217, | |
| "reward": 0.3946942985057831, | |
| "reward_std": 0.7293716818094254, | |
| "rewards/cosine_scaled_reward": -0.14789094775915146, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2488.1786193847656, | |
| "epoch": 0.776, | |
| "grad_norm": 0.8427687883377075, | |
| "kl": 0.40673828125, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0681, | |
| "reward": 0.33857931289821863, | |
| "reward_std": 0.7693478316068649, | |
| "rewards/cosine_scaled_reward": -0.11642462853342295, | |
| "rewards/format_reward": 0.5714285671710968, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2235.4762573242188, | |
| "epoch": 0.78, | |
| "grad_norm": 1.9778449535369873, | |
| "kl": 0.4599609375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.1203, | |
| "reward": 0.7276730462908745, | |
| "reward_std": 0.8504652380943298, | |
| "rewards/cosine_scaled_reward": 0.02455079648643732, | |
| "rewards/format_reward": 0.6785714328289032, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2252.7262268066406, | |
| "epoch": 0.784, | |
| "grad_norm": 1.251224398612976, | |
| "kl": 0.49169921875, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0753, | |
| "reward": 0.6360676661133766, | |
| "reward_std": 0.8185366541147232, | |
| "rewards/cosine_scaled_reward": -0.01827568793669343, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2399.7500610351562, | |
| "epoch": 0.788, | |
| "grad_norm": 0.9470409154891968, | |
| "kl": 0.517578125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.1036, | |
| "reward": 0.550631508231163, | |
| "reward_std": 0.7208298593759537, | |
| "rewards/cosine_scaled_reward": -0.037184251472353935, | |
| "rewards/format_reward": 0.625, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2269.5000610351562, | |
| "epoch": 0.792, | |
| "grad_norm": 2.047698974609375, | |
| "kl": 0.595703125, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.1831, | |
| "reward": 0.29151881486177444, | |
| "reward_std": 0.666583925485611, | |
| "rewards/cosine_scaled_reward": -0.15483582392334938, | |
| "rewards/format_reward": 0.601190485060215, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2316.7857666015625, | |
| "epoch": 0.796, | |
| "grad_norm": 1.6713296175003052, | |
| "kl": 0.60205078125, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.1356, | |
| "reward": 0.5018086154013872, | |
| "reward_std": 0.8012387007474899, | |
| "rewards/cosine_scaled_reward": -0.0615957040572539, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2471.1964721679688, | |
| "epoch": 0.8, | |
| "grad_norm": 0.9633775949478149, | |
| "kl": 0.740234375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.1277, | |
| "reward": 0.3792301341891289, | |
| "reward_std": 0.76199010014534, | |
| "rewards/cosine_scaled_reward": -0.11693255044519901, | |
| "rewards/format_reward": 0.6130952537059784, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2280.6607971191406, | |
| "epoch": 0.804, | |
| "grad_norm": 1.1369765996932983, | |
| "kl": 0.7587890625, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.1174, | |
| "reward": 0.4361310079693794, | |
| "reward_std": 0.7977508455514908, | |
| "rewards/cosine_scaled_reward": -0.0736011671833694, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2239.3809814453125, | |
| "epoch": 0.808, | |
| "grad_norm": 1.1852681636810303, | |
| "kl": 0.80078125, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.1861, | |
| "reward": 0.4273875653743744, | |
| "reward_std": 0.7939650565385818, | |
| "rewards/cosine_scaled_reward": -0.08987765479832888, | |
| "rewards/format_reward": 0.6071428656578064, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2270.9524536132812, | |
| "epoch": 0.812, | |
| "grad_norm": 2.2510244846343994, | |
| "kl": 1.05859375, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.1778, | |
| "reward": 0.5268369093537331, | |
| "reward_std": 0.7606751769781113, | |
| "rewards/cosine_scaled_reward": -0.05205773119814694, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2307.83935546875, | |
| "epoch": 0.816, | |
| "grad_norm": 3.0754034519195557, | |
| "kl": 1.107421875, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.1251, | |
| "reward": 0.046380717772990465, | |
| "reward_std": 0.6517826318740845, | |
| "rewards/cosine_scaled_reward": -0.23573821783065796, | |
| "rewards/format_reward": 0.5178571566939354, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2065.482177734375, | |
| "epoch": 0.82, | |
| "grad_norm": 3.317054033279419, | |
| "kl": 0.8037109375, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.1229, | |
| "reward": 0.629617914557457, | |
| "reward_std": 0.7360707223415375, | |
| "rewards/cosine_scaled_reward": -0.012572012841701508, | |
| "rewards/format_reward": 0.654761902987957, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2119.2679443359375, | |
| "epoch": 0.824, | |
| "grad_norm": 1.985148310661316, | |
| "kl": 0.70849609375, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.1697, | |
| "reward": 0.4503296762704849, | |
| "reward_std": 0.7717154771089554, | |
| "rewards/cosine_scaled_reward": -0.07840658072382212, | |
| "rewards/format_reward": 0.6071428656578064, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2085.839324951172, | |
| "epoch": 0.828, | |
| "grad_norm": 2.4033172130584717, | |
| "kl": 0.6025390625, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.1578, | |
| "reward": 0.4145805863663554, | |
| "reward_std": 0.7361099421977997, | |
| "rewards/cosine_scaled_reward": -0.11116209626197815, | |
| "rewards/format_reward": 0.6369047611951828, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1872.482177734375, | |
| "epoch": 0.832, | |
| "grad_norm": 2.11576247215271, | |
| "kl": 0.408203125, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0839, | |
| "reward": 0.4670650511980057, | |
| "reward_std": 0.7250475585460663, | |
| "rewards/cosine_scaled_reward": -0.10872937482781708, | |
| "rewards/format_reward": 0.6845238208770752, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2085.6786499023438, | |
| "epoch": 0.836, | |
| "grad_norm": 0.8786793351173401, | |
| "kl": 0.51806640625, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0649, | |
| "reward": 0.46545055881142616, | |
| "reward_std": 0.6805593073368073, | |
| "rewards/cosine_scaled_reward": -0.09465568419545889, | |
| "rewards/format_reward": 0.6547619104385376, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2425.7916870117188, | |
| "epoch": 0.84, | |
| "grad_norm": 1.3337445259094238, | |
| "kl": 0.58837890625, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.1561, | |
| "reward": 0.4684627018868923, | |
| "reward_std": 0.824245348572731, | |
| "rewards/cosine_scaled_reward": -0.04553056287113577, | |
| "rewards/format_reward": 0.5595238208770752, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2630.5655517578125, | |
| "epoch": 0.844, | |
| "grad_norm": 1.3039979934692383, | |
| "kl": 0.732421875, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.1559, | |
| "reward": 0.1796425711363554, | |
| "reward_std": 0.6979469060897827, | |
| "rewards/cosine_scaled_reward": -0.17803586274385452, | |
| "rewards/format_reward": 0.5357143059372902, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1983.9762268066406, | |
| "epoch": 0.848, | |
| "grad_norm": 0.9129418134689331, | |
| "kl": 0.546875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.1352, | |
| "reward": 0.4564796891063452, | |
| "reward_std": 0.6133182421326637, | |
| "rewards/cosine_scaled_reward": -0.11402205377817154, | |
| "rewards/format_reward": 0.6845238208770752, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2315.5119018554688, | |
| "epoch": 0.852, | |
| "grad_norm": 1.2220977544784546, | |
| "kl": 0.5751953125, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.1683, | |
| "reward": 0.6708191484212875, | |
| "reward_std": 0.9547160714864731, | |
| "rewards/cosine_scaled_reward": 0.016957183834165335, | |
| "rewards/format_reward": 0.636904776096344, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2233.0655212402344, | |
| "epoch": 0.856, | |
| "grad_norm": 0.7978451251983643, | |
| "kl": 0.61474609375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.1304, | |
| "reward": 0.40765415877103806, | |
| "reward_std": 0.7158278822898865, | |
| "rewards/cosine_scaled_reward": -0.12057768838712946, | |
| "rewards/format_reward": 0.6488095372915268, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2194.791748046875, | |
| "epoch": 0.86, | |
| "grad_norm": 1.0176509618759155, | |
| "kl": 0.61181640625, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0991, | |
| "reward": 0.5912733934819698, | |
| "reward_std": 0.6540912538766861, | |
| "rewards/cosine_scaled_reward": -0.013887112960219383, | |
| "rewards/format_reward": 0.6190476194024086, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2189.5535888671875, | |
| "epoch": 0.864, | |
| "grad_norm": 0.7862021923065186, | |
| "kl": 0.6572265625, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.1369, | |
| "reward": 0.4810000769793987, | |
| "reward_std": 0.6697472035884857, | |
| "rewards/cosine_scaled_reward": -0.09283328615128994, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2238.6487731933594, | |
| "epoch": 0.868, | |
| "grad_norm": 0.675116240978241, | |
| "kl": 0.662109375, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.1234, | |
| "reward": 0.3597661480307579, | |
| "reward_std": 0.6638298779726028, | |
| "rewards/cosine_scaled_reward": -0.13559313118457794, | |
| "rewards/format_reward": 0.6309523731470108, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2330.7560424804688, | |
| "epoch": 0.872, | |
| "grad_norm": 0.7294526696205139, | |
| "kl": 0.6875, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.1182, | |
| "reward": 0.5070892386138439, | |
| "reward_std": 0.770987793803215, | |
| "rewards/cosine_scaled_reward": -0.029193488880991936, | |
| "rewards/format_reward": 0.565476194024086, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2425.4048461914062, | |
| "epoch": 0.876, | |
| "grad_norm": 0.9955194592475891, | |
| "kl": 0.76171875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.145, | |
| "reward": 0.3850390911102295, | |
| "reward_std": 0.72886823117733, | |
| "rewards/cosine_scaled_reward": -0.11700426135212183, | |
| "rewards/format_reward": 0.6190476417541504, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2443.869110107422, | |
| "epoch": 0.88, | |
| "grad_norm": 0.8245673179626465, | |
| "kl": 0.7412109375, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.1517, | |
| "reward": 0.3367026010528207, | |
| "reward_std": 0.6719767898321152, | |
| "rewards/cosine_scaled_reward": -0.1471248921006918, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2401.65478515625, | |
| "epoch": 0.884, | |
| "grad_norm": 0.6434879302978516, | |
| "kl": 0.552734375, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0964, | |
| "reward": 0.5589644331485033, | |
| "reward_std": 0.6387112140655518, | |
| "rewards/cosine_scaled_reward": -0.030041599762625992, | |
| "rewards/format_reward": 0.619047611951828, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2159.9107666015625, | |
| "epoch": 0.888, | |
| "grad_norm": 0.8747764229774475, | |
| "kl": 0.48974609375, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0648, | |
| "reward": 1.0575831979513168, | |
| "reward_std": 0.8345089554786682, | |
| "rewards/cosine_scaled_reward": 0.15379157848656178, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2736.8095703125, | |
| "epoch": 0.892, | |
| "grad_norm": 1.644534707069397, | |
| "kl": 0.650390625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0898, | |
| "reward": 0.28782752249389887, | |
| "reward_std": 0.6842672526836395, | |
| "rewards/cosine_scaled_reward": -0.11799101112410426, | |
| "rewards/format_reward": 0.5238095298409462, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2585.4464721679688, | |
| "epoch": 0.896, | |
| "grad_norm": 0.5411848425865173, | |
| "kl": 0.603515625, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.1243, | |
| "reward": 0.49723897874355316, | |
| "reward_std": 0.810086615383625, | |
| "rewards/cosine_scaled_reward": -0.07280909270048141, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2394.8988647460938, | |
| "epoch": 0.9, | |
| "grad_norm": 0.7165555357933044, | |
| "kl": 0.52783203125, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0888, | |
| "reward": 0.5129196643829346, | |
| "reward_std": 0.787805512547493, | |
| "rewards/cosine_scaled_reward": -0.056040180614218116, | |
| "rewards/format_reward": 0.6250000298023224, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2482.7678833007812, | |
| "epoch": 0.904, | |
| "grad_norm": 0.5211958289146423, | |
| "kl": 0.51416015625, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0812, | |
| "reward": 0.4906727410852909, | |
| "reward_std": 0.7880082875490189, | |
| "rewards/cosine_scaled_reward": -0.0641874436987564, | |
| "rewards/format_reward": 0.6190476417541504, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2290.0000610351562, | |
| "epoch": 0.908, | |
| "grad_norm": 0.5630519986152649, | |
| "kl": 0.382568359375, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.1034, | |
| "reward": 0.6861637309193611, | |
| "reward_std": 0.7359699308872223, | |
| "rewards/cosine_scaled_reward": -0.049775293562561274, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2686.7440795898438, | |
| "epoch": 0.912, | |
| "grad_norm": 0.6647688746452332, | |
| "kl": 0.4326171875, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0594, | |
| "reward": 0.5352285588160157, | |
| "reward_std": 0.7634364515542984, | |
| "rewards/cosine_scaled_reward": -0.047861908678896725, | |
| "rewards/format_reward": 0.6309524029493332, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2687.5416564941406, | |
| "epoch": 0.916, | |
| "grad_norm": 0.37424567341804504, | |
| "kl": 0.39208984375, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0465, | |
| "reward": 0.43462158273905516, | |
| "reward_std": 0.6648337990045547, | |
| "rewards/cosine_scaled_reward": -0.1070939814671874, | |
| "rewards/format_reward": 0.6488095298409462, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2462.8452758789062, | |
| "epoch": 0.92, | |
| "grad_norm": 0.52641361951828, | |
| "kl": 0.37158203125, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.1061, | |
| "reward": 0.5536616146564484, | |
| "reward_std": 0.6706894189119339, | |
| "rewards/cosine_scaled_reward": -0.06840727850794792, | |
| "rewards/format_reward": 0.690476194024086, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2395.279754638672, | |
| "epoch": 0.924, | |
| "grad_norm": 0.5165700912475586, | |
| "kl": 0.369140625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0951, | |
| "reward": 0.4786584824323654, | |
| "reward_std": 0.774825245141983, | |
| "rewards/cosine_scaled_reward": -0.1267421804368496, | |
| "rewards/format_reward": 0.7321428656578064, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2468.2857666015625, | |
| "epoch": 0.928, | |
| "grad_norm": 0.4581441879272461, | |
| "kl": 0.31591796875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0555, | |
| "reward": 0.6299031171947718, | |
| "reward_std": 0.7808382511138916, | |
| "rewards/cosine_scaled_reward": -0.045167478267103434, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2712.58935546875, | |
| "epoch": 0.932, | |
| "grad_norm": 0.7744795083999634, | |
| "kl": 0.333984375, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0623, | |
| "reward": 0.6912369206547737, | |
| "reward_std": 0.7789230197668076, | |
| "rewards/cosine_scaled_reward": 0.04204704426229, | |
| "rewards/format_reward": 0.607142873108387, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2556.619110107422, | |
| "epoch": 0.936, | |
| "grad_norm": 0.8385416865348816, | |
| "kl": 0.38427734375, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0662, | |
| "reward": 0.4830031730234623, | |
| "reward_std": 0.7291474640369415, | |
| "rewards/cosine_scaled_reward": -0.07397460378706455, | |
| "rewards/format_reward": 0.6309523731470108, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2724.8809814453125, | |
| "epoch": 0.94, | |
| "grad_norm": 0.6943939328193665, | |
| "kl": 0.330078125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.1008, | |
| "reward": 0.38701344281435013, | |
| "reward_std": 0.7834271490573883, | |
| "rewards/cosine_scaled_reward": -0.11304090730845928, | |
| "rewards/format_reward": 0.6130952388048172, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2819.7738647460938, | |
| "epoch": 0.944, | |
| "grad_norm": 0.3916683495044708, | |
| "kl": 0.32177734375, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0365, | |
| "reward": 0.5249419808387756, | |
| "reward_std": 0.8138006925582886, | |
| "rewards/cosine_scaled_reward": -0.023243289440870285, | |
| "rewards/format_reward": 0.571428582072258, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2499.0536499023438, | |
| "epoch": 0.948, | |
| "grad_norm": 0.9175835847854614, | |
| "kl": 0.321533203125, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.1055, | |
| "reward": 0.6389507204294205, | |
| "reward_std": 0.8023868650197983, | |
| "rewards/cosine_scaled_reward": -0.04064369201660156, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2557.0655517578125, | |
| "epoch": 0.952, | |
| "grad_norm": 0.4397272765636444, | |
| "kl": 0.30810546875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0869, | |
| "reward": 0.4888541977852583, | |
| "reward_std": 0.7550098150968552, | |
| "rewards/cosine_scaled_reward": -0.09783481806516647, | |
| "rewards/format_reward": 0.684523805975914, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2600.422637939453, | |
| "epoch": 0.956, | |
| "grad_norm": 0.9344379305839539, | |
| "kl": 0.345703125, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.1329, | |
| "reward": 0.27865387313067913, | |
| "reward_std": 0.6713129729032516, | |
| "rewards/cosine_scaled_reward": -0.18210165202617645, | |
| "rewards/format_reward": 0.6428571417927742, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2331.8928833007812, | |
| "epoch": 0.96, | |
| "grad_norm": 0.5995355248451233, | |
| "kl": 0.326904296875, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0705, | |
| "reward": 0.7613647617399693, | |
| "reward_std": 0.8133140057325363, | |
| "rewards/cosine_scaled_reward": 0.023539513116702437, | |
| "rewards/format_reward": 0.7142857164144516, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2355.52978515625, | |
| "epoch": 0.964, | |
| "grad_norm": 0.3258729875087738, | |
| "kl": 0.307861328125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0608, | |
| "reward": 0.6096780672669411, | |
| "reward_std": 0.7518916502594948, | |
| "rewards/cosine_scaled_reward": -0.05230383496382274, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2414.0834350585938, | |
| "epoch": 0.968, | |
| "grad_norm": 0.3521139919757843, | |
| "kl": 0.3212890625, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0742, | |
| "reward": 0.6254040375351906, | |
| "reward_std": 0.8331593424081802, | |
| "rewards/cosine_scaled_reward": -0.03551226551644504, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2138.107177734375, | |
| "epoch": 0.972, | |
| "grad_norm": 0.5599615573883057, | |
| "kl": 0.33251953125, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0315, | |
| "reward": 0.8373362571001053, | |
| "reward_std": 0.6551230400800705, | |
| "rewards/cosine_scaled_reward": 0.04366813227534294, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2277.714324951172, | |
| "epoch": 0.976, | |
| "grad_norm": 0.6147165298461914, | |
| "kl": 0.336181640625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.1075, | |
| "reward": 0.46155789494514465, | |
| "reward_std": 0.6391154229640961, | |
| "rewards/cosine_scaled_reward": -0.14124487387016416, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2401.232208251953, | |
| "epoch": 0.98, | |
| "grad_norm": 0.8454631567001343, | |
| "kl": 0.4814453125, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0417, | |
| "reward": 0.5565547049045563, | |
| "reward_std": 0.6768698394298553, | |
| "rewards/cosine_scaled_reward": -0.04315121428226121, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2027.3453369140625, | |
| "epoch": 0.984, | |
| "grad_norm": 0.38341155648231506, | |
| "kl": 0.289794921875, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0993, | |
| "reward": 0.7784423977136612, | |
| "reward_std": 0.6467820554971695, | |
| "rewards/cosine_scaled_reward": -0.009588314220309258, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2170.5416564941406, | |
| "epoch": 0.988, | |
| "grad_norm": 0.445024311542511, | |
| "kl": 0.4326171875, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.1244, | |
| "reward": 0.6971250772476196, | |
| "reward_std": 0.7919557690620422, | |
| "rewards/cosine_scaled_reward": -0.026437478853040375, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2060.2559814453125, | |
| "epoch": 0.992, | |
| "grad_norm": 0.49659866094589233, | |
| "kl": 0.36865234375, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0959, | |
| "reward": 0.6287773251533508, | |
| "reward_std": 0.7386345416307449, | |
| "rewards/cosine_scaled_reward": -0.060611339285969734, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2148.869140625, | |
| "epoch": 0.996, | |
| "grad_norm": 0.4539166986942291, | |
| "kl": 0.3701171875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.1019, | |
| "reward": 0.4996798560023308, | |
| "reward_std": 0.6163481399416924, | |
| "rewards/cosine_scaled_reward": -0.11027912324061617, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2330.3482971191406, | |
| "epoch": 1.0, | |
| "grad_norm": 0.5344291925430298, | |
| "kl": 0.5205078125, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0866, | |
| "reward": 0.42578159645199776, | |
| "reward_std": 0.7348527163267136, | |
| "rewards/cosine_scaled_reward": -0.08472825400531292, | |
| "rewards/format_reward": 0.5952381044626236, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2106.059539794922, | |
| "epoch": 1.004, | |
| "grad_norm": 0.5930522680282593, | |
| "kl": 0.38232421875, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.1423, | |
| "reward": 0.5456876549869776, | |
| "reward_std": 0.6847013607621193, | |
| "rewards/cosine_scaled_reward": -0.0842990386299789, | |
| "rewards/format_reward": 0.7142857164144516, | |
| "step": 251 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2112.4762573242188, | |
| "epoch": 1.008, | |
| "grad_norm": 0.4610899090766907, | |
| "kl": 0.39111328125, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0641, | |
| "reward": 0.6148004308342934, | |
| "reward_std": 0.6790047585964203, | |
| "rewards/cosine_scaled_reward": -0.05867121648043394, | |
| "rewards/format_reward": 0.7321428805589676, | |
| "step": 252 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.309600830078, | |
| "epoch": 1.012, | |
| "grad_norm": 0.3950199782848358, | |
| "kl": 0.387451171875, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.1336, | |
| "reward": 0.626802071928978, | |
| "reward_std": 0.6337872818112373, | |
| "rewards/cosine_scaled_reward": -0.028860883321613073, | |
| "rewards/format_reward": 0.6845238283276558, | |
| "step": 253 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2465.6130981445312, | |
| "epoch": 1.016, | |
| "grad_norm": 0.6084108352661133, | |
| "kl": 0.45947265625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0932, | |
| "reward": 0.6250473670661449, | |
| "reward_std": 0.7445118278264999, | |
| "rewards/cosine_scaled_reward": 2.3671891540288925e-05, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 254 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2127.8988647460938, | |
| "epoch": 1.02, | |
| "grad_norm": 0.8596522212028503, | |
| "kl": 0.368896484375, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0589, | |
| "reward": 0.4597589522600174, | |
| "reward_std": 0.710930123925209, | |
| "rewards/cosine_scaled_reward": -0.1361919562332332, | |
| "rewards/format_reward": 0.7321428656578064, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2113.029815673828, | |
| "epoch": 1.024, | |
| "grad_norm": 0.6557802557945251, | |
| "kl": 0.39306640625, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0808, | |
| "reward": 0.7969172149896622, | |
| "reward_std": 0.7165066450834274, | |
| "rewards/cosine_scaled_reward": 0.02643477637320757, | |
| "rewards/format_reward": 0.7440476268529892, | |
| "step": 256 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2350.4881591796875, | |
| "epoch": 1.028, | |
| "grad_norm": 0.7259902954101562, | |
| "kl": 0.37548828125, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0556, | |
| "reward": 0.6144686937332153, | |
| "reward_std": 0.7161982655525208, | |
| "rewards/cosine_scaled_reward": -0.0052656568586826324, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 257 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2659.2500610351562, | |
| "epoch": 1.032, | |
| "grad_norm": 0.6974296569824219, | |
| "kl": 0.4482421875, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.1425, | |
| "reward": 0.38613639771938324, | |
| "reward_std": 0.7526693046092987, | |
| "rewards/cosine_scaled_reward": -0.10455084778368473, | |
| "rewards/format_reward": 0.595238097012043, | |
| "step": 258 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2219.6964721679688, | |
| "epoch": 1.036, | |
| "grad_norm": 0.5528798699378967, | |
| "kl": 0.33984375, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.1191, | |
| "reward": 0.6971464306116104, | |
| "reward_std": 0.7383679300546646, | |
| "rewards/cosine_scaled_reward": -0.023450596883776598, | |
| "rewards/format_reward": 0.744047611951828, | |
| "step": 259 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2010.1726989746094, | |
| "epoch": 1.04, | |
| "grad_norm": 0.36631372570991516, | |
| "kl": 0.30126953125, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0699, | |
| "reward": 0.771461233496666, | |
| "reward_std": 0.5148339942097664, | |
| "rewards/cosine_scaled_reward": -0.01307891309261322, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2153.559539794922, | |
| "epoch": 1.044, | |
| "grad_norm": 0.48435378074645996, | |
| "kl": 0.3251953125, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0931, | |
| "reward": 0.5015835016965866, | |
| "reward_std": 0.69777412712574, | |
| "rewards/cosine_scaled_reward": -0.121232058852911, | |
| "rewards/format_reward": 0.7440476268529892, | |
| "step": 261 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2309.1131591796875, | |
| "epoch": 1.048, | |
| "grad_norm": 0.6150787472724915, | |
| "kl": 0.37060546875, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.1074, | |
| "reward": 0.656824603676796, | |
| "reward_std": 0.7539815902709961, | |
| "rewards/cosine_scaled_reward": -0.025754368398338556, | |
| "rewards/format_reward": 0.7083333283662796, | |
| "step": 262 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2073.1488647460938, | |
| "epoch": 1.052, | |
| "grad_norm": 0.5915967226028442, | |
| "kl": 0.32958984375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.1016, | |
| "reward": 0.5839189141988754, | |
| "reward_std": 0.6906930133700371, | |
| "rewards/cosine_scaled_reward": -0.10089768993202597, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 263 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2258.6607971191406, | |
| "epoch": 1.056, | |
| "grad_norm": 0.5032393932342529, | |
| "kl": 0.421875, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0833, | |
| "reward": 0.7445018216967583, | |
| "reward_std": 0.7239043861627579, | |
| "rewards/cosine_scaled_reward": 0.0002271006815135479, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 264 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2421.386962890625, | |
| "epoch": 1.06, | |
| "grad_norm": 0.5948444604873657, | |
| "kl": 0.46826171875, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.1342, | |
| "reward": 0.3432777523994446, | |
| "reward_std": 0.7306928038597107, | |
| "rewards/cosine_scaled_reward": -0.1527658887207508, | |
| "rewards/format_reward": 0.6488095223903656, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1943.8214416503906, | |
| "epoch": 1.064, | |
| "grad_norm": 0.672618567943573, | |
| "kl": 0.33251953125, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.074, | |
| "reward": 0.5523176118731499, | |
| "reward_std": 0.6472664028406143, | |
| "rewards/cosine_scaled_reward": -0.08693643007427454, | |
| "rewards/format_reward": 0.7261904776096344, | |
| "step": 266 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.4107666015625, | |
| "epoch": 1.068, | |
| "grad_norm": 0.43480613827705383, | |
| "kl": 0.41064453125, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.116, | |
| "reward": 0.6816908866167068, | |
| "reward_std": 0.7700821459293365, | |
| "rewards/cosine_scaled_reward": -0.01034504920244217, | |
| "rewards/format_reward": 0.70238097012043, | |
| "step": 267 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2265.166748046875, | |
| "epoch": 1.072, | |
| "grad_norm": 0.8894410729408264, | |
| "kl": 0.37890625, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.1102, | |
| "reward": 0.5768959820270538, | |
| "reward_std": 0.7392304837703705, | |
| "rewards/cosine_scaled_reward": -0.04786152858287096, | |
| "rewards/format_reward": 0.6726190447807312, | |
| "step": 268 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2102.4345092773438, | |
| "epoch": 1.076, | |
| "grad_norm": 1.40628182888031, | |
| "kl": 0.34814453125, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.1523, | |
| "reward": 0.6540864631533623, | |
| "reward_std": 0.7483679950237274, | |
| "rewards/cosine_scaled_reward": -0.030099631054326892, | |
| "rewards/format_reward": 0.7142857164144516, | |
| "step": 269 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1760.5893249511719, | |
| "epoch": 1.08, | |
| "grad_norm": 0.655262291431427, | |
| "kl": 0.34228515625, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0938, | |
| "reward": 0.7075737789273262, | |
| "reward_std": 0.712226152420044, | |
| "rewards/cosine_scaled_reward": -0.045022654812783, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1989.1488037109375, | |
| "epoch": 1.084, | |
| "grad_norm": 0.5984042286872864, | |
| "kl": 0.3974609375, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0893, | |
| "reward": 0.5623346008360386, | |
| "reward_std": 0.7052316814661026, | |
| "rewards/cosine_scaled_reward": -0.08192794572096318, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 271 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1998.327392578125, | |
| "epoch": 1.088, | |
| "grad_norm": 0.41462650895118713, | |
| "kl": 0.37939453125, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.1384, | |
| "reward": 0.6586858294904232, | |
| "reward_std": 0.8071554154157639, | |
| "rewards/cosine_scaled_reward": -0.018871376756578684, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 272 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1905.9880981445312, | |
| "epoch": 1.092, | |
| "grad_norm": 1.1817877292633057, | |
| "kl": 0.4287109375, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.108, | |
| "reward": 0.585694283246994, | |
| "reward_std": 0.6987177431583405, | |
| "rewards/cosine_scaled_reward": -0.08512906730175018, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 273 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1958.0892639160156, | |
| "epoch": 1.096, | |
| "grad_norm": 0.6756201982498169, | |
| "kl": 0.44580078125, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.1298, | |
| "reward": 0.5423668641597033, | |
| "reward_std": 0.5766877979040146, | |
| "rewards/cosine_scaled_reward": -0.09786419570446014, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 274 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1423.4405212402344, | |
| "epoch": 1.1, | |
| "grad_norm": 0.9936150908470154, | |
| "kl": 0.283203125, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0068, | |
| "reward": 0.8336242958903313, | |
| "reward_std": 0.6556554213166237, | |
| "rewards/cosine_scaled_reward": -0.02366404954227619, | |
| "rewards/format_reward": 0.8809524178504944, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1477.011962890625, | |
| "epoch": 1.104, | |
| "grad_norm": 1.4654834270477295, | |
| "kl": 0.30419921875, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.1583, | |
| "reward": 0.9086148589849472, | |
| "reward_std": 0.7289283871650696, | |
| "rewards/cosine_scaled_reward": 0.0197836235165596, | |
| "rewards/format_reward": 0.8690476417541504, | |
| "step": 276 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1600.8333740234375, | |
| "epoch": 1.108, | |
| "grad_norm": 0.5991122126579285, | |
| "kl": 0.39697265625, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0615, | |
| "reward": 0.6998666599392891, | |
| "reward_std": 0.6800315380096436, | |
| "rewards/cosine_scaled_reward": -0.06375712971203029, | |
| "rewards/format_reward": 0.82738097012043, | |
| "step": 277 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.3035583496094, | |
| "epoch": 1.112, | |
| "grad_norm": 0.9033568501472473, | |
| "kl": 0.4404296875, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0566, | |
| "reward": 0.5947119817137718, | |
| "reward_std": 0.6757695525884628, | |
| "rewards/cosine_scaled_reward": -0.0806201882660389, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 278 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1884.9286499023438, | |
| "epoch": 1.116, | |
| "grad_norm": 1.0505043268203735, | |
| "kl": 0.41162109375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0838, | |
| "reward": 0.546771340072155, | |
| "reward_std": 0.5643983408808708, | |
| "rewards/cosine_scaled_reward": -0.13137624226510525, | |
| "rewards/format_reward": 0.8095238208770752, | |
| "step": 279 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1721.6309814453125, | |
| "epoch": 1.12, | |
| "grad_norm": 2.6171982288360596, | |
| "kl": 0.35400390625, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.119, | |
| "reward": 0.7959851026535034, | |
| "reward_std": 0.6236628741025925, | |
| "rewards/cosine_scaled_reward": -0.03057891083881259, | |
| "rewards/format_reward": 0.8571428805589676, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1974.3809814453125, | |
| "epoch": 1.124, | |
| "grad_norm": 0.9569424390792847, | |
| "kl": 0.4814453125, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.1494, | |
| "reward": 0.573462575674057, | |
| "reward_std": 0.6640851646661758, | |
| "rewards/cosine_scaled_reward": -0.09422110859304667, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 281 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1699.2142944335938, | |
| "epoch": 1.1280000000000001, | |
| "grad_norm": 0.5432654619216919, | |
| "kl": 0.33935546875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0877, | |
| "reward": 0.7524446099996567, | |
| "reward_std": 0.6557567343115807, | |
| "rewards/cosine_scaled_reward": -0.04342056508176029, | |
| "rewards/format_reward": 0.8392857164144516, | |
| "step": 282 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2023.1012573242188, | |
| "epoch": 1.1320000000000001, | |
| "grad_norm": 1.5788854360580444, | |
| "kl": 0.498046875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.1449, | |
| "reward": 0.471544723957777, | |
| "reward_std": 0.7016247361898422, | |
| "rewards/cosine_scaled_reward": -0.14220385067164898, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 283 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1751.6190795898438, | |
| "epoch": 1.1360000000000001, | |
| "grad_norm": 0.8654693365097046, | |
| "kl": 0.45654296875, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.096, | |
| "reward": 0.8401590138673782, | |
| "reward_std": 0.7027324140071869, | |
| "rewards/cosine_scaled_reward": -0.008491916581988335, | |
| "rewards/format_reward": 0.8571428805589676, | |
| "step": 284 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1953.1309814453125, | |
| "epoch": 1.1400000000000001, | |
| "grad_norm": 0.7724223732948303, | |
| "kl": 0.43017578125, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.1257, | |
| "reward": 0.5251086875796318, | |
| "reward_std": 0.75553198158741, | |
| "rewards/cosine_scaled_reward": -0.11244566680397838, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1932.9940795898438, | |
| "epoch": 1.144, | |
| "grad_norm": 0.6642920970916748, | |
| "kl": 0.470703125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.096, | |
| "reward": 0.8715938031673431, | |
| "reward_std": 0.7678115516901016, | |
| "rewards/cosine_scaled_reward": 0.036987369414418936, | |
| "rewards/format_reward": 0.7976190745830536, | |
| "step": 286 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1784.1428527832031, | |
| "epoch": 1.148, | |
| "grad_norm": 0.9823849201202393, | |
| "kl": 0.38134765625, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0437, | |
| "reward": 0.7326274067163467, | |
| "reward_std": 0.6021066680550575, | |
| "rewards/cosine_scaled_reward": -0.044400574173778296, | |
| "rewards/format_reward": 0.8214285969734192, | |
| "step": 287 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1929.0536193847656, | |
| "epoch": 1.152, | |
| "grad_norm": 2.430745840072632, | |
| "kl": 0.45458984375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.1275, | |
| "reward": 0.76754130423069, | |
| "reward_std": 0.6635829508304596, | |
| "rewards/cosine_scaled_reward": -0.0001579252420924604, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 288 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1567.4226684570312, | |
| "epoch": 1.156, | |
| "grad_norm": 1.8522855043411255, | |
| "kl": 0.35302734375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0659, | |
| "reward": 0.8090793639421463, | |
| "reward_std": 0.6970714181661606, | |
| "rewards/cosine_scaled_reward": -0.02105556521564722, | |
| "rewards/format_reward": 0.8511905074119568, | |
| "step": 289 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1916.0298156738281, | |
| "epoch": 1.16, | |
| "grad_norm": 0.8320524096488953, | |
| "kl": 0.353515625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0397, | |
| "reward": 0.8147249445319176, | |
| "reward_std": 0.7559010833501816, | |
| "rewards/cosine_scaled_reward": 0.014505308354273438, | |
| "rewards/format_reward": 0.7857143133878708, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1871.5595703125, | |
| "epoch": 1.164, | |
| "grad_norm": 1.639461636543274, | |
| "kl": 0.44482421875, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.02, | |
| "reward": 0.7966814041137695, | |
| "reward_std": 0.6868171393871307, | |
| "rewards/cosine_scaled_reward": -0.03320692107081413, | |
| "rewards/format_reward": 0.8630952686071396, | |
| "step": 291 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1849.0714721679688, | |
| "epoch": 1.168, | |
| "grad_norm": 0.9159106016159058, | |
| "kl": 0.41357421875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.1098, | |
| "reward": 0.8123535662889481, | |
| "reward_std": 0.7406510710716248, | |
| "rewards/cosine_scaled_reward": -0.0075137000530958176, | |
| "rewards/format_reward": 0.82738097012043, | |
| "step": 292 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1627.2262268066406, | |
| "epoch": 1.172, | |
| "grad_norm": 1.2907826900482178, | |
| "kl": 0.3271484375, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.028, | |
| "reward": 0.7378726750612259, | |
| "reward_std": 0.6904594451189041, | |
| "rewards/cosine_scaled_reward": -0.07153987139463425, | |
| "rewards/format_reward": 0.8809524029493332, | |
| "step": 293 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2141.2202758789062, | |
| "epoch": 1.176, | |
| "grad_norm": 0.7737708687782288, | |
| "kl": 0.4482421875, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0644, | |
| "reward": 0.7625293210148811, | |
| "reward_std": 0.7152971476316452, | |
| "rewards/cosine_scaled_reward": 0.009240844286978245, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 294 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2187.3809814453125, | |
| "epoch": 1.18, | |
| "grad_norm": 1.1525542736053467, | |
| "kl": 0.51025390625, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0634, | |
| "reward": 0.5901899486780167, | |
| "reward_std": 0.6728092133998871, | |
| "rewards/cosine_scaled_reward": -0.0739526596153155, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2388.577392578125, | |
| "epoch": 1.184, | |
| "grad_norm": 0.9084761738777161, | |
| "kl": 0.52734375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0587, | |
| "reward": 0.4302752036601305, | |
| "reward_std": 0.615352213382721, | |
| "rewards/cosine_scaled_reward": -0.12117192603182048, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 296 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1568.6786193847656, | |
| "epoch": 1.188, | |
| "grad_norm": 0.852024495601654, | |
| "kl": 0.1533203125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0574, | |
| "reward": 0.7380149587988853, | |
| "reward_std": 0.7155523598194122, | |
| "rewards/cosine_scaled_reward": -0.029802043922245502, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 297 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1983.4524230957031, | |
| "epoch": 1.192, | |
| "grad_norm": 0.7617373466491699, | |
| "kl": 0.303955078125, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.068, | |
| "reward": 0.7127486318349838, | |
| "reward_std": 0.7076264545321465, | |
| "rewards/cosine_scaled_reward": -0.018625682685524225, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2038.3750305175781, | |
| "epoch": 1.196, | |
| "grad_norm": 0.8094474673271179, | |
| "kl": 0.25830078125, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0566, | |
| "reward": 0.6301854252815247, | |
| "reward_std": 0.6336864829063416, | |
| "rewards/cosine_scaled_reward": -0.036097751930356026, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 299 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1825.9643249511719, | |
| "epoch": 1.2, | |
| "grad_norm": 1.8039993047714233, | |
| "kl": 0.225830078125, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0501, | |
| "reward": 0.9151953011751175, | |
| "reward_std": 0.6518659368157387, | |
| "rewards/cosine_scaled_reward": 0.03200240898877382, | |
| "rewards/format_reward": 0.8511905074119568, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1954.3809814453125, | |
| "epoch": 1.204, | |
| "grad_norm": 0.9098180532455444, | |
| "kl": 0.2685546875, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.08, | |
| "reward": 0.903901144862175, | |
| "reward_std": 0.7074443101882935, | |
| "rewards/cosine_scaled_reward": 0.059093400835990906, | |
| "rewards/format_reward": 0.7857142835855484, | |
| "step": 301 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.6012268066406, | |
| "epoch": 1.208, | |
| "grad_norm": 0.628447949886322, | |
| "kl": 0.297119140625, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0225, | |
| "reward": 0.7435066364705563, | |
| "reward_std": 0.7286128550767899, | |
| "rewards/cosine_scaled_reward": 0.002705696038901806, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 302 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1967.6607360839844, | |
| "epoch": 1.212, | |
| "grad_norm": 1.4870760440826416, | |
| "kl": 0.2193603515625, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0076, | |
| "reward": 0.6118638888001442, | |
| "reward_std": 0.6256552934646606, | |
| "rewards/cosine_scaled_reward": -0.08990138117223978, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 303 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2091.8810119628906, | |
| "epoch": 1.216, | |
| "grad_norm": 1.0213916301727295, | |
| "kl": 0.28857421875, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0819, | |
| "reward": 0.8858746439218521, | |
| "reward_std": 0.760543704032898, | |
| "rewards/cosine_scaled_reward": 0.04412779211997986, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 304 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2117.5357971191406, | |
| "epoch": 1.22, | |
| "grad_norm": 1.1696289777755737, | |
| "kl": 0.266845703125, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0319, | |
| "reward": 0.4878672659397125, | |
| "reward_std": 0.5883132815361023, | |
| "rewards/cosine_scaled_reward": -0.13999494537711143, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1962.8512268066406, | |
| "epoch": 1.224, | |
| "grad_norm": 1.1181604862213135, | |
| "kl": 0.26171875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0553, | |
| "reward": 0.8040256127715111, | |
| "reward_std": 0.7542890757322311, | |
| "rewards/cosine_scaled_reward": 0.00915566342882812, | |
| "rewards/format_reward": 0.7857143059372902, | |
| "step": 306 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2503.541717529297, | |
| "epoch": 1.228, | |
| "grad_norm": 1.0075181722640991, | |
| "kl": 0.274658203125, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0639, | |
| "reward": 0.5502185635268688, | |
| "reward_std": 0.7036140263080597, | |
| "rewards/cosine_scaled_reward": -0.043343101628124714, | |
| "rewards/format_reward": 0.6369047686457634, | |
| "step": 307 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2027.6905212402344, | |
| "epoch": 1.232, | |
| "grad_norm": 2.7786951065063477, | |
| "kl": 0.265380859375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.1529, | |
| "reward": 0.8017951250076294, | |
| "reward_std": 0.7912951856851578, | |
| "rewards/cosine_scaled_reward": 0.005064212018623948, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 308 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2560.6845703125, | |
| "epoch": 1.236, | |
| "grad_norm": 1.6693713665008545, | |
| "kl": 0.2939453125, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.1046, | |
| "reward": 0.6068699322640896, | |
| "reward_std": 0.7445466667413712, | |
| "rewards/cosine_scaled_reward": -0.00013647368177771568, | |
| "rewards/format_reward": 0.6071428656578064, | |
| "step": 309 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.3988037109375, | |
| "epoch": 1.24, | |
| "grad_norm": 0.7167072892189026, | |
| "kl": 0.253173828125, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.046, | |
| "reward": 0.5108997635543346, | |
| "reward_std": 0.6983606815338135, | |
| "rewards/cosine_scaled_reward": -0.09276440553367138, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2308.1012573242188, | |
| "epoch": 1.244, | |
| "grad_norm": 1.289093255996704, | |
| "kl": 0.24072265625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.074, | |
| "reward": 0.49418094009160995, | |
| "reward_std": 0.6803844273090363, | |
| "rewards/cosine_scaled_reward": -0.0862428704276681, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 311 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.500030517578, | |
| "epoch": 1.248, | |
| "grad_norm": 0.7747544646263123, | |
| "kl": 0.284423828125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0127, | |
| "reward": 0.5928547494113445, | |
| "reward_std": 0.6995180547237396, | |
| "rewards/cosine_scaled_reward": -0.0875012082979083, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 312 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2528.3750610351562, | |
| "epoch": 1.252, | |
| "grad_norm": 0.9067274928092957, | |
| "kl": 0.261962890625, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0588, | |
| "reward": 0.580617468804121, | |
| "reward_std": 0.7565959244966507, | |
| "rewards/cosine_scaled_reward": -0.05195318069308996, | |
| "rewards/format_reward": 0.6845238208770752, | |
| "step": 313 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2290.434539794922, | |
| "epoch": 1.256, | |
| "grad_norm": 1.0397149324417114, | |
| "kl": 0.2939453125, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0843, | |
| "reward": 0.923637330532074, | |
| "reward_std": 0.8029050081968307, | |
| "rewards/cosine_scaled_reward": 0.07491390081122518, | |
| "rewards/format_reward": 0.7738095372915268, | |
| "step": 314 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2221.1785583496094, | |
| "epoch": 1.26, | |
| "grad_norm": 0.8451793789863586, | |
| "kl": 0.2978515625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0794, | |
| "reward": 0.9175606220960617, | |
| "reward_std": 0.6950835883617401, | |
| "rewards/cosine_scaled_reward": 0.06592314876616001, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2737.1131591796875, | |
| "epoch": 1.264, | |
| "grad_norm": 0.8914613723754883, | |
| "kl": 0.4072265625, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0266, | |
| "reward": 0.39799112919718027, | |
| "reward_std": 0.5211281925439835, | |
| "rewards/cosine_scaled_reward": -0.0896949004381895, | |
| "rewards/format_reward": 0.5773809626698494, | |
| "step": 316 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2356.8155517578125, | |
| "epoch": 1.268, | |
| "grad_norm": 0.8658885955810547, | |
| "kl": 0.32080078125, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0558, | |
| "reward": 0.4528093598783016, | |
| "reward_std": 0.5718662440776825, | |
| "rewards/cosine_scaled_reward": -0.11585722491145134, | |
| "rewards/format_reward": 0.6845238283276558, | |
| "step": 317 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2067.0952758789062, | |
| "epoch": 1.272, | |
| "grad_norm": 0.6174459457397461, | |
| "kl": 0.2724609375, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0532, | |
| "reward": 0.9527914822101593, | |
| "reward_std": 0.7573249191045761, | |
| "rewards/cosine_scaled_reward": 0.059729063883423805, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 318 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2158.3631286621094, | |
| "epoch": 1.276, | |
| "grad_norm": 0.5749858617782593, | |
| "kl": 0.2744140625, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0398, | |
| "reward": 0.7759583368897438, | |
| "reward_std": 0.7076128423213959, | |
| "rewards/cosine_scaled_reward": 0.00702677620574832, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 319 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2376.7857666015625, | |
| "epoch": 1.28, | |
| "grad_norm": 0.4824450612068176, | |
| "kl": 0.358642578125, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0579, | |
| "reward": 0.5863704346120358, | |
| "reward_std": 0.69185970723629, | |
| "rewards/cosine_scaled_reward": -0.058005278930068016, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2094.6786193847656, | |
| "epoch": 1.284, | |
| "grad_norm": 1.153307318687439, | |
| "kl": 0.32373046875, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0147, | |
| "reward": 0.5667938031256199, | |
| "reward_std": 0.6206858605146408, | |
| "rewards/cosine_scaled_reward": -0.12434119766112417, | |
| "rewards/format_reward": 0.8154762089252472, | |
| "step": 321 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2360.970245361328, | |
| "epoch": 1.288, | |
| "grad_norm": 0.7556703090667725, | |
| "kl": 0.314697265625, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0413, | |
| "reward": 0.54334956407547, | |
| "reward_std": 0.7112371101975441, | |
| "rewards/cosine_scaled_reward": -0.10927761369384825, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 322 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2466.9703369140625, | |
| "epoch": 1.292, | |
| "grad_norm": 0.6241899728775024, | |
| "kl": 0.345703125, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0463, | |
| "reward": 0.5620089694857597, | |
| "reward_std": 0.6381285488605499, | |
| "rewards/cosine_scaled_reward": -0.0672098146751523, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 323 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2297.2916564941406, | |
| "epoch": 1.296, | |
| "grad_norm": 1.050784945487976, | |
| "kl": 0.289306640625, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.095, | |
| "reward": 0.6569867879152298, | |
| "reward_std": 0.6581598520278931, | |
| "rewards/cosine_scaled_reward": -0.034601859748363495, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 324 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2391.8095703125, | |
| "epoch": 1.3, | |
| "grad_norm": 0.5910518169403076, | |
| "kl": 0.327392578125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.065, | |
| "reward": 0.6689947620034218, | |
| "reward_std": 0.5862837731838226, | |
| "rewards/cosine_scaled_reward": -0.0434788279235363, | |
| "rewards/format_reward": 0.7559524029493332, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2161.34521484375, | |
| "epoch": 1.304, | |
| "grad_norm": 1.3934383392333984, | |
| "kl": 0.2412109375, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0952, | |
| "reward": 0.7927189618349075, | |
| "reward_std": 0.8861154615879059, | |
| "rewards/cosine_scaled_reward": 0.03624042624142021, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 326 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2205.75, | |
| "epoch": 1.308, | |
| "grad_norm": 0.5909004211425781, | |
| "kl": 0.276611328125, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0265, | |
| "reward": 0.7868844717741013, | |
| "reward_std": 0.6631656885147095, | |
| "rewards/cosine_scaled_reward": 0.024394613516051322, | |
| "rewards/format_reward": 0.7380952388048172, | |
| "step": 327 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2526.71435546875, | |
| "epoch": 1.312, | |
| "grad_norm": 0.37658610939979553, | |
| "kl": 0.30908203125, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0593, | |
| "reward": 0.3922804482281208, | |
| "reward_std": 0.7164648473262787, | |
| "rewards/cosine_scaled_reward": -0.11933596897870302, | |
| "rewards/format_reward": 0.6309523731470108, | |
| "step": 328 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2406.6726684570312, | |
| "epoch": 1.316, | |
| "grad_norm": 0.5439748764038086, | |
| "kl": 0.28759765625, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0395, | |
| "reward": 0.457830130122602, | |
| "reward_std": 0.6897861212491989, | |
| "rewards/cosine_scaled_reward": -0.10739446245133877, | |
| "rewards/format_reward": 0.6726190447807312, | |
| "step": 329 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2233.6548461914062, | |
| "epoch": 1.32, | |
| "grad_norm": 1.2243571281433105, | |
| "kl": 0.289306640625, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.1087, | |
| "reward": 0.6516863703727722, | |
| "reward_std": 0.7036527991294861, | |
| "rewards/cosine_scaled_reward": -0.08189492486417294, | |
| "rewards/format_reward": 0.8154762089252472, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2212.7560119628906, | |
| "epoch": 1.324, | |
| "grad_norm": 0.8144615888595581, | |
| "kl": 0.28857421875, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0775, | |
| "reward": 0.5815620422363281, | |
| "reward_std": 0.5177476480603218, | |
| "rewards/cosine_scaled_reward": -0.042552310740575194, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 331 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2412.0059814453125, | |
| "epoch": 1.328, | |
| "grad_norm": 0.42855292558670044, | |
| "kl": 0.3232421875, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0791, | |
| "reward": 0.642042949795723, | |
| "reward_std": 0.6289803832769394, | |
| "rewards/cosine_scaled_reward": -0.04207377042621374, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 332 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2163.9822387695312, | |
| "epoch": 1.332, | |
| "grad_norm": 1.0114275217056274, | |
| "kl": 0.255859375, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.088, | |
| "reward": 0.811268161451153, | |
| "reward_std": 0.6822613030672073, | |
| "rewards/cosine_scaled_reward": 0.042538831010460854, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 333 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2329.0178833007812, | |
| "epoch": 1.336, | |
| "grad_norm": 0.7170870900154114, | |
| "kl": 0.3486328125, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0455, | |
| "reward": 0.8848401606082916, | |
| "reward_std": 0.7328508943319321, | |
| "rewards/cosine_scaled_reward": 0.04361054569017142, | |
| "rewards/format_reward": 0.7976190745830536, | |
| "step": 334 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2709.2262573242188, | |
| "epoch": 1.34, | |
| "grad_norm": 0.47293010354042053, | |
| "kl": 0.38037109375, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0416, | |
| "reward": 0.3898888286203146, | |
| "reward_std": 0.6401937156915665, | |
| "rewards/cosine_scaled_reward": -0.10862701199948788, | |
| "rewards/format_reward": 0.6071428582072258, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2530.416748046875, | |
| "epoch": 1.3439999999999999, | |
| "grad_norm": 0.4423607885837555, | |
| "kl": 0.282958984375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0446, | |
| "reward": 0.6725399196147919, | |
| "reward_std": 0.7871751934289932, | |
| "rewards/cosine_scaled_reward": 0.0059128133580088615, | |
| "rewards/format_reward": 0.6607142984867096, | |
| "step": 336 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2311.5358276367188, | |
| "epoch": 1.3479999999999999, | |
| "grad_norm": 0.5007253885269165, | |
| "kl": 0.3203125, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0455, | |
| "reward": 0.8073793947696686, | |
| "reward_std": 0.7870100140571594, | |
| "rewards/cosine_scaled_reward": 0.010832530329935253, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 337 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2489.7500610351562, | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 0.36444640159606934, | |
| "kl": 0.305908203125, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0652, | |
| "reward": 0.6751855611801147, | |
| "reward_std": 0.6701688021421432, | |
| "rewards/cosine_scaled_reward": 0.001283254474401474, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 338 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2460.8154907226562, | |
| "epoch": 1.3559999999999999, | |
| "grad_norm": 0.43892228603363037, | |
| "kl": 0.3369140625, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0519, | |
| "reward": 0.6638183146715164, | |
| "reward_std": 0.770327016711235, | |
| "rewards/cosine_scaled_reward": 0.010480590397492051, | |
| "rewards/format_reward": 0.6428571566939354, | |
| "step": 339 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2244.3631896972656, | |
| "epoch": 1.3599999999999999, | |
| "grad_norm": 0.6102768778800964, | |
| "kl": 0.31201171875, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0694, | |
| "reward": 0.8422182202339172, | |
| "reward_std": 0.6671302318572998, | |
| "rewards/cosine_scaled_reward": 0.04610910080373287, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2239.75, | |
| "epoch": 1.3639999999999999, | |
| "grad_norm": 0.6582260727882385, | |
| "kl": 0.3271484375, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.076, | |
| "reward": 0.6709855943918228, | |
| "reward_std": 0.7041856721043587, | |
| "rewards/cosine_scaled_reward": -0.03355482150800526, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 341 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2438.3214721679688, | |
| "epoch": 1.3679999999999999, | |
| "grad_norm": 0.5521511435508728, | |
| "kl": 0.320556640625, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.047, | |
| "reward": 0.6942454129457474, | |
| "reward_std": 0.6340186148881912, | |
| "rewards/cosine_scaled_reward": 0.010813180379045662, | |
| "rewards/format_reward": 0.6726190745830536, | |
| "step": 342 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2433.6488647460938, | |
| "epoch": 1.3719999999999999, | |
| "grad_norm": 0.928674042224884, | |
| "kl": 0.40478515625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0328, | |
| "reward": 0.5231252759695053, | |
| "reward_std": 0.7485495656728745, | |
| "rewards/cosine_scaled_reward": -0.11641356535255909, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 343 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2388.2202758789062, | |
| "epoch": 1.376, | |
| "grad_norm": 0.43529966473579407, | |
| "kl": 0.32080078125, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0303, | |
| "reward": 0.7449862584471703, | |
| "reward_std": 0.6971839666366577, | |
| "rewards/cosine_scaled_reward": 0.024278827477246523, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 344 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2686.15478515625, | |
| "epoch": 1.38, | |
| "grad_norm": 0.5191164016723633, | |
| "kl": 0.36328125, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0602, | |
| "reward": 0.4467791821807623, | |
| "reward_std": 0.6689166128635406, | |
| "rewards/cosine_scaled_reward": -0.07125327130779624, | |
| "rewards/format_reward": 0.5892857238650322, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2138.119110107422, | |
| "epoch": 1.384, | |
| "grad_norm": 0.40859875082969666, | |
| "kl": 0.344970703125, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0894, | |
| "reward": 0.7263324186205864, | |
| "reward_std": 0.7082626074552536, | |
| "rewards/cosine_scaled_reward": -0.029690947383642197, | |
| "rewards/format_reward": 0.7857143133878708, | |
| "step": 346 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2158.6429138183594, | |
| "epoch": 1.388, | |
| "grad_norm": 0.35558465123176575, | |
| "kl": 0.29638671875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0262, | |
| "reward": 0.6269577667117119, | |
| "reward_std": 0.5908889323472977, | |
| "rewards/cosine_scaled_reward": -0.04664018237963319, | |
| "rewards/format_reward": 0.7202381119132042, | |
| "step": 347 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2144.619110107422, | |
| "epoch": 1.392, | |
| "grad_norm": 1.211071491241455, | |
| "kl": 0.306640625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": -0.0105, | |
| "reward": 0.6653935462236404, | |
| "reward_std": 0.6245283707976341, | |
| "rewards/cosine_scaled_reward": -0.04230323247611523, | |
| "rewards/format_reward": 0.75, | |
| "step": 348 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2366.4940795898438, | |
| "epoch": 1.396, | |
| "grad_norm": 0.5814414620399475, | |
| "kl": 0.33154296875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0485, | |
| "reward": 0.5602632537484169, | |
| "reward_std": 0.5761818215250969, | |
| "rewards/cosine_scaled_reward": -0.0978445541113615, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 349 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2348.6607666015625, | |
| "epoch": 1.4, | |
| "grad_norm": 0.675369918346405, | |
| "kl": 0.29931640625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0825, | |
| "reward": 0.475093599408865, | |
| "reward_std": 0.604865163564682, | |
| "rewards/cosine_scaled_reward": -0.07792939431965351, | |
| "rewards/format_reward": 0.6309523731470108, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2099.279815673828, | |
| "epoch": 1.404, | |
| "grad_norm": 0.5227596163749695, | |
| "kl": 0.33447265625, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0454, | |
| "reward": 0.6502892896533012, | |
| "reward_std": 0.676431730389595, | |
| "rewards/cosine_scaled_reward": -0.05878393305465579, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 351 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2465.202392578125, | |
| "epoch": 1.408, | |
| "grad_norm": 0.4936739206314087, | |
| "kl": 0.33154296875, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0349, | |
| "reward": 0.51472207903862, | |
| "reward_std": 0.6474315822124481, | |
| "rewards/cosine_scaled_reward": -0.05216278973966837, | |
| "rewards/format_reward": 0.6190476417541504, | |
| "step": 352 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2241.089324951172, | |
| "epoch": 1.412, | |
| "grad_norm": 0.4653976857662201, | |
| "kl": 0.3046875, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0576, | |
| "reward": 0.7246856689453125, | |
| "reward_std": 0.7023278325796127, | |
| "rewards/cosine_scaled_reward": -0.02456192229874432, | |
| "rewards/format_reward": 0.7738095223903656, | |
| "step": 353 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2174.4107666015625, | |
| "epoch": 1.416, | |
| "grad_norm": 1.179158091545105, | |
| "kl": 0.31982421875, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.1321, | |
| "reward": 0.4120100736618042, | |
| "reward_std": 0.5803252756595612, | |
| "rewards/cosine_scaled_reward": -0.19280448742210865, | |
| "rewards/format_reward": 0.7976190596818924, | |
| "step": 354 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2560.9405517578125, | |
| "epoch": 1.42, | |
| "grad_norm": 0.6409890651702881, | |
| "kl": 0.3291015625, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0637, | |
| "reward": 0.6557277590036392, | |
| "reward_std": 0.8805683702230453, | |
| "rewards/cosine_scaled_reward": -0.02332661801483482, | |
| "rewards/format_reward": 0.70238097012043, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1872.1012573242188, | |
| "epoch": 1.424, | |
| "grad_norm": 0.4570577144622803, | |
| "kl": 0.244873046875, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0537, | |
| "reward": 0.8301898017525673, | |
| "reward_std": 0.6987727582454681, | |
| "rewards/cosine_scaled_reward": -0.016452712705358863, | |
| "rewards/format_reward": 0.8630952537059784, | |
| "step": 356 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2199.4880981445312, | |
| "epoch": 1.428, | |
| "grad_norm": 0.5453688502311707, | |
| "kl": 0.3388671875, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.1004, | |
| "reward": 0.5429714322090149, | |
| "reward_std": 0.757801964879036, | |
| "rewards/cosine_scaled_reward": -0.11244285944849253, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 357 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2392.5536193847656, | |
| "epoch": 1.432, | |
| "grad_norm": 0.4179025888442993, | |
| "kl": 0.36767578125, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.057, | |
| "reward": 0.6754717975854874, | |
| "reward_std": 0.8176562935113907, | |
| "rewards/cosine_scaled_reward": -0.004526023752987385, | |
| "rewards/format_reward": 0.6845238357782364, | |
| "step": 358 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2280.5000915527344, | |
| "epoch": 1.436, | |
| "grad_norm": 0.5272053480148315, | |
| "kl": 0.273681640625, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0565, | |
| "reward": 0.644446611404419, | |
| "reward_std": 0.7567472010850906, | |
| "rewards/cosine_scaled_reward": -0.02896718680858612, | |
| "rewards/format_reward": 0.70238097012043, | |
| "step": 359 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2270.3928833007812, | |
| "epoch": 1.44, | |
| "grad_norm": 0.8152810335159302, | |
| "kl": 0.34619140625, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0306, | |
| "reward": 0.7786325067281723, | |
| "reward_std": 0.559767447412014, | |
| "rewards/cosine_scaled_reward": -0.012469482608139515, | |
| "rewards/format_reward": 0.8035714477300644, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2094.2083740234375, | |
| "epoch": 1.444, | |
| "grad_norm": 0.9731494188308716, | |
| "kl": 0.33203125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0315, | |
| "reward": 0.7239094823598862, | |
| "reward_std": 0.6780030280351639, | |
| "rewards/cosine_scaled_reward": -0.057688117027282715, | |
| "rewards/format_reward": 0.839285746216774, | |
| "step": 361 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2515.3809814453125, | |
| "epoch": 1.448, | |
| "grad_norm": 0.5006127953529358, | |
| "kl": 0.3583984375, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0632, | |
| "reward": 0.5585716450586915, | |
| "reward_std": 0.6955743506550789, | |
| "rewards/cosine_scaled_reward": -0.08976180851459503, | |
| "rewards/format_reward": 0.7380952462553978, | |
| "step": 362 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2665.2560424804688, | |
| "epoch": 1.452, | |
| "grad_norm": 0.4868517220020294, | |
| "kl": 0.373046875, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0555, | |
| "reward": 0.5607914663851261, | |
| "reward_std": 0.6483574956655502, | |
| "rewards/cosine_scaled_reward": -0.07377092959359288, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 363 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2244.7262573242188, | |
| "epoch": 1.456, | |
| "grad_norm": 0.6844132542610168, | |
| "kl": 0.3173828125, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0109, | |
| "reward": 0.7073798812925816, | |
| "reward_std": 0.6621369272470474, | |
| "rewards/cosine_scaled_reward": -0.01833386719226837, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 364 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2576.1905517578125, | |
| "epoch": 1.46, | |
| "grad_norm": 0.5755227208137512, | |
| "kl": 0.35400390625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0531, | |
| "reward": 0.6706622801721096, | |
| "reward_std": 0.8000525310635567, | |
| "rewards/cosine_scaled_reward": -0.03371649980545044, | |
| "rewards/format_reward": 0.7380952537059784, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2664.3928833007812, | |
| "epoch": 1.464, | |
| "grad_norm": 0.6695978045463562, | |
| "kl": 0.4052734375, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.1001, | |
| "reward": 0.6332942470908165, | |
| "reward_std": 0.9363250732421875, | |
| "rewards/cosine_scaled_reward": -0.04049574676901102, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 366 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2493.2857666015625, | |
| "epoch": 1.468, | |
| "grad_norm": 0.41825661063194275, | |
| "kl": 0.269775390625, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0634, | |
| "reward": 0.6000736728310585, | |
| "reward_std": 0.6958686709403992, | |
| "rewards/cosine_scaled_reward": -0.04520127363502979, | |
| "rewards/format_reward": 0.690476194024086, | |
| "step": 367 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2441.184600830078, | |
| "epoch": 1.472, | |
| "grad_norm": 0.6742368936538696, | |
| "kl": 0.29248046875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0205, | |
| "reward": 0.7077510952949524, | |
| "reward_std": 0.8173489719629288, | |
| "rewards/cosine_scaled_reward": -0.003267320804297924, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 368 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2645.202392578125, | |
| "epoch": 1.476, | |
| "grad_norm": 0.6914957761764526, | |
| "kl": 0.298095703125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0967, | |
| "reward": 0.2303389220032841, | |
| "reward_std": 0.6355866640806198, | |
| "rewards/cosine_scaled_reward": -0.1616162583231926, | |
| "rewards/format_reward": 0.5535714328289032, | |
| "step": 369 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2256.9286193847656, | |
| "epoch": 1.48, | |
| "grad_norm": 0.9714637994766235, | |
| "kl": 0.255126953125, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0866, | |
| "reward": 0.7436040937900543, | |
| "reward_std": 0.6377575844526291, | |
| "rewards/cosine_scaled_reward": -0.012126525864005089, | |
| "rewards/format_reward": 0.767857164144516, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2519.7202758789062, | |
| "epoch": 1.484, | |
| "grad_norm": 0.6541756987571716, | |
| "kl": 0.32470703125, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0731, | |
| "reward": 0.8480066582560539, | |
| "reward_std": 0.7711106240749359, | |
| "rewards/cosine_scaled_reward": 0.031146179419010878, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 371 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2420.970245361328, | |
| "epoch": 1.488, | |
| "grad_norm": 0.5346278548240662, | |
| "kl": 0.2998046875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0556, | |
| "reward": 0.6287192776799202, | |
| "reward_std": 0.6931318640708923, | |
| "rewards/cosine_scaled_reward": -0.03683085576631129, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 372 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2542.5654907226562, | |
| "epoch": 1.492, | |
| "grad_norm": 0.43199771642684937, | |
| "kl": 0.33544921875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0657, | |
| "reward": 0.4730634540319443, | |
| "reward_std": 0.5836888402700424, | |
| "rewards/cosine_scaled_reward": -0.1533492412418127, | |
| "rewards/format_reward": 0.7797619104385376, | |
| "step": 373 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2848.3572387695312, | |
| "epoch": 1.496, | |
| "grad_norm": 0.6630088686943054, | |
| "kl": 0.35009765625, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0207, | |
| "reward": 0.2956889607012272, | |
| "reward_std": 0.614417277276516, | |
| "rewards/cosine_scaled_reward": -0.1319174226373434, | |
| "rewards/format_reward": 0.5595238283276558, | |
| "step": 374 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2754.0060424804688, | |
| "epoch": 1.5, | |
| "grad_norm": 0.4504316449165344, | |
| "kl": 0.302490234375, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0225, | |
| "reward": 0.42709287256002426, | |
| "reward_std": 0.6112170070409775, | |
| "rewards/cosine_scaled_reward": -0.07514405064284801, | |
| "rewards/format_reward": 0.5773809552192688, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2403.52978515625, | |
| "epoch": 1.504, | |
| "grad_norm": 0.4335888624191284, | |
| "kl": 0.266845703125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0347, | |
| "reward": 0.37878482323139906, | |
| "reward_std": 0.5512942001223564, | |
| "rewards/cosine_scaled_reward": -0.14691711403429508, | |
| "rewards/format_reward": 0.6726190745830536, | |
| "step": 376 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2539.71435546875, | |
| "epoch": 1.508, | |
| "grad_norm": 0.6600142121315002, | |
| "kl": 0.35546875, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0879, | |
| "reward": 0.6289402991533279, | |
| "reward_std": 0.7740087658166885, | |
| "rewards/cosine_scaled_reward": -0.04564890172332525, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 377 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2566.4107666015625, | |
| "epoch": 1.512, | |
| "grad_norm": 0.5574218034744263, | |
| "kl": 0.310791015625, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0794, | |
| "reward": 0.5734596885740757, | |
| "reward_std": 0.6776000708341599, | |
| "rewards/cosine_scaled_reward": -0.058508249232545495, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 378 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2749.4642944335938, | |
| "epoch": 1.516, | |
| "grad_norm": 0.4314301908016205, | |
| "kl": 0.330078125, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0655, | |
| "reward": 0.42858002707362175, | |
| "reward_std": 0.7303398549556732, | |
| "rewards/cosine_scaled_reward": -0.10416238568723202, | |
| "rewards/format_reward": 0.6369047611951828, | |
| "step": 379 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2403.684539794922, | |
| "epoch": 1.52, | |
| "grad_norm": 0.42673397064208984, | |
| "kl": 0.299560546875, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0799, | |
| "reward": 0.7241241782903671, | |
| "reward_std": 0.7478837221860886, | |
| "rewards/cosine_scaled_reward": -0.006985542830079794, | |
| "rewards/format_reward": 0.7380952388048172, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2391.5179443359375, | |
| "epoch": 1.524, | |
| "grad_norm": 0.8130372762680054, | |
| "kl": 0.3203125, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0286, | |
| "reward": 0.49668359011411667, | |
| "reward_std": 0.6931805461645126, | |
| "rewards/cosine_scaled_reward": -0.12665820121765137, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 381 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2430.5654907226562, | |
| "epoch": 1.528, | |
| "grad_norm": 0.4374740719795227, | |
| "kl": 0.310302734375, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0341, | |
| "reward": 0.6685771271586418, | |
| "reward_std": 0.8352404981851578, | |
| "rewards/cosine_scaled_reward": -0.016901913098990917, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 382 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2296.1607666015625, | |
| "epoch": 1.532, | |
| "grad_norm": 0.5494891405105591, | |
| "kl": 0.30224609375, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0859, | |
| "reward": 0.7063075229525566, | |
| "reward_std": 0.7431895136833191, | |
| "rewards/cosine_scaled_reward": -0.009941489901393652, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 383 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2517.827392578125, | |
| "epoch": 1.536, | |
| "grad_norm": 0.5410645604133606, | |
| "kl": 0.33203125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0459, | |
| "reward": 0.42114658281207085, | |
| "reward_std": 0.6721706539392471, | |
| "rewards/cosine_scaled_reward": -0.12573623820208013, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 384 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2484.96435546875, | |
| "epoch": 1.54, | |
| "grad_norm": 0.4815540313720703, | |
| "kl": 0.3095703125, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0759, | |
| "reward": 0.7441006675362587, | |
| "reward_std": 0.8601991981267929, | |
| "rewards/cosine_scaled_reward": 0.029788417392410338, | |
| "rewards/format_reward": 0.6845238208770752, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2389.9703063964844, | |
| "epoch": 1.544, | |
| "grad_norm": 0.6783538460731506, | |
| "kl": 0.2802734375, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0716, | |
| "reward": 0.479885321110487, | |
| "reward_std": 0.7240753322839737, | |
| "rewards/cosine_scaled_reward": -0.06958115100860596, | |
| "rewards/format_reward": 0.6190476417541504, | |
| "step": 386 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.964324951172, | |
| "epoch": 1.548, | |
| "grad_norm": 0.8481286764144897, | |
| "kl": 0.296630859375, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0148, | |
| "reward": 0.5802747337147593, | |
| "reward_std": 0.5601852983236313, | |
| "rewards/cosine_scaled_reward": -0.04617217415943742, | |
| "rewards/format_reward": 0.672619067132473, | |
| "step": 387 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2159.5655212402344, | |
| "epoch": 1.552, | |
| "grad_norm": 0.5140231251716614, | |
| "kl": 0.302001953125, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.063, | |
| "reward": 0.5727366209030151, | |
| "reward_std": 0.6229267343878746, | |
| "rewards/cosine_scaled_reward": -0.11244121752679348, | |
| "rewards/format_reward": 0.7976190745830536, | |
| "step": 388 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2253.7559814453125, | |
| "epoch": 1.556, | |
| "grad_norm": 0.4566425681114197, | |
| "kl": 0.292724609375, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0398, | |
| "reward": 0.6296885460615158, | |
| "reward_std": 0.7193648666143417, | |
| "rewards/cosine_scaled_reward": -0.04825095273554325, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 389 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2653.6250610351562, | |
| "epoch": 1.56, | |
| "grad_norm": 0.6326945424079895, | |
| "kl": 0.38818359375, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.1043, | |
| "reward": 0.531873881816864, | |
| "reward_std": 0.7026529461145401, | |
| "rewards/cosine_scaled_reward": -0.0822773426771164, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2613.7261962890625, | |
| "epoch": 1.564, | |
| "grad_norm": 0.398603618144989, | |
| "kl": 0.305908203125, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0464, | |
| "reward": 0.2553718090057373, | |
| "reward_std": 0.6311058104038239, | |
| "rewards/cosine_scaled_reward": -0.1699331346899271, | |
| "rewards/format_reward": 0.595238097012043, | |
| "step": 391 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2196.5833435058594, | |
| "epoch": 1.568, | |
| "grad_norm": 1.320838451385498, | |
| "kl": 0.2607421875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.1165, | |
| "reward": 0.6960010007023811, | |
| "reward_std": 0.8236257880926132, | |
| "rewards/cosine_scaled_reward": -0.044856662629172206, | |
| "rewards/format_reward": 0.7857143133878708, | |
| "step": 392 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2663.9285888671875, | |
| "epoch": 1.572, | |
| "grad_norm": 0.5183250904083252, | |
| "kl": 0.328857421875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0708, | |
| "reward": 0.34957781434059143, | |
| "reward_std": 0.6104390919208527, | |
| "rewards/cosine_scaled_reward": -0.12878252286463976, | |
| "rewards/format_reward": 0.6071428805589676, | |
| "step": 393 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2630.2560424804688, | |
| "epoch": 1.576, | |
| "grad_norm": 0.2792785167694092, | |
| "kl": 0.34619140625, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0772, | |
| "reward": 0.45473287999629974, | |
| "reward_std": 0.7525355666875839, | |
| "rewards/cosine_scaled_reward": -0.0940621355548501, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 394 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2484.2679138183594, | |
| "epoch": 1.58, | |
| "grad_norm": 0.47966378927230835, | |
| "kl": 0.3330078125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0783, | |
| "reward": 0.5024484526365995, | |
| "reward_std": 0.6865183711051941, | |
| "rewards/cosine_scaled_reward": -0.07615673809777945, | |
| "rewards/format_reward": 0.6547619104385376, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2481.96435546875, | |
| "epoch": 1.584, | |
| "grad_norm": 0.4925695061683655, | |
| "kl": 0.34228515625, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.1001, | |
| "reward": 0.6872217282652855, | |
| "reward_std": 0.7295544147491455, | |
| "rewards/cosine_scaled_reward": -0.037341527407988906, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 396 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2154.494110107422, | |
| "epoch": 1.588, | |
| "grad_norm": 0.635874330997467, | |
| "kl": 0.281494140625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0341, | |
| "reward": 0.6967096533626318, | |
| "reward_std": 0.7243114337325096, | |
| "rewards/cosine_scaled_reward": -0.04450232535600662, | |
| "rewards/format_reward": 0.7857143059372902, | |
| "step": 397 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2704.7322387695312, | |
| "epoch": 1.592, | |
| "grad_norm": 0.36841636896133423, | |
| "kl": 0.390625, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0683, | |
| "reward": 0.35482142120599747, | |
| "reward_std": 0.6981697529554367, | |
| "rewards/cosine_scaled_reward": -0.1410416765138507, | |
| "rewards/format_reward": 0.6369047611951828, | |
| "step": 398 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2155.1905212402344, | |
| "epoch": 1.596, | |
| "grad_norm": 0.6153831481933594, | |
| "kl": 0.2763671875, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.075, | |
| "reward": 0.7129835858941078, | |
| "reward_std": 0.7049887701869011, | |
| "rewards/cosine_scaled_reward": -0.04827013239264488, | |
| "rewards/format_reward": 0.8095238208770752, | |
| "step": 399 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2736.9286499023438, | |
| "epoch": 1.6, | |
| "grad_norm": 0.4656315743923187, | |
| "kl": 0.38330078125, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0486, | |
| "reward": 0.3270074762403965, | |
| "reward_std": 0.6684512719511986, | |
| "rewards/cosine_scaled_reward": -0.17578197922557592, | |
| "rewards/format_reward": 0.6785714477300644, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2684.994140625, | |
| "epoch": 1.604, | |
| "grad_norm": 0.4505021274089813, | |
| "kl": 0.34423828125, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0772, | |
| "reward": 0.5096228048205376, | |
| "reward_std": 0.7098966240882874, | |
| "rewards/cosine_scaled_reward": -0.0814981039147824, | |
| "rewards/format_reward": 0.6726190522313118, | |
| "step": 401 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2508.327423095703, | |
| "epoch": 1.608, | |
| "grad_norm": 0.3696132302284241, | |
| "kl": 0.34033203125, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0554, | |
| "reward": 0.6811544820666313, | |
| "reward_std": 0.7352585643529892, | |
| "rewards/cosine_scaled_reward": -0.03442276082932949, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 402 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2329.654815673828, | |
| "epoch": 1.612, | |
| "grad_norm": 0.5500597953796387, | |
| "kl": 0.310302734375, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0912, | |
| "reward": 0.5886539276689291, | |
| "reward_std": 0.6953590214252472, | |
| "rewards/cosine_scaled_reward": -0.05091113201342523, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 403 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2279.5238342285156, | |
| "epoch": 1.616, | |
| "grad_norm": 0.825627326965332, | |
| "kl": 0.314208984375, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0303, | |
| "reward": 0.4903724156320095, | |
| "reward_std": 0.6759866625070572, | |
| "rewards/cosine_scaled_reward": -0.1268376000225544, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 404 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2298.607208251953, | |
| "epoch": 1.62, | |
| "grad_norm": 0.7521853446960449, | |
| "kl": 0.300048828125, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0742, | |
| "reward": 0.6187992710620165, | |
| "reward_std": 0.6595779061317444, | |
| "rewards/cosine_scaled_reward": -0.050719428109005094, | |
| "rewards/format_reward": 0.7202381044626236, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2462.0416870117188, | |
| "epoch": 1.624, | |
| "grad_norm": 0.4565219581127167, | |
| "kl": 0.36181640625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0655, | |
| "reward": 0.7676936537027359, | |
| "reward_std": 0.8463387489318848, | |
| "rewards/cosine_scaled_reward": 0.032656354829669, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 406 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2593.9285888671875, | |
| "epoch": 1.6280000000000001, | |
| "grad_norm": 0.7805240154266357, | |
| "kl": 0.37109375, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0336, | |
| "reward": 0.4898635447025299, | |
| "reward_std": 0.6100385710597038, | |
| "rewards/cosine_scaled_reward": -0.0943539384752512, | |
| "rewards/format_reward": 0.6785714477300644, | |
| "step": 407 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2179.8810119628906, | |
| "epoch": 1.6320000000000001, | |
| "grad_norm": 0.7494162321090698, | |
| "kl": 0.3154296875, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0819, | |
| "reward": 0.5508405864238739, | |
| "reward_std": 0.6755202859640121, | |
| "rewards/cosine_scaled_reward": -0.12934163073077798, | |
| "rewards/format_reward": 0.8095238357782364, | |
| "step": 408 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2213.1131591796875, | |
| "epoch": 1.6360000000000001, | |
| "grad_norm": 0.7274454832077026, | |
| "kl": 0.3466796875, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0553, | |
| "reward": 0.6235681027173996, | |
| "reward_std": 0.6233258098363876, | |
| "rewards/cosine_scaled_reward": -0.10488261096179485, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 409 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2276.5774536132812, | |
| "epoch": 1.6400000000000001, | |
| "grad_norm": 1.2181180715560913, | |
| "kl": 0.357421875, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.1457, | |
| "reward": 0.6739452332258224, | |
| "reward_std": 0.7620265781879425, | |
| "rewards/cosine_scaled_reward": -0.0350511996075511, | |
| "rewards/format_reward": 0.7440476268529892, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2513.0833740234375, | |
| "epoch": 1.6440000000000001, | |
| "grad_norm": 0.3816966414451599, | |
| "kl": 0.37060546875, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0631, | |
| "reward": 0.503595694899559, | |
| "reward_std": 0.6215758174657822, | |
| "rewards/cosine_scaled_reward": -0.08748787135118619, | |
| "rewards/format_reward": 0.6785714328289032, | |
| "step": 411 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2348.7916870117188, | |
| "epoch": 1.6480000000000001, | |
| "grad_norm": 0.647936224937439, | |
| "kl": 0.3623046875, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.1147, | |
| "reward": 0.7675136551260948, | |
| "reward_std": 0.7814928591251373, | |
| "rewards/cosine_scaled_reward": 0.020661589689552784, | |
| "rewards/format_reward": 0.7261904925107956, | |
| "step": 412 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2329.9464721679688, | |
| "epoch": 1.6520000000000001, | |
| "grad_norm": 0.7966573238372803, | |
| "kl": 0.3505859375, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0507, | |
| "reward": 0.7220299392938614, | |
| "reward_std": 0.7220810800790787, | |
| "rewards/cosine_scaled_reward": -0.01993740734178573, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 413 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2510.83935546875, | |
| "epoch": 1.6560000000000001, | |
| "grad_norm": 0.3402910828590393, | |
| "kl": 0.36767578125, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0717, | |
| "reward": 0.6566540375351906, | |
| "reward_std": 0.7324352562427521, | |
| "rewards/cosine_scaled_reward": -0.016911087092012167, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 414 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2362.952423095703, | |
| "epoch": 1.6600000000000001, | |
| "grad_norm": 0.5068221688270569, | |
| "kl": 0.41357421875, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0817, | |
| "reward": 0.5704538598656654, | |
| "reward_std": 0.83931764960289, | |
| "rewards/cosine_scaled_reward": -0.08977308124303818, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2197.886962890625, | |
| "epoch": 1.6640000000000001, | |
| "grad_norm": 0.5192682147026062, | |
| "kl": 0.349609375, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0456, | |
| "reward": 0.7760029062628746, | |
| "reward_std": 0.7372387051582336, | |
| "rewards/cosine_scaled_reward": -0.0048556849360466, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 416 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2365.0655517578125, | |
| "epoch": 1.6680000000000001, | |
| "grad_norm": 0.7471702098846436, | |
| "kl": 0.357421875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0258, | |
| "reward": 0.6658978462219238, | |
| "reward_std": 0.7144315093755722, | |
| "rewards/cosine_scaled_reward": -0.02419395267497748, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 417 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2210.7500610351562, | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 0.4305538833141327, | |
| "kl": 0.37060546875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0715, | |
| "reward": 0.6991885676980019, | |
| "reward_std": 0.6301053613424301, | |
| "rewards/cosine_scaled_reward": -0.03433429542928934, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 418 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2631.9048461914062, | |
| "epoch": 1.6760000000000002, | |
| "grad_norm": 0.31225350499153137, | |
| "kl": 0.35888671875, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0697, | |
| "reward": 0.6943976636976004, | |
| "reward_std": 0.7306639850139618, | |
| "rewards/cosine_scaled_reward": 0.02874644659459591, | |
| "rewards/format_reward": 0.636904776096344, | |
| "step": 419 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2425.6607055664062, | |
| "epoch": 1.6800000000000002, | |
| "grad_norm": 0.6987324953079224, | |
| "kl": 0.38330078125, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0585, | |
| "reward": 0.7563354596495628, | |
| "reward_std": 0.7114580571651459, | |
| "rewards/cosine_scaled_reward": -0.038498950423672795, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2277.4464721679688, | |
| "epoch": 1.6840000000000002, | |
| "grad_norm": 0.34894487261772156, | |
| "kl": 0.373046875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0935, | |
| "reward": 0.6365625336766243, | |
| "reward_std": 0.7153737097978592, | |
| "rewards/cosine_scaled_reward": -0.05969492206349969, | |
| "rewards/format_reward": 0.7559524029493332, | |
| "step": 421 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2504.916748046875, | |
| "epoch": 1.688, | |
| "grad_norm": 0.6229146718978882, | |
| "kl": 0.35498046875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0181, | |
| "reward": 0.5784893482923508, | |
| "reward_std": 0.6405449658632278, | |
| "rewards/cosine_scaled_reward": -0.06194583047181368, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 422 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2308.52978515625, | |
| "epoch": 1.692, | |
| "grad_norm": 0.4013311266899109, | |
| "kl": 0.328125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0574, | |
| "reward": 0.6359338611364365, | |
| "reward_std": 0.7808969020843506, | |
| "rewards/cosine_scaled_reward": -0.03917593788355589, | |
| "rewards/format_reward": 0.714285746216774, | |
| "step": 423 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2253.6785888671875, | |
| "epoch": 1.696, | |
| "grad_norm": 0.7038490176200867, | |
| "kl": 0.288330078125, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0231, | |
| "reward": 0.6301399618387222, | |
| "reward_std": 0.7140125781297684, | |
| "rewards/cosine_scaled_reward": -0.07481098547577858, | |
| "rewards/format_reward": 0.7797619104385376, | |
| "step": 424 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2256.464385986328, | |
| "epoch": 1.7, | |
| "grad_norm": 0.4849310517311096, | |
| "kl": 0.34228515625, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0754, | |
| "reward": 0.6902762800455093, | |
| "reward_std": 0.7732263505458832, | |
| "rewards/cosine_scaled_reward": -0.029861881979741156, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2638.3750610351562, | |
| "epoch": 1.704, | |
| "grad_norm": 0.4661174416542053, | |
| "kl": 0.34716796875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0631, | |
| "reward": 0.4628839958459139, | |
| "reward_std": 0.6522120535373688, | |
| "rewards/cosine_scaled_reward": -0.06915326602756977, | |
| "rewards/format_reward": 0.601190485060215, | |
| "step": 426 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2258.279815673828, | |
| "epoch": 1.708, | |
| "grad_norm": 0.5582512021064758, | |
| "kl": 0.325439453125, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0416, | |
| "reward": 0.6841344758868217, | |
| "reward_std": 0.716413825750351, | |
| "rewards/cosine_scaled_reward": -0.04483753815293312, | |
| "rewards/format_reward": 0.7738095298409462, | |
| "step": 427 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2152.732208251953, | |
| "epoch": 1.712, | |
| "grad_norm": 0.362613320350647, | |
| "kl": 0.35546875, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0663, | |
| "reward": 0.8665853589773178, | |
| "reward_std": 0.746289573609829, | |
| "rewards/cosine_scaled_reward": 0.013649825006723404, | |
| "rewards/format_reward": 0.8392857164144516, | |
| "step": 428 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2589.8690795898438, | |
| "epoch": 1.716, | |
| "grad_norm": 0.392411470413208, | |
| "kl": 0.35693359375, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0781, | |
| "reward": 0.6071142517030239, | |
| "reward_std": 0.7519797533750534, | |
| "rewards/cosine_scaled_reward": -0.044657152146101, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 429 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2220.1012573242188, | |
| "epoch": 1.72, | |
| "grad_norm": 0.5445396900177002, | |
| "kl": 0.341796875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0534, | |
| "reward": 0.701055221259594, | |
| "reward_std": 0.6740739792585373, | |
| "rewards/cosine_scaled_reward": -0.04232952371239662, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1902.5595397949219, | |
| "epoch": 1.724, | |
| "grad_norm": 0.3450181186199188, | |
| "kl": 0.2607421875, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0472, | |
| "reward": 1.0963951796293259, | |
| "reward_std": 0.746511772274971, | |
| "rewards/cosine_scaled_reward": 0.11962614580988884, | |
| "rewards/format_reward": 0.8571428805589676, | |
| "step": 431 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2478.732177734375, | |
| "epoch": 1.728, | |
| "grad_norm": 0.48547589778900146, | |
| "kl": 0.373046875, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0409, | |
| "reward": 0.5677317231893539, | |
| "reward_std": 0.6051659360527992, | |
| "rewards/cosine_scaled_reward": -0.09113414993043989, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 432 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2518.7560424804688, | |
| "epoch": 1.732, | |
| "grad_norm": 0.9139849543571472, | |
| "kl": 0.3701171875, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.01, | |
| "reward": 0.6564267948269844, | |
| "reward_std": 0.6321954727172852, | |
| "rewards/cosine_scaled_reward": -0.04083424177952111, | |
| "rewards/format_reward": 0.7380952388048172, | |
| "step": 433 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2439.119110107422, | |
| "epoch": 1.736, | |
| "grad_norm": 0.4629580080509186, | |
| "kl": 0.3349609375, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0828, | |
| "reward": 0.6547855883836746, | |
| "reward_std": 0.6274382770061493, | |
| "rewards/cosine_scaled_reward": -0.05653578881174326, | |
| "rewards/format_reward": 0.767857164144516, | |
| "step": 434 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2385.047637939453, | |
| "epoch": 1.74, | |
| "grad_norm": 0.4398196041584015, | |
| "kl": 0.342529296875, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.034, | |
| "reward": 0.59782674908638, | |
| "reward_std": 0.7165493220090866, | |
| "rewards/cosine_scaled_reward": -0.09394377004355192, | |
| "rewards/format_reward": 0.7857142984867096, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2235.4405212402344, | |
| "epoch": 1.744, | |
| "grad_norm": 0.5894522070884705, | |
| "kl": 0.321044921875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0578, | |
| "reward": 0.7302387952804565, | |
| "reward_std": 0.7528126537799835, | |
| "rewards/cosine_scaled_reward": -0.018809196539223194, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 436 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2364.2083740234375, | |
| "epoch": 1.748, | |
| "grad_norm": 0.517291784286499, | |
| "kl": 0.262451171875, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0626, | |
| "reward": 0.5654323399066925, | |
| "reward_std": 0.6579017788171768, | |
| "rewards/cosine_scaled_reward": -0.03276003524661064, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 437 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2464.0238647460938, | |
| "epoch": 1.752, | |
| "grad_norm": 0.719810426235199, | |
| "kl": 0.34619140625, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0289, | |
| "reward": 0.5253645405173302, | |
| "reward_std": 0.593732014298439, | |
| "rewards/cosine_scaled_reward": -0.10934155760332942, | |
| "rewards/format_reward": 0.7440476268529892, | |
| "step": 438 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2688.2559814453125, | |
| "epoch": 1.756, | |
| "grad_norm": 0.47081246972084045, | |
| "kl": 0.320068359375, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0737, | |
| "reward": 0.5551765933632851, | |
| "reward_std": 0.750535324215889, | |
| "rewards/cosine_scaled_reward": -0.043840276543051004, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 439 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2691.5298461914062, | |
| "epoch": 1.76, | |
| "grad_norm": 0.3561669588088989, | |
| "kl": 0.30517578125, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0412, | |
| "reward": 0.6305891573429108, | |
| "reward_std": 0.7718498408794403, | |
| "rewards/cosine_scaled_reward": -0.02399112842977047, | |
| "rewards/format_reward": 0.6785714477300644, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2423.3274536132812, | |
| "epoch": 1.764, | |
| "grad_norm": 0.8448560237884521, | |
| "kl": 0.34130859375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0974, | |
| "reward": 0.6258707121014595, | |
| "reward_std": 0.7022215574979782, | |
| "rewards/cosine_scaled_reward": -0.06504084914922714, | |
| "rewards/format_reward": 0.7559524029493332, | |
| "step": 441 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2509.6845703125, | |
| "epoch": 1.768, | |
| "grad_norm": 0.49845507740974426, | |
| "kl": 0.3271484375, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0368, | |
| "reward": 0.6952026858925819, | |
| "reward_std": 0.7732700109481812, | |
| "rewards/cosine_scaled_reward": -0.03037486458197236, | |
| "rewards/format_reward": 0.7559524029493332, | |
| "step": 442 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2251.6370239257812, | |
| "epoch": 1.772, | |
| "grad_norm": 1.6570687294006348, | |
| "kl": 0.32861328125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.1251, | |
| "reward": 0.5299716778099537, | |
| "reward_std": 0.6262076199054718, | |
| "rewards/cosine_scaled_reward": -0.10108558752108365, | |
| "rewards/format_reward": 0.7321428507566452, | |
| "step": 443 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2569.8392944335938, | |
| "epoch": 1.776, | |
| "grad_norm": 0.5071792602539062, | |
| "kl": 0.285400390625, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0461, | |
| "reward": 0.3558937795460224, | |
| "reward_std": 0.6386721879243851, | |
| "rewards/cosine_scaled_reward": -0.12860072287730873, | |
| "rewards/format_reward": 0.6130952537059784, | |
| "step": 444 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2434.52978515625, | |
| "epoch": 1.78, | |
| "grad_norm": 0.6472364068031311, | |
| "kl": 0.33935546875, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0324, | |
| "reward": 0.5342502817511559, | |
| "reward_std": 0.7027776390314102, | |
| "rewards/cosine_scaled_reward": -0.08108916692435741, | |
| "rewards/format_reward": 0.696428582072258, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2600.7678833007812, | |
| "epoch": 1.784, | |
| "grad_norm": 0.4613596796989441, | |
| "kl": 0.36767578125, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0366, | |
| "reward": 0.3209609054028988, | |
| "reward_std": 0.6079899072647095, | |
| "rewards/cosine_scaled_reward": -0.15499573945999146, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 446 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2406.6131286621094, | |
| "epoch": 1.788, | |
| "grad_norm": 0.5609318017959595, | |
| "kl": 0.3349609375, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.1069, | |
| "reward": 0.8279012702405453, | |
| "reward_std": 0.6648521721363068, | |
| "rewards/cosine_scaled_reward": 0.035974426195025444, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 447 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2292.5238647460938, | |
| "epoch": 1.792, | |
| "grad_norm": 0.6498924493789673, | |
| "kl": 0.3251953125, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.1002, | |
| "reward": 0.9468577206134796, | |
| "reward_std": 0.8284895867109299, | |
| "rewards/cosine_scaled_reward": 0.1073574130423367, | |
| "rewards/format_reward": 0.7321428656578064, | |
| "step": 448 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2359.7203063964844, | |
| "epoch": 1.796, | |
| "grad_norm": 0.8151586651802063, | |
| "kl": 0.29248046875, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.1025, | |
| "reward": 0.6154885776340961, | |
| "reward_std": 0.643234595656395, | |
| "rewards/cosine_scaled_reward": -0.028565243119373918, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 449 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2456.5952758789062, | |
| "epoch": 1.8, | |
| "grad_norm": 0.6727307438850403, | |
| "kl": 0.3369140625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0518, | |
| "reward": 0.5276128388941288, | |
| "reward_std": 0.6850098147988319, | |
| "rewards/cosine_scaled_reward": -0.05166977294720709, | |
| "rewards/format_reward": 0.6309523805975914, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2180.250030517578, | |
| "epoch": 1.804, | |
| "grad_norm": 0.4327715039253235, | |
| "kl": 0.2841796875, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0396, | |
| "reward": 0.9219238460063934, | |
| "reward_std": 0.8085188716650009, | |
| "rewards/cosine_scaled_reward": 0.029414291959255934, | |
| "rewards/format_reward": 0.8630952537059784, | |
| "step": 451 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2414.9584045410156, | |
| "epoch": 1.808, | |
| "grad_norm": 0.5890040993690491, | |
| "kl": 0.33642578125, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0116, | |
| "reward": 0.45377534069120884, | |
| "reward_std": 0.6864534169435501, | |
| "rewards/cosine_scaled_reward": -0.07668375968933105, | |
| "rewards/format_reward": 0.607142873108387, | |
| "step": 452 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2325.2619018554688, | |
| "epoch": 1.812, | |
| "grad_norm": 0.8580995798110962, | |
| "kl": 0.38330078125, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.1211, | |
| "reward": 0.41875267028808594, | |
| "reward_std": 0.5978472009301186, | |
| "rewards/cosine_scaled_reward": -0.17455224692821503, | |
| "rewards/format_reward": 0.7678571492433548, | |
| "step": 453 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2524.8452758789062, | |
| "epoch": 1.8159999999999998, | |
| "grad_norm": 0.6263750195503235, | |
| "kl": 0.369140625, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0599, | |
| "reward": 0.5164637118577957, | |
| "reward_std": 0.7428598999977112, | |
| "rewards/cosine_scaled_reward": -0.0989109962247312, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 454 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2222.684600830078, | |
| "epoch": 1.8199999999999998, | |
| "grad_norm": 0.46227335929870605, | |
| "kl": 0.257568359375, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0452, | |
| "reward": 0.6773854792118073, | |
| "reward_std": 0.6582589149475098, | |
| "rewards/cosine_scaled_reward": -0.03333106730133295, | |
| "rewards/format_reward": 0.744047611951828, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2381.2916870117188, | |
| "epoch": 1.8239999999999998, | |
| "grad_norm": 1.395836591720581, | |
| "kl": 0.346923828125, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": -0.0036, | |
| "reward": 0.6790124624967575, | |
| "reward_std": 0.7677509784698486, | |
| "rewards/cosine_scaled_reward": -0.0354937631636858, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 456 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2236.1369018554688, | |
| "epoch": 1.8279999999999998, | |
| "grad_norm": 0.363459974527359, | |
| "kl": 0.359375, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0897, | |
| "reward": 0.8419362753629684, | |
| "reward_std": 0.8306869268417358, | |
| "rewards/cosine_scaled_reward": 0.034063366474583745, | |
| "rewards/format_reward": 0.7738095223903656, | |
| "step": 457 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2312.5059814453125, | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 0.5052822232246399, | |
| "kl": 0.35791015625, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0619, | |
| "reward": 0.744497187435627, | |
| "reward_std": 0.6325250118970871, | |
| "rewards/cosine_scaled_reward": -0.02953713061287999, | |
| "rewards/format_reward": 0.8035714477300644, | |
| "step": 458 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2242.0357971191406, | |
| "epoch": 1.8359999999999999, | |
| "grad_norm": 0.4124799966812134, | |
| "kl": 0.30615234375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0801, | |
| "reward": 0.7117128595709801, | |
| "reward_std": 0.7263730615377426, | |
| "rewards/cosine_scaled_reward": -0.031048328906763345, | |
| "rewards/format_reward": 0.773809552192688, | |
| "step": 459 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2795.3512573242188, | |
| "epoch": 1.8399999999999999, | |
| "grad_norm": 0.5879099369049072, | |
| "kl": 0.341796875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0299, | |
| "reward": 0.41843298077583313, | |
| "reward_std": 0.6329772919416428, | |
| "rewards/cosine_scaled_reward": -0.11518826894462109, | |
| "rewards/format_reward": 0.6488095223903656, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2640.6904907226562, | |
| "epoch": 1.8439999999999999, | |
| "grad_norm": 0.3294979929924011, | |
| "kl": 0.38818359375, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0662, | |
| "reward": 0.6549192667007446, | |
| "reward_std": 0.8022814393043518, | |
| "rewards/cosine_scaled_reward": -0.02373085916042328, | |
| "rewards/format_reward": 0.7023809552192688, | |
| "step": 461 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2345.4285888671875, | |
| "epoch": 1.8479999999999999, | |
| "grad_norm": 0.3771924376487732, | |
| "kl": 0.35595703125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.105, | |
| "reward": 0.6399585753679276, | |
| "reward_std": 0.7968022599816322, | |
| "rewards/cosine_scaled_reward": -0.08478261809796095, | |
| "rewards/format_reward": 0.8095238208770752, | |
| "step": 462 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2589.1666870117188, | |
| "epoch": 1.8519999999999999, | |
| "grad_norm": 0.3122951090335846, | |
| "kl": 0.3798828125, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0565, | |
| "reward": 0.43525535613298416, | |
| "reward_std": 0.7020779103040695, | |
| "rewards/cosine_scaled_reward": -0.09784852154552937, | |
| "rewards/format_reward": 0.6309523954987526, | |
| "step": 463 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2481.261962890625, | |
| "epoch": 1.8559999999999999, | |
| "grad_norm": 0.5730969905853271, | |
| "kl": 0.33984375, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0699, | |
| "reward": 0.7782276198267937, | |
| "reward_std": 0.6072199195623398, | |
| "rewards/cosine_scaled_reward": 0.02304239757359028, | |
| "rewards/format_reward": 0.7321428656578064, | |
| "step": 464 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2644.619140625, | |
| "epoch": 1.8599999999999999, | |
| "grad_norm": 0.39649447798728943, | |
| "kl": 0.373046875, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0712, | |
| "reward": 0.6695687249302864, | |
| "reward_std": 0.8249562680721283, | |
| "rewards/cosine_scaled_reward": 0.007403409108519554, | |
| "rewards/format_reward": 0.6547619178891182, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2491.0119018554688, | |
| "epoch": 1.8639999999999999, | |
| "grad_norm": 0.4567016065120697, | |
| "kl": 0.36376953125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0702, | |
| "reward": 0.46852924674749374, | |
| "reward_std": 0.7603975385427475, | |
| "rewards/cosine_scaled_reward": -0.1199020454660058, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 466 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2532.6607666015625, | |
| "epoch": 1.8679999999999999, | |
| "grad_norm": 0.36717355251312256, | |
| "kl": 0.330078125, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0491, | |
| "reward": 0.5859625339508057, | |
| "reward_std": 0.6559573635458946, | |
| "rewards/cosine_scaled_reward": -0.028447304794099182, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 467 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2622.2559814453125, | |
| "epoch": 1.8719999999999999, | |
| "grad_norm": 0.4103451669216156, | |
| "kl": 0.36279296875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0606, | |
| "reward": 0.6576881408691406, | |
| "reward_std": 0.8001267910003662, | |
| "rewards/cosine_scaled_reward": -0.007465461269021034, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 468 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2134.279815673828, | |
| "epoch": 1.876, | |
| "grad_norm": 0.5138155817985535, | |
| "kl": 0.256103515625, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0759, | |
| "reward": 0.6648233011364937, | |
| "reward_std": 0.7618712484836578, | |
| "rewards/cosine_scaled_reward": -0.06044549681246281, | |
| "rewards/format_reward": 0.7857143133878708, | |
| "step": 469 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2160.202392578125, | |
| "epoch": 1.88, | |
| "grad_norm": 0.3472672700881958, | |
| "kl": 0.30126953125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0461, | |
| "reward": 0.7422109395265579, | |
| "reward_std": 0.6609758958220482, | |
| "rewards/cosine_scaled_reward": -0.012823125813156366, | |
| "rewards/format_reward": 0.767857164144516, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2532.3809814453125, | |
| "epoch": 1.884, | |
| "grad_norm": 0.2868484556674957, | |
| "kl": 0.2763671875, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0493, | |
| "reward": 0.48562416061758995, | |
| "reward_std": 0.7373960316181183, | |
| "rewards/cosine_scaled_reward": -0.08159269354655407, | |
| "rewards/format_reward": 0.648809552192688, | |
| "step": 471 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2409.3392944335938, | |
| "epoch": 1.888, | |
| "grad_norm": 0.4656960964202881, | |
| "kl": 0.390380859375, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0561, | |
| "reward": 0.6008469248190522, | |
| "reward_std": 0.6282935440540314, | |
| "rewards/cosine_scaled_reward": -0.05374322272837162, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 472 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2294.0178833007812, | |
| "epoch": 1.892, | |
| "grad_norm": 0.5274000763893127, | |
| "kl": 0.3369140625, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0877, | |
| "reward": 0.7744100838899612, | |
| "reward_std": 0.7935537397861481, | |
| "rewards/cosine_scaled_reward": 0.00030028633773326874, | |
| "rewards/format_reward": 0.7738095372915268, | |
| "step": 473 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2676.6727294921875, | |
| "epoch": 1.896, | |
| "grad_norm": 0.5600417256355286, | |
| "kl": 0.302978515625, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0338, | |
| "reward": 0.47137061692774296, | |
| "reward_std": 0.7821067273616791, | |
| "rewards/cosine_scaled_reward": -0.07681469712406397, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 474 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2442.0952758789062, | |
| "epoch": 1.9, | |
| "grad_norm": 0.5208225846290588, | |
| "kl": 0.32470703125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0837, | |
| "reward": 0.38532300293445587, | |
| "reward_std": 0.5505756810307503, | |
| "rewards/cosine_scaled_reward": -0.1287670750170946, | |
| "rewards/format_reward": 0.6428571492433548, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2030.7857360839844, | |
| "epoch": 1.904, | |
| "grad_norm": 0.6633386611938477, | |
| "kl": 0.269287109375, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0183, | |
| "reward": 0.796258918941021, | |
| "reward_std": 0.8103819191455841, | |
| "rewards/cosine_scaled_reward": 0.017177060712128878, | |
| "rewards/format_reward": 0.761904776096344, | |
| "step": 476 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2734.2262573242188, | |
| "epoch": 1.908, | |
| "grad_norm": 0.5043067932128906, | |
| "kl": 0.35791015625, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0378, | |
| "reward": 0.2551159653812647, | |
| "reward_std": 0.5920611470937729, | |
| "rewards/cosine_scaled_reward": -0.155180131085217, | |
| "rewards/format_reward": 0.5654762089252472, | |
| "step": 477 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2301.0952758789062, | |
| "epoch": 1.912, | |
| "grad_norm": 0.7771977186203003, | |
| "kl": 0.28662109375, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0228, | |
| "reward": 0.6790298409759998, | |
| "reward_std": 0.6661486774682999, | |
| "rewards/cosine_scaled_reward": -0.06524700409499928, | |
| "rewards/format_reward": 0.8095238357782364, | |
| "step": 478 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2247.6012573242188, | |
| "epoch": 1.916, | |
| "grad_norm": 0.599141001701355, | |
| "kl": 0.3212890625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0857, | |
| "reward": 0.8667033798992634, | |
| "reward_std": 0.8036679923534393, | |
| "rewards/cosine_scaled_reward": 0.06132788397371769, | |
| "rewards/format_reward": 0.7440476268529892, | |
| "step": 479 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2093.3333740234375, | |
| "epoch": 1.92, | |
| "grad_norm": 0.5312609076499939, | |
| "kl": 0.2958984375, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0735, | |
| "reward": 0.7348574697971344, | |
| "reward_std": 0.689183309674263, | |
| "rewards/cosine_scaled_reward": -0.034356983145698905, | |
| "rewards/format_reward": 0.8035714328289032, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2448.1726684570312, | |
| "epoch": 1.924, | |
| "grad_norm": 0.5402917861938477, | |
| "kl": 0.36669921875, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0831, | |
| "reward": 0.43995974212884903, | |
| "reward_std": 0.6862698197364807, | |
| "rewards/cosine_scaled_reward": -0.13121061958372593, | |
| "rewards/format_reward": 0.70238097012043, | |
| "step": 481 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2658.7977294921875, | |
| "epoch": 1.928, | |
| "grad_norm": 0.5909121632575989, | |
| "kl": 0.3623046875, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0547, | |
| "reward": 0.5179771184921265, | |
| "reward_std": 0.7944772690534592, | |
| "rewards/cosine_scaled_reward": -0.0981543204979971, | |
| "rewards/format_reward": 0.7142857313156128, | |
| "step": 482 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2292.1428833007812, | |
| "epoch": 1.932, | |
| "grad_norm": 0.549201488494873, | |
| "kl": 0.30615234375, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0906, | |
| "reward": 0.6546563804149628, | |
| "reward_std": 0.7558221146464348, | |
| "rewards/cosine_scaled_reward": -0.017909929156303406, | |
| "rewards/format_reward": 0.690476194024086, | |
| "step": 483 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2001.2381286621094, | |
| "epoch": 1.936, | |
| "grad_norm": 0.9190180897712708, | |
| "kl": 0.2490234375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": -0.0131, | |
| "reward": 0.6368911117315292, | |
| "reward_std": 0.5770624950528145, | |
| "rewards/cosine_scaled_reward": -0.07441157009452581, | |
| "rewards/format_reward": 0.7857142835855484, | |
| "step": 484 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2369.482177734375, | |
| "epoch": 1.94, | |
| "grad_norm": 0.48303577303886414, | |
| "kl": 0.32861328125, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0806, | |
| "reward": 0.5949805751442909, | |
| "reward_std": 0.7235869467258453, | |
| "rewards/cosine_scaled_reward": -0.04179543023929, | |
| "rewards/format_reward": 0.6785714328289032, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2607.375030517578, | |
| "epoch": 1.944, | |
| "grad_norm": 0.45922327041625977, | |
| "kl": 0.334228515625, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0495, | |
| "reward": 0.4739008713513613, | |
| "reward_std": 0.7411531507968903, | |
| "rewards/cosine_scaled_reward": -0.10828767996281385, | |
| "rewards/format_reward": 0.690476194024086, | |
| "step": 486 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2427.3929443359375, | |
| "epoch": 1.948, | |
| "grad_norm": 0.42017099261283875, | |
| "kl": 0.281494140625, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0322, | |
| "reward": 0.3904539607465267, | |
| "reward_std": 0.6817184686660767, | |
| "rewards/cosine_scaled_reward": -0.13810635451227427, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 487 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2579.0120239257812, | |
| "epoch": 1.952, | |
| "grad_norm": 0.7048377394676208, | |
| "kl": 0.322265625, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.028, | |
| "reward": 0.5259524993598461, | |
| "reward_std": 0.6612162664532661, | |
| "rewards/cosine_scaled_reward": -0.09714281000196934, | |
| "rewards/format_reward": 0.7202381193637848, | |
| "step": 488 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2246.9524536132812, | |
| "epoch": 1.956, | |
| "grad_norm": 0.4814748167991638, | |
| "kl": 0.313720703125, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0807, | |
| "reward": 0.7873745709657669, | |
| "reward_std": 0.7711023241281509, | |
| "rewards/cosine_scaled_reward": -0.005122252739965916, | |
| "rewards/format_reward": 0.7976190745830536, | |
| "step": 489 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2849.761962890625, | |
| "epoch": 1.96, | |
| "grad_norm": 0.5227950215339661, | |
| "kl": 0.37890625, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0171, | |
| "reward": 0.4944131616503, | |
| "reward_std": 0.6706523001194, | |
| "rewards/cosine_scaled_reward": -0.06826960667967796, | |
| "rewards/format_reward": 0.6309524029493332, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2594.3631591796875, | |
| "epoch": 1.964, | |
| "grad_norm": 0.5116230249404907, | |
| "kl": 0.40576171875, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0511, | |
| "reward": 0.40869739279150963, | |
| "reward_std": 0.703234076499939, | |
| "rewards/cosine_scaled_reward": -0.12600845471024513, | |
| "rewards/format_reward": 0.6607142984867096, | |
| "step": 491 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2416.8632202148438, | |
| "epoch": 1.968, | |
| "grad_norm": 0.9567095637321472, | |
| "kl": 0.27783203125, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.1235, | |
| "reward": 0.5404094010591507, | |
| "reward_std": 0.6638472378253937, | |
| "rewards/cosine_scaled_reward": -0.054200079292058945, | |
| "rewards/format_reward": 0.6488095372915268, | |
| "step": 492 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2514.1666870117188, | |
| "epoch": 1.972, | |
| "grad_norm": 0.4846276044845581, | |
| "kl": 0.28466796875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0436, | |
| "reward": 0.5605661012232304, | |
| "reward_std": 0.6418938338756561, | |
| "rewards/cosine_scaled_reward": -0.03519314527511597, | |
| "rewards/format_reward": 0.630952388048172, | |
| "step": 493 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.71435546875, | |
| "epoch": 1.976, | |
| "grad_norm": 0.6296063661575317, | |
| "kl": 0.3212890625, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0439, | |
| "reward": 0.6024229377508163, | |
| "reward_std": 0.7128957360982895, | |
| "rewards/cosine_scaled_reward": -0.07081234554061666, | |
| "rewards/format_reward": 0.744047611951828, | |
| "step": 494 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2491.3334045410156, | |
| "epoch": 1.98, | |
| "grad_norm": 0.622008204460144, | |
| "kl": 0.283447265625, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0361, | |
| "reward": 0.594695046544075, | |
| "reward_std": 0.6841184943914413, | |
| "rewards/cosine_scaled_reward": -0.03896199120208621, | |
| "rewards/format_reward": 0.6726190745830536, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2395.636962890625, | |
| "epoch": 1.984, | |
| "grad_norm": 0.30918648838996887, | |
| "kl": 0.29931640625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0827, | |
| "reward": 0.5910248765721917, | |
| "reward_std": 0.6182541996240616, | |
| "rewards/cosine_scaled_reward": -0.05567805375903845, | |
| "rewards/format_reward": 0.70238097012043, | |
| "step": 496 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2368.8631591796875, | |
| "epoch": 1.988, | |
| "grad_norm": 1.1213865280151367, | |
| "kl": 0.3515625, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0439, | |
| "reward": 0.48846913129091263, | |
| "reward_std": 0.6297848075628281, | |
| "rewards/cosine_scaled_reward": -0.13374162535183132, | |
| "rewards/format_reward": 0.755952388048172, | |
| "step": 497 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2289.3690795898438, | |
| "epoch": 1.992, | |
| "grad_norm": 0.810573399066925, | |
| "kl": 0.302734375, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0228, | |
| "reward": 0.6517780050635338, | |
| "reward_std": 0.7580654174089432, | |
| "rewards/cosine_scaled_reward": -0.046134804193570744, | |
| "rewards/format_reward": 0.7440476417541504, | |
| "step": 498 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2452.2738647460938, | |
| "epoch": 1.996, | |
| "grad_norm": 0.357543408870697, | |
| "kl": 0.32763671875, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0643, | |
| "reward": 0.3793360572308302, | |
| "reward_std": 0.7790006846189499, | |
| "rewards/cosine_scaled_reward": -0.15557007491588593, | |
| "rewards/format_reward": 0.6904762089252472, | |
| "step": 499 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 2528.7500610351562, | |
| "epoch": 2.0, | |
| "grad_norm": 0.9307948350906372, | |
| "kl": 0.306640625, | |
| "learning_rate": 1e-07, | |
| "loss": 0.1364, | |
| "reward": 0.5999207645654678, | |
| "reward_std": 0.6981495916843414, | |
| "rewards/cosine_scaled_reward": -0.03634915268048644, | |
| "rewards/format_reward": 0.6726190596818924, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0725239302306436, | |
| "train_runtime": 62033.0192, | |
| "train_samples_per_second": 1.354, | |
| "train_steps_per_second": 0.008 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |