Text Generation
Transformers
Safetensors
qwen2
Generated from Trainer
open-r1
trl
grpo
conversational
text-generation-inference
Instructions to use kangdawei/MMR-GRPO with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use kangdawei/MMR-GRPO with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="kangdawei/MMR-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe(messages)

# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("kangdawei/MMR-GRPO")
model = AutoModelForCausalLM.from_pretrained("kangdawei/MMR-GRPO")
messages = [
    {"role": "user", "content": "Who are you?"},
]
inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[-1]:]))

- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use kangdawei/MMR-GRPO with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "kangdawei/MMR-GRPO"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/MMR-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

Use Docker
docker model run hf.co/kangdawei/MMR-GRPO
- SGLang
How to use kangdawei/MMR-GRPO with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "kangdawei/MMR-GRPO" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/MMR-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

Use Docker images
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "kangdawei/MMR-GRPO" \
        --host 0.0.0.0 \
        --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
        "model": "kangdawei/MMR-GRPO",
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of France?"
            }
        ]
    }'

- Docker Model Runner
How to use kangdawei/MMR-GRPO with Docker Model Runner:
docker model run hf.co/kangdawei/MMR-GRPO
| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2571.2083587646484, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.19727857410907745, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0, | |
| "reward": 0.1723687592893839, | |
| "reward_std": 0.7976016625761986, | |
| "rewards/cosine_scaled_reward": -0.015534311532974243, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2804.395881652832, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.18166053295135498, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": -0.018269629566930234, | |
| "reward_std": 0.44402940198779106, | |
| "rewards/cosine_scaled_reward": -0.04980122856795788, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3291.9583587646484, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.20012830197811127, | |
| "kl": 4.538148641586304e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": -0.4293696880340576, | |
| "reward_std": 0.42283543944358826, | |
| "rewards/cosine_scaled_reward": -0.20520474947988987, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2115.8542098999023, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.25242456793785095, | |
| "kl": 3.884732723236084e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.2737832348793745, | |
| "reward_std": 0.9233334362506866, | |
| "rewards/cosine_scaled_reward": -0.041339562041684985, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3488.5416870117188, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.2051168829202652, | |
| "kl": 4.386343061923981e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": -0.37347570061683655, | |
| "reward_std": 0.6771278530359268, | |
| "rewards/cosine_scaled_reward": -0.20584097504615784, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 3050.5417404174805, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.2063540816307068, | |
| "kl": 4.404783248901367e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.21201085951179266, | |
| "reward_std": 0.8398218862712383, | |
| "rewards/cosine_scaled_reward": -0.17123706359416246, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 3078.5000610351562, | |
| "epoch": 0.008, | |
| "grad_norm": 0.1487358957529068, | |
| "kl": 2.9146671295166016e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.06183330807834864, | |
| "reward_std": 0.7480270601809025, | |
| "rewards/cosine_scaled_reward": -0.12833543797023594, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2689.9375381469727, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.18276464939117432, | |
| "kl": 1.888629049062729e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.2490266114473343, | |
| "reward_std": 0.8049478307366371, | |
| "rewards/cosine_scaled_reward": 0.06792760454118252, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3336.5000610351562, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.2246587723493576, | |
| "kl": 4.10228967666626e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.2021113825030625, | |
| "reward_std": 0.6383066512644291, | |
| "rewards/cosine_scaled_reward": -0.1338323038071394, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2547.7916831970215, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.2385585457086563, | |
| "kl": 3.205146640539169e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": -0.010044756345450878, | |
| "reward_std": 0.6000867374241352, | |
| "rewards/cosine_scaled_reward": -0.08963373815640807, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3325.1041870117188, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.22885829210281372, | |
| "kl": 3.9190053939819336e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": -0.3854191079735756, | |
| "reward_std": 0.6930013746023178, | |
| "rewards/cosine_scaled_reward": -0.21312777569983155, | |
| "rewards/format_reward": 0.1666666716337204, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2436.750045776367, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.2692394554615021, | |
| "kl": 4.0858983993530273e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.05584145151078701, | |
| "reward_std": 0.7531977295875549, | |
| "rewards/cosine_scaled_reward": -0.14725533686578274, | |
| "rewards/format_reward": 0.6250000018626451, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2905.625030517578, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.18586185574531555, | |
| "kl": 3.486126661300659e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.06436450174078345, | |
| "reward_std": 0.5915969423949718, | |
| "rewards/cosine_scaled_reward": -0.10991157777607441, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2978.0208892822266, | |
| "epoch": 0.016, | |
| "grad_norm": 0.21407712996006012, | |
| "kl": 2.9122806154191494e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.008545992895960808, | |
| "reward_std": 0.8296682890504599, | |
| "rewards/cosine_scaled_reward": -0.05597527138888836, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2821.291679382324, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.18756185472011566, | |
| "kl": 2.925284206867218e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": 0.09492451697587967, | |
| "reward_std": 0.5804186388850212, | |
| "rewards/cosine_scaled_reward": -0.005186626687645912, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3463.1458435058594, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.19236689805984497, | |
| "kl": 3.2588839530944824e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.41888961754739285, | |
| "reward_std": 0.5801856927573681, | |
| "rewards/cosine_scaled_reward": -0.19731347833294421, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2245.083366394043, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.24209940433502197, | |
| "kl": 4.054419696331024e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.233390836045146, | |
| "reward_std": 0.9472852721810341, | |
| "rewards/cosine_scaled_reward": -0.0257677553454414, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2929.062530517578, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.14878323674201965, | |
| "kl": 2.1327286958694458e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.019776458386331797, | |
| "reward_std": 0.5773040689527988, | |
| "rewards/cosine_scaled_reward": -0.08265479793772101, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2807.437545776367, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.16009841859340668, | |
| "kl": 2.41696834564209e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.3507382436655462, | |
| "reward_std": 0.921161625534296, | |
| "rewards/cosine_scaled_reward": 0.12314888671971858, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2367.8958892822266, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.18880178034305573, | |
| "kl": 1.8640421330928802e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.5233698049560189, | |
| "reward_std": 0.9742525108158588, | |
| "rewards/cosine_scaled_reward": 0.14227081835269928, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2770.4583740234375, | |
| "epoch": 0.024, | |
| "grad_norm": 0.24240563809871674, | |
| "kl": 3.499537706375122e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": -0.16247276589274406, | |
| "reward_std": 0.5624231658875942, | |
| "rewards/cosine_scaled_reward": -0.1772752869874239, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1995.1875305175781, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.43609920144081116, | |
| "kl": 3.091990947723389e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.36330048087984324, | |
| "reward_std": 0.6434031687676907, | |
| "rewards/cosine_scaled_reward": 0.004692776128649712, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2544.666717529297, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.23585595190525055, | |
| "kl": 3.0001625418663025e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.11523125227540731, | |
| "reward_std": 0.6521053463220596, | |
| "rewards/cosine_scaled_reward": -0.16176861617714167, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2566.333335876465, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.18629349768161774, | |
| "kl": 2.0964653231203556e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.23491991590708494, | |
| "reward_std": 0.7349754758179188, | |
| "rewards/cosine_scaled_reward": -0.0008563240990042686, | |
| "rewards/format_reward": 0.5833333469927311, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2641.145881652832, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.1711462438106537, | |
| "kl": 3.1113624572753906e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.03685340657830238, | |
| "reward_std": 0.7854471541941166, | |
| "rewards/cosine_scaled_reward": -0.06554372794926167, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3136.604217529297, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.19311213493347168, | |
| "kl": 4.175305366516113e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.036069474183022976, | |
| "reward_std": 0.7284176610410213, | |
| "rewards/cosine_scaled_reward": -0.06703814025968313, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2979.4583740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.2027575522661209, | |
| "kl": 3.305543214082718e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.02418377436697483, | |
| "reward_std": 0.8362919054925442, | |
| "rewards/cosine_scaled_reward": -0.07142194919288158, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2897.979202270508, | |
| "epoch": 0.032, | |
| "grad_norm": 0.19330666959285736, | |
| "kl": 3.600865602493286e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.09552552457898855, | |
| "reward_std": 0.6331899762153625, | |
| "rewards/cosine_scaled_reward": -0.015744205564260483, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3302.8958740234375, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.16206476092338562, | |
| "kl": 2.98917293548584e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.2337268814444542, | |
| "reward_std": 0.5517648197710514, | |
| "rewards/cosine_scaled_reward": -0.12987937778234482, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2989.604232788086, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.15634141862392426, | |
| "kl": 2.584606409072876e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.2867018459364772, | |
| "reward_std": 0.8898419290781021, | |
| "rewards/cosine_scaled_reward": 0.0461630261852406, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2971.3750228881836, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.17451131343841553, | |
| "kl": 2.5779008865356445e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.1252674162387848, | |
| "reward_std": 0.6488835532218218, | |
| "rewards/cosine_scaled_reward": -0.11415221774950624, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3259.375030517578, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.1704079955816269, | |
| "kl": 2.6818830519914627e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.015448510646820068, | |
| "reward_std": 0.6844744682312012, | |
| "rewards/cosine_scaled_reward": -0.022644946351647377, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3343.5208435058594, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.1502685844898224, | |
| "kl": 2.950429916381836e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.3123129680752754, | |
| "reward_std": 0.4822962246835232, | |
| "rewards/cosine_scaled_reward": -0.17962717823684216, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2390.2083435058594, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.32976019382476807, | |
| "kl": 3.445148468017578e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.5364574566483498, | |
| "reward_std": 0.7043890058994293, | |
| "rewards/cosine_scaled_reward": 0.2165452465415001, | |
| "rewards/format_reward": 0.5625, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3077.145866394043, | |
| "epoch": 0.04, | |
| "grad_norm": 0.21368089318275452, | |
| "kl": 4.202127456665039e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": -0.110074105206877, | |
| "reward_std": 0.9502048678696156, | |
| "rewards/cosine_scaled_reward": -0.08587458729743958, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3360.687530517578, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.1669422686100006, | |
| "kl": 3.0137598514556885e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.4919657548889518, | |
| "reward_std": 0.4837723895907402, | |
| "rewards/cosine_scaled_reward": -0.28421421349048615, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3358.7291870117188, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.1673896312713623, | |
| "kl": 1.73947773873806e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.35696901264600456, | |
| "reward_std": 0.4194028750061989, | |
| "rewards/cosine_scaled_reward": -0.20523495320230722, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3273.2708435058594, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.17379561066627502, | |
| "kl": 3.8963742554187775e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.2825834956020117, | |
| "reward_std": 0.6573631279170513, | |
| "rewards/cosine_scaled_reward": -0.14204833284020424, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2792.2500228881836, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.2008347511291504, | |
| "kl": 1.8621794879436493e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.01606392953544855, | |
| "reward_std": 0.6000814624130726, | |
| "rewards/cosine_scaled_reward": -0.07735766470432281, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2641.7084045410156, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.21129944920539856, | |
| "kl": 6.28940761089325e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.01600947417318821, | |
| "reward_std": 0.6365399248898029, | |
| "rewards/cosine_scaled_reward": -0.11234160404001159, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 2874.9791870117188, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.17788062989711761, | |
| "kl": 3.086775541305542e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.050673360005021095, | |
| "reward_std": 0.9452972374856472, | |
| "rewards/cosine_scaled_reward": -0.14972026087343693, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2774.2916984558105, | |
| "epoch": 0.048, | |
| "grad_norm": 0.3309200704097748, | |
| "kl": 6.945431232452393e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.2789352214895189, | |
| "reward_std": 0.5039437636733055, | |
| "rewards/cosine_scaled_reward": -0.24113959958776832, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 2986.1667098999023, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.19303752481937408, | |
| "kl": 3.374367952346802e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.041154950857162476, | |
| "reward_std": 0.6381138861179352, | |
| "rewards/cosine_scaled_reward": -0.029581679904367775, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2853.937530517578, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.24794019758701324, | |
| "kl": 0.00017474591732025146, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.08132272958755493, | |
| "reward_std": 0.7235868386924267, | |
| "rewards/cosine_scaled_reward": -0.0631332267075777, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3266.812530517578, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.15977022051811218, | |
| "kl": 5.7324767112731934e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": -0.18004001304507256, | |
| "reward_std": 0.7050215303897858, | |
| "rewards/cosine_scaled_reward": -0.12542735040187836, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3250.2083435058594, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.19012750685214996, | |
| "kl": 8.171796798706055e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.32018135441467166, | |
| "reward_std": 0.6548310741782188, | |
| "rewards/cosine_scaled_reward": -0.17957252322230488, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2832.333351135254, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.2044163942337036, | |
| "kl": 3.51807102560997e-05, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.25673263892531395, | |
| "reward_std": 0.8521627373993397, | |
| "rewards/cosine_scaled_reward": 0.061620082706213, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2832.937530517578, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.20488843321800232, | |
| "kl": 0.00012751109898090363, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.01403064839541912, | |
| "reward_std": 0.847560465335846, | |
| "rewards/cosine_scaled_reward": -0.08664842648431659, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2368.645896911621, | |
| "epoch": 0.056, | |
| "grad_norm": 0.21413730084896088, | |
| "kl": 6.522238254547119e-05, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.28362084983382374, | |
| "reward_std": 0.9133072569966316, | |
| "rewards/cosine_scaled_reward": 0.0036178361624479294, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2862.520835876465, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.184512659907341, | |
| "kl": 0.00011056661605834961, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.04502496123313904, | |
| "reward_std": 0.570039439946413, | |
| "rewards/cosine_scaled_reward": -0.0024489806964993477, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2245.750015258789, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.21354830265045166, | |
| "kl": 0.00020595639944076538, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.049862007377669215, | |
| "reward_std": 0.5920056514441967, | |
| "rewards/cosine_scaled_reward": -0.11850284365937114, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2897.93754196167, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.222417950630188, | |
| "kl": 9.300559759140015e-05, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.30677789729088545, | |
| "reward_std": 0.9850648231804371, | |
| "rewards/cosine_scaled_reward": 0.08488897839561105, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2835.979202270508, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.2359960377216339, | |
| "kl": 0.00013843923807144165, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 0.12393231969326735, | |
| "reward_std": 0.8365762289613485, | |
| "rewards/cosine_scaled_reward": -0.017082044621929526, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2773.250030517578, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.16520988941192627, | |
| "kl": 6.0267746448516846e-05, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.4487614845857024, | |
| "reward_std": 0.9010818786919117, | |
| "rewards/cosine_scaled_reward": 0.14949666894972324, | |
| "rewards/format_reward": 0.5625000186264515, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2909.0000762939453, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.17167004942893982, | |
| "kl": 8.597038686275482e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.11020261491648853, | |
| "reward_std": 0.8988244608044624, | |
| "rewards/cosine_scaled_reward": 0.003133175428956747, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2952.333366394043, | |
| "epoch": 0.064, | |
| "grad_norm": 0.18516214191913605, | |
| "kl": 4.446879029273987e-05, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": -0.027613874524831772, | |
| "reward_std": 0.6530672702938318, | |
| "rewards/cosine_scaled_reward": -0.09866334870457649, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3412.1041870117188, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.12485022842884064, | |
| "kl": 1.195073127746582e-05, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": -0.1474010832607746, | |
| "reward_std": 0.7124437093734741, | |
| "rewards/cosine_scaled_reward": -0.10700338426977396, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2150.3750534057617, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.1975310742855072, | |
| "kl": 0.0003490075469017029, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.4446102287620306, | |
| "reward_std": 0.8766555078327656, | |
| "rewards/cosine_scaled_reward": 0.0772969089448452, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2856.041717529297, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.1683138906955719, | |
| "kl": 3.864988684654236e-05, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": -0.09963385201990604, | |
| "reward_std": 0.6718280278146267, | |
| "rewards/cosine_scaled_reward": -0.10808059107512236, | |
| "rewards/format_reward": 0.35416668094694614, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 3047.6041870117188, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.16972136497497559, | |
| "kl": 4.49158251285553e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": -0.3548573371954262, | |
| "reward_std": 0.5078517571091652, | |
| "rewards/cosine_scaled_reward": -0.25396787002682686, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 3066.0000610351562, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.16273915767669678, | |
| "kl": 0.00016046315431594849, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.08701966446824372, | |
| "reward_std": 0.6688249669969082, | |
| "rewards/cosine_scaled_reward": -0.06528997980058193, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2751.4583740234375, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.28360143303871155, | |
| "kl": 0.0005411431193351746, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.1395623767748475, | |
| "reward_std": 0.8662982396781445, | |
| "rewards/cosine_scaled_reward": -0.021802921197377145, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2314.854217529297, | |
| "epoch": 0.072, | |
| "grad_norm": 0.21262036263942719, | |
| "kl": 0.0005500763654708862, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0, | |
| "reward": 0.5263404976576567, | |
| "reward_std": 0.880022443830967, | |
| "rewards/cosine_scaled_reward": 0.14735051710158587, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2922.4583892822266, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.17176398634910583, | |
| "kl": 0.00019650161266326904, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.010981407947838306, | |
| "reward_std": 0.8544860817492008, | |
| "rewards/cosine_scaled_reward": -0.04802468419075012, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2751.0625381469727, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.19784730672836304, | |
| "kl": 0.00014169886708259583, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": -0.14385659247636795, | |
| "reward_std": 0.6372686810791492, | |
| "rewards/cosine_scaled_reward": -0.16209950670599937, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2123.6666717529297, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.2654637396335602, | |
| "kl": 0.00044383853673934937, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.21715208888053894, | |
| "reward_std": 0.5327535588294268, | |
| "rewards/cosine_scaled_reward": 0.03916000574827194, | |
| "rewards/format_reward": 0.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3416.0625, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.14119970798492432, | |
| "kl": 0.0003875941038131714, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": -0.544161144644022, | |
| "reward_std": 0.4329412467777729, | |
| "rewards/cosine_scaled_reward": -0.28361437405692413, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2162.645881652832, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.25515133142471313, | |
| "kl": 0.0016131103038787842, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0001, | |
| "reward": 0.18700658343732357, | |
| "reward_std": 0.9016527011990547, | |
| "rewards/cosine_scaled_reward": -0.053536335937678814, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2560.0625534057617, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.2858283817768097, | |
| "kl": 0.0012285411357879639, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": -0.2131683579646051, | |
| "reward_std": 0.6651364080607891, | |
| "rewards/cosine_scaled_reward": -0.23473481088876724, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3074.875015258789, | |
| "epoch": 0.08, | |
| "grad_norm": 0.18202482163906097, | |
| "kl": 0.0006042998284101486, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": -0.12620488926768303, | |
| "reward_std": 0.6856267843395472, | |
| "rewards/cosine_scaled_reward": -0.13051698391791433, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2711.4375228881836, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.37329548597335815, | |
| "kl": 0.0009671822190284729, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.09207316488027573, | |
| "reward_std": 0.7490551918745041, | |
| "rewards/cosine_scaled_reward": 0.00596673134714365, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2840.895866394043, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.2226312905550003, | |
| "kl": 0.0008899271488189697, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": -0.16130333952605724, | |
| "reward_std": 0.48493045195937157, | |
| "rewards/cosine_scaled_reward": -0.13195209205150604, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3428.4583740234375, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.18076418340206146, | |
| "kl": 0.00024478137493133545, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": -0.3564223051071167, | |
| "reward_std": 0.7073610313236713, | |
| "rewards/cosine_scaled_reward": -0.18848324241116643, | |
| "rewards/format_reward": 0.14583333767950535, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 3174.479248046875, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.19092188775539398, | |
| "kl": 0.0007887572282925248, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": -0.04798390786163509, | |
| "reward_std": 0.9048562906682491, | |
| "rewards/cosine_scaled_reward": -0.06373507156968117, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 3045.8958587646484, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.1473468691110611, | |
| "kl": 0.0011369436979293823, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": 0.051474731182679534, | |
| "reward_std": 0.6175347343087196, | |
| "rewards/cosine_scaled_reward": 0.00460137240588665, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2876.5208740234375, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.22771689295768738, | |
| "kl": 0.00022584199905395508, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": -0.2855505235493183, | |
| "reward_std": 0.3848421312868595, | |
| "rewards/cosine_scaled_reward": -0.23991595953702927, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 2967.3958435058594, | |
| "epoch": 0.088, | |
| "grad_norm": 0.15564849972724915, | |
| "kl": 0.00028374046087265015, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": -0.11336795706301928, | |
| "reward_std": 0.42253825441002846, | |
| "rewards/cosine_scaled_reward": -0.09275355748832226, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3223.9583892822266, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.1456020325422287, | |
| "kl": 0.00031157582998275757, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": -0.039307162165641785, | |
| "reward_std": 0.6996998488903046, | |
| "rewards/cosine_scaled_reward": -0.04336683638393879, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2270.0417098999023, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.20563355088233948, | |
| "kl": 0.0013409852981567383, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09923344664275646, | |
| "reward_std": 0.7259157933294773, | |
| "rewards/cosine_scaled_reward": -0.10380276478827, | |
| "rewards/format_reward": 0.6041666679084301, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3190.3333587646484, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.21478085219860077, | |
| "kl": 0.0009518936276435852, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": -0.2797470228979364, | |
| "reward_std": 0.6308383457362652, | |
| "rewards/cosine_scaled_reward": -0.18668191879987717, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 2895.583354949951, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.2533528804779053, | |
| "kl": 0.002989828586578369, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0001, | |
| "reward": -0.1824733428657055, | |
| "reward_std": 0.5710629485547543, | |
| "rewards/cosine_scaled_reward": -0.15205951500684023, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2906.6458740234375, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.19861973822116852, | |
| "kl": 0.0019384026527404785, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0001, | |
| "reward": 0.002590768039226532, | |
| "reward_std": 0.6845040954649448, | |
| "rewards/cosine_scaled_reward": -0.050318428009632044, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2862.9583587646484, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.5050163865089417, | |
| "kl": 0.0011357255280017853, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": -0.2405467852950096, | |
| "reward_std": 0.5951391458511353, | |
| "rewards/cosine_scaled_reward": -0.16155940247699618, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 3058.375045776367, | |
| "epoch": 0.096, | |
| "grad_norm": 0.1727391928434372, | |
| "kl": 0.0007171034812927246, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.07485563680529594, | |
| "reward_std": 0.8334123902022839, | |
| "rewards/cosine_scaled_reward": -0.0007621082477271557, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 2977.7709045410156, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.14627371728420258, | |
| "kl": 0.0006437301635742188, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": -0.01777968415990472, | |
| "reward_std": 0.8515256457030773, | |
| "rewards/cosine_scaled_reward": -0.09596842993050814, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2917.0625228881836, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.20686082541942596, | |
| "kl": 0.001413583755493164, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06965514738112688, | |
| "reward_std": 0.6772139333188534, | |
| "rewards/cosine_scaled_reward": -0.03397463448345661, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2726.0833740234375, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.22425347566604614, | |
| "kl": 0.001748785376548767, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.15189366973936558, | |
| "reward_std": 0.8081401586532593, | |
| "rewards/cosine_scaled_reward": -0.0596234705299139, | |
| "rewards/format_reward": 0.5833333469927311, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2768.354202270508, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.1945296823978424, | |
| "kl": 0.002580702304840088, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0001, | |
| "reward": -0.015582697466015816, | |
| "reward_std": 0.7642906121909618, | |
| "rewards/cosine_scaled_reward": -0.13294149283319712, | |
| "rewards/format_reward": 0.5, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3245.0833740234375, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.1559896320104599, | |
| "kl": 0.0016347765922546387, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0001, | |
| "reward": 0.022897440940141678, | |
| "reward_std": 0.6935332976281643, | |
| "rewards/cosine_scaled_reward": -0.016921459697186947, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2285.729179382324, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.3084622621536255, | |
| "kl": 0.0027695298194885254, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0001, | |
| "reward": -0.18626173213124275, | |
| "reward_std": 0.48771001771092415, | |
| "rewards/cosine_scaled_reward": -0.2529527278384194, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 2982.937530517578, | |
| "epoch": 0.104, | |
| "grad_norm": 0.2009773999452591, | |
| "kl": 0.0014831870794296265, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0001, | |
| "reward": -0.15441028028726578, | |
| "reward_std": 0.5955647341907024, | |
| "rewards/cosine_scaled_reward": -0.11808557622134686, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2740.3125228881836, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.22146174311637878, | |
| "kl": 0.003424704074859619, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0001, | |
| "reward": -0.21460794005542994, | |
| "reward_std": 0.6763530150055885, | |
| "rewards/cosine_scaled_reward": -0.24455153848975897, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3584.0, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.15897655487060547, | |
| "kl": 0.0015739202499389648, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0001, | |
| "reward": -0.5496739558875561, | |
| "reward_std": 0.33528773859143257, | |
| "rewards/cosine_scaled_reward": -0.22698437981307507, | |
| "rewards/format_reward": 0.02083333395421505, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3051.958366394043, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.17778827250003815, | |
| "kl": 0.0024556964635849, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0001, | |
| "reward": -0.12991986190900207, | |
| "reward_std": 0.6625968441367149, | |
| "rewards/cosine_scaled_reward": -0.10003662994131446, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3445.2500610351562, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.1542736142873764, | |
| "kl": 0.0006103254854679108, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": -0.22923543583601713, | |
| "reward_std": 0.6721294671297073, | |
| "rewards/cosine_scaled_reward": -0.14534958777949214, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2762.812545776367, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.1754182130098343, | |
| "kl": 0.0021101534366607666, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0001, | |
| "reward": -0.0632266215980053, | |
| "reward_std": 0.49075169675052166, | |
| "rewards/cosine_scaled_reward": -0.09756997041404247, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3206.5208740234375, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.175271138548851, | |
| "kl": 0.0009508877992630005, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.05997042031958699, | |
| "reward_std": 0.8464921619743109, | |
| "rewards/cosine_scaled_reward": -0.04258055402897298, | |
| "rewards/format_reward": 0.41666668094694614, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2683.291702270508, | |
| "epoch": 0.112, | |
| "grad_norm": 0.1968272626399994, | |
| "kl": 0.00044539570808410645, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.1016645822674036, | |
| "reward_std": 0.7722124308347702, | |
| "rewards/cosine_scaled_reward": -0.06791386939585209, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2768.7500076293945, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.22673000395298004, | |
| "kl": 0.0013333559036254883, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0001, | |
| "reward": -0.15783914551138878, | |
| "reward_std": 0.5159456543624401, | |
| "rewards/cosine_scaled_reward": -0.11131853237748146, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2545.4792098999023, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.18069936335086823, | |
| "kl": 0.002035856246948242, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4198254104703665, | |
| "reward_std": 0.8630654141306877, | |
| "rewards/cosine_scaled_reward": 0.12847260013222694, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2497.000030517578, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.22976230084896088, | |
| "kl": 0.0024021267890930176, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06796598061919212, | |
| "reward_std": 0.587594460695982, | |
| "rewards/cosine_scaled_reward": -0.033663152426015586, | |
| "rewards/format_reward": 0.4375, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2184.916717529297, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.24666009843349457, | |
| "kl": 0.003571033477783203, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0001, | |
| "reward": 0.22202930855564773, | |
| "reward_std": 0.7155356109142303, | |
| "rewards/cosine_scaled_reward": -0.06345116719603539, | |
| "rewards/format_reward": 0.6875000186264515, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2856.0000610351562, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.2828216850757599, | |
| "kl": 0.0014602243900299072, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0001, | |
| "reward": 0.047393606044352055, | |
| "reward_std": 0.709520248696208, | |
| "rewards/cosine_scaled_reward": -0.06169603951275349, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2754.2083587646484, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.6580407023429871, | |
| "kl": 0.02184271812438965, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0009, | |
| "reward": -0.11339985858649015, | |
| "reward_std": 0.5312070623040199, | |
| "rewards/cosine_scaled_reward": -0.11011355556547642, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2917.104232788086, | |
| "epoch": 0.12, | |
| "grad_norm": 0.2027282565832138, | |
| "kl": 0.0013439655303955078, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0001, | |
| "reward": -0.16319485567510128, | |
| "reward_std": 0.8842182755470276, | |
| "rewards/cosine_scaled_reward": -0.14117630943655968, | |
| "rewards/format_reward": 0.291666679084301, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2187.4166870117188, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.23987863957881927, | |
| "kl": 0.0010932087898254395, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0, | |
| "reward": 0.5684326654300094, | |
| "reward_std": 0.8030262924730778, | |
| "rewards/cosine_scaled_reward": 0.16187836416065693, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2847.750030517578, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.2214292734861374, | |
| "kl": 0.001035928726196289, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.11794419679790735, | |
| "reward_std": 0.49015025421977043, | |
| "rewards/cosine_scaled_reward": -0.0253047663718462, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2861.7916870117188, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.21802686154842377, | |
| "kl": 0.001120924949645996, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0, | |
| "reward": 0.12055499106645584, | |
| "reward_std": 0.8332936242222786, | |
| "rewards/cosine_scaled_reward": -0.0109476950019598, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3038.4583587646484, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.17514924705028534, | |
| "kl": 0.0005573034286499023, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": -0.07062571635469794, | |
| "reward_std": 0.6698406562209129, | |
| "rewards/cosine_scaled_reward": -0.08270325418561697, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2691.1250228881836, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.2149229645729065, | |
| "kl": 0.000943649560213089, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0, | |
| "reward": -0.06578723713755608, | |
| "reward_std": 0.8007752932608128, | |
| "rewards/cosine_scaled_reward": -0.16101466468535364, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3023.562545776367, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.1930156946182251, | |
| "kl": 0.0017292499542236328, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05401752423495054, | |
| "reward_std": 0.6248798258602619, | |
| "rewards/cosine_scaled_reward": -0.012754572555422783, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3057.2083740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.17147748172283173, | |
| "kl": 0.000888526439666748, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.12023577280342579, | |
| "reward_std": 1.0201664790511131, | |
| "rewards/cosine_scaled_reward": -0.015141034498810768, | |
| "rewards/format_reward": 0.41666668094694614, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2712.000030517578, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.2709903419017792, | |
| "kl": 0.0015894174575805664, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06785004865378141, | |
| "reward_std": 0.7393594160676003, | |
| "rewards/cosine_scaled_reward": -0.029724635183811188, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2551.7500228881836, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.23112726211547852, | |
| "kl": 0.002204298973083496, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0001, | |
| "reward": -0.08507447713054717, | |
| "reward_std": 0.6186108030378819, | |
| "rewards/cosine_scaled_reward": -0.19081172411097214, | |
| "rewards/format_reward": 0.5416666846722364, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2909.4583587646484, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.20251956582069397, | |
| "kl": 0.002556443214416504, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0001, | |
| "reward": 0.06335902085993439, | |
| "reward_std": 0.6296372078359127, | |
| "rewards/cosine_scaled_reward": -0.025603776797652245, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3241.979179382324, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.19871990382671356, | |
| "kl": 0.0013921260833740234, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0001, | |
| "reward": -0.24349494744092226, | |
| "reward_std": 0.7026082035154104, | |
| "rewards/cosine_scaled_reward": -0.13611777569167316, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 2976.6041870117188, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.20164498686790466, | |
| "kl": 0.0022377967834472656, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0001, | |
| "reward": -0.2154933400452137, | |
| "reward_std": 0.7364092990756035, | |
| "rewards/cosine_scaled_reward": -0.16451909206807613, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2854.416732788086, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.18143188953399658, | |
| "kl": 0.001271367073059082, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0001, | |
| "reward": 0.48517339397221804, | |
| "reward_std": 1.1588403210043907, | |
| "rewards/cosine_scaled_reward": 0.1642640804639086, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2527.312530517578, | |
| "epoch": 0.136, | |
| "grad_norm": 0.2451496571302414, | |
| "kl": 0.004798531532287598, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0002, | |
| "reward": 0.31419147294946015, | |
| "reward_std": 0.5915131606161594, | |
| "rewards/cosine_scaled_reward": 0.07133831456303596, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2332.937526702881, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.23065590858459473, | |
| "kl": 0.0033817291259765625, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3343121665529907, | |
| "reward_std": 0.7079905085265636, | |
| "rewards/cosine_scaled_reward": 0.032319008372724056, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1775.125015258789, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.2634807825088501, | |
| "kl": 0.004510760307312012, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0002, | |
| "reward": 0.29028519266285, | |
| "reward_std": 0.6599521674215794, | |
| "rewards/cosine_scaled_reward": -0.04384784400463104, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2823.0208740234375, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.18922610580921173, | |
| "kl": 0.0015087127685546875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09716521622613072, | |
| "reward_std": 0.784220265224576, | |
| "rewards/cosine_scaled_reward": -0.04663149267435074, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2748.3125610351562, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.18109041452407837, | |
| "kl": 0.001852273941040039, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0001, | |
| "reward": 0.029360684799030423, | |
| "reward_std": 0.6506396494805813, | |
| "rewards/cosine_scaled_reward": -0.08873580838553607, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2168.9583740234375, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.21630355715751648, | |
| "kl": 0.004555702209472656, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0002, | |
| "reward": 0.06176229752600193, | |
| "reward_std": 0.7377404235303402, | |
| "rewards/cosine_scaled_reward": -0.12269194982945919, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2883.666702270508, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.21693550050258636, | |
| "kl": 0.001661539077758789, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0001, | |
| "reward": -0.012577313929796219, | |
| "reward_std": 0.5609125196933746, | |
| "rewards/cosine_scaled_reward": -0.0033534724498167634, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2796.4584045410156, | |
| "epoch": 0.144, | |
| "grad_norm": 0.16647285223007202, | |
| "kl": 0.0015259981155395508, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0001, | |
| "reward": 0.02252695895731449, | |
| "reward_std": 0.5781615749001503, | |
| "rewards/cosine_scaled_reward": -0.0809040479362011, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2886.479202270508, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.2508350610733032, | |
| "kl": 0.0021495819091796875, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0001, | |
| "reward": -0.1614384746644646, | |
| "reward_std": 0.5212756060063839, | |
| "rewards/cosine_scaled_reward": -0.16197758680209517, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2887.708366394043, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.20055890083312988, | |
| "kl": 0.0028142929077148438, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": 0.24019910395145416, | |
| "reward_std": 0.7546846177428961, | |
| "rewards/cosine_scaled_reward": 0.09899781085550785, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3410.8333740234375, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.18904834985733032, | |
| "kl": 0.002665996551513672, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0001, | |
| "reward": -0.25656406581401825, | |
| "reward_std": 0.6125860698521137, | |
| "rewards/cosine_scaled_reward": -0.12984570860862732, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2878.7500228881836, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.18152809143066406, | |
| "kl": 0.0019643306732177734, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": -0.2177959904074669, | |
| "reward_std": 0.5922619923949242, | |
| "rewards/cosine_scaled_reward": -0.16636610962450504, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2894.3125228881836, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.19060392677783966, | |
| "kl": 0.0039920806884765625, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15484683774411678, | |
| "reward_std": 0.8720951899886131, | |
| "rewards/cosine_scaled_reward": 0.04138324782252312, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2647.1250762939453, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.2062901258468628, | |
| "kl": 0.002198457717895508, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0001, | |
| "reward": 0.21524347737431526, | |
| "reward_std": 0.9688759073615074, | |
| "rewards/cosine_scaled_reward": 0.02236761897802353, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3305.375030517578, | |
| "epoch": 0.152, | |
| "grad_norm": 0.19573596119880676, | |
| "kl": 0.002875089645385742, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0001, | |
| "reward": -0.3930224981158972, | |
| "reward_std": 0.40591937862336636, | |
| "rewards/cosine_scaled_reward": -0.22959117405116558, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2513.2291870117188, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.22160597145557404, | |
| "kl": 0.0038213729858398438, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0002, | |
| "reward": 0.14458692615153268, | |
| "reward_std": 0.6939279437065125, | |
| "rewards/cosine_scaled_reward": -0.046293994411826134, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2181.979217529297, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.2493942677974701, | |
| "kl": 0.0032486915588378906, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.49393612146377563, | |
| "reward_std": 0.9125401228666306, | |
| "rewards/cosine_scaled_reward": 0.16448672115802765, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2919.604248046875, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.16584192216396332, | |
| "kl": 0.0039215087890625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0002, | |
| "reward": 0.26563066989183426, | |
| "reward_std": 0.9960127659142017, | |
| "rewards/cosine_scaled_reward": 0.06982969990349375, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2992.1458892822266, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.16871950030326843, | |
| "kl": 0.0024118423461914062, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0001, | |
| "reward": -0.12757205590605736, | |
| "reward_std": 0.6657307185232639, | |
| "rewards/cosine_scaled_reward": -0.10885505130863748, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2352.166748046875, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.23502451181411743, | |
| "kl": 0.0022039413452148438, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05814027041196823, | |
| "reward_std": 0.5509532168507576, | |
| "rewards/cosine_scaled_reward": -0.12178020738065243, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 3276.6041870117188, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.18198812007904053, | |
| "kl": 0.004057884216308594, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0002, | |
| "reward": -0.3338587903417647, | |
| "reward_std": 0.5466504357755184, | |
| "rewards/cosine_scaled_reward": -0.21947086788713932, | |
| "rewards/format_reward": 0.27083334513008595, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2934.854202270508, | |
| "epoch": 0.16, | |
| "grad_norm": 0.230448380112648, | |
| "kl": 0.004750251770019531, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0002, | |
| "reward": -0.08668341906741261, | |
| "reward_std": 0.6167776882648468, | |
| "rewards/cosine_scaled_reward": -0.07037415914237499, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2843.229248046875, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.22269631922245026, | |
| "kl": 0.004426002502441406, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0002, | |
| "reward": 0.011526300571858883, | |
| "reward_std": 0.9608168490231037, | |
| "rewards/cosine_scaled_reward": -0.12784388910222333, | |
| "rewards/format_reward": 0.500000013038516, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2842.2084197998047, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.201444610953331, | |
| "kl": 0.003329753875732422, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0001, | |
| "reward": 0.07312965765595436, | |
| "reward_std": 0.7833281680941582, | |
| "rewards/cosine_scaled_reward": -0.09690700098872185, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2778.687530517578, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.2938580811023712, | |
| "kl": 0.0045318603515625, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0002, | |
| "reward": -0.14028875157237053, | |
| "reward_std": 0.5204294696450233, | |
| "rewards/cosine_scaled_reward": -0.14672685600817204, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2723.229217529297, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.2279835343360901, | |
| "kl": 0.0032444000244140625, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0001, | |
| "reward": 0.12287670839577913, | |
| "reward_std": 0.7171412445604801, | |
| "rewards/cosine_scaled_reward": 0.005602353252470493, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2038.9584007263184, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.3204021155834198, | |
| "kl": 0.004198551177978516, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3942429169546813, | |
| "reward_std": 0.6445030942559242, | |
| "rewards/cosine_scaled_reward": 0.07814209163188934, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2138.0209045410156, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.20871621370315552, | |
| "kl": 0.0025124549865722656, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.16610028222203255, | |
| "reward_std": 0.7146016918122768, | |
| "rewards/cosine_scaled_reward": -0.11062343697994947, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2778.729179382324, | |
| "epoch": 0.168, | |
| "grad_norm": 0.20433104038238525, | |
| "kl": 0.004992485046386719, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0002, | |
| "reward": -0.17755376175045967, | |
| "reward_std": 0.5958786718547344, | |
| "rewards/cosine_scaled_reward": -0.16063082218170166, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2253.812545776367, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.25221163034439087, | |
| "kl": 0.0036783218383789062, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0001, | |
| "reward": 0.21890676906332374, | |
| "reward_std": 0.684816125780344, | |
| "rewards/cosine_scaled_reward": -0.01959683746099472, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2701.104232788086, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.19634747505187988, | |
| "kl": 0.002753734588623047, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0001, | |
| "reward": 0.22702318988740444, | |
| "reward_std": 0.8613481521606445, | |
| "rewards/cosine_scaled_reward": 0.003744029439985752, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2753.479202270508, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.22827665507793427, | |
| "kl": 0.0054454803466796875, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0002, | |
| "reward": 0.10332798771560192, | |
| "reward_std": 0.8186976052820683, | |
| "rewards/cosine_scaled_reward": -0.04311029799282551, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2541.979202270508, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.18608002364635468, | |
| "kl": 0.004375934600830078, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2210833989083767, | |
| "reward_std": 0.9012398943305016, | |
| "rewards/cosine_scaled_reward": -0.0220514964312315, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2626.9791870117188, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.232888326048851, | |
| "kl": 0.003198862075805664, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0001, | |
| "reward": -0.07092434912919998, | |
| "reward_std": 0.635146826505661, | |
| "rewards/cosine_scaled_reward": -0.11896505579352379, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2805.5625610351562, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.27297356724739075, | |
| "kl": 0.00728607177734375, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0003, | |
| "reward": -0.17417170573025942, | |
| "reward_std": 0.6839433200657368, | |
| "rewards/cosine_scaled_reward": -0.18635874427855015, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3413.7709045410156, | |
| "epoch": 0.176, | |
| "grad_norm": 0.14731697738170624, | |
| "kl": 0.0031638145446777344, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.05796261690557003, | |
| "reward_std": 1.0252362191677094, | |
| "rewards/cosine_scaled_reward": -0.010732692433521152, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2530.604217529297, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.1957230567932129, | |
| "kl": 0.0033855438232421875, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0001, | |
| "reward": 0.001878822222352028, | |
| "reward_std": 0.5804712846875191, | |
| "rewards/cosine_scaled_reward": -0.07664009183645248, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2793.583351135254, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.17240341007709503, | |
| "kl": 0.0031371116638183594, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0001, | |
| "reward": 0.20785324275493622, | |
| "reward_std": 0.9210717603564262, | |
| "rewards/cosine_scaled_reward": 0.044585417956113815, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2746.2917098999023, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.2299441546201706, | |
| "kl": 0.0045833587646484375, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0002, | |
| "reward": -0.09681704035028815, | |
| "reward_std": 0.5040780827403069, | |
| "rewards/cosine_scaled_reward": -0.1591328363865614, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 3003.0209350585938, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.20773041248321533, | |
| "kl": 0.003810882568359375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0002, | |
| "reward": 0.33219840307720006, | |
| "reward_std": 0.9787469133734703, | |
| "rewards/cosine_scaled_reward": 0.11911612004041672, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 2543.3958740234375, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.17278896272182465, | |
| "kl": 0.003597259521484375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0001, | |
| "reward": -0.10545414686203003, | |
| "reward_std": 0.690795011818409, | |
| "rewards/cosine_scaled_reward": -0.1695910869166255, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2490.2083740234375, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.21473944187164307, | |
| "kl": 0.005862236022949219, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2439700961112976, | |
| "reward_std": 0.8647127598524094, | |
| "rewards/cosine_scaled_reward": 0.016008037142455578, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2415.604202270508, | |
| "epoch": 0.184, | |
| "grad_norm": 0.21809473633766174, | |
| "kl": 0.004559516906738281, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0002, | |
| "reward": 0.08977567870169878, | |
| "reward_std": 0.6846508830785751, | |
| "rewards/cosine_scaled_reward": -0.12287470698356628, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3056.791702270508, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.20735777914524078, | |
| "kl": 0.006768226623535156, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0003, | |
| "reward": -0.18869051523506641, | |
| "reward_std": 0.6717521250247955, | |
| "rewards/cosine_scaled_reward": -0.13688376732170582, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2427.437545776367, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.24365104734897614, | |
| "kl": 0.004721641540527344, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4920094236731529, | |
| "reward_std": 0.755946584045887, | |
| "rewards/cosine_scaled_reward": 0.16996393306180835, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2403.041732788086, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.19330641627311707, | |
| "kl": 0.0048122406005859375, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0002, | |
| "reward": 0.08846932090818882, | |
| "reward_std": 0.7613176181912422, | |
| "rewards/cosine_scaled_reward": -0.06676248833537102, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2565.2083435058594, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.22222936153411865, | |
| "kl": 0.0052661895751953125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0002, | |
| "reward": -0.24459386244416237, | |
| "reward_std": 0.4413093514740467, | |
| "rewards/cosine_scaled_reward": -0.2269913526251912, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2543.312515258789, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.18792009353637695, | |
| "kl": 0.003421306610107422, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0001, | |
| "reward": 0.20099905133247375, | |
| "reward_std": 0.7041075564920902, | |
| "rewards/cosine_scaled_reward": 0.01856833230704069, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2143.854232788086, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.24791951477527618, | |
| "kl": 0.003101825714111328, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2092342609539628, | |
| "reward_std": 0.6548982411623001, | |
| "rewards/cosine_scaled_reward": -0.09624562226235867, | |
| "rewards/format_reward": 0.7500000186264515, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2693.3333740234375, | |
| "epoch": 0.192, | |
| "grad_norm": 0.19963470101356506, | |
| "kl": 0.0033502578735351562, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2024577334523201, | |
| "reward_std": 0.8647420592606068, | |
| "rewards/cosine_scaled_reward": 0.00969130964949727, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1926.437515258789, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.20806600153446198, | |
| "kl": 0.003902435302734375, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7901292243041098, | |
| "reward_std": 0.9031771421432495, | |
| "rewards/cosine_scaled_reward": 0.24018706334754825, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2477.1042098999023, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.19945000112056732, | |
| "kl": 0.0035648345947265625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0001, | |
| "reward": 0.14062727987766266, | |
| "reward_std": 0.6545419208705425, | |
| "rewards/cosine_scaled_reward": -0.010002967901527882, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2406.479202270508, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.19067376852035522, | |
| "kl": 0.0029478073120117188, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1707976944744587, | |
| "reward_std": 0.7575587891042233, | |
| "rewards/cosine_scaled_reward": 0.005063103046268225, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2873.8750228881836, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.2227620929479599, | |
| "kl": 0.006169319152832031, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2180697526782751, | |
| "reward_std": 0.8882052935659885, | |
| "rewards/cosine_scaled_reward": 0.07928902423009276, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1677.6875076293945, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.2608514428138733, | |
| "kl": 0.0040760040283203125, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0002, | |
| "reward": 0.11270578391849995, | |
| "reward_std": 0.7172344215214252, | |
| "rewards/cosine_scaled_reward": -0.1393726442474872, | |
| "rewards/format_reward": 0.6875, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2115.6458740234375, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.21290229260921478, | |
| "kl": 0.006714820861816406, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4762940816581249, | |
| "reward_std": 0.8587245345115662, | |
| "rewards/cosine_scaled_reward": 0.08201689831912518, | |
| "rewards/format_reward": 0.7291666734963655, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2647.6041870117188, | |
| "epoch": 0.2, | |
| "grad_norm": 0.15685485303401947, | |
| "kl": 0.004894256591796875, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2985331416130066, | |
| "reward_std": 0.5968715883791447, | |
| "rewards/cosine_scaled_reward": 0.10642770305275917, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 1974.5625686645508, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.28263431787490845, | |
| "kl": 0.0044193267822265625, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1971809258684516, | |
| "reward_std": 0.8409126959741116, | |
| "rewards/cosine_scaled_reward": -0.0763177121989429, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2667.4583740234375, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.2523181140422821, | |
| "kl": 0.005799293518066406, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0002, | |
| "reward": 0.11005790415219963, | |
| "reward_std": 0.6982481181621552, | |
| "rewards/cosine_scaled_reward": -0.059121566824615, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2512.770881652832, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.24519820511341095, | |
| "kl": 0.0081329345703125, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2254039328545332, | |
| "reward_std": 0.8539987355470657, | |
| "rewards/cosine_scaled_reward": -0.00572938984259963, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2653.083335876465, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.3077125549316406, | |
| "kl": 0.004204273223876953, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0002, | |
| "reward": 0.029737239703536034, | |
| "reward_std": 0.7201298326253891, | |
| "rewards/cosine_scaled_reward": -0.08826536871492863, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 1912.3333740234375, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.2891172468662262, | |
| "kl": 0.006420135498046875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3585444991476834, | |
| "reward_std": 0.922141257673502, | |
| "rewards/cosine_scaled_reward": 0.023937703692354262, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2936.9583435058594, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.21322257816791534, | |
| "kl": 0.005759239196777344, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0002, | |
| "reward": -0.0014306185767054558, | |
| "reward_std": 0.7736919745802879, | |
| "rewards/cosine_scaled_reward": -0.05711588263511658, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2112.875030517578, | |
| "epoch": 0.208, | |
| "grad_norm": 0.18484172224998474, | |
| "kl": 0.0020971298217773438, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.22709017619490623, | |
| "reward_std": 0.7963708490133286, | |
| "rewards/cosine_scaled_reward": -0.04323145607486367, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1799.1042175292969, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.2422093003988266, | |
| "kl": 0.007781982421875, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4508074652403593, | |
| "reward_std": 0.8603553548455238, | |
| "rewards/cosine_scaled_reward": 0.010005924385040998, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2360.8541870117188, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.26070070266723633, | |
| "kl": 0.00511932373046875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0002, | |
| "reward": -0.12935980968177319, | |
| "reward_std": 0.5107476897537708, | |
| "rewards/cosine_scaled_reward": -0.21540038008242846, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2236.8542098999023, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.22640132904052734, | |
| "kl": 0.004220008850097656, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0002, | |
| "reward": -0.22387782018631697, | |
| "reward_std": 0.3759702183306217, | |
| "rewards/cosine_scaled_reward": -0.2601937036961317, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2573.1667098999023, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.18543511629104614, | |
| "kl": 0.005625724792480469, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0002, | |
| "reward": -0.0019482946954667568, | |
| "reward_std": 0.6483882665634155, | |
| "rewards/cosine_scaled_reward": -0.08200034685432911, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 1852.8958587646484, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.2344105988740921, | |
| "kl": 0.005881786346435547, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0002, | |
| "reward": 0.061095200944691896, | |
| "reward_std": 0.5442762821912766, | |
| "rewards/cosine_scaled_reward": -0.1909130923449993, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 2904.625030517578, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.17529600858688354, | |
| "kl": 0.0055294036865234375, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0002, | |
| "reward": -0.289316936628893, | |
| "reward_std": 0.5605541579425335, | |
| "rewards/cosine_scaled_reward": -0.2442341128771659, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2011.3750457763672, | |
| "epoch": 0.216, | |
| "grad_norm": 0.5071147084236145, | |
| "kl": 0.0047817230224609375, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0002, | |
| "reward": 0.18212716095149517, | |
| "reward_std": 0.6630742475390434, | |
| "rewards/cosine_scaled_reward": -0.06412508455105126, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2108.0000610351562, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.20937705039978027, | |
| "kl": 0.005312919616699219, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4989317310974002, | |
| "reward_std": 0.8167940117418766, | |
| "rewards/cosine_scaled_reward": 0.11576890759170055, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1798.833381652832, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.24339546263217926, | |
| "kl": 0.005016326904296875, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3590816780924797, | |
| "reward_std": 0.886138778179884, | |
| "rewards/cosine_scaled_reward": 0.0189095304813236, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2366.5000762939453, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.20198509097099304, | |
| "kl": 0.0047245025634765625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0002, | |
| "reward": -0.017504574730992317, | |
| "reward_std": 0.6682247072458267, | |
| "rewards/cosine_scaled_reward": -0.1821177799720317, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2782.4792098999023, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.33190837502479553, | |
| "kl": 0.0082855224609375, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0003, | |
| "reward": -0.09848117176443338, | |
| "reward_std": 0.7263169325888157, | |
| "rewards/cosine_scaled_reward": -0.16758773289620876, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2497.958396911621, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.20004087686538696, | |
| "kl": 0.005490303039550781, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4655712964013219, | |
| "reward_std": 0.936052493751049, | |
| "rewards/cosine_scaled_reward": 0.11659512436017394, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2314.0000915527344, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.23152968287467957, | |
| "kl": 0.0047397613525390625, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0002, | |
| "reward": 0.33897530660033226, | |
| "reward_std": 0.9344584122300148, | |
| "rewards/cosine_scaled_reward": 0.003916108049452305, | |
| "rewards/format_reward": 0.6875000167638063, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3108.625030517578, | |
| "epoch": 0.224, | |
| "grad_norm": 0.17755372822284698, | |
| "kl": 0.006099700927734375, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0002, | |
| "reward": 0.04909700155258179, | |
| "reward_std": 0.5633429922163486, | |
| "rewards/cosine_scaled_reward": -0.03188931196928024, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 1294.9583549499512, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.26458728313446045, | |
| "kl": 0.0051975250244140625, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0002, | |
| "reward": 0.580374225974083, | |
| "reward_std": 0.8097280263900757, | |
| "rewards/cosine_scaled_reward": 0.0928502269089222, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2055.2500381469727, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.2084938883781433, | |
| "kl": 0.0077152252197265625, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2619430273771286, | |
| "reward_std": 0.6607060171663761, | |
| "rewards/cosine_scaled_reward": -0.01983140129595995, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2145.4584045410156, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.24516309797763824, | |
| "kl": 0.007678985595703125, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0003, | |
| "reward": 0.267922455444932, | |
| "reward_std": 0.8017127625644207, | |
| "rewards/cosine_scaled_reward": -0.057456295005977154, | |
| "rewards/format_reward": 0.7291666846722364, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1690.3958892822266, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.25067099928855896, | |
| "kl": 0.004558563232421875, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4661361053586006, | |
| "reward_std": 0.831662617623806, | |
| "rewards/cosine_scaled_reward": 0.043831199407577515, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1972.2083854675293, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.30941376090049744, | |
| "kl": 0.0045299530029296875, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9293034709990025, | |
| "reward_std": 0.8486984223127365, | |
| "rewards/cosine_scaled_reward": 0.35864230850711465, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1596.5625305175781, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.20324425399303436, | |
| "kl": 0.0039119720458984375, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5681986985728145, | |
| "reward_std": 0.5084148272871971, | |
| "rewards/cosine_scaled_reward": 0.16256619803607464, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 2294.5208435058594, | |
| "epoch": 0.232, | |
| "grad_norm": 0.2826099097728729, | |
| "kl": 0.006962776184082031, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0003, | |
| "reward": 0.20878975582309067, | |
| "reward_std": 0.8034153431653976, | |
| "rewards/cosine_scaled_reward": -0.03315589763224125, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1520.5000381469727, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.3326990306377411, | |
| "kl": 0.005408287048339844, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5824265358969569, | |
| "reward_std": 0.7743073143064976, | |
| "rewards/cosine_scaled_reward": 0.06728124246001244, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1976.6250534057617, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.2846101224422455, | |
| "kl": 0.004940032958984375, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0002, | |
| "reward": 0.47044914215803146, | |
| "reward_std": 0.9910449758172035, | |
| "rewards/cosine_scaled_reward": 0.09328299947082996, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2566.541702270508, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.22673514485359192, | |
| "kl": 0.0060405731201171875, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0002, | |
| "reward": -0.0411979085765779, | |
| "reward_std": 0.6954502202570438, | |
| "rewards/cosine_scaled_reward": -0.14637301303446293, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1979.3125457763672, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.2924785315990448, | |
| "kl": 0.007213592529296875, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3043129206635058, | |
| "reward_std": 0.830761406570673, | |
| "rewards/cosine_scaled_reward": -0.0667324224486947, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1684.020851135254, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.21258480846881866, | |
| "kl": 0.0056591033935546875, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4881124454550445, | |
| "reward_std": 0.6019946355372667, | |
| "rewards/cosine_scaled_reward": 0.062041960656642914, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1827.8125457763672, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.26896902918815613, | |
| "kl": 0.00604248046875, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4111206401139498, | |
| "reward_std": 0.7256677895784378, | |
| "rewards/cosine_scaled_reward": 0.03580674855038524, | |
| "rewards/format_reward": 0.7500000037252903, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1963.2917175292969, | |
| "epoch": 0.24, | |
| "grad_norm": 0.18104591965675354, | |
| "kl": 0.0045299530029296875, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3350107278674841, | |
| "reward_std": 0.733224093914032, | |
| "rewards/cosine_scaled_reward": -0.016452430087156245, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 1673.8750228881836, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.2317248284816742, | |
| "kl": 0.005611419677734375, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4623087588697672, | |
| "reward_std": 0.633669501170516, | |
| "rewards/cosine_scaled_reward": 0.09324477170594037, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1575.3125457763672, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.26243215799331665, | |
| "kl": 0.0056781768798828125, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0002, | |
| "reward": 0.524188793846406, | |
| "reward_std": 0.9563748612999916, | |
| "rewards/cosine_scaled_reward": 0.07078039133921266, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1991.3750686645508, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.2917014956474304, | |
| "kl": 0.009765625, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0004, | |
| "reward": 0.44908707588911057, | |
| "reward_std": 0.7470048144459724, | |
| "rewards/cosine_scaled_reward": 0.06995298992842436, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2065.70841217041, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.2539423108100891, | |
| "kl": 0.00661468505859375, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0003, | |
| "reward": 0.48666782677173615, | |
| "reward_std": 0.8372413367033005, | |
| "rewards/cosine_scaled_reward": 0.07745558395981789, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1527.125015258789, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.22233973443508148, | |
| "kl": 0.0035753250122070312, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0001, | |
| "reward": 0.27787265577353537, | |
| "reward_std": 0.5949838422238827, | |
| "rewards/cosine_scaled_reward": -0.09243576228618622, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1395.00004196167, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.35132068395614624, | |
| "kl": 0.00598907470703125, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0002, | |
| "reward": 0.70725142583251, | |
| "reward_std": 0.6296214163303375, | |
| "rewards/cosine_scaled_reward": 0.16727344412356615, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1762.5416946411133, | |
| "epoch": 0.248, | |
| "grad_norm": 0.18891237676143646, | |
| "kl": 0.005519866943359375, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4034770044963807, | |
| "reward_std": 0.6970177218317986, | |
| "rewards/cosine_scaled_reward": 0.04039741773158312, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1985.6875305175781, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.2248099148273468, | |
| "kl": 0.005970954895019531, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15873102471232414, | |
| "reward_std": 0.6960118226706982, | |
| "rewards/cosine_scaled_reward": -0.12544306740164757, | |
| "rewards/format_reward": 0.7291666697710752, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1378.6666946411133, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.26156318187713623, | |
| "kl": 0.0061511993408203125, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0002, | |
| "reward": 0.537579414434731, | |
| "reward_std": 0.7653206884860992, | |
| "rewards/cosine_scaled_reward": 0.07624521851539612, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1764.0000228881836, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.2845953404903412, | |
| "kl": 0.006932258605957031, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0003, | |
| "reward": 0.02811681106686592, | |
| "reward_std": 0.4322117939591408, | |
| "rewards/cosine_scaled_reward": -0.2192363552749157, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1456.3958740234375, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.22674359381198883, | |
| "kl": 0.0051517486572265625, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0002, | |
| "reward": 0.47192759346216917, | |
| "reward_std": 0.6820324845612049, | |
| "rewards/cosine_scaled_reward": 0.045280902180820704, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1963.2917175292969, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.21307772397994995, | |
| "kl": 0.005227088928222656, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0002, | |
| "reward": 0.42855058796703815, | |
| "reward_std": 0.7586027830839157, | |
| "rewards/cosine_scaled_reward": 0.055503834038972855, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1937.0208740234375, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.26616960763931274, | |
| "kl": 0.0052585601806640625, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2477805533562787, | |
| "reward_std": 0.5524286925792694, | |
| "rewards/cosine_scaled_reward": -0.013783978298306465, | |
| "rewards/format_reward": 0.6458333376795053, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 1899.812515258789, | |
| "epoch": 0.256, | |
| "grad_norm": 0.18020717799663544, | |
| "kl": 0.00457000732421875, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4362371205352247, | |
| "reward_std": 0.8643104657530785, | |
| "rewards/cosine_scaled_reward": -0.009671762585639954, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2635.125045776367, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.2361401468515396, | |
| "kl": 0.011203765869140625, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0004, | |
| "reward": 0.1398951131850481, | |
| "reward_std": 0.7698212433606386, | |
| "rewards/cosine_scaled_reward": -0.0572519232518971, | |
| "rewards/format_reward": 0.5625, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1900.4375534057617, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.21450620889663696, | |
| "kl": 0.0057201385498046875, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5856498526409268, | |
| "reward_std": 0.7293794807046652, | |
| "rewards/cosine_scaled_reward": 0.13076626230031252, | |
| "rewards/format_reward": 0.7916666734963655, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1369.208366394043, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.323088675737381, | |
| "kl": 0.008134841918945312, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4275068351998925, | |
| "reward_std": 0.8954740911722183, | |
| "rewards/cosine_scaled_reward": -0.019443090073764324, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1602.8750343322754, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.2612713873386383, | |
| "kl": 0.0060291290283203125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0002, | |
| "reward": 0.46579291112720966, | |
| "reward_std": 0.7513364851474762, | |
| "rewards/cosine_scaled_reward": 0.05927048996090889, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1817.6041870117188, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.2555846869945526, | |
| "kl": 0.0076618194580078125, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0003, | |
| "reward": 0.22703039785847068, | |
| "reward_std": 0.6141838692128658, | |
| "rewards/cosine_scaled_reward": -0.08688993845134974, | |
| "rewards/format_reward": 0.750000013038516, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 1947.083396911621, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.22352339327335358, | |
| "kl": 0.0059909820556640625, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0002, | |
| "reward": 0.030418872833251953, | |
| "reward_std": 0.5589127205312252, | |
| "rewards/cosine_scaled_reward": -0.2263335685711354, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1575.8125381469727, | |
| "epoch": 0.264, | |
| "grad_norm": 0.24202732741832733, | |
| "kl": 0.0063457489013671875, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5021250182762742, | |
| "reward_std": 0.6612259335815907, | |
| "rewards/cosine_scaled_reward": 0.03393824491649866, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 2361.145881652832, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.28824689984321594, | |
| "kl": 0.008266448974609375, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0003, | |
| "reward": -0.13501371257007122, | |
| "reward_std": 0.600494496524334, | |
| "rewards/cosine_scaled_reward": -0.24766290560364723, | |
| "rewards/format_reward": 0.5833333376795053, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1480.9167022705078, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.32725366950035095, | |
| "kl": 0.006622314453125, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0003, | |
| "reward": 0.22992298612371087, | |
| "reward_std": 0.7725229002535343, | |
| "rewards/cosine_scaled_reward": -0.15780717965390068, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 1913.5208854675293, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.27367836236953735, | |
| "kl": 0.008426666259765625, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0003, | |
| "reward": 0.17465345282107592, | |
| "reward_std": 0.7128382474184036, | |
| "rewards/cosine_scaled_reward": -0.08376146724913269, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1466.6458892822266, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.25469592213630676, | |
| "kl": 0.00775909423828125, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6011685915291309, | |
| "reward_std": 0.7442786023020744, | |
| "rewards/cosine_scaled_reward": 0.11913689319044352, | |
| "rewards/format_reward": 0.833333333954215, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1939.0417175292969, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.23096546530723572, | |
| "kl": 0.0063419342041015625, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0003, | |
| "reward": 0.33170187287032604, | |
| "reward_std": 0.7268010787665844, | |
| "rewards/cosine_scaled_reward": -0.018086417112499475, | |
| "rewards/format_reward": 0.7500000018626451, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1496.2083740234375, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.25223881006240845, | |
| "kl": 0.0052013397216796875, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5692060198634863, | |
| "reward_std": 0.5697116628289223, | |
| "rewards/cosine_scaled_reward": 0.09943875670433044, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1303.5625228881836, | |
| "epoch": 0.272, | |
| "grad_norm": 0.23987644910812378, | |
| "kl": 0.00640106201171875, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6454257536679506, | |
| "reward_std": 0.8430968932807446, | |
| "rewards/cosine_scaled_reward": 0.09218539297580719, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1533.0833473205566, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.22352874279022217, | |
| "kl": 0.004787445068359375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7365382118150592, | |
| "reward_std": 0.6625222954899073, | |
| "rewards/cosine_scaled_reward": 0.24717165902256966, | |
| "rewards/format_reward": 0.770833333954215, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1882.8125381469727, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.35037240386009216, | |
| "kl": 0.010135650634765625, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0004, | |
| "reward": 0.04069505538791418, | |
| "reward_std": 0.6465389877557755, | |
| "rewards/cosine_scaled_reward": -0.16988872209913097, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 2043.1667175292969, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.2375987321138382, | |
| "kl": 0.009412765502929688, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0004, | |
| "reward": -0.024178337305784225, | |
| "reward_std": 0.5009024143218994, | |
| "rewards/cosine_scaled_reward": -0.20676567568443716, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1260.9792022705078, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.29835185408592224, | |
| "kl": 0.010770797729492188, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3275939063169062, | |
| "reward_std": 0.5498746670782566, | |
| "rewards/cosine_scaled_reward": -0.09544416703283787, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1774.5000534057617, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.19600969552993774, | |
| "kl": 0.0063343048095703125, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0003, | |
| "reward": 0.43449581041932106, | |
| "reward_std": 0.7577872760593891, | |
| "rewards/cosine_scaled_reward": 0.026275813579559326, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1623.8750457763672, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.22528576850891113, | |
| "kl": 0.0068035125732421875, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7132167363015469, | |
| "reward_std": 0.7599900439381599, | |
| "rewards/cosine_scaled_reward": 0.18282007612287998, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1842.708381652832, | |
| "epoch": 0.28, | |
| "grad_norm": 0.23914141952991486, | |
| "kl": 0.006076812744140625, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0002, | |
| "reward": 0.589666100917384, | |
| "reward_std": 0.911857221275568, | |
| "rewards/cosine_scaled_reward": 0.11452391929924488, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1442.0625457763672, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.22671173512935638, | |
| "kl": 0.007045745849609375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3358262628316879, | |
| "reward_std": 0.5915343575179577, | |
| "rewards/cosine_scaled_reward": -0.07317247241735458, | |
| "rewards/format_reward": 0.8750000037252903, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 2208.770866394043, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.29410502314567566, | |
| "kl": 0.010019302368164062, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0004, | |
| "reward": -0.014087029732763767, | |
| "reward_std": 0.6953110322356224, | |
| "rewards/cosine_scaled_reward": -0.19972090609371662, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1561.9583740234375, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.2551148533821106, | |
| "kl": 0.007082939147949219, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7867583259940147, | |
| "reward_std": 0.7739764004945755, | |
| "rewards/cosine_scaled_reward": 0.28318586223758757, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1342.2917022705078, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.2254449427127838, | |
| "kl": 0.00762939453125, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7728391233831644, | |
| "reward_std": 0.6931521892547607, | |
| "rewards/cosine_scaled_reward": 0.20786779932677746, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1144.3750305175781, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.4075692892074585, | |
| "kl": 0.008514404296875, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4015044257976115, | |
| "reward_std": 0.511387325823307, | |
| "rewards/cosine_scaled_reward": -0.07808645971817896, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1381.1459045410156, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.32429951429367065, | |
| "kl": 0.0106201171875, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0004, | |
| "reward": 0.26284962613135576, | |
| "reward_std": 0.8155036717653275, | |
| "rewards/cosine_scaled_reward": -0.09845332545228302, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1654.3750457763672, | |
| "epoch": 0.288, | |
| "grad_norm": 0.2845689356327057, | |
| "kl": 0.009103775024414062, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0004, | |
| "reward": 0.05027025658637285, | |
| "reward_std": 0.4697149991989136, | |
| "rewards/cosine_scaled_reward": -0.1850506253540516, | |
| "rewards/format_reward": 0.7291666753590107, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1660.5208740234375, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.2830834984779358, | |
| "kl": 0.010486602783203125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0004, | |
| "reward": 0.18601116666104645, | |
| "reward_std": 0.66238809004426, | |
| "rewards/cosine_scaled_reward": -0.1478569945320487, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1486.208339691162, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.2770550847053528, | |
| "kl": 0.008548736572265625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3256164574995637, | |
| "reward_std": 0.72544976323843, | |
| "rewards/cosine_scaled_reward": -0.07894822582602501, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2314.5416946411133, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.32598185539245605, | |
| "kl": 0.01126861572265625, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0005, | |
| "reward": -0.04141565319150686, | |
| "reward_std": 0.7138533964753151, | |
| "rewards/cosine_scaled_reward": -0.21306942123919725, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1553.5208740234375, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.2590584456920624, | |
| "kl": 0.008270263671875, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5183277567848563, | |
| "reward_std": 0.539212403818965, | |
| "rewards/cosine_scaled_reward": 0.05446232855319977, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 1969.1042175292969, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.2547340989112854, | |
| "kl": 0.007389068603515625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6981238089501858, | |
| "reward_std": 0.957792617380619, | |
| "rewards/cosine_scaled_reward": 0.20680510997772217, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1860.1042175292969, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.2526947855949402, | |
| "kl": 0.006866455078125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3261422934010625, | |
| "reward_std": 0.8735288791358471, | |
| "rewards/cosine_scaled_reward": -0.07447653356939554, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 1497.6667137145996, | |
| "epoch": 0.296, | |
| "grad_norm": 0.2954126298427582, | |
| "kl": 0.009616851806640625, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4608106706291437, | |
| "reward_std": 0.6903665885329247, | |
| "rewards/cosine_scaled_reward": 0.035531939938664436, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1280.8958778381348, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.2724701166152954, | |
| "kl": 0.0075778961181640625, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6839517981279641, | |
| "reward_std": 0.8328232653439045, | |
| "rewards/cosine_scaled_reward": 0.13848848675843328, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2193.687515258789, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.18936224281787872, | |
| "kl": 0.00774383544921875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0003, | |
| "reward": -0.03597256541252136, | |
| "reward_std": 0.5031098667532206, | |
| "rewards/cosine_scaled_reward": -0.18370476551353931, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1862.2083740234375, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.2641579806804657, | |
| "kl": 0.0114593505859375, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0005, | |
| "reward": 0.04227437451481819, | |
| "reward_std": 0.5424713045358658, | |
| "rewards/cosine_scaled_reward": -0.1736736847087741, | |
| "rewards/format_reward": 0.6875000037252903, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1353.3125228881836, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.2091461718082428, | |
| "kl": 0.00551605224609375, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0002, | |
| "reward": 0.29918272816576064, | |
| "reward_std": 0.7759961858391762, | |
| "rewards/cosine_scaled_reward": -0.13049082271754742, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1441.7083740234375, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.2494620829820633, | |
| "kl": 0.0072498321533203125, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2639927687123418, | |
| "reward_std": 0.689430944621563, | |
| "rewards/cosine_scaled_reward": -0.11652671941556036, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1564.6250457763672, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.2662133276462555, | |
| "kl": 0.010656356811523438, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0004, | |
| "reward": 0.44831580482423306, | |
| "reward_std": 0.7113944664597511, | |
| "rewards/cosine_scaled_reward": -0.0021858818363398314, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1647.4375381469727, | |
| "epoch": 0.304, | |
| "grad_norm": 0.2215126007795334, | |
| "kl": 0.00804901123046875, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0003, | |
| "reward": 0.1729487591655925, | |
| "reward_std": 0.6001881808042526, | |
| "rewards/cosine_scaled_reward": -0.10871189273893833, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2309.1458740234375, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.33464720845222473, | |
| "kl": 0.013841629028320312, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0006, | |
| "reward": -0.04994155094027519, | |
| "reward_std": 0.63846031203866, | |
| "rewards/cosine_scaled_reward": -0.18117934837937355, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1347.291732788086, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.2725054919719696, | |
| "kl": 0.011320114135742188, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0005, | |
| "reward": 0.3638640786521137, | |
| "reward_std": 0.856957983225584, | |
| "rewards/cosine_scaled_reward": -0.07128076790831983, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1536.8541870117188, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.242116779088974, | |
| "kl": 0.008541107177734375, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2962089798747911, | |
| "reward_std": 0.652573972940445, | |
| "rewards/cosine_scaled_reward": -0.07353247702121735, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1723.6875305175781, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.20673726499080658, | |
| "kl": 0.00959014892578125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6585689373314381, | |
| "reward_std": 0.8569062799215317, | |
| "rewards/cosine_scaled_reward": 0.13866457249969244, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1226.4375534057617, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.3247756063938141, | |
| "kl": 0.009336471557617188, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6650833152234554, | |
| "reward_std": 0.7789544351398945, | |
| "rewards/cosine_scaled_reward": 0.11849892261670902, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 1959.395896911621, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.2230900079011917, | |
| "kl": 0.0106964111328125, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0004, | |
| "reward": 0.23697172570973635, | |
| "reward_std": 0.7450396865606308, | |
| "rewards/cosine_scaled_reward": -0.0959045309573412, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1628.1042175292969, | |
| "epoch": 0.312, | |
| "grad_norm": 0.3747721314430237, | |
| "kl": 0.01097869873046875, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4147872976027429, | |
| "reward_std": 0.6467026993632317, | |
| "rewards/cosine_scaled_reward": 0.0440314169973135, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1172.4375457763672, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.2670040726661682, | |
| "kl": 0.008930206298828125, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6682712404581252, | |
| "reward_std": 0.8026308417320251, | |
| "rewards/cosine_scaled_reward": 0.0984476669691503, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 1716.9375534057617, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.2296976000070572, | |
| "kl": 0.00942230224609375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6355814579874277, | |
| "reward_std": 0.902884915471077, | |
| "rewards/cosine_scaled_reward": 0.1562671698629856, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1406.5625457763672, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.27804121375083923, | |
| "kl": 0.012622833251953125, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0005, | |
| "reward": 0.47063780203461647, | |
| "reward_std": 0.826964907348156, | |
| "rewards/cosine_scaled_reward": 0.035029259510338306, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1531.7708892822266, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.3786030411720276, | |
| "kl": 0.0130615234375, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0005, | |
| "reward": 0.4646074064075947, | |
| "reward_std": 0.7622004933655262, | |
| "rewards/cosine_scaled_reward": 0.04781521949917078, | |
| "rewards/format_reward": 0.7916666734963655, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1472.6458740234375, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.29159483313560486, | |
| "kl": 0.012044906616210938, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0005, | |
| "reward": 0.5979791600257158, | |
| "reward_std": 0.6016073673963547, | |
| "rewards/cosine_scaled_reward": 0.13445703126490116, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1808.8750457763672, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.45572957396507263, | |
| "kl": 0.0137481689453125, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0006, | |
| "reward": 0.3207678751787171, | |
| "reward_std": 0.7218646891415119, | |
| "rewards/cosine_scaled_reward": -0.07327653095126152, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1779.0000457763672, | |
| "epoch": 0.32, | |
| "grad_norm": 0.3021228611469269, | |
| "kl": 0.011760711669921875, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0005, | |
| "reward": 0.8133369982242584, | |
| "reward_std": 0.8129135742783546, | |
| "rewards/cosine_scaled_reward": 0.2262397282756865, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2400.2708740234375, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.2547665238380432, | |
| "kl": 0.012500762939453125, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0005, | |
| "reward": -0.10774591006338596, | |
| "reward_std": 0.6432830318808556, | |
| "rewards/cosine_scaled_reward": -0.22082971967756748, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1696.4792251586914, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.3483729362487793, | |
| "kl": 0.013248443603515625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0005, | |
| "reward": 0.4830823950469494, | |
| "reward_std": 0.720955528318882, | |
| "rewards/cosine_scaled_reward": 0.053989187348634005, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2184.6250534057617, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.26255351305007935, | |
| "kl": 0.01146697998046875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6261483291164041, | |
| "reward_std": 0.8373686634004116, | |
| "rewards/cosine_scaled_reward": 0.18944203667342663, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1453.270881652832, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.26391106843948364, | |
| "kl": 0.0076885223388671875, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14405585872009397, | |
| "reward_std": 0.5576095655560493, | |
| "rewards/cosine_scaled_reward": -0.18134936597198248, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1164.7291946411133, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.40485504269599915, | |
| "kl": 0.0128326416015625, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0005, | |
| "reward": 0.3652733010239899, | |
| "reward_std": 0.7800908386707306, | |
| "rewards/cosine_scaled_reward": -0.04528142110211775, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1260.0208587646484, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.26052573323249817, | |
| "kl": 0.008724212646484375, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0003, | |
| "reward": 0.4141525523737073, | |
| "reward_std": 0.6447485722601414, | |
| "rewards/cosine_scaled_reward": -0.043887258507311344, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1336.2083702087402, | |
| "epoch": 0.328, | |
| "grad_norm": 0.3096228837966919, | |
| "kl": 0.012447357177734375, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0005, | |
| "reward": 0.5274541154503822, | |
| "reward_std": 0.7009152993559837, | |
| "rewards/cosine_scaled_reward": 0.121887655579485, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 1846.3750228881836, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.28548097610473633, | |
| "kl": 0.01815032958984375, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0007, | |
| "reward": 0.12442206963896751, | |
| "reward_std": 0.5581525340676308, | |
| "rewards/cosine_scaled_reward": -0.1310774045996368, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 1571.437557220459, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.3936711251735687, | |
| "kl": 0.012668609619140625, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0005, | |
| "reward": 0.4288631723029539, | |
| "reward_std": 0.593647625297308, | |
| "rewards/cosine_scaled_reward": 0.030754741048440337, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 898.5833473205566, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.32913920283317566, | |
| "kl": 0.009197235107421875, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7029262520372868, | |
| "reward_std": 0.8086423352360725, | |
| "rewards/cosine_scaled_reward": 0.12271453440189362, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1254.2917022705078, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.24602723121643066, | |
| "kl": 0.010408401489257812, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0004, | |
| "reward": 0.44253411889076233, | |
| "reward_std": 0.8852041102945805, | |
| "rewards/cosine_scaled_reward": -0.05237848265096545, | |
| "rewards/format_reward": 0.9375, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 1754.5000610351562, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.24916251003742218, | |
| "kl": 0.015148162841796875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0006, | |
| "reward": 0.15450574783608317, | |
| "reward_std": 0.5213648546487093, | |
| "rewards/cosine_scaled_reward": -0.1516056777909398, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 1034.2917022705078, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.31137824058532715, | |
| "kl": 0.012929916381835938, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0005, | |
| "reward": 0.4599906969233416, | |
| "reward_std": 0.5716858878731728, | |
| "rewards/cosine_scaled_reward": -0.009167976677417755, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 1682.0208740234375, | |
| "epoch": 0.336, | |
| "grad_norm": 0.27208465337753296, | |
| "kl": 0.013782501220703125, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0006, | |
| "reward": 0.3077076869085431, | |
| "reward_std": 0.777203194797039, | |
| "rewards/cosine_scaled_reward": -0.05544556397944689, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 1438.083381652832, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.4438267946243286, | |
| "kl": 0.017040252685546875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0007, | |
| "reward": 0.7139077642932534, | |
| "reward_std": 0.7298413254320621, | |
| "rewards/cosine_scaled_reward": 0.1472441926598549, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 1397.895896911621, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.3188610076904297, | |
| "kl": 0.01441192626953125, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0006, | |
| "reward": 0.47962956223636866, | |
| "reward_std": 0.759897030889988, | |
| "rewards/cosine_scaled_reward": 0.025067659094929695, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2094.354217529297, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.472922682762146, | |
| "kl": 0.0243682861328125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.001, | |
| "reward": 0.12652458110824227, | |
| "reward_std": 0.6632986180484295, | |
| "rewards/cosine_scaled_reward": -0.115331269800663, | |
| "rewards/format_reward": 0.6666666809469461, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 1431.7292175292969, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.2858993113040924, | |
| "kl": 0.013988494873046875, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0006, | |
| "reward": 0.41945501090958714, | |
| "reward_std": 0.8443580865859985, | |
| "rewards/cosine_scaled_reward": -0.003992303041741252, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 1634.4167175292969, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.403289258480072, | |
| "kl": 0.019683837890625, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0008, | |
| "reward": 0.3505248324945569, | |
| "reward_std": 0.7320738956332207, | |
| "rewards/cosine_scaled_reward": -0.020624496042728424, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 1821.9166870117188, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.3450154662132263, | |
| "kl": 0.0260009765625, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.001, | |
| "reward": 0.28817289136350155, | |
| "reward_std": 0.6358778662979603, | |
| "rewards/cosine_scaled_reward": -0.05276927351951599, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 1875.5000381469727, | |
| "epoch": 0.344, | |
| "grad_norm": 0.4964129328727722, | |
| "kl": 0.031585693359375, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0013, | |
| "reward": 0.23697214853018522, | |
| "reward_std": 0.731932707130909, | |
| "rewards/cosine_scaled_reward": -0.10828239191323519, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1755.0000343322754, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.3546035587787628, | |
| "kl": 0.022808074951171875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0009, | |
| "reward": 0.3061390779912472, | |
| "reward_std": 0.6646707132458687, | |
| "rewards/cosine_scaled_reward": -0.003997504012659192, | |
| "rewards/format_reward": 0.6875, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1310.7291984558105, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.3501085937023163, | |
| "kl": 0.017917633056640625, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0007, | |
| "reward": 0.49688062351197004, | |
| "reward_std": 0.7856386080384254, | |
| "rewards/cosine_scaled_reward": 0.005726959556341171, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1531.625015258789, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.23862285912036896, | |
| "kl": 0.0212249755859375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0008, | |
| "reward": 0.33517973372363485, | |
| "reward_std": 0.8556637056171894, | |
| "rewards/cosine_scaled_reward": -0.07236111164093018, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1325.145851135254, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.3753778338432312, | |
| "kl": 0.013553619384765625, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0005, | |
| "reward": 0.1812261645682156, | |
| "reward_std": 0.7640487253665924, | |
| "rewards/cosine_scaled_reward": -0.18994974298402667, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1495.1875610351562, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.43624046444892883, | |
| "kl": 0.028331756591796875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0011, | |
| "reward": 0.5158186480402946, | |
| "reward_std": 0.8463861420750618, | |
| "rewards/cosine_scaled_reward": 0.07386103633325547, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1391.8750228881836, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.36552807688713074, | |
| "kl": 0.01470184326171875, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0006, | |
| "reward": 0.4544885288923979, | |
| "reward_std": 0.9182650446891785, | |
| "rewards/cosine_scaled_reward": 0.03140606731176376, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2439.062545776367, | |
| "epoch": 0.352, | |
| "grad_norm": 0.25556859374046326, | |
| "kl": 0.02754974365234375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0011, | |
| "reward": 0.16432497836649418, | |
| "reward_std": 0.7285211831331253, | |
| "rewards/cosine_scaled_reward": -0.06424459861591458, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2007.7709045410156, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.2907513380050659, | |
| "kl": 0.02512359619140625, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.001, | |
| "reward": 0.3333761217072606, | |
| "reward_std": 0.9507267326116562, | |
| "rewards/cosine_scaled_reward": -0.06830872967839241, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1377.9375610351562, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.6299467086791992, | |
| "kl": 0.019756317138671875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0008, | |
| "reward": 0.14710421487689018, | |
| "reward_std": 0.5830218307673931, | |
| "rewards/cosine_scaled_reward": -0.2127772723324597, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1264.770851135254, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.3672167658805847, | |
| "kl": 0.01552581787109375, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0006, | |
| "reward": 0.5835366472601891, | |
| "reward_std": 0.8306069150567055, | |
| "rewards/cosine_scaled_reward": 0.05884586926549673, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1228.6875228881836, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.31035158038139343, | |
| "kl": 0.01593017578125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7780788261443377, | |
| "reward_std": 0.725801732391119, | |
| "rewards/cosine_scaled_reward": 0.23832012061029673, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2067.1666831970215, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.3178723454475403, | |
| "kl": 0.03451347351074219, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0014, | |
| "reward": 0.3173049371689558, | |
| "reward_std": 0.6579355709254742, | |
| "rewards/cosine_scaled_reward": 0.03718515514628962, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 1668.3333892822266, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.46666356921195984, | |
| "kl": 0.031169891357421875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0012, | |
| "reward": 0.28569703502580523, | |
| "reward_std": 0.6306734308600426, | |
| "rewards/cosine_scaled_reward": -0.056163689121603966, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2082.7084045410156, | |
| "epoch": 0.36, | |
| "grad_norm": 0.2961288392543793, | |
| "kl": 0.05887603759765625, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0024, | |
| "reward": 0.4379968661814928, | |
| "reward_std": 0.8023070320487022, | |
| "rewards/cosine_scaled_reward": 0.04415438207797706, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 2361.541732788086, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.477490097284317, | |
| "kl": 0.0577545166015625, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0023, | |
| "reward": -0.013460966947604902, | |
| "reward_std": 0.6273909620940685, | |
| "rewards/cosine_scaled_reward": -0.19738789275288582, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 1918.9375686645508, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.6062735319137573, | |
| "kl": 0.03968238830566406, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0016, | |
| "reward": 0.2778178099542856, | |
| "reward_std": 0.8473985716700554, | |
| "rewards/cosine_scaled_reward": -0.024428293108940125, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1197.4792098999023, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.5727298855781555, | |
| "kl": 0.025470733642578125, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.001, | |
| "reward": 0.35039830300956964, | |
| "reward_std": 0.665754821151495, | |
| "rewards/cosine_scaled_reward": -0.09761649183928967, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 1549.0416946411133, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.44952714443206787, | |
| "kl": 0.022411346435546875, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0009, | |
| "reward": 0.16405144333839417, | |
| "reward_std": 0.7381913363933563, | |
| "rewards/cosine_scaled_reward": -0.17837723344564438, | |
| "rewards/format_reward": 0.8333333488553762, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1096.270866394043, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.9068484902381897, | |
| "kl": 0.045513153076171875, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0018, | |
| "reward": 0.4451125105842948, | |
| "reward_std": 0.8512261882424355, | |
| "rewards/cosine_scaled_reward": -0.02465624047908932, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 1107.2083778381348, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.33380454778671265, | |
| "kl": 0.016986846923828125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0007, | |
| "reward": 0.7083983863703907, | |
| "reward_std": 0.741938479244709, | |
| "rewards/cosine_scaled_reward": 0.13879141584038734, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 1815.958366394043, | |
| "epoch": 0.368, | |
| "grad_norm": 0.5257051587104797, | |
| "kl": 0.06414031982421875, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0026, | |
| "reward": 0.2582624601200223, | |
| "reward_std": 0.6789763048291206, | |
| "rewards/cosine_scaled_reward": -0.0668359762057662, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 1977.5834045410156, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.5315828323364258, | |
| "kl": 0.07048416137695312, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0028, | |
| "reward": 0.3757035471498966, | |
| "reward_std": 0.8692245557904243, | |
| "rewards/cosine_scaled_reward": 0.05770140094682574, | |
| "rewards/format_reward": 0.6458333469927311, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 2044.8125457763672, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.5173450708389282, | |
| "kl": 0.06734085083007812, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0027, | |
| "reward": 0.1749881288560573, | |
| "reward_std": 0.8538544028997421, | |
| "rewards/cosine_scaled_reward": -0.14555134577676654, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 1892.9375267028809, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.6045531034469604, | |
| "kl": 0.0422515869140625, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0017, | |
| "reward": 0.32839928939938545, | |
| "reward_std": 0.8866756781935692, | |
| "rewards/cosine_scaled_reward": -0.03144947811961174, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 1686.708366394043, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.485423743724823, | |
| "kl": 0.061676025390625, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0025, | |
| "reward": 0.39623109018430114, | |
| "reward_std": 0.8446017913520336, | |
| "rewards/cosine_scaled_reward": 0.0048162119928747416, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 1453.6458435058594, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.2550465166568756, | |
| "kl": 0.0176849365234375, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0007, | |
| "reward": 0.6734898695722222, | |
| "reward_std": 0.814283449202776, | |
| "rewards/cosine_scaled_reward": 0.15649090707302094, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 1698.7500457763672, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.561379611492157, | |
| "kl": 0.03546142578125, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0014, | |
| "reward": 0.06343854777514935, | |
| "reward_std": 0.6165986470878124, | |
| "rewards/cosine_scaled_reward": -0.18727782554924488, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 971.5000152587891, | |
| "epoch": 0.376, | |
| "grad_norm": 0.45329251885414124, | |
| "kl": 0.024200439453125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.001, | |
| "reward": 0.6886619143188, | |
| "reward_std": 0.8372432589530945, | |
| "rewards/cosine_scaled_reward": 0.10075276345014572, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 1391.3542213439941, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.7288292646408081, | |
| "kl": 0.056179046630859375, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0022, | |
| "reward": 0.17569798463955522, | |
| "reward_std": 0.639821320772171, | |
| "rewards/cosine_scaled_reward": -0.15391897410154343, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 1995.6250457763672, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.7102898359298706, | |
| "kl": 0.07641983032226562, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0031, | |
| "reward": 0.17617637664079666, | |
| "reward_std": 0.7594183348119259, | |
| "rewards/cosine_scaled_reward": -0.056006991614822255, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 1507.145866394043, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.6774265170097351, | |
| "kl": 0.041957855224609375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0017, | |
| "reward": 0.15361519530415535, | |
| "reward_std": 0.7348885871469975, | |
| "rewards/cosine_scaled_reward": -0.1517366673797369, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1263.06254196167, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.40281206369400024, | |
| "kl": 0.034458160400390625, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0014, | |
| "reward": 0.4631691016256809, | |
| "reward_std": 0.8592428974807262, | |
| "rewards/cosine_scaled_reward": -0.046608994947746396, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 2013.9583740234375, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.7402753829956055, | |
| "kl": 0.076751708984375, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0031, | |
| "reward": 0.025389771908521652, | |
| "reward_std": 0.8407883942127228, | |
| "rewards/cosine_scaled_reward": -0.1560281114652753, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 1345.2291946411133, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.40346795320510864, | |
| "kl": 0.027713775634765625, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0011, | |
| "reward": 0.43492193752899766, | |
| "reward_std": 0.6728620305657387, | |
| "rewards/cosine_scaled_reward": -0.019926004111766815, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 1604.708381652832, | |
| "epoch": 0.384, | |
| "grad_norm": 0.46796372532844543, | |
| "kl": 0.049716949462890625, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.002, | |
| "reward": 0.4504225810524076, | |
| "reward_std": 0.7610187456011772, | |
| "rewards/cosine_scaled_reward": 0.016421111300587654, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 1863.1875457763672, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.5092529058456421, | |
| "kl": 0.09614944458007812, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0038, | |
| "reward": 0.38631977140903473, | |
| "reward_std": 0.9551087282598019, | |
| "rewards/cosine_scaled_reward": -0.020930441562086344, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1182.8542175292969, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.5370055437088013, | |
| "kl": 0.021060943603515625, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0008, | |
| "reward": 0.48011522740125656, | |
| "reward_std": 0.71619638428092, | |
| "rewards/cosine_scaled_reward": 0.003307923674583435, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 1895.6250610351562, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.8054518699645996, | |
| "kl": 0.08332443237304688, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0033, | |
| "reward": 0.19063638825900853, | |
| "reward_std": 0.5477262288331985, | |
| "rewards/cosine_scaled_reward": -0.08626681286841631, | |
| "rewards/format_reward": 0.7083333525806665, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 1568.1250305175781, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.42964616417884827, | |
| "kl": 0.040958404541015625, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0016, | |
| "reward": 0.2769077487755567, | |
| "reward_std": 0.7800903655588627, | |
| "rewards/cosine_scaled_reward": -0.062285197753226385, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 1290.6250228881836, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.5903887748718262, | |
| "kl": 0.032962799072265625, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0013, | |
| "reward": 0.625646581640467, | |
| "reward_std": 0.7541246488690376, | |
| "rewards/cosine_scaled_reward": 0.13255258556455374, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 1680.229232788086, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.6661213040351868, | |
| "kl": 0.07326126098632812, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0029, | |
| "reward": 0.4082430477719754, | |
| "reward_std": 0.7617458440363407, | |
| "rewards/cosine_scaled_reward": -0.03322136774659157, | |
| "rewards/format_reward": 0.8750000037252903, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 1694.3125457763672, | |
| "epoch": 0.392, | |
| "grad_norm": 0.9473098516464233, | |
| "kl": 0.058170318603515625, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0023, | |
| "reward": 0.325529879424721, | |
| "reward_std": 0.8013235367834568, | |
| "rewards/cosine_scaled_reward": -0.08207336533814669, | |
| "rewards/format_reward": 0.854166679084301, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 1449.3958892822266, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.3888903856277466, | |
| "kl": 0.06012725830078125, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0024, | |
| "reward": 0.9358495399355888, | |
| "reward_std": 0.7060995027422905, | |
| "rewards/cosine_scaled_reward": 0.31333022052422166, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 1473.0417098999023, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.605884313583374, | |
| "kl": 0.04383659362792969, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0018, | |
| "reward": 0.25372491776943207, | |
| "reward_std": 0.7733400501310825, | |
| "rewards/cosine_scaled_reward": -0.10962994769215584, | |
| "rewards/format_reward": 0.8125000223517418, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 1525.5833892822266, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.41509008407592773, | |
| "kl": 0.0490264892578125, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.002, | |
| "reward": 0.39042545296251774, | |
| "reward_std": 0.8485492803156376, | |
| "rewards/cosine_scaled_reward": -0.09566876385360956, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1555.5208740234375, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.6986513137817383, | |
| "kl": 0.058147430419921875, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0023, | |
| "reward": 0.1333858126308769, | |
| "reward_std": 0.7080183140933514, | |
| "rewards/cosine_scaled_reward": -0.23809866607189178, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 1694.0625686645508, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.7975203394889832, | |
| "kl": 0.0868682861328125, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0035, | |
| "reward": 0.39133906550705433, | |
| "reward_std": 0.7762907817959785, | |
| "rewards/cosine_scaled_reward": -0.012924212962388992, | |
| "rewards/format_reward": 0.8125000111758709, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1404.1667404174805, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 3.111985445022583, | |
| "kl": 0.11113739013671875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0044, | |
| "reward": 0.47680344711989164, | |
| "reward_std": 0.7106522209942341, | |
| "rewards/cosine_scaled_reward": 0.014676447957754135, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 820.4791831970215, | |
| "epoch": 0.4, | |
| "grad_norm": 0.2917865216732025, | |
| "kl": 0.01514434814453125, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0006, | |
| "reward": 0.4086197968572378, | |
| "reward_std": 0.7495947815477848, | |
| "rewards/cosine_scaled_reward": -0.09753232356160879, | |
| "rewards/format_reward": 1.0, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1446.520866394043, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.7599288821220398, | |
| "kl": 0.07086944580078125, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0028, | |
| "reward": 0.3731183987110853, | |
| "reward_std": 0.7764743529260159, | |
| "rewards/cosine_scaled_reward": -0.060001387260854244, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 1387.0000381469727, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.7137279510498047, | |
| "kl": 0.06012725830078125, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0024, | |
| "reward": 0.42939993026084267, | |
| "reward_std": 0.8450379781424999, | |
| "rewards/cosine_scaled_reward": 0.003599647810915485, | |
| "rewards/format_reward": 0.8125000055879354, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1223.083366394043, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.5175493359565735, | |
| "kl": 0.0400390625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0016, | |
| "reward": 0.716075923293829, | |
| "reward_std": 0.9309490397572517, | |
| "rewards/cosine_scaled_reward": 0.1352907968685031, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1064.8333587646484, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.6795687079429626, | |
| "kl": 0.04412841796875, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0018, | |
| "reward": 0.5015737505163997, | |
| "reward_std": 0.5811274200677872, | |
| "rewards/cosine_scaled_reward": 0.009966753888875246, | |
| "rewards/format_reward": 0.9375, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 972.6875152587891, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.3503471910953522, | |
| "kl": 0.02288818359375, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0009, | |
| "reward": 0.6657696301117539, | |
| "reward_std": 0.7398427277803421, | |
| "rewards/cosine_scaled_reward": 0.08280336670577526, | |
| "rewards/format_reward": 1.0, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 1551.833396911621, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.4588610529899597, | |
| "kl": 0.06862258911132812, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0027, | |
| "reward": 0.3966330944094807, | |
| "reward_std": 0.6840399689972401, | |
| "rewards/cosine_scaled_reward": -0.058146869763731956, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 1772.1042175292969, | |
| "epoch": 0.408, | |
| "grad_norm": 1.0075832605361938, | |
| "kl": 0.100799560546875, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.004, | |
| "reward": 0.12496365327388048, | |
| "reward_std": 0.8355611003935337, | |
| "rewards/cosine_scaled_reward": -0.15991984121501446, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 1562.562515258789, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.7331953644752502, | |
| "kl": 0.07072067260742188, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0028, | |
| "reward": 0.5806331331841648, | |
| "reward_std": 0.8575869612395763, | |
| "rewards/cosine_scaled_reward": 0.05609727092087269, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 942.1666870117188, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.5968821048736572, | |
| "kl": 0.0392608642578125, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0016, | |
| "reward": 0.42657787445932627, | |
| "reward_std": 0.5996944792568684, | |
| "rewards/cosine_scaled_reward": -0.06319771538255736, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 1402.3125381469727, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.7901036143302917, | |
| "kl": 0.07848739624023438, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0031, | |
| "reward": 0.5172299258410931, | |
| "reward_std": 0.8756576031446457, | |
| "rewards/cosine_scaled_reward": 0.017652488488238305, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1255.0000534057617, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.7914440035820007, | |
| "kl": 0.0751495361328125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.003, | |
| "reward": 0.24485921673476696, | |
| "reward_std": 0.7093796096742153, | |
| "rewards/cosine_scaled_reward": -0.14366490487009287, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 899.2916831970215, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.6618494391441345, | |
| "kl": 0.0485382080078125, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0019, | |
| "reward": 0.510432411916554, | |
| "reward_std": 0.6674188002943993, | |
| "rewards/cosine_scaled_reward": 0.010434551164507866, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 863.9166946411133, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.7848973274230957, | |
| "kl": 0.029571533203125, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0012, | |
| "reward": 0.844133562874049, | |
| "reward_std": 0.6796937808394432, | |
| "rewards/cosine_scaled_reward": 0.20261266455054283, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1205.1666870117188, | |
| "epoch": 0.416, | |
| "grad_norm": 0.5032205581665039, | |
| "kl": 0.03617095947265625, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0014, | |
| "reward": 0.08574948133900762, | |
| "reward_std": 0.4307812377810478, | |
| "rewards/cosine_scaled_reward": -0.26402094028890133, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1884.0625686645508, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.9030827283859253, | |
| "kl": 0.1519775390625, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0061, | |
| "reward": 0.17442028690129519, | |
| "reward_std": 0.685190960764885, | |
| "rewards/cosine_scaled_reward": -0.09430265240371227, | |
| "rewards/format_reward": 0.6875000093132257, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1026.7708644866943, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.6462038159370422, | |
| "kl": 0.0249786376953125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.001, | |
| "reward": 0.7736296411603689, | |
| "reward_std": 0.8399604074656963, | |
| "rewards/cosine_scaled_reward": 0.15821054810658097, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1750.541748046875, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.8086258769035339, | |
| "kl": 0.12229537963867188, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0049, | |
| "reward": 0.40639928355813026, | |
| "reward_std": 0.6661986261606216, | |
| "rewards/cosine_scaled_reward": -0.005629323422908783, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1828.7917289733887, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 1.4647676944732666, | |
| "kl": 0.092529296875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0037, | |
| "reward": 0.3596602795878425, | |
| "reward_std": 0.8183485567569733, | |
| "rewards/cosine_scaled_reward": -0.04840289568528533, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1507.0208854675293, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.9977928996086121, | |
| "kl": 0.105255126953125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0042, | |
| "reward": 0.3323223125189543, | |
| "reward_std": 0.8554218038916588, | |
| "rewards/cosine_scaled_reward": -0.0706447935081087, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 1375.958366394043, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.6904154419898987, | |
| "kl": 0.1561279296875, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0062, | |
| "reward": 0.2625284339301288, | |
| "reward_std": 0.57734365016222, | |
| "rewards/cosine_scaled_reward": -0.11408653669059277, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 747.4166831970215, | |
| "epoch": 0.424, | |
| "grad_norm": 0.7088914513587952, | |
| "kl": 0.04413604736328125, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0018, | |
| "reward": 0.6137365428730845, | |
| "reward_std": 0.7054285481572151, | |
| "rewards/cosine_scaled_reward": 0.06669859914109111, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 1583.250015258789, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 1.3554654121398926, | |
| "kl": 0.08654022216796875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0035, | |
| "reward": 0.5319191414746456, | |
| "reward_std": 0.6091607809066772, | |
| "rewards/cosine_scaled_reward": 0.052222222089767456, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 840.7500114440918, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.823254406452179, | |
| "kl": 0.02849578857421875, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0011, | |
| "reward": 0.23188667371869087, | |
| "reward_std": 0.6112702935934067, | |
| "rewards/cosine_scaled_reward": -0.20857458282262087, | |
| "rewards/format_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1146.895881652832, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.8236629962921143, | |
| "kl": 0.047637939453125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0019, | |
| "reward": 0.5981785822659731, | |
| "reward_std": 0.8187721818685532, | |
| "rewards/cosine_scaled_reward": 0.03387853177264333, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1634.3750457763672, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 1.242490530014038, | |
| "kl": 0.20917510986328125, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0084, | |
| "reward": 0.49338567443192005, | |
| "reward_std": 0.8198688104748726, | |
| "rewards/cosine_scaled_reward": 0.07417132705450058, | |
| "rewards/format_reward": 0.7708333469927311, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1403.1875381469727, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 1.0553152561187744, | |
| "kl": 0.1156768798828125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0046, | |
| "reward": 0.4271426647901535, | |
| "reward_std": 0.7959622256457806, | |
| "rewards/cosine_scaled_reward": -0.03355884738266468, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 1332.0417098999023, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.9126045107841492, | |
| "kl": 0.1114654541015625, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0045, | |
| "reward": 0.5075240693986416, | |
| "reward_std": 0.855946060270071, | |
| "rewards/cosine_scaled_reward": -0.0022887131199240685, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1077.4583854675293, | |
| "epoch": 0.432, | |
| "grad_norm": 0.5990425944328308, | |
| "kl": 0.030300140380859375, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0012, | |
| "reward": 0.5777185422666662, | |
| "reward_std": 0.9156284630298615, | |
| "rewards/cosine_scaled_reward": 0.008807800710201263, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1503.7708587646484, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.6305628418922424, | |
| "kl": 0.13201522827148438, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0053, | |
| "reward": 0.3502848669886589, | |
| "reward_std": 0.7390433885157108, | |
| "rewards/cosine_scaled_reward": -0.06138859502971172, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 1135.229175567627, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 1.0688378810882568, | |
| "kl": 0.11534881591796875, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0046, | |
| "reward": 0.415515000699088, | |
| "reward_std": 0.7041169926524162, | |
| "rewards/cosine_scaled_reward": -0.053237104788422585, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1531.4583854675293, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 1.1264177560806274, | |
| "kl": 0.13215255737304688, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0053, | |
| "reward": 0.24819382751593366, | |
| "reward_std": 0.7195433788001537, | |
| "rewards/cosine_scaled_reward": -0.15285431523807347, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1023.3750305175781, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.40737247467041016, | |
| "kl": 0.0533447265625, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0021, | |
| "reward": 0.2534531052224338, | |
| "reward_std": 0.6627279557287693, | |
| "rewards/cosine_scaled_reward": -0.19920427445322275, | |
| "rewards/format_reward": 1.0, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1496.5417251586914, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 1.8750388622283936, | |
| "kl": 0.2808380126953125, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0112, | |
| "reward": 0.3533962171059102, | |
| "reward_std": 0.8847429379820824, | |
| "rewards/cosine_scaled_reward": -0.04301046393811703, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1054.5833435058594, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.5143475532531738, | |
| "kl": 0.0340118408203125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0014, | |
| "reward": 0.9491607993841171, | |
| "reward_std": 0.8766042143106461, | |
| "rewards/cosine_scaled_reward": 0.2751440554857254, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1428.645896911621, | |
| "epoch": 0.44, | |
| "grad_norm": 0.6951473355293274, | |
| "kl": 0.14729690551757812, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0059, | |
| "reward": 0.4135100084822625, | |
| "reward_std": 0.7766407653689384, | |
| "rewards/cosine_scaled_reward": -0.03412807872518897, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1270.7291793823242, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 1.0250437259674072, | |
| "kl": 0.1134185791015625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0045, | |
| "reward": 0.47308777272701263, | |
| "reward_std": 0.6565175838768482, | |
| "rewards/cosine_scaled_reward": -0.013674074783921242, | |
| "rewards/format_reward": 0.9375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 1528.3542213439941, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 2.0494582653045654, | |
| "kl": 0.19681549072265625, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0079, | |
| "reward": 0.31283304444514215, | |
| "reward_std": 0.583834994584322, | |
| "rewards/cosine_scaled_reward": -0.06756466627120972, | |
| "rewards/format_reward": 0.8333333469927311, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1097.583351135254, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.5577840209007263, | |
| "kl": 0.0581207275390625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0023, | |
| "reward": 0.7101229609397706, | |
| "reward_std": 0.6756577789783478, | |
| "rewards/cosine_scaled_reward": 0.1360636167228222, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1202.3958587646484, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.6137182712554932, | |
| "kl": 0.0680389404296875, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0027, | |
| "reward": 0.27682637330144644, | |
| "reward_std": 0.7127226404845715, | |
| "rewards/cosine_scaled_reward": -0.1509529883041978, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1435.2917213439941, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.9228805303573608, | |
| "kl": 0.1553211212158203, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0062, | |
| "reward": 0.21503479685634375, | |
| "reward_std": 0.7798956334590912, | |
| "rewards/cosine_scaled_reward": -0.11404814245179296, | |
| "rewards/format_reward": 0.7708333544433117, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 999.9167022705078, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.9710776209831238, | |
| "kl": 0.08495330810546875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0034, | |
| "reward": 0.6504320180974901, | |
| "reward_std": 0.6109317727386951, | |
| "rewards/cosine_scaled_reward": 0.1076424578204751, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1423.895881652832, | |
| "epoch": 0.448, | |
| "grad_norm": 1.6490659713745117, | |
| "kl": 0.223846435546875, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.009, | |
| "reward": 0.1661514127627015, | |
| "reward_std": 0.7237707450985909, | |
| "rewards/cosine_scaled_reward": -0.1779894083738327, | |
| "rewards/format_reward": 0.833333333954215, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1450.4792022705078, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 1.2397853136062622, | |
| "kl": 0.10369110107421875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0042, | |
| "reward": 0.49226129427552223, | |
| "reward_std": 0.8764687478542328, | |
| "rewards/cosine_scaled_reward": 0.0059880828484892845, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 1569.3958702087402, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 1.4030894041061401, | |
| "kl": 0.1822967529296875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0073, | |
| "reward": 0.11027060728520155, | |
| "reward_std": 0.6850339062511921, | |
| "rewards/cosine_scaled_reward": -0.21509641967713833, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 1156.7917175292969, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.9848508238792419, | |
| "kl": 0.077606201171875, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0031, | |
| "reward": 0.41267195832915604, | |
| "reward_std": 0.7016527280211449, | |
| "rewards/cosine_scaled_reward": -0.04957490786910057, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1016.1458511352539, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.6101754903793335, | |
| "kl": 0.020412445068359375, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0008, | |
| "reward": 0.4085001898929477, | |
| "reward_std": 0.6500335298478603, | |
| "rewards/cosine_scaled_reward": -0.058247581124305725, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 1285.4583892822266, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.699232280254364, | |
| "kl": 0.16654586791992188, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0067, | |
| "reward": 0.2716854903846979, | |
| "reward_std": 0.6572816967964172, | |
| "rewards/cosine_scaled_reward": -0.10944613942410797, | |
| "rewards/format_reward": 0.8541666697710752, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1064.0625305175781, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.4855163097381592, | |
| "kl": 0.034351348876953125, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0014, | |
| "reward": 0.40476914402097464, | |
| "reward_std": 0.6724813655018806, | |
| "rewards/cosine_scaled_reward": -0.0830375433433801, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 944.5417098999023, | |
| "epoch": 0.456, | |
| "grad_norm": 0.42374876141548157, | |
| "kl": 0.0376739501953125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0015, | |
| "reward": 0.5400755191221833, | |
| "reward_std": 0.7856027092784643, | |
| "rewards/cosine_scaled_reward": -0.010072574485093355, | |
| "rewards/format_reward": 1.0, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 898.6875152587891, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.6626017689704895, | |
| "kl": 0.0309600830078125, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0012, | |
| "reward": 0.7812614720314741, | |
| "reward_std": 0.6693713776767254, | |
| "rewards/cosine_scaled_reward": 0.1816884014988318, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 1256.6458740234375, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 1.2477692365646362, | |
| "kl": 0.08155059814453125, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0033, | |
| "reward": 0.5257098386064172, | |
| "reward_std": 0.81386823579669, | |
| "rewards/cosine_scaled_reward": 0.011443385854363441, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 1103.5208587646484, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 3.268733501434326, | |
| "kl": 0.19863510131835938, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0079, | |
| "reward": 0.4599206205457449, | |
| "reward_std": 0.5488756932318211, | |
| "rewards/cosine_scaled_reward": -0.017510680481791496, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 931.4375190734863, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 1.2398287057876587, | |
| "kl": 0.09774017333984375, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0039, | |
| "reward": 0.45444739051163197, | |
| "reward_std": 0.4980122298002243, | |
| "rewards/cosine_scaled_reward": -0.007617715746164322, | |
| "rewards/format_reward": 0.9166666679084301, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1211.6041870117188, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 1.140418529510498, | |
| "kl": 0.15045547485351562, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.006, | |
| "reward": 0.4547617736971006, | |
| "reward_std": 0.5626656413078308, | |
| "rewards/cosine_scaled_reward": -0.01060919463634491, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 974.6875305175781, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 1.321244239807129, | |
| "kl": 0.094329833984375, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0038, | |
| "reward": 0.7235147282481194, | |
| "reward_std": 0.8120486699044704, | |
| "rewards/cosine_scaled_reward": 0.14601082005538046, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1133.5416946411133, | |
| "epoch": 0.464, | |
| "grad_norm": 0.6807760000228882, | |
| "kl": 0.1176910400390625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0047, | |
| "reward": 0.6748548084869981, | |
| "reward_std": 0.78925821185112, | |
| "rewards/cosine_scaled_reward": 0.10528674224042334, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 1320.2500305175781, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 1.170783519744873, | |
| "kl": 0.21154022216796875, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0085, | |
| "reward": 0.4940835009329021, | |
| "reward_std": 0.5547701120376587, | |
| "rewards/cosine_scaled_reward": 0.037424055859446526, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 1262.9375457763672, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.8598933815956116, | |
| "kl": 0.14272689819335938, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0057, | |
| "reward": 0.404069249285385, | |
| "reward_std": 0.7031322456896305, | |
| "rewards/cosine_scaled_reward": -0.0431424961425364, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 1639.5000305175781, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 1.15615975856781, | |
| "kl": 0.17174530029296875, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0069, | |
| "reward": 0.31328563997521996, | |
| "reward_std": 0.7064049206674099, | |
| "rewards/cosine_scaled_reward": -0.11326186126098037, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 1499.333366394043, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 1.267825961112976, | |
| "kl": 0.22209548950195312, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0089, | |
| "reward": 0.3295501284301281, | |
| "reward_std": 0.6071663275361061, | |
| "rewards/cosine_scaled_reward": -0.09065644256770611, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 1507.6041946411133, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 1.4931728839874268, | |
| "kl": 0.13777923583984375, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0055, | |
| "reward": 0.3418419687077403, | |
| "reward_std": 0.8959544003009796, | |
| "rewards/cosine_scaled_reward": -0.10114361811429262, | |
| "rewards/format_reward": 0.8958333395421505, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 1131.0208587646484, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 1.660409688949585, | |
| "kl": 0.13482666015625, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0054, | |
| "reward": 0.5560126416385174, | |
| "reward_std": 0.7937713004648685, | |
| "rewards/cosine_scaled_reward": 0.06759718805551529, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1269.9375267028809, | |
| "epoch": 0.472, | |
| "grad_norm": 0.629540205001831, | |
| "kl": 0.112335205078125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0045, | |
| "reward": 0.46968742460012436, | |
| "reward_std": 0.6266082711517811, | |
| "rewards/cosine_scaled_reward": 0.005679788533598185, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 1644.041732788086, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 1.6342582702636719, | |
| "kl": 0.16037750244140625, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0064, | |
| "reward": 0.17194935493171215, | |
| "reward_std": 0.7680409215390682, | |
| "rewards/cosine_scaled_reward": -0.1542492527514696, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 1607.4167022705078, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 2.081047296524048, | |
| "kl": 0.315277099609375, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0126, | |
| "reward": 0.15895176446065307, | |
| "reward_std": 0.7137060277163982, | |
| "rewards/cosine_scaled_reward": -0.0956956222653389, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1069.0208587646484, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 1.11202871799469, | |
| "kl": 0.07001304626464844, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0028, | |
| "reward": 0.8406836800277233, | |
| "reward_std": 0.7853529006242752, | |
| "rewards/cosine_scaled_reward": 0.20668393187224865, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 1633.3125534057617, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 2.8300588130950928, | |
| "kl": 0.274017333984375, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.011, | |
| "reward": 0.2972092959098518, | |
| "reward_std": 0.8013018406927586, | |
| "rewards/cosine_scaled_reward": -0.08885959023609757, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1035.1041870117188, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.9021016359329224, | |
| "kl": 0.11577606201171875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0046, | |
| "reward": 0.8001389261335135, | |
| "reward_std": 0.8382466398179531, | |
| "rewards/cosine_scaled_reward": 0.18563345912843943, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 1300.9792175292969, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 1.542521595954895, | |
| "kl": 0.1188812255859375, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0048, | |
| "reward": 0.47798004280775785, | |
| "reward_std": 0.6525602787733078, | |
| "rewards/cosine_scaled_reward": -0.022527330555021763, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 774.5833587646484, | |
| "epoch": 0.48, | |
| "grad_norm": 0.5312778949737549, | |
| "kl": 0.01689910888671875, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0007, | |
| "reward": 0.34607438833336346, | |
| "reward_std": 0.6675878167152405, | |
| "rewards/cosine_scaled_reward": -0.12272587232291698, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 1449.6250381469727, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 1.6367040872573853, | |
| "kl": 0.283203125, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0114, | |
| "reward": 0.3528535794466734, | |
| "reward_std": 0.7183955535292625, | |
| "rewards/cosine_scaled_reward": -0.09002240933477879, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 1520.2917251586914, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 1.4518808126449585, | |
| "kl": 0.29302215576171875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0117, | |
| "reward": 0.46497548231855035, | |
| "reward_std": 0.6263095885515213, | |
| "rewards/cosine_scaled_reward": 0.00397377647459507, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 1500.2500534057617, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.8499155044555664, | |
| "kl": 0.3067626953125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0122, | |
| "reward": 0.27377578942105174, | |
| "reward_std": 0.6246924735605717, | |
| "rewards/cosine_scaled_reward": -0.08783163502812386, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 1651.6250610351562, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 2.137979745864868, | |
| "kl": 0.3133392333984375, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0125, | |
| "reward": 0.17908507003448904, | |
| "reward_std": 0.8118347823619843, | |
| "rewards/cosine_scaled_reward": -0.165109351859428, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1094.2500381469727, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 1.6124348640441895, | |
| "kl": 0.13685989379882812, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0055, | |
| "reward": 0.9797765575349331, | |
| "reward_std": 0.7934582978487015, | |
| "rewards/cosine_scaled_reward": 0.33494018763303757, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1124.4166831970215, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 1.3242241144180298, | |
| "kl": 0.1932373046875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0077, | |
| "reward": 0.3963227402418852, | |
| "reward_std": 0.6691669598221779, | |
| "rewards/cosine_scaled_reward": -0.06672874744981527, | |
| "rewards/format_reward": 0.9375, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 1525.520896911621, | |
| "epoch": 0.488, | |
| "grad_norm": 1.3827719688415527, | |
| "kl": 0.24054718017578125, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0096, | |
| "reward": 0.38459044555202127, | |
| "reward_std": 0.8512180484831333, | |
| "rewards/cosine_scaled_reward": -0.07769208890385926, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 1536.3542098999023, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 1.885048270225525, | |
| "kl": 0.2565765380859375, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0103, | |
| "reward": 0.1980901760980487, | |
| "reward_std": 0.6689082272350788, | |
| "rewards/cosine_scaled_reward": -0.1701727721374482, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 803.3333587646484, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.38952067494392395, | |
| "kl": 0.01854705810546875, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0007, | |
| "reward": 0.3733687801286578, | |
| "reward_std": 0.6376638635993004, | |
| "rewards/cosine_scaled_reward": -0.11222685221582651, | |
| "rewards/format_reward": 1.0, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 1186.7291984558105, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 1.133076548576355, | |
| "kl": 0.19187164306640625, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0077, | |
| "reward": 0.529668789356947, | |
| "reward_std": 0.6670900508761406, | |
| "rewards/cosine_scaled_reward": 0.011455949861556292, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 1090.479190826416, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 2.763601064682007, | |
| "kl": 0.3094367980957031, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0124, | |
| "reward": 0.4001455968245864, | |
| "reward_std": 0.6080403476953506, | |
| "rewards/cosine_scaled_reward": -0.03941826708614826, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 1532.8541870117188, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 1.8087928295135498, | |
| "kl": 0.3389739990234375, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0135, | |
| "reward": 0.2658024498960003, | |
| "reward_std": 0.7025109715759754, | |
| "rewards/cosine_scaled_reward": -0.10354285500943661, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 1383.3542022705078, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 1.609007477760315, | |
| "kl": 0.202850341796875, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0081, | |
| "reward": 0.47822056571021676, | |
| "reward_std": 0.6413930989801884, | |
| "rewards/cosine_scaled_reward": -0.032901763916015625, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 1405.7500381469727, | |
| "epoch": 0.496, | |
| "grad_norm": 1.7864830493927002, | |
| "kl": 0.16643524169921875, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0067, | |
| "reward": 0.09692841861397028, | |
| "reward_std": 0.5686304904520512, | |
| "rewards/cosine_scaled_reward": -0.22341302986023948, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 894.8541984558105, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 1.2933272123336792, | |
| "kl": 0.20261383056640625, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0081, | |
| "reward": 0.3131554089486599, | |
| "reward_std": 0.5978329069912434, | |
| "rewards/cosine_scaled_reward": -0.12059530150145292, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1181.6875343322754, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 2.0181899070739746, | |
| "kl": 0.34568023681640625, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0138, | |
| "reward": 0.5736293898262375, | |
| "reward_std": 0.6847642697393894, | |
| "rewards/cosine_scaled_reward": 0.08606236800551414, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 1276.4791946411133, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 2.564283847808838, | |
| "kl": 0.2025604248046875, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0081, | |
| "reward": 0.4678001292049885, | |
| "reward_std": 0.7040045224130154, | |
| "rewards/cosine_scaled_reward": -0.008855259045958519, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 1573.0833587646484, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 2.7367420196533203, | |
| "kl": 0.5104904174804688, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0204, | |
| "reward": 0.17007037345319986, | |
| "reward_std": 0.5956090353429317, | |
| "rewards/cosine_scaled_reward": -0.1865391266765073, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 1149.6875305175781, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 1.5625011920928955, | |
| "kl": 0.2465057373046875, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0099, | |
| "reward": 0.29737665317952633, | |
| "reward_std": 0.6380414590239525, | |
| "rewards/cosine_scaled_reward": -0.1426175870001316, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 1237.354206085205, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 5.108010768890381, | |
| "kl": 0.387542724609375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0155, | |
| "reward": 0.17432179488241673, | |
| "reward_std": 0.6026643626391888, | |
| "rewards/cosine_scaled_reward": -0.20669769623782486, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 1345.958381652832, | |
| "epoch": 0.504, | |
| "grad_norm": 3.1474623680114746, | |
| "kl": 0.32805633544921875, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0131, | |
| "reward": 0.5082171498797834, | |
| "reward_std": 0.5840157195925713, | |
| "rewards/cosine_scaled_reward": 0.023840421810746193, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 1032.1042022705078, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 6.944667339324951, | |
| "kl": 0.20575714111328125, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0082, | |
| "reward": 0.43416818673722446, | |
| "reward_std": 0.6829791888594627, | |
| "rewards/cosine_scaled_reward": -0.0446524852886796, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 1498.2708587646484, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 2.155977725982666, | |
| "kl": 0.36579132080078125, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0147, | |
| "reward": 0.35938314939267, | |
| "reward_std": 0.6541690826416016, | |
| "rewards/cosine_scaled_reward": -0.07123458385467529, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 1179.6666870117188, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 1.1800274848937988, | |
| "kl": 0.2142486572265625, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0086, | |
| "reward": 0.20739420503377914, | |
| "reward_std": 0.665216438472271, | |
| "rewards/cosine_scaled_reward": -0.16437909565865993, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1355.6042098999023, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 3.1479392051696777, | |
| "kl": 0.36163330078125, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0145, | |
| "reward": 0.3757004216313362, | |
| "reward_std": 0.7862649708986282, | |
| "rewards/cosine_scaled_reward": -0.08248981460928917, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 1242.4792175292969, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 1.7266709804534912, | |
| "kl": 0.15660858154296875, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0063, | |
| "reward": 0.41175389010459185, | |
| "reward_std": 0.6945022568106651, | |
| "rewards/cosine_scaled_reward": -0.06201653182506561, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1084.6042098999023, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 1.3974360227584839, | |
| "kl": 0.21521759033203125, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0086, | |
| "reward": 0.36824747081846, | |
| "reward_std": 0.5542174242436886, | |
| "rewards/cosine_scaled_reward": -0.06647356506437063, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1107.3125228881836, | |
| "epoch": 0.512, | |
| "grad_norm": 1.4814095497131348, | |
| "kl": 0.2049713134765625, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0082, | |
| "reward": 0.4160159872844815, | |
| "reward_std": 0.5563804637640715, | |
| "rewards/cosine_scaled_reward": -0.026282913982868195, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1043.958351135254, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.7390405535697937, | |
| "kl": 0.09143447875976562, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0037, | |
| "reward": 0.14539668832730968, | |
| "reward_std": 0.49666889011859894, | |
| "rewards/cosine_scaled_reward": -0.22875094041228294, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 1121.479175567627, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 1.1870522499084473, | |
| "kl": 0.17742919921875, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0071, | |
| "reward": 0.31525158043950796, | |
| "reward_std": 0.5219849050045013, | |
| "rewards/cosine_scaled_reward": -0.11353826522827148, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1166.9583587646484, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 1.581584095954895, | |
| "kl": 0.31623077392578125, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0127, | |
| "reward": 0.34822911536321044, | |
| "reward_std": 0.6658046580851078, | |
| "rewards/cosine_scaled_reward": -0.1033438453450799, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 1243.687515258789, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 2.1429476737976074, | |
| "kl": 0.2833709716796875, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0113, | |
| "reward": 0.6556259745266289, | |
| "reward_std": 0.8615197539329529, | |
| "rewards/cosine_scaled_reward": 0.09724390879273415, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 1214.7708740234375, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.8739696741104126, | |
| "kl": 0.06499481201171875, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0026, | |
| "reward": 0.3872159831225872, | |
| "reward_std": 0.7029216475784779, | |
| "rewards/cosine_scaled_reward": -0.08512475527822971, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 1117.5417098999023, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 2.8257322311401367, | |
| "kl": 0.198486328125, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0079, | |
| "reward": 0.3443953925743699, | |
| "reward_std": 0.6449627317488194, | |
| "rewards/cosine_scaled_reward": -0.09044338436797261, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 1421.6042022705078, | |
| "epoch": 0.52, | |
| "grad_norm": 1.7228955030441284, | |
| "kl": 0.4829292297363281, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0193, | |
| "reward": 0.14169234223663807, | |
| "reward_std": 0.591641578823328, | |
| "rewards/cosine_scaled_reward": -0.18374849221436307, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 1353.6250267028809, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 1.9811756610870361, | |
| "kl": 0.2818756103515625, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0113, | |
| "reward": 0.26744416062138043, | |
| "reward_std": 0.691377304494381, | |
| "rewards/cosine_scaled_reward": -0.15542185143567622, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 1305.8125305175781, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 2.8912620544433594, | |
| "kl": 0.4429473876953125, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0177, | |
| "reward": 0.33940141764469445, | |
| "reward_std": 0.8498760014772415, | |
| "rewards/cosine_scaled_reward": -0.04779240628704429, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 1106.770866394043, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 1.6600767374038696, | |
| "kl": 0.252227783203125, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0101, | |
| "reward": 0.3232785561122, | |
| "reward_std": 0.5980268009006977, | |
| "rewards/cosine_scaled_reward": -0.10486513609066606, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1032.0625228881836, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 1.426925778388977, | |
| "kl": 0.23378753662109375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0094, | |
| "reward": 0.5453407493187115, | |
| "reward_std": 0.7394749782979488, | |
| "rewards/cosine_scaled_reward": 0.02821425348520279, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 1541.208366394043, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 1.5413358211517334, | |
| "kl": 0.431365966796875, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0172, | |
| "reward": 0.19518441951368004, | |
| "reward_std": 0.6402174085378647, | |
| "rewards/cosine_scaled_reward": -0.1528447875753045, | |
| "rewards/format_reward": 0.8333333395421505, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 1445.0000610351562, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.153205156326294, | |
| "kl": 0.4829826354980469, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0194, | |
| "reward": 0.6377177089452744, | |
| "reward_std": 1.0267005078494549, | |
| "rewards/cosine_scaled_reward": 0.11436054221121594, | |
| "rewards/format_reward": 0.8541666772216558, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 989.8333511352539, | |
| "epoch": 0.528, | |
| "grad_norm": 0.7328640818595886, | |
| "kl": 0.04534912109375, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0018, | |
| "reward": 0.2644703108817339, | |
| "reward_std": 0.6658349372446537, | |
| "rewards/cosine_scaled_reward": -0.180008752271533, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 1494.3125457763672, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 2.0038442611694336, | |
| "kl": 0.386627197265625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0155, | |
| "reward": 0.5318789854645729, | |
| "reward_std": 0.9075686037540436, | |
| "rewards/cosine_scaled_reward": 0.03875172859989107, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 992.7500381469727, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 1.684259295463562, | |
| "kl": 0.20226287841796875, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0081, | |
| "reward": 0.7598909301450476, | |
| "reward_std": 0.5075674168765545, | |
| "rewards/cosine_scaled_reward": 0.17828124994412065, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 1197.208366394043, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 2.0846643447875977, | |
| "kl": 0.156219482421875, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0063, | |
| "reward": 0.36710425605997443, | |
| "reward_std": 0.7569303959608078, | |
| "rewards/cosine_scaled_reward": -0.09665499581024051, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 1219.9375381469727, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 1.655044436454773, | |
| "kl": 0.1450958251953125, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0058, | |
| "reward": 0.589016193524003, | |
| "reward_std": 0.6661927625536919, | |
| "rewards/cosine_scaled_reward": 0.03179914876818657, | |
| "rewards/format_reward": 1.0, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 1338.2292137145996, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 1.6192094087600708, | |
| "kl": 0.24442291259765625, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0098, | |
| "reward": 0.32355972472578287, | |
| "reward_std": 0.788430068641901, | |
| "rewards/cosine_scaled_reward": -0.11671263433527201, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 1452.9791870117188, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 2.234537124633789, | |
| "kl": 0.47918701171875, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0191, | |
| "reward": 0.42969178780913353, | |
| "reward_std": 0.7676932998001575, | |
| "rewards/cosine_scaled_reward": -0.04107399005442858, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 1134.1250114440918, | |
| "epoch": 0.536, | |
| "grad_norm": 3.1756770610809326, | |
| "kl": 0.333251953125, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0133, | |
| "reward": 0.36575854755938053, | |
| "reward_std": 0.7719821371138096, | |
| "rewards/cosine_scaled_reward": -0.0800532667490188, | |
| "rewards/format_reward": 0.8958333507180214, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 1417.6042175292969, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.5227336883544922, | |
| "kl": 0.470703125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0188, | |
| "reward": 0.3243638591375202, | |
| "reward_std": 0.8565945476293564, | |
| "rewards/cosine_scaled_reward": -0.0990256522782147, | |
| "rewards/format_reward": 0.8750000223517418, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 1389.9167098999023, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.8987606167793274, | |
| "kl": 0.13502120971679688, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0054, | |
| "reward": 0.5736090987920761, | |
| "reward_std": 0.7181516140699387, | |
| "rewards/cosine_scaled_reward": 0.048229770036414266, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 1417.3125305175781, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 1.6695373058319092, | |
| "kl": 0.477447509765625, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.019, | |
| "reward": 0.2978296782821417, | |
| "reward_std": 0.8513829745352268, | |
| "rewards/cosine_scaled_reward": -0.10584009531885386, | |
| "rewards/format_reward": 0.8541666753590107, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 1296.083351135254, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 1.4655067920684814, | |
| "kl": 0.205474853515625, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0082, | |
| "reward": 0.1657012979267165, | |
| "reward_std": 0.5747975409030914, | |
| "rewards/cosine_scaled_reward": -0.20028305146843195, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 1289.0833778381348, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 2.321017026901245, | |
| "kl": 0.4053497314453125, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0162, | |
| "reward": 0.743865036405623, | |
| "reward_std": 0.733939703553915, | |
| "rewards/cosine_scaled_reward": 0.17521634395234287, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 1455.7500228881836, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 1.4909836053848267, | |
| "kl": 0.37940216064453125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0152, | |
| "reward": 0.3271640567108989, | |
| "reward_std": 0.7808557823300362, | |
| "rewards/cosine_scaled_reward": -0.08086569933220744, | |
| "rewards/format_reward": 0.8541666679084301, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 1167.2291793823242, | |
| "epoch": 0.544, | |
| "grad_norm": 2.0223841667175293, | |
| "kl": 0.2843170166015625, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0114, | |
| "reward": 0.5520191243849695, | |
| "reward_std": 0.9013429544866085, | |
| "rewards/cosine_scaled_reward": 0.02163805003510788, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1188.1041946411133, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.9858556985855103, | |
| "kl": 0.3222007751464844, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0129, | |
| "reward": 0.6779396012425423, | |
| "reward_std": 0.7415244840085506, | |
| "rewards/cosine_scaled_reward": 0.11190658155828714, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 1321.9375381469727, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 1.826804280281067, | |
| "kl": 0.29107666015625, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0116, | |
| "reward": 0.23478064546361566, | |
| "reward_std": 0.6792666539549828, | |
| "rewards/cosine_scaled_reward": -0.1663953149691224, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 1477.6041946411133, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 2.102496862411499, | |
| "kl": 0.60791015625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0243, | |
| "reward": 0.5314750894904137, | |
| "reward_std": 0.8037730753421783, | |
| "rewards/cosine_scaled_reward": 0.02458848152309656, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 1360.0208587646484, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.7093435525894165, | |
| "kl": 0.4275360107421875, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0171, | |
| "reward": 0.3702342016622424, | |
| "reward_std": 0.6831196062266827, | |
| "rewards/cosine_scaled_reward": -0.07479118811897933, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 1480.3750534057617, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 2.413257122039795, | |
| "kl": 0.6222076416015625, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0249, | |
| "reward": 0.1323686043615453, | |
| "reward_std": 0.6688967496156693, | |
| "rewards/cosine_scaled_reward": -0.17295160831417888, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 1245.3750381469727, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 1.1108511686325073, | |
| "kl": 0.3742408752441406, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.015, | |
| "reward": 0.5778923179022968, | |
| "reward_std": 0.7943699173629284, | |
| "rewards/cosine_scaled_reward": 0.05815020017325878, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 1345.7917022705078, | |
| "epoch": 0.552, | |
| "grad_norm": 3.296278238296509, | |
| "kl": 0.32867431640625, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0131, | |
| "reward": 0.4724856864195317, | |
| "reward_std": 0.7850163578987122, | |
| "rewards/cosine_scaled_reward": 0.012874770443886518, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 1077.2083625793457, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.8179315328598022, | |
| "kl": 0.19476699829101562, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0078, | |
| "reward": 0.460749551653862, | |
| "reward_std": 0.6464533470571041, | |
| "rewards/cosine_scaled_reward": -0.013641191995702684, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1128.0208740234375, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 3.131510019302368, | |
| "kl": 0.27252197265625, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0109, | |
| "reward": 0.29226525872945786, | |
| "reward_std": 0.67261578515172, | |
| "rewards/cosine_scaled_reward": -0.1184451412409544, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 904.479190826416, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 2.3228588104248047, | |
| "kl": 0.23955535888671875, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0096, | |
| "reward": 0.4756124352570623, | |
| "reward_std": 0.6488616764545441, | |
| "rewards/cosine_scaled_reward": -0.003937863744795322, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1126.666690826416, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 1.264155626296997, | |
| "kl": 0.18535614013671875, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0074, | |
| "reward": 0.8187248446047306, | |
| "reward_std": 0.7255717366933823, | |
| "rewards/cosine_scaled_reward": 0.23560354206711054, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 975.666690826416, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 1.1237510442733765, | |
| "kl": 0.089996337890625, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0036, | |
| "reward": 0.3682096116244793, | |
| "reward_std": 0.5310206785798073, | |
| "rewards/cosine_scaled_reward": -0.08794048149138689, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 1439.4167289733887, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 1.723633050918579, | |
| "kl": 0.398651123046875, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.016, | |
| "reward": 0.12414154410362244, | |
| "reward_std": 0.5313785709440708, | |
| "rewards/cosine_scaled_reward": -0.21272556111216545, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1332.0833740234375, | |
| "epoch": 0.56, | |
| "grad_norm": 1.4947898387908936, | |
| "kl": 0.33425140380859375, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0134, | |
| "reward": 0.318634120747447, | |
| "reward_std": 0.5996614284813404, | |
| "rewards/cosine_scaled_reward": -0.1064935065805912, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 1447.0833587646484, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 2.106166362762451, | |
| "kl": 0.3012504577636719, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0121, | |
| "reward": 0.4496698835864663, | |
| "reward_std": 0.8973959572613239, | |
| "rewards/cosine_scaled_reward": -0.014357679523527622, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 1104.6041831970215, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 2.4530482292175293, | |
| "kl": 0.1375885009765625, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0055, | |
| "reward": 0.3822294343262911, | |
| "reward_std": 0.6128358133137226, | |
| "rewards/cosine_scaled_reward": -0.07346190325915813, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 1073.3541793823242, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 1.1550400257110596, | |
| "kl": 0.18117523193359375, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0072, | |
| "reward": 0.4816288612782955, | |
| "reward_std": 0.8117332980036736, | |
| "rewards/cosine_scaled_reward": -0.030845604138448834, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1056.62504196167, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.7965182662010193, | |
| "kl": 0.1199188232421875, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0048, | |
| "reward": 0.3902022670954466, | |
| "reward_std": 0.7675591520965099, | |
| "rewards/cosine_scaled_reward": -0.09161333832889795, | |
| "rewards/format_reward": 0.9583333358168602, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 1542.6042175292969, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 2.6710989475250244, | |
| "kl": 0.34729766845703125, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0139, | |
| "reward": 0.5254024369642138, | |
| "reward_std": 0.9325296804308891, | |
| "rewards/cosine_scaled_reward": 0.017028740607202053, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1316.7292022705078, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 1.4742833375930786, | |
| "kl": 0.5881500244140625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0235, | |
| "reward": 0.619772095582448, | |
| "reward_std": 0.8898240327835083, | |
| "rewards/cosine_scaled_reward": 0.13812840729951859, | |
| "rewards/format_reward": 0.7916666846722364, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1185.708351135254, | |
| "epoch": 0.568, | |
| "grad_norm": 1.6387114524841309, | |
| "kl": 0.5859603881835938, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0235, | |
| "reward": 0.7584062092937529, | |
| "reward_std": 0.6907122731208801, | |
| "rewards/cosine_scaled_reward": 0.20853716135025024, | |
| "rewards/format_reward": 0.8750000111758709, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 1406.3958587646484, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 2.6451492309570312, | |
| "kl": 0.383056640625, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0153, | |
| "reward": 0.33473338062322, | |
| "reward_std": 0.7394461110234261, | |
| "rewards/cosine_scaled_reward": -0.10398351773619652, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 1517.770881652832, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 2.2294161319732666, | |
| "kl": 0.6592826843261719, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0264, | |
| "reward": 0.40126605180557817, | |
| "reward_std": 0.7697222679853439, | |
| "rewards/cosine_scaled_reward": -0.006456421993789263, | |
| "rewards/format_reward": 0.8125000186264515, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 1280.7500305175781, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 2.027796983718872, | |
| "kl": 0.46686553955078125, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0186, | |
| "reward": 0.2269948897883296, | |
| "reward_std": 0.7286034636199474, | |
| "rewards/cosine_scaled_reward": -0.13150667655281723, | |
| "rewards/format_reward": 0.8333333507180214, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0027833052817963252, | |
| "train_runtime": 55581.6431, | |
| "train_samples_per_second": 0.432, | |
| "train_steps_per_second": 0.009 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |